[hg] galaxy 2403: Added SOLiD tool section with QC tools.
details: http://www.bx.psu.edu/hg/galaxy/rev/8be3989335a9 changeset: 2403:8be3989335a9 user: guru date: Tue May 05 22:44:30 2009 -0400 description: Added SOLiD tool section with QC tools. 5 file(s) affected in this change: test-data/qualsolid.stats tool_conf.xml.sample tools/solid_tools/solid_qual_boxplot.xml tools/solid_tools/solid_qual_stats.py tools/solid_tools/solid_qual_stats.xml diffs (307 lines): diff -r dd21a47f8035 -r 8be3989335a9 test-data/qualsolid.stats --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/qualsolid.stats Tue May 05 22:44:30 2009 -0400 @@ -0,0 +1,36 @@ +column count min max sum mean Q1 med Q3 IQR lW rW +1 24 2 31 413.0 17.2083333333 5 18.0 28 23 2 31 +2 24 5 31 499.0 20.7916666667 10 26.0 30 20 5 31 +3 24 2 34 324.0 13.5 2 6.5 30 28 2 34 +4 24 2 32 397.0 16.5416666667 3 16.5 31 28 2 32 +5 24 2 31 349.0 14.5416666667 4 13.0 25 21 2 31 +6 24 2 34 456.0 19.0 6 18.5 31 25 2 34 +7 24 2 34 545.0 22.7083333333 15 27.0 30 15 2 34 +8 24 2 31 338.0 14.0833333333 3 10.0 27 24 2 31 +9 24 2 32 377.0 15.7083333333 3 17.0 27 24 2 32 +10 24 2 31 373.0 15.5416666667 4 13.0 27 23 2 31 +11 24 2 31 371.0 15.4583333333 6 12.5 28 22 2 31 +12 24 2 33 505.0 21.0416666667 13 23.0 29 16 2 33 +13 24 2 31 302.0 12.5833333333 4 11.0 23 19 2 31 +14 24 2 33 384.0 16.0 4 18.5 26 22 2 33 +15 24 2 31 392.0 16.3333333333 4 13.0 30 26 2 31 +16 24 2 31 380.0 15.8333333333 6 15.5 29 23 2 31 +17 24 2 31 465.0 19.375 11 24.5 30 19 2 31 +18 24 2 30 304.0 12.6666666667 4 9.0 23 19 2 30 +19 24 2 31 350.0 14.5833333333 4 13.0 27 23 2 31 +20 24 2 31 332.0 13.8333333333 2 13.0 21 19 2 31 +21 24 2 32 358.0 14.9166666667 4 14.5 26 22 2 32 +22 24 2 31 456.0 19.0 8 24.0 28 20 2 31 +23 24 2 31 308.0 12.8333333333 5 8.0 19 14 2 31 +24 24 2 31 280.0 11.6666666667 5 8.5 16 11 2 31 +25 24 2 30 300.0 12.5 5 9.5 21 16 2 30 +26 24 2 31 384.0 16.0 3 20.0 25 22 2 31 +27 24 2 31 389.0 16.2083333333 8 16.0 24 16 2 31 +28 24 2 28 242.0 10.0833333333 2 6.0 21 19 2 28 +29 24 2 26 240.0 10.0 4 6.5 14 10 2 26 +30 24 2 28 297.0 12.375 6 8.5 22 16 2 28 +31 24 2 30 304.0 12.6666666667 3 9.0 22 19 2 30 +32 24 2 31 324.0 13.5 5 12.5 23 18 2 31 +33 24 2 26 255.0 10.625 4 8.0 20 16 2 26 +34 24 2 30 226.0 9.41666666667 2 6.5 13 11 2 29.5 +35 24 2 29 244.0 10.1666666667 2 7.0 19 17 2 29 diff -r dd21a47f8035 -r 8be3989335a9 tool_conf.xml.sample --- a/tool_conf.xml.sample Tue May 05 17:06:11 2009 -0400 +++ b/tool_conf.xml.sample Tue May 05 22:44:30 2009 -0400 @@ -281,6 +281,10 @@ <tool file="emboss_5/emboss_wordmatch.xml" /> </section> --> + <section name="SOLiD Data Analysis" id="solid_tools"> + <tool file="solid_tools/solid_qual_stats.xml" /> + <tool file="solid_tools/solid_qual_boxplot.xml" /> + </section> <section name="FASTA manipulation" id="fasta_manipulation"> <tool file="fasta_tools/fasta_compute_length.xml" /> <tool file="fasta_tools/fasta_filter_by_length.xml" /> diff -r dd21a47f8035 -r 8be3989335a9 tools/solid_tools/solid_qual_boxplot.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/solid_tools/solid_qual_boxplot.xml Tue May 05 22:44:30 2009 -0400 @@ -0,0 +1,36 @@ +<tool id="solid_qual_boxplot" name="Quality Boxplot" version="1.0.0"> + <description>for SOLiD data</description> + + <command>qualsolid_boxplot_graph.sh -t '$input.name' -i $input -o $output</command> + + <inputs> + <param format="txt" name="input" type="data" label="Statistics report file (output of 'Quality Statistics for SOLiD data' tool)" /> + </inputs> + + <outputs> + <data format="png" name="output" metadata_source="input" /> + </outputs> +<help> + +**What it does** + +Creates a boxplot graph for the quality scores in the library. + +.. class:: infomark + +**TIP:** Use the **Quality Statistics for SOLiD data** tool to generate the report file needed for this tool. + +----- + +**Output Example** + +* Black horizontal lines are medians +* Rectangular red boxes show the Inter-quartile Range (IQR) (top value is Q3, bottom value is Q1) +* Whiskers show outlier at max. 1.5*IQR + + +.. image:: ../static/images/solid_qual.png + + +</help> +</tool> diff -r dd21a47f8035 -r 8be3989335a9 tools/solid_tools/solid_qual_stats.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/solid_tools/solid_qual_stats.py Tue May 05 22:44:30 2009 -0400 @@ -0,0 +1,140 @@ +#! /usr/bin/python +#Guruprasad Ananda + +import sys, os, zipfile, tempfile + +QUAL_UPPER_BOUND = 41 +QUAL_LOWER_BOUND = 1 + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def unzip( filename ): + zip_file = zipfile.ZipFile( filename, 'r' ) + tmpfilename = tempfile.NamedTemporaryFile().name + for name in zip_file.namelist(): + file( tmpfilename, 'a' ).write( zip_file.read( name ) ) + zip_file.close() + return tmpfilename + +def __main__(): + + infile_score_name = sys.argv[1].strip() + fout = open(sys.argv[2].strip(),'r+w') + + infile_is_zipped = False + if zipfile.is_zipfile( infile_score_name ): + infile_is_zipped = True + infile_name = unzip( infile_score_name ) + else: + infile_name = infile_score_name + + readlen = None + j = 0 + for line in file( infile_name ): + line = line.strip() + if not(line) or line.startswith("#") or line.startswith(">"): + continue + elems = line.split() + try: + for item in elems: + assert int(item) + if not(readlen): + readlen = len(elems) + if len(elems) != readlen: + print "Note: Reads in the input dataset are of variable lengths." + j += 1 + except: + invalid_lines += 1 + if j > 10: + break + + invalid_lines = 0 + position_dict = {} + print >>fout, "column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW" + for k,line in enumerate(file( infile_name )): + line = line.strip() + if not(line) or line.startswith("#") or line.startswith(">"): + continue + elems = line.split() + if position_dict == {}: + for pos in range(readlen): + position_dict[pos] = [0]*QUAL_UPPER_BOUND + if len(elems) != readlen: + invalid_lines += 1 + continue + for ind,item in enumerate(elems): + try: + item = int(item) + position_dict[ind][item]+=1 + except: + pass + + invalid_positions = 0 + for pos in position_dict: + carr = position_dict[pos] #count array for position pos + total = sum(carr) #number of bases found in this column. + med_elem = int(round(total/2.0)) + lowest = None #Lowest quality score value found in this column. + highest = None #Highest quality score value found in this column. + median = None #Median quality score value found in this column. + qsum = 0.0 #Sum of quality score values for this column. + q1 = None #1st quartile quality score. + q3 = None #3rd quartile quality score. + q1_elem = int(round((total+1)/4.0)) + q3_elem = int(round((total+1)*3/4.0)) + + try: + for ind,cnt in enumerate(carr): + qsum += ind*cnt + + if cnt!=0: + highest = ind + + if lowest==None and cnt!=0: #first non-zero count + lowest = ind + + if q1==None: + if sum(carr[:ind+1]) >= q1_elem: + q1 = ind + + if median==None: + if sum(carr[:ind+1]) < med_elem: + continue + median = ind + if total%2 == 0: #even number of elements + median2 = median + if sum(carr[:ind+1]) < med_elem+1: + for ind2,elem in enumerate(carr[ind+1:]): + if elem != 0: + median2 = ind+ind2+1 + break + median = (median + median2)/2.0 + + + if q3==None: + if sum(carr[:ind+1]) >= q3_elem: + q3 = ind + + + mean = qsum/total #Mean quality score value for this column. + iqr = q3-q1 + left_whisker = max(q1 - 1.5*iqr,lowest) + right_whisker = min(q3 + 1.5*iqr,highest) + + print >>fout,"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(pos+1,total,lowest,highest,qsum,mean,q1,median,q3,iqr,left_whisker,right_whisker) + except: + invalid_positions += 1 + nullvals = ['NA']*11 + print >>fout,"%s\t%s" %(pos+1,'\t'.join(nullvals)) + + if invalid_lines: + print "Skipped %d reads as invalid." %invalid_lines + if invalid_positions: + print "Skipped stats computation for %d read postions." %invalid_positions + +if __name__=="__main__": + __main__() + + diff -r dd21a47f8035 -r 8be3989335a9 tools/solid_tools/solid_qual_stats.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/solid_tools/solid_qual_stats.xml Tue May 05 22:44:30 2009 -0400 @@ -0,0 +1,65 @@ +<tool id="solid_qual_stats" name="Quality Statistics" version="1.0.0"> + <description>for SOLiD data</description> + <command interpreter="python">solid_qual_stats.py $input $output1</command> + + <inputs> + <param format="qualsolid, txtseq.zip" name="input" type="data" label="SOLiD qual file" help="If your dataset doesn't show up in the menu, click the pencil icon next to your dataset and set the datatype to 'qualsolid'" /> + </inputs> + <outputs> + <data format="txt" name="output1" metadata_source="input" /> + </outputs> + <tests> + <test> + <param name="input" value="qualscores.qualsolid" /> + <output name="output1" file="qualsolid.stats" /> + </test> + </tests> + +<help> + +**What it does** + +Creates quality statistics report for the given SOLiD quality score file. + +.. class:: infomark + +**TIP:** This statistics report can be used as input for **Quality Boxplot for SOLiD data** tool. + +----- + +**The output file will contain the following fields:** + +* column = column number (position on the read) +* count = number of bases found in this column. +* min = Lowest quality score value found in this column. +* max = Highest quality score value found in this column. +* sum = Sum of quality score values for this column. +* mean = Mean quality score value for this column. +* Q1 = 1st quartile quality score. +* med = Median quality score. +* Q3 = 3rd quartile quality score. +* IQR = Inter-Quartile range (Q3-Q1). +* lW = 'Left-Whisker' value (for boxplotting). +* rW = 'Right-Whisker' value (for boxplotting). + + + + + +**Output Example**:: + + column count min max sum mean Q1 med Q3 IQR lW rW + 1 6362991 2 32 250734117 20.41 5 9 28 23 2 31 + 2 6362991 2 32 250531036 21.37 10 26 30 20 5 31 + 3 6362991 2 34 248722469 19.09 10 26 30 20 5 31 + 4 6362991 2 34 247654797 18.92 10 26 30 20 5 31 + . + . + 32 6362991 2 31 143436943 16.54 3 10 25 22 2 31 + 33 6362991 2 32 114269843 16.96 3 10 25 22 2 31 + 34 6362991 2 29 140638447 12.10 3 10 25 22 2 29 + 35 6362991 2 29 138910532 11.83 3 10 25 22 2 29 + + +</help> +</tool>
participants (1)
-
Greg Von Kuster