details: http://www.bx.psu.edu/hg/galaxy/rev/fab59b1e756d changeset: 2510:fab59b1e756d user: Kelly Vincent <kpvincent@bx.psu.edu> date: Thu Jul 30 12:06:24 2009 -0400 description: Added fastqsanger data format and required bwa_wrapper to take only that format as input 9 file(s) affected in this change: datatypes_conf.xml.sample lib/galaxy/datatypes/registry.py lib/galaxy/datatypes/sequence.py lib/galaxy/datatypes/test/1.fastqsanger lib/galaxy/datatypes/test/2.fastqsanger test-data/1.fastqsanger test-data/2.fastqsanger test/functional/test_sniffing_and_metadata_settings.py tools/sr_mapping/bwa_wrapper.xml diffs (298 lines): diff -r f06777cbd5bb -r fab59b1e756d datatypes_conf.xml.sample --- a/datatypes_conf.xml.sample Thu Jul 30 11:05:03 2009 -0400 +++ b/datatypes_conf.xml.sample Thu Jul 30 12:06:24 2009 -0400 @@ -20,6 +20,7 @@ <datatype extension="fasta" type="galaxy.datatypes.sequence:Fasta" display_in_upload="true"> <converter file="fasta_to_tabular_converter.xml" target_datatype="tabular"/> </datatype> + <datatype extension="fastqsanger" type="galaxy.datatypes.sequence:FastqSanger" display_in_upload="true"/> <datatype extension="fastqsolexa" type="galaxy.datatypes.sequence:FastqSolexa" display_in_upload="true"> <converter file="fastqsolexa_to_fasta_converter.xml" target_datatype="fasta"/> <converter file="fastqsolexa_to_qual_converter.xml" target_datatype="qualsolexa"/> @@ -195,6 +196,7 @@ <sniffer type="galaxy.datatypes.qualityscore:QualityScore454"/> <sniffer type="galaxy.datatypes.sequence:Fasta"/> <sniffer type="galaxy.datatypes.sequence:FastqSolexa"/> + <sniffer type="galaxy.datatypes.sequence:FastqSanger"/> <sniffer type="galaxy.datatypes.interval:Wiggle"/> <sniffer type="galaxy.datatypes.images:Html"/> <sniffer type="galaxy.datatypes.sequence:Axt"/> diff -r f06777cbd5bb -r fab59b1e756d lib/galaxy/datatypes/registry.py --- a/lib/galaxy/datatypes/registry.py Thu Jul 30 11:05:03 2009 -0400 +++ b/lib/galaxy/datatypes/registry.py Thu Jul 30 12:06:24 2009 -0400 @@ -117,6 +117,7 @@ 'customtrack' : interval.CustomTrack(), 'csfasta' : sequence.csFasta(), 'fasta' : sequence.Fasta(), + 'fastqsanger' : sequence.FastqSanger(), 'fastqsolexa' : sequence.FastqSolexa(), 'gff' : interval.Gff(), 'gff3' : interval.Gff3(), @@ -144,6 +145,7 @@ 'customtrack' : 'text/plain', 'csfasta' : 'text/plain', 'fasta' : 'text/plain', + 'fastqsanger' : 'text/plain', 'fastqsolexa' : 'text/plain', 'gff' : 'text/plain', 'gff3' : 'text/plain', @@ -173,6 +175,7 @@ qualityscore.QualityScore454(), sequence.Fasta(), sequence.FastqSolexa(), + sequence.FastqSanger(), interval.Wiggle(), images.Html(), sequence.Axt(), diff -r f06777cbd5bb -r fab59b1e756d lib/galaxy/datatypes/sequence.py --- a/lib/galaxy/datatypes/sequence.py Thu Jul 30 11:05:03 2009 -0400 +++ b/lib/galaxy/datatypes/sequence.py Thu Jul 30 12:06:24 2009 -0400 @@ -176,16 +176,139 @@ except: qscore_int = False + # check length and range of quality scores if qscore_int: if len( headers[3] ) != len( headers[1][0] ): return False + if not self.check_qual_values_within_range(headers[3], 'int'): + return False + try: + if not self.check_qual_values_within_range(headers[7], 'int'): + return False + try: + if not self.check_qual_values_within_range(headers[11], 'int'): + return False + except IndexError: + pass + except IndexError: + pass else: if len( headers[3][0] ) != len( headers[1][0] ): - return False + return False + if not self.check_qual_values_within_range(headers[3][0], 'char'): + return False + try: + if not self.check_qual_values_within_range(headers[7][0], 'char'): + return False + try: + if not self.check_qual_values_within_range(headers[11][0], 'char'): + return False + except IndexError: + pass + except IndexError: + pass return True return False except: return False + def check_qual_values_within_range( self, qual_seq, score_type ): + if score_type == 'char': + for val in qual_seq: + if ord(val) < 59 or ord(val) > 104: + return False + elif score_type == 'int': + for val in qual_seq: + if int(val) < -5 or int(val) > 40: + return False + return True + + +class FastqSanger( Sequence ): + """Class representing a FASTQ sequence ( the Sanger variant )""" + file_ext = "fastqsanger" + + def set_peek( self, dataset ): + if not dataset.dataset.purged: + dataset.peek = data.get_file_peek( dataset.file_name ) + dataset.blurb = data.nice_size( dataset.get_size() ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + def sniff( self, filename ): + """ + Determines whether the file is in fastqsanger format (Sanger Variant) + For details, see http://maq.sourceforge.net/fastq.shtml + + Note: There are two kinds of FASTQ files, known as "Sanger" (sometimes called "Standard") and Solexa + These differ in the representation of the quality scores + + >>> fname = get_test_fname( '1.fastqsanger' ) + >>> FastqSanger().sniff( fname ) + True + >>> fname = get_test_fname( '2.fastqsanger' ) + >>> FastqSanger().sniff( fname ) + True + """ + headers = get_headers( filename, None ) + bases_regexp = re.compile( "^[NGTAC]*$" ) + try: + if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0]: + # Check the sequence line, make sure it contains only G/C/A/T/N + if not bases_regexp.match( headers[1][0] ): + return False + # Check quality score: integer or ascii char. + try: + check = int(headers[3][0]) + qscore_int = True + except: + qscore_int = False + + # check length and range of quality scores + if qscore_int: + if len( headers[3] ) != len( headers[1][0] ): + return False + if not self.check_qual_values_within_range(headers[3], 'int'): + return False + try: + if not self.check_qual_values_within_range(headers[7], 'int'): + return False + try: + if not self.check_qual_values_within_range(headers[11], 'int'): + return False + except IndexError: + pass + except IndexError: + pass + else: + if len( headers[3][0] ) != len( headers[1][0] ): + return False + if not self.check_qual_values_within_range(headers[3][0], 'char'): + return False + try: + if not self.check_qual_values_within_range(headers[7][0], 'char'): + return False + try: + if not self.check_qual_values_within_range(headers[11][0], 'char'): + return False + except IndexError: + pass + except IndexError: + pass + return True + return False + except: + return False + def check_qual_values_within_range( self, qual_seq, score_type ): + if score_type == 'char': + for val in qual_seq: + if ord(val) >= 33 and ord(val) <= 126: + return True + elif score_type == 'int': + for val in qual_seq: + if int(val) >= 0 and int(val) <= 93: + return True + return False try: from galaxy import eggs diff -r f06777cbd5bb -r fab59b1e756d lib/galaxy/datatypes/test/1.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/galaxy/datatypes/test/1.fastqsanger Thu Jul 30 12:06:24 2009 -0400 @@ -0,0 +1,8 @@ +@1831_573_1004/1 +AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG ++ +><C&&9952+C>5<.?<79,=42<292:<(9/-7 +@1831_573_1050/1 +TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT ++ +;@@17?@=>7??@A8?==@4A?A4)&+.'&+'1, \ No newline at end of file diff -r f06777cbd5bb -r fab59b1e756d lib/galaxy/datatypes/test/2.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/galaxy/datatypes/test/2.fastqsanger Thu Jul 30 12:06:24 2009 -0400 @@ -0,0 +1,8 @@ +@1831_573_1004/1 +AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG ++ +29 27 34 5 5 24 24 20 17 10 34 29 20 27 13 30 27 22 24 11 28 19 17 27 17 24 17 25 27 7 24 14 12 22 +@1831_573_1050/1 +TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT ++ +26 31 31 16 22 30 31 28 29 22 30 30 31 32 23 30 28 28 31 19 32 30 32 19 8 5 10 13 6 5 10 6 16 11 \ No newline at end of file diff -r f06777cbd5bb -r fab59b1e756d test-data/1.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/1.fastqsanger Thu Jul 30 12:06:24 2009 -0400 @@ -0,0 +1,8 @@ +@1831_573_1004/1 +AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG ++ +><C&&9952+C>5<.?<79,=42<292:<(9/-7 +@1831_573_1050/1 +TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT ++ +;@@17?@=>7??@A8?==@4A?A4)&+.'&+'1, \ No newline at end of file diff -r f06777cbd5bb -r fab59b1e756d test-data/2.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/2.fastqsanger Thu Jul 30 12:06:24 2009 -0400 @@ -0,0 +1,8 @@ +@1831_573_1004/1 +AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG ++ +29 27 34 5 5 24 24 20 17 10 34 29 20 27 13 30 27 22 24 11 28 19 17 27 17 24 17 25 27 7 24 14 12 22 +@1831_573_1050/1 +TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT ++ +26 31 31 16 22 30 31 28 29 22 30 30 31 32 23 30 28 28 31 19 32 30 32 19 8 5 10 13 6 5 10 6 16 11 \ No newline at end of file diff -r f06777cbd5bb -r fab59b1e756d test/functional/test_sniffing_and_metadata_settings.py --- a/test/functional/test_sniffing_and_metadata_settings.py Thu Jul 30 11:05:03 2009 -0400 +++ b/test/functional/test_sniffing_and_metadata_settings.py Thu Jul 30 12:06:24 2009 -0400 @@ -216,6 +216,16 @@ assert latest_hda is not None, "Problem retrieving wig hda from the database" if not latest_hda.name == '1.wig' and not latest_hda.extension == 'wig': raise AssertionError, "wig data type was not correctly sniffed." + def test_085_fastqsanger_datatype( self ): + """Testing correctly sniffing fastqsanger ( the Sanger variant ) data type upon upload""" + self.upload_file( '1.fastqsanger' ) + self.verify_dataset_correctness( '1.fastqsanger' ) + self.check_history_for_string( '1.fastqsanger format: <span class="fastqsanger">fastqsanger</span>, database: \? Info: uploaded fastqsanger file' ) + latest_hda = galaxy.model.HistoryDatasetAssociation.query() \ + .order_by( desc( galaxy.model.HistoryDatasetAssociation.table.c.create_time ) ).first() + assert latest_hda is not None, "Problem retrieving fastqsanger hda from the database" + if not latest_hda.name == '1.fastqsanger' and not latest_hda.extension == 'fastqsanger': + raise AssertionError, "fastqsanger data type was not correctly sniffed." def test_9999_clean_up( self ): self.delete_history( id=self.security.encode_id( history1.id ) ) self.logout() diff -r f06777cbd5bb -r fab59b1e756d tools/sr_mapping/bwa_wrapper.xml --- a/tools/sr_mapping/bwa_wrapper.xml Thu Jul 30 11:05:03 2009 -0400 +++ b/tools/sr_mapping/bwa_wrapper.xml Thu Jul 30 12:06:24 2009 -0400 @@ -71,7 +71,7 @@ <option value="history">Use one from the history</option> </param> <when value="history"> - <param name="ownFile" type="data" label="Select a reference genome" /> + <param name="ownFile" type="data" format="fasta" label="Select a reference genome" /> </when> <when value="indexed"> <param name="indices" type="select" label="Select a reference genome"> @@ -91,7 +91,7 @@ <option value="history">Use one from the history</option> </param> <when value="history"> - <param name="ownFile" type="data" label="Select a reference genome" /> + <param name="ownFile" type="data" format="fasta" label="Select a reference genome" /> </when> <when value="indexed"> <param name="indices" type="select" label="Select a reference genome"> @@ -111,11 +111,11 @@ <option value="paired">Paired-end</option> </param> <when value="single"> - <param name="input1" type="data" label="FASTQ file" /> + <param name="input1" type="data" format="fastqsanger" label="FASTQ file" /> </when> <when value="paired"> - <param name="input1" type="data" label="Forward FASTQ file" /> - <param name="input2" type="data" label="Reverse FASTQ file" /> + <param name="input1" type="data" format="fastqsanger" label="Forward FASTQ file" /> + <param name="input2" type="data" format="fastqsanger" label="Reverse FASTQ file" /> </when> </conditional> <conditional name="params">