details: http://www.bx.psu.edu/hg/galaxy/rev/6252781aa157 changeset: 2841:6252781aa157 user: Kelly Vincent <kpvincent@bx.psu.edu> date: Wed Oct 07 12:43:15 2009 -0400 description: Added fastq (generic) datatype and deleted fastqsolexa datatype 8 file(s) affected in this change: datatypes_conf.xml.sample lib/galaxy/datatypes/registry.py lib/galaxy/datatypes/sequence.py lib/galaxy/datatypes/test/1.fastq lib/galaxy/datatypes/test/2.fastq test-data/1.fastq test-data/2gen.fastq test/functional/test_sniffing_and_metadata_settings.py diffs (380 lines): diff -r ecb6d86a5a9c -r 6252781aa157 datatypes_conf.xml.sample --- a/datatypes_conf.xml.sample Wed Oct 07 11:48:30 2009 -0400 +++ b/datatypes_conf.xml.sample Wed Oct 07 12:43:15 2009 -0400 @@ -22,11 +22,8 @@ <datatype extension="fasta" type="galaxy.datatypes.sequence:Fasta" display_in_upload="true"> <converter file="fasta_to_tabular_converter.xml" target_datatype="tabular"/> </datatype> + <datatype extension="fastq" type="galaxy.datatypes.sequence:Fastq" display_in_upload="true"/> <datatype extension="fastqsanger" type="galaxy.datatypes.sequence:FastqSanger" display_in_upload="true"/> - <datatype extension="fastqsolexa" type="galaxy.datatypes.sequence:FastqSolexa" display_in_upload="true"> - <converter file="fastqsolexa_to_fasta_converter.xml" target_datatype="fasta"/> - <converter file="fastqsolexa_to_qual_converter.xml" target_datatype="qualsolexa"/> - </datatype> <datatype extension="genetrack" type="galaxy.datatypes.tracks:GeneTrack"/> <datatype extension="gff" type="galaxy.datatypes.interval:Gff" display_in_upload="true"> <converter file="gff_to_bed_converter.xml" target_datatype="bed"/> @@ -200,8 +197,8 @@ <sniffer type="galaxy.datatypes.qualityscore:QualityScoreSOLiD"/> <sniffer type="galaxy.datatypes.qualityscore:QualityScore454"/> <sniffer type="galaxy.datatypes.sequence:Fasta"/> - <sniffer type="galaxy.datatypes.sequence:FastqSolexa"/> <sniffer type="galaxy.datatypes.sequence:FastqSanger"/> + <sniffer type="galaxy.datatypes.sequence:Fastq"/> <sniffer type="galaxy.datatypes.interval:Wiggle"/> <sniffer type="galaxy.datatypes.images:Html"/> <sniffer type="galaxy.datatypes.sequence:Axt"/> diff -r ecb6d86a5a9c -r 6252781aa157 lib/galaxy/datatypes/registry.py --- a/lib/galaxy/datatypes/registry.py Wed Oct 07 11:48:30 2009 -0400 +++ b/lib/galaxy/datatypes/registry.py Wed Oct 07 12:43:15 2009 -0400 @@ -119,8 +119,8 @@ 'customtrack' : interval.CustomTrack(), 'csfasta' : sequence.csFasta(), 'fasta' : sequence.Fasta(), + 'fastq' : sequence.Fastq(), 'fastqsanger' : sequence.FastqSanger(), - 'fastqsolexa' : sequence.FastqSolexa(), 'gff' : interval.Gff(), 'gff3' : interval.Gff3(), 'genetrack' : tracks.GeneTrack(), @@ -149,8 +149,8 @@ 'customtrack' : 'text/plain', 'csfasta' : 'text/plain', 'fasta' : 'text/plain', + 'fastq' : 'text/plain', 'fastqsanger' : 'text/plain', - 'fastqsolexa' : 'text/plain', 'gff' : 'text/plain', 'gff3' : 'text/plain', 'interval' : 'text/plain', @@ -179,8 +179,8 @@ qualityscore.QualityScoreSOLiD(), qualityscore.QualityScore454(), sequence.Fasta(), - sequence.FastqSolexa(), sequence.FastqSanger(), + sequence.Fastq(), interval.Wiggle(), images.Html(), sequence.Axt(), diff -r ecb6d86a5a9c -r 6252781aa157 lib/galaxy/datatypes/sequence.py --- a/lib/galaxy/datatypes/sequence.py Wed Oct 07 11:48:30 2009 -0400 +++ b/lib/galaxy/datatypes/sequence.py Wed Oct 07 12:43:15 2009 -0400 @@ -1,5 +1,5 @@ """ -Image classes +Sequence classes """ import data @@ -134,10 +134,10 @@ pass return False -class FastqSolexa( Sequence ): - """Class representing a FASTQ sequence ( the Solexa variant )""" - file_ext = "fastqsolexa" - +class Fastq ( Sequence ): + """Class representing a generic FASTQ sequence""" + file_ext = "fastq" + def set_peek( self, dataset ): if not dataset.dataset.purged: dataset.peek = data.get_file_peek( dataset.file_name ) @@ -145,102 +145,46 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' - - def sniff( self, filename ): + + def sniff ( self, filename ): """ - Determines whether the file is in fastqsolexa format (Solexa Variant) + Determines whether the file is in generic fastq format For details, see http://maq.sourceforge.net/fastq.shtml - Note: There are two kinds of FASTQ files, known as "Sanger" (sometimes called "Standard") and Solexa + Note: There are three kinds of FASTQ files, known as "Sanger" (sometimes called "Standard"), Solexa, and Illumina These differ in the representation of the quality scores - >>> fname = get_test_fname( '1.fastqsolexa' ) - >>> FastqSolexa().sniff( fname ) + >>> fname = get_test_fname( '1.fastqsanger' ) + >>> Fastq().sniff( fname ) True - >>> fname = get_test_fname( '2.fastqsolexa' ) - >>> FastqSolexa().sniff( fname ) + >>> fname = get_test_fname( '2.fastqsanger' ) + >>> Fastq().sniff( fname ) True """ headers = get_headers( filename, None ) - bases_regexp = re.compile( "^[NGTAC]*$" ) + bases_regexp = re.compile( "^[NGTAC]*" ) + # check that first block looks like a fastq block try: if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0]: # Check the sequence line, make sure it contains only G/C/A/T/N if not bases_regexp.match( headers[1][0] ): return False - - # Check quality score: integer or ascii char. - try: - check = int(headers[3][0]) - qscore_int = True - except: - qscore_int = False - - # check length and range of quality scores - if qscore_int: - if len( headers[3] ) != len( headers[1][0] ): - return False - if not self.check_qual_values_within_range(headers[3], 'int'): - return False - try: - if not self.check_qual_values_within_range(headers[7], 'int'): - return False - try: - if not self.check_qual_values_within_range(headers[11], 'int'): - return False - except IndexError: - pass - except IndexError: - pass - else: - if len( headers[3][0] ) != len( headers[1][0] ): - return False - if not self.check_qual_values_within_range(headers[3][0], 'char'): - return False - try: - if not self.check_qual_values_within_range(headers[7][0], 'char'): - return False - try: - if not self.check_qual_values_within_range(headers[11][0], 'char'): - return False - except IndexError: - pass - except IndexError: - pass return True return False except: return False - def check_qual_values_within_range( self, qual_seq, score_type ): - if score_type == 'char': - for val in qual_seq: - if ord(val) < 59 or ord(val) > 104: - return False - elif score_type == 'int': - for val in qual_seq: - if int(val) < -5 or int(val) > 40: - return False - return True - -class FastqSanger( Sequence ): + +class FastqSanger( Fastq ): """Class representing a FASTQ sequence ( the Sanger variant )""" file_ext = "fastqsanger" - - def set_peek( self, dataset ): - if not dataset.dataset.purged: - dataset.peek = data.get_file_peek( dataset.file_name ) - dataset.blurb = data.nice_size( dataset.get_size() ) - else: - dataset.peek = 'file does not exist' - dataset.blurb = 'file purged from disk' def sniff( self, filename ): """ Determines whether the file is in fastqsanger format (Sanger Variant) For details, see http://maq.sourceforge.net/fastq.shtml - Note: There are two kinds of FASTQ files, known as "Sanger" (sometimes called "Standard") and Solexa + Note: There are three kinds of FASTQ files, known as "Sanger" (sometimes called "Standard"), Solexa, and Illumina These differ in the representation of the quality scores >>> fname = get_test_fname( '1.fastqsanger' ) @@ -254,60 +198,33 @@ bases_regexp = re.compile( "^[NGTAC]*$" ) try: if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0]: - # Check the sequence line, make sure it contains only G/C/A/T/N - if not bases_regexp.match( headers[1][0] ): - return False - # Check quality score: integer or ascii char. - try: - check = int(headers[3][0]) - qscore_int = True - except: - qscore_int = False - - # check length and range of quality scores - if qscore_int: - if len( headers[3] ) != len( headers[1][0] ): - return False - if not self.check_qual_values_within_range(headers[3], 'int'): - return False + # look through first 20 blocks and make sure bases valid and qualities valid + for i in range( 1, 80, 4 ): try: - if not self.check_qual_values_within_range(headers[7], 'int'): + # check that bases are legitimate + if not bases_regexp.match( headers[i][0] ): return False - try: - if not self.check_qual_values_within_range(headers[11], 'int'): - return False - except IndexError: - pass + # check length of qualities (matching bases) + if len( headers[i+2][0] ) != len( headers[1][0] ): + return False + # check qualities within fastqsanger range + if not self.check_qual_values_within_range( headers[i+2][0] ): + return False except IndexError: pass - else: - if len( headers[3][0] ) != len( headers[1][0] ): - return False - if not self.check_qual_values_within_range(headers[3][0], 'char'): - return False - try: - if not self.check_qual_values_within_range(headers[7][0], 'char'): - return False - try: - if not self.check_qual_values_within_range(headers[11][0], 'char'): - return False - except IndexError: - pass - except IndexError: - pass - return True - return False + return True + return False except: return False - def check_qual_values_within_range( self, qual_seq, score_type ): - if score_type == 'char': - for val in qual_seq: - if ord(val) >= 33 and ord(val) <= 126: - return True - elif score_type == 'int': - for val in qual_seq: - if int(val) >= 0 and int(val) <= 93: - return True + def check_qual_values_within_range( self, qual_seq ): + under59 = False + for val in qual_seq: + if ord(val) < 33 or ord(val) > 126: + return False + if not under59 and ord(val) < 59: + under59 = True + if under59: + return True return False try: @@ -521,7 +438,7 @@ >>> fname = get_test_fname( 'alignment.lav' ) >>> Axt().sniff( fname ) False - """ + """ headers = get_headers( filename, None ) if len(headers) < 4: return False diff -r ecb6d86a5a9c -r 6252781aa157 lib/galaxy/datatypes/test/1.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/galaxy/datatypes/test/1.fastq Wed Oct 07 12:43:15 2009 -0400 @@ -0,0 +1,8 @@ +@HANNIBAL_1_FC302VTAAXX:2:1:228:167 +GAATTGATCAGGACATAGGACAACTGTAGGCACCAT ++HANNIBAL_1_FC302VTAAXX:2:1:228:167 +40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 +@HANNIBAL_1_FC302VTAAXX:2:1:156:340 +GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG ++HANNIBAL_1_FC302VTAAXX:2:1:156:340 +40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 \ No newline at end of file diff -r ecb6d86a5a9c -r 6252781aa157 lib/galaxy/datatypes/test/2.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/galaxy/datatypes/test/2.fastq Wed Oct 07 12:43:15 2009 -0400 @@ -0,0 +1,8 @@ +@seq1 +GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT ++seq1 +hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh +@seq2 +GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG ++seq2 +hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO \ No newline at end of file diff -r ecb6d86a5a9c -r 6252781aa157 test-data/1.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/1.fastq Wed Oct 07 12:43:15 2009 -0400 @@ -0,0 +1,8 @@ +@HANNIBAL_1_FC302VTAAXX:2:1:228:167 +GAATTGATCAGGACATAGGACAACTGTAGGCACCAT ++HANNIBAL_1_FC302VTAAXX:2:1:228:167 +40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 +@HANNIBAL_1_FC302VTAAXX:2:1:156:340 +GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG ++HANNIBAL_1_FC302VTAAXX:2:1:156:340 +40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 \ No newline at end of file diff -r ecb6d86a5a9c -r 6252781aa157 test-data/2gen.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/2gen.fastq Wed Oct 07 12:43:15 2009 -0400 @@ -0,0 +1,8 @@ +@seq1 +GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT ++seq1 +hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh +@seq2 +GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG ++seq2 +hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO \ No newline at end of file diff -r ecb6d86a5a9c -r 6252781aa157 test/functional/test_sniffing_and_metadata_settings.py --- a/test/functional/test_sniffing_and_metadata_settings.py Wed Oct 07 11:48:30 2009 -0400 +++ b/test/functional/test_sniffing_and_metadata_settings.py Wed Oct 07 12:43:15 2009 -0400 @@ -81,16 +81,6 @@ assert latest_hda is not None, "Problem retrieving fasta hda from the database" if not latest_hda.name == '1.fasta' and not latest_hda.extension == 'fasta': raise AssertionError, "fasta data type was not correctly sniffed." - def test_030_fastqsolexa_datatype( self ): - """Testing correctly sniffing fastqsolexa ( the Solexa variant ) data type upon upload""" - self.upload_file( '1.fastqsolexa' ) - self.verify_dataset_correctness( '1.fastqsolexa' ) - self.check_history_for_string( '1.fastqsolexa format: <span class="fastqsolexa">fastqsolexa</span>, database: \? Info: uploaded fastqsolexa file' ) - latest_hda = galaxy.model.HistoryDatasetAssociation.query() \ - .order_by( desc( galaxy.model.HistoryDatasetAssociation.table.c.create_time ) ).first() - assert latest_hda is not None, "Problem retrieving fastqsolexa hda from the database" - if not latest_hda.name == '1.fastqsolexa' and not latest_hda.extension == 'fastqsolexa': - raise AssertionError, "fastqsolexa data type was not correctly sniffed." def test_035_gff_datatype( self ): """Testing correctly sniffing gff data type upon upload""" self.upload_file( '5.gff' ) @@ -236,6 +226,16 @@ assert latest_hda is not None, "Problem retrieving sam hda from the database" if not latest_hda.name == '1.sam' and not latest_hda.extension == 'sam': raise AssertionError, "sam data type was not correctly sniffed." + def test_095_fastq_datatype( self ): + """Testing correctly sniffing fastq ( generic ) data type upon upload""" + self.upload_file( '2gen.fastq' ) + self.verify_dataset_correctness( '2gen.fastq' ) + self.check_history_for_string( '2gen.fastq format: <span class="fastq">fastq</span>, database: \? Info: uploaded fastq file' ) + latest_hda = galaxy.model.HistoryDatasetAssociation.query() \ + .order_by( desc( galaxy.model.HistoryDatasetAssociation.table.c.create_time ) ).first() + assert latest_hda is not None, "Problem retrieving fastq hda from the database" + if not latest_hda.name == '2gen.fastq' and not latest_hda.extension == 'fastq': + raise AssertionError, "fastq data type was not correctly sniffed." def test_9999_clean_up( self ): self.delete_history( id=self.security.encode_id( history1.id ) ) self.logout()