[hg] galaxy 2380: Replacing "qual" datatype with 3 new datatypes: qualsolid, qual454 and qualsolexa
details: http://www.bx.psu.edu/hg/galaxy/rev/22b08d47f7ba changeset: 2380:22b08d47f7ba user: guru date: Wed Apr 29 11:53:22 2009 -0400 description: Replacing \"qual\" datatype with 3 new datatypes: qualsolid, qual454 and qualsolexa. 10 file(s) affected in this change: datatypes_conf.xml.sample lib/galaxy/datatypes/converters/fastqsolexa_to_qual_converter.xml lib/galaxy/datatypes/qualityscore.py lib/galaxy/datatypes/registry.py lib/galaxy/model/migrate/versions/0006_change_qual_datatype.py tools/metag_tools/fastqsolexa_to_fasta_qual.xml tools/metag_tools/rmapq_wrapper.xml tools/metag_tools/short_reads_figure_high_quality_length.xml tools/metag_tools/short_reads_figure_score.xml tools/metag_tools/short_reads_trim_seq.xml diffs (409 lines): diff -r 5a4aac327bad -r 22b08d47f7ba datatypes_conf.xml.sample --- a/datatypes_conf.xml.sample Wed Apr 29 10:27:57 2009 -0400 +++ b/datatypes_conf.xml.sample Wed Apr 29 11:53:22 2009 -0400 @@ -19,7 +19,7 @@ </datatype> <datatype extension="fastqsolexa" type="galaxy.datatypes.sequence:FastqSolexa" display_in_upload="true"> <converter file="fastqsolexa_to_fasta_converter.xml" target_datatype="fasta"/> - <converter file="fastqsolexa_to_qual_converter.xml" target_datatype="qual"/> + <converter file="fastqsolexa_to_qual_converter.xml" target_datatype="qualsolexa"/> </datatype> <datatype extension="genetrack" type="galaxy.datatypes.tracks:GeneTrack"/> <datatype extension="gff" type="galaxy.datatypes.interval:Gff" display_in_upload="true"> @@ -42,7 +42,9 @@ </datatype> <datatype extension="pdf" type="galaxy.datatypes.images:Image" mimetype="application/pdf"/> <datatype extension="png" type="galaxy.datatypes.images:Image" mimetype="image/png"/> - <datatype extension="qual" type="galaxy.datatypes.qualityscore:QualityScore" display_in_upload="true"/> + <datatype extension="qualsolexa" type="galaxy.datatypes.qualityscore:QualityScoreSolexa" display_in_upload="true"/> + <datatype extension="qualsolid" 
type="galaxy.datatypes.qualityscore:QualityScoreSOLiD" display_in_upload="true"/> + <datatype extension="qual454" type="galaxy.datatypes.qualityscore:QualityScore454" display_in_upload="true"/> <datatype extension="scf" type="galaxy.datatypes.images:Scf" mimetype="application/octet-stream" display_in_upload="true"/> <datatype extension="taxonomy" type="galaxy.datatypes.tabular:Taxonomy" display_in_upload="true"/> <datatype extension="tabular" type="galaxy.datatypes.tabular:Tabular" display_in_upload="true"/> @@ -186,7 +188,9 @@ <sniffer type="galaxy.datatypes.sequence:Maf"/> <sniffer type="galaxy.datatypes.sequence:Lav"/> <sniffer type="galaxy.datatypes.sequence:csFasta"/> - <sniffer type="galaxy.datatypes.qualityscore:QualityScore"/> + <sniffer type="galaxy.datatypes.qualityscore:QualityScoreSolexa"/> + <sniffer type="galaxy.datatypes.qualityscore:QualityScoreSOLiD"/> + <sniffer type="galaxy.datatypes.qualityscore:QualityScore454"/> <sniffer type="galaxy.datatypes.sequence:Fasta"/> <sniffer type="galaxy.datatypes.sequence:FastqSolexa"/> <sniffer type="galaxy.datatypes.interval:Wiggle"/> diff -r 5a4aac327bad -r 22b08d47f7ba lib/galaxy/datatypes/converters/fastqsolexa_to_qual_converter.xml --- a/lib/galaxy/datatypes/converters/fastqsolexa_to_qual_converter.xml Wed Apr 29 10:27:57 2009 -0400 +++ b/lib/galaxy/datatypes/converters/fastqsolexa_to_qual_converter.xml Wed Apr 29 11:53:22 2009 -0400 @@ -4,7 +4,7 @@ <param format="fastqsolexa" name="input1" type="data" label="Choose Fastqsolexa file"/> </inputs> <outputs> - <data format="qual" name="output1" /> + <data format="qualsolexa" name="output1" /> </outputs> <help> </help> diff -r 5a4aac327bad -r 22b08d47f7ba lib/galaxy/datatypes/qualityscore.py --- a/lib/galaxy/datatypes/qualityscore.py Wed Apr 29 10:27:57 2009 -0400 +++ b/lib/galaxy/datatypes/qualityscore.py Wed Apr 29 11:53:22 2009 -0400 @@ -9,11 +9,11 @@ log = logging.getLogger(__name__) -class QualityScore ( data.Text ): +class QualityScoreSOLiD ( data.Text ): 
""" until we know more about quality score formats """ - file_ext = "qual" + file_ext = "qualsolid" def set_peek( self, dataset, line_count=None ): if not dataset.dataset.purged: @@ -21,7 +21,7 @@ if line_count is None: dataset.blurb = data.nice_size( dataset.get_size() ) else: - dataset.blurb = "%s lines, Quality score file" % util.commaify( str( line_count ) ) + dataset.blurb = "%s lines, SOLiD Quality score file" % util.commaify( str( line_count ) ) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' @@ -30,15 +30,80 @@ try: return dataset.peek except: - return "Quality score file (%s)" % ( data.nice_size( dataset.get_size() ) ) + return "SOLiD Quality score file (%s)" % ( data.nice_size( dataset.get_size() ) ) def sniff( self, filename ): """ >>> fname = get_test_fname( 'sequence.fasta' ) - >>> QualityScore().sniff( fname ) + >>> QualityScoreSOLiD().sniff( fname ) False - >>> fname = get_test_fname( 'sequence.qual' ) - >>> QualityScore().sniff( fname ) + >>> fname = get_test_fname( 'sequence.qualsolid' ) + >>> QualityScoreSOLiD().sniff( fname ) + True + """ + try: + fh = open( filename ) + readlen = None + goodblock = 0 + while True: + line = fh.readline() + if not line: + if goodblock > 0: + return True + else: + break #EOF + line = line.strip() + if line and not line.startswith( '#' ): #first non-empty non-comment line + if line.startswith( '>' ): + line = fh.readline().strip() + if line == '' or line.startswith( '>' ): + break + try: + [ int( x ) for x in line.split() ] + if not(readlen): + readlen = len(line.split()) + assert len(line.split()) == readlen #SOLiD reads should be of the same length + except: + break + goodblock += 1 + if goodblock > 10: + return True + else: + break #we found a non-empty line, but it's not a header + except: + pass + return False + +class QualityScore454 ( data.Text ): + """ + until we know more about quality score formats + """ + file_ext = "qual454" + + def set_peek( self, dataset, 
line_count=None ): + if not dataset.dataset.purged: + dataset.peek = data.get_file_peek( dataset.file_name ) + if line_count is None: + dataset.blurb = data.nice_size( dataset.get_size() ) + else: + dataset.blurb = "%s lines, 454 Quality score file" % util.commaify( str( line_count ) ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + def display_peek(self, dataset): + try: + return dataset.peek + except: + return "454 Quality score file (%s)" % ( data.nice_size( dataset.get_size() ) ) + + def sniff( self, filename ): + """ + >>> fname = get_test_fname( 'sequence.fasta' ) + >>> QualityScore454().sniff( fname ) + False + >>> fname = get_test_fname( 'sequence.qual454' ) + >>> QualityScore454().sniff( fname ) True """ try: @@ -63,3 +128,59 @@ except: pass return False + +class QualityScoreSolexa ( data.Text ): + """ + until we know more about quality score formats + """ + file_ext = "qualsolexa" + + def set_peek( self, dataset, line_count=None ): + if not dataset.dataset.purged: + dataset.peek = data.get_file_peek( dataset.file_name ) + if line_count is None: + dataset.blurb = data.nice_size( dataset.get_size() ) + else: + dataset.blurb = "%s lines, Solexa Quality score file" % util.commaify( str( line_count ) ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + def display_peek(self, dataset): + try: + return dataset.peek + except: + return "Solexa Quality score file (%s)" % ( data.nice_size( dataset.get_size() ) ) + + def sniff( self, filename ): + """ + >>> fname = get_test_fname( 'sequence.fasta' ) + >>> QualityScoreSolexa().sniff( fname ) + False + >>> fname = get_test_fname( 'sequence.qualsolexa' ) + >>> QualityScoreSolexa().sniff( fname ) + True + """ + try: + fh = open( filename ) + readlen = None + while True: + line = fh.readline() + if not line: + break #EOF + line = line.strip() + if line and not line.startswith( '#' ): + if len(line.split('\t')) > 1: + break + try: + [ 
int( x ) for x in line.split() ] + if not(readlen): + readlen = len(line.split()) + assert len(line.split()) == readlen #Solexa reads should be of the same length + except: + break + + except: + pass + return False + diff -r 5a4aac327bad -r 22b08d47f7ba lib/galaxy/datatypes/registry.py --- a/lib/galaxy/datatypes/registry.py Wed Apr 29 10:27:57 2009 -0400 +++ b/lib/galaxy/datatypes/registry.py Wed Apr 29 11:53:22 2009 -0400 @@ -116,7 +116,9 @@ 'laj' : images.Laj(), 'lav' : sequence.Lav(), 'maf' : sequence.Maf(), - 'qual' : qualityscore.QualityScore(), + 'qualsolid' : qualityscore.QualityScoreSOLiD(), + 'qualsolexa' : qualityscore.QualityScoreSolexa(), + 'qual454' : qualityscore.QualityScore454(), 'scf' : images.Scf(), 'tabular' : tabular.Tabular(), 'taxonomy' : tabular.Taxonomy(), @@ -140,7 +142,9 @@ 'laj' : 'text/plain', 'lav' : 'text/plain', 'maf' : 'text/plain', - 'qual' : 'text/plain', + 'qualsolid' : 'text/plain', + 'qualsolexa' : 'text/plain', + 'qual454' : 'text/plain', 'scf' : 'application/octet-stream', 'tabular' : 'text/plain', 'taxonomy' : 'text/plain', diff -r 5a4aac327bad -r 22b08d47f7ba lib/galaxy/model/migrate/versions/0006_change_qual_datatype.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/galaxy/model/migrate/versions/0006_change_qual_datatype.py Wed Apr 29 11:53:22 2009 -0400 @@ -0,0 +1,47 @@ +from sqlalchemy import * +from sqlalchemy.orm import * +from migrate import * +import sys, logging + +log = logging.getLogger( __name__ ) +log.setLevel(logging.DEBUG) +handler = logging.StreamHandler( sys.stdout ) +format = "%(name)s %(levelname)s %(asctime)s %(message)s" +formatter = logging.Formatter( format ) +handler.setFormatter( formatter ) +log.addHandler( handler ) + +metadata = MetaData( migrate_engine ) +db_session = scoped_session( sessionmaker( bind=migrate_engine, autoflush=False, transactional=False ) ) +HistoryDatasetAssociation_table = Table( "history_dataset_association", metadata, autoload=True ) + +def upgrade(): + # Load 
existing tables + metadata.reflect() + # Add 2 indexes to the galaxy_user table + i = Index( 'ix_hda_extension', HistoryDatasetAssociation_table.c.extension ) + try: + i.create() + except Exception, e: + log.debug( "Adding index 'ix_hda_extension' to history_dataset_association table failed: %s" % ( str( e ) ) ) + + # Set the default data in the galaxy_user table, but only for null values + cmd = "UPDATE history_dataset_association SET extension = 'qual454' WHERE extension = 'qual' and peek like \'>%%\'" + try: + db_session.execute( cmd ) + except Exception, e: + log.debug( "Resetting extension qual to qual454 in history_dataset_association failed: %s" % ( str( e ) ) ) + cmd = "UPDATE history_dataset_association SET extension = 'qualsolexa' WHERE extension = 'qual' and peek not like \'>%%\'" + try: + db_session.execute( cmd ) + except Exception, e: + log.debug( "Resetting extension qual to qualsolexa in history_dataset_association failed: %s" % ( str( e ) ) ) + # Add 1 index to the history_dataset_association table + try: + i.drop() + except Exception, e: + log.debug( "Dropping index 'ix_hda_extension' to history_dataset_association table failed: %s" % ( str( e ) ) ) + + +def downgrade(): + pass diff -r 5a4aac327bad -r 22b08d47f7ba tools/metag_tools/fastqsolexa_to_fasta_qual.xml --- a/tools/metag_tools/fastqsolexa_to_fasta_qual.xml Wed Apr 29 10:27:57 2009 -0400 +++ b/tools/metag_tools/fastqsolexa_to_fasta_qual.xml Wed Apr 29 11:53:22 2009 -0400 @@ -6,7 +6,7 @@ </inputs> <outputs> <data name="output1" format="fasta"/> - <data name="output2" format="qual"/> + <data name="output2" format="qualsolexa"/> </outputs> <tests> <!-- NOTE: this tool generates 2 output files, but our functional tests currently only handle the last one generated --> diff -r 5a4aac327bad -r 22b08d47f7ba tools/metag_tools/rmapq_wrapper.xml --- a/tools/metag_tools/rmapq_wrapper.xml Wed Apr 29 10:27:57 2009 -0400 +++ b/tools/metag_tools/rmapq_wrapper.xml Wed Apr 29 11:53:22 2009 -0400 @@ -13,7 
+13,7 @@ </options> </param> <param name="input_seq" type="data" format="fasta" label="Sequence file"/> - <param name="input_score" type="data" format="qual" label="Quality score file"/> + <param name="input_score" type="data" format="qualsolexa" label="Quality score file"/> <param name="high_score" type="float" size="15" value="40" label="Minimum score for high-quality base (-q)"/> <param name="high_len" type="integer" size="15" value="36" label="Minimal high-quality bases (-M)"/> <param name="align_len" type="integer" size="15" value="11" label="Minimal length of a hit (-h)" help="seed"/> @@ -46,7 +46,7 @@ <test> <param name="database" value="/depot/data2/galaxy/faseq/test" /> <param name="input_seq" value="rmapq_wrapper_test1.fasta" ftype="fasta"/> - <param name="input_score" value="rmapq_wrapper_test1.qual" ftype="qual" /> + <param name="input_score" value="rmapq_wrapper_test1.qual" ftype="qualsolexa" /> <param name="high_score" value="40" /> <param name="high_len" value="36" /> <param name="read_len" value="36" /> diff -r 5a4aac327bad -r 22b08d47f7ba tools/metag_tools/short_reads_figure_high_quality_length.xml --- a/tools/metag_tools/short_reads_figure_high_quality_length.xml Wed Apr 29 10:27:57 2009 -0400 +++ b/tools/metag_tools/short_reads_figure_high_quality_length.xml Wed Apr 29 11:53:22 2009 -0400 @@ -5,7 +5,7 @@ <inputs> <page> - <param name="input1" type="data" format="qual,txtseq.zip" label="Quality score file" help="No dataset? Read tip below"/> + <param name="input1" type="data" format="qualsolexa,qual454,txtseq.zip" label="Quality score file" help="No dataset? 
Read tip below"/> <param name="input2" type="integer" size="5" value="20" label="Quality score threshold" /> </page> </inputs> @@ -17,12 +17,12 @@ </requirements> <tests> <test> - <param name="input1" value="solexa.qual" ftype="qual" /> + <param name="input1" value="solexa.qual" ftype="qualsolexa" /> <param name="input2" value="5" /> <output name="output1" file="solexa_high_quality_hist.pdf" ftype="pdf"/> </test> <test> - <param name="input1" value="454.qual" ftype="qual" /> + <param name="input1" value="454.qual" ftype="qual454" /> <param name="input2" value="5" /> <output name="output1" file="454_high_quality_hist.pdf" ftype="pdf"/> </test> diff -r 5a4aac327bad -r 22b08d47f7ba tools/metag_tools/short_reads_figure_score.xml --- a/tools/metag_tools/short_reads_figure_score.xml Wed Apr 29 10:27:57 2009 -0400 +++ b/tools/metag_tools/short_reads_figure_score.xml Wed Apr 29 11:53:22 2009 -0400 @@ -5,7 +5,7 @@ <inputs> <page> - <param name="input1" type="data" format="qual, txtseq.zip" label="Quality score file" help="No dataset? Read tip below"/> + <param name="input1" type="data" format="qualsolexa, qual454, txtseq.zip" label="Quality score file" help="No dataset? 
Read tip below"/> </page> </inputs> @@ -17,11 +17,11 @@ </requirements> <tests> <test> - <param name="input1" value="solexa.qual" ftype="qual" /> + <param name="input1" value="solexa.qual" ftype="qualsolexa" /> <output name="output1" file="solexaScore.png" ftype="png" /> </test> <test> - <param name="input1" value="454.qual" ftype="qual" /> + <param name="input1" value="454.qual" ftype="qual454" /> <output name="output1" file="454Score.png" ftype="png" /> </test> </tests> diff -r 5a4aac327bad -r 22b08d47f7ba tools/metag_tools/short_reads_trim_seq.xml --- a/tools/metag_tools/short_reads_trim_seq.xml Wed Apr 29 10:27:57 2009 -0400 +++ b/tools/metag_tools/short_reads_trim_seq.xml Wed Apr 29 11:53:22 2009 -0400 @@ -7,7 +7,7 @@ <inputs> <page> <param name="input1" type="data" format="fasta,txtseq.zip" label="Reads" /> - <param name="input2" type="data" format="qual,txtseq.zip" label="Quality scores" /> + <param name="input2" type="data" format="qualsolexa,qual454,txtseq.zip" label="Quality scores" /> <param name="trim" type="integer" size="5" value="20" label="Minimal quality score" help="bases scoring below this value will trigger splitting"/> <param name="length" type="integer" size="5" value="100" label="Minimal length of contiguous segment" help="report all high quality segments above this length. 
Setting this option to '0' will cause the program to return a single longest run of high quality bases per read" /> <conditional name="sequencing_method_choice"> @@ -36,7 +36,7 @@ <test> <param name="sequencer" value="454" /> <param name="input1" value="454.fasta" ftype="fasta" /> - <param name="input2" value="454.qual" ftype="qual" /> + <param name="input2" value="454.qual" ftype="qual454" /> <param name="input3" value="no" /> <param name="trim" value="20" /> <param name="length" value="0" /> @@ -45,7 +45,7 @@ <test> <param name="sequencer" value="Solexa" /> <param name="input1" value="solexa.fasta" ftype="fasta" /> - <param name="input2" value="solexa.qual" ftype="qual" /> + <param name="input2" value="solexa.qual" ftype="qualsolexa" /> <param name="input3" value="0" /> <param name="trim" value="20" /> <param name="length" value="0" />
participants (1)
-
Greg Von Kuster