commit/galaxy-central: dan: Allow FASTQ Groomer tool to work on Color Space files that contain a fake/dummy quality score for the adapter base (e.g. files obtained from the SRA). The Groomer will remove the dummy/fake quality score from the read.
1 new changeset in galaxy-central:

http://bitbucket.org/galaxy/galaxy-central/changeset/77b20b28d89c/
changeset: 77b20b28d89c
user: dan
date: 2011-08-22 22:07:23
summary: Allow FASTQ Groomer tool to work on Color Space files that contain a fake/dummy quality score for the adapter base (e.g. files obtained from the SRA). The Groomer will remove the dummy/fake quality score from the read.
affected #: 4 files (1.6 KB)

--- a/lib/galaxy_utils/sequence/fastq.py Mon Aug 22 14:58:51 2011 -0400
+++ b/lib/galaxy_utils/sequence/fastq.py Mon Aug 22 16:07:23 2011 -0400
@@ -153,6 +153,8 @@
             rval.quality = reversed( rval.get_decimal_quality_scores() )
             rval.quality = "%s " % " ".join( map( str, rval.quality ) )
         return rval
+    def apply_galaxy_conventions( self ):
+        pass
 
 class fastqSangerRead( fastqSequencingRead ):
     format = 'sanger'
@@ -206,7 +208,7 @@
         if self.has_adapter_base():
             qual_len = len( self.get_ascii_quality_scores() )
             seq_len = len( self.sequence )
-            assert qual_len + 1 == seq_len, "Invalid FASTQ file: quality score length (%i) does not match sequence length (%i with adapter base)" % ( qual_len, seq_len )
+            assert ( qual_len + 1 == seq_len ) or ( qual_len == seq_len ), "Invalid FASTQ file: quality score length (%i) does not match sequence length (%i with adapter base)" % ( qual_len, seq_len ) #SRA adds FAKE/DUMMY quality scores to the adapter base, we'll allow the reading of the Improper score here, but remove it in the Reader when "apply_galaxy_conventions" is set to True
         else:
             return fastqSequencingRead.assert_sequence_quality_lengths( self )
     def get_sequence( self ):
@@ -262,7 +264,12 @@
         elif new_adapter:
             rval.sequence = "%s%s" % ( new_adapter, rval.sequence )
         return rval
-
+    def apply_galaxy_conventions( self ):
+        if self.has_adapter_base() and len( self.sequence ) == len( self.get_ascii_quality_scores() ): #SRA adds FAKE/DUMMY quality scores to the adapter base, we remove them here
+            if self.is_ascii_encoded():
+                self.quality = self.quality[1:]
+            else:
+                self.quality = " ".join( map( str, self.get_decimal_quality_scores()[1:] ) )
 
 FASTQ_FORMATS = {}
 for format in [ fastqIlluminaRead, fastqSolexaRead, fastqSangerRead, fastqCSSangerRead ]:
@@ -417,9 +424,10 @@
         return column_stats
 
 class fastqReader( object ):
-    def __init__( self, fh, format = 'sanger' ):
+    def __init__( self, fh, format = 'sanger', apply_galaxy_conventions = False ):
         self.file = fh
         self.format = format
+        self.apply_galaxy_conventions = apply_galaxy_conventions
     def close( self ):
         return self.file.close()
     def next(self):
@@ -450,6 +458,8 @@
                 break
             rval.append_quality( line )
         rval.assert_sequence_quality_lengths()
+        if self.apply_galaxy_conventions:
+            rval.apply_galaxy_conventions()
         return rval
     def __iter__( self ):
         while True:
@@ -494,13 +504,14 @@
             raise e
 
 class fastqNamedReader( object ):
-    def __init__( self, fh, format = 'sanger' ):
+    def __init__( self, fh, format = 'sanger', apply_galaxy_conventions = False ):
         self.file = fh
         self.format = format
         self.reader = fastqReader( self.file, self.format )
         #self.last_offset = self.file.tell()
         self.offset_dict = {}
         self.eof = False
+        self.apply_galaxy_conventions = apply_galaxy_conventions
     def close( self ):
         return self.file.close()
     def get( self, sequence_id ):
@@ -531,6 +542,8 @@
                 if fastq_read.identifier not in self.offset_dict:
                     self.offset_dict[ fastq_read.identifier ] = []
                 self.offset_dict[ fastq_read.identifier ].append( offset )
+        if rval is not None and self.apply_galaxy_conventions:
+            rval.apply_galaxy_conventions()
         return rval
     def has_data( self ):
         #returns a string representation of remaining data, or empty string (False) if no data remaining

--- a/tools/fastq/fastq_groomer.py Mon Aug 22 14:58:51 2011 -0400
+++ b/tools/fastq/fastq_groomer.py Mon Aug 22 16:07:23 2011 -0400
@@ -19,7 +19,7 @@
         reader = fastqVerboseErrorReader
     else:
         reader = fastqReader
-    for read_count, fastq_read in enumerate( reader( open( input_filename ), format = input_type ) ):
+    for read_count, fastq_read in enumerate( reader( open( input_filename ), format = input_type, apply_galaxy_conventions = True ) ):
         if summarize_input:
             aggregator.consume_read( fastq_read )
         out.write( fastq_read )

--- a/tools/fastq/fastq_groomer.xml Mon Aug 22 14:58:51 2011 -0400
+++ b/tools/fastq/fastq_groomer.xml Mon Aug 22 16:07:23 2011 -0400
@@ -1,4 +1,4 @@
-<tool id="fastq_groomer" name="FASTQ Groomer" version="1.0.3">
+<tool id="fastq_groomer" name="FASTQ Groomer" version="1.0.4">
   <description>convert between various FASTQ quality formats</description>
   <command interpreter="python">fastq_groomer.py '$input_file' '$input_type' '$output_file'
 #if str( $options_type['options_type_selector'] ) == 'basic':
@@ -240,6 +240,15 @@
       <param name="summarize_input" value="summarize_input" />
       <output name="output_file" file="sanger_full_range_as_solexa.fastqsolexa" />
     </test>
+    <test>
+      <param name="input_file" value="sanger_full_range_as_cssanger_adapter_base_with_quality_score.fastqcssanger_fake_score" ftype="fastq" />
+      <param name="input_type" value="cssanger" />
+      <param name="options_type_selector" value="advanced" />
+      <param name="output_type" value="cssanger" />
+      <param name="force_quality_encoding" value="None" />
+      <param name="summarize_input" value="summarize_input" />
+      <output name="output_file" file="sanger_full_range_as_cssanger.fastqcssanger" />
+    </test>
     <!-- Test fastq with line wrapping -->
     <test>
       <param name="input_file" value="wrapping_original_sanger.fastqsanger" ftype="fastq" />

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
Bitbucket