9 new commits in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/0084ac72aa7b/ Changeset: 0084ac72aa7b User: jmchilton Date: 2013-11-13 05:02:58 Summary: FASTQ Opt: In convert_read_to_format, adjust when new_encoding logic is calculated. Doesn't change the behavior or optimize anything, this is just done to simplify subsequent commits. Baseline: I took the first 100 megabytes of a 26 gigabyte example of a FASTQ file filtered with the FASTQ filter tool that I found in an "Important Galaxy User"'s history on main. I ran with the same command-line on my dev box with the start of this file and using Python's -m to profile function times and total amount of time to serve as a baseline as I optimized the FASTQ filter code. Here is the start of the output: Kept 135848 of 250000 reads (54.34%). 200015991 function calls (199765991 primitive calls) in 136.934 seconds Extrapolating this out, that 26 gigabyte file would take roughly 10 hours to process on my laptop - this is slightly longer than what it took on main - indicating to me this is likely not disk bound since my SSD would probably outperform main? 
Affected #: 1 file diff -r d61de8f362929483c2a6a6cb8e7b7be77fa5cfa4 -r 0084ac72aa7bdd253454027edf1bcbb3cdeb496d lib/galaxy_utils/sequence/fastq.py --- a/lib/galaxy_utils/sequence/fastq.py +++ b/lib/galaxy_utils/sequence/fastq.py @@ -89,7 +89,6 @@ score_list = self.convert_score_solexa_to_phred( self.get_decimal_quality_scores() ) else: score_list = self.get_decimal_quality_scores() - new_read.quality = "%s " % " ".join( map( str, new_class.restrict_scores_to_valid_range( score_list ) ) ) #need trailing space to be valid decimal fastq if force_quality_encoding is None: if self.is_ascii_encoded(): new_encoding = 'ascii' @@ -97,6 +96,7 @@ new_encoding = 'decimal' else: new_encoding = force_quality_encoding + new_read.quality = "%s " % " ".join( map( str, new_class.restrict_scores_to_valid_range( score_list ) ) ) #need trailing space to be valid decimal fastq if new_encoding == 'ascii': new_read.quality = "".join( new_read.get_ascii_quality_scores() ) return new_read https://bitbucket.org/galaxy/galaxy-central/commits/046c608abfa3/ Changeset: 046c608abfa3 User: jmchilton Date: 2013-11-13 05:02:58 Summary: FASTQ Opt: Restructure fastq_filter to only call execfile once and reuse the same function. After this optimization: Kept 135848 of 250000 reads (54.34%). 200015991 function calls (199765991 primitive calls) in 85.375 seconds Down from 136.934 seconds on previous changeset. 
Main Difference: (ncalls|tottime|percall|cumtime|percall|filename:lineno(function)) 250000 45.130 0.000 67.811 0.000 {execfile} - became - 1 0.000 0.000 0.000 0.000 {execfile} Affected #: 2 files diff -r 0084ac72aa7bdd253454027edf1bcbb3cdeb496d -r 046c608abfa31091f3e030db3979f848666fa602 tools/fastq/fastq_filter.py --- a/tools/fastq/fastq_filter.py +++ b/tools/fastq/fastq_filter.py @@ -18,10 +18,10 @@ i = None reads_kept = 0 + execfile(script_filename, globals()) for i, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - local = {'fastq_read':fastq_read, 'ret_val':False} - execfile( script_filename, {}, local ) - if local['ret_val']: + ret_val = fastq_read_pass_filter( fastq_read ) ## fastq_read_pass_filter defined in script_filename + if ret_val: out.write( fastq_read ) reads_kept += 1 out.close() diff -r 0084ac72aa7bdd253454027edf1bcbb3cdeb496d -r 046c608abfa31091f3e030db3979f848666fa602 tools/fastq/fastq_filter.xml --- a/tools/fastq/fastq_filter.xml +++ b/tools/fastq/fastq_filter.xml @@ -96,7 +96,6 @@ return False #end for return True -ret_val = fastq_read_pass_filter( fastq_read ) </configfile></configfiles><outputs> https://bitbucket.org/galaxy/galaxy-central/commits/04c47872b79e/ Changeset: 04c47872b79e User: jmchilton Date: 2013-11-13 05:02:58 Summary: FASTQ Opt: Utilize optimized in place alternatives to restrict_scores_to_valid_range. Separate out ascii vs. decimal encoding branches of convert_read_to_format and use these new transform_ alternatives operate "in place" (don't produce new lists/allocate memory). The ascii version transforms to ascii in place instead of requiring another call and creating another array. Runtime Result: Kept 135848 of 250000 reads (54.34%). 117148711 function calls (116898711 primitive calls) in 48.272 seconds Down from 85.375 seconds on previous changeset. 
Main Difference: (ncalls|tottime|percall|cumtime|percall|filename:lineno(function)) 135848 1.185 0.000 59.856 0.000 fastq.py:71(convert_read_to_format) -became- 135848 0.502 0.000 22.979 0.000 fastq.py:99(convert_read_to_format) About a third the time is spent converting reads to the correct format. This is a variant of the core optimization I made when optimizing the FASTQ groomer for MSI - it has likewise a substantial impact on the performance of that tool. Affected #: 1 file diff -r 046c608abfa31091f3e030db3979f848666fa602 -r 04c47872b79eea8dd0b745aed3e7324513653d78 lib/galaxy_utils/sequence/fastq.py --- a/lib/galaxy_utils/sequence/fastq.py +++ b/lib/galaxy_utils/sequence/fastq.py @@ -35,6 +35,34 @@ return max( min( score, cls.quality_max ), cls.quality_min ) return map( restrict_score, decimal_score_list ) @classmethod + def transform_scores_to_valid_range( cls, decimal_score_list): + cls_quality_max = cls.quality_max + cls_quality_min = cls.quality_min + for i in range( len( decimal_score_list ) ): + score = decimal_score_list[i] + if(score > cls_quality_max): + transformed_score = cls_quality_max + elif( score < cls_quality_min ): + transformed_score = cls_quality_min + else: + transformed_score = score + decimal_score_list[i] = str(transformed_score) + @classmethod + def transform_scores_to_valid_range_ascii( cls, decimal_score_list ): + cls_quality_max = cls.quality_max + cls_quality_min = cls.quality_min + to_quality = cls.ascii_min - cls.quality_min + for i in range( len( decimal_score_list ) ): + score = decimal_score_list[i] + if(score > cls_quality_max): + transformed_score = cls_quality_max + elif( score < cls_quality_min ): + transformed_score = cls_quality_min + else: + transformed_score = score + transformed_score = chr(transformed_score + to_quality) + decimal_score_list[i] = transformed_score + @classmethod def convert_base_to_color_space( cls, sequence ): return cls.color_space_converter.to_color_space( sequence ) @classmethod @@ -96,9 +124,14 @@
new_encoding = 'decimal' else: new_encoding = force_quality_encoding - new_read.quality = "%s " % " ".join( map( str, new_class.restrict_scores_to_valid_range( score_list ) ) ) #need trailing space to be valid decimal fastq if new_encoding == 'ascii': - new_read.quality = "".join( new_read.get_ascii_quality_scores() ) + new_class.transform_scores_to_valid_range_ascii( score_list ) + restricted_scores = map( str, score_list ) + new_read.quality = "".join( restricted_scores ) + else: # decimal + new_class.transform_scores_to_valid_range( score_list ) + restricted_scores = map( str, score_list ) + new_read.quality = "%s " % " ".join( restricted_scores ) #need trailing space to be valid decimal fastq return new_read def get_sequence( self ): return self.sequence https://bitbucket.org/galaxy/galaxy-central/commits/f54589589c1b/ Changeset: f54589589c1b User: jmchilton Date: 2013-11-13 05:02:58 Summary: FASTQ Opt: No need to map(str) over chr and join, can just join chr's. Prevent an extra array creation and chr-> str map per base. Shaves 10% off remaining run time of filter on SSD. Kept 135848 of 250000 reads (54.34%). 117012863 function calls (116762863 primitive calls) in 46.897 seconds Down from 48.272 seconds on previous changeset. Main Difference: (ncalls|tottime|percall|cumtime|percall|filename:lineno(function)) 135848 0.502 0.000 22.979 0.000 fastq.py:99(convert_read_to_format) -became- 135848 0.448 0.000 21.159 0.000 fastq.py:99(convert_read_to_format) Not huge, but a consistent improvement.
Affected #: 1 file diff -r 04c47872b79eea8dd0b745aed3e7324513653d78 -r f54589589c1b3882127aa96cc1d62b7eba9bef61 lib/galaxy_utils/sequence/fastq.py --- a/lib/galaxy_utils/sequence/fastq.py +++ b/lib/galaxy_utils/sequence/fastq.py @@ -1,4 +1,4 @@ -#Dan Blankenberg +##Dan Blankenberg import math import string import transform @@ -126,12 +126,10 @@ new_encoding = force_quality_encoding if new_encoding == 'ascii': new_class.transform_scores_to_valid_range_ascii( score_list ) - restricted_scores = map( str, score_list ) - new_read.quality = "".join( restricted_scores ) + new_read.quality = "".join( score_list ) else: # decimal new_class.transform_scores_to_valid_range( score_list ) - restricted_scores = map( str, score_list ) - new_read.quality = "%s " % " ".join( restricted_scores ) #need trailing space to be valid decimal fastq + new_read.quality = "%s " % " ".join( score_list ) #need trailing space to be valid decimal fastq return new_read def get_sequence( self ): return self.sequence https://bitbucket.org/galaxy/galaxy-central/commits/6d723e3bf2eb/ Changeset: 6d723e3bf2eb User: jmchilton Date: 2013-11-13 05:02:58 Summary: FASTQ Opt: Precompute ascii -> decimal difference. Kept 135848 of 250000 reads (54.34%). 117012863 function calls (116762863 primitive calls) in 39.416 seconds Down from 46.897 seconds on previous changeset. Main Difference WRT: (ncalls|tottime|percall|cumtime|percall|filename:lineno(function)) 385848 18.215 0.000 22.866 0.000 fastq.py:90(get_decimal_quality_scores) -became- 385848 11.415 0.000 15.546 0.000 fastq.py:91(get_decimal_quality_scores) Looks like a math optimization, but the actual optimization comes from reading a local variable instead of dereferencing an object variable twice per base.
Affected #: 1 file diff -r f54589589c1b3882127aa96cc1d62b7eba9bef61 -r 6d723e3bf2ebddf0730e7b38761ca2b511d129c3 lib/galaxy_utils/sequence/fastq.py --- a/lib/galaxy_utils/sequence/fastq.py +++ b/lib/galaxy_utils/sequence/fastq.py @@ -82,14 +82,16 @@ quality = self.quality.rstrip() #decimal scores should have a trailing space if quality: try: - return [ chr( int( val ) + self.ascii_min - self.quality_min ) for val in quality.split() ] + to_quality = self.ascii_min - self.quality_min + return [ chr( int( val ) + to_quality ) for val in quality.split() ] except ValueError, e: raise ValueError( 'Error Parsing quality String. ASCII quality strings cannot contain spaces (%s): %s' % ( self.quality, e ) ) else: return [] def get_decimal_quality_scores( self ): if self.is_ascii_encoded(): - return [ ord( val ) - self.ascii_min + self.quality_min for val in self.quality ] + to_quality = self.quality_min - self.ascii_min + return [ ord( val ) + to_quality for val in self.quality ] else: quality = self.quality.rstrip() #decimal scores should have a trailing space if quality: https://bitbucket.org/galaxy/galaxy-central/commits/6265e769718e/ Changeset: 6265e769718e User: jmchilton Date: 2013-11-13 05:02:58 Summary: FASTQ Opt: Do not generate arrays just to check length. ... just compute what length would be. Kept 135848 of 250000 reads (54.34%). 117012863 function calls (116762863 primitive calls) in 38.412 seconds Down from 39.416 seconds on previous changeset. 
Main Difference: (ncalls|tottime|percall|cumtime|percall|filename:lineno(function)) 500000 0.518 0.000 1.889 0.000 fastq.py:171(insufficient_quality_length) 250000 0.322 0.000 1.103 0.000 fastq.py:173(assert_sequence_quality_lengths) - became - 500000 0.306 0.000 0.932 0.000 fastq.py:187(insufficient_quality_length) 250000 0.164 0.000 0.478 0.000 fastq.py:189(assert_sequence_quality_lengths) Affected #: 1 file diff -r 6d723e3bf2ebddf0730e7b38761ca2b511d129c3 -r 6265e769718e2487b4089c181948769cf67869ce lib/galaxy_utils/sequence/fastq.py --- a/lib/galaxy_utils/sequence/fastq.py +++ b/lib/galaxy_utils/sequence/fastq.py @@ -88,6 +88,22 @@ raise ValueError( 'Error Parsing quality String. ASCII quality strings cannot contain spaces (%s): %s' % ( self.quality, e ) ) else: return [] + def get_ascii_quality_scores_len( self ): + """ + Compute ascii quality score length, without generating relatively + expensive qualty score array. + """ + if self.is_ascii_encoded(): + return len( self.quality ) + else: + quality = self.quality.rstrip() + if quality: + try: + return len( quality.split() ) + except ValueError, e: + raise ValueError( 'Error Parsing quality String. 
ASCII quality strings cannot contain spaces (%s): %s' % ( self.quality, e ) ) + else: + return 0 def get_decimal_quality_scores( self ): if self.is_ascii_encoded(): to_quality = self.quality_min - self.ascii_min @@ -168,9 +184,9 @@ return False return True def insufficient_quality_length( self ): - return len( self.get_ascii_quality_scores() ) < len( self.sequence ) + return self.get_ascii_quality_scores_len() < len( self.sequence ) def assert_sequence_quality_lengths( self ): - qual_len = len( self.get_ascii_quality_scores() ) + qual_len = self.get_ascii_quality_scores_len() seq_len = len( self.sequence ) assert qual_len == seq_len, "Invalid FASTQ file: quality score length (%i) does not match sequence length (%i)" % ( qual_len, seq_len ) def reverse( self, clone = True ): @@ -235,11 +251,11 @@ return False def insufficient_quality_length( self ): if self.has_adapter_base(): - return len( self.get_ascii_quality_scores() ) + 1 < len( self.sequence ) + return self.get_ascii_quality_scores_len() + 1 < len( self.sequence ) return fastqSequencingRead.insufficient_quality_length( self ) def assert_sequence_quality_lengths( self ): if self.has_adapter_base(): - qual_len = len( self.get_ascii_quality_scores() ) + qual_len = self.get_ascii_quality_scores_len() seq_len = len( self.sequence ) assert ( qual_len + 1 == seq_len ) or ( qual_len == seq_len ), "Invalid FASTQ file: quality score length (%i) does not match sequence length (%i with adapter base)" % ( qual_len, seq_len ) #SRA adds FAKE/DUMMY quality scores to the adapter base, we'll allow the reading of the Improper score here, but remove it in the Reader when "apply_galaxy_conventions" is set to True else: https://bitbucket.org/galaxy/galaxy-central/commits/b5970d24670f/ Changeset: b5970d24670f User: jmchilton Date: 2013-11-13 05:02:58 Summary: FASTQ Opt: Eliminate a few extra calls to is_ascii_encoded. Kept 135848 of 250000 reads (54.34%). 
117127015 function calls (116877015 primitive calls) in 38.632 seconds Down from 39.416 seconds on previous changeset. Main Difference (ncalls|tottime|percall|cumtime|percall|filename:lineno(function)) 385848 11.289 0.000 15.499 0.000 fastq.py:91(get_decimal_quality_scores) 1271696 0.716 0.000 0.716 0.000 fastq.py:71(is_ascii_encoded) 250000 0.185 0.000 10.434 0.000 fastq.py:107(get_decimal_quality_scores) 385848 11.380 0.000 15.368 0.000 fastq.py:109(__get_decimal_quality_scores) 1135848 0.649 0.000 0.649 0.000 fastq.py:71(is_ascii_encoded) Affected #: 1 file diff -r 6265e769718e2487b4089c181948769cf67869ce -r b5970d24670fefc0bf028bcab681f81ef2dd62e9 lib/galaxy_utils/sequence/fastq.py --- a/lib/galaxy_utils/sequence/fastq.py +++ b/lib/galaxy_utils/sequence/fastq.py @@ -105,7 +105,9 @@ else: return 0 def get_decimal_quality_scores( self ): - if self.is_ascii_encoded(): + return self.__get_decimal_quality_scores(self.is_ascii_encoded()) + def __get_decimal_quality_scores( self, ascii ): + if ascii: to_quality = self.quality_min - self.ascii_min return [ ord( val ) + to_quality for val in self.quality ] else: @@ -128,15 +130,16 @@ else: new_read.sequence = self.convert_color_to_base_space( self.sequence ) new_read.description = self.description + is_ascii = self.is_ascii_encoded() if self.score_system != new_read.score_system: if self.score_system == 'phred': - score_list = self.convert_score_phred_to_solexa( self.get_decimal_quality_scores() ) + score_list = self.convert_score_phred_to_solexa( self.__get_decimal_quality_scores(is_ascii) ) else: - score_list = self.convert_score_solexa_to_phred( self.get_decimal_quality_scores() ) + score_list = self.convert_score_solexa_to_phred( self.__get_decimal_quality_scores(is_ascii) ) else: - score_list = self.get_decimal_quality_scores() + score_list = self.__get_decimal_quality_scores(is_ascii) if force_quality_encoding is None: - if self.is_ascii_encoded(): + if is_ascii: new_encoding = 'ascii' else: new_encoding = 
'decimal' https://bitbucket.org/galaxy/galaxy-central/commits/bb6c21a42274/ Changeset: bb6c21a42274 User: jmchilton Date: 2013-11-13 05:02:58 Summary: FASTQ Opt: Disable formatting output, reading is already enforcing it? This would be a huge optimization for filter at this point - it cuts the runtime of my filter test to half of what is remaining. This changeset is just leaving a comment because I am not certain the results are the same - though I strongly suspect they would be - I just want some confirmation before pulling the trigger. None of the test cases fail as a result of this or any of these changesets. Kept 135848 of 250000 reads (54.34%). 60407639 function calls in 18.837 seconds If change made, this would be down from 38.632 seconds on previous call. Main Difference: (ncalls|tottime|percall|cumtime|percall|filename:lineno(function)) 135848 0.357 0.000 19.710 0.000 fastq.py:631(write) - to - 135848 0.219 0.000 0.609 0.000 fastq.py:631(write) Affected #: 1 file diff -r b5970d24670fefc0bf028bcab681f81ef2dd62e9 -r bb6c21a422741b32ab78f7a4fe6dae571019d532 tools/fastq/fastq_filter.py --- a/tools/fastq/fastq_filter.py +++ b/tools/fastq/fastq_filter.py @@ -14,6 +14,8 @@ os.mkdir( additional_files_path ) shutil.copy( script_filename, os.path.join( additional_files_path, 'debug.txt' ) ) + ## Dan, Others: Can we simply drop the "format=input_type" here since it is specified in reader. + ## This optimization would cut runtime roughly in half (for my test case anyway).
-John out = fastqWriter( open( output_filename, 'wb' ), format = input_type ) i = None https://bitbucket.org/galaxy/galaxy-central/commits/374783ca6a48/ Changeset: 374783ca6a48 User: dannon Date: 2013-12-17 16:42:49 Summary: Merged in jmchilton/galaxy-central-fork-1 (pull request #259) FASTQ Optimizations (Round 1 - Filtering) Affected #: 3 files diff -r 31179a01a1ec4176884926907b212259e4148319 -r 374783ca6a480fbb571bfe9eb46052667b9d0479 lib/galaxy_utils/sequence/fastq.py --- a/lib/galaxy_utils/sequence/fastq.py +++ b/lib/galaxy_utils/sequence/fastq.py @@ -1,4 +1,4 @@ -#Dan Blankenberg +##Dan Blankenberg import math import string import transform @@ -35,6 +35,34 @@ return max( min( score, cls.quality_max ), cls.quality_min ) return map( restrict_score, decimal_score_list ) @classmethod + def transform_scores_to_valid_range( cls, decimal_score_list): + cls_quality_max = cls.quality_max + cls_quality_min = cls.quality_min + for i in range( len( decimal_score_list ) ): + score = decimal_score_list[i] + if(score > cls_quality_max): + transformed_score = cls_quality_max + elif( score < cls_quality_min ): + transformed_score = cls_quality_min + else: + transformed_score = score + decimal_score_list[i] = str(transformed_score) + @classmethod + def transform_scores_to_valid_range_ascii( cls, decimal_score_list ): + cls_quality_max = cls.quality_max + cls_quality_min = cls.quality_min + to_quality = cls.ascii_min - cls.quality_min + for i in range( len( decimal_score_list ) ): + score = decimal_score_list[i] + if(score > cls_quality_max): + transformed_score = cls_quality_max + elif( score < cls_quality_min ): + transformed_score = cls_quality_min + else: + transformed_score = score + transformed_score = chr(transformed_score + to_quality) + decimal_score_list[i] = transformed_score + @classmethod def convert_base_to_color_space( cls, sequence ): return cls.color_space_converter.to_color_space( sequence ) @classmethod @@ -54,14 +82,34 @@ quality = self.quality.rstrip() 
#decimal scores should have a trailing space if quality: try: - return [ chr( int( val ) + self.ascii_min - self.quality_min ) for val in quality.split() ] + to_quality = self.ascii_min - self.quality_min + return [ chr( int( val ) + to_quality ) for val in quality.split() ] except ValueError, e: raise ValueError( 'Error Parsing quality String. ASCII quality strings cannot contain spaces (%s): %s' % ( self.quality, e ) ) else: return [] + def get_ascii_quality_scores_len( self ): + """ + Compute ascii quality score length, without generating relatively + expensive qualty score array. + """ + if self.is_ascii_encoded(): + return len( self.quality ) + else: + quality = self.quality.rstrip() + if quality: + try: + return len( quality.split() ) + except ValueError, e: + raise ValueError( 'Error Parsing quality String. ASCII quality strings cannot contain spaces (%s): %s' % ( self.quality, e ) ) + else: + return 0 def get_decimal_quality_scores( self ): - if self.is_ascii_encoded(): - return [ ord( val ) - self.ascii_min + self.quality_min for val in self.quality ] + return self.__get_decimal_quality_scores(self.is_ascii_encoded()) + def __get_decimal_quality_scores( self, ascii ): + if ascii: + to_quality = self.quality_min - self.ascii_min + return [ ord( val ) + to_quality for val in self.quality ] else: quality = self.quality.rstrip() #decimal scores should have a trailing space if quality: @@ -82,23 +130,27 @@ else: new_read.sequence = self.convert_color_to_base_space( self.sequence ) new_read.description = self.description + is_ascii = self.is_ascii_encoded() if self.score_system != new_read.score_system: if self.score_system == 'phred': - score_list = self.convert_score_phred_to_solexa( self.get_decimal_quality_scores() ) + score_list = self.convert_score_phred_to_solexa( self.__get_decimal_quality_scores(is_ascii) ) else: - score_list = self.convert_score_solexa_to_phred( self.get_decimal_quality_scores() ) + score_list = self.convert_score_solexa_to_phred( 
self.__get_decimal_quality_scores(is_ascii) ) else: - score_list = self.get_decimal_quality_scores() - new_read.quality = "%s " % " ".join( map( str, new_class.restrict_scores_to_valid_range( score_list ) ) ) #need trailing space to be valid decimal fastq + score_list = self.__get_decimal_quality_scores(is_ascii) if force_quality_encoding is None: - if self.is_ascii_encoded(): + if is_ascii: new_encoding = 'ascii' else: new_encoding = 'decimal' else: new_encoding = force_quality_encoding if new_encoding == 'ascii': - new_read.quality = "".join( new_read.get_ascii_quality_scores() ) + new_class.transform_scores_to_valid_range_ascii( score_list ) + new_read.quality = "".join( score_list ) + else: # decimal + new_class.transform_scores_to_valid_range( score_list ) + new_read.quality = "%s " % " ".join( score_list ) #need trailing space to be valid decimal fastq return new_read def get_sequence( self ): return self.sequence @@ -135,9 +187,9 @@ return False return True def insufficient_quality_length( self ): - return len( self.get_ascii_quality_scores() ) < len( self.sequence ) + return self.get_ascii_quality_scores_len() < len( self.sequence ) def assert_sequence_quality_lengths( self ): - qual_len = len( self.get_ascii_quality_scores() ) + qual_len = self.get_ascii_quality_scores_len() seq_len = len( self.sequence ) assert qual_len == seq_len, "Invalid FASTQ file: quality score length (%i) does not match sequence length (%i)" % ( qual_len, seq_len ) def reverse( self, clone = True ): @@ -202,11 +254,11 @@ return False def insufficient_quality_length( self ): if self.has_adapter_base(): - return len( self.get_ascii_quality_scores() ) + 1 < len( self.sequence ) + return self.get_ascii_quality_scores_len() + 1 < len( self.sequence ) return fastqSequencingRead.insufficient_quality_length( self ) def assert_sequence_quality_lengths( self ): if self.has_adapter_base(): - qual_len = len( self.get_ascii_quality_scores() ) + qual_len = self.get_ascii_quality_scores_len() 
seq_len = len( self.sequence ) assert ( qual_len + 1 == seq_len ) or ( qual_len == seq_len ), "Invalid FASTQ file: quality score length (%i) does not match sequence length (%i with adapter base)" % ( qual_len, seq_len ) #SRA adds FAKE/DUMMY quality scores to the adapter base, we'll allow the reading of the Improper score here, but remove it in the Reader when "apply_galaxy_conventions" is set to True else: diff -r 31179a01a1ec4176884926907b212259e4148319 -r 374783ca6a480fbb571bfe9eb46052667b9d0479 tools/fastq/fastq_filter.py --- a/tools/fastq/fastq_filter.py +++ b/tools/fastq/fastq_filter.py @@ -14,14 +14,16 @@ os.mkdir( additional_files_path ) shutil.copy( script_filename, os.path.join( additional_files_path, 'debug.txt' ) ) + ## Dan, Others: Can we simply drop the "format=input_type" here since it is specified in reader. + ## This optimization would cut runtime roughly in half (for my test case anyway). -John out = fastqWriter( open( output_filename, 'wb' ), format = input_type ) i = None reads_kept = 0 + execfile(script_filename, globals()) for i, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - local = {'fastq_read':fastq_read, 'ret_val':False} - execfile( script_filename, {}, local ) - if local['ret_val']: + ret_val = fastq_read_pass_filter( fastq_read ) ## fastq_read_pass_filter defined in script_filename + if ret_val: out.write( fastq_read ) reads_kept += 1 out.close() diff -r 31179a01a1ec4176884926907b212259e4148319 -r 374783ca6a480fbb571bfe9eb46052667b9d0479 tools/fastq/fastq_filter.xml --- a/tools/fastq/fastq_filter.xml +++ b/tools/fastq/fastq_filter.xml @@ -96,7 +96,6 @@ return False #end for return True -ret_val = fastq_read_pass_filter( fastq_read ) </configfile></configfiles><outputs> Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.