commit/galaxy-central: 6 new changesets
6 new changesets in galaxy-central: http://bitbucket.org/galaxy/galaxy-central/changeset/01bd5792fbc7/ changeset: 01bd5792fbc7 user: fangly date: 2010-12-16 09:26:59 summary: Added 2 Python scripts to deal with FASTQ mate pairs: - the interlacer puts mate pairs present in 2 files into a single file - the deinterlacer puts mate pairs present in a single file into 2 files affected #: 3 files (3.0 KB) --- a/lib/galaxy_utils/sequence/fastq.py Fri Jun 10 12:36:16 2011 -0400 +++ b/lib/galaxy_utils/sequence/fastq.py Thu Dec 16 18:26:59 2010 +1000 @@ -609,12 +609,15 @@ return rval def get_paired_identifier( self, fastq_read ): identifier = fastq_read.identifier + identifier_is_first = None if identifier[-2] == '/': if identifier[-1] == "1": identifier = "%s2" % identifier[:-1] + identifier_is_first = False elif identifier[-1] == "2": identifier = "%s1" % identifier[:-1] - return identifier + identifier_is_first = True + return identifier, identifier_is_first class fastqSplitter( object ): def split( self, fastq_read ): --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/fastq/fastq_paired_end_deinterlacer.py Thu Dec 16 18:26:59 2010 +1000 @@ -0,0 +1,53 @@ +#Florent Angly +import sys +from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner + +def main(): + input_filename = sys.argv[1] + input_type = sys.argv[2] or 'sanger' + mate1_filename = sys.argv[3] + mate2_filename = sys.argv[4] + + type = input_type + input = fastqNamedReader( open( input_filename, 'rb' ), format = type ) + out1 = fastqWriter( open( mate1_filename, 'wb' ), format = type ) + out2 = fastqWriter( open( mate2_filename, 'wb' ), format = type ) + joiner = fastqJoiner( type ) + + i = None + skip_count = 0 + found = {} + for i, mate1 in enumerate( fastqReader( open( input_filename, 'rb' ), format = type ) ): + + if mate1.identifier in found: + del found[mate1.identifier] + continue + + mate2_id, mate2_is_first = joiner.get_paired_identifier( mate1 ) + + mate2 = 
input.get( mate2_id ) + if mate2: + found[mate2_id] = None + if mate2_is_first: + out1.write( mate2 ) + out2.write( mate1 ) + else: + out1.write( mate1 ) + out2.write( mate2 ) + else: + skip_count += 1 + + if i is None: + print "Your input file contained no valid FASTQ sequences." + else: + if skip_count: + print '%i reads had no mate.' % skip_count + print 'De-interlaced %s pairs of sequences.' % ( (i - skip_count + 1)/2 ) + + input.close() + out1.close() + out2.close() + + +if __name__ == "__main__": + main() --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/fastq/fastq_paired_end_interlacer.py Thu Dec 16 18:26:59 2010 +1000 @@ -0,0 +1,46 @@ +#Florent Angly +import sys +from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner + +def main(): + mate1_filename = sys.argv[1] + mate1_type = sys.argv[2] or 'sanger' + mate2_filename = sys.argv[3] + mate2_type = sys.argv[4] or 'sanger' + output_filename = sys.argv[5] + + if mate1_type != mate2_type: + print "WARNING: You are trying to interlace files of two different types: %s and %s." % ( mate1_type, mate2_type ) + return + + type = mate1_type + joiner = fastqJoiner( type ) + out = fastqWriter( open( output_filename, 'wb' ), format = type ) + mate_input = fastqNamedReader( open( mate2_filename, 'rb' ), format = type ) + + i = None + skip_count = 0 + for i, mate1 in enumerate( fastqReader( open( mate1_filename, 'rb' ), format = type ) ): + + mate2 = mate_input.get( joiner.get_paired_identifier( mate1 ) ) + + if mate2: + out.write( mate1 ) + out.write( mate2 ) + else: + skip_count += 1 + + if i is None: + print "Your input file contained no valid FASTQ sequences." + else: + not_used_msg = mate_input.has_data() + if not_used_msg: + print not_used_msg + print 'Interlaced %s pairs of sequences.' 
% ( i - skip_count + 1 ) + + mate_input.close() + out.close() + + +if __name__ == "__main__": + main() http://bitbucket.org/galaxy/galaxy-central/changeset/8fe0ba2e1910/ changeset: 8fe0ba2e1910 user: fangly date: 2010-12-16 08:12:01 summary: Little bug fix and more informative error message affected #: 1 file (26 bytes) --- a/lib/galaxy_utils/sequence/fastq.py Thu Dec 16 18:26:59 2010 +1000 +++ b/lib/galaxy_utils/sequence/fastq.py Thu Dec 16 17:12:01 2010 +1000 @@ -438,7 +438,7 @@ while True: line = self.file.readline() if not line: - raise Exception( 'Invalid FASTQ file: could not parse second instance of sequence identifier.' ) + raise Exception( 'Invalid FASTQ file: could not find quality score of sequence identifier %s.' % rval.identifier ) line = line.rstrip( '\n\r' ) if line.startswith( '+' ) and ( len( line ) == 1 or line[1:].startswith( fastq_header[1:] ) ): rval.description = line @@ -547,7 +547,7 @@ eof = True self.file.seek( offset ) if count: - rval = "There were %i known sequence reads not utilized. " + rval = "There were %i known sequence reads not utilized. " % count if not eof: rval = "%s%s" % ( rval, "An additional unknown number of reads exist in the input that were not utilized." 
) return rval http://bitbucket.org/galaxy/galaxy-central/changeset/eebd5ac107c3/ changeset: eebd5ac107c3 user: fangly date: 2010-12-17 06:04:17 summary: FASTQ interlacer and de-interlacer tools fully integrated in Galaxy and functional affected #: 15 files (17.2 KB) --- a/lib/galaxy_utils/sequence/fastq.py Thu Dec 16 17:12:01 2010 +1000 +++ b/lib/galaxy_utils/sequence/fastq.py Fri Dec 17 15:04:17 2010 +1000 @@ -609,15 +609,22 @@ return rval def get_paired_identifier( self, fastq_read ): identifier = fastq_read.identifier - identifier_is_first = None if identifier[-2] == '/': if identifier[-1] == "1": identifier = "%s2" % identifier[:-1] - identifier_is_first = False elif identifier[-1] == "2": identifier = "%s1" % identifier[:-1] - identifier_is_first = True - return identifier, identifier_is_first + return identifier + def is_first_mate( self, sequence_id ): + is_first = None + if not isinstance( sequence_id, basestring ): + sequence_id = sequence_id.identifier + if sequence_id[-2] == '/': + if sequence_id[-1] == "1": + is_first = True + else: + is_first = False + return is_first class fastqSplitter( object ): def split( self, fastq_read ): --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_end_1.fastqsanger Fri Dec 17 15:04:17 2010 +1000 @@ -0,0 +1,20 @@ +@1539:931/1 +NACATCAACACTCAGTAACGGCTGGCGCAAAATGGCATTGATTAACGAAGACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG ++1539:931/1 +BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@2971:937/1 +NCGGAGACTTCGAGGCCATCCAGTCGATTGCCAAAGTCATCAAGGGGTCGACGATCTGCTCCCTTGCCCGTTCCAACGAGAATGAAATCCGCCGCGCGTGG ++2971:937/1 +BMQMMRRRSS__________XXXXXVVVVV_b___Y[Y[YXVRVWWPWVX_____BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@3786:949/1 +NTACCGCGCAACGGCATGATGGCTTGGAACTCACGGTCACGCGCCTGTTTGGCAGAGCCGCCCGCCGAGTCACCTTCCACTAGGAACAGTTCGGAGCGGTT ++3786:949/1 +BKGGKKJNJJ_______W__Y__W_TVPVP[YY[[_____BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB 
+@4205:944/1 +NGATCTGGGCTTCAGCAAGACCGATGTCGGCGTGATTGCCAAGCATGCCGGACTCTGGCCGGCGGGGTTCGGCGGTGTGCTGGGTGGCTTGGGGGTGGGGG ++4205:944/1 +BLLLLTWTTR_V_______TYYYRYYYYYY____VWRWWW___BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4534:934/1 +NAATGCCGGTATTTGGCACGATGGCGGCACGCTTCCACGACGACGGGGTGACCTCTCTCTATCAGGCGATGGCATCCAAATTGCACGCGCGGGGTTTGAGG ++4534:934/1 +BGGFGLJLJL______________V____________________YYYPQOTWVT__________BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_end_1_cleaned.fastqsanger Fri Dec 17 15:04:17 2010 +1000 @@ -0,0 +1,16 @@ +@2971:937/1 +NCGGAGACTTCGAGGCCATCCAGTCGATTGCCAAAGTCATCAAGGGGTCGACGATCTGCTCCCTTGCCCGTTCCAACGAGAATGAAATCCGCCGCGCGTGG ++2971:937/1 +BMQMMRRRSS__________XXXXXVVVVV_b___Y[Y[YXVRVWWPWVX_____BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@3786:949/1 +NTACCGCGCAACGGCATGATGGCTTGGAACTCACGGTCACGCGCCTGTTTGGCAGAGCCGCCCGCCGAGTCACCTTCCACTAGGAACAGTTCGGAGCGGTT ++3786:949/1 +BKGGKKJNJJ_______W__Y__W_TVPVP[YY[[_____BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4205:944/1 +NGATCTGGGCTTCAGCAAGACCGATGTCGGCGTGATTGCCAAGCATGCCGGACTCTGGCCGGCGGGGTTCGGCGGTGTGCTGGGTGGCTTGGGGGTGGGGG ++4205:944/1 +BLLLLTWTTR_V_______TYYYRYYYYYY____VWRWWW___BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4534:934/1 +NAATGCCGGTATTTGGCACGATGGCGGCACGCTTCCACGACGACGGGGTGACCTCTCTCTATCAGGCGATGGCATCCAAATTGCACGCGCGGGGTTTGAGG ++4534:934/1 +BGGFGLJLJL______________V____________________YYYPQOTWVT__________BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_end_1_errors.fastqsanger Fri Dec 17 15:04:17 2010 +1000 @@ -0,0 +1,20 @@ +@1539:931/1 +NACATCAACACTCAGTAACGGCTGGCGCAAAATGGCATTGATTAACGAAGACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG ++1539:931/1 +BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@2971:937/1 
+NCGGAGACTTCGAGGCCATCCAGTCGATTGCCAAAGTCATCAAGGGGTCGACGATCTGCTCCCTTGCCCGTTCCAACGAGAATGAAATCCGCCGCGCGTGG ++2971:937/1 +BMQMMRRRSS__________XXXXXVVVVV_b___Y[Y[YXVRVWWPWVX_____BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@3786:949/1 +NTACCGCGCAACGGCATGATGGCTTGGAACTCACGGTCACGCGCCTGTTTGGCAGAGCCGCCCGCCGAGTCACCTTCCACTAGGAACAGTTCGGAGCGGTT ++3786:949/1 +BKGGKKJNJJ_______W__Y__W_TVPVP[YY[[_____BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@9999:944/1 +NGATCTGGGCTTCAGCAAGACCGATGTCGGCGTGATTGCCAAGCATGCCGGACTCTGGCCGGCGGGGTTCGGCGGTGTGCTGGGTGGCTTGGGGGTGGGGG ++9999:944/1 +BLLLLTWTTR_V_______TYYYRYYYYYY____VWRWWW___BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4534:934/1 +NAATGCCGGTATTTGGCACGATGGCGGCACGCTTCCACGACGACGGGGTGACCTCTCTCTATCAGGCGATGGCATCCAAATTGCACGCGCGGGGTTTGAGG ++4534:934/1 +BGGFGLJLJL______________V____________________YYYPQOTWVT__________BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_end_2.fastqsanger Fri Dec 17 15:04:17 2010 +1000 @@ -0,0 +1,20 @@ +@1539:931/2 +GCGCGTAACGTTTCACCTCGAGATCGTTGTCGGCCGCAATCTCCTGGGGGCGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT ++1539:931/2 +aee_dcadeeWcaaadJbdaff[fffc]dcfe[dRc^\[^QVOZXXZSPFWNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@2971:937/2 +CTCGCACGGCCGCCTCGACCACTTGGTCTGGCGTCATGCGCAATTTTTTCTCCATGTGGAACGGGCTGGTGGCGATGAACGTATGAATATGCCCCCGCGCT ++2971:937/2 +hhhddhefhh_ffffhhhhfah_hhdUdcfW`fbbhfcaec_dfdbba````W^caaaJXGKXSUVYVZY^WY^BBBBBBBBBBBBBBBBBBBBBBBBBBB +@3786:949/2 +CTCAACCAGAACACCGTGATCGGCGACCAGTTGGCGCAGTTCGCCATCAGAAATGCAGGGATGCGGATGCGGGCTAGCACGAAAGTCATCCTCAACACGAT ++3786:949/2 +ffcfcaffff\_edefddff[ffa_fRffRdc]Sdf]affehh_eaebBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4205:944/2 +GTCGACAGGTGCCTGTACACCACGCCAGGCCAGCCAGGCGAAACCGAGAACGGTCACCATCTGAACCAGACCGAAAACCAACAGTGCGGGGTTGAGCCACG ++4205:944/2 +hhhhhcffcWcdfdcffdffdfQadf[fLfc`Ra`Wcca]`^``]L[^QZGSQWUYZXK[`bJRbZb[_BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4534:934/2 
+GGTAATTGCGGACGGCTTCGGCAATTTCGGCCAGGTAGCGCACGCGCTTCGACGGAACGATGGCGCGCAGGTTCGACGATTGTCGAACGCTGATCAGCGCG ++4534:934/2 +ffffcff[fdhaghh[ffcahhghhhdhadhhhhg_hc[hf]fec]faa]bLb___`^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_end_2_cleaned.fastqsanger Fri Dec 17 15:04:17 2010 +1000 @@ -0,0 +1,16 @@ +@2971:937/2 +CTCGCACGGCCGCCTCGACCACTTGGTCTGGCGTCATGCGCAATTTTTTCTCCATGTGGAACGGGCTGGTGGCGATGAACGTATGAATATGCCCCCGCGCT ++2971:937/2 +hhhddhefhh_ffffhhhhfah_hhdUdcfW`fbbhfcaec_dfdbba````W^caaaJXGKXSUVYVZY^WY^BBBBBBBBBBBBBBBBBBBBBBBBBBB +@3786:949/2 +CTCAACCAGAACACCGTGATCGGCGACCAGTTGGCGCAGTTCGCCATCAGAAATGCAGGGATGCGGATGCGGGCTAGCACGAAAGTCATCCTCAACACGAT ++3786:949/2 +ffcfcaffff\_edefddff[ffa_fRffRdc]Sdf]affehh_eaebBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4205:944/2 +GTCGACAGGTGCCTGTACACCACGCCAGGCCAGCCAGGCGAAACCGAGAACGGTCACCATCTGAACCAGACCGAAAACCAACAGTGCGGGGTTGAGCCACG ++4205:944/2 +hhhhhcffcWcdfdcffdffdfQadf[fLfc`Ra`Wcca]`^``]L[^QZGSQWUYZXK[`bJRbZb[_BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4534:934/2 +GGTAATTGCGGACGGCTTCGGCAATTTCGGCCAGGTAGCGCACGCGCTTCGACGGAACGATGGCGCGCAGGTTCGACGATTGTCGAACGCTGATCAGCGCG ++4534:934/2 +ffffcff[fdhaghh[ffcahhghhhdhadhhhhg_hc[hf]fec]faa]bLb___`^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_end_2_errors.fastqsanger Fri Dec 17 15:04:17 2010 +1000 @@ -0,0 +1,20 @@ +@1539:931/2 +GCGCGTAACGTTTCACCTCGAGATCGTTGTCGGCCGCAATCTCCTGGGGGCGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT ++1539:931/2 +aee_dcadeeWcaaadJbdaff[fffc]dcfe[dRc^\[^QVOZXXZSPFWNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@2971:937/2 +CTCGCACGGCCGCCTCGACCACTTGGTCTGGCGTCATGCGCAATTTTTTCTCCATGTGGAACGGGCTGGTGGCGATGAACGTATGAATATGCCCCCGCGCT ++2971:937/2 +hhhddhefhh_ffffhhhhfah_hhdUdcfW`fbbhfcaec_dfdbba````W^caaaJXGKXSUVYVZY^WY^BBBBBBBBBBBBBBBBBBBBBBBBBBB +@9999:949/2 
+CTCAACCAGAACACCGTGATCGGCGACCAGTTGGCGCAGTTCGCCATCAGAAATGCAGGGATGCGGATGCGGGCTAGCACGAAAGTCATCCTCAACACGAT ++9999:949/2 +ffcfcaffff\_edefddff[ffa_fRffRdc]Sdf]affehh_eaebBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4205:944/2 +GTCGACAGGTGCCTGTACACCACGCCAGGCCAGCCAGGCGAAACCGAGAACGGTCACCATCTGAACCAGACCGAAAACCAACAGTGCGGGGTTGAGCCACG ++4205:944/2 +hhhhhcffcWcdfdcffdffdfQadf[fLfc`Ra`Wcca]`^``]L[^QZGSQWUYZXK[`bJRbZb[_BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4534:934/2 +GGTAATTGCGGACGGCTTCGGCAATTTCGGCCAGGTAGCGCACGCGCTTCGACGGAACGATGGCGCGCAGGTTCGACGATTGTCGAACGCTGATCAGCGCG ++4534:934/2 +ffffcff[fdhaghh[ffcahhghhhdhadhhhhg_hc[hf]fec]faa]bLb___`^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_end_merged.fastqsanger Fri Dec 17 15:04:17 2010 +1000 @@ -0,0 +1,40 @@ +@1539:931/1 +NACATCAACACTCAGTAACGGCTGGCGCAAAATGGCATTGATTAACGAAGACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG ++1539:931/1 +BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@1539:931/2 +GCGCGTAACGTTTCACCTCGAGATCGTTGTCGGCCGCAATCTCCTGGGGGCGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT ++1539:931/2 +aee_dcadeeWcaaadJbdaff[fffc]dcfe[dRc^\[^QVOZXXZSPFWNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@2971:937/1 +NCGGAGACTTCGAGGCCATCCAGTCGATTGCCAAAGTCATCAAGGGGTCGACGATCTGCTCCCTTGCCCGTTCCAACGAGAATGAAATCCGCCGCGCGTGG ++2971:937/1 +BMQMMRRRSS__________XXXXXVVVVV_b___Y[Y[YXVRVWWPWVX_____BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@2971:937/2 +CTCGCACGGCCGCCTCGACCACTTGGTCTGGCGTCATGCGCAATTTTTTCTCCATGTGGAACGGGCTGGTGGCGATGAACGTATGAATATGCCCCCGCGCT ++2971:937/2 +hhhddhefhh_ffffhhhhfah_hhdUdcfW`fbbhfcaec_dfdbba````W^caaaJXGKXSUVYVZY^WY^BBBBBBBBBBBBBBBBBBBBBBBBBBB +@3786:949/1 +NTACCGCGCAACGGCATGATGGCTTGGAACTCACGGTCACGCGCCTGTTTGGCAGAGCCGCCCGCCGAGTCACCTTCCACTAGGAACAGTTCGGAGCGGTT ++3786:949/1 +BKGGKKJNJJ_______W__Y__W_TVPVP[YY[[_____BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@3786:949/2 
+CTCAACCAGAACACCGTGATCGGCGACCAGTTGGCGCAGTTCGCCATCAGAAATGCAGGGATGCGGATGCGGGCTAGCACGAAAGTCATCCTCAACACGAT ++3786:949/2 +ffcfcaffff\_edefddff[ffa_fRffRdc]Sdf]affehh_eaebBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4205:944/1 +NGATCTGGGCTTCAGCAAGACCGATGTCGGCGTGATTGCCAAGCATGCCGGACTCTGGCCGGCGGGGTTCGGCGGTGTGCTGGGTGGCTTGGGGGTGGGGG ++4205:944/1 +BLLLLTWTTR_V_______TYYYRYYYYYY____VWRWWW___BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4205:944/2 +GTCGACAGGTGCCTGTACACCACGCCAGGCCAGCCAGGCGAAACCGAGAACGGTCACCATCTGAACCAGACCGAAAACCAACAGTGCGGGGTTGAGCCACG ++4205:944/2 +hhhhhcffcWcdfdcffdffdfQadf[fLfc`Ra`Wcca]`^``]L[^QZGSQWUYZXK[`bJRbZb[_BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4534:934/1 +NAATGCCGGTATTTGGCACGATGGCGGCACGCTTCCACGACGACGGGGTGACCTCTCTCTATCAGGCGATGGCATCCAAATTGCACGCGCGGGGTTTGAGG ++4534:934/1 +BGGFGLJLJL______________V____________________YYYPQOTWVT__________BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4534:934/2 +GGTAATTGCGGACGGCTTCGGCAATTTCGGCCAGGTAGCGCACGCGCTTCGACGGAACGATGGCGCGCAGGTTCGACGATTGTCGAACGCTGATCAGCGCG ++4534:934/2 +ffffcff[fdhaghh[ffcahhghhhdhadhhhhg_hc[hf]fec]faa]bLb___`^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_end_merged_cleaned.fastqsanger Fri Dec 17 15:04:17 2010 +1000 @@ -0,0 +1,24 @@ +@1539:931/1 +NACATCAACACTCAGTAACGGCTGGCGCAAAATGGCATTGATTAACGAAGACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG ++1539:931/1 +BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@1539:931/2 +GCGCGTAACGTTTCACCTCGAGATCGTTGTCGGCCGCAATCTCCTGGGGGCGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT ++1539:931/2 +aee_dcadeeWcaaadJbdaff[fffc]dcfe[dRc^\[^QVOZXXZSPFWNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@2971:937/1 +NCGGAGACTTCGAGGCCATCCAGTCGATTGCCAAAGTCATCAAGGGGTCGACGATCTGCTCCCTTGCCCGTTCCAACGAGAATGAAATCCGCCGCGCGTGG ++2971:937/1 +BMQMMRRRSS__________XXXXXVVVVV_b___Y[Y[YXVRVWWPWVX_____BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB 
+@2971:937/2 +CTCGCACGGCCGCCTCGACCACTTGGTCTGGCGTCATGCGCAATTTTTTCTCCATGTGGAACGGGCTGGTGGCGATGAACGTATGAATATGCCCCCGCGCT ++2971:937/2 +hhhddhefhh_ffffhhhhfah_hhdUdcfW`fbbhfcaec_dfdbba````W^caaaJXGKXSUVYVZY^WY^BBBBBBBBBBBBBBBBBBBBBBBBBBB +@4534:934/1 +NAATGCCGGTATTTGGCACGATGGCGGCACGCTTCCACGACGACGGGGTGACCTCTCTCTATCAGGCGATGGCATCCAAATTGCACGCGCGGGGTTTGAGG ++4534:934/1 +BGGFGLJLJL______________V____________________YYYPQOTWVT__________BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4534:934/2 +GGTAATTGCGGACGGCTTCGGCAATTTCGGCCAGGTAGCGCACGCGCTTCGACGGAACGATGGCGCGCAGGTTCGACGATTGTCGAACGCTGATCAGCGCG ++4534:934/2 +ffffcff[fdhaghh[ffcahhghhhdhadhhhhg_hc[hf]fec]faa]bLb___`^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_end_merged_errors.fastqsanger Fri Dec 17 15:04:17 2010 +1000 @@ -0,0 +1,40 @@ +@1539:931/1 +NACATCAACACTCAGTAACGGCTGGCGCAAAATGGCATTGATTAACGAAGACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG ++1539:931/1 +BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@9999:931/2 +GCGCGTAACGTTTCACCTCGAGATCGTTGTCGGCCGCAATCTCCTGGGGGCGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT ++9999:931/2 +aee_dcadeeWcaaadJbdaff[fffc]dcfe[dRc^\[^QVOZXXZSPFWNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@2971:937/2 +CTCGCACGGCCGCCTCGACCACTTGGTCTGGCGTCATGCGCAATTTTTTCTCCATGTGGAACGGGCTGGTGGCGATGAACGTATGAATATGCCCCCGCGCT ++2971:937/2 +hhhddhefhh_ffffhhhhfah_hhdUdcfW`fbbhfcaec_dfdbba````W^caaaJXGKXSUVYVZY^WY^BBBBBBBBBBBBBBBBBBBBBBBBBBB +@2971:937/1 +NCGGAGACTTCGAGGCCATCCAGTCGATTGCCAAAGTCATCAAGGGGTCGACGATCTGCTCCCTTGCCCGTTCCAACGAGAATGAAATCCGCCGCGCGTGG ++2971:937/1 +BMQMMRRRSS__________XXXXXVVVVV_b___Y[Y[YXVRVWWPWVX_____BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@3786:949/1 +NTACCGCGCAACGGCATGATGGCTTGGAACTCACGGTCACGCGCCTGTTTGGCAGAGCCGCCCGCCGAGTCACCTTCCACTAGGAACAGTTCGGAGCGGTT ++3786:949/1 
+BKGGKKJNJJ_______W__Y__W_TVPVP[YY[[_____BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@3786:949/2 +CTCAACCAGAACACCGTGATCGGCGACCAGTTGGCGCAGTTCGCCATCAGAAATGCAGGGATGCGGATGCGGGCTAGCACGAAAGTCATCCTCAACACGAT ++3786:949/2 +ffcfcaffff\_edefddff[ffa_fRffRdc]Sdf]affehh_eaebBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4205:944/1 +NGATCTGGGCTTCAGCAAGACCGATGTCGGCGTGATTGCCAAGCATGCCGGACTCTGGCCGGCGGGGTTCGGCGGTGTGCTGGGTGGCTTGGGGGTGGGGG ++4205:944/1 +BLLLLTWTTR_V_______TYYYRYYYYYY____VWRWWW___BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4205:944/2 +GTCGACAGGTGCCTGTACACCACGCCAGGCCAGCCAGGCGAAACCGAGAACGGTCACCATCTGAACCAGACCGAAAACCAACAGTGCGGGGTTGAGCCACG ++4205:944/2 +hhhhhcffcWcdfdcffdffdfQadf[fLfc`Ra`Wcca]`^``]L[^QZGSQWUYZXK[`bJRbZb[_BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4534:934/1 +NAATGCCGGTATTTGGCACGATGGCGGCACGCTTCCACGACGACGGGGTGACCTCTCTCTATCAGGCGATGGCATCCAAATTGCACGCGCGGGGTTTGAGG ++4534:934/1 +BGGFGLJLJL______________V____________________YYYPQOTWVT__________BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4534:934/2 +GGTAATTGCGGACGGCTTCGGCAATTTCGGCCAGGTAGCGCACGCGCTTCGACGGAACGATGGCGCGCAGGTTCGACGATTGTCGAACGCTGATCAGCGCG ++4534:934/2 +ffffcff[fdhaghh[ffcahhghhhdhadhhhhg_hc[hf]fec]faa]bLb___`^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB --- a/tool_conf.xml.sample Thu Dec 16 17:12:01 2010 +1000 +++ b/tool_conf.xml.sample Fri Dec 17 15:04:17 2010 +1000 @@ -260,6 +260,8 @@ <tool file="fastq/fastq_trimmer.xml" /><tool file="fastq/fastq_trimmer_by_quality.xml" /><tool file="fastq/fastq_masker_by_quality.xml" /> + <tool file="fastq/fastq_paired_end_interlacer.xml" /> + <tool file="fastq/fastq_paired_end_deinterlacer.xml" /><tool file="fastq/fastq_manipulation.xml" /><tool file="fastq/fastq_to_fasta.xml" /><tool file="fastq/fastq_to_tabular.xml" /> --- a/tools/fastq/fastq_paired_end_deinterlacer.py Thu Dec 16 17:12:01 2010 +1000 +++ b/tools/fastq/fastq_paired_end_deinterlacer.py Fri Dec 17 15:04:17 2010 +1000 @@ -23,17 +23,16 @@ del found[mate1.identifier] continue 
- mate2_id, mate2_is_first = joiner.get_paired_identifier( mate1 ) + mate2 = input.get( joiner.get_paired_identifier( mate1 ) ) - mate2 = input.get( mate2_id ) if mate2: - found[mate2_id] = None - if mate2_is_first: + found[mate2.identifier] = None + if joiner.is_first_mate( mate1 ): + out1.write( mate1 ) + out2.write( mate2 ) + else: out1.write( mate2 ) out2.write( mate1 ) - else: - out1.write( mate1 ) - out2.write( mate2 ) else: skip_count += 1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/fastq/fastq_paired_end_deinterlacer.xml Fri Dec 17 15:04:17 2010 +1000 @@ -0,0 +1,64 @@ +<tool id="fastq_paired_end_deinterlacer" name="FASTQ de-interlacer" version="1.0.0"> + <description>on paired end reads</description> + <command interpreter="python">fastq_paired_end_deinterlacer.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$output1_file' '$output2_file'</command> + <inputs> + <param name="input1_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ reads" /> + </inputs> + <outputs> + <data name="output1_file" format="input" /> + <data name="output2_file" format="input" /> + </outputs> + <tests> + <test> + <param name="input1_file" value="paired_end_merged.fastqsanger" ftype="fastqsanger" /> + <output name="output1_file" file="paired_end_1.fastqsanger" /> + <output name="output2_file" file="paired_end_2.fastqsanger" /> + </test> + <test> + <param name="input1_file" value="paired_end_merged_errors.fastqsanger" ftype="fastqsanger" /> + <output name="output1_file" file="paired_end_1_cleaned.fastqsanger" /> + <output name="output2_file" file="paired_end_2_cleaned.fastqsanger" /> + </test> + </tests> + <help> +**What it does** + +De-interlaces a single fastq dataset representing paired-end run into two fastq datasets containing only the first or second mate read. Reads without mate are excluded from the output files. + +Sequence identifiers for paired-end reads must follow the /1 and /2 convention. 
+ +----- + +**Input** + +A multiple-fastq file containing paired-end reads, for example:: + + @1539:931/1 + ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG + +1539:931/1 + BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB + @1539:931/2 + CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT + +1539:931/2 + WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB + +----- + +**Output** + +Multi-fastq file with left-hand mate only:: + + @1539:931/1 + ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG + +1539:931/1 + BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB + +Multi-fastq file with right-hand mate only:: + + @1539:931/2 + CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT + +1539:931/2 + WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB + + </help> +</tool> --- a/tools/fastq/fastq_paired_end_interlacer.py Thu Dec 16 17:12:01 2010 +1000 +++ b/tools/fastq/fastq_paired_end_interlacer.py Fri Dec 17 15:04:17 2010 +1000 @@ -21,9 +21,7 @@ i = None skip_count = 0 for i, mate1 in enumerate( fastqReader( open( mate1_filename, 'rb' ), format = type ) ): - mate2 = mate_input.get( joiner.get_paired_identifier( mate1 ) ) - if mate2: out.write( mate1 ) out.write( mate2 ) --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/fastq/fastq_paired_end_interlacer.xml Fri Dec 17 15:04:17 2010 +1000 @@ -0,0 +1,64 @@ +<tool id="fastq_paired_end_interlacer" name="FASTQ interlacer" version="1.0.0"> + <description>on paired end reads</description> + <command interpreter="python">fastq_paired_end_interlacer.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$output_file'</command> + <inputs> + <param name="input1_file" type="data" format="fastqsanger,fastqcssanger" label="Left-hand mates" /> + <param name="input2_file" type="data" format="fastqsanger,fastqcssanger" label="Right-hand mates" /> + </inputs> + <outputs> + <data name="output_file" format="input" /> + </outputs> + <tests> + <test> + <param 
name="input1_file" value="paired_end_1.fastqsanger" ftype="fastqsanger" /> + <param name="input2_file" value="paired_end_2.fastqsanger" ftype="fastqsanger" /> + <output name="output_file" file="paired_end_merged.fastqsanger" /> + </test> + <test> + <param name="input1_file" value="paired_end_1_errors.fastqsanger" ftype="fastqsanger" /> + <param name="input2_file" value="paired_end_2_errors.fastqsanger" ftype="fastqsanger" /> + <output name="output_file" file="paired_end_merged_cleaned.fastqsanger" /> + </test> + </tests> + <help> +**What it does** + +This tool joins paired end FASTQ reads from two separate files, one with the left mates and one with the right mates, into a single files where letf mates alternate with their right mate. The join is performed using sequence identifiers, allowing the two files to contain differing ordering. If a sequence identifier does not appear in both files, it is excluded from the output. + +Sequence identifiers with /1 and /2 appended override the left-hand and right-hand designation; i.e. if the reads end with /1 and /2, the read containing /1 will be used as the left-hand read and the read containing /2 will be used as the right-hand read. Sequences without this designation will follow the left-hand and right-hand settings set by the user. 
+ +----- + +**Input** + +Left-hand mates, for example:: + + @1539:931/1 + ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG + +1539:931/1 + BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB + +Right-hand mates, for example:: + + @1539:931/2 + CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT + +1539:931/2 + WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB + +----- + +**Output** + +A multiple-fastq file containing interlaced left and right paired reads:: + + @1539:931/1 + ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG + +1539:931/1 + BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB + @1539:931/2 + CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT + +1539:931/2 + WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB + + </help> +</tool> http://bitbucket.org/galaxy/galaxy-central/changeset/bc292ff9d647/ changeset: bc292ff9d647 user: fangly date: 2011-05-17 09:26:04 summary: Interlacer and de-interlacer now keep track or single reads (that have no mate) affected #: 7 files (4.0 KB) --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_end_1_cleaned_singles.fastqsanger Tue May 17 17:26:04 2011 +1000 @@ -0,0 +1,4 @@ +@1539:931/1 +NACATCAACACTCAGTAACGGCTGGCGCAAAATGGCATTGATTAACGAAGACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG ++1539:931/1 +BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_end_2_cleaned_singles.fastqsanger Tue May 17 17:26:04 2011 +1000 @@ -0,0 +1,4 @@ +@9999:931/2 +GCGCGTAACGTTTCACCTCGAGATCGTTGTCGGCCGCAATCTCCTGGGGGCGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT ++9999:931/2 +aee_dcadeeWcaaadJbdaff[fffc]dcfe[dRc^\[^QVOZXXZSPFWNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_end_merged_cleaned_singles.fastqsanger Tue May 17 17:26:04 2011 +1000 @@ -0,0 +1,16 @@ +@3786:949/1 
+NTACCGCGCAACGGCATGATGGCTTGGAACTCACGGTCACGCGCCTGTTTGGCAGAGCCGCCCGCCGAGTCACCTTCCACTAGGAACAGTTCGGAGCGGTT ++3786:949/1 +BKGGKKJNJJ_______W__Y__W_TVPVP[YY[[_____BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@9999:944/1 +NGATCTGGGCTTCAGCAAGACCGATGTCGGCGTGATTGCCAAGCATGCCGGACTCTGGCCGGCGGGGTTCGGCGGTGTGCTGGGTGGCTTGGGGGTGGGGG ++9999:944/1 +BLLLLTWTTR_V_______TYYYRYYYYYY____VWRWWW___BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@9999:949/2 +CTCAACCAGAACACCGTGATCGGCGACCAGTTGGCGCAGTTCGCCATCAGAAATGCAGGGATGCGGATGCGGGCTAGCACGAAAGTCATCCTCAACACGAT ++9999:949/2 +ffcfcaffff\_edefddff[ffa_fRffRdc]Sdf]affehh_eaebBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@4205:944/2 +GTCGACAGGTGCCTGTACACCACGCCAGGCCAGCCAGGCGAAACCGAGAACGGTCACCATCTGAACCAGACCGAAAACCAACAGTGCGGGGTTGAGCCACG ++4205:944/2 +hhhhhcffcWcdfdcffdffdfQadf[fLfc`Ra`Wcca]`^``]L[^QZGSQWUYZXK[`bJRbZb[_BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB --- a/tools/fastq/fastq_paired_end_deinterlacer.py Fri Dec 17 15:04:17 2010 +1000 +++ b/tools/fastq/fastq_paired_end_deinterlacer.py Tue May 17 17:26:04 2011 +1000 @@ -3,16 +3,20 @@ from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner def main(): - input_filename = sys.argv[1] - input_type = sys.argv[2] or 'sanger' - mate1_filename = sys.argv[3] - mate2_filename = sys.argv[4] + input_filename = sys.argv[1] + input_type = sys.argv[2] or 'sanger' + mate1_filename = sys.argv[3] + mate2_filename = sys.argv[4] + single1_filename = sys.argv[5] + single2_filename = sys.argv[6] - type = input_type - input = fastqNamedReader( open( input_filename, 'rb' ), format = type ) - out1 = fastqWriter( open( mate1_filename, 'wb' ), format = type ) - out2 = fastqWriter( open( mate2_filename, 'wb' ), format = type ) - joiner = fastqJoiner( type ) + type = input_type + input = fastqNamedReader( open( input_filename, 'rb' ), format = type ) + mate1_out = fastqWriter( open( mate1_filename, 'wb' ), format = type ) + mate2_out = fastqWriter( open( 
mate2_filename, 'wb' ), format = type ) + single1_out = fastqWriter( open( single1_filename, 'wb' ), format = type ) + single2_out = fastqWriter( open( single2_filename, 'wb' ), format = type ) + joiner = fastqJoiner( type ) i = None skip_count = 0 @@ -26,27 +30,35 @@ mate2 = input.get( joiner.get_paired_identifier( mate1 ) ) if mate2: + # This is a mate pair found[mate2.identifier] = None if joiner.is_first_mate( mate1 ): - out1.write( mate1 ) - out2.write( mate2 ) + mate1_out.write( mate1 ) + mate2_out.write( mate2 ) else: - out1.write( mate2 ) - out2.write( mate1 ) + mate1_out.write( mate2 ) + mate2_out.write( mate1 ) else: + # This is a single skip_count += 1 + if joiner.is_first_mate( mate1 ): + single1_out.write( mate1 ) + else: + single2_out.write( mate1 ) if i is None: print "Your input file contained no valid FASTQ sequences." else: if skip_count: - print '%i reads had no mate.' % skip_count + print 'There were %i reads with no mate.' % skip_count print 'De-interlaced %s pairs of sequences.' 
% ( (i - skip_count + 1)/2 ) input.close() - out1.close() - out2.close() - + mate1_out.close() + mate2_out.close() + single1_out.close() + single2_out.close() + if __name__ == "__main__": main() --- a/tools/fastq/fastq_paired_end_deinterlacer.xml Fri Dec 17 15:04:17 2010 +1000 +++ b/tools/fastq/fastq_paired_end_deinterlacer.xml Tue May 17 17:26:04 2011 +1000 @@ -1,29 +1,35 @@ <tool id="fastq_paired_end_deinterlacer" name="FASTQ de-interlacer" version="1.0.0"><description>on paired end reads</description> - <command interpreter="python">fastq_paired_end_deinterlacer.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$output1_file' '$output2_file'</command> + <command interpreter="python">fastq_paired_end_deinterlacer.py '$input_file' '${input_file.extension[len( 'fastq' ):]}' '$output1_pairs_file' '$output2_pairs_file' '$output1_singles_file' '$output2_singles_file'</command><inputs> - <param name="input1_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ reads" /> + <param name="input_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ reads" /></inputs><outputs> - <data name="output1_file" format="input" /> - <data name="output2_file" format="input" /> + <data name="output1_pairs_file" format="input" label="FASTQ de-interlacer left mates from data ${input_file.hid}" /> + <data name="output2_pairs_file" format="input" label="FASTQ de-interlacer right mates from data ${input_file.hid}"/> + <data name="output1_singles_file" format="input" label="FASTQ de-interlacer left singles from data ${input_file.hid}"/> + <data name="output2_singles_file" format="input" label="FASTQ de-interlacer right singles from data ${input_file.hid}"/></outputs><tests><test> - <param name="input1_file" value="paired_end_merged.fastqsanger" ftype="fastqsanger" /> - <output name="output1_file" file="paired_end_1.fastqsanger" /> - <output name="output2_file" file="paired_end_2.fastqsanger" /> + <param name="input_file" 
value="paired_end_merged.fastqsanger" ftype="fastqsanger" /> + <output name="output1_pairs_file" file="paired_end_1.fastqsanger" /> + <output name="output2_pairs_file" file="paired_end_2.fastqsanger" /> + <output name="output1_singles_file" file="paired_end_1_singles.fastqsanger" /> + <output name="output2_singles_file" file="paired_end_2_singles.fastqsanger" /></test><test> - <param name="input1_file" value="paired_end_merged_errors.fastqsanger" ftype="fastqsanger" /> - <output name="output1_file" file="paired_end_1_cleaned.fastqsanger" /> - <output name="output2_file" file="paired_end_2_cleaned.fastqsanger" /> + <param name="input_file" value="paired_end_merged_errors.fastqsanger" ftype="fastqsanger" /> + <output name="output1_pairs_file" file="paired_end_1_cleaned.fastqsanger" /> + <output name="output2_pairs_file" file="paired_end_2_cleaned.fastqsanger" /> + <output name="output1_singles_file" file="paired_end_1_cleaned_singles.fastqsanger" /> + <output name="output2_singles_file" file="paired_end_2_cleaned_singles.fastqsanger" /></test></tests><help> **What it does** -De-interlaces a single fastq dataset representing paired-end run into two fastq datasets containing only the first or second mate read. Reads without mate are excluded from the output files. +De-interlaces a single fastq dataset representing paired-end run into two fastq datasets containing only the first or second mate read. Reads without mate are saved in separate output files. Sequence identifiers for paired-end reads must follow the /1 and /2 convention. 
--- a/tools/fastq/fastq_paired_end_interlacer.py Fri Dec 17 15:04:17 2010 +1000 +++ b/tools/fastq/fastq_paired_end_interlacer.py Tue May 17 17:26:04 2011 +1000 @@ -3,11 +3,12 @@ from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner def main(): - mate1_filename = sys.argv[1] - mate1_type = sys.argv[2] or 'sanger' - mate2_filename = sys.argv[3] - mate2_type = sys.argv[4] or 'sanger' - output_filename = sys.argv[5] + mate1_filename = sys.argv[1] + mate1_type = sys.argv[2] or 'sanger' + mate2_filename = sys.argv[3] + mate2_type = sys.argv[4] or 'sanger' + outfile_pairs = sys.argv[5] + outfile_singles = sys.argv[6] if mate1_type != mate2_type: print "WARNING: You are trying to interlace files of two different types: %s and %s." % ( mate1_type, mate2_type ) @@ -15,29 +16,43 @@ type = mate1_type joiner = fastqJoiner( type ) - out = fastqWriter( open( output_filename, 'wb' ), format = type ) - mate_input = fastqNamedReader( open( mate2_filename, 'rb' ), format = type ) + out_pairs = fastqWriter( open( outfile_pairs, 'wb' ), format = type ) + out_singles = fastqWriter( open( outfile_singles, 'wb' ), format = type ) + # Pairs + singles present in mate1 + nof_singles = 0 + nof_pairs = 0 + mate2_input = fastqNamedReader( open( mate2_filename, 'rb' ), format = type ) i = None - skip_count = 0 for i, mate1 in enumerate( fastqReader( open( mate1_filename, 'rb' ), format = type ) ): - mate2 = mate_input.get( joiner.get_paired_identifier( mate1 ) ) + mate2 = mate2_input.get( joiner.get_paired_identifier( mate1 ) ) if mate2: - out.write( mate1 ) - out.write( mate2 ) + out_pairs.write( mate1 ) + out_pairs.write( mate2 ) + nof_pairs += 1 else: - skip_count += 1 + out_singles.write( mate1 ) + nof_singles += 1 - if i is None: - print "Your input file contained no valid FASTQ sequences." 
+ # Singles present in mate2 + mate1_input = fastqNamedReader( open( mate1_filename, 'rb' ), format = type ) + j = None + for j, mate2 in enumerate( fastqReader( open( mate2_filename, 'rb' ), format = type ) ): + mate1 = mate1_input.get( joiner.get_paired_identifier( mate2 ) ) + if not mate1: + out_singles.write( mate2 ) + nof_singles += 1 + + if (i is None) and (j is None): + print "Your input files contained no valid FASTQ sequences." else: - not_used_msg = mate_input.has_data() - if not_used_msg: - print not_used_msg - print 'Interlaced %s pairs of sequences.' % ( i - skip_count + 1 ) + print 'There were %s single reads.' % ( nof_singles ) + print 'Interlaced %s pairs of sequences.' % ( nof_pairs ) - mate_input.close() - out.close() + mate1_input.close() + mate2_input.close() + out_pairs.close() + out_singles.close() if __name__ == "__main__": --- a/tools/fastq/fastq_paired_end_interlacer.xml Fri Dec 17 15:04:17 2010 +1000 +++ b/tools/fastq/fastq_paired_end_interlacer.xml Tue May 17 17:26:04 2011 +1000 @@ -1,29 +1,35 @@ <tool id="fastq_paired_end_interlacer" name="FASTQ interlacer" version="1.0.0"><description>on paired end reads</description> - <command interpreter="python">fastq_paired_end_interlacer.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$output_file'</command> + <command interpreter="python">fastq_paired_end_interlacer.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$outfile_pairs' '$outfile_singles'</command><inputs><param name="input1_file" type="data" format="fastqsanger,fastqcssanger" label="Left-hand mates" /><param name="input2_file" type="data" format="fastqsanger,fastqcssanger" label="Right-hand mates" /></inputs><outputs> - <data name="output_file" format="input" /> + <!-- $input1_file.name = filename , e.g. paired_end_2_errors.fastqsanger --> + <!-- $input1_file.id = ID , e.g. 
10 --> + <!-- $input1_file.hid = history ID, e.g. 5 --> + <data name="outfile_pairs" format="input" label="FASTQ interlacer pairs from data ${input1_file.hid} and data ${input2_file.hid}"/> + <data name="outfile_singles" format="input" label="FASTQ interlacer singles from data data ${input1_file.hid} and data ${input2_file.hid}"/></outputs><tests><test><param name="input1_file" value="paired_end_1.fastqsanger" ftype="fastqsanger" /><param name="input2_file" value="paired_end_2.fastqsanger" ftype="fastqsanger" /> - <output name="output_file" file="paired_end_merged.fastqsanger" /> + <output name="outfile_pairs" file="paired_end_merged.fastqsanger" /> + <output name="outfile_singles" file="paired_end_merged_singles.fastqsanger" /></test><test><param name="input1_file" value="paired_end_1_errors.fastqsanger" ftype="fastqsanger" /><param name="input2_file" value="paired_end_2_errors.fastqsanger" ftype="fastqsanger" /> - <output name="output_file" file="paired_end_merged_cleaned.fastqsanger" /> + <output name="outfile_pairs" file="paired_end_merged_cleaned.fastqsanger" /> + <output name="outfile_singles" file="paired_end_merged_cleaned_singles.fastqsanger" /></test></tests><help> **What it does** -This tool joins paired end FASTQ reads from two separate files, one with the left mates and one with the right mates, into a single files where letf mates alternate with their right mate. The join is performed using sequence identifiers, allowing the two files to contain differing ordering. If a sequence identifier does not appear in both files, it is excluded from the output. +This tool joins paired end FASTQ reads from two separate files, one with the left mates and one with the right mates, into a single files where left mates alternate with their right mates. The join is performed using sequence identifiers, allowing the two files to contain differing ordering. If a sequence identifier does not appear in both files, it is included in a separate file. 
Sequence identifiers with /1 and /2 appended override the left-hand and right-hand designation; i.e. if the reads end with /1 and /2, the read containing /1 will be used as the left-hand read and the read containing /2 will be used as the right-hand read. Sequences without this designation will follow the left-hand and right-hand settings set by the user. @@ -60,5 +66,7 @@ +1539:931/2 WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +A multiple-fastq file containing reads that have no mate is also produced. + </help></tool> http://bitbucket.org/galaxy/galaxy-central/changeset/de86763942a3/ changeset: de86763942a3 user: fangly date: 2011-05-17 09:54:04 summary: Updated tool wrapper version number affected #: 2 files (4 bytes) --- a/tools/fastq/fastq_paired_end_deinterlacer.xml Tue May 17 17:26:04 2011 +1000 +++ b/tools/fastq/fastq_paired_end_deinterlacer.xml Tue May 17 17:54:04 2011 +1000 @@ -1,4 +1,4 @@ -<tool id="fastq_paired_end_deinterlacer" name="FASTQ de-interlacer" version="1.0.0"> +<tool id="fastq_paired_end_deinterlacer" name="FASTQ de-interlacer" version="1.1"><description>on paired end reads</description><command interpreter="python">fastq_paired_end_deinterlacer.py '$input_file' '${input_file.extension[len( 'fastq' ):]}' '$output1_pairs_file' '$output2_pairs_file' '$output1_singles_file' '$output2_singles_file'</command><inputs> --- a/tools/fastq/fastq_paired_end_interlacer.xml Tue May 17 17:26:04 2011 +1000 +++ b/tools/fastq/fastq_paired_end_interlacer.xml Tue May 17 17:54:04 2011 +1000 @@ -1,4 +1,4 @@ -<tool id="fastq_paired_end_interlacer" name="FASTQ interlacer" version="1.0.0"> +<tool id="fastq_paired_end_interlacer" name="FASTQ interlacer" version="1.1"><description>on paired end reads</description><command interpreter="python">fastq_paired_end_interlacer.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$outfile_pairs' '$outfile_singles'</command><inputs> 
http://bitbucket.org/galaxy/galaxy-central/changeset/36ced92e1da8/ changeset: 36ced92e1da8 user: kanwei date: 2011-06-10 19:15:17 summary: Typo affected #: 1 file (5 bytes) --- a/tools/fastq/fastq_paired_end_interlacer.xml Tue May 17 17:54:04 2011 +1000 +++ b/tools/fastq/fastq_paired_end_interlacer.xml Fri Jun 10 13:15:17 2011 -0400 @@ -10,7 +10,7 @@ <!-- $input1_file.id = ID , e.g. 10 --><!-- $input1_file.hid = history ID, e.g. 5 --><data name="outfile_pairs" format="input" label="FASTQ interlacer pairs from data ${input1_file.hid} and data ${input2_file.hid}"/> - <data name="outfile_singles" format="input" label="FASTQ interlacer singles from data data ${input1_file.hid} and data ${input2_file.hid}"/> + <data name="outfile_singles" format="input" label="FASTQ interlacer singles from data ${input1_file.hid} and data ${input2_file.hid}"/></outputs><tests><test> Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
- Bitbucket