commit/galaxy-central: simleo: tools/fastq/fastq_paired_end_joiner: added support for recent Illumina headers
1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/e232573b7c62/ Changeset: e232573b7c62 User: simleo Date: 2013-06-06 17:33:46 Summary: tools/fastq/fastq_paired_end_joiner: added support for recent Illumina headers Affected #: 2 files diff -r af91c734dbb76b8c0e3166c7471370f9ba8260b5 -r e232573b7c629a94ab60e6b615ecedf156913308 tools/fastq/fastq_paired_end_joiner.py --- a/tools/fastq/fastq_paired_end_joiner.py +++ b/tools/fastq/fastq_paired_end_joiner.py @@ -1,38 +1,162 @@ -#Dan Blankenberg -import sys, os, shutil -from galaxy_utils.sequence.fastq import fastqReader, fastqNamedReader, fastqWriter, fastqJoiner +""" +Extended version of Dan Blankenberg's fastq joiner (adds support for +recent Illumina headers). +""" + +import sys, re +import galaxy_utils.sequence.fastq as fq + + +class IDManager(object): + + def __init__(self, sep="\t"): + """ + Recent Illumina FASTQ header format:: + + @<COORDS><FLAGS> + COORDS = <Instrument>:<Run #>:<Flowcell ID>:<Lane>:<Tile>:<X>:<Y> + FLAGS = <Read>:<Is Filtered>:<Control Number>:<Index Sequence> + + where the whitespace character between <COORDS> and <FLAGS> can be + either a space or a tab. + """ + self.sep = sep + + def parse_id(self, identifier): + try: + coords, flags = identifier.strip()[1:].split(self.sep, 1) + except ValueError: + raise RuntimeError("bad identifier: %r" % (identifier,)) + return coords.split(":"), flags.split(":") + + def join_id(self, parsed_id): + coords, flags = parsed_id + return "@%s%s%s" % (":".join(coords), self.sep, ":".join(flags)) + + def get_read_number(self, parsed_id): + return int(parsed_id[1][0]) + + def set_read_number(self, parsed_id, n): + parsed_id[1][0] = "%d" % n + + def get_paired_identifier(self, read): + t = self.parse_id(read.identifier) + n = self.get_read_number(t) + if n == 1: + pn = 2 + elif n == 2: + pn = 1 + else: + raise RuntimeError("Unknown read number '%d'" % n) + self.set_read_number(t, pn) + return self.join_id(t) + + +class FastqJoiner(fq.fastqJoiner): + + def __init__(self, format, force_quality_encoding=None, sep="\t"): + super(FastqJoiner, self).__init__(format, force_quality_encoding) + self.id_manager = IDManager(sep) + + def join(self, read1, read2): + force_quality_encoding = self.force_quality_encoding + if not force_quality_encoding: + if read1.is_ascii_encoded(): + force_quality_encoding = 'ascii' + else: + force_quality_encoding = 'decimal' + read1 = read1.convert_read_to_format( + self.format, force_quality_encoding=force_quality_encoding + ) + read2 = read2.convert_read_to_format( + self.format, force_quality_encoding=force_quality_encoding + ) + #-- + t1, t2 = [ + self.id_manager.parse_id(r.identifier) for r in (read1, read2) + ] + if self.id_manager.get_read_number(t1) == 2: + if not self.id_manager.get_read_number(t2) == 1: + raise RuntimeError("input files are not from mated pairs") + read1, read2 = read2, read1 + t1, t2 = t2, t1 + #-- + rval = fq.FASTQ_FORMATS[self.format]() + rval.identifier = read1.identifier + rval.description = "+" + if len(read1.description) > 1: + rval.description += rval.identifier[1:] + if rval.sequence_space == 'color': + # convert to nuc space, join, then convert back + rval.sequence = rval.convert_base_to_color_space( + read1.convert_color_to_base_space(read1.sequence) + + read2.convert_color_to_base_space(read2.sequence) + ) + else: + rval.sequence = read1.sequence + read2.sequence + if force_quality_encoding == 'ascii': + rval.quality = read1.quality + read2.quality + else: + rval.quality = "%s %s" % ( + read1.quality.strip(), read2.quality.strip() + ) + return rval + + def get_paired_identifier(self, read): + return self.id_manager.get_paired_identifier(read) + + +def sniff_sep(fastq_fn): + header = "" + with open(fastq_fn) as f: + while header == "": + try: + header = f.next().strip() + except StopIteration: + raise RuntimeError("%r: empty file" % (fastq_fn,)) + return re.search(r"\s", header).group() + def main(): - #Read command line arguments input1_filename = sys.argv[1] input1_type = sys.argv[2] or 'sanger' input2_filename = sys.argv[3] input2_type = sys.argv[4] or 'sanger' output_filename = sys.argv[5] - + fastq_style = sys.argv[6] or 'old' + #-- if input1_type != input2_type: - print "WARNING: You are trying to join files of two different types: %s and %s." % ( input1_type, input2_type ) - - input2 = fastqNamedReader( open( input2_filename, 'rb' ), input2_type ) - joiner = fastqJoiner( input1_type ) - out = fastqWriter( open( output_filename, 'wb' ), format = input1_type ) - + print "WARNING: trying to join files of different types: %s and %s" % ( + input1_type, input2_type + ) + if fastq_style == 'new': + sep = sniff_sep(input1_filename) + joiner = FastqJoiner(input1_type, sep=sep) + else: + joiner = fq.fastqJoiner(input1_type) + #-- + input2 = fq.fastqNamedReader(open(input2_filename, 'rb'), input2_type) + out = fq.fastqWriter(open(output_filename, 'wb'), format=input1_type) i = None skip_count = 0 - for i, fastq_read in enumerate( fastqReader( open( input1_filename, 'rb' ), format = input1_type ) ): - identifier = joiner.get_paired_identifier( fastq_read ) - fastq_paired = input2.get( identifier ) + for i, fastq_read in enumerate(fq.fastqReader( + open(input1_filename, 'rb' ), format=input1_type + )): + identifier = joiner.get_paired_identifier(fastq_read) + fastq_paired = input2.get(identifier) if fastq_paired is None: skip_count += 1 else: - out.write( joiner.join( fastq_read, fastq_paired ) ) + out.write(joiner.join(fastq_read, fastq_paired)) out.close() - if i is None: - print "Your file contains no valid FASTQ reads." + print "Your file contains no valid FASTQ reads" else: print input2.has_data() - print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 ) + print 'Joined %s of %s read pairs (%.2f%%)' % ( + i - skip_count + 1, i + 1, (i - skip_count + 1) / (i + 1) * 100.0 + ) + if __name__ == "__main__": main() diff -r af91c734dbb76b8c0e3166c7471370f9ba8260b5 -r e232573b7c629a94ab60e6b615ecedf156913308 tools/fastq/fastq_paired_end_joiner.xml --- a/tools/fastq/fastq_paired_end_joiner.xml +++ b/tools/fastq/fastq_paired_end_joiner.xml @@ -1,9 +1,13 @@ -<tool id="fastq_paired_end_joiner" name="FASTQ joiner" version="1.0.0"> +<tool id="fastq_paired_end_joiner" name="FASTQ joiner" version="2.0.0"><description>on paired end reads</description> - <command interpreter="python">fastq_paired_end_joiner.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$output_file'</command> + <command interpreter="python">fastq_paired_end_joiner.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$output_file' '$style'</command><inputs><param name="input1_file" type="data" format="fastqsanger,fastqcssanger" label="Left-hand Reads" /><param name="input2_file" type="data" format="fastqsanger,fastqcssanger" label="Right-hand Reads" /> + <param name="style" type="select" label="FASTQ Header Style"> + <option value="old" selected="true">old</option> + <option value="new">new</option> + </param></inputs><outputs><data name="output_file" format="input" /> @@ -18,14 +22,19 @@ <help> **What it does** -This tool joins paired end FASTQ reads from two separate files into a single read in one file. The join is performed using sequence identifiers, allowing the two files to contain differing ordering. If a sequence identifier does not appear in both files, it is excluded from the output. - -Sequence identifiers with /1 and /2 appended override the left-hand and right-hand designation; i.e. if the reads end with /1 and /2, the read containing /1 will be used as the left-hand read and the read containing /2 will be used as the right-hand read. Sequences without this designation will follow the left-hand and right-hand settings set by the user. +This tool joins paired end FASTQ reads from two separate files into a +single read in one file. The join is performed using sequence +identifiers, allowing the two files to contain differing ordering. If +a sequence identifier does not appear in both files, it is excluded +from the output. ----- **Input formats** +Both old and new (from recent Illumina software) style FASTQ headers +are supported. The following example uses the "old" style. + Left-hand Read:: @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 @@ -53,10 +62,26 @@ ------ -**Citation** +**The "new" style** -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ +Recent Illumina FASTQ headers are structured as follows:: + @COORDS FLAGS + COORDS = INSTRUMENT:RUN_#:FLOWCELL_ID:LANE:TILE:X:Y + FLAGS = READ:IS_FILTERED:CONTROL_NUMBER:INDEX_SEQUENCE +where the whitespace character between COORDS and FLAGS can be either +a space or a tab. + +------ + +**Credits** + +This is an extended version (adds support for "new" style FASTQ headers) +of D. Blankenberg's fastq joiner: + +`Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ + +New style header support added by Simone Leo <simone.leo@crs4.it> </help></tool> Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
commits-noreply@bitbucket.org