commit/galaxy-central: simleo: tools/fastq/fastq_paired_end_joiner: added support for recent Illumina headers

19 Dec 2013

1 new commit in galaxy-central:

https://bitbucket.org/galaxy/galaxy-central/commits/e232573b7c62/
Changeset:   e232573b7c62
User:        simleo
Date:        2013-06-06 17:33:46
Summary:     tools/fastq/fastq_paired_end_joiner: added support for recent Illumina headers
Affected #:  2 files

diff -r af91c734dbb76b8c0e3166c7471370f9ba8260b5 -r e232573b7c629a94ab60e6b615ecedf156913308 tools/fastq/fastq_paired_end_joiner.py

--- a/tools/fastq/fastq_paired_end_joiner.py
+++ b/tools/fastq/fastq_paired_end_joiner.py
@@ -1,38 +1,162 @@
-#Dan Blankenberg
-import sys, os, shutil
-from galaxy_utils.sequence.fastq import fastqReader, fastqNamedReader, fastqWriter, fastqJoiner
+"""
+Extended version of Dan Blankenberg's fastq joiner (adds support for
+recent Illumina headers).
+"""
+
+import sys, re
+import galaxy_utils.sequence.fastq as fq
+
+
+class IDManager(object):
+
+    def __init__(self, sep="\t"):
+        """
+        Recent Illumina FASTQ header format::
+
+          @<COORDS><FLAGS>
+          COORDS = <Instrument>:<Run #>:<Flowcell ID>:<Lane>:<Tile>:<X>:<Y>
+          FLAGS = <Read>:<Is Filtered>:<Control Number>:<Index Sequence>
+
+        where the whitespace character between <COORDS> and <FLAGS> can be
+        either a space or a tab.
+        """
+        self.sep = sep
+
+    def parse_id(self, identifier):
+        try:
+            coords, flags = identifier.strip()[1:].split(self.sep, 1)
+        except ValueError:
+            raise RuntimeError("bad identifier: %r" % (identifier,))
+        return coords.split(":"), flags.split(":")
+
+    def join_id(self, parsed_id):
+        coords, flags = parsed_id
+        return "@%s%s%s" % (":".join(coords), self.sep, ":".join(flags))
+
+    def get_read_number(self, parsed_id):
+        return int(parsed_id[1][0])
+
+    def set_read_number(self, parsed_id, n):
+        parsed_id[1][0] = "%d" % n
+
+    def get_paired_identifier(self, read):
+        t = self.parse_id(read.identifier)
+        n = self.get_read_number(t)
+        if n == 1:
+            pn = 2
+        elif n == 2:
+            pn = 1
+        else:
+            raise RuntimeError("Unknown read number '%d'" % n)
+        self.set_read_number(t, pn)
+        return self.join_id(t)
+
+
+class FastqJoiner(fq.fastqJoiner):
+
+    def __init__(self, format, force_quality_encoding=None, sep="\t"):
+        super(FastqJoiner, self).__init__(format, force_quality_encoding)
+        self.id_manager = IDManager(sep)
+
+    def join(self, read1, read2):
+        force_quality_encoding = self.force_quality_encoding
+        if not force_quality_encoding:
+            if read1.is_ascii_encoded():
+                force_quality_encoding = 'ascii'
+            else:
+                force_quality_encoding = 'decimal'
+        read1 = read1.convert_read_to_format(
+            self.format, force_quality_encoding=force_quality_encoding
+            )
+        read2 = read2.convert_read_to_format(
+            self.format, force_quality_encoding=force_quality_encoding
+            )
+        #--
+        t1, t2 = [
+            self.id_manager.parse_id(r.identifier) for r in (read1, read2)
+            ]
+        if self.id_manager.get_read_number(t1) == 2:
+            if not self.id_manager.get_read_number(t2) == 1:
+                raise RuntimeError("input files are not from mated pairs")
+            read1, read2 = read2, read1
+            t1, t2 = t2, t1
+        #--
+        rval = fq.FASTQ_FORMATS[self.format]()
+        rval.identifier = read1.identifier
+        rval.description = "+"
+        if len(read1.description) > 1:
+            rval.description += rval.identifier[1:]
+        if rval.sequence_space == 'color':
+            # convert to nuc space, join, then convert back
+            rval.sequence = rval.convert_base_to_color_space(
+                read1.convert_color_to_base_space(read1.sequence) +
+                read2.convert_color_to_base_space(read2.sequence)
+                )
+        else:
+            rval.sequence = read1.sequence + read2.sequence
+        if force_quality_encoding == 'ascii':
+            rval.quality = read1.quality + read2.quality
+        else:
+            rval.quality = "%s %s" % (
+                read1.quality.strip(), read2.quality.strip()
+                )
+        return rval
+
+    def get_paired_identifier(self, read):
+        return self.id_manager.get_paired_identifier(read)
+
+
+def sniff_sep(fastq_fn):
+    header = ""
+    with open(fastq_fn) as f:
+        while header == "":
+            try:
+                header = f.next().strip()
+            except StopIteration:
+                raise RuntimeError("%r: empty file" % (fastq_fn,))
+    return re.search(r"\s", header).group()
+
 
 def main():
-    #Read command line arguments
     input1_filename = sys.argv[1]
     input1_type = sys.argv[2] or 'sanger'
     input2_filename = sys.argv[3]
     input2_type = sys.argv[4] or 'sanger'
     output_filename = sys.argv[5]
-    
+    fastq_style = sys.argv[6] or 'old'
+    #--
     if input1_type != input2_type:
-        print "WARNING: You are trying to join files of two different types: %s and %s." % ( input1_type, input2_type )
-    
-    input2 = fastqNamedReader( open( input2_filename, 'rb' ), input2_type )
-    joiner = fastqJoiner( input1_type )
-    out = fastqWriter( open( output_filename, 'wb' ), format = input1_type )
-    
+        print "WARNING: trying to join files of different types: %s and %s" % (
+            input1_type, input2_type
+            )
+    if fastq_style == 'new':
+        sep = sniff_sep(input1_filename)
+        joiner = FastqJoiner(input1_type, sep=sep)
+    else:
+        joiner = fq.fastqJoiner(input1_type)
+    #--
+    input2 = fq.fastqNamedReader(open(input2_filename, 'rb'), input2_type)
+    out = fq.fastqWriter(open(output_filename, 'wb'), format=input1_type)
     i = None
     skip_count = 0
-    for i, fastq_read in enumerate( fastqReader( open( input1_filename, 'rb' ), format = input1_type ) ):
-        identifier = joiner.get_paired_identifier( fastq_read )
-        fastq_paired = input2.get( identifier )
+    for i, fastq_read in enumerate(fq.fastqReader(
+        open(input1_filename, 'rb' ), format=input1_type
+        )):
+        identifier = joiner.get_paired_identifier(fastq_read)
+        fastq_paired = input2.get(identifier)
         if fastq_paired is None:
             skip_count += 1
         else:
-            out.write( joiner.join( fastq_read, fastq_paired ) )
+            out.write(joiner.join(fastq_read, fastq_paired))
     out.close()
-    
     if i is None:
-        print "Your file contains no valid FASTQ reads."
+        print "Your file contains no valid FASTQ reads"
     else:
         print input2.has_data()
-        print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 )
+        print 'Joined %s of %s read pairs (%.2f%%)' % (
+            i - skip_count + 1, i + 1, (i - skip_count + 1) / (i + 1) * 100.0
+            )
+
 
 if __name__ == "__main__":
     main()

diff -r af91c734dbb76b8c0e3166c7471370f9ba8260b5 -r e232573b7c629a94ab60e6b615ecedf156913308 tools/fastq/fastq_paired_end_joiner.xml
--- a/tools/fastq/fastq_paired_end_joiner.xml
+++ b/tools/fastq/fastq_paired_end_joiner.xml
@@ -1,9 +1,13 @@
-<tool id="fastq_paired_end_joiner" name="FASTQ joiner" version="1.0.0">
+<tool id="fastq_paired_end_joiner" name="FASTQ joiner" version="2.0.0"><description>on paired end reads</description>
-  <command interpreter="python">fastq_paired_end_joiner.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$output_file'</command>
+  <command interpreter="python">fastq_paired_end_joiner.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$output_file' '$style'</command><inputs><param name="input1_file" type="data" format="fastqsanger,fastqcssanger" label="Left-hand Reads" /><param name="input2_file" type="data" format="fastqsanger,fastqcssanger" label="Right-hand Reads" />
+    <param name="style" type="select" label="FASTQ Header Style">
+      <option value="old" selected="true">old</option>
+      <option value="new">new</option>
+    </param></inputs><outputs><data name="output_file" format="input" />
@@ -18,14 +22,19 @@
   <help>
 **What it does**
 
-This tool joins paired end FASTQ reads from two separate files into a single read in one file. The join is performed using sequence identifiers, allowing the two files to contain differing ordering. If a sequence identifier does not appear in both files, it is excluded from the output.
-
-Sequence identifiers with /1 and /2 appended override the left-hand and right-hand designation; i.e. if the reads end with /1 and /2, the read containing /1 will be used as the left-hand read and the read containing /2 will be used as the right-hand read. Sequences without this designation will follow the left-hand and right-hand settings set by the user.
+This tool joins paired end FASTQ reads from two separate files into a
+single read in one file.  The join is performed using sequence
+identifiers, allowing the two files to contain differing ordering.  If
+a sequence identifier does not appear in both files, it is excluded
+from the output.
 
 -----
 
 **Input formats**
 
+Both old and new (from recent Illumina software) style FASTQ headers
+are supported.  The following example uses the "old" style.
+
 Left-hand Read::
 
     @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
@@ -53,10 +62,26 @@
 
 ------
 
-**Citation**
+**The "new" style**
 
-If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_
+Recent Illumina FASTQ headers are structured as follows::
 
+  @COORDS FLAGS
+  COORDS = INSTRUMENT:RUN_#:FLOWCELL_ID:LANE:TILE:X:Y
+  FLAGS = READ:IS_FILTERED:CONTROL_NUMBER:INDEX_SEQUENCE
 
+where the whitespace character between COORDS and FLAGS can be either
+a space or a tab.
+
+------
+
+**Credits**
+
+This is an extended version (adds support for "new" style FASTQ headers)
+of D. Blankenberg's fastq joiner:
+
+`Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_
+
+New style header support added by Simone Leo <simone.leo@crs4.it>
   </help></tool>

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.

    

commits-noreply＠bitbucket.org

tags

participants (1)