[hg] galaxy 3619: sff converter tool
details: http://www.bx.psu.edu/hg/galaxy/rev/61b09dc1dff2 changeset: 3619:61b09dc1dff2 user: rc date: Wed Apr 07 11:12:00 2010 -0400 description: sff converter tool diffstat: tool_conf.xml.sample | 1 + tools/filters/sff_extract.py | 1505 +++++++++++++++++++++++++++++++++++++++ tools/filters/sff_extractor.xml | 22 + 3 files changed, 1528 insertions(+), 0 deletions(-) diffs (1546 lines): diff -r ebfc9236bf5a -r 61b09dc1dff2 tool_conf.xml.sample --- a/tool_conf.xml.sample Wed Apr 07 08:44:57 2010 -0400 +++ b/tool_conf.xml.sample Wed Apr 07 11:12:00 2010 -0400 @@ -78,6 +78,7 @@ <tool file="fasta_tools/tabular_to_fasta.xml" /> <tool file="fastx_toolkit/fastq_to_fasta.xml" /> <tool file="filters/wiggle_to_simple.xml" /> + <tool file="filters/sff_extractor.xml" /> </section> <section name="Extract Features" id="features"> <tool file="filters/ucsc_gene_bed_to_exon_bed.xml" /> diff -r ebfc9236bf5a -r 61b09dc1dff2 tools/filters/sff_extract.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/filters/sff_extract.py Wed Apr 07 11:12:00 2010 -0400 @@ -0,0 +1,1505 @@ +#!/usr/bin/python +'''This software extracts the seq, qual and ancillary information from an sff +file, like the ones used by the 454 sequencer. + +Optionally, it can also split paired-end reads if given the linker sequence. +The splitting is done with maximum match, i.e., every occurence of the linker +sequence will be removed, even if occuring multiple times.''' + +#copyright Jose Blanca and Bastien Chevreux +#COMAV institute, Universidad Politecnica de Valencia (UPV) +#Valencia, Spain + +# additions to handle paired end reads by Bastien Chevreux +# bugfixes for linker specific lengths: Lionel Guy + +#This program is free software: you can redistribute it and/or modify +#it under the terms of the GNU General Public License as published by +#the Free Software Foundation, either version 3 of the License, or +#(at your option) any later version. +#This program is distributed in the hope that it will be useful, +#but WITHOUT ANY WARRANTY; without even the implied warranty of +#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +#GNU General Public License for more details. +#You should have received a copy of the GNU General Public License +#along with this program. If not, see <http://www.gnu.org/licenses/>. + +__author__ = 'Jose Blanca and Bastien Chevreux' +__copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux' +__license__ = 'GPLv3 or later' +__version__ = '0.2.8' +__email__ = 'jblanca@btc.upv.es' +__status__ = 'beta' + +import struct +import sys +import os +import subprocess +import tempfile + + +fake_sff_name = 'fake_sff_name' + + +# readname as key: lines with matches from SSAHA, one best match +ssahapematches = {} +# linker readname as key: length of linker sequence +linkerlengths = {} + +# set to true if something really fishy is going on with the sequences +stern_warning = True + +def read_bin_fragment(struct_def, fileh, offset=0, data=None, + byte_padding=None): + '''It reads a chunk of a binary file. + + You have to provide the struct, a file object, the offset (where to start + reading). + Also you can provide an optional dict that will be populated with the + extracted data. + If a byte_padding is given the number of bytes read will be a multiple of + that number, adding the required pad at the end. + It returns the number of bytes reads and the data dict. 
+ ''' + if data is None: + data = {} + + #we read each item + bytes_read = 0 + for item in struct_def: + #we go to the place and read + fileh.seek(offset + bytes_read) + n_bytes = struct.calcsize(item[1]) + buffer = fileh.read(n_bytes) + read = struct.unpack('>' + item[1], buffer) + if len(read) == 1: + read = read[0] + data[item[0]] = read + bytes_read += n_bytes + + #if there is byte_padding the bytes_to_read should be a multiple of the + #byte_padding + if byte_padding is not None: + pad = byte_padding + bytes_read = ((bytes_read + pad - 1) // pad) * pad + + return (bytes_read, data) + + +def check_magic(magic): + '''It checks that the magic number of the file matches the sff magic.''' + if magic != 779314790: + raise RuntimeError('This file does not seems to be an sff file.') + +def check_version(version): + '''It checks that the version is supported, otherwise it raises an error.''' + supported = ('\x00', '\x00', '\x00', '\x01') + i = 0 + for item in version: + if version[i] != supported[i]: + raise RuntimeError('SFF version not supported. Please contact the author of the software.') + i += 1 + +def read_header(fileh): + '''It reads the header from the sff file and returns a dict with the + information''' + #first we read the first part of the header + head_struct = [ + ('magic_number', 'I'), + ('version', 'cccc'), + ('index_offset', 'Q'), + ('index_length', 'I'), + ('number_of_reads', 'I'), + ('header_length', 'H'), + ('key_length', 'H'), + ('number_of_flows_per_read', 'H'), + ('flowgram_format_code', 'B'), + ] + data = {} + first_bytes, data = read_bin_fragment(struct_def=head_struct, fileh=fileh, + offset=0, data=data) + check_magic(data['magic_number']) + check_version(data['version']) + #now that we know the number_of_flows_per_read and the key_length + #we can read the second part of the header + struct2 = [ + ('flow_chars', str(data['number_of_flows_per_read']) + 'c'), + ('key_sequence', str(data['key_length']) + 'c') + ] + read_bin_fragment(struct_def=struct2, fileh=fileh, offset=first_bytes, + data=data) + return data + + +def read_sequence(header, fileh, fposition): + '''It reads one read from the sff file located at the fposition and + returns a dict with the information.''' + header_length = header['header_length'] + index_offset = header['index_offset'] + index_length = header['index_length'] + + #the sequence struct + read_header_1 = [ + ('read_header_length', 'H'), + ('name_length', 'H'), + ('number_of_bases', 'I'), + ('clip_qual_left', 'H'), + ('clip_qual_right', 'H'), + ('clip_adapter_left', 'H'), + ('clip_adapter_right', 'H'), + ] + def read_header_2(name_length): + '''It returns the struct definition for the second part of the header''' + return [('name', str(name_length) +'c')] + def read_data(number_of_bases): + '''It returns the struct definition for the read data section.''' + #size = {'c': 1, 'B':1, 'H':2, 'I':4, 'Q':8} + if header['flowgram_format_code'] == 1: + flow_type = 'H' + else: + raise Error('file version not supported') + number_of_bases = str(number_of_bases) + return [ + ('flowgram_values', str(header['number_of_flows_per_read']) + + flow_type), + ('flow_index_per_base', number_of_bases + 'B'), + ('bases', number_of_bases + 'c'), + ('quality_scores', number_of_bases + 'B'), + ] + + data = {} + #we read the first part of the header + bytes_read, data = read_bin_fragment(struct_def=read_header_1, + fileh=fileh, offset=fposition, data=data) + + read_bin_fragment(struct_def=read_header_2(data['name_length']), + fileh=fileh, offset=fposition + 
bytes_read, data=data) + #we join the letters of the name + data['name'] = ''.join(data['name']) + offset = data['read_header_length'] + #we read the sequence and the quality + read_data_st = read_data(data['number_of_bases']) + bytes_read, data = read_bin_fragment(struct_def=read_data_st, + fileh=fileh, offset=fposition + offset, + data=data, byte_padding=8) + #we join the bases + data['bases'] = ''.join(data['bases']) + + #print data + #print "pre cqr: ", data['clip_qual_right'] + #print "pre car: ", data['clip_adapter_right'] + #print "pre cql: ", data['clip_qual_left'] + #print "pre cal: ", data['clip_adapter_left'] + + # correct for the case the right clip is <= than the left clip + # in this case, left clip is 0 are set to 0 (right clip == 0 means + # "whole sequence") + if data['clip_qual_right'] <= data['clip_qual_left'] : + data['clip_qual_right'] = 0 + data['clip_qual_left'] = 0 + if data['clip_adapter_right'] <= data['clip_adapter_left'] : + data['clip_adapter_right'] = 0 + data['clip_adapter_left'] = 0 + + #the clipping section follows the NCBI's guidelines Trace Archive RFC + #http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=rfc&m=doc&s=rfc + #if there's no adapter clip: qual -> vector + #else: qual-> qual + # adapter -> vector + + if not data['clip_adapter_left']: + data['clip_adapter_left'], data['clip_qual_left'] = data['clip_qual_left'], data['clip_adapter_left'] + if not data['clip_adapter_right']: + data['clip_adapter_right'], data['clip_qual_right'] = data['clip_qual_right'], data['clip_adapter_right'] + + # see whether we have to override the minimum left clips + if config['min_leftclip'] > 0: + if data['clip_adapter_left'] >0 and data['clip_adapter_left'] < config['min_leftclip']: + data['clip_adapter_left'] = config['min_leftclip'] + if data['clip_qual_left'] >0 and data['clip_qual_left'] < config['min_leftclip']: + data['clip_qual_left'] = config['min_leftclip'] + + + #print "post cqr: ", data['clip_qual_right'] + #print "post car: ", data['clip_adapter_right'] + #print "post cql: ", data['clip_qual_left'] + #print "post cal: ", data['clip_adapter_left'] + + + # for handling the -c (clip) option gently, we already clip here + # and set all clip points to the sequence end points + if config['clip']: + data['bases'], data['quality_scores'] = clip_read(data) + + data['number_of_bases']=len(data['bases']) + data['clip_qual_right'] = data['number_of_bases'] + data['clip_adapter_right'] = data['number_of_bases'] + data['clip_qual_left'] = 0 + data['clip_adapter_left'] = 0 + + return data['read_header_length'] + bytes_read, data + + +def sequences(fileh, header): + '''It returns a generator with the data for each read.''' + #now we can read all the sequences + fposition = header['header_length'] #position in the file + reads_read = 0 + while True: + if fposition == header['index_offset']: + #we have to skip the index section + fposition += index_length + continue + else: + bytes_read, seq_data = read_sequence(header=header, fileh=fileh, + fposition=fposition) + yield seq_data + fposition += bytes_read + reads_read += 1 + if reads_read >= header['number_of_reads']: + break + + +def remove_last_xmltag_in_file(fname, tag=None): + '''Given an xml file name and a tag, it removes the last tag of the + file if it matches the given tag. Tag removal is performed via file + truncation. + + It the given tag is not the last in the file, a RunTimeError will be + raised. + + The resulting xml file will be not xml valid. 
This function is a hack + that allows to append records to xml files in a quick and dirty way. + ''' + + fh = open(fname, 'r+') + #we have to read from the end to the start of the file and keep the + #string enclosed by </ > + i = -1 + last_tag = [] #the chars that form the last tag + start_offset = None #in which byte does the last tag starts? + end_offset = None #in which byte does the last tag ends? + while True: + fh.seek(i, 2) + char = fh.read(1) + if not char.isspace(): + last_tag.append(char) + if char == '>': + end_offset = i + if char == '<': + start_offset = i + break + i -= 1 + + #we have read the last tag backwards + last_tag = ''.join(last_tag[::-1]) + #we remove the </ and > + last_tag = last_tag.rstrip('>').lstrip('</') + + #we check that we're removing the asked tag + if tag is not None and tag != last_tag: + raise RuntimeError("The given xml tag wasn't the last one in the file") + + # while we are at it: also remove all white spaces in that line :-) + i -= 1 + while True: + fh.seek(i, 2) + char = fh.read(1) + if not char == ' ' and not char == '\t': + break; + if fh.tell() == 1: + break; + i -= 1 + + fh.truncate(); + + fh.close() + return last_tag + + +def create_basic_xml_info(readname, fname): + '''Formats a number of read specific infos into XML format. + Currently formated: name and the tags set from command line + ''' + to_print = [' <trace>\n'] + to_print.append(' <trace_name>') + to_print.append(readname) + to_print.append('</trace_name>\n') + + #extra information + #do we have extra info for this file? + info = None + if config['xml_info']: + #with this name? + if fname in config['xml_info']: + info = config['xml_info'][fname] + else: + #with no name? + try: + info = config['xml_info'][fake_sff_name] + except KeyError: + pass + #we print the info that we have + if info: + for key in info: + to_print.append(' <' + key + '>' + info[key] + \ + '</' + key +'>\n') + + return ''.join(to_print) + + +def create_clip_xml_info(readlen, adapl, adapr, quall, qualr): + '''Takes the clip values of the read and formats them into XML + Corrects "wrong" values that might have resulted through + simplified calculations earlier in the process of conversion + (especially during splitting of paired-end reads) + ''' + + to_print = [""] + + # if right borders are >= to read length, they don't need + # to be printed + if adapr >= readlen: + adapr = 0 + if qualr >= readlen: + qualr = 0 + + # BaCh + # when called via split_paired_end(), some values may be < 0 + # (when clip values were 0 previously) + # instead of putting tons of if clauses for different calculations there, + # I centralise corrective measure here + # set all values <0 to 0 + + if adapr < 0: + adapr = 0 + if qualr <0: + qualr = 0 + if adapl < 0: + adapl = 0 + if quall <0: + quall = 0 + + if quall: + to_print.append(' <clip_quality_left>') + to_print.append(str(quall)) + to_print.append('</clip_quality_left>\n') + if qualr: + to_print.append(' <clip_quality_right>') + to_print.append(str(qualr)) + to_print.append('</clip_quality_right>\n') + if adapl: + to_print.append(' <clip_vector_left>') + to_print.append(str(adapl)) + to_print.append('</clip_vector_left>\n') + if adapr: + to_print.append(' <clip_vector_right>') + to_print.append(str(adapr)) + to_print.append('</clip_vector_right>\n') + return ''.join(to_print) + + +def create_xml_for_unpaired_read(data, fname): + '''Given the data for one read it returns an str with the xml ancillary + data.''' + to_print = [create_basic_xml_info(data['name'],fname)] + #clippings in the 
XML only if we do not hard clip + if not config['clip']: + to_print.append(create_clip_xml_info(data['number_of_bases'],data['clip_adapter_left'], data['clip_adapter_right'], data['clip_qual_left'], data['clip_qual_right'])); + to_print.append(' </trace>\n') + return ''.join(to_print) + + +def format_as_fasta(name,seq,qual): + name_line = ''.join(('>', name,'\n')) + seqstring = ''.join((name_line, seq, '\n')) + qual_line = ' '.join([str(q) for q in qual]) + qualstring = ''.join((name_line, qual_line, '\n')) + return seqstring, qualstring + +def format_as_fastq(name,seq,qual): + qual_line = ''.join([chr(q+33) for q in qual]) + #seqstring = ''.join(('@', name,'\n', seq, '\n+', name,'\n', qual_line, '\n')) + seqstring = ''.join(('@', name,'\n', seq, '\n+\n', qual_line, '\n')) + return seqstring + + +def get_read_data(data): + '''Given the data for one read it returns 2 strs with the fasta seq + and fasta qual.''' + #seq and qual + if config['mix_case']: + seq = sequence_case(data) + qual = data['quality_scores'] + else : + seq = data['bases'] + qual = data['quality_scores'] + + return seq, qual + +def extract_read_info(data, fname): + '''Given the data for one read it returns 3 strs with the fasta seq, fasta + qual and xml ancillary data.''' + + seq,qual = get_read_data(data) + seqstring, qualstring = format_as_fasta(data['name'],seq,qual) + + #name_line = ''.join(('>', data['name'],'\n')) + #seq = ''.join((name_line, seq, '\n')) + #qual_line = ' '.join([str(q) for q in qual]) + #qual = ''.join((name_line, qual_line, '\n')) + + xmlstring = create_xml_for_unpaired_read(data, fname) + + return seqstring, qualstring, xmlstring + +def write_sequence(name,seq,qual,seq_fh,qual_fh): + '''Write sequence and quality FASTA and FASTA qual filehandles + (or into FASTQ and XML) + if sequence length is 0, don't write''' + + if len(seq) == 0 : return + + if qual_fh is None: + seq_fh.write(format_as_fastq(name,seq,qual)) + else: + seqstring, qualstring = format_as_fasta(name,seq,qual) + seq_fh.write(seqstring) + qual_fh.write(qualstring) + return + +def write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh): + '''Writes an unpaired read into FASTA, FASTA qual and XML filehandles + (or into FASTQ and XML) + if sequence length is 0, don't write''' + + seq,qual = get_read_data(data) + if len(seq) == 0 : return + + write_sequence(data['name'],seq,qual,seq_fh,qual_fh) + + anci = create_xml_for_unpaired_read(data, sff_fh.name) + if anci is not None: + xml_fh.write(anci) + return + + +def reverse_complement(seq): + '''Returns the reverse complement of a DNA sequence as string''' + + compdict = { + 'a': 't', + 'c': 'g', + 'g': 'c', + 't': 'a', + 'u': 't', + 'm': 'k', + 'r': 'y', + 'w': 'w', + 's': 's', + 'y': 'r', + 'k': 'm', + 'v': 'b', + 'h': 'd', + 'd': 'h', + 'b': 'v', + 'x': 'x', + 'n': 'n', + 'A': 'T', + 'C': 'G', + 'G': 'C', + 'T': 'A', + 'U': 'T', + 'M': 'K', + 'R': 'Y', + 'W': 'W', + 'S': 'S', + 'Y': 'R', + 'K': 'M', + 'V': 'B', + 'H': 'D', + 'D': 'H', + 'B': 'V', + 'X': 'X', + 'N': 'N', + '*': '*' + } + + complseq = ''.join([compdict[base] for base in seq]) + # python hack to reverse a list/string/etc + complseq = complseq[::-1] + return complseq + + +def mask_sequence(seq, maskchar, fpos, tpos): + '''Given a sequence, mask it with maskchar starting at fpos (including) and + ending at tpos (excluding) + ''' + + if len(maskchar) > 1: + raise RuntimeError("Internal error: more than one character given to mask_sequence") + if fpos<0: + fpos = 0 + if tpos > len(seq): + tpos = len(seq) + + newseq = 
''.join((seq[:fpos],maskchar*(tpos-fpos), seq[tpos:])) + + return newseq + + +def fragment_sequences(sequence, qualities, splitchar): + '''Works like split() on strings, except it does this on a sequence + and the corresponding list with quality values. + Returns a tuple for each fragment, each sublist has the fragment + sequence as first and the fragment qualities as second elemnt''' + + # this is slow (due to zip and list appends... use an iterator over + # the sequence find find variations and splices on seq and qual + + if len(sequence) != len(qualities): + print sequence, qualities + raise RuntimeError("Internal error: length of sequence and qualities don't match???") + + retlist = ([]) + if len(sequence) == 0: + return retlist + + actseq = ([]) + actqual = ([]) + if sequence[0] != splitchar: + inseq = True + else: + inseq = False + for char,qual in zip(sequence,qualities): + if inseq: + if char != splitchar: + actseq.append(char) + actqual.append(qual) + else: + retlist.append((''.join(actseq), actqual)) + actseq = ([]) + actqual = ([]) + inseq = False + else: + if char != splitchar: + inseq = True + actseq.append(char) + actqual.append(qual) + + if inseq and len(actseq): + retlist.append((''.join(actseq), actqual)) + + return retlist + + +def calc_subseq_boundaries(maskedseq, maskchar): + '''E.g.: + ........xxxxxxxx..........xxxxxxxxxxxxxxxxxxxxx......... + to + (0,8),(8,16),(16,26),(26,47),(47,56) + ''' + + blist = ([]) + if len(maskedseq) == 0: + return blist + + inmask = True + if maskedseq[0] != maskchar: + inmask = False + + start = 0 + for spos in range(len(maskedseq)): + if inmask and maskedseq[spos] != maskchar: + blist.append(([start,spos])) + start = spos + inmask = False + elif not inmask and maskedseq[spos] == maskchar: + blist.append(([start,spos])) + start = spos + inmask = True + + blist.append(([start,spos+1])) + + return blist + + +def correct_for_smallhits(maskedseq, maskchar, linkername): + '''If partial hits were found, take preventive measure: grow + the masked areas by 20 bases in each direction + Returns either unchanged "maskedseq" or a new sequence + with some more characters masked. + ''' + global linkerlengths + + CEBUG = 0 + + if CEBUG : print "correct_for_smallhits" + if CEBUG : print "Masked seq\n", maskedseq + if CEBUG : print "Linkername: ", linkername + + if len(maskedseq) == 0: + return maskedseq + + growl=40 + growl2=growl/2 + + boundaries = calc_subseq_boundaries(maskedseq,maskchar) + if CEBUG : print "Boundaries: ", boundaries + + foundpartial = False + for bounds in boundaries: + if CEBUG : print "\tbounds: ", bounds + left, right = bounds + if left != 0 and right != len(maskedseq): + if maskedseq[left] == maskchar: + # allow 10% discrepancy + # -linkerlengths[linkername]/10 + # that's a kind of safety net if there are slight sequencing + # errors in the linker itself + if right-left < linkerlengths[linkername]-linkerlengths[linkername]/10: + if CEBUG : print "\t\tPartial: found " + str(right-left) + " gaps, " + linkername + " is " + str(linkerlengths[linkername]) + " nt long." 
+ foundpartial = True + + if not foundpartial: + return maskedseq + + # grow + newseq = "" + for bounds in boundaries: + if CEBUG : print "Bounds: ", bounds + left, right = bounds + if maskedseq[left] == maskchar: + newseq += maskedseq[left:right] + else: + clearstart = 0 + if left > 0 : + clearstart = left+growl2 + clearstop = len(maskedseq) + if right < len(maskedseq): + clearstop = right-growl2 + + if CEBUG : print "clearstart, clearstop: ",clearstart, clearstop + + if clearstop <= clearstart: + newseq += maskchar * (right-left) + else: + if clearstart != left: + newseq += maskchar * growl2 + newseq += maskedseq[clearstart:clearstop] + if clearstop != right: + newseq += maskchar * growl2 + + #print "newseq\n",newseq + + return newseq + + +def split_paired_end(data, sff_fh, seq_fh, qual_fh, xml_fh): + '''Splits a paired end read and writes sequences into FASTA, FASTA qual + and XML traceinfo file. Returns the number of sequences created. + + As the linker sequence may be anywhere in the read, including the ends + and overlapping with bad quality sequence, we need to perform some + computing and eventually set new clip points. + + If the resulting split yields only one sequence (because linker + was not present or overlapping with left or right clip), only one + sequence will be written with ".fn" appended to the name. + + If the read can be split, two reads will be written. The side left of + the linker will be named ".r" and will be written in reverse complement + into the file to conform with what approximately all assemblers expect + when reading paired-end data: reads in forward direction in file. The side + right of the linker will be named ".f" + + If SSAHA found partial linker (linker sequences < length of linker), + the sequences will get a "_pl" furthermore be cut back thoroughly. + + If SSAHA found multiple occurences of the linker, the names will get an + additional "_mlc" within the name to show that there was "multiple + linker contamination". 
+ + For multiple or partial linker, the "good" parts of the reads are + stored with a ".part<number>" name, additionally they will not get + template information in the XML + ''' + + global ssahapematches + + CEBUG = 0 + + maskchar = "#" + + if CEBUG : print "Need to split: " + data['name'] + + numseqs = 0; + readname = data['name'] + readlen = data['number_of_bases'] + + leftclip, rightclip = return_merged_clips(data) + seq, qual = get_read_data(data) + + if CEBUG : print "Original read:\n",seq + + maskedseq = seq + if leftclip > 0: + maskedseq = mask_sequence(maskedseq, maskchar, 0, leftclip-1) + if rightclip < len(maskedseq): + maskedseq = mask_sequence(maskedseq, maskchar, rightclip, len(maskedseq)) + + leftclip, rightclip = return_merged_clips(data) + readlen = data['number_of_bases'] + + if CEBUG : print "Readname:", readname + if CEBUG : print "Readlen:", readlen + if CEBUG : print "Num matches:", str(len(ssahapematches[data['name']])) + if CEBUG : print "matches:", ssahapematches[data['name']] + + for match in ssahapematches[data['name']]: + score = int(match[0]) + linkername = match[2] + leftreadhit = int(match[3]) + rightreadhit = int(match[4]) + #leftlinkerhit = int(match[5]) + #rightlinkerhit = int(match[6]) + #direction = match[7] + #hitlen = int(match[8]) + #hitidentity = float(match[9]) + + if CEBUG : print match + if CEBUG : print "Match with score:", score + if CEBUG : print "Read before:\n", maskedseq + maskedseq = mask_sequence(maskedseq, maskchar, leftreadhit-1, rightreadhit) + if CEBUG : print "Masked seq:\n", maskedseq + + correctedseq = correct_for_smallhits(maskedseq, maskchar, linkername) + + if len(maskedseq) != len(correctedseq): + raise RuntimeError("Internal error: maskedseq != correctedseq") + + partialhits = False + if correctedseq != maskedseq: + if CEBUG : print "Partial hits in", readname + if CEBUG : print "Original seq:\n", seq + if CEBUG : print "Masked seq:\n", maskedseq + if CEBUG : print "Corrected seq\n", correctedseq + partialhits = True + readname += "_pl" + maskedseq = correctedseq + + fragments = fragment_sequences(maskedseq, qual, maskchar) + + if CEBUG : print "Fragments (", len(fragments), "): ", fragments + + mlcflag = False + #if len(ssahapematches[data['name']]) > 1: + # #print "Multi linker contamination" + # mlcflag = True + # readname += "_mlc" + + if len(fragments) > 2: + if CEBUG : print "Multi linker contamination" + mlcflag = True + readname += "_mlc" + + + #print fragments + if mlcflag or partialhits: + fragcounter = 1 + readname += ".part" + for frag in fragments: + actseq = frag[0] + if len(actseq) >= 20: + actqual = frag[1] + oname = readname + str(fragcounter) + #seq_fh.write(">"+oname+"\n") + #seq_fh.write(actseq+"\n") + #qual_fh.write(">"+oname+"\n") + #qual_fh.write(' '.join((str(q) for q in actqual))) + #qual_fh.write("\n") + write_sequence(oname,actseq,actqual,seq_fh,qual_fh) + to_print = [create_basic_xml_info(oname,sff_fh.name)] + # No clipping in XML ... the multiple and partial fragments + # are clipped "hard" + # No template ID and trace_end: we don't know the + # orientation of the frahments. Even if it were + # only two, the fact we had multiple linkers + # says something went wrong, so simply do not + # write any paired-end information for all these fragments + to_print.append(' </trace>\n') + xml_fh.write(''.join(to_print)) + numseqs += 1 + fragcounter += 1 + else: + if len(fragments) >2: + raise RuntimeError("Unexpected: more than two fragments detected in " + readname + ". 
please contact the authors.") + # nothing will happen for 0 fragments + if len(fragments) == 1: + #print "Tada1" + boundaries = calc_subseq_boundaries(maskedseq,maskchar) + if len(boundaries) < 1 or len(boundaries) >3: + raise RuntimeError("Unexpected case: ", str(len(boundaries)), "boundaries for 1 fragment of ", readname) + if len(boundaries) == 3: + # case: mask char on both sides of sequence + #print "bounds3" + data['clip_adapter_left']=1+boundaries[0][1] + data['clip_adapter_right']=boundaries[2][0] + elif len(boundaries) == 2: + # case: mask char left or right of sequence + #print "bounds2", + if maskedseq[0] == maskchar : + # case: mask char left + #print "left" + data['clip_adapter_left']=1+boundaries[0][1] + else: + # case: mask char right + #print "right" + data['clip_adapter_right']=boundaries[1][0] + data['name'] = data['name'] + ".fn" + write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh) + numseqs = 1 + elif len(fragments) == 2: + #print "Tada2" + oname = readname + ".r" + seq, qual = get_read_data(data) + + startsearch = False + for spos in range(len(maskedseq)): + if maskedseq[spos] != maskchar: + startsearch = True; + else: + if startsearch: + break + + #print "\nspos: ", spos + lseq=seq[:spos] + #print "lseq:", lseq + actseq = reverse_complement(lseq) + lreadlen = len(actseq) + lqual = qual[:spos]; + # python hack to reverse a list/string/etc + lqual = lqual[::-1]; + + #seq_fh.write(">"+oname+"\n") + #seq_fh.write(actseq+"\n") + #qual_fh.write(">"+oname+"\n") + #qual_fh.write(' '.join((str(q) for q in lqual))) + #qual_fh.write("\n") + + write_sequence(oname,actseq,lqual,seq_fh,qual_fh) + + to_print = [create_basic_xml_info(oname,sff_fh.name)] + to_print.append(create_clip_xml_info(lreadlen, 0, lreadlen+1-data['clip_adapter_left'], 0, lreadlen+1-data['clip_qual_left'])); + to_print.append(' <template_id>') + to_print.append(readname) + to_print.append('</template_id>\n') + to_print.append(' <trace_end>r</trace_end>\n') + to_print.append(' </trace>\n') + xml_fh.write(''.join(to_print)) + + oname = readname + ".f" + startsearch = False + for spos in range(len(maskedseq)-1,-1,-1): + if maskedseq[spos] != maskchar: + startsearch = True; + else: + if startsearch: + break + + actseq = seq[spos+1:] + actqual = qual[spos+1:]; + + #print "\nspos: ", spos + #print "rseq:", actseq + + #seq_fh.write(">"+oname+"\n") + #seq_fh.write(actseq+"\n") + #qual_fh.write(">"+oname+"\n") + #qual_fh.write(' '.join((str(q) for q in actqual))) + #qual_fh.write("\n") + write_sequence(oname,actseq,actqual,seq_fh,qual_fh) + + rreadlen = len(actseq) + to_print = [create_basic_xml_info(oname,sff_fh.name)] + to_print.append(create_clip_xml_info(rreadlen, 0, rreadlen-(readlen-data['clip_adapter_right']), 0, rreadlen-(readlen-data['clip_qual_right']))); + to_print.append(' <template_id>') + to_print.append(readname) + to_print.append('</template_id>\n') + to_print.append(' <trace_end>f</trace_end>\n') + to_print.append(' </trace>\n') + xml_fh.write(''.join(to_print)) + numseqs = 2 + + return numseqs + + + +def extract_reads_from_sff(config, sff_files): + '''Given the configuration and the list of sff_files it writes the seqs, + qualities and ancillary data into the output file(s). + + If file for paired-end linker was given, first extracts all sequences + of an SFF and searches these against the linker(s) with SSAHA2 to + create needed information to split reads. 
+ ''' + + global ssahapematches + + + if len(sff_files) == 0 : + raise RuntimeError("No SFF file given?") + + #we go through all input files + for sff_file in sff_files: + if not os.path.getsize(sff_file): + raise RuntimeError('Empty file? : ' + sff_file) + fh = open(sff_file, 'r') + fh.close() + + openmode = 'w' + if config['append']: + openmode = 'a' + + seq_fh = open(config['seq_fname'], openmode) + xml_fh = open(config['xml_fname'], openmode) + if config['want_fastq']: + qual_fh = None + try: + os.remove(config['qual_fname']) + except : + python_formattingwithoutbracesisdumb_dummy = 1 + else: + qual_fh = open(config['qual_fname'], openmode) + + if not config['append']: + xml_fh.write('<?xml version="1.0"?>\n<trace_volume>\n') + else: + remove_last_xmltag_in_file(config['xml_fname'], "trace_volume") + + #we go through all input files + for sff_file in sff_files: + #print "Working on '" + sff_file + "':" + ssahapematches.clear() + + seqcheckstore = ([]) + + debug = 0 + + if not debug and config['pelinker_fname']: + #print "Creating temporary sequences from reads in '" + sff_file + "' ... ", + sys.stdout.flush() + + if 0 : + # for debugging + pid = os.getpid() + tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta' + tmpfasta_fh = open(tmpfasta_fname, 'w') + else: + tmpfasta_fh = tempfile.NamedTemporaryFile(prefix = 'sffeseqs_', + suffix = '.fasta') + + sff_fh = open(sff_file, 'rb') + header_data = read_header(fileh=sff_fh) + for seq_data in sequences(fileh=sff_fh, header=header_data): + seq,qual = get_read_data(seq_data) + seqstring, qualstring = format_as_fasta(seq_data['name'],seq,qual) + tmpfasta_fh.write(seqstring) + #seq, qual, anci = extract_read_info(seq_data, sff_fh.name) + #tmpfasta_fh.write(seq) + #print "done." + tmpfasta_fh.seek(0) + + if 0 : + # for debugging + tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2' + tmpssaha_fh = open(tmpssaha_fname, 'w+') + else: + tmpssaha_fh = tempfile.NamedTemporaryFile(prefix = 'sffealig_', + suffix = '.ssaha2') + + launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh) + tmpfasta_fh.close() + + tmpssaha_fh.seek(0) + read_ssaha_data(tmpssaha_fh) + tmpssaha_fh.close() + + if debug: + tmpssaha_fh = open("sffe.tmp.10634.ssaha2", 'r') + read_ssaha_data(tmpssaha_fh) + + #print "Converting '" + sff_file + "' ... ", + sys.stdout.flush() + sff_fh = open(sff_file, 'rb') + #read_header(infile) + header_data = read_header(fileh=sff_fh) + + #now convert all reads + nseqs_sff = 0 + nseqs_out = 0 + for seq_data in sequences(fileh=sff_fh, header=header_data): + nseqs_sff += 1 + + seq, qual = clip_read(seq_data) + seqcheckstore.append(seq[0:50]) + + #if nseqs_sff >1000: + # check_for_dubious_startseq(seqcheckstore,sff_file,seq_data) + # sys.exit() + + if ssahapematches.has_key(seq_data['name']): + #print "Paired end:",seq_data['name'] + nseqs_out += split_paired_end(seq_data, sff_fh, seq_fh, qual_fh, xml_fh) + else: + #print "Normal:",seq_data['name'] + if config['pelinker_fname']: + seq_data['name'] = seq_data['name'] + ".fn" + write_unpaired_read(seq_data, sff_fh, seq_fh, qual_fh, xml_fh) + nseqs_out += 1 + #print "done." + #print 'Converted', str(nseqs_sff), 'reads into', str(nseqs_out), 'sequences.' 
+ sff_fh.close() + + check_for_dubious_startseq(seqcheckstore,sff_file,seq_data) + seqcheckstore = ([]) + + xml_fh.write('</trace_volume>\n') + + xml_fh.close() + seq_fh.close() + if qual_fh is not None: + qual_fh.close() + + return + +def check_for_dubious_startseq(seqcheckstore, sffname,seqdata): + + global stern_warning + + foundproblem = "" + for checklen in range(1,len(seqcheckstore[0])): + foundinloop = False + seqdict = {} + for seq in seqcheckstore: + shortseq = seq[0:checklen] + if shortseq in seqdict: + seqdict[shortseq] += 1 + else: + seqdict[shortseq] = 1 + + for shortseq, count in seqdict.items(): + if float(count)/len(seqcheckstore) >= 0.5: + foundinloop = True + stern_warning + foundproblem = "\n"+"*" * 80 + foundproblem += "\nWARNING: " + foundproblem += "weird sequences in file " + sffname + "\n\n" + foundproblem += "After applying left clips, " + str(count) + " sequences (=" + foundproblem += '%.0f'%(100.0*float(count)/len(seqcheckstore)) + foundproblem += "%) start with these bases:\n" + shortseq + foundproblem += "\n\nThis does not look sane.\n\n" + foundproblem += "Countermeasures you *probably* must take:\n" + foundproblem += "1) Make your sequence provider aware of that problem and ask whether this can be\n corrected in the SFF.\n" + foundproblem += "2) If you decide that this is not normal and your sequence provider does not\n react, use the --min_left_clip of sff_extract.\n" + left,right = return_merged_clips(seqdata) + foundproblem += " (Probably '--min_left_clip="+ str(left+len(shortseq))+"' but you should cross-check that)\n" + foundproblem += "*" * 80 + "\n" + if not foundinloop : + break + if len(foundproblem): + print foundproblem + + +def parse_extra_info(info): + '''It parses the information that will go in the xml file. + + There are two formats accepted for the extra information: + key1:value1, key2:value2 + or: + file1.sff{key1:value1, key2:value2};file2.sff{key3:value3} + ''' + if not info: + return info + finfos = info.split(';') #information for each file + data_for_files = {} + for finfo in finfos: + #we split the file name from the rest + items = finfo.split('{') + if len(items) == 1: + fname = fake_sff_name + info = items[0] + else: + fname = items[0] + info = items[1] + #now we get each key,value pair in the info + info = info.replace('}', '') + data = {} + for item in info.split(','): + key, value = item.strip().split(':') + key = key.strip() + value = value.strip() + data[key] = value + data_for_files[fname] = data + return data_for_files + + +def return_merged_clips(data): + '''It returns the left and right positions to clip.''' + def max(a, b): + '''It returns the max of the two given numbers. + + It won't take into account the zero values. + ''' + if not a and not b: + return None + if not a: + return b + if not b: + return a + if a >= b: + return a + else: + return b + def min(a, b): + '''It returns the min of the two given numbers. + + It won't take into account the zero values. + ''' + if not a and not b: + return None + if not a: + return b + if not b: + return a + if a <= b: + return a + else: + return b + left = max(data['clip_adapter_left'], data['clip_qual_left']) + right = min(data['clip_adapter_right'], data['clip_qual_right']) + #maybe both clips where zero + if left is None: + left = 1 + if right is None: + right = data['number_of_bases'] + return left, right + +def sequence_case(data): + '''Given the data for one read it returns the seq with mixed case. + + The regions to be clipped will be lower case and the rest upper case. 
+ ''' + left, right = return_merged_clips(data) + seq = data['bases'] + new_seq = ''.join((seq[:left-1].lower(), seq[left-1:right], seq[right:].lower())) + return new_seq + +def clip_read(data): + '''Given the data for one read it returns clipped seq and qual.''' + + qual = data['quality_scores'] + left, right = return_merged_clips(data) + seq = data['bases'] + qual = data['quality_scores'] + new_seq = seq[left-1:right] + new_qual = qual[left-1:right] + + return new_seq, new_qual + + + +def tests_for_ssaha(linker_fname): + '''Tests whether SSAHA2 can be successfully called.''' + + try: + print "Testing whether SSAHA2 is installed and can be launched ... ", + sys.stdout.flush() + fh = open('/dev/null', 'w') + retcode = subprocess.call(["ssaha2", "-v"], stdout = fh) + fh.close() + print "ok." + except : + print "nope? Uh oh ...\n\n" + raise RuntimeError('Could not launch ssaha2. Have you installed it? Is it in your path?') + + +def load_linker_sequences(linker_fname): + '''Loads all linker sequences into memory, storing only the length + of each linker.''' + + global linkerlengths + + if not os.path.getsize(linker_fname): + raise RuntimeError("File empty? '" + linker_fname + "'") + fh = open(linker_fname, 'r') + linkerseqs = read_fasta(fh) + if len(linkerseqs) == 0: + raise RuntimeError(linker_fname + ": no sequence found?") + for i in linkerseqs: + if linkerlengths.has_key(i.name): + raise RuntimeError(linker_fname + ": sequence '" + i.name + "' present multiple times. Aborting.") + linkerlengths[i.name] = len(i.sequence) + fh.close() + + +def launch_ssaha(linker_fname, query_fname, output_fh): + '''Launches SSAHA2 on the linker and query file, string SSAHA2 output + into the output filehandle''' + + try: + print "Searching linker sequences with SSAHA2 (this may take a while) ... ", + sys.stdout.flush() + retcode = subprocess.call(["ssaha2", "-output", "ssaha2", "-solexa", "-kmer", "4", "-skip", "1", linker_fname, query_fname], stdout = output_fh) + if retcode: + raise RuntimeError('Ups.') + else: + print "ok." + except: + print "\n" + raise RuntimeError('An error occured during the SSAHA2 execution, aborting.') + +def read_ssaha_data(ssahadata_fh): + '''Given file handle, reads file generated with SSAHA2 (with default + output format) and stores all matches as list ssahapematches + (ssaha paired-end matches) dictionary''' + + global ssahapematches + + print "Parsing SSAHA2 result file ... ", + sys.stdout.flush() + + for line in ssahadata_fh: + if line.startswith('ALIGNMENT'): + ml = line.split() + if len(ml) != 12 : + print "\n", line, + raise RuntimeError('Expected 12 elements in the SSAHA2 line with ALIGMENT keyword, but found ' + str(len(ml))) + if not ssahapematches.has_key(ml[2]) : + ssahapematches[ml[2]] = ([]) + if ml[8] == 'F': + #print line, + + # store everything except the first element (output + # format name (ALIGNMENT)) and the last element + # (length) + ssahapematches[ml[2]].append(ml[1:-1]) + else: + #print ml + ml[4],ml[5] = ml[5],ml[4] + #print ml + ssahapematches[ml[2]].append(ml[1:-1]) + + print "done." 
+ + +########################################################################## +# +# BaCh: This block was shamelessly copied from +# http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/ +# and then subsequently modified to read fasta correctly +# It's still not fool proof, but should be good enough +# +########################################################################## + +class Fasta: + def __init__(self, name, sequence): + self.name = name + self.sequence = sequence + +def read_fasta(file): + items = [] + aninstance = Fasta('', '') + linenum = 0 + for line in file: + linenum += 1 + if line.startswith(">"): + if len(aninstance.sequence): + items.append(aninstance) + aninstance = Fasta('', '') + # name == all characters until the first whitespace + # (split()[0]) but without the starting ">" ([1:]) + aninstance.name = line.split()[0][1:] + aninstance.sequence = '' + if len(aninstance.name) == 0: + raise RuntimeError(file.name + ': no name in line ' + str(linenum) + '?') + + else: + if len(aninstance.name) == 0: + raise RuntimeError(file.name + ': no sequence header at line ' + str(linenum) + '?') + aninstance.sequence += line.strip() + + if len(aninstance.name) and len(aninstance.sequence): + items.append(aninstance) + + return items +########################################################################## + +def version_string (): + return "sff_extract " + __version__ + +def read_config(): + '''It reads the configuration options from the command line arguments and + it returns a dict with them.''' + from optparse import OptionParser, OptionGroup + usage = "usage: %prog [options] sff1 sff2 ..." + desc = "Extract sequences from 454 SFF files into FASTA, FASTA quality"\ + " and XML traceinfo format. When a paired-end linker sequence"\ + " is given (-l), use SSAHA2 to scan the sequences for the linker,"\ + " then split the sequences, removing the linker." 
+ parser = OptionParser(usage = usage, version = version_string(), description = desc) + parser.add_option('-a', '--append', action="store_true", dest='append', + help='append output to existing files', default=False) + parser.add_option('-i', '--xml_info', dest='xml_info', + help='extra info to write in the xml file') + parser.add_option("-l", "--linker_file", dest="pelinker_fname", + help="FASTA file with paired-end linker sequences", metavar="FILE") + + group = OptionGroup(parser, "File name options","") + group.add_option('-c', '--clip', action="store_true", dest='clip', + help='clip (completely remove) ends with low qual and/or adaptor sequence', default=False) + group.add_option('-u', '--upper_case', action="store_false", dest='mix_case', + help='all bases in upper case, including clipped ends', default=True) + group.add_option('', '--min_left_clip', dest='min_leftclip', + metavar="INTEGER", type = "int", + help='if the left clip coming from the SFF is smaller than this value, override it', default=0) + group.add_option('-Q', '--fastq', action="store_true", dest='want_fastq', + help='store as FASTQ file instead of FASTA + FASTA quality file', default=False) + parser.add_option_group(group) + + group = OptionGroup(parser, "File name options","") + group.add_option("-o", "--out_basename", dest="basename", + help="base name for all output files") + group.add_option("-s", "--seq_file", dest="seq_fname", + help="output sequence file name", metavar="FILE") + group.add_option("-q", "--qual_file", dest="qual_fname", + help="output quality file name", metavar="FILE") + group.add_option("-x", "--xml_file", dest="xml_fname", + help="output ancillary xml file name", metavar="FILE") + parser.add_option_group(group) + + #default fnames + #is there an sff file? 
+ basename = 'reads' + if sys.argv[-1][-4:].lower() == '.sff': + basename = sys.argv[-1][:-4] + def_seq_fname = basename + '.fasta' + def_qual_fname = basename + '.fasta.qual' + def_xml_fname = basename + '.xml' + def_pelinker_fname = '' + parser.set_defaults(seq_fname = def_seq_fname) + parser.set_defaults(qual_fname = def_qual_fname) + parser.set_defaults(xml_fname = def_xml_fname) + parser.set_defaults(pelinker_fname = def_pelinker_fname) + + #we parse the cmd line + (options, args) = parser.parse_args() + + #we put the result in a dict + global config + config = {} + for property in dir(options): + if property[0] == '_' or property in ('ensure_value', 'read_file', + 'read_module'): + continue + config[property] = getattr(options, property) + + if config['basename'] is None: + config['basename']=basename + + #if we have not set a file name with -s, -q or -x we set the basename + #based file name + if config['want_fastq']: + config['qual_fname'] = '' + if config['seq_fname'] == def_seq_fname: + config['seq_fname'] = config['basename'] + '.fastq' + else: + if config['seq_fname'] == def_seq_fname: + config['seq_fname'] = config['basename'] + '.fasta' + if config['qual_fname'] == def_qual_fname: + config['qual_fname'] = config['basename'] + '.fasta.qual' + + if config['xml_fname'] == def_xml_fname: + config['xml_fname'] = config['basename'] + '.xml' + + #we parse the extra info for the xml file + config['xml_info'] = parse_extra_info(config['xml_info']) + return config, args + + + +########################################################################## + + +def testsome(): + sys.exit() + return + + +def debug(): + try: + dummy = 1 + #debug() + #testsome() + + config, args = read_config() + load_linker_sequences(config['pelinker_fname']) + + #pid = os.getpid() + pid = 15603 + + #tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta' + #tmpfasta_fh = open(tmpfasta_fname, 'w') + tmpfasta_fname = 'FLVI58L05.fa' + tmpfasta_fh = open(tmpfasta_fname, 'r') + + tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2' + tmpssaha_fh = open(tmpssaha_fname, 'w') + + launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh) + + tmpssaha_fh = open("sffe.tmp.15603.ssaha2", 'r') + read_ssaha_data(tmpssaha_fh) + + sys.exit() + + extract_reads_from_sff(config, args) + + except (OSError, IOError, RuntimeError), errval: + print errval + sys.exit() + + sys.exit() + + +def main(): + + argv = sys.argv + if len(argv) == 1: + sys.argv.append('-h') + read_config() + sys.exit() + try: + #debug(); + + config, args = read_config() + + if config['pelinker_fname']: + #tests_for_ssaha(config['pelinker_fname']) + load_linker_sequences(config['pelinker_fname']) + if len(args) == 0: + raise RuntimeError("No SFF file given?") + extract_reads_from_sff(config, args) + except (OSError, IOError, RuntimeError), errval: + print errval + return 1 + + if stern_warning: + return 1 + + return 0 + + + +if __name__ == "__main__": + sys.exit(main()) diff -r ebfc9236bf5a -r 61b09dc1dff2 tools/filters/sff_extractor.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/filters/sff_extractor.xml Wed Apr 07 11:12:00 2010 -0400 @@ -0,0 +1,22 @@ +<tool id="Sff_extractor" name="SFF converter" version="1.0.0"> + <description></description> + <command interpreter="python">sff_extract.py -s $out_file1 -q $out_file2 -x $out_file3 $input</command> + <inputs> + <param format="sff" name="input" type="data" label="Extract from this dataset"/> + </inputs> + <outputs> + <data format="fasta" name="out_file1" /> + <data format="qual" name="out_file2" /> 
<data format="xml" name="out_file3" /> + </outputs> + <help> +**What it does** + +This tool extracts data from the 454 Sequencer SFF format and creates three files containing: + Sequences (FASTA), + Qualities (QUAL), and + Clippings (XML) + </help> +</tool> + +
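For reference, the &lt;command&gt; tag above reduces the Galaxy wrapper to a single invocation, and the script can be run stand-alone the same way (the file names here are placeholders):

    python sff_extract.py -s reads.fasta -q reads.fasta.qual -x reads.xml input.sff

The core of the converter is the struct-based parsing in read_bin_fragment() and read_header(). The following minimal sketch restates just the common-header parsing for Python 3 (the committed script targets Python 2); the field order, the big-endian '>' prefix, and the magic number 779314790 (the big-endian encoding of '.sff') all come from read_header() above, while the function name and returned dict layout are illustrative:

    import struct
    import sys

    # Fixed 31-byte part of the SFF common header, big-endian, in the
    # order read_header() declares it: magic_number, version (4 bytes),
    # index_offset, index_length, number_of_reads, header_length,
    # key_length, number_of_flows_per_read, flowgram_format_code.
    FIXED = '>I4cQIIHHHB'

    def read_common_header(path):
        with open(path, 'rb') as fh:
            (magic, v0, v1, v2, v3, index_offset, index_length,
             number_of_reads, header_length, key_length,
             number_of_flows, flowgram_format) = struct.unpack(
                 FIXED, fh.read(struct.calcsize(FIXED)))
            if magic != 779314790:
                raise RuntimeError('This file does not seem to be an sff file.')
            # flow_chars and key_sequence are variable-length, so they are
            # read in a second step, exactly as read_header() does.
            flow_chars = fh.read(number_of_flows).decode('ascii')
            key_sequence = fh.read(key_length).decode('ascii')
        return {'number_of_reads': number_of_reads,
                'header_length': header_length,
                'flow_chars': flow_chars,
                'key_sequence': key_sequence}

    if __name__ == '__main__':
        print(read_common_header(sys.argv[1]))

Each read record that follows the header is padded to an 8-byte boundary, which is what the byte_padding argument of read_bin_fragment() rounds up with: ((bytes_read + pad - 1) // pad) * pad.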
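The subtlest part of the conversion is the clipping logic. Each read header carries clip_qual_left/right and clip_adapter_left/right, and return_merged_clips() combines them, treating 0 as "no clip" (per the SFF convention, positions are 1-based and a right clip of 0 means "whole sequence"). A condensed sketch of that rule, with an illustrative function name and a worked example that are not part of the changeset:

    def merged_clips(adapter_left, qual_left, adapter_right, qual_right,
                     number_of_bases):
        # 0 means "no clip", so zeros are skipped rather than compared.
        lefts = [c for c in (adapter_left, qual_left) if c]
        rights = [c for c in (adapter_right, qual_right) if c]
        left = max(lefts) if lefts else 1              # 1-based, inclusive
        right = min(rights) if rights else number_of_bases
        return left, right

    # A 250 bp read with no adapter clip and a quality clip of 5..240:
    # merged_clips(0, 5, 0, 240, 250) -> (5, 240); clip_read() then
    # keeps bases[4:240] of the sequence and quality arrays.

On the output side, format_as_fastq() encodes each quality score as chr(q + 33), i.e. standard Phred+33 FASTQ, while the FASTA branch writes the scores as space-separated integers into the .qual file.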