details: http://www.bx.psu.edu/hg/galaxy/rev/61b09dc1dff2
changeset: 3619:61b09dc1dff2
user: rc
date: Wed Apr 07 11:12:00 2010 -0400
description:
sff converter tool
diffstat:
tool_conf.xml.sample | 1 +
tools/filters/sff_extract.py | 1505 +++++++++++++++++++++++++++++++++++++++
tools/filters/sff_extractor.xml | 22 +
3 files changed, 1528 insertions(+), 0 deletions(-)
diffs (1546 lines):
diff -r ebfc9236bf5a -r 61b09dc1dff2 tool_conf.xml.sample
--- a/tool_conf.xml.sample Wed Apr 07 08:44:57 2010 -0400
+++ b/tool_conf.xml.sample Wed Apr 07 11:12:00 2010 -0400
@@ -78,6 +78,7 @@
<tool file="fasta_tools/tabular_to_fasta.xml" />
<tool file="fastx_toolkit/fastq_to_fasta.xml" />
<tool file="filters/wiggle_to_simple.xml" />
+ <tool file="filters/sff_extractor.xml" />
</section>
<section name="Extract Features" id="features">
<tool file="filters/ucsc_gene_bed_to_exon_bed.xml" />
diff -r ebfc9236bf5a -r 61b09dc1dff2 tools/filters/sff_extract.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/sff_extract.py Wed Apr 07 11:12:00 2010 -0400
@@ -0,0 +1,1505 @@
+#!/usr/bin/python
+'''This software extracts the seq, qual and ancillary information from an sff
+file, like the ones used by the 454 sequencer.
+
+Optionally, it can also split paired-end reads if given the linker sequence.
+The splitting is done with maximum match, i.e., every occurrence of the linker
+sequence will be removed, even if it occurs multiple times.'''
+
+#copyright Jose Blanca and Bastien Chevreux
+#COMAV institute, Universidad Politecnica de Valencia (UPV)
+#Valencia, Spain
+
+# additions to handle paired end reads by Bastien Chevreux
+# bugfixes for linker specific lengths: Lionel Guy
+
+#This program is free software: you can redistribute it and/or modify
+#it under the terms of the GNU General Public License as published by
+#the Free Software Foundation, either version 3 of the License, or
+#(at your option) any later version.
+#This program is distributed in the hope that it will be useful,
+#but WITHOUT ANY WARRANTY; without even the implied warranty of
+#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+#GNU General Public License for more details.
+#You should have received a copy of the GNU General Public License
+#along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+__author__ = 'Jose Blanca and Bastien Chevreux'
+__copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux'
+__license__ = 'GPLv3 or later'
+__version__ = '0.2.8'
+__email__ = 'jblanca@btc.upv.es'
+__status__ = 'beta'
+
+import struct
+import sys
+import os
+import subprocess
+import tempfile
+
+
+fake_sff_name = 'fake_sff_name'
+
+
+# readname as key: lines with matches from SSAHA, one best match
+ssahapematches = {}
+# linker readname as key: length of linker sequence
+linkerlengths = {}
+
+# set to true if something really fishy is going on with the sequences
+stern_warning = False
+
+def read_bin_fragment(struct_def, fileh, offset=0, data=None,
+ byte_padding=None):
+ '''It reads a chunk of a binary file.
+
+ You have to provide the struct, a file object, the offset (where to start
+ reading).
+ Also you can provide an optional dict that will be populated with the
+ extracted data.
+ If a byte_padding is given the number of bytes read will be a multiple of
+ that number, adding the required pad at the end.
+    It returns the number of bytes read and the data dict.
+ '''
+ if data is None:
+ data = {}
+
+ #we read each item
+ bytes_read = 0
+ for item in struct_def:
+ #we go to the place and read
+ fileh.seek(offset + bytes_read)
+ n_bytes = struct.calcsize(item[1])
+ buffer = fileh.read(n_bytes)
+ read = struct.unpack('>' + item[1], buffer)
+ if len(read) == 1:
+ read = read[0]
+ data[item[0]] = read
+ bytes_read += n_bytes
+
+ #if there is byte_padding the bytes_to_read should be a multiple of the
+ #byte_padding
+ if byte_padding is not None:
+ pad = byte_padding
+ bytes_read = ((bytes_read + pad - 1) // pad) * pad
+
+ return (bytes_read, data)
+
+
+def check_magic(magic):
+ '''It checks that the magic number of the file matches the sff magic.'''
+ if magic != 779314790:
+        raise RuntimeError('This file does not seem to be an sff file.')
+
+def check_version(version):
+ '''It checks that the version is supported, otherwise it raises an error.'''
+ supported = ('\x00', '\x00', '\x00', '\x01')
+ i = 0
+ for item in version:
+ if version[i] != supported[i]:
+ raise RuntimeError('SFF version not supported. Please contact the author of the software.')
+ i += 1
+
+def read_header(fileh):
+ '''It reads the header from the sff file and returns a dict with the
+ information'''
+ #first we read the first part of the header
+ head_struct = [
+ ('magic_number', 'I'),
+ ('version', 'cccc'),
+ ('index_offset', 'Q'),
+ ('index_length', 'I'),
+ ('number_of_reads', 'I'),
+ ('header_length', 'H'),
+ ('key_length', 'H'),
+ ('number_of_flows_per_read', 'H'),
+ ('flowgram_format_code', 'B'),
+ ]
+ data = {}
+ first_bytes, data = read_bin_fragment(struct_def=head_struct, fileh=fileh,
+ offset=0, data=data)
+ check_magic(data['magic_number'])
+ check_version(data['version'])
+ #now that we know the number_of_flows_per_read and the key_length
+ #we can read the second part of the header
+ struct2 = [
+ ('flow_chars', str(data['number_of_flows_per_read']) + 'c'),
+ ('key_sequence', str(data['key_length']) + 'c')
+ ]
+ read_bin_fragment(struct_def=struct2, fileh=fileh, offset=first_bytes,
+ data=data)
+ return data
+
+
+def read_sequence(header, fileh, fposition):
+ '''It reads one read from the sff file located at the fposition and
+ returns a dict with the information.'''
+ header_length = header['header_length']
+ index_offset = header['index_offset']
+ index_length = header['index_length']
+
+ #the sequence struct
+ read_header_1 = [
+ ('read_header_length', 'H'),
+ ('name_length', 'H'),
+ ('number_of_bases', 'I'),
+ ('clip_qual_left', 'H'),
+ ('clip_qual_right', 'H'),
+ ('clip_adapter_left', 'H'),
+ ('clip_adapter_right', 'H'),
+ ]
+ def read_header_2(name_length):
+ '''It returns the struct definition for the second part of the header'''
+ return [('name', str(name_length) +'c')]
+ def read_data(number_of_bases):
+ '''It returns the struct definition for the read data section.'''
+ #size = {'c': 1, 'B':1, 'H':2, 'I':4, 'Q':8}
+ if header['flowgram_format_code'] == 1:
+ flow_type = 'H'
+ else:
+            raise RuntimeError('file version not supported')
+ number_of_bases = str(number_of_bases)
+ return [
+ ('flowgram_values', str(header['number_of_flows_per_read']) +
+ flow_type),
+ ('flow_index_per_base', number_of_bases + 'B'),
+ ('bases', number_of_bases + 'c'),
+ ('quality_scores', number_of_bases + 'B'),
+ ]
+
+ data = {}
+ #we read the first part of the header
+ bytes_read, data = read_bin_fragment(struct_def=read_header_1,
+ fileh=fileh, offset=fposition, data=data)
+
+ read_bin_fragment(struct_def=read_header_2(data['name_length']),
+ fileh=fileh, offset=fposition + bytes_read, data=data)
+ #we join the letters of the name
+ data['name'] = ''.join(data['name'])
+ offset = data['read_header_length']
+ #we read the sequence and the quality
+ read_data_st = read_data(data['number_of_bases'])
+ bytes_read, data = read_bin_fragment(struct_def=read_data_st,
+ fileh=fileh, offset=fposition + offset,
+ data=data, byte_padding=8)
+ #we join the bases
+ data['bases'] = ''.join(data['bases'])
+
+ #print data
+ #print "pre cqr: ", data['clip_qual_right']
+ #print "pre car: ", data['clip_adapter_right']
+ #print "pre cql: ", data['clip_qual_left']
+ #print "pre cal: ", data['clip_adapter_left']
+
+    # correct for the case where the right clip is <= the left clip;
+    # in that case both clips are set to 0 (right clip == 0 means
+    # "whole sequence")
+ if data['clip_qual_right'] <= data['clip_qual_left'] :
+ data['clip_qual_right'] = 0
+ data['clip_qual_left'] = 0
+ if data['clip_adapter_right'] <= data['clip_adapter_left'] :
+ data['clip_adapter_right'] = 0
+ data['clip_adapter_left'] = 0
+
+    #the clipping section follows the guidelines of the NCBI Trace Archive RFC
+ #http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=rfc&m=doc&s=rfc
+ #if there's no adapter clip: qual -> vector
+ #else: qual-> qual
+ # adapter -> vector
+
+ if not data['clip_adapter_left']:
+ data['clip_adapter_left'], data['clip_qual_left'] = data['clip_qual_left'], data['clip_adapter_left']
+ if not data['clip_adapter_right']:
+ data['clip_adapter_right'], data['clip_qual_right'] = data['clip_qual_right'], data['clip_adapter_right']
+
+ # see whether we have to override the minimum left clips
+ if config['min_leftclip'] > 0:
+ if data['clip_adapter_left'] >0 and data['clip_adapter_left'] < config['min_leftclip']:
+ data['clip_adapter_left'] = config['min_leftclip']
+ if data['clip_qual_left'] >0 and data['clip_qual_left'] < config['min_leftclip']:
+ data['clip_qual_left'] = config['min_leftclip']
+
+
+ #print "post cqr: ", data['clip_qual_right']
+ #print "post car: ", data['clip_adapter_right']
+ #print "post cql: ", data['clip_qual_left']
+ #print "post cal: ", data['clip_adapter_left']
+
+
+ # for handling the -c (clip) option gently, we already clip here
+ # and set all clip points to the sequence end points
+ if config['clip']:
+ data['bases'], data['quality_scores'] = clip_read(data)
+
+ data['number_of_bases']=len(data['bases'])
+ data['clip_qual_right'] = data['number_of_bases']
+ data['clip_adapter_right'] = data['number_of_bases']
+ data['clip_qual_left'] = 0
+ data['clip_adapter_left'] = 0
+
+ return data['read_header_length'] + bytes_read, data
+
+
+def sequences(fileh, header):
+ '''It returns a generator with the data for each read.'''
+ #now we can read all the sequences
+ fposition = header['header_length'] #position in the file
+ reads_read = 0
+ while True:
+ if fposition == header['index_offset']:
+ #we have to skip the index section
+            fposition += header['index_length']
+ continue
+ else:
+ bytes_read, seq_data = read_sequence(header=header, fileh=fileh,
+ fposition=fposition)
+ yield seq_data
+ fposition += bytes_read
+ reads_read += 1
+ if reads_read >= header['number_of_reads']:
+ break
+
+
+def remove_last_xmltag_in_file(fname, tag=None):
+ '''Given an xml file name and a tag, it removes the last tag of the
+ file if it matches the given tag. Tag removal is performed via file
+ truncation.
+
+    If the given tag is not the last one in the file, a RuntimeError will be
+    raised.
+
+    The resulting xml file will not be valid xml. This function is a hack
+    that allows records to be appended to xml files in a quick and dirty way.
+ '''
+
+ fh = open(fname, 'r+')
+ #we have to read from the end to the start of the file and keep the
+ #string enclosed by </ >
+ i = -1
+ last_tag = [] #the chars that form the last tag
+    start_offset = None #at which byte does the last tag start?
+    end_offset = None #at which byte does the last tag end?
+ while True:
+ fh.seek(i, 2)
+ char = fh.read(1)
+ if not char.isspace():
+ last_tag.append(char)
+ if char == '>':
+ end_offset = i
+ if char == '<':
+ start_offset = i
+ break
+ i -= 1
+
+ #we have read the last tag backwards
+ last_tag = ''.join(last_tag[::-1])
+ #we remove the </ and >
+ last_tag = last_tag.rstrip('>').lstrip('</')
+
+ #we check that we're removing the asked tag
+ if tag is not None and tag != last_tag:
+ raise RuntimeError("The given xml tag wasn't the last one in the file")
+
+ # while we are at it: also remove all white spaces in that line :-)
+ i -= 1
+ while True:
+ fh.seek(i, 2)
+ char = fh.read(1)
+ if not char == ' ' and not char == '\t':
+ break;
+ if fh.tell() == 1:
+ break;
+ i -= 1
+
+ fh.truncate();
+
+ fh.close()
+ return last_tag
+
+
+def create_basic_xml_info(readname, fname):
+    '''Formats a number of read-specific pieces of information into XML.
+    Currently formatted: name and the tags set from the command line.
+ '''
+ to_print = [' <trace>\n']
+ to_print.append(' <trace_name>')
+ to_print.append(readname)
+ to_print.append('</trace_name>\n')
+
+ #extra information
+ #do we have extra info for this file?
+ info = None
+ if config['xml_info']:
+ #with this name?
+ if fname in config['xml_info']:
+ info = config['xml_info'][fname]
+ else:
+ #with no name?
+ try:
+ info = config['xml_info'][fake_sff_name]
+ except KeyError:
+ pass
+ #we print the info that we have
+ if info:
+ for key in info:
+ to_print.append(' <' + key + '>' + info[key] + \
+ '</' + key +'>\n')
+
+ return ''.join(to_print)
+
+
+def create_clip_xml_info(readlen, adapl, adapr, quall, qualr):
+    '''Takes the clip values of the read and formats them into XML.
+    Corrects "wrong" values that might have resulted from simplified
+    calculations earlier in the conversion process (especially during
+    splitting of paired-end reads).
+ '''
+
+ to_print = [""]
+
+    # if right borders are >= read length, they don't need
+ # to be printed
+ if adapr >= readlen:
+ adapr = 0
+ if qualr >= readlen:
+ qualr = 0
+
+ # BaCh
+ # when called via split_paired_end(), some values may be < 0
+ # (when clip values were 0 previously)
+ # instead of putting tons of if clauses for different calculations there,
+ # I centralise corrective measure here
+ # set all values <0 to 0
+
+ if adapr < 0:
+ adapr = 0
+ if qualr <0:
+ qualr = 0
+ if adapl < 0:
+ adapl = 0
+ if quall <0:
+ quall = 0
+
+ if quall:
+ to_print.append(' <clip_quality_left>')
+ to_print.append(str(quall))
+ to_print.append('</clip_quality_left>\n')
+ if qualr:
+ to_print.append(' <clip_quality_right>')
+ to_print.append(str(qualr))
+ to_print.append('</clip_quality_right>\n')
+ if adapl:
+ to_print.append(' <clip_vector_left>')
+ to_print.append(str(adapl))
+ to_print.append('</clip_vector_left>\n')
+ if adapr:
+ to_print.append(' <clip_vector_right>')
+ to_print.append(str(adapr))
+ to_print.append('</clip_vector_right>\n')
+ return ''.join(to_print)
+
+
+def create_xml_for_unpaired_read(data, fname):
+ '''Given the data for one read it returns an str with the xml ancillary
+ data.'''
+ to_print = [create_basic_xml_info(data['name'],fname)]
+ #clippings in the XML only if we do not hard clip
+ if not config['clip']:
+ to_print.append(create_clip_xml_info(data['number_of_bases'],data['clip_adapter_left'], data['clip_adapter_right'], data['clip_qual_left'], data['clip_qual_right']));
+ to_print.append(' </trace>\n')
+ return ''.join(to_print)
+
+
+def format_as_fasta(name,seq,qual):
+ name_line = ''.join(('>', name,'\n'))
+ seqstring = ''.join((name_line, seq, '\n'))
+ qual_line = ' '.join([str(q) for q in qual])
+ qualstring = ''.join((name_line, qual_line, '\n'))
+ return seqstring, qualstring
+
+def format_as_fastq(name,seq,qual):
+ qual_line = ''.join([chr(q+33) for q in qual])
+ #seqstring = ''.join(('@', name,'\n', seq, '\n+', name,'\n', qual_line, '\n'))
+ seqstring = ''.join(('@', name,'\n', seq, '\n+\n', qual_line, '\n'))
+ return seqstring
+
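+# Illustrative example (editorial note, not part of the original source):
+# format_as_fastq('read1', 'ACGT', [40, 38, 30, 20]) returns the four-line
+# FASTQ record "@read1\nACGT\n+\nIG?5\n"; qualities are encoded as chr(q + 33).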
+
+def get_read_data(data):
+    '''Given the data for one read it returns the sequence (bases) and the
+    quality scores.'''
+ #seq and qual
+ if config['mix_case']:
+ seq = sequence_case(data)
+ qual = data['quality_scores']
+ else :
+ seq = data['bases']
+ qual = data['quality_scores']
+
+ return seq, qual
+
+def extract_read_info(data, fname):
+ '''Given the data for one read it returns 3 strs with the fasta seq, fasta
+ qual and xml ancillary data.'''
+
+ seq,qual = get_read_data(data)
+ seqstring, qualstring = format_as_fasta(data['name'],seq,qual)
+
+ #name_line = ''.join(('>', data['name'],'\n'))
+ #seq = ''.join((name_line, seq, '\n'))
+ #qual_line = ' '.join([str(q) for q in qual])
+ #qual = ''.join((name_line, qual_line, '\n'))
+
+ xmlstring = create_xml_for_unpaired_read(data, fname)
+
+ return seqstring, qualstring, xmlstring
+
+def write_sequence(name,seq,qual,seq_fh,qual_fh):
+    '''Writes a sequence and its quality into FASTA and FASTA qual filehandles
+    (or into a FASTQ filehandle when no quality filehandle is given).
+    If the sequence length is 0, nothing is written.'''
+
+ if len(seq) == 0 : return
+
+ if qual_fh is None:
+ seq_fh.write(format_as_fastq(name,seq,qual))
+ else:
+ seqstring, qualstring = format_as_fasta(name,seq,qual)
+ seq_fh.write(seqstring)
+ qual_fh.write(qualstring)
+ return
+
+def write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh):
+    '''Writes an unpaired read into FASTA, FASTA qual and XML filehandles
+    (or into FASTQ and XML).
+    If the sequence length is 0, nothing is written.'''
+
+ seq,qual = get_read_data(data)
+ if len(seq) == 0 : return
+
+ write_sequence(data['name'],seq,qual,seq_fh,qual_fh)
+
+ anci = create_xml_for_unpaired_read(data, sff_fh.name)
+ if anci is not None:
+ xml_fh.write(anci)
+ return
+
+
+def reverse_complement(seq):
+ '''Returns the reverse complement of a DNA sequence as string'''
+
+ compdict = {
+ 'a': 't',
+ 'c': 'g',
+ 'g': 'c',
+ 't': 'a',
+ 'u': 't',
+ 'm': 'k',
+ 'r': 'y',
+ 'w': 'w',
+ 's': 's',
+ 'y': 'r',
+ 'k': 'm',
+ 'v': 'b',
+ 'h': 'd',
+ 'd': 'h',
+ 'b': 'v',
+ 'x': 'x',
+ 'n': 'n',
+ 'A': 'T',
+ 'C': 'G',
+ 'G': 'C',
+ 'T': 'A',
+ 'U': 'T',
+ 'M': 'K',
+ 'R': 'Y',
+ 'W': 'W',
+ 'S': 'S',
+ 'Y': 'R',
+ 'K': 'M',
+ 'V': 'B',
+ 'H': 'D',
+ 'D': 'H',
+ 'B': 'V',
+ 'X': 'X',
+ 'N': 'N',
+ '*': '*'
+ }
+
+ complseq = ''.join([compdict[base] for base in seq])
+ # python hack to reverse a list/string/etc
+ complseq = complseq[::-1]
+ return complseq
+
+
+def mask_sequence(seq, maskchar, fpos, tpos):
+    '''Given a sequence, mask it with maskchar starting at fpos (inclusive) and
+    ending at tpos (exclusive).
+ '''
+
+ if len(maskchar) > 1:
+ raise RuntimeError("Internal error: more than one character given to mask_sequence")
+ if fpos<0:
+ fpos = 0
+ if tpos > len(seq):
+ tpos = len(seq)
+
+ newseq = ''.join((seq[:fpos],maskchar*(tpos-fpos), seq[tpos:]))
+
+ return newseq
+
+
+def fragment_sequences(sequence, qualities, splitchar):
+ '''Works like split() on strings, except it does this on a sequence
+ and the corresponding list with quality values.
+    Returns a list with a tuple for each fragment: the fragment sequence as
+    first and the fragment qualities as second element.'''
+
+    # this is slow (due to zip and list appends)... an iterator over the
+    # sequence that finds variations and slices seq and qual would be faster
+
+ if len(sequence) != len(qualities):
+ print sequence, qualities
+ raise RuntimeError("Internal error: length of sequence and qualities don't match???")
+
+ retlist = ([])
+ if len(sequence) == 0:
+ return retlist
+
+ actseq = ([])
+ actqual = ([])
+ if sequence[0] != splitchar:
+ inseq = True
+ else:
+ inseq = False
+ for char,qual in zip(sequence,qualities):
+ if inseq:
+ if char != splitchar:
+ actseq.append(char)
+ actqual.append(qual)
+ else:
+ retlist.append((''.join(actseq), actqual))
+ actseq = ([])
+ actqual = ([])
+ inseq = False
+ else:
+ if char != splitchar:
+ inseq = True
+ actseq.append(char)
+ actqual.append(qual)
+
+ if inseq and len(actseq):
+ retlist.append((''.join(actseq), actqual))
+
+ return retlist
+
+
+def calc_subseq_boundaries(maskedseq, maskchar):
+ '''E.g.:
+ ........xxxxxxxx..........xxxxxxxxxxxxxxxxxxxxx.........
+ to
+ (0,8),(8,16),(16,26),(26,47),(47,56)
+ '''
+
+ blist = ([])
+ if len(maskedseq) == 0:
+ return blist
+
+ inmask = True
+ if maskedseq[0] != maskchar:
+ inmask = False
+
+ start = 0
+ for spos in range(len(maskedseq)):
+ if inmask and maskedseq[spos] != maskchar:
+ blist.append(([start,spos]))
+ start = spos
+ inmask = False
+ elif not inmask and maskedseq[spos] == maskchar:
+ blist.append(([start,spos]))
+ start = spos
+ inmask = True
+
+ blist.append(([start,spos+1]))
+
+ return blist
+
+
+def correct_for_smallhits(maskedseq, maskchar, linkername):
+ '''If partial hits were found, take preventive measure: grow
+ the masked areas by 20 bases in each direction
+ Returns either unchanged "maskedseq" or a new sequence
+ with some more characters masked.
+ '''
+ global linkerlengths
+
+ CEBUG = 0
+
+ if CEBUG : print "correct_for_smallhits"
+ if CEBUG : print "Masked seq\n", maskedseq
+ if CEBUG : print "Linkername: ", linkername
+
+ if len(maskedseq) == 0:
+ return maskedseq
+
+ growl=40
+ growl2=growl/2
+
+ boundaries = calc_subseq_boundaries(maskedseq,maskchar)
+ if CEBUG : print "Boundaries: ", boundaries
+
+ foundpartial = False
+ for bounds in boundaries:
+ if CEBUG : print "\tbounds: ", bounds
+ left, right = bounds
+ if left != 0 and right != len(maskedseq):
+ if maskedseq[left] == maskchar:
+ # allow 10% discrepancy
+ # -linkerlengths[linkername]/10
+ # that's a kind of safety net if there are slight sequencing
+ # errors in the linker itself
+ if right-left < linkerlengths[linkername]-linkerlengths[linkername]/10:
+ if CEBUG : print "\t\tPartial: found " + str(right-left) + " gaps, " + linkername + " is " + str(linkerlengths[linkername]) + " nt long."
+ foundpartial = True
+
+ if not foundpartial:
+ return maskedseq
+
+ # grow
+ newseq = ""
+ for bounds in boundaries:
+ if CEBUG : print "Bounds: ", bounds
+ left, right = bounds
+ if maskedseq[left] == maskchar:
+ newseq += maskedseq[left:right]
+ else:
+ clearstart = 0
+ if left > 0 :
+ clearstart = left+growl2
+ clearstop = len(maskedseq)
+ if right < len(maskedseq):
+ clearstop = right-growl2
+
+ if CEBUG : print "clearstart, clearstop: ",clearstart, clearstop
+
+ if clearstop <= clearstart:
+ newseq += maskchar * (right-left)
+ else:
+ if clearstart != left:
+ newseq += maskchar * growl2
+ newseq += maskedseq[clearstart:clearstop]
+ if clearstop != right:
+ newseq += maskchar * growl2
+
+ #print "newseq\n",newseq
+
+ return newseq
+
+
+def split_paired_end(data, sff_fh, seq_fh, qual_fh, xml_fh):
+ '''Splits a paired end read and writes sequences into FASTA, FASTA qual
+ and XML traceinfo file. Returns the number of sequences created.
+
+ As the linker sequence may be anywhere in the read, including the ends
+    and overlapping with bad quality sequence, we need to perform some
+    computation and possibly set new clip points.
+
+ If the resulting split yields only one sequence (because linker
+ was not present or overlapping with left or right clip), only one
+ sequence will be written with ".fn" appended to the name.
+
+ If the read can be split, two reads will be written. The side left of
+ the linker will be named ".r" and will be written in reverse complement
+    into the file to conform with what virtually all assemblers expect
+    when reading paired-end data: reads in forward direction in the file. The
+    side right of the linker will be named ".f".
+
+    If SSAHA found a partial linker (linker hit shorter than the linker
+    length), the sequences will get a "_pl" in their name and furthermore be
+    cut back thoroughly.
+
+    If SSAHA found multiple occurrences of the linker, the names will get an
+ additional "_mlc" within the name to show that there was "multiple
+ linker contamination".
+
+ For multiple or partial linker, the "good" parts of the reads are
+    stored with a ".part<number>" name; additionally, they will not get
+ template information in the XML
+ '''
+
+ global ssahapematches
+
+ CEBUG = 0
+
+ maskchar = "#"
+
+ if CEBUG : print "Need to split: " + data['name']
+
+ numseqs = 0;
+ readname = data['name']
+ readlen = data['number_of_bases']
+
+ leftclip, rightclip = return_merged_clips(data)
+ seq, qual = get_read_data(data)
+
+ if CEBUG : print "Original read:\n",seq
+
+ maskedseq = seq
+ if leftclip > 0:
+ maskedseq = mask_sequence(maskedseq, maskchar, 0, leftclip-1)
+ if rightclip < len(maskedseq):
+ maskedseq = mask_sequence(maskedseq, maskchar, rightclip, len(maskedseq))
+
+ leftclip, rightclip = return_merged_clips(data)
+ readlen = data['number_of_bases']
+
+ if CEBUG : print "Readname:", readname
+ if CEBUG : print "Readlen:", readlen
+ if CEBUG : print "Num matches:", str(len(ssahapematches[data['name']]))
+ if CEBUG : print "matches:", ssahapematches[data['name']]
+
+ for match in ssahapematches[data['name']]:
+ score = int(match[0])
+ linkername = match[2]
+ leftreadhit = int(match[3])
+ rightreadhit = int(match[4])
+ #leftlinkerhit = int(match[5])
+ #rightlinkerhit = int(match[6])
+ #direction = match[7]
+ #hitlen = int(match[8])
+ #hitidentity = float(match[9])
+
+ if CEBUG : print match
+ if CEBUG : print "Match with score:", score
+ if CEBUG : print "Read before:\n", maskedseq
+ maskedseq = mask_sequence(maskedseq, maskchar, leftreadhit-1, rightreadhit)
+ if CEBUG : print "Masked seq:\n", maskedseq
+
+ correctedseq = correct_for_smallhits(maskedseq, maskchar, linkername)
+
+ if len(maskedseq) != len(correctedseq):
+ raise RuntimeError("Internal error: maskedseq != correctedseq")
+
+ partialhits = False
+ if correctedseq != maskedseq:
+ if CEBUG : print "Partial hits in", readname
+ if CEBUG : print "Original seq:\n", seq
+ if CEBUG : print "Masked seq:\n", maskedseq
+ if CEBUG : print "Corrected seq\n", correctedseq
+ partialhits = True
+ readname += "_pl"
+ maskedseq = correctedseq
+
+ fragments = fragment_sequences(maskedseq, qual, maskchar)
+
+ if CEBUG : print "Fragments (", len(fragments), "): ", fragments
+
+ mlcflag = False
+ #if len(ssahapematches[data['name']]) > 1:
+ # #print "Multi linker contamination"
+ # mlcflag = True
+ # readname += "_mlc"
+
+ if len(fragments) > 2:
+ if CEBUG : print "Multi linker contamination"
+ mlcflag = True
+ readname += "_mlc"
+
+
+ #print fragments
+ if mlcflag or partialhits:
+ fragcounter = 1
+ readname += ".part"
+ for frag in fragments:
+ actseq = frag[0]
+ if len(actseq) >= 20:
+ actqual = frag[1]
+ oname = readname + str(fragcounter)
+ #seq_fh.write(">"+oname+"\n")
+ #seq_fh.write(actseq+"\n")
+ #qual_fh.write(">"+oname+"\n")
+ #qual_fh.write(' '.join((str(q) for q in actqual)))
+ #qual_fh.write("\n")
+ write_sequence(oname,actseq,actqual,seq_fh,qual_fh)
+ to_print = [create_basic_xml_info(oname,sff_fh.name)]
+ # No clipping in XML ... the multiple and partial fragments
+ # are clipped "hard"
+ # No template ID and trace_end: we don't know the
+                # orientation of the fragments. Even if it were
+ # only two, the fact we had multiple linkers
+ # says something went wrong, so simply do not
+ # write any paired-end information for all these fragments
+ to_print.append(' </trace>\n')
+ xml_fh.write(''.join(to_print))
+ numseqs += 1
+ fragcounter += 1
+ else:
+ if len(fragments) >2:
+ raise RuntimeError("Unexpected: more than two fragments detected in " + readname + ". please contact the authors.")
+ # nothing will happen for 0 fragments
+ if len(fragments) == 1:
+ #print "Tada1"
+ boundaries = calc_subseq_boundaries(maskedseq,maskchar)
+ if len(boundaries) < 1 or len(boundaries) >3:
+ raise RuntimeError("Unexpected case: ", str(len(boundaries)), "boundaries for 1 fragment of ", readname)
+ if len(boundaries) == 3:
+ # case: mask char on both sides of sequence
+ #print "bounds3"
+ data['clip_adapter_left']=1+boundaries[0][1]
+ data['clip_adapter_right']=boundaries[2][0]
+ elif len(boundaries) == 2:
+ # case: mask char left or right of sequence
+ #print "bounds2",
+ if maskedseq[0] == maskchar :
+ # case: mask char left
+ #print "left"
+ data['clip_adapter_left']=1+boundaries[0][1]
+ else:
+ # case: mask char right
+ #print "right"
+ data['clip_adapter_right']=boundaries[1][0]
+ data['name'] = data['name'] + ".fn"
+ write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh)
+ numseqs = 1
+ elif len(fragments) == 2:
+ #print "Tada2"
+ oname = readname + ".r"
+ seq, qual = get_read_data(data)
+
+ startsearch = False
+ for spos in range(len(maskedseq)):
+ if maskedseq[spos] != maskchar:
+ startsearch = True;
+ else:
+ if startsearch:
+ break
+
+ #print "\nspos: ", spos
+ lseq=seq[:spos]
+ #print "lseq:", lseq
+ actseq = reverse_complement(lseq)
+ lreadlen = len(actseq)
+ lqual = qual[:spos];
+ # python hack to reverse a list/string/etc
+ lqual = lqual[::-1];
+
+ #seq_fh.write(">"+oname+"\n")
+ #seq_fh.write(actseq+"\n")
+ #qual_fh.write(">"+oname+"\n")
+ #qual_fh.write(' '.join((str(q) for q in lqual)))
+ #qual_fh.write("\n")
+
+ write_sequence(oname,actseq,lqual,seq_fh,qual_fh)
+
+ to_print = [create_basic_xml_info(oname,sff_fh.name)]
+ to_print.append(create_clip_xml_info(lreadlen, 0, lreadlen+1-data['clip_adapter_left'], 0, lreadlen+1-data['clip_qual_left']));
+ to_print.append(' <template_id>')
+ to_print.append(readname)
+ to_print.append('</template_id>\n')
+ to_print.append(' <trace_end>r</trace_end>\n')
+ to_print.append(' </trace>\n')
+ xml_fh.write(''.join(to_print))
+
+ oname = readname + ".f"
+ startsearch = False
+ for spos in range(len(maskedseq)-1,-1,-1):
+ if maskedseq[spos] != maskchar:
+ startsearch = True;
+ else:
+ if startsearch:
+ break
+
+ actseq = seq[spos+1:]
+ actqual = qual[spos+1:];
+
+ #print "\nspos: ", spos
+ #print "rseq:", actseq
+
+ #seq_fh.write(">"+oname+"\n")
+ #seq_fh.write(actseq+"\n")
+ #qual_fh.write(">"+oname+"\n")
+ #qual_fh.write(' '.join((str(q) for q in actqual)))
+ #qual_fh.write("\n")
+ write_sequence(oname,actseq,actqual,seq_fh,qual_fh)
+
+ rreadlen = len(actseq)
+ to_print = [create_basic_xml_info(oname,sff_fh.name)]
+ to_print.append(create_clip_xml_info(rreadlen, 0, rreadlen-(readlen-data['clip_adapter_right']), 0, rreadlen-(readlen-data['clip_qual_right'])));
+ to_print.append(' <template_id>')
+ to_print.append(readname)
+ to_print.append('</template_id>\n')
+ to_print.append(' <trace_end>f</trace_end>\n')
+ to_print.append(' </trace>\n')
+ xml_fh.write(''.join(to_print))
+ numseqs = 2
+
+ return numseqs
+
+
+
+def extract_reads_from_sff(config, sff_files):
+ '''Given the configuration and the list of sff_files it writes the seqs,
+ qualities and ancillary data into the output file(s).
+
+ If file for paired-end linker was given, first extracts all sequences
+ of an SFF and searches these against the linker(s) with SSAHA2 to
+    If a file with paired-end linker sequences was given, it first extracts
+    all sequences of an SFF file and searches them against the linker(s) with
+    SSAHA2 to create the information needed to split reads.
+ global ssahapematches
+
+
+ if len(sff_files) == 0 :
+ raise RuntimeError("No SFF file given?")
+
+ #we go through all input files
+ for sff_file in sff_files:
+ if not os.path.getsize(sff_file):
+ raise RuntimeError('Empty file? : ' + sff_file)
+ fh = open(sff_file, 'r')
+ fh.close()
+
+ openmode = 'w'
+ if config['append']:
+ openmode = 'a'
+
+ seq_fh = open(config['seq_fname'], openmode)
+ xml_fh = open(config['xml_fname'], openmode)
+ if config['want_fastq']:
+ qual_fh = None
+ try:
+ os.remove(config['qual_fname'])
+ except :
+                pass
+ else:
+ qual_fh = open(config['qual_fname'], openmode)
+
+ if not config['append']:
+ xml_fh.write('<?xml version="1.0"?>\n<trace_volume>\n')
+ else:
+ remove_last_xmltag_in_file(config['xml_fname'], "trace_volume")
+
+ #we go through all input files
+ for sff_file in sff_files:
+ #print "Working on '" + sff_file + "':"
+ ssahapematches.clear()
+
+ seqcheckstore = ([])
+
+ debug = 0
+
+ if not debug and config['pelinker_fname']:
+ #print "Creating temporary sequences from reads in '" + sff_file + "' ... ",
+ sys.stdout.flush()
+
+ if 0 :
+ # for debugging
+ pid = os.getpid()
+ tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
+ tmpfasta_fh = open(tmpfasta_fname, 'w')
+ else:
+ tmpfasta_fh = tempfile.NamedTemporaryFile(prefix = 'sffeseqs_',
+ suffix = '.fasta')
+
+ sff_fh = open(sff_file, 'rb')
+ header_data = read_header(fileh=sff_fh)
+ for seq_data in sequences(fileh=sff_fh, header=header_data):
+ seq,qual = get_read_data(seq_data)
+ seqstring, qualstring = format_as_fasta(seq_data['name'],seq,qual)
+ tmpfasta_fh.write(seqstring)
+ #seq, qual, anci = extract_read_info(seq_data, sff_fh.name)
+ #tmpfasta_fh.write(seq)
+ #print "done."
+ tmpfasta_fh.seek(0)
+
+ if 0 :
+ # for debugging
+ tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
+ tmpssaha_fh = open(tmpssaha_fname, 'w+')
+ else:
+ tmpssaha_fh = tempfile.NamedTemporaryFile(prefix = 'sffealig_',
+ suffix = '.ssaha2')
+
+ launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)
+ tmpfasta_fh.close()
+
+ tmpssaha_fh.seek(0)
+ read_ssaha_data(tmpssaha_fh)
+ tmpssaha_fh.close()
+
+ if debug:
+ tmpssaha_fh = open("sffe.tmp.10634.ssaha2", 'r')
+ read_ssaha_data(tmpssaha_fh)
+
+ #print "Converting '" + sff_file + "' ... ",
+ sys.stdout.flush()
+ sff_fh = open(sff_file, 'rb')
+ #read_header(infile)
+ header_data = read_header(fileh=sff_fh)
+
+ #now convert all reads
+ nseqs_sff = 0
+ nseqs_out = 0
+ for seq_data in sequences(fileh=sff_fh, header=header_data):
+ nseqs_sff += 1
+
+ seq, qual = clip_read(seq_data)
+ seqcheckstore.append(seq[0:50])
+
+ #if nseqs_sff >1000:
+ # check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
+ # sys.exit()
+
+ if ssahapematches.has_key(seq_data['name']):
+ #print "Paired end:",seq_data['name']
+ nseqs_out += split_paired_end(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
+ else:
+ #print "Normal:",seq_data['name']
+ if config['pelinker_fname']:
+ seq_data['name'] = seq_data['name'] + ".fn"
+ write_unpaired_read(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
+ nseqs_out += 1
+ #print "done."
+ #print 'Converted', str(nseqs_sff), 'reads into', str(nseqs_out), 'sequences.'
+ sff_fh.close()
+
+ check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
+ seqcheckstore = ([])
+
+ xml_fh.write('</trace_volume>\n')
+
+ xml_fh.close()
+ seq_fh.close()
+ if qual_fh is not None:
+ qual_fh.close()
+
+ return
+
+def check_for_dubious_startseq(seqcheckstore, sffname,seqdata):
+
+ global stern_warning
+
+ foundproblem = ""
+ for checklen in range(1,len(seqcheckstore[0])):
+ foundinloop = False
+ seqdict = {}
+ for seq in seqcheckstore:
+ shortseq = seq[0:checklen]
+ if shortseq in seqdict:
+ seqdict[shortseq] += 1
+ else:
+ seqdict[shortseq] = 1
+
+ for shortseq, count in seqdict.items():
+ if float(count)/len(seqcheckstore) >= 0.5:
+ foundinloop = True
+                stern_warning = True
+ foundproblem = "\n"+"*" * 80
+ foundproblem += "\nWARNING: "
+ foundproblem += "weird sequences in file " + sffname + "\n\n"
+ foundproblem += "After applying left clips, " + str(count) + " sequences (="
+ foundproblem += '%.0f'%(100.0*float(count)/len(seqcheckstore))
+ foundproblem += "%) start with these bases:\n" + shortseq
+ foundproblem += "\n\nThis does not look sane.\n\n"
+ foundproblem += "Countermeasures you *probably* must take:\n"
+ foundproblem += "1) Make your sequence provider aware of that problem and ask whether this can be\n corrected in the SFF.\n"
+ foundproblem += "2) If you decide that this is not normal and your sequence provider does not\n react, use the --min_left_clip of sff_extract.\n"
+ left,right = return_merged_clips(seqdata)
+ foundproblem += " (Probably '--min_left_clip="+ str(left+len(shortseq))+"' but you should cross-check that)\n"
+ foundproblem += "*" * 80 + "\n"
+ if not foundinloop :
+ break
+ if len(foundproblem):
+ print foundproblem
+
+
+def parse_extra_info(info):
+ '''It parses the information that will go in the xml file.
+
+ There are two formats accepted for the extra information:
+ key1:value1, key2:value2
+ or:
+ file1.sff{key1:value1, key2:value2};file2.sff{key3:value3}
+ '''
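+    # Illustrative example (editorial note, not part of the original source):
+    # parse_extra_info('a.sff{center:COMAV, species:melon};b.sff{center:COMAV}')
+    # returns {'a.sff': {'center': 'COMAV', 'species': 'melon'},
+    #          'b.sff': {'center': 'COMAV'}}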
+ if not info:
+ return info
+ finfos = info.split(';') #information for each file
+ data_for_files = {}
+ for finfo in finfos:
+ #we split the file name from the rest
+ items = finfo.split('{')
+ if len(items) == 1:
+ fname = fake_sff_name
+ info = items[0]
+ else:
+ fname = items[0]
+ info = items[1]
+ #now we get each key,value pair in the info
+ info = info.replace('}', '')
+ data = {}
+ for item in info.split(','):
+ key, value = item.strip().split(':')
+ key = key.strip()
+ value = value.strip()
+ data[key] = value
+ data_for_files[fname] = data
+ return data_for_files
+
+
+def return_merged_clips(data):
+ '''It returns the left and right positions to clip.'''
+ def max(a, b):
+ '''It returns the max of the two given numbers.
+
+ It won't take into account the zero values.
+ '''
+ if not a and not b:
+ return None
+ if not a:
+ return b
+ if not b:
+ return a
+ if a >= b:
+ return a
+ else:
+ return b
+ def min(a, b):
+ '''It returns the min of the two given numbers.
+
+ It won't take into account the zero values.
+ '''
+ if not a and not b:
+ return None
+ if not a:
+ return b
+ if not b:
+ return a
+ if a <= b:
+ return a
+ else:
+ return b
+ left = max(data['clip_adapter_left'], data['clip_qual_left'])
+ right = min(data['clip_adapter_right'], data['clip_qual_right'])
+    #maybe both clips were zero
+ if left is None:
+ left = 1
+ if right is None:
+ right = data['number_of_bases']
+ return left, right
+
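+# Illustrative example (editorial note, not part of the original source):
+# with clip_adapter_left=0, clip_qual_left=5, clip_adapter_right=0 and
+# clip_qual_right=100, return_merged_clips() yields (5, 100); zero values
+# mean "no clip" and are ignored by the local max()/min() helpers above.
+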
+def sequence_case(data):
+ '''Given the data for one read it returns the seq with mixed case.
+
+ The regions to be clipped will be lower case and the rest upper case.
+ '''
+ left, right = return_merged_clips(data)
+ seq = data['bases']
+ new_seq = ''.join((seq[:left-1].lower(), seq[left-1:right], seq[right:].lower()))
+ return new_seq
+
+def clip_read(data):
+ '''Given the data for one read it returns clipped seq and qual.'''
+
+ qual = data['quality_scores']
+ left, right = return_merged_clips(data)
+ seq = data['bases']
+ qual = data['quality_scores']
+ new_seq = seq[left-1:right]
+ new_qual = qual[left-1:right]
+
+ return new_seq, new_qual
+
+
+
+def tests_for_ssaha(linker_fname):
+ '''Tests whether SSAHA2 can be successfully called.'''
+
+ try:
+ print "Testing whether SSAHA2 is installed and can be launched ... ",
+ sys.stdout.flush()
+ fh = open('/dev/null', 'w')
+ retcode = subprocess.call(["ssaha2", "-v"], stdout = fh)
+ fh.close()
+ print "ok."
+ except :
+ print "nope? Uh oh ...\n\n"
+ raise RuntimeError('Could not launch ssaha2. Have you installed it? Is it in your path?')
+
+
+def load_linker_sequences(linker_fname):
+ '''Loads all linker sequences into memory, storing only the length
+ of each linker.'''
+
+ global linkerlengths
+
+ if not os.path.getsize(linker_fname):
+ raise RuntimeError("File empty? '" + linker_fname + "'")
+ fh = open(linker_fname, 'r')
+ linkerseqs = read_fasta(fh)
+ if len(linkerseqs) == 0:
+ raise RuntimeError(linker_fname + ": no sequence found?")
+ for i in linkerseqs:
+ if linkerlengths.has_key(i.name):
+ raise RuntimeError(linker_fname + ": sequence '" + i.name + "' present multiple times. Aborting.")
+ linkerlengths[i.name] = len(i.sequence)
+ fh.close()
+
+
+def launch_ssaha(linker_fname, query_fname, output_fh):
+    '''Launches SSAHA2 on the linker and query file, storing the SSAHA2 output
+ into the output filehandle'''
+
+ try:
+ print "Searching linker sequences with SSAHA2 (this may take a while) ... ",
+ sys.stdout.flush()
+ retcode = subprocess.call(["ssaha2", "-output", "ssaha2", "-solexa", "-kmer", "4", "-skip", "1", linker_fname, query_fname], stdout = output_fh)
+ if retcode:
+            raise RuntimeError('Oops.')
+ else:
+ print "ok."
+ except:
+ print "\n"
+        raise RuntimeError('An error occurred during the SSAHA2 execution, aborting.')
+
+def read_ssaha_data(ssahadata_fh):
+    '''Given a file handle, reads the file generated with SSAHA2 (default
+    output format) and stores all matches as lists in the ssahapematches
+    (SSAHA paired-end matches) dictionary'''
+
+ global ssahapematches
+
+ print "Parsing SSAHA2 result file ... ",
+ sys.stdout.flush()
+
+ for line in ssahadata_fh:
+ if line.startswith('ALIGNMENT'):
+ ml = line.split()
+ if len(ml) != 12 :
+ print "\n", line,
+                raise RuntimeError('Expected 12 elements in the SSAHA2 line with ALIGNMENT keyword, but found ' + str(len(ml)))
+ if not ssahapematches.has_key(ml[2]) :
+ ssahapematches[ml[2]] = ([])
+ if ml[8] == 'F':
+ #print line,
+
+ # store everything except the first element (output
+ # format name (ALIGNMENT)) and the last element
+ # (length)
+ ssahapematches[ml[2]].append(ml[1:-1])
+ else:
+ #print ml
+ ml[4],ml[5] = ml[5],ml[4]
+ #print ml
+ ssahapematches[ml[2]].append(ml[1:-1])
+
+ print "done."
+
+
+##########################################################################
+#
+# BaCh: This block was shamelessly copied from
+# http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/
+# and then subsequently modified to read fasta correctly
+# It's still not fool proof, but should be good enough
+#
+##########################################################################
+
+class Fasta:
+ def __init__(self, name, sequence):
+ self.name = name
+ self.sequence = sequence
+
+def read_fasta(file):
+ items = []
+ aninstance = Fasta('', '')
+ linenum = 0
+ for line in file:
+ linenum += 1
+ if line.startswith(">"):
+ if len(aninstance.sequence):
+ items.append(aninstance)
+ aninstance = Fasta('', '')
+ # name == all characters until the first whitespace
+ # (split()[0]) but without the starting ">" ([1:])
+ aninstance.name = line.split()[0][1:]
+ aninstance.sequence = ''
+ if len(aninstance.name) == 0:
+ raise RuntimeError(file.name + ': no name in line ' + str(linenum) + '?')
+
+ else:
+ if len(aninstance.name) == 0:
+ raise RuntimeError(file.name + ': no sequence header at line ' + str(linenum) + '?')
+ aninstance.sequence += line.strip()
+
+ if len(aninstance.name) and len(aninstance.sequence):
+ items.append(aninstance)
+
+ return items
+##########################################################################
+
+def version_string ():
+ return "sff_extract " + __version__
+
+def read_config():
+ '''It reads the configuration options from the command line arguments and
+ it returns a dict with them.'''
+ from optparse import OptionParser, OptionGroup
+ usage = "usage: %prog [options] sff1 sff2 ..."
+ desc = "Extract sequences from 454 SFF files into FASTA, FASTA quality"\
+ " and XML traceinfo format. When a paired-end linker sequence"\
+ " is given (-l), use SSAHA2 to scan the sequences for the linker,"\
+ " then split the sequences, removing the linker."
+ parser = OptionParser(usage = usage, version = version_string(), description = desc)
+ parser.add_option('-a', '--append', action="store_true", dest='append',
+ help='append output to existing files', default=False)
+ parser.add_option('-i', '--xml_info', dest='xml_info',
+ help='extra info to write in the xml file')
+ parser.add_option("-l", "--linker_file", dest="pelinker_fname",
+ help="FASTA file with paired-end linker sequences", metavar="FILE")
+
+    group = OptionGroup(parser, "Sequence processing options","")
+ group.add_option('-c', '--clip', action="store_true", dest='clip',
+ help='clip (completely remove) ends with low qual and/or adaptor sequence', default=False)
+ group.add_option('-u', '--upper_case', action="store_false", dest='mix_case',
+ help='all bases in upper case, including clipped ends', default=True)
+ group.add_option('', '--min_left_clip', dest='min_leftclip',
+ metavar="INTEGER", type = "int",
+ help='if the left clip coming from the SFF is smaller than this value, override it', default=0)
+ group.add_option('-Q', '--fastq', action="store_true", dest='want_fastq',
+ help='store as FASTQ file instead of FASTA + FASTA quality file', default=False)
+ parser.add_option_group(group)
+
+ group = OptionGroup(parser, "File name options","")
+ group.add_option("-o", "--out_basename", dest="basename",
+ help="base name for all output files")
+ group.add_option("-s", "--seq_file", dest="seq_fname",
+ help="output sequence file name", metavar="FILE")
+ group.add_option("-q", "--qual_file", dest="qual_fname",
+ help="output quality file name", metavar="FILE")
+ group.add_option("-x", "--xml_file", dest="xml_fname",
+ help="output ancillary xml file name", metavar="FILE")
+ parser.add_option_group(group)
+
+ #default fnames
+ #is there an sff file?
+ basename = 'reads'
+ if sys.argv[-1][-4:].lower() == '.sff':
+ basename = sys.argv[-1][:-4]
+ def_seq_fname = basename + '.fasta'
+ def_qual_fname = basename + '.fasta.qual'
+ def_xml_fname = basename + '.xml'
+ def_pelinker_fname = ''
+ parser.set_defaults(seq_fname = def_seq_fname)
+ parser.set_defaults(qual_fname = def_qual_fname)
+ parser.set_defaults(xml_fname = def_xml_fname)
+ parser.set_defaults(pelinker_fname = def_pelinker_fname)
+
+ #we parse the cmd line
+ (options, args) = parser.parse_args()
+
+ #we put the result in a dict
+ global config
+ config = {}
+ for property in dir(options):
+ if property[0] == '_' or property in ('ensure_value', 'read_file',
+ 'read_module'):
+ continue
+ config[property] = getattr(options, property)
+
+ if config['basename'] is None:
+ config['basename']=basename
+
+ #if we have not set a file name with -s, -q or -x we set the basename
+ #based file name
+ if config['want_fastq']:
+ config['qual_fname'] = ''
+ if config['seq_fname'] == def_seq_fname:
+ config['seq_fname'] = config['basename'] + '.fastq'
+ else:
+ if config['seq_fname'] == def_seq_fname:
+ config['seq_fname'] = config['basename'] + '.fasta'
+ if config['qual_fname'] == def_qual_fname:
+ config['qual_fname'] = config['basename'] + '.fasta.qual'
+
+ if config['xml_fname'] == def_xml_fname:
+ config['xml_fname'] = config['basename'] + '.xml'
+
+ #we parse the extra info for the xml file
+ config['xml_info'] = parse_extra_info(config['xml_info'])
+ return config, args
+
+
+
+##########################################################################
+
+
+def testsome():
+ sys.exit()
+ return
+
+
+def debug():
+ try:
+ dummy = 1
+ #debug()
+ #testsome()
+
+ config, args = read_config()
+ load_linker_sequences(config['pelinker_fname'])
+
+ #pid = os.getpid()
+ pid = 15603
+
+ #tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
+ #tmpfasta_fh = open(tmpfasta_fname, 'w')
+ tmpfasta_fname = 'FLVI58L05.fa'
+ tmpfasta_fh = open(tmpfasta_fname, 'r')
+
+ tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
+ tmpssaha_fh = open(tmpssaha_fname, 'w')
+
+ launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)
+
+ tmpssaha_fh = open("sffe.tmp.15603.ssaha2", 'r')
+ read_ssaha_data(tmpssaha_fh)
+
+ sys.exit()
+
+ extract_reads_from_sff(config, args)
+
+ except (OSError, IOError, RuntimeError), errval:
+ print errval
+ sys.exit()
+
+ sys.exit()
+
+
+def main():
+
+ argv = sys.argv
+ if len(argv) == 1:
+ sys.argv.append('-h')
+ read_config()
+ sys.exit()
+ try:
+ #debug();
+
+ config, args = read_config()
+
+ if config['pelinker_fname']:
+ #tests_for_ssaha(config['pelinker_fname'])
+ load_linker_sequences(config['pelinker_fname'])
+ if len(args) == 0:
+ raise RuntimeError("No SFF file given?")
+ extract_reads_from_sff(config, args)
+ except (OSError, IOError, RuntimeError), errval:
+ print errval
+ return 1
+
+ if stern_warning:
+ return 1
+
+ return 0
+
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff -r ebfc9236bf5a -r 61b09dc1dff2 tools/filters/sff_extractor.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/sff_extractor.xml Wed Apr 07 11:12:00 2010 -0400
@@ -0,0 +1,22 @@
+<tool id="Sff_extractor" name="SFF converter" version="1.0.0">
+ <description></description>
+ <command interpreter="python">sff_extract.py -s $out_file1 -q $out_file2 -x $out_file3 $input</command>
+ <inputs>
+ <param format="sff" name="input" type="data" label="Extract from this dataset"/>
+ </inputs>
+ <outputs>
+ <data format="fasta" name="out_file1" />
+ <data format="qual" name="out_file2" />
+ <data format="xml" name="out_file3" />
+ </outputs>
+ <help>
+**What it does**
+
+This tool extracts data from the 454 Sequencer SFF format and creates three files containing:
+ Sequences (FASTA),
+ Qualities (QUAL) and
+ Clippings (XML)
+ </help>
+</tool>
+
+
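For reference, a minimal sketch of how the functions added above compose when
sff_extract.py is used as a library rather than through the Galaxy wrapper
(the wrapper simply runs sff_extract.py with the -s/-q/-x output files and the
input SFF dataset). This is not part of the changeset: the file name
'reads.sff' is hypothetical, and the module-level config dict is filled in by
hand here because read_config() normally builds it from the command line.

    import sff_extract

    # read_sequence() consults these config keys; read_config() would set them
    sff_extract.config = {'min_leftclip': 0, 'clip': False, 'mix_case': False}

    sff_fh = open('reads.sff', 'rb')
    header = sff_extract.read_header(fileh=sff_fh)
    print header['number_of_reads'], 'reads in the SFF header'
    for seq_data in sff_extract.sequences(fileh=sff_fh, header=header):
        seq, qual = sff_extract.clip_read(seq_data)  # clipped bases/qualities
        print seq_data['name'], len(seq)
    sff_fh.close()
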
details: http://www.bx.psu.edu/hg/galaxy/rev/ebfc9236bf5a
changeset: 3618:ebfc9236bf5a
user: Anton Nekrutenko <anton@bx.psu.edu>
date: Wed Apr 07 08:44:57 2010 -0400
description:
Updated output format for solid2fastq converter
diffstat:
tools/next_gen_conversion/solid2fastq.xml | 4 ++--
tools/samtools/pileup_parser.xml | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diffs (26 lines):
diff -r 8fa4e8c12dfc -r ebfc9236bf5a tools/next_gen_conversion/solid2fastq.xml
--- a/tools/next_gen_conversion/solid2fastq.xml Tue Apr 06 19:00:39 2010 -0400
+++ b/tools/next_gen_conversion/solid2fastq.xml Wed Apr 07 08:44:57 2010 -0400
@@ -35,8 +35,8 @@
</param>
</inputs>
<outputs>
- <data format="fastqsanger" name="out_file1"/>
- <data format="fastqsanger" name="out_file2">
+ <data format="fastqcssanger" name="out_file1"/>
+ <data format="fastqcssanger" name="out_file2">
<filter>is_run['paired'] == 'yes'</filter>
</data>
</outputs>
diff -r 8fa4e8c12dfc -r ebfc9236bf5a tools/samtools/pileup_parser.xml
--- a/tools/samtools/pileup_parser.xml Tue Apr 06 19:00:39 2010 -0400
+++ b/tools/samtools/pileup_parser.xml Wed Apr 07 08:44:57 2010 -0400
@@ -353,7 +353,7 @@
**Example 3**: Report everything and print total number of differences
-If you set the **Print total number of differences?** to **Yes** the tool will print an additional column with the total number of reads where a devinat base is above the quality threshold cutoff. So, seetiing parametrs like this:
+If you set the **Print total number of differences?** to **Yes** the tool will print an additional column with the total number of reads where a deviant base is above the quality threshold. So, setting parameters like this:
.. image:: ../static/images/pileup_parser_help3.png