[hg] galaxy 3437: Move FASTA classes in galaxy_utils from fastq.py to fasta.py.
details: http://www.bx.psu.edu/hg/galaxy/rev/aece26463c9d changeset: 3437:aece26463c9d user: Dan Blankenberg <dan@bx.psu.edu> date: Wed Feb 24 11:57:35 2010 -0500 description: Move FASTA classes in galaxy_utils from fastq.py to fasta.py. diffstat: lib/galaxy_utils/sequence/fasta.py | 108 +++++++++++++++++++++++++++++++++++++ lib/galaxy_utils/sequence/fastq.py | 107 ------------------------------------ tools/fastq/fastq_combiner.py | 3 +- tools/fastq/fastq_to_fasta.py | 3 +- 4 files changed, 112 insertions(+), 109 deletions(-) diffs (260 lines): diff -r 66ba0f2d6d32 -r aece26463c9d lib/galaxy_utils/sequence/fasta.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/galaxy_utils/sequence/fasta.py Wed Feb 24 11:57:35 2010 -0500 @@ -0,0 +1,108 @@ +#Dan Blankenberg + +class fastaSequence( object ): + def __init__( self ): + self.identifier = None + self.sequence = '' #holds raw sequence string: no whitespace + def __len__( self ): + return len( self.sequence ) + def __str__( self ): + return "%s\n%s\n" % ( self.identifier, self.sequence ) + +class fastaReader( object ): + def __init__( self, fh ): + self.file = fh + def close( self ): + return self.file.close() + def next( self ): + line = self.file.readline() + #remove header comment lines + while line and line.startswith( '#' ): + line = self.file.readline() + if not line: + raise StopIteration + assert line.startswith( '>' ), "FASTA headers must start with >" + rval = fastaSequence() + rval.identifier = line.strip() + offset = self.file.tell() + while True: + line = self.file.readline() + if not line or line.startswith( '>' ): + if line: + self.file.seek( offset ) #this causes sequence id lines to be read twice, once to determine previous sequence end and again when getting actual sequence; can we cache this to prevent it from being re-read? 
+ return rval + #454 qual test data that was used has decimal scores that don't have trailing spaces + #so we'll need to parse and build these sequences not based upon de facto standards + #i.e. in a less than ideal fashion + line = line.rstrip() + if ' ' in rval.sequence or ' ' in line: + rval.sequence = "%s%s " % ( rval.sequence, line ) + else: + rval.sequence += line + offset = self.file.tell() + def __iter__( self ): + while True: + yield self.next() + +class fastaNamedReader( object ): + def __init__( self, fh ): + self.file = fh + self.reader = fastaReader( self.file ) + self.offset_dict = {} + self.eof = False + def close( self ): + return self.file.close() + def get( self, sequence_id ): + rval = None + if sequence_id in self.offset_dict: + initial_offset = self.file.tell() + seq_offset = self.offset_dict[ sequence_id ].pop( 0 ) + if not self.offset_dict[ sequence_id ]: + del self.offset_dict[ sequence_id ] + self.file.seek( seq_offset ) + rval = self.reader.next() + self.file.seek( initial_offset ) + else: + while True: + offset = self.file.tell() + try: + fasta_seq = self.reader.next() + except StopIteration: + self.eof = True + break #eof, id not found, will return None + if fasta_seq.identifier == sequence_id: + rval = fasta_seq + break + else: + if fasta_seq.identifier not in self.offset_dict: + self.offset_dict[ fasta_seq.identifier ] = [] + self.offset_dict[ fasta_seq.identifier ].append( offset ) + return rval + def has_data( self ): + #returns a string representation of remaining data, or empty string (False) if no data remaining + eof = self.eof + count = 0 + rval = '' + if self.offset_dict: + count = sum( map( len, self.offset_dict.values() ) ) + if not eof: + offset = self.file.tell() + try: + fasta_seq = self.reader.next() + except StopIteration: + eof = True + self.file.seek( offset ) + if count: + rval = "There were %i known sequences not utilized. 
" % count + if not eof: + rval = "%s%s" % ( rval, "An additional unknown number of sequences exist in the input that were not utilized." ) + return rval + +class fastaWriter( object ): + def __init__( self, fh ): + self.file = fh + def write( self, fastq_read ): + #this will include SOLiD adapter base if applicable + self.file.write( ">%s\n%s\n" % ( fastq_read.identifier[1:], fastq_read.sequence ) ) + def close( self ): + return self.file.close() diff -r 66ba0f2d6d32 -r aece26463c9d lib/galaxy_utils/sequence/fastq.py --- a/lib/galaxy_utils/sequence/fastq.py Wed Feb 24 10:42:36 2010 -0500 +++ b/lib/galaxy_utils/sequence/fastq.py Wed Feb 24 11:57:35 2010 -0500 @@ -514,15 +514,6 @@ def close( self ): return self.file.close() -class fastaWriter( object ): - def __init__( self, fh ): - self.file = fh - def write( self, fastq_read ): - #this will include SOLiD adapter base if applicable - self.file.write( ">%s\n%s\n" % ( fastq_read.identifier[1:], fastq_read.sequence ) ) - def close( self ): - return self.file.close() - class fastqJoiner( object ): def __init__( self, format, force_quality_encoding = None ): self.format = format @@ -592,104 +583,6 @@ read2.description += "/2" return read1, read2 -class fastaSequence( ): - def __init__( self ): - self.identifier = None - self.sequence = '' #holds raw sequence string: no whitespace - def __len__( self ): - return len( self.sequence ) - def __str__( self ): - return "%s\n%s\n" % ( self.identifier, self.sequence ) - -class fastaReader( object ): - def __init__( self, fh ): - self.file = fh - def close( self ): - return self.file.close() - def next( self ): - line = self.file.readline() - #remove header comment lines - while line and line.startswith( '#' ): - line = self.file.readline() - if not line: - raise StopIteration - assert line.startswith( '>' ), "FASTA headers must start with >" - rval = fastaSequence() - rval.identifier = line.strip() - offset = self.file.tell() - while True: - line = self.file.readline() - if not 
line or line.startswith( '>' ): - if line: - self.file.seek( offset ) - return rval - #454 qual test data that was used has decimal scores that don't have trailing spaces - #so we'll need to parse and build these sequences not based upon de facto standards - #i.e. in a less than ideal fashion - line = line.rstrip() - if ' ' in rval.sequence or ' ' in line: - rval.sequence = "%s%s " % ( rval.sequence, line ) - else: - rval.sequence += line - offset = self.file.tell() - def __iter__( self ): - while True: - yield self.next() - -class fastaNamedReader( object ): - def __init__( self, fh ): - self.file = fh - self.reader = fastaReader( self.file ) - self.offset_dict = {} - self.eof = False - def close( self ): - return self.file.close() - def get( self, sequence_id ): - rval = None - if sequence_id in self.offset_dict: - initial_offset = self.file.tell() - seq_offset = self.offset_dict[ sequence_id ].pop( 0 ) - if not self.offset_dict[ sequence_id ]: - del self.offset_dict[ sequence_id ] - self.file.seek( seq_offset ) - rval = self.reader.next() - self.file.seek( initial_offset ) - else: - while True: - offset = self.file.tell() - try: - fasta_seq = self.reader.next() - except StopIteration: - self.eof = True - break #eof, id not found, will return None - if fasta_seq.identifier == sequence_id: - rval = fasta_seq - break - else: - if fasta_seq.identifier not in self.offset_dict: - self.offset_dict[ fasta_seq.identifier ] = [] - self.offset_dict[ fasta_seq.identifier ].append( offset ) - return rval - def has_data( self ): - #returns a string representation of remaining data, or empty string (False) if no data remaining - eof = self.eof - count = 0 - rval = '' - if self.offset_dict: - count = sum( map( len, self.offset_dict.values() ) ) - if not eof: - offset = self.file.tell() - try: - fasta_seq = self.reader.next() - except StopIteration: - eof = True - self.file.seek( offset ) - if count: - rval = "There were %i known sequences not utilized. 
" % count - if not eof: - rval = "%s%s" % ( rval, "An additional unknown number of sequences exist in the input that were not utilized." ) - return rval - class fastqCombiner( object ): def __init__( self, format ): self.format = format diff -r 66ba0f2d6d32 -r aece26463c9d tools/fastq/fastq_combiner.py --- a/tools/fastq/fastq_combiner.py Wed Feb 24 10:42:36 2010 -0500 +++ b/tools/fastq/fastq_combiner.py Wed Feb 24 11:57:35 2010 -0500 @@ -1,6 +1,7 @@ #Dan Blankenberg import sys, os, shutil -from galaxy_utils.sequence.fastq import fastqWriter, fastaReader, fastaNamedReader, fastqSequencingRead, fastqCombiner +from galaxy_utils.sequence.fastq import fastqWriter, fastqSequencingRead, fastqCombiner +from galaxy_utils.sequence.fasta import fastaReader, fastaNamedReader def main(): #Read command line arguments diff -r 66ba0f2d6d32 -r aece26463c9d tools/fastq/fastq_to_fasta.py --- a/tools/fastq/fastq_to_fasta.py Wed Feb 24 10:42:36 2010 -0500 +++ b/tools/fastq/fastq_to_fasta.py Wed Feb 24 11:57:35 2010 -0500 @@ -1,6 +1,7 @@ #Dan Blankenberg import sys -from galaxy_utils.sequence.fastq import fastqReader, fastaWriter +from galaxy_utils.sequence.fastq import fastqReader +from galaxy_utils.sequence.fasta import fastaWriter def main(): input_filename = sys.argv[1]
participants (1)
-
Greg Von Kuster