1 new commit in galaxy-central:

https://bitbucket.org/galaxy/galaxy-central/changeset/9b7d5c1c0be6/
changeset:   9b7d5c1c0be6
user:        jgoecks
date:        2011-11-16 23:53:27
summary:     Refactor data providers to use the get_iterator/process_data framework whenever providing individual data points.
affected #:  1 file

diff -r aeb72f7dc945f5c51270f6757b6ec6b2d5544cba -r 9b7d5c1c0be64f903a928758b256326692fb0f2d lib/galaxy/visualization/tracks/data_providers.py
--- a/lib/galaxy/visualization/tracks/data_providers.py
+++ b/lib/galaxy/visualization/tracks/data_providers.py
@@ -84,6 +84,21 @@
         # Override.
         pass
 
+    def get_iterator( self, chrom, start, end ):
+        """
+        Returns an iterator that provides data in the region chrom:start-end
+        """
+        # Override.
+        pass
+
+    def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
+        """
+        Process data from an iterator to a format that can be provided to client.
+        """
+        # Override.
+        pass
+
+
     def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
         """
         Returns data in region defined by chrom, start, and end. start_val and
@@ -93,8 +108,8 @@
         Return value must be a dictionary with the following attributes:
             dataset_type, data
         """
-        # Override.
-        pass
+        iterator = self.get_iterator( chrom, start, end )
+        return self.process_data( iterator, start_val, max_vals, **kwargs )
 
     def get_filters( self ):
         """
@@ -236,8 +251,6 @@
 
         bgzip_fname = self.dependencies['bgzip'].file_name
 
-        # if os.path.getsize(self.converted_dataset.file_name) == 0:
-        #     return { 'kind': messages.ERROR, 'message': "Tabix converted size was 0, meaning the input file had invalid values." }
         tabix = ctabix.Tabixfile(bgzip_fname, index_filename=self.converted_dataset.file_name)
 
         # If chrom is not found in indexes, try removing the first three
@@ -248,11 +261,7 @@
             chrom = chrom[3:]
 
         return tabix.fetch(reference=chrom, start=start, end=end)
-
-    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
-        iterator = self.get_iterator( chrom, start, end )
-        return self.process_data( iterator, start_val, max_vals, **kwargs )
-
+
     def write_data_to_file( self, chrom, start, end, filename ):
         iterator = self.get_iterator( chrom, start, end )
         out = open( filename, "w" )
@@ -273,11 +282,7 @@
 
     def get_iterator( self, chrom, start, end ):
         raise "Unimplemented Method"
-
-    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
-        iterator = self.get_iterator( chrom, start, end )
-        return self.process_data( iterator, start_val, max_vals, **kwargs )
-
+
     def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
         """
         Provides
@@ -392,14 +397,6 @@
 
     col_name_data_attr_mapping = { 'Qual' : { 'index': 6 , 'name' : 'Qual' } }
 
-
-    def get_iterator( self, chrom, start, end ):
-        raise "Unimplemented Method"
-
-    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
-        iterator = self.get_iterator( chrom, start, end )
-        return self.process_data( iterator, start_val, max_vals, **kwargs )
-
     def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
         """
         Returns a dict with the following attributes:
@@ -607,11 +604,32 @@
 
         # Cleanup.
         bamfile.close()
-
-    def get_data( self, chrom, start, end, start_val=0, max_vals=sys.maxint, **kwargs ):
+
+    def get_iterator( self, chrom, start, end ):
         """
-        Fetch reads in the region and additional metadata.
-
+        Returns an iterator that provides data in the region chrom:start-end
+        """
+        start, end = int(start), int(end)
+        orig_data_filename = self.original_dataset.file_name
+        index_filename = self.converted_dataset.file_name
+
+        # Attempt to open the BAM file with index
+        bamfile = csamtools.Samfile( filename=orig_data_filename, mode='rb', index_filename=index_filename )
+        try:
+            data = bamfile.fetch(start=start, end=end, reference=chrom)
+        except ValueError, e:
+            # Some BAM files do not prefix chromosome names with chr, try without
+            if chrom.startswith( 'chr' ):
+                try:
+                    data = bamfile.fetch( start=start, end=end, reference=chrom[3:] )
+                except ValueError:
+                    return None
+            else:
+                return None
+        return data
+
+    def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
+        """
         Returns a dict with the following attributes:
             data - a list of reads with the format
                    [<guid>, <start>, <end>, <name>, <read_1>, <read_2>]
@@ -628,26 +646,6 @@
             max_high - highest coordinate for the returned reads
             message - error/informative message
         """
-        start, end = int(start), int(end)
-        orig_data_filename = self.original_dataset.file_name
-        index_filename = self.converted_dataset.file_name
-        no_detail = "no_detail" in kwargs
-
-        # Attempt to open the BAM file with index
-        bamfile = csamtools.Samfile( filename=orig_data_filename, mode='rb', index_filename=index_filename )
-        message = None
-        try:
-            data = bamfile.fetch(start=start, end=end, reference=chrom)
-        except ValueError, e:
-            # Some BAM files do not prefix chromosome names with chr, try without
-            if chrom.startswith( 'chr' ):
-                try:
-                    data = bamfile.fetch( start=start, end=end, reference=chrom[3:] )
-                except ValueError:
-                    return None
-            else:
-                return None
-
         # Decode strand from read flag.
         def decode_strand( read_flag, mask ):
             strand_flag = ( read_flag & mask == 0 )
@@ -660,7 +658,8 @@
         results = []
         paired_pending = {}
         unmapped = 0
-        for count, read in enumerate( data ):
+        message = None
+        for count, read in enumerate( iterator ):
             if count < start_val:
                 continue
             if ( count - start_val - unmapped ) >= max_vals:
@@ -720,8 +719,8 @@
 
             results.append( [ "%i_%s" % ( read_start, qname ), read_start, read_end, qname, r1, r2 ] )
 
-        # Clean up.
-        bamfile.close()
+        # Clean up. TODO: is this needed? If so, we'll need a cleanup function after processing the data.
+        # bamfile.close()
 
         max_low, max_high = get_bounds( results, 1, 2 )
 
@@ -848,13 +847,15 @@
             for interval in feature.intervals:
                 out.write(interval.raw_line + '\n')
         out.close()
-
-    def get_data( self, chrom, start, end, start_val=0, max_vals=sys.maxint, **kwargs ):
+
+    def get_iterator( self, chrom, start, end ):
+        """
+        Returns an array with values: (a) source file and (b) an iterator that
+        provides data in the region chrom:start-end
+        """
         start, end = int(start), int(end)
         source = open( self.original_dataset.file_name )
         index = Indexes( self.converted_dataset.file_name )
-        results = []
-        message = None
 
         # If chrom is not found in indexes, try removing the first three
         # characters (e.g. 'chr') and see if that works. This enables the
@@ -862,6 +863,13 @@
         chrom = str(chrom)
         if chrom not in index.indexes and chrom[3:] in index.indexes:
             chrom = chrom[3:]
+
+        return index.find(chrom, start, end)
+
+    def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
+        results = []
+        message = None
+        source = open( self.original_dataset.file_name )
 
         #
         # Build data to return. Payload format is:
@@ -872,7 +880,7 @@
         #
         filter_cols = from_json_string( kwargs.get( "filter_cols", "[]" ) )
         no_detail = ( "no_detail" in kwargs )
-        for count, val in enumerate( index.find(chrom, start, end) ):
+        for count, val in enumerate( iterator ):
             start, end, offset = val[0], val[1], val[2]
             if count < start_val:
                 continue
@@ -899,14 +907,24 @@
 
     NOTE: this data provider does not use indices, and hence will be very slow
     for large datasets.
     """
-    def get_data( self, chrom, start, end, start_val=0, max_vals=sys.maxint, **kwargs ):
+
+    def get_iterator( self, chrom, start, end ):
+        """
+        Returns an iterator that provides data in the region chrom:start-end
+        """
         start, end = int( start ), int( end )
         source = open( self.original_dataset.file_name )
+        return GFFReaderWrapper( source, fix_strand=True )
+
+    def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
+        """
+        Process data from an iterator to a format that can be provided to client.
+        """
         results = []
         message = None
         offset = 0
-        for count, feature in enumerate( GFFReaderWrapper( source, fix_strand=True ) ):
+        for count, feature in enumerate( iterator ):
             if count < start_val:
                 continue
             if count-start_val >= max_vals:

Repository URL: https://bitbucket.org/galaxy/galaxy-central/
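The pattern this commit applies throughout is a template method: DataProvider.get_data() in the base class is now a fixed composition of two overridable hooks, get_iterator() (fetch raw records for chrom:start-end) and process_data() (page through the iterator with start_val/max_vals and build the client payload), so the tabix, BAM, interval-index, and GFF providers implement only the hooks, and write_data_to_file() can reuse the same iterator. Below is a minimal self-contained sketch of that framework in the codebase's Python 2 style; ToyIntervalProvider and its sample data are hypothetical stand-ins, not Galaxy classes.

# Sketch of the get_iterator/process_data framework. DataProvider mirrors the
# base-class shape from the diff; ToyIntervalProvider is a hypothetical example.
class DataProvider( object ):
    def get_iterator( self, chrom, start, end ):
        """ Return an iterator over raw records in chrom:start-end. Override. """
        raise NotImplementedError()

    def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
        """ Convert raw records into the payload sent to the client. Override. """
        raise NotImplementedError()

    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
        # Template method: every provider inherits this composition.
        iterator = self.get_iterator( chrom, start, end )
        return self.process_data( iterator, start_val, max_vals, **kwargs )

class ToyIntervalProvider( DataProvider ):
    def __init__( self, intervals ):
        # intervals: list of ( chrom, start, end ) tuples.
        self.intervals = intervals

    def get_iterator( self, chrom, start, end ):
        # Yield only records overlapping the requested region.
        return ( i for i in self.intervals
                 if i[0] == chrom and i[1] < end and i[2] > start )

    def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
        results = []
        for count, interval in enumerate( iterator ):
            if count < start_val:
                continue
            if max_vals is not None and count - start_val >= max_vals:
                break
            results.append( list( interval ) )
        return { 'dataset_type': 'interval_index', 'data': results }

provider = ToyIntervalProvider( [ ( 'chr1', 100, 200 ), ( 'chr1', 500, 900 ) ] )
print provider.get_data( 'chr1', 0, 600 )   # both intervals overlap chr1:0-600

The split also explains the open TODO in the BAM hunk: once get_iterator() hands the pysam iterator to process_data(), no single method owns the Samfile handle anymore, so closing it cleanly would need a separate cleanup step after the data is processed.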