commit/galaxy-central: 5 new changesets
5 new commits in galaxy-central:

https://bitbucket.org/galaxy/galaxy-central/commits/fcabfb819232/
Changeset:   fcabfb819232
User:        dannon
Date:        2013-08-30 05:06:53
Summary:     Explicitly require source argument to RawBedDataProvider.get_iterator instead of relying on it in kwargs
Affected #:  1 file

diff -r facc879fe0543f25e6b4d65e3e5d5efe716ff455 -r fcabfb81923220651c0dd95181eed24d1b21ac68 lib/galaxy/visualization/data_providers/genome.py
--- a/lib/galaxy/visualization/data_providers/genome.py
+++ b/lib/galaxy/visualization/data_providers/genome.py
@@ -568,7 +568,7 @@
     for large datasets.
     """
-    def get_iterator( self, chrom=None, start=None, end=None, **kwargs ):
+    def get_iterator( self, source, chrom=None, start=None, end=None, **kwargs ):
         # Read first line in order to match chrom naming format.
         line = source.readline()
         dataset_chrom = line.split()[0]

https://bitbucket.org/galaxy/galaxy-central/commits/3314e402ebaf/
Changeset:   3314e402ebaf
User:        dannon
Date:        2013-08-30 05:10:02
Summary:     Add missing import parse_gff_attributes to genome data provider
Affected #:  1 file

diff -r fcabfb81923220651c0dd95181eed24d1b21ac68 -r 3314e402ebaf326d57665615ba0e2e7b83dcc330 lib/galaxy/visualization/data_providers/genome.py
--- a/lib/galaxy/visualization/data_providers/genome.py
+++ b/lib/galaxy/visualization/data_providers/genome.py
@@ -9,7 +9,7 @@
 pkg_resources.require( "pysam" )
 pkg_resources.require( "numpy" )
 import numpy
-from galaxy.datatypes.util.gff_util import GFFReaderWrapper, GFFInterval, GFFFeature, convert_gff_coords_to_bed
+from galaxy.datatypes.util.gff_util import convert_gff_coords_to_bed, GFFFeature, GFFInterval, GFFReaderWrapper, parse_gff_attributes
 from galaxy.util.json import from_json_string
 from bx.interval_index_file import Indexes
 from bx.bbi.bigwig_file import BigWigFile

https://bitbucket.org/galaxy/galaxy-central/commits/4b86e65ee645/
Changeset:   4b86e65ee645
User:        dannon
Date:        2013-08-30 05:14:17
Summary:     Strip unused imports (and trailing whitespace) from genome data provider
Affected #:  1 file

diff -r 3314e402ebaf326d57665615ba0e2e7b83dcc330 -r 4b86e65ee645caa6b5923b05e759a06a9f06113f lib/galaxy/visualization/data_providers/genome.py
--- a/lib/galaxy/visualization/data_providers/genome.py
+++ b/lib/galaxy/visualization/data_providers/genome.py
@@ -3,7 +3,6 @@
 """
 import os, sys, re
-from math import ceil, log
 import pkg_resources
 pkg_resources.require( "bx-python" )
 pkg_resources.require( "pysam" )
@@ -14,7 +13,6 @@
 from bx.interval_index_file import Indexes
 from bx.bbi.bigwig_file import BigWigFile
 from bx.bbi.bigbed_file import BigBedFile
-from galaxy.util.lrucache import LRUCache
 from galaxy.visualization.data_providers.basic import BaseDataProvider
 from galaxy.visualization.data_providers.cigar import get_ref_based_read_seq_and_cigar
 from galaxy.datatypes.interval import Bed, Gff, Gtf
@@ -33,7 +31,7 @@
         return None
     else:
         return float(n)
-
+
 def get_bounds( reads, start_pos_index, end_pos_index ):
     '''
     Returns the minimum and maximum position for a set of reads.
@@ -76,7 +74,7 @@
         line_len = int( textloc_file.readline() )
         file_len = os.path.getsize( self.converted_dataset.file_name )
         query = query.lower()
-
+
         # Find query in file using binary search.
         low = 0
         high = file_len / line_len
@@ -91,42 +89,42 @@
                 low = mid + 1
             else:
                 high = mid
-
+
         position = low * line_len
-
+
         # At right point in file, generate hits.
result = [] while True: line = textloc_file.readline() - if not line.startswith( query ): + if not line.startswith( query ): break - if line[ -1: ] == '\n': + if line[ -1: ] == '\n': line = line[ :-1 ] result.append( line.split()[1:] ) - textloc_file.close() + textloc_file.close() return result - + class GenomeDataProvider( BaseDataProvider ): - """ - Base class for genome data providers. All genome providers use BED coordinate + """ + Base class for genome data providers. All genome providers use BED coordinate format (0-based, half-open coordinates) for both queries and returned data. """ dataset_type = None - - """ + + """ Mapping from column name to payload data; this mapping is used to create - filters. Key is column name, value is a dict with mandatory key 'index' and + filters. Key is column name, value is a dict with mandatory key 'index' and optional key 'name'. E.g. this defines column 4 col_name_data_attr_mapping = {4 : { index: 5, name: 'Score' } } """ col_name_data_attr_mapping = {} - + def __init__( self, converted_dataset=None, original_dataset=None, dependencies=None, error_max_vals="Only the first %i %s in this region are displayed." ): - super( GenomeDataProvider, self ).__init__( converted_dataset=converted_dataset, + super( GenomeDataProvider, self ).__init__( converted_dataset=converted_dataset, original_dataset=original_dataset, dependencies=dependencies, error_max_vals=error_max_vals ) @@ -135,44 +133,44 @@ # queries, such as is necessary for genome-wide data. # TODO: add functions to (a) create data_file and (b) clean up data_file. self.data_file = None - + def write_data_to_file( self, regions, filename ): """ Write data in region defined by chrom, start, and end to a file. """ raise Exception( "Unimplemented Function" ) - + def valid_chroms( self ): """ Returns chroms/contigs that the dataset contains """ return None # by default - + def has_data( self, chrom, start, end, **kwargs ): """ Returns true if dataset has data in the specified genome window, false otherwise. """ raise Exception( "Unimplemented Function" ) - + def get_iterator( self, chrom, start, end, **kwargs ): """ Returns an iterator that provides data in the region chrom:start-end """ raise Exception( "Unimplemented Function" ) - + def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ): """ Process data from an iterator to a format that can be provided to client. """ - raise Exception( "Unimplemented Function" ) - + raise Exception( "Unimplemented Function" ) + def get_data( self, chrom=None, low=None, high=None, start_val=0, max_vals=sys.maxint, **kwargs ): - """ + """ Returns data in region defined by chrom, start, and end. start_val and - max_vals are used to denote the data to return: start_val is the first element to + max_vals are used to denote the data to return: start_val is the first element to return and max_vals indicates the number of values to return. - + Return value must be a dictionary with the following attributes: dataset_type, data """ @@ -204,12 +202,12 @@ 'dataset_type': self.dataset_type } - + def get_filters( self ): - """ - Returns filters for provider's data. Return value is a list of + """ + Returns filters for provider's data. Return value is a list of filters; each filter is a dictionary with the keys 'name', 'index', 'type'. - NOTE: This method uses the original dataset's datatype and metadata to + NOTE: This method uses the original dataset's datatype and metadata to create the filters. """ # Get column names. 
@@ -220,18 +218,18 @@ column_names = range( self.original_dataset.metadata.columns ) except: # Give up return [] - + # Dataset must have column types; if not, cannot create filters. try: column_types = self.original_dataset.metadata.column_types except AttributeError: return [] - + # Create and return filters. filters = [] if self.original_dataset.metadata.viz_filter_cols: for viz_col_index in self.original_dataset.metadata.viz_filter_cols: - # Some columns are optional, so can't assume that a filter + # Some columns are optional, so can't assume that a filter # column is in dataset. if viz_col_index >= len( column_names ): continue; @@ -248,7 +246,7 @@ def get_default_max_vals( self ): return 5000 - + # # -- Base mixins and providers -- # @@ -256,26 +254,26 @@ class FilterableMixin: def get_filters( self ): """ Returns a dataset's filters. """ - + # is_ functions taken from Tabular.set_meta def is_int( column_text ): try: int( column_text ) return True - except: + except: return False def is_float( column_text ): try: float( column_text ) return True - except: + except: if column_text.strip().lower() == 'na': return True #na is special cased to be a float return False - + # # Get filters. - # TODOs: + # TODOs: # (a) might be useful to move this into each datatype's set_meta method; # (b) could look at first N lines to ensure GTF attribute types are consistent. # @@ -284,9 +282,9 @@ filter_col = 8 if isinstance( self.original_dataset.datatype, Gff ): # Can filter by score and GTF attributes. - filters = [ { 'name': 'Score', - 'type': 'number', - 'index': filter_col, + filters = [ { 'name': 'Score', + 'type': 'number', + 'index': filter_col, 'tool_id': 'Filter1', 'tool_exp_name': 'c6' } ] filter_col += 1 @@ -294,10 +292,10 @@ # Create filters based on dataset metadata. for name, a_type in self.original_dataset.metadata.attribute_types.items(): if a_type in [ 'int', 'float' ]: - filters.append( + filters.append( { 'name': name, - 'type': 'number', - 'index': filter_col, + 'type': 'number', + 'index': filter_col, 'tool_id': 'gff_filter_by_attribute', 'tool_exp_name': name } ) filter_col += 1 @@ -324,9 +322,9 @@ ''' elif isinstance( self.original_dataset.datatype, Bed ): # Can filter by score column only. - filters = [ { 'name': 'Score', - 'type': 'number', - 'index': filter_col, + filters = [ { 'name': 'Score', + 'type': 'number', + 'index': filter_col, 'tool_id': 'Filter1', 'tool_exp_name': 'c5' } ] @@ -340,19 +338,19 @@ """ Tabix index data provider for the Galaxy track browser. """ - + col_name_data_attr_mapping = { 4 : { 'index': 4 , 'name' : 'Score' } } - + def get_iterator( self, chrom, start, end, **kwargs ): start, end = int(start), int(end) if end >= (2<<29): end = (2<<29 - 1) # Tabix-enforced maximum - + bgzip_fname = self.dependencies['bgzip'].file_name - + if not self.data_file: self.data_file = ctabix.Tabixfile(bgzip_fname, index_filename=self.converted_dataset.file_name) - + # Get iterator using either naming scheme. iterator = iter( [] ) if chrom in self.data_file.contigs: @@ -365,10 +363,10 @@ return iterator - + def write_data_to_file( self, regions, filename ): out = open( filename, "w" ) - + for region in regions: # Write data in region. chrom = region.chrom @@ -377,7 +375,7 @@ iterator = self.get_iterator( chrom, start, end ) for line in iterator: out.write( "%s\n" % line ) - + out.close() # @@ -389,20 +387,20 @@ """ Processes interval data from native format to payload format. 
- + Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks ] """ - + def get_iterator( self, chrom, start, end, **kwargs ): raise Exception( "Unimplemented Function" ) - + def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ): """ Provides """ # Build data to return. Payload format is: # [ <guid/offset>, <start>, <end>, <name>, <strand> ] - # + # # First three entries are mandatory, others are optional. # filter_cols = from_json_string( kwargs.get( "filter_cols", "[]" ) ) @@ -421,7 +419,7 @@ if max_vals and count-start_val >= max_vals: message = self.error_max_vals % ( max_vals, "features" ) break - + feature = line.split() length = len(feature) # Unique id is just a hash of the line @@ -439,7 +437,7 @@ if not name_col: payload.append( "" ) payload.append( feature[strand_col] ) - # Score (filter data) + # Score (filter data) if length >= 5 and filter_cols and filter_cols[0] == "Score": try: payload.append( float( feature[4] ) ) @@ -467,23 +465,23 @@ class BedDataProvider( GenomeDataProvider ): """ Processes BED data from native format to payload format. - + Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks ] """ dataset_type = 'interval_index' - + def get_iterator( self, chrom, start, end, **kwargs ): raise Exception( "Unimplemented Method" ) - + def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ): """ Provides """ # Build data to return. Payload format is: - # [ <guid/offset>, <start>, <end>, <name>, <strand>, <thick_start>, + # [ <guid/offset>, <start>, <end>, <name>, <strand>, <thick_start>, # <thick_end>, <blocks> ] - # + # # First three entries are mandatory, others are optional. # filter_cols = from_json_string( kwargs.get( "filter_cols", "[]" ) ) @@ -524,10 +522,10 @@ blocks = zip( block_sizes, block_starts ) payload.append( [ ( int(feature[1]) + block[1], int(feature[1]) + block[1] + block[0] ) for block in blocks ] ) - # Score (filter data) + # Score (filter data) if length >= 5 and filter_cols and filter_cols[0] == "Score": - # If dataset doesn't have name/strand/thick start/thick end/blocks, - # add placeholders. There should be 8 entries if all attributes + # If dataset doesn't have name/strand/thick start/thick end/blocks, + # add placeholders. There should be 8 entries if all attributes # are present. payload.extend( [ None for i in range( 8 - len( payload ) ) ] ) @@ -542,7 +540,7 @@ def write_data_to_file( self, regions, filename ): out = open( filename, "w" ) - + for region in regions: # Write data in region. chrom = region.chrom @@ -551,15 +549,15 @@ iterator = self.get_iterator( chrom, start, end ) for line in iterator: out.write( "%s\n" % line ) - + out.close() - + class BedTabixDataProvider( TabixDataProvider, BedDataProvider ): """ Provides data from a BED file indexed via tabix. """ pass - + class RawBedDataProvider( BedDataProvider ): """ Provide data from BED file. @@ -590,7 +588,7 @@ or ( end is not None and feature_end < start ): continue yield line - + return line_filter_iter() # @@ -601,10 +599,10 @@ """ Abstract class that processes VCF data from native format to payload format. - Payload format: An array of entries for each locus in the file. Each array + Payload format: An array of entries for each locus in the file. Each array has the following entries: 1. GUID (unused) - 2. location (0-based) + 2. location (0-based) 3. reference base(s) 4. alternative base(s) 5. 
quality score @@ -613,20 +611,20 @@ denotes the reference genotype 8-end: allele counts for each alternative """ - + col_name_data_attr_mapping = { 'Qual' : { 'index': 6 , 'name' : 'Qual' } } dataset_type = 'variant' - + def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ): """ Returns a dict with the following attributes:: - data - a list of variants with the format + data - a list of variants with the format .. raw:: text - [<guid>, <start>, <end>, <name>, cigar, seq] + [<guid>, <start>, <end>, <name>, cigar, seq] message - error/informative message @@ -636,8 +634,8 @@ def get_mapping( ref, alt ): """ - Returns ( offset, new_seq, cigar ) tuple that defines mapping of - alt to ref. Cigar format is an array of [ op_index, length ] pairs + Returns ( offset, new_seq, cigar ) tuple that defines mapping of + alt to ref. Cigar format is an array of [ op_index, length ] pairs where op_index is the 0-based index into the string "MIDNSHP=X" """ @@ -676,7 +674,7 @@ samples_data = feature [ 9: ] # VCF is 1-based. pos = int( pos ) - 1 - + # FIXME: OK to skip? if alt == '.': count -= 1 @@ -707,7 +705,7 @@ has_alleles = True except ValueError: pass - + # If no alleles, use empty string as proxy. if not has_alleles: genotype = '' @@ -732,7 +730,7 @@ def write_data_to_file( self, regions, filename ): out = open( filename, "w" ) - + for region in regions: # Write data in region. chrom = region.chrom @@ -747,7 +745,7 @@ """ Provides data from a VCF file indexed via tabix. """ - + dataset_type = 'variant' class RawVcfDataProvider( VcfDataProvider ): @@ -797,17 +795,17 @@ for data_line in source: if line_in_region( data_line, chrom, start, end ): yield data_line - + return line_filter_iter() class BamDataProvider( GenomeDataProvider, FilterableMixin ): """ - Provides access to intervals from a sorted indexed BAM file. Coordinate + Provides access to intervals from a sorted indexed BAM file. Coordinate data is reported in BED format: 0-based, half-open. """ dataset_type = 'bai' - + def get_filters( self ): """ Returns filters for dataset. @@ -815,31 +813,31 @@ # HACK: first 7 fields are for drawing, so start filter column index at 7. filter_col = 7 filters = [] - filters.append( { 'name': 'Mapping Quality', - 'type': 'number', + filters.append( { 'name': 'Mapping Quality', + 'type': 'number', 'index': filter_col } ) return filters - - + + def write_data_to_file( self, regions, filename ): """ Write reads in regions to file. """ - + # Open current BAM file using index. bamfile = csamtools.Samfile( filename=self.original_dataset.file_name, mode='rb', \ index_filename=self.converted_dataset.file_name ) # TODO: write headers as well? new_bamfile = csamtools.Samfile( template=bamfile, filename=filename, mode='wb' ) - + for region in regions: # Write data from region. chrom = region.chrom start = region.start end = region.end - + try: data = bamfile.fetch(start=start, end=end, reference=chrom) except ValueError, e: @@ -853,11 +851,11 @@ # Write reads in region. for i, read in enumerate( data ): new_bamfile.write( read ) - + # Cleanup. 
new_bamfile.close() bamfile.close() - + def get_iterator( self, chrom, start, end, **kwargs ): """ Returns an iterator that provides data in the region chrom:start-end @@ -865,7 +863,7 @@ start, end = int( start ), int( end ) orig_data_filename = self.original_dataset.file_name index_filename = self.converted_dataset.file_name - + # Attempt to open the BAM file with index bamfile = csamtools.Samfile( filename=orig_data_filename, mode='rb', index_filename=index_filename ) try: @@ -878,12 +876,12 @@ except ValueError: return None return data - + def process_data( self, iterator, start_val=0, max_vals=None, ref_seq=None, start=0, **kwargs ): """ Returns a dict with the following attributes:: - data - a list of reads with the format + data - a list of reads with the format [<guid>, <start>, <end>, <name>, <read_1>, <read_2>, [empty], <mapq_scores>] where <read_1> has the format @@ -895,10 +893,10 @@ Field 7 is empty so that mapq scores' location matches that in single-end reads. For single-end reads, read has format: [<guid>, <start>, <end>, <name>, <cigar>, <strand>, <seq>, <mapq_score>] - + NOTE: read end and sequence data are not valid for reads outside of requested region and should not be used. - + max_low - lowest coordinate for the returned reads max_high - highest coordinate for the returned reads message - error/informative message @@ -919,7 +917,7 @@ return "+" else: return "-" - + # # Encode reads as list of lists. # @@ -933,13 +931,13 @@ if ( count - start_val - unmapped ) >= max_vals: message = self.error_max_vals % ( max_vals, "reads" ) break - + # If not mapped, skip read. is_mapped = ( read.flag & 0x0004 == 0 ) if not is_mapped: unmapped += 1 continue - + qname = read.qname seq = read.seq strand = decode_strand( read.flag, 0x0010 ) @@ -951,11 +949,11 @@ if read.is_proper_pair: if qname in paired_pending: # one in dict is always first pair = paired_pending[qname] - results.append( [ "%i_%s" % ( pair['start'], qname ), - pair['start'], - read.pos + read_len, - qname, - [ pair['start'], pair['end'], pair['cigar'], pair['strand'], pair['seq'] ], + results.append( [ "%i_%s" % ( pair['start'], qname ), + pair['start'], + read.pos + read_len, + qname, + [ pair['start'], pair['end'], pair['cigar'], pair['strand'], pair['seq'] ], [ read.pos, read.pos + read_len, read.cigar, strand, seq ], None, [ pair['mapq'], read.mapq ] ] ) @@ -964,10 +962,10 @@ paired_pending[qname] = { 'start': read.pos, 'end': read.pos + read_len, 'seq': seq, 'mate_start': read.mpos, 'rlen': read_len, 'strand': strand, 'cigar': read.cigar, 'mapq': read.mapq } else: - results.append( [ "%i_%s" % ( read.pos, qname ), - read.pos, read.pos + read_len, qname, + results.append( [ "%i_%s" % ( read.pos, qname ), + read.pos, read.pos + read_len, qname, read.cigar, strand, read.seq, read.mapq ] ) - + # Take care of reads whose mates are out of range. # TODO: count paired reads when adhering to max_vals? for qname, read in paired_pending.iteritems(): @@ -989,7 +987,7 @@ r2 = [ read['mate_start'], read['mate_start'] ] results.append( [ "%i_%s" % ( read_start, qname ), read_start, read_end, qname, r1, r2, [read[ 'mapq' ], 125] ] ) - + # Clean up. TODO: is this needed? If so, we'll need a cleanup function after processing the data. # bamfile.close() @@ -999,10 +997,10 @@ ''' Process a read using the designated fields. 
''' - read_seq, read_cigar = get_ref_based_read_seq_and_cigar( read[ seq_field ].upper(), - read[ start_field ], - ref_seq, - start, + read_seq, read_cigar = get_ref_based_read_seq_and_cigar( read[ seq_field ].upper(), + read[ start_field ], + ref_seq, + start, read[ cigar_field ] ) read[ seq_field ] = read_seq read[ cigar_field ] = read_cigar @@ -1012,7 +1010,7 @@ Process single-end read. ''' process_read( read, 1, 4, 6) - + def process_pe_read( read ): ''' Process paired-end read. @@ -1034,28 +1032,28 @@ process_se_read( read ) max_low, max_high = get_bounds( results, 1, 2 ) - + return { 'data': results, 'message': message, 'max_low': max_low, 'max_high': max_high } - + class SamDataProvider( BamDataProvider ): dataset_type = 'bai' - + def __init__( self, converted_dataset=None, original_dataset=None, dependencies=None ): """ Create SamDataProvider. """ super( SamDataProvider, self ).__init__( converted_dataset=converted_dataset, original_dataset=original_dataset, dependencies=dependencies ) - - # To use BamDataProvider, original dataset must be BAM and + + # To use BamDataProvider, original dataset must be BAM and # converted dataset must be BAI. Use BAI from BAM metadata. if converted_dataset: self.original_dataset = converted_dataset self.converted_dataset = converted_dataset.metadata.bam_index - + class BBIDataProvider( GenomeDataProvider ): """ - BBI data provider for the Galaxy track browser. + BBI data provider for the Galaxy track browser. """ dataset_type = 'bigwig' @@ -1063,7 +1061,7 @@ def valid_chroms( self ): # No way to return this info as of now return None - + def has_data( self, chrom ): f, bbi = self._get_dataset() all_dat = bbi.query( chrom, 0, 2147483647, 1 ) or \ @@ -1081,18 +1079,18 @@ return bbi.summarize( chrom, start, end, num_points ) or \ bbi.summarize( _convert_between_ucsc_and_ensemble_naming( chrom ) , start, end, num_points ) - # Bigwig can be a standalone bigwig file, in which case we use - # original_dataset, or coming from wig->bigwig conversion in + # Bigwig can be a standalone bigwig file, in which case we use + # original_dataset, or coming from wig->bigwig conversion in # which we use converted_dataset f, bbi = self._get_dataset() - + # If stats requested, compute overall summary data for the range - # start:endbut no reduced data. This is currently used by client + # start:endbut no reduced data. This is currently used by client # to determine the default range. if 'stats' in kwargs: summary = _summarize_bbi( bbi, chrom, start, end, 1 ) f.close() - + min_val = 0 max_val = 0 mean = 0 @@ -1127,12 +1125,12 @@ summary = _summarize_bbi( bbi, chrom, start, end, num_points ) if summary: #mean = summary.sum_data / summary.valid_count - + ## Standard deviation by bin, not yet used ## var = summary.sum_squares - mean ## var /= minimum( valid_count - 1, 1 ) ## sd = sqrt( var ) - + pos = start step_size = (end - start) / num_points @@ -1150,34 +1148,34 @@ num_points = end - start + 1 end += 1 else: - # - # The goal is to sample the region between start and end uniformly - # using ~N (num_samples) data points. The challenge is that the size of - # sampled intervals rarely is full bases, so sampling using N points - # will leave the end of the region unsampled due to remainders for - # each interval. To recitify this, a new N is calculated based on the + # + # The goal is to sample the region between start and end uniformly + # using ~N (num_samples) data points. 
The challenge is that the size of + # sampled intervals rarely is full bases, so sampling using N points + # will leave the end of the region unsampled due to remainders for + # each interval. To recitify this, a new N is calculated based on the # step size that covers as much of the region as possible. # - # However, this still leaves some of the region unsampled. This - # could be addressed by repeatedly sampling remainder using a - # smaller and smaller step_size, but that would require iteratively + # However, this still leaves some of the region unsampled. This + # could be addressed by repeatedly sampling remainder using a + # smaller and smaller step_size, but that would require iteratively # going to BBI, which could be time consuming. # # Start with N samples. num_points = num_samples step_size = ( end - start ) / num_points - # Add additional points to sample in the remainder not covered by + # Add additional points to sample in the remainder not covered by # the initial N samples. remainder_start = start + step_size * num_points additional_points = ( end - remainder_start ) / step_size num_points += additional_points - + result = summarize_region( bbi, chrom, start, end, num_points ) - + # Cleanup and return. f.close() - return { + return { 'data': result, 'dataset_type': self.dataset_type } @@ -1190,7 +1188,7 @@ class BigWigDataProvider ( BBIDataProvider ): """ - Provides data from BigWig files; position data is reported in 1-based + Provides data from BigWig files; position data is reported in 1-based coordinate system, i.e. wiggle format. """ def _get_dataset( self ): @@ -1199,7 +1197,7 @@ else: f = open( self.original_dataset.file_name ) return f, BigWigFile(file=f) - + class IntervalIndexDataProvider( FilterableMixin, GenomeDataProvider ): """ Interval index files used for GFF, Pileup files. @@ -1207,7 +1205,7 @@ col_name_data_attr_mapping = { 4 : { 'index': 4 , 'name' : 'Score' } } dataset_type = 'interval_index' - + def write_data_to_file( self, regions, filename ): source = open( self.original_dataset.file_name ) index = Indexes( self.converted_dataset.file_name ) @@ -1230,10 +1228,10 @@ feature = reader.next() for interval in feature.intervals: out.write( '\t'.join( interval.fields ) + '\n' ) - + source.close() out.close() - + def get_iterator( self, chrom, start, end, **kwargs ): """ Returns an array with values: (a) source file and (b) an iterator that @@ -1246,7 +1244,7 @@ if chrom not in index.indexes: # Try alternative naming. chrom = _convert_between_ucsc_and_ensemble_naming( chrom ) - + return index.find(chrom, start, end) def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ): @@ -1258,7 +1256,7 @@ # Build data to return. Payload format is: # [ <guid/offset>, <start>, <end>, <name>, <score>, <strand>, <thick_start>, # <thick_end>, <blocks> ] - # + # # First three entries are mandatory, others are optional. # filter_cols = from_json_string( kwargs.get( "filter_cols", "[]" ) ) @@ -1272,7 +1270,7 @@ break source.seek( offset ) # TODO: can we use column metadata to fill out payload? - + # GFF dataset. reader = GFFReaderWrapper( source, fix_strand=True ) feature = reader.next() @@ -1286,13 +1284,13 @@ class RawGFFDataProvider( GenomeDataProvider ): """ Provide data from GFF file that has not been indexed. - + NOTE: this data provider does not use indices, and hence will be very slow for large datasets. 
""" dataset_type = 'interval_index' - + def get_iterator( self, chrom, start, end, **kwargs ): """ Returns an iterator that provides data in the region chrom:start-end as well as @@ -1302,18 +1300,18 @@ # Read first line in order to match chrom naming format. line = source.readline() - + # If line empty, assume file is empty and return empty iterator. if len( line ) == 0: return iter([]) - + # Determine chromosome naming format. dataset_chrom = line.split()[0] if not _chrom_naming_matches( chrom, dataset_chrom ): chrom = _convert_between_ucsc_and_ensemble_naming( chrom ) # Undo read. source.seek( 0 ) - + def features_in_region_iter(): offset = 0 for feature in GFFReaderWrapper( source, fix_strand=True ): @@ -1324,7 +1322,7 @@ offset += feature.raw_size return features_in_region_iter() - + def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ): """ Process data from an iterator to a format that can be provided to client. @@ -1340,22 +1338,22 @@ if count-start_val >= max_vals: message = self.error_max_vals % ( max_vals, "reads" ) break - + payload = package_gff_feature( feature, no_detail=no_detail, filter_cols=filter_cols ) payload.insert( 0, offset ) results.append( payload ) - + return { 'data': results, 'dataset_type': self.dataset_type, 'message': message } - + class GtfTabixDataProvider( TabixDataProvider ): """ Returns data from GTF datasets that are indexed via tabix. """ - + def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ): # Loop through lines and group by transcript_id; each group is a feature. - + # TODO: extend this code or use code in gff_util to process GFF/3 as well # and then create a generic GFFDataProvider that can be used with both # raw and tabix datasets. @@ -1369,7 +1367,7 @@ feature = [] features[ transcript_id ] = feature feature.append( GFFInterval( None, line.split( '\t') ) ) - + # Process data. filter_cols = from_json_string( kwargs.get( "filter_cols", "[]" ) ) no_detail = ( "no_detail" in kwargs ) @@ -1382,12 +1380,12 @@ if count-start_val >= max_vals: message = self.error_max_vals % ( max_vals, "reads" ) break - - feature = GFFFeature( None, intervals=intervals ) + + feature = GFFFeature( None, intervals=intervals ) payload = package_gff_feature( feature, no_detail=no_detail, filter_cols=filter_cols ) payload.insert( 0, feature.intervals[ 0 ].attributes[ 'transcript_id' ] ) results.append( payload ) - + return { 'data': results, 'message': message } # @@ -1397,26 +1395,26 @@ class ENCODEPeakDataProvider( GenomeDataProvider ): """ Abstract class that processes ENCODEPeak data from native format to payload format. - + Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks ] """ - + def get_iterator( self, chrom, start, end, **kwargs ): raise "Unimplemented Method" - + def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ): """ Provides """ - + ## FIXMEs: # (1) should be able to unify some of this code with BedDataProvider.process_data # (2) are optional number of parameters supported? - + # Build data to return. Payload format is: # [ <guid/offset>, <start>, <end>, <name>, <strand>, <thick_start>, # <thick_end>, <blocks> ] - # + # # First three entries are mandatory, others are optional. # no_detail = ( "no_detail" in kwargs ) @@ -1431,16 +1429,16 @@ feature = line.split() length = len( feature ) - + # Feature initialization. payload = [ # GUID is just a hash of the line hash( line ), # Add start, end. 
- int( feature[1] ), + int( feature[1] ), int( feature[2] ) ] - + if no_detail: rval.append( payload ) continue @@ -1448,7 +1446,7 @@ # Extend with additional data. payload.extend( [ # Add name, strand. - feature[3], + feature[3], feature[5], # Thick start, end are feature start, end for now. int( feature[1] ), @@ -1465,12 +1463,12 @@ rval.append( payload ) return { 'data': rval, 'message': message } - + class ENCODEPeakTabixDataProvider( TabixDataProvider, ENCODEPeakDataProvider ): """ Provides data from an ENCODEPeak dataset indexed via tabix. """ - + def get_filters( self ): """ Returns filters for dataset. @@ -1478,26 +1476,26 @@ # HACK: first 8 fields are for drawing, so start filter column index at 9. filter_col = 8 filters = [] - filters.append( { 'name': 'Score', - 'type': 'number', + filters.append( { 'name': 'Score', + 'type': 'number', 'index': filter_col, 'tool_id': 'Filter1', 'tool_exp_name': 'c6' } ) filter_col += 1 - filters.append( { 'name': 'Signal Value', - 'type': 'number', + filters.append( { 'name': 'Signal Value', + 'type': 'number', 'index': filter_col, 'tool_id': 'Filter1', 'tool_exp_name': 'c7' } ) filter_col += 1 - filters.append( { 'name': 'pValue', - 'type': 'number', + filters.append( { 'name': 'pValue', + 'type': 'number', 'index': filter_col, 'tool_id': 'Filter1', 'tool_exp_name': 'c8' } ) filter_col += 1 - filters.append( { 'name': 'qValue', - 'type': 'number', + filters.append( { 'name': 'qValue', + 'type': 'number', 'index': filter_col, 'tool_id': 'Filter1', 'tool_exp_name': 'c9' } ) @@ -1523,7 +1521,7 @@ feature = line.split() length = len( feature ) - + s1 = int( feature[1] ) e1 = int( feature[2] ) c = feature[3] @@ -1538,14 +1536,14 @@ # Add start1, end1, chr2, start2, end2, value. s1, e1, c, s2, e2, v ] - + rval.append( payload ) return { 'data': rval, 'message': message } def get_default_max_vals( self ): return 100000; - + class ChromatinInteractionsTabixDataProvider( TabixDataProvider, ChromatinInteractionsDataProvider ): def get_iterator( self, chrom, start=0, end=sys.maxint, interchromosomal=False, **kwargs ): """ @@ -1556,7 +1554,7 @@ def filter( iter ): for line in iter: feature = line.split() - s1 = int( feature[1] ) + s1 = int( feature[1] ) e1 = int( feature[2] ) c = feature[3] s2 = int( feature[4] ) @@ -1568,22 +1566,22 @@ if interchromosomal and c != chrom: yield line return filter( TabixDataProvider.get_iterator( self, chrom, filter_start, end ) ) - -# + +# # -- Helper methods. -- # def package_gff_feature( feature, no_detail=False, filter_cols=[] ): """ Package a GFF feature in an array for data providers. """ feature = convert_gff_coords_to_bed( feature ) - + # No detail means only start, end. if no_detail: return [ feature.start, feature.end ] - + # Return full feature. - payload = [ feature.start, - feature.end, + payload = [ feature.start, + feature.end, feature.name(), feature.strand, # No notion of thick start, end in GFF, so make everything @@ -1591,9 +1589,9 @@ feature.start, feature.end ] - - # HACK: ignore interval with name 'transcript' from feature. - # Cufflinks puts this interval in each of its transcripts, + + # HACK: ignore interval with name 'transcript' from feature. + # Cufflinks puts this interval in each of its transcripts, # and they mess up trackster by covering the feature's blocks. # This interval will always be a feature's first interval, # and the GFF's third column is its feature name. 
@@ -1605,7 +1603,7 @@
     block_starts = [ ( interval.start - feature.start ) for interval in feature_intervals ]
     blocks = zip( block_sizes, block_starts )
     payload.append( [ ( feature.start + block[1], feature.start + block[1] + block[0] ) for block in blocks ] )
-
+
     # Add filter data to payload.
     for col in filter_cols:
         if col == "Score":

https://bitbucket.org/galaxy/galaxy-central/commits/d2a624fd6dc2/
Changeset:   d2a624fd6dc2
User:        dannon
Date:        2013-08-30 05:17:36
Summary:     dataprovider dataset cleanup, add missing bx.bbi import
Affected #:  1 file

diff -r 4b86e65ee645caa6b5923b05e759a06a9f06113f -r d2a624fd6dc2fecdc319848f3d35c2f4b66a389e lib/galaxy/datatypes/dataproviders/dataset.py
--- a/lib/galaxy/datatypes/dataproviders/dataset.py
+++ b/lib/galaxy/datatypes/dataproviders/dataset.py
@@ -6,19 +6,18 @@
 (e.g. parsing genomic regions from their source)
 """
-from galaxy import eggs
-import pkg_resources
-pkg_resources.require( 'bx-python' )
-from bx import seq as bx_seq
-from bx import wiggle as bx_wig
-
-import exceptions
 import base
 import line
 import column
 import external
+from galaxy import eggs
+eggs.require( 'bx-python' )
+from bx import seq as bx_seq
+from bx import wiggle as bx_wig
+from bx import bbi as bx_bbi
+
 _TODO = """
 use bx as much as possible
 gff3 hierarchies

https://bitbucket.org/galaxy/galaxy-central/commits/068acf051f9a/
Changeset:   068acf051f9a
User:        dannon
Date:        2013-08-30 05:20:05
Summary:     Variable confusion in dataproviders/dataset -- clarify and use correct indices var
Affected #:  1 file

diff -r d2a624fd6dc2fecdc319848f3d35c2f4b66a389e -r 068acf051f9acfb8058f2bc50b0361d9a59d8cdb lib/galaxy/datatypes/dataproviders/dataset.py
--- a/lib/galaxy/datatypes/dataproviders/dataset.py
+++ b/lib/galaxy/datatypes/dataproviders/dataset.py
@@ -145,10 +145,10 @@
             :returns: list of column indeces for the named columns.
         """
         region_column_names = ( 'chromCol', 'startCol', 'endCol' )
-        region_indeces = [ self.get_metadata_column_index_by_name( name ) for name in region_column_names ]
-        if check and not all( map( lambda i: i != None, indeces ) ):
-            raise ValueError( "Could not determine proper column indeces for chrom, start, end: %s" %( str( indeces ) ) )
-        return region_indeces
+        region_indices = [ self.get_metadata_column_index_by_name( name ) for name in region_column_names ]
+        if check and not all( map( lambda i: i != None, region_indices) ):
+            raise ValueError( "Could not determine proper column indices for chrom, start, end: %s" %( str( region_indices ) ) )
+        return region_indices

 class ConvertedDatasetDataProvider( DatasetDataProvider ):

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving this
because you have the service enabled, addressing the recipient of this email.
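A note on the first changeset (fcabfb819232): the method body already reads from `source` (`line = source.readline()`), but the old signature never bound that name — a value passed via `**kwargs` is only visible inside the kwargs dict, not as a local — so making `source` an explicit parameter is what lets the body run at all. The sketch below is a simplified, self-contained illustration of that pattern; the class name, the toy region filter, and the in-memory input are placeholders, not the actual Galaxy provider.

    # Hypothetical, simplified sketch -- not the real RawBedDataProvider.
    from io import StringIO

    class RawBedSketch( object ):

        def get_iterator( self, source, chrom=None, start=None, end=None, **kwargs ):
            # Peek at the first line to learn the dataset's chrom naming, then rewind.
            # (The real provider does this to reconcile 'chr1' vs. '1' style names.)
            first_line = source.readline()
            dataset_chrom = first_line.split()[0] if first_line.strip() else None
            if chrom and dataset_chrom and chrom.startswith( 'chr' ) != dataset_chrom.startswith( 'chr' ):
                chrom = chrom[ 3: ] if chrom.startswith( 'chr' ) else 'chr' + chrom
            source.seek( 0 )

            def line_filter_iter():
                for line in source:
                    fields = line.split()
                    feature_chrom = fields[0]
                    feature_start, feature_end = int( fields[1] ), int( fields[2] )
                    # Skip features outside the requested region.
                    if chrom is not None and feature_chrom != chrom:
                        continue
                    if start is not None and feature_end < start:
                        continue
                    if end is not None and feature_start > end:
                        continue
                    yield line
            return line_filter_iter()

    # Usage with an in-memory BED snippet; a real provider reads the dataset file.
    bed = StringIO( u"chr1\t10\t20\tfeature_a\nchr1\t300\t400\tfeature_b\n" )
    for row in RawBedSketch().get_iterator( bed, chrom='1', start=0, end=100 ):
        print( row.strip() )   # only feature_a falls inside chr1:0-100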
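And on the last changeset (068acf051f9a): the validation line referenced an undefined name (`indeces`) instead of the `region_indices` list built immediately above it, so calling the helper with `check=True` raised a NameError rather than validating the columns. Below is a standalone sketch of the corrected behaviour; the function name and the plain dict standing in for Galaxy's dataset metadata are illustrative assumptions, not the provider's real API.

    # Standalone sketch of the corrected region-column lookup.
    def get_genomic_region_indices( metadata, check=False ):
        region_column_names = ( 'chromCol', 'startCol', 'endCol' )
        region_indices = [ metadata.get( name ) for name in region_column_names ]
        # Validate against the list that was actually built; the old code named a
        # variable that did not exist here, so check=True could never succeed.
        if check and not all( i is not None for i in region_indices ):
            raise ValueError( "Could not determine proper column indices for "
                              "chrom, start, end: %s" % str( region_indices ) )
        return region_indices

    # Example: a BED-like datatype whose chrom/start/end live in columns 1, 2, 3.
    print( get_genomic_region_indices( { 'chromCol': 1, 'startCol': 2, 'endCol': 3 }, check=True ) )
    # -> [1, 2, 3]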