commit/galaxy-central: jgoecks: Rewrite sampling code for BBI data provider to handle (a) boundary cases during base-level resolution and (b) remainder of region not sampled during first pass.

15 Aug 2012

1 new commit in galaxy-central:


https://bitbucket.org/galaxy/galaxy-central/changeset/565476ce4f03/
changeset:   565476ce4f03
user:        jgoecks
date:        2012-08-15 23:49:39
summary:     Rewrite sampling code for BBI data provider to handle (a) boundary cases during base-level resolution and (b) remainder of region not sampled during first pass.
affected #:  1 file

diff -r 2531e085f2625b60135a6b4972f125e22a4fd354 -r 565476ce4f0301d23538d87aeef805edc099badf lib/galaxy/visualization/tracks/data_providers.py

--- a/lib/galaxy/visualization/tracks/data_providers.py
+++ b/lib/galaxy/visualization/tracks/data_providers.py
@@ -947,55 +947,69 @@
 
                 return dict( data=dict( min=summary.min_val[0], max=summary.max_val[0], mean=mean, sd=sd ) )
 
-        # The following seems not to work very well, for example it will only return one
-        # data point if the tile is 1280px wide. Not sure what the intent is.
+        # Sample from region using approximately this many samples.
+        N = 1000
 
-        # The first zoom level for BBI files is 640. If too much is requested, it will look at each block instead
-        # of summaries. The calculation done is: zoom <> (end-start)/num_points/2.
-        # Thus, the optimal number of points is (end-start)/num_points/2 = 640
-        # num_points = (end-start) / 1280
-        #num_points = (end-start) / 1280
-        #if num_points < 1:
-        #    num_points = end - start
-        #else:
-        #    num_points = min(num_points, 500)
+        def summarize_region( bbi, chrom, start, end, num_points ):
+            '''
+            Returns results from summarizing a region using num_points.
+            NOTE: num_points cannot be greater than end - start or BBI
+            will return None for all positions.
+            '''
+            result = []
 
-        # For now, we'll do 1000 data points by default. However, the summaries
-        # don't seem to work when a summary pixel corresponds to less than one
-        # datapoint, so we prevent that. 
+            # Get summary; this samples at intervals of length
+            # (end - start)/num_points -- i.e. drops any fractional component
+            # of interval length.
+            summary = bbi.summarize( chrom, start, end, num_points )
+            if summary:
+                #mean = summary.sum_data / summary.valid_count
+                
+                ## Standard deviation by bin, not yet used
+                ## var = summary.sum_squares - mean
+                ## var /= minimum( valid_count - 1, 1 )
+                ## sd = sqrt( var )
+            
+                pos = start
+                step_size = (end - start) / num_points
 
-        # FIXME: need to choose the number of points to maximize coverage of the area. 
-        # It appears that BBI calculates points using intervals of 
-        # floor( num_points / end - start ) 
-        # In some cases, this prevents sampling near the end of the interval, 
-        # especially when (a) the total interval is small ( < 20-30Kb) and (b) the 
-        # computed interval size has a large fraction, e.g. 14.7 or 35.8
-        num_points = min( 1000, end - start )
+                for i in range( num_points ):
+                    result.append( (pos, float_nan( summary.sum_data[i] / summary.valid_count[i] ) ) )
+                    pos += step_size
 
-        # HACK to address the FIXME above; should generalize.
-        if end - start <= 2000:
-            num_points = end - start
+            return result
 
-        summary = bbi.summarize( chrom, start, end, num_points )
+        # Approach is different depending on region size.
+        if end - start < N:
+            # Get values for individual bases in region, including start and end.
+            # To do this, need to increase end to next base and request number of points.
+            num_points = end - start + 1
+            end += 1
+ 
+            result = summarize_region( bbi, chrom, start, end, num_points )
+        else:
+            # 
+            # The goal is to sample the region between start and end uniformly 
+            # using N data points. The challenge is that the size of sampled 
+            # intervals rarely is full bases, so sampling using N points will 
+            # leave the end of the region unsampled. To rectify this, samples
+            # beyond N are taken at the end of the interval.
+            #
+
+            # Do initial summary.
+            num_points = N
+            result = summarize_region( bbi, chrom, start, end, num_points )        
+
+            # Do summary of remaining part of region.
+            step_size = ( end - start ) / num_points
+            new_start = start + step_size * num_points
+            new_num_points = min( ( end - new_start ) / step_size, end - start )
+            if new_num_points is not 0:
+                result.extend( summarize_region( bbi, chrom, new_start, end, new_num_points ) )
+            #TODO: progressively reduce step_size to generate more datapoints.
+        
+        # Cleanup and return.
         f.close()
-
-        result = []
-
-        if summary:
-            #mean = summary.sum_data / summary.valid_count
-            
-            ## Standard deviation by bin, not yet used
-            ## var = summary.sum_squares - mean
-            ## var /= minimum( valid_count - 1, 1 )
-            ## sd = sqrt( var )
-        
-            pos = start
-            step_size = (end - start) / num_points
-
-            for i in range( num_points ):
-                result.append( (pos, float_nan( summary.sum_data[i] / summary.valid_count[i] ) ) )
-                pos += step_size
-        
         return { 'data': result }
 
 class BigBedDataProvider( BBIDataProvider ):

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.

    

Bitbucket

tags

participants (1)