1 new changeset in galaxy-central:
http://bitbucket.org/galaxy/galaxy-central/changeset/af6ed2f06f35/
changeset: af6ed2f06f35
user: jgoecks
date: 2011-09-19 17:59:09
summary: Trackster: enable tools that produce BED output to be used in visual analytics framework. Enable cluster tool in Trackster.
affected #: 3 files (-1 bytes)
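
For orientation, the commit introduces a payload format for BED features, documented in the BedDataProvider docstring below. The following is a minimal sketch of how one twelve-column BED line maps to that format; the sample line is hypothetical and the code is illustrative, not part of the commit:

    # Illustrative only: mirrors the parsing done in BedDataProvider.process_data below.
    line = "chr1\t100\t500\tfeat1\t960\t+\t100\t500\t0\t2\t50,50\t0,350"
    feature = line.split()
    # Mandatory entries: unique id (a hash of the line), start, end.
    payload = [ hash(line), int(feature[1]), int(feature[2]) ]
    # Optional entries: name, strand, thick start, thick end.
    payload += [ feature[3], feature[5], int(feature[6]), int(feature[7]) ]
    # Blocks become (start, end) pairs in chromosome coordinates.
    block_sizes = [ int(n) for n in feature[10].split(',') if n != '' ]
    block_starts = [ int(n) for n in feature[11].split(',') if n != '' ]
    payload.append( [ ( int(feature[1]) + s, int(feature[1]) + s + sz )
                      for sz, s in zip( block_sizes, block_starts ) ] )
    # payload is now: [ <hash>, 100, 500, 'feat1', '+', 100, 500, [ (100, 150), (450, 500) ] ]
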
--- a/lib/galaxy/visualization/tracks/data_providers.py Mon Sep 19 10:24:11 2011 -0400
+++ b/lib/galaxy/visualization/tracks/data_providers.py Mon Sep 19 11:59:09 2011 -0400
@@ -135,6 +135,88 @@
{ 'name' : attrs[ 'name' ], 'type' : column_types[viz_col_index], \
  'index' : attrs[ 'index' ] } )
return filters
+
+class BedDataProvider( TracksDataProvider ):
+ """
+ Abstract class that processes BED data from text format to payload format.
+
+ Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks ]
+ """
+
+ def get_iterator( self, chrom, start, end ):
+ raise NotImplementedError( "Unimplemented Method" )
+
+ def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
+ iterator = self.get_iterator( chrom, start, end )
+ return self.process_data( iterator, start_val, max_vals, **kwargs )
+
+ def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
+ """
+ Provides payload data and an optional message from an iterator over BED lines.
+ """
+ # Build data to return. Payload format is:
+ # [ <guid/offset>, <start>, <end>, <name>, <score>, <strand>, <thick_start>,
+ # <thick_end>, <blocks> ]
+ #
+ # First three entries are mandatory, others are optional.
+ #
+ filter_cols = from_json_string( kwargs.get( "filter_cols", "[]" ) )
+ no_detail = ( "no_detail" in kwargs )
+ rval = []
+ message = None
+ for count, line in enumerate( iterator ):
+ if count < start_val:
+ continue
+ if max_vals and count-start_val >= max_vals:
+ message = ERROR_MAX_VALS % ( max_vals, "features" )
+ break
+ # TODO: can we use column metadata to fill out payload?
+ # TODO: use function to set payload data
+
+ feature = line.split()
+ length = len(feature)
+ # Unique id is just a hash of the line
+ payload = [ hash(line), int(feature[1]), int(feature[2]) ]
+
+ if no_detail:
+ rval.append( payload )
+ continue
+
+ # Simpler way to add stuff, but type casting is not done.
+ # Name, score, strand, thick start, thick end.
+ #end = min( len( feature ), 8 )
+ #payload.extend( feature[ 3:end ] )
+
+ # Name, strand, thick start, thick end.
+ if length >= 4:
+ payload.append(feature[3])
+ if length >= 6:
+ payload.append(feature[5])
+ if length >= 8:
+ payload.append(int(feature[6]))
+ payload.append(int(feature[7]))
+
+ # Blocks.
+ if length >= 12:
+ block_sizes = [ int(n) for n in feature[10].split(',') if n != '' ]
+ block_starts = [ int(n) for n in feature[11].split(',') if n != '' ]
+ blocks = zip( block_sizes, block_starts )
+ payload.append( [ ( int(feature[1]) + block[1], int(feature[1]) + block[1] + block[0] ) for block in blocks ] )
+
+ # Score (filter data)
+ if length >= 5 and filter_cols and filter_cols[0] == "Score":
+ payload.append( float(feature[4]) )
+
+ rval.append( payload )
+
+ return { 'data': rval, 'message': message }
+
+ def write_data_to_file( self, chrom, start, end, filename ):
+ iterator = self.get_iterator( chrom, start, end )
+ out = open( filename, "w" )
+ for line in iterator:
+ out.write( "%s\n" % line )
+ out.close()
class SummaryTreeDataProvider( TracksDataProvider ):
"""
@@ -573,78 +655,7 @@
results.append( payload )
return { 'data': results, 'message': message }
-
-class BedDataProvider( TabixDataProvider ):
- """
- Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks ]
- """
-
- def process_data( self, iterator, start_val=0, max_vals=sys.maxint, **kwargs ):
- #
- # Build data to return. Payload format is:
- # [ <guid/offset>, <start>, <end>, <name>, <score>, <strand>, <thick_start>,
- # <thick_end>, <blocks> ]
- #
- # First three entries are mandatory, others are optional.
- #
- filter_cols = from_json_string( kwargs.get( "filter_cols", "[]" ) )
- no_detail = ( "no_detail" in kwargs )
- rval = []
- message = None
- for count, line in enumerate( iterator ):
- if count < start_val:
- continue
- if count-start_val >= max_vals:
- message = ERROR_MAX_VALS % ( max_vals, "features" )
- break
- # TODO: can we use column metadata to fill out payload?
- # TODO: use function to set payload data
-
- feature = line.split()
- length = len(feature)
- # Unique id is just a hash of the line
- payload = [ hash(line), int(feature[1]), int(feature[2]) ]
-
- if no_detail:
- rval.append( payload )
- continue
-
- # Simpler way to add stuff, but type casting is not done.
- # Name, score, strand, thick start, thick end.
- #end = min( len( feature ), 8 )
- #payload.extend( feature[ 3:end ] )
-
- # Name, strand, thick start, thick end.
- if length >= 4:
- payload.append(feature[3])
- if length >= 6:
- payload.append(feature[5])
- if length >= 8:
- payload.append(int(feature[6]))
- payload.append(int(feature[7]))
-
- # Blocks.
- if length >= 12:
- block_sizes = [ int(n) for n in feature[10].split(',') if n != '' ]
- block_starts = [ int(n) for n in feature[11].split(',') if n != '' ]
- blocks = zip( block_sizes, block_starts )
- payload.append( [ ( int(feature[1]) + block[1], int(feature[1]) + block[1] + block[0] ) for block in blocks ] )
-
- # Score (filter data)
- if length >= 5 and filter_cols and filter_cols[0] == "Score":
- payload.append( float(feature[4]) )
- rval.append( payload )
-
- return { 'data': rval, 'message': message }
-
- def write_data_to_file( self, chrom, start, end, filename ):
- iterator = self.get_iterator( chrom, start, end )
- out = open( filename, "w" )
- for line in iterator:
- out.write( "%s\n" % line )
- out.close()
-
class VcfDataProvider( TabixDataProvider ):
"""
VCF data provider for the Galaxy track browser.
@@ -685,7 +696,7 @@
Provide data from GFF file.
NOTE: this data provider does not use indices, and hence will be very slow
- for large datasets.
+ for large datasets.
"""
def get_data( self, chrom, start, end, start_val=0, max_vals=sys.maxint, **kwargs ):
start, end = int( start ), int( end )
@@ -710,6 +721,30 @@
offset += feature.raw_size
return { 'data': results, 'message': message }
+
+class BedTabixDataProvider( TabixDataProvider, BedDataProvider ):
+ """
+ Provides data from a BED file indexed via tabix.
+ """
+ pass
+
+class RawBedDataProvider( BedDataProvider ):
+ """
+ Provide data from BED file.
+
+ NOTE: this data provider does not use indices, and hence will be very slow
+ for large datasets.
+ """
+
+ def get_iterator( self, chrom, start, end ):
+ def line_filter_iter():
+ for line in open( self.original_dataset.file_name ):
+ feature = line.split()
+ feature_chrom, feature_start, feature_end = feature[ 0:3 ]
+ if feature_chrom != chrom or feature_start > end or feature_end < start:
+ continue
+ yield line
+ return line_filter_iter()
#
# Helper methods.
@@ -719,7 +754,7 @@
# type. First key is converted dataset type; if result is another dict, second key
# is original dataset type. TODO: This needs to be more flexible.
dataset_type_name_to_data_provider = {
- "tabix": { Vcf: VcfDataProvider, Bed: BedDataProvider, "default" : TabixDataProvider },
+ "tabix": { Vcf: VcfDataProvider, Bed: BedTabixDataProvider, "default" : TabixDataProvider },
"interval_index": IntervalIndexDataProvider,
"bai": BamDataProvider,
"summary_tree": SummaryTreeDataProvider,
--- a/lib/galaxy/web/controllers/tracks.py Mon Sep 19 10:24:11 2011 -0400
+++ b/lib/galaxy/web/controllers/tracks.py Mon Sep 19 11:59:09 2011 -0400
@@ -397,7 +397,7 @@
def raw_data( self, trans, dataset_id, chrom, low, high, **kwargs ):
"""
Uses original (raw) dataset to return data. This method is useful
- when the dataset is not yet indexed and hence using /data would
+ when the dataset is not yet indexed and hence using data would
be slow because indexes need to be created.
"""
@@ -409,10 +409,15 @@
# Return data.
data = None
+ # TODO: for raw data requests, map dataset type to provider using dict in data_providers.py
if isinstance( dataset.datatype, Gff ):
data = GFFDataProvider( original_dataset=dataset ).get_data( chrom, low, high, **kwargs )
data[ 'dataset_type' ] = 'interval_index'
data[ 'extra_info' ] = None
+ if isinstance( dataset.datatype, Bed ):
+ data = RawBedDataProvider( original_dataset=dataset ).get_data( chrom, low, high, **kwargs )
+ data[ 'dataset_type' ] = 'interval_index'
+ data[ 'extra_info' ] = None
return data
@web.json
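
The TODO in the hunk above suggests replacing the isinstance chain with dict-based dispatch, along the lines of dataset_type_name_to_data_provider in data_providers.py. A minimal sketch of what that could look like; the mapping and the helper function are hypothetical, not part of the commit:

    # Hypothetical dispatch table for raw (unindexed) datasets; the commit
    # only hints at this via the TODO, so names here are assumptions.
    # Gff, Bed, GFFDataProvider, and RawBedDataProvider are imported from
    # the Galaxy datatype and data-provider modules as in the diff.
    raw_datatype_to_data_provider = {
        Gff: GFFDataProvider,
        Bed: RawBedDataProvider,
    }

    def get_raw_data( dataset, chrom, low, high, **kwargs ):
        # Use the first provider whose datatype matches the dataset.
        for datatype, provider_class in raw_datatype_to_data_provider.items():
            if isinstance( dataset.datatype, datatype ):
                data = provider_class( original_dataset=dataset ).get_data( chrom, low, high, **kwargs )
                data[ 'dataset_type' ] = 'interval_index'
                data[ 'extra_info' ] = None
                return data
        return None
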
--- a/tools/new_operations/cluster.xml Mon Sep 19 10:24:11 2011 -0400
+++ b/tools/new_operations/cluster.xml Mon Sep 19 11:59:09 2011 -0400
@@ -1,5 +1,6 @@
<tool id="gops_cluster_1" name="Cluster">
  <description>the intervals of a dataset</description>
+ <trackster_conf/>
  <command interpreter="python">gops_cluster.py $input1 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -d $distance -m $minregions -o $returntype</command>
  <inputs>
    <param format="interval" name="input1" type="data">
Repository URL: https://bitbucket.org/galaxy/galaxy-central/