[hg] galaxy 3683: Updates for 'Profile Annotations for a set of ...
details:   http://www.bx.psu.edu/hg/galaxy/rev/742fa2afcad9
changeset: 3683:742fa2afcad9
user:      Dan Blankenberg <dan@bx.psu.edu>
date:      Fri Apr 23 11:14:26 2010 -0400
description:
Updates for 'Profile Annotations for a set of genomic intervals' tool. This tool will now report a 'data version'.

Add a script that creates the indexes and table description xml from a UCSC database dump.

diffstat:

 lib/galaxy/tools/parameters/basic.py                           |   12 +-
 scripts/tools/annotation_profiler/README.txt                   |   54 +
 scripts/tools/annotation_profiler/build_profile_indexes.py     |  338 ++++++++++
 tool-data/annotation_profiler_options.xml.sample               |    2 +-
 tools/annotation_profiler/annotation_profiler.xml              |    4 +-
 tools/annotation_profiler/annotation_profiler_for_interval.py  |   74 +-
 6 files changed, 449 insertions(+), 35 deletions(-)

diffs (603 lines):

diff -r c37de7a983e7 -r 742fa2afcad9 lib/galaxy/tools/parameters/basic.py
--- a/lib/galaxy/tools/parameters/basic.py  Thu Apr 22 21:11:17 2010 -0400
+++ b/lib/galaxy/tools/parameters/basic.py  Fri Apr 23 11:14:26 2010 -0400
@@ -960,8 +960,11 @@
             if filter.get( 'type' ) == 'data_meta':
                 if filter.get( 'data_ref' ) not in self.filtered:
                     self.filtered[filter.get( 'data_ref' )] = {}
-                self.filtered[filter.get( 'data_ref' )][filter.get( 'meta_key' )] = { 'value': filter.get( 'value' ), 'options':[] }
-                recurse_option_elems( self.filtered[filter.get( 'data_ref' )][filter.get( 'meta_key' )]['options'], filter.find( 'options' ).findall( 'option' ) )
+                if filter.get( 'meta_key' ) not in self.filtered[filter.get( 'data_ref' )]:
+                    self.filtered[filter.get( 'data_ref' )][filter.get( 'meta_key' )] = {}
+                if filter.get( 'value' ) not in self.filtered[filter.get( 'data_ref' )][filter.get( 'meta_key' )]:
+                    self.filtered[filter.get( 'data_ref' )][filter.get( 'meta_key' )][filter.get( 'value' )] = []
+                recurse_option_elems( self.filtered[filter.get( 'data_ref' )][filter.get( 'meta_key' )][filter.get( 'value' )], filter.find( 'options' ).findall( 'option' ) )
         else:
             recurse_option_elems( self.options, elem.find( 'options' ).findall( 'option' ) )

@@ -974,8 +977,9 @@
                 dataset = dataset.dataset
             if dataset:
                 for meta_key, meta_dict in filter_value.iteritems():
-                    if dataset.metadata.spec[meta_key].param.to_string( dataset.metadata.get( meta_key ) ) == meta_dict['value']:
-                        options.extend( meta_dict['options'] )
+                    check_meta_val = dataset.metadata.spec[meta_key].param.to_string( dataset.metadata.get( meta_key ) )
+                    if check_meta_val in meta_dict:
+                        options.extend( meta_dict[check_meta_val] )
             return options
         return self.options

diff -r c37de7a983e7 -r 742fa2afcad9 scripts/tools/annotation_profiler/README.txt
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/tools/annotation_profiler/README.txt  Fri Apr 23 11:14:26 2010 -0400
@@ -0,0 +1,54 @@
+This file explains how to create annotation indexes for the annotation profiler tool. Annotation profiler indexes are an exceedingly simple binary format,
+containing no header information and consisting of an ordered linear list of (start,stop encoded individually as '<I') regions which are covered by a UCSC table, partitioned
+by chromosome name. Genomic regions are merged by overlap / direct adjacency (e.g. a table having ranges of: 1-10, 6-12, 12-20 and 25-28 results in two merged ranges of: 1-20 and 25-28).
+
+Files are arranged like:
+/profiled_annotations/DBKEY/TABLE_NAME/
+    CHROMOSOME_NAME.covered
+    CHROMOSOME_NAME.total_coverage
+    CHROMOSOME_NAME.total_regions
+/profiled_annotations/DBKEY/
+    DBKEY_tables.xml
+    chromosomes.txt
+    profiler_info.txt
+
+where CHROMOSOME_NAME.covered is the binary index file, CHROMOSOME_NAME.total_coverage is a text file containing the integer count of bases covered by the
+table, and CHROMOSOME_NAME.total_regions contains the integer count of regions found in CHROMOSOME_NAME.covered.
+
+DBKEY_tables.xml should be appended to the annotation profiler's available-tables configuration file (tool-data/annotation_profiler_options.xml).
+The DBKEY should also be added as a new line to the annotation profiler valid builds file (annotation_profiler_valid_builds.txt).
+The output (/profiled_annotations/DBKEY) should be made available as GALAXY_ROOT/tool-data/annotation_profiler/DBKEY.
+
+profiler_info.txt describes the generated annotations, one tab-separated label/value pair per line:
+    profiler_version - the version of the build_profile_indexes.py script that was used to generate the profiled data
+    dbkey - the dbkey used for the run
+    chromosomes - the names and lengths of the chromosomes that were used to parse single-chromosome tables (tables divided into individual files by chromosome)
+    dump_time - the declared dump time of the database, taken from trackDb.txt.gz
+    profiler_time - seconds since the epoch, in UTC, when the database dump was profiled
+    dump_hash - an md5 hex digest of the contents of the database dump that was profiled
+    profiler_hash - an md5 hex digest of the generated profiled table data
+
+
+Typical usage:
+
+python build_profile_indexes.py -d hg19 -i /ucsc_data/hg19/database/ > hg19.txt
+
+where the genome build is hg19 and /ucsc_data/hg19/database/ contains the database dump downloaded from UCSC (e.g. obtained by rsync: rsync -avzP rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/ /ucsc_data/hg19/database/).
+
+
+By default, chromosome names come from a file named 'chromInfo.txt.gz' found in the input directory, with FTP used as a backup.
+When FTP is used to obtain the names of chromosomes from UCSC for a particular genome build, alternate FTP sites and paths can be specified with the --ftp_site and --ftp_path options.
+Chromosome names can instead be provided on the command line via the --chromosomes option, which accepts a comma-separated list of: ChromName1[=length],ChromName2[=length],...
+
+
+    usage = "usage: %prog options"
+    parser = OptionParser( usage=usage )
+    parser.add_option( '-d', '--dbkey', dest='dbkey', default='hg18', help='dbkey to process' )
+    parser.add_option( '-i', '--input_dir', dest='input_dir', default=os.path.join( 'golden_path','%s', 'database' ), help='Input Directory' )
+    parser.add_option( '-o', '--output_dir', dest='output_dir', default=os.path.join( 'profiled_annotations','%s' ), help='Output Directory' )
+    parser.add_option( '-c', '--chromosomes', dest='chromosomes', default='', help='Comma separated list of: ChromName1[=length],ChromName2[=length],...'
) + parser.add_option( '-b', '--bitset_size', dest='bitset_size', default=DEFAULT_BITSET_SIZE, type='int', help='Default BitSet size; overridden by sizes specified in chromInfo.txt.gz or by --chromosomes' ) + parser.add_option( '-f', '--ftp_site', dest='ftp_site', default='hgdownload.cse.ucsc.edu', help='FTP site; used for chromosome info when chromInfo.txt.gz method fails' ) + parser.add_option( '-p', '--ftp_path', dest='ftp_path', default='/goldenPath/%s/chromosomes/', help='FTP Path; used for chromosome info when chromInfo.txt.gz method fails' ) diff -r c37de7a983e7 -r 742fa2afcad9 scripts/tools/annotation_profiler/build_profile_indexes.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/tools/annotation_profiler/build_profile_indexes.py Fri Apr 23 11:14:26 2010 -0400 @@ -0,0 +1,338 @@ +#!/usr/bin/env python +#Dan Blankenberg + +VERSION = '1.0.0' # version of this script + +from optparse import OptionParser +import os, gzip, struct, time +from ftplib import FTP #do we want a diff method than using FTP to determine Chrom Names, eg use local copy + +#import md5 from hashlib; if python2.4 or less, use old md5 +try: + from hashlib import md5 +except ImportError: + from md5 import new as md5 + +#import BitSet from bx-python, try using eggs and package resources, fall back to any local installation +try: + from galaxy import eggs + import pkg_resources + pkg_resources.require( "bx-python" ) +except: pass #Maybe there is a local installation available +from bx.bitset import BitSet + +#Define constants +STRUCT_FMT = '<I' +STRUCT_SIZE = struct.calcsize( STRUCT_FMT ) +DEFAULT_BITSET_SIZE = 300000000 +CHUNK_SIZE = 1024 + +#Headers used to parse .sql files to determine column indexes for chromosome name, start and end +alias_spec = { + 'chromCol' : [ 'chrom' , 'CHROMOSOME' , 'CHROM', 'Chromosome Name', 'tName' ], + 'startCol' : [ 'start' , 'START', 'chromStart', 'txStart', 'Start Position (bp)', 'tStart', 'genoStart' ], + 'endCol' : [ 'end' , 'END' , 'STOP', 'chromEnd', 'txEnd', 'End Position (bp)', 'tEnd', 'genoEnd' ], +} + +#Headers used to parse trackDb.txt.gz +#TODO: these should be parsed directly from trackDb.sql +trackDb_headers = ["tableName", "shortLabel", "type", "longLabel", "visibility", "priority", "colorR", "colorG", "colorB", "altColorR", "altColorG", "altColorB", "useScore", "private", "restrictCount", "restrictList", "url", "html", "grp", "canPack", "settings"] + +def get_columns( filename ): + input_sql = open( filename ).read() + input_sql = input_sql.split( 'CREATE TABLE ' )[1].split( ';' )[0] + input_sql = input_sql.split( ' (', 1 ) + table_name = input_sql[0].strip().strip( '`' ) + input_sql = [ split.strip().split( ' ' )[0].strip().strip( '`' ) for split in input_sql[1].rsplit( ')', 1 )[0].strip().split( '\n' ) ] + print input_sql + chrom_col = None + start_col = None + end_col = None + for col_name in alias_spec['chromCol']: + for i, header_name in enumerate( input_sql ): + if col_name == header_name: + chrom_col = i + break + if chrom_col is not None: + break + + for col_name in alias_spec['startCol']: + for i, header_name in enumerate( input_sql ): + if col_name == header_name: + start_col = i + break + if start_col is not None: + break + + for col_name in alias_spec['endCol']: + for i, header_name in enumerate( input_sql ): + if col_name == header_name: + end_col = i + break + if end_col is not None: + break + + return table_name, chrom_col, start_col, end_col + + +def create_grouping_xml( input_dir, output_dir, dbkey ): + output_filename = os.path.join( 
output_dir, '%s_tables.xml' % dbkey ) + def load_groups( file_name = 'grp.txt.gz' ): + groups = {} + for line in gzip.open( os.path.join( input_dir, file_name ) ): + fields = line.split( '\t' ) + groups[fields[0]] = { 'desc': fields[1], 'priority': fields[2] } + return groups + f = gzip.open( os.path.join( input_dir, 'trackDb.txt.gz' ) ) + out = open( output_filename, 'wb' ) + tables = {} + cur_buf = '' + while True: + line = f.readline() + if not line: break + #remove new lines + line = line.rstrip( '\n\r' ) + line = line.replace( '\\\t', ' ' ) #replace escaped tabs with space + cur_buf += "%s\n" % line.rstrip( '\\' ) + if line.endswith( '\\' ): + continue #line is wrapped, next line + #all fields should be loaded now... + fields = cur_buf.split( '\t' ) + cur_buf = '' #reset buffer + assert len( fields ) == len( trackDb_headers ), 'Failed Parsing trackDb.txt.gz; fields: %s' % fields + table_name = fields[ 0 ] + tables[ table_name ] = {} + for field_name, field_value in zip( trackDb_headers, fields ): + tables[ table_name ][ field_name ] = field_value + #split settings fields into dict + fields = fields[-1].split( '\n' ) + tables[ table_name ][ 'settings' ] = {} + for field in fields: + setting_fields = field.split( ' ', 1 ) + setting_name = setting_value = setting_fields[ 0 ] + if len( setting_fields ) > 1: + setting_value = setting_fields[ 1 ] + if setting_name or setting_value: + tables[ table_name ][ 'settings' ][ setting_name ] = setting_value + #Load Groups + groups = load_groups() + in_groups = {} + for table_name, values in tables.iteritems(): + if os.path.exists( os.path.join( output_dir, table_name ) ): + group = values['grp'] + if group not in in_groups: + in_groups[group]={} + #***NAME CHANGE***, 'subTrack' no longer exists as a setting...use 'parent' instead + #subTrack = values.get('settings', {} ).get( 'subTrack', table_name ) + subTrack = values.get('settings', {} ).get( 'parent', table_name ).split( ' ' )[0] #need to split, because could be e.g. 
'trackgroup on' + if subTrack not in in_groups[group]: + in_groups[group][subTrack]=[] + in_groups[group][subTrack].append( table_name ) + + assigned_tables = [] + out.write( """<filter type="data_meta" data_ref="input1" meta_key="dbkey" value="%s">\n""" % ( dbkey ) ) + out.write( " <options>\n" ) + for group, subTracks in sorted( in_groups.iteritems() ): + out.write( """ <option name="%s" value="group-%s">\n""" % ( groups[group]['desc'], group ) ) + for sub_name, sub_tracks in subTracks.iteritems(): + if len( sub_tracks ) > 1: + out.write( """ <option name="%s" value="subtracks-%s">\n""" % ( sub_name, sub_name ) ) + sub_tracks.sort() + for track in sub_tracks: + track_label = track + if "$" not in tables[track]['shortLabel']: + track_label = tables[track]['shortLabel'] + out.write( """ <option name="%s" value="%s"/>\n""" % ( track_label, track ) ) + assigned_tables.append( track ) + out.write( " </option>\n" ) + else: + track = sub_tracks[0] + track_label = track + if "$" not in tables[track]['shortLabel']: + track_label = tables[track]['shortLabel'] + out.write( """ <option name="%s" value="%s"/>\n""" % ( track_label, track ) ) + assigned_tables.append( track ) + out.write( " </option>\n" ) + unassigned_tables = list( sorted( [ table_dir for table_dir in os.listdir( output_dir ) if table_dir not in assigned_tables and os.path.isdir( os.path.join( output_dir, table_dir ) ) ] ) ) + if unassigned_tables: + out.write( """ <option name="Uncategorized Tables" value="group-trackDbUnassigned">\n""" ) + for table_name in unassigned_tables: + out.write( """ <option name="%s" value="%s"/>\n""" % ( table_name, table_name ) ) + out.write( " </option>\n" ) + out.write( " </options>\n" ) + out.write( """</filter>\n""" ) + out.close() + +def write_database_dump_info( input_dir, output_dir, dbkey, chrom_lengths, default_bitset_size ): + #generate hash for profiled table directories + #sort directories off output root (files in output root not hashed, including the profiler_info.txt file) + #sort files in each directory and hash file contents + profiled_hash = md5() + for table_dir in sorted( [ table_dir for table_dir in os.listdir( output_dir ) if os.path.isdir( os.path.join( output_dir, table_dir ) ) ] ): + for filename in sorted( os.listdir( os.path.join( output_dir, table_dir ) ) ): + f = open( os.path.join( output_dir, table_dir, filename ), 'rb' ) + while True: + hash_chunk = f.read( CHUNK_SIZE ) + if not hash_chunk: + break + profiled_hash.update( hash_chunk ) + profiled_hash = profiled_hash.hexdigest() + + #generate hash for input dir + #sort directories off input root + #sort files in each directory and hash file contents + database_hash = md5() + for dirpath, dirnames, filenames in sorted( os.walk( input_dir ) ): + for filename in sorted( filenames ): + f = open( os.path.join( input_dir, dirpath, filename ), 'rb' ) + while True: + hash_chunk = f.read( CHUNK_SIZE ) + if not hash_chunk: + break + database_hash.update( hash_chunk ) + database_hash = database_hash.hexdigest() + + #write out info file + out = open( os.path.join( output_dir, 'profiler_info.txt' ), 'wb' ) + out.write( 'dbkey\t%s\n' % ( dbkey ) ) + out.write( 'chromosomes\t%s\n' % ( ','.join( [ '%s=%s' % ( chrom_name, chrom_len ) for chrom_name, chrom_len in chrom_lengths.iteritems() ] ) ) ) + out.write( 'bitset_size\t%s\n' % ( default_bitset_size ) ) + for line in open( os.path.join( input_dir, 'trackDb.sql' ) ): + line = line.strip() + if line.startswith( '-- Dump completed on ' ): + line = line[ len( '-- Dump completed on ' ): ] + 
out.write( 'dump_time\t%s\n' % ( line ) ) + break + out.write( 'dump_hash\t%s\n' % ( database_hash ) ) + out.write( 'profiler_time\t%s\n' % ( time.time() ) ) + out.write( 'profiler_hash\t%s\n' % ( profiled_hash ) ) + out.write( 'profiler_version\t%s\n' % ( VERSION ) ) + out.write( 'profiler_struct_format\t%s\n' % ( STRUCT_FMT ) ) + out.write( 'profiler_struct_size\t%s\n' % ( STRUCT_SIZE ) ) + out.close() + +def __main__(): + usage = "usage: %prog options" + parser = OptionParser( usage=usage ) + parser.add_option( '-d', '--dbkey', dest='dbkey', default='hg18', help='dbkey to process' ) + parser.add_option( '-i', '--input_dir', dest='input_dir', default=os.path.join( 'golden_path','%s', 'database' ), help='Input Directory' ) + parser.add_option( '-o', '--output_dir', dest='output_dir', default=os.path.join( 'profiled_annotations','%s' ), help='Output Directory' ) + parser.add_option( '-c', '--chromosomes', dest='chromosomes', default='', help='Comma separated list of: ChromName1[=length],ChromName2[=length],...' ) + parser.add_option( '-b', '--bitset_size', dest='bitset_size', default=DEFAULT_BITSET_SIZE, type='int', help='Default BitSet size; overridden by sizes specified in chromInfo.txt.gz or by --chromosomes' ) + parser.add_option( '-f', '--ftp_site', dest='ftp_site', default='hgdownload.cse.ucsc.edu', help='FTP site; used for chromosome info when chromInfo.txt.gz method fails' ) + parser.add_option( '-p', '--ftp_path', dest='ftp_path', default='/goldenPath/%s/chromosomes/', help='FTP Path; used for chromosome info when chromInfo.txt.gz method fails' ) + + ( options, args ) = parser.parse_args() + + input_dir = options.input_dir + if '%' in input_dir: + input_dir = input_dir % options.dbkey + assert os.path.exists( input_dir ), 'Input directory does not exist' + output_dir = options.output_dir + if '%' in output_dir: + output_dir = output_dir % options.dbkey + assert not os.path.exists( output_dir ), 'Output directory already exists' + os.makedirs( output_dir ) + ftp_path = options.ftp_path + if '%' in ftp_path: + ftp_path = ftp_path % options.dbkey + + #Get chromosome names and lengths + chrom_lengths = {} + if options.chromosomes: + for chrom in options.chromosomes.split( ',' ): + fields = chrom.split( '=' ) + chrom = fields[0] + if len( fields ) > 1: + chrom_len = int( fields[1] ) + else: + chrom_len = options.bitset_size + chrom_lengths[ chrom ] = chrom_len + chroms = chrom_lengths.keys() + print 'Chrom info taken from command line option.' + else: + try: + for line in gzip.open( os.path.join( input_dir, 'chromInfo.txt.gz' ) ): + fields = line.strip().split( '\t' ) + chrom_lengths[ fields[0] ] = int( fields[ 1 ] ) + chroms = chrom_lengths.keys() + print 'Chrom info taken from chromInfo.txt.gz.' + except Exception, e: + print 'Error loading chrom info from chromInfo.txt.gz, trying FTP method.' 
+ chrom_lengths = {} #zero out chrom_lengths + chroms = [] + ftp = FTP( options.ftp_site ) + ftp.login() + for name in ftp.nlst( ftp_path ): + if name.endswith( '.fa.gz' ): + chroms.append( name.split( '/' )[-1][ :-len( '.fa.gz' ) ] ) + ftp.close() + for chrom in chroms: + chrom_lengths[ chrom ] = options.bitset_size + #sort chroms by length of name, decending; necessary for when table names start with chrom name + chroms = list( reversed( [ chrom for chrom_len, chrom in sorted( [ ( len( chrom ), chrom ) for chrom in chroms ] ) ] ) ) + + #parse tables from local files + #loop through directory contents, if file ends in '.sql', process table + for filename in os.listdir( input_dir ): + if filename.endswith ( '.sql' ): + base_filename = filename[ 0:-len( '.sql' ) ] + table_out_dir = os.path.join( output_dir, base_filename ) + #some tables are chromosome specific, lets strip off the chrom name + for chrom in chroms: + if base_filename.startswith( "%s_" % chrom ): + #found chromosome + table_out_dir = os.path.join( output_dir, base_filename[len( "%s_" % chrom ):] ) + break + #create table dir + if not os.path.exists( table_out_dir ): + os.mkdir( table_out_dir ) #table dir may already exist in the case of single chrom tables + print "Created table dir (%s)." % table_out_dir + else: + print "Table dir (%s) already exists." % table_out_dir + #find column assignments + table_name, chrom_col, start_col, end_col = get_columns( "%s.sql" % os.path.join( input_dir, base_filename ) ) + if chrom_col is None or start_col is None or end_col is None: + print "Table %s (%s) does not appear to have a chromosome, a start, or a stop." % ( table_name, "%s.sql" % os.path.join( input_dir, base_filename ) ) + if not os.listdir( table_out_dir ): + print "Removing empty table (%s) directory (%s)." 
% ( table_name, table_out_dir ) + os.rmdir( table_out_dir ) + continue + #build bitsets from table + bitset_dict = {} + for line in gzip.open( '%s.txt.gz' % os.path.join( input_dir, base_filename ) ): + fields = line.strip().split( '\t' ) + chrom = fields[ chrom_col ] + start = int( fields[ start_col ] ) + end = int( fields[ end_col ] ) + if chrom not in bitset_dict: + bitset_dict[ chrom ] = BitSet( chrom_lengths.get( chrom, options.bitset_size ) ) + bitset_dict[ chrom ].set_range( start, end - start ) + #write bitsets as profiled annotations + for chrom_name, chrom_bits in bitset_dict.iteritems(): + out = open( os.path.join( table_out_dir, '%s.covered' % chrom_name ), 'wb' ) + end = 0 + total_regions = 0 + total_coverage = 0 + max_size = chrom_lengths.get( chrom_name, options.bitset_size ) + while True: + start = chrom_bits.next_set( end ) + if start >= max_size: + break + end = chrom_bits.next_clear( start ) + out.write( struct.pack( STRUCT_FMT, start ) ) + out.write( struct.pack( STRUCT_FMT, end ) ) + total_regions += 1 + total_coverage += end - start + if end >= max_size: + break + out.close() + open( os.path.join( table_out_dir, '%s.total_regions' % chrom_name ), 'wb' ).write( str( total_regions ) ) + open( os.path.join( table_out_dir, '%s.total_coverage' % chrom_name ), 'wb' ).write( str( total_coverage ) ) + + #create xml + create_grouping_xml( input_dir, output_dir, options.dbkey ) + #create database dump info file, for database version control + write_database_dump_info( input_dir, output_dir, options.dbkey, chrom_lengths, options.bitset_size ) + +if __name__ == "__main__": __main__() diff -r c37de7a983e7 -r 742fa2afcad9 tool-data/annotation_profiler_options.xml.sample --- a/tool-data/annotation_profiler_options.xml.sample Thu Apr 22 21:11:17 2010 -0400 +++ b/tool-data/annotation_profiler_options.xml.sample Fri Apr 23 11:14:26 2010 -0400 @@ -1,4 +1,4 @@ -<filter type="meta_key" name="dbkey" value="hg18"> +<filter type="data_meta" data_ref="input1" meta_key="dbkey" value="hg18"> <options> <option name="Mapping and Sequencing Tracks" value="group-map"> <option name="STS Markers" value="stsMap"/> diff -r c37de7a983e7 -r 742fa2afcad9 tools/annotation_profiler/annotation_profiler.xml --- a/tools/annotation_profiler/annotation_profiler.xml Thu Apr 22 21:11:17 2010 -0400 +++ b/tools/annotation_profiler/annotation_profiler.xml Fri Apr 23 11:14:26 2010 -0400 @@ -1,6 +1,6 @@ <tool id="Annotation_Profiler_0" name="Profile Annotations" Version="1.0.0"> <description>for a set of genomic intervals</description> - <command interpreter="python">annotation_profiler_for_interval.py -i $input1 -c ${input1.metadata.chromCol} -s ${input1.metadata.startCol} -e ${input1.metadata.endCol} -o $out_file1 $keep_empty -p ${GALAXY_DATA_INDEX_DIR}/annotation_profiler/$dbkey $summary -l ${chromInfo} -b 3 -t $table_names</command> + <command interpreter="python">annotation_profiler_for_interval.py -i $input1 -c ${input1.metadata.chromCol} -s ${input1.metadata.startCol} -e ${input1.metadata.endCol} -o $out_file1 $keep_empty -p ${GALAXY_DATA_INDEX_DIR}/annotation_profiler/$dbkey $summary -b 3 -t $table_names</command> <inputs> <param format="interval" name="input1" type="data" label="Choose Intervals"> <validator type="dataset_metadata_in_file" filename="annotation_profiler_valid_builds.txt" metadata_name="dbkey" metadata_column="0" message="Profiling is not currently available for this species."/> @@ -41,7 +41,7 @@ <help> **What it does** -Takes an input set of intervals and for each interval determines the base 
coverage of the interval by a set of features (tables) available from UCSC. +Takes an input set of intervals and for each interval determines the base coverage of the interval by a set of features (tables) available from UCSC. Genomic regions from the input feature data have been merged by overlap / direct adjacency (e.g. a table having ranges of: 1-10, 6-12, 12-20 and 25-28 results in two merged ranges of: 1-20 and 25-28). By default, this tool will check the coverage of your intervals against all available features; you may, however, choose to select only those tables that you want to include. Selecting a section heading will effectively cause all of its children to be selected. diff -r c37de7a983e7 -r 742fa2afcad9 tools/annotation_profiler/annotation_profiler_for_interval.py --- a/tools/annotation_profiler/annotation_profiler_for_interval.py Thu Apr 22 21:11:17 2010 -0400 +++ b/tools/annotation_profiler/annotation_profiler_for_interval.py Fri Apr 23 11:14:26 2010 -0400 @@ -18,12 +18,13 @@ assert sys.version_info[:2] >= ( 2, 4 ) class CachedRangesInFile: - fmt = '<I' - fmt_size = struct.calcsize( fmt ) - def __init__( self, filename ): + DEFAULT_STRUCT_FORMAT = '<I' + def __init__( self, filename, profiler_info ): self.file_size = os.stat( filename ).st_size self.file = open( filename, 'rb' ) self.filename = filename + self.fmt = profiler_info.get( 'profiler_struct_format', self.DEFAULT_STRUCT_FORMAT ) + self.fmt_size = int( profiler_info.get( 'profiler_struct_size', struct.calcsize( self.fmt ) ) ) self.length = int( self.file_size / self.fmt_size / 2 ) self._cached_ranges = [ None for i in xrange( self.length ) ] def __getitem__( self, i ): @@ -43,9 +44,9 @@ return self.length class RegionCoverage: - def __init__( self, filename_base ): + def __init__( self, filename_base, profiler_info ): try: - self._coverage = CachedRangesInFile( "%s.covered" % filename_base ) + self._coverage = CachedRangesInFile( "%s.covered" % filename_base, profiler_info ) except Exception, e: #print "Error loading coverage file %s: %s" % ( "%s.covered" % filename_base, e ) self._coverage = [] @@ -89,12 +90,14 @@ return coverage, region_count, start_index class CachedCoverageReader: - def __init__( self, base_file_path, buffer = 10, table_names = None ): + def __init__( self, base_file_path, buffer = 10, table_names = None, profiler_info = None ): self._base_file_path = base_file_path self._buffer = buffer #number of chromosomes to keep in memory at a time self._coverage = {} - if table_names is None: table_names = os.listdir( self._base_file_path ) + if table_names is None: table_names = [ table_dir for table_dir in os.listdir( self._base_file_path ) if os.path.isdir( os.path.join( self._base_file_path, table_dir ) ) ] for tablename in table_names: self._coverage[tablename] = {} + if profiler_info is None: profiler_info = {} + self._profiler_info = profiler_info def iter_table_coverage_by_region( self, chrom, start, end ): for tablename, coverage, regions in self.iter_table_coverage_regions_by_region( chrom, start, end ): yield tablename, coverage @@ -107,7 +110,7 @@ if len( chromosomes ) >= self._buffer: #randomly remove one chromosome from this table del chromosomes[ chromosomes.keys().pop( random.randint( 0, self._buffer - 1 ) ) ] - chromosomes[chrom] = RegionCoverage( os.path.join ( self._base_file_path, tablename, chrom ) ) + chromosomes[chrom] = RegionCoverage( os.path.join ( self._base_file_path, tablename, chrom ), self._profiler_info ) coverage, regions, index = 
chromosomes[chrom].get_coverage_regions_index_overlap( start, end ) yield tablename, coverage, regions, index @@ -240,19 +243,35 @@ print "%s has max length of %s, exceeded by %s%s." % ( chrom, chrom_lengths.get( chrom ), ", ".join( map( str, regions[:3] ) ), extra_region_info ) class ChromosomeLengths: - def __init__( self, filename ): + def __init__( self, profiler_info ): self.chroms = {} - try: - for line in open( filename ): - try: - fields = line.strip().split( "\t" ) - self.chroms[fields[0]] = int( fields[1] ) - except: - continue - except: - pass + self.default_bitset_size = int( profiler_info.get( 'bitset_size', bx.bitset.MAX ) ) + chroms = profiler_info.get( 'chromosomes', None ) + if chroms: + for chrom in chroms.split( ',' ): + for fields in chrom.rsplit( '=', 1 ): + if len( fields ) == 2: + self.chroms[ fields[0] ] = int( fields[1] ) + else: + self.chroms[ fields[0] ] = self.default_bitset_size def get( self, name ): - return self.chroms.get( name, bx.bitset.MAX ) + return self.chroms.get( name, self.default_bitset_size ) + +def parse_profiler_info( filename ): + profiler_info = {} + try: + for line in open( filename ): + fields = line.rstrip( '\n\r' ).split( '\t', 1 ) + if len( fields ) == 2: + if fields[0] in profiler_info: + if not isinstance( profiler_info[ fields[0] ], list ): + profiler_info[ fields[0] ] = [ profiler_info[ fields[0] ] ] + profiler_info[ fields[0] ].append( fields[1] ) + else: + profiler_info[ fields[0] ] = fields[1] + except: + pass #likely missing file + return profiler_info def __main__(): parser = optparse.OptionParser() @@ -294,16 +313,10 @@ help='Path to profiled data for this organism' ) parser.add_option( - '-l','--lengths', - dest='lengths', - type='str',default='test-data/shared/ucsc/hg18.len', - help='Path to chromosome lengths data for this organism' - ) - parser.add_option( '-t','--table_names', dest='table_names', type='str',default='None', - help='Path to profiled data for this organism' + help='Table names requested' ) parser.add_option( '-i','--input', @@ -327,14 +340,19 @@ options, args = parser.parse_args() + #get profiler_info + profiler_info = parse_profiler_info( os.path.join( options.path, 'profiler_info.txt' ) ) + table_names = options.table_names.split( "," ) if table_names == ['None']: table_names = None - coverage_reader = CachedCoverageReader( options.path, buffer = options.buffer, table_names = table_names ) + coverage_reader = CachedCoverageReader( options.path, buffer = options.buffer, table_names = table_names, profiler_info = profiler_info ) if options.summary: - profile_summary( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader, ChromosomeLengths( options.lengths ) ) + profile_summary( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader, ChromosomeLengths( profiler_info ) ) else: profile_per_interval( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader ) + #print out data version info + print 'Data version (%s:%s:%s)' % ( profiler_info.get( 'dbkey', 'unknown' ), profiler_info.get( 'profiler_hash', 'unknown' ), profiler_info.get( 'dump_time', 'unknown' ) ) if __name__ == "__main__": __main__()
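
For readers who want to inspect the generated indexes directly, here is a minimal sketch (not part of the changeset) of reading one CHROMOSOME_NAME.covered file as described in README.txt above. The hg19/knownGene/chr1 path is hypothetical; the '<I' pair encoding mirrors STRUCT_FMT in build_profile_indexes.py.

    import struct

    STRUCT_FMT = '<I'                            # little-endian unsigned int, as in build_profile_indexes.py
    STRUCT_SIZE = struct.calcsize( STRUCT_FMT )

    def read_covered( path ):
        # The .covered file is a flat sequence of merged (start, end) pairs, each value packed with '<I'.
        regions = []
        f = open( path, 'rb' )
        while True:
            pair = f.read( 2 * STRUCT_SIZE )
            if not pair:
                break
            start = struct.unpack( STRUCT_FMT, pair[ :STRUCT_SIZE ] )[0]
            end = struct.unpack( STRUCT_FMT, pair[ STRUCT_SIZE: ] )[0]
            regions.append( ( start, end ) )
        f.close()
        return regions

    # Hypothetical location under the layout described in README.txt:
    for start, end in read_covered( '/profiled_annotations/hg19/knownGene/chr1.covered' ):
        print '%s\t%s' % ( start, end )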
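
The basic.py hunk at the top of this changeset replaces the single value/options entry per meta_key with a nested mapping keyed by filter value, so several data_meta filters can target the same data_ref and meta_key. A rough sketch of the resulting lookup follows; the dictionary contents are made up, and the real structure holds parsed <option> elements rather than plain strings.

    # filtered[ data_ref ][ meta_key ][ value ] -> options shown when the referenced
    # dataset's stringified metadata value equals 'value' (made-up example data).
    filtered = { 'input1': { 'dbkey': { 'hg18': [ 'stsMap' ],
                                        'hg19': [ 'knownGene' ] } } }

    def get_options_for( data_ref, meta_key, check_meta_val ):
        # Mirrors the new get_options() logic: an exact key match contributes its
        # option list; any other metadata value contributes nothing.
        meta_dict = filtered.get( data_ref, {} ).get( meta_key, {} )
        options = []
        if check_meta_val in meta_dict:
            options.extend( meta_dict[ check_meta_val ] )
        return options

    print get_options_for( 'input1', 'dbkey', 'hg19' )   # ['knownGene']
    print get_options_for( 'input1', 'dbkey', 'mm9' )    # []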