details: http://www.bx.psu.edu/hg/galaxy/rev/742fa2afcad9 changeset: 3683:742fa2afcad9 user: Dan Blankenberg <dan@bx.psu.edu> date: Fri Apr 23 11:14:26 2010 -0400 description: Updates for 'Profile Annotations for a set of genomic intervals' tool. This tool will now report a 'data version'. Add a script that creates the indexes and table description xml from a UCSC database dump. diffstat: lib/galaxy/tools/parameters/basic.py | 12 +- scripts/tools/annotation_profiler/README.txt | 54 + scripts/tools/annotation_profiler/build_profile_indexes.py | 338 ++++++++++ tool-data/annotation_profiler_options.xml.sample | 2 +- tools/annotation_profiler/annotation_profiler.xml | 4 +- tools/annotation_profiler/annotation_profiler_for_interval.py | 74 +- 6 files changed, 449 insertions(+), 35 deletions(-) diffs (603 lines): diff -r c37de7a983e7 -r 742fa2afcad9 lib/galaxy/tools/parameters/basic.py --- a/lib/galaxy/tools/parameters/basic.py Thu Apr 22 21:11:17 2010 -0400 +++ b/lib/galaxy/tools/parameters/basic.py Fri Apr 23 11:14:26 2010 -0400 @@ -960,8 +960,11 @@ if filter.get( 'type' ) == 'data_meta': if filter.get( 'data_ref' ) not in self.filtered: self.filtered[filter.get( 'data_ref' )] = {} - self.filtered[filter.get( 'data_ref' )][filter.get( 'meta_key' )] = { 'value': filter.get( 'value' ), 'options':[] } - recurse_option_elems( self.filtered[filter.get( 'data_ref' )][filter.get( 'meta_key' )]['options'], filter.find( 'options' ).findall( 'option' ) ) + if filter.get( 'meta_key' ) not in self.filtered[filter.get( 'data_ref' )]: + self.filtered[filter.get( 'data_ref' )][filter.get( 'meta_key' )] = {} + if filter.get( 'value' ) not in self.filtered[filter.get( 'data_ref' )][filter.get( 'meta_key' )]: + self.filtered[filter.get( 'data_ref' )][filter.get( 'meta_key' )][filter.get( 'value' )] = [] + recurse_option_elems( self.filtered[filter.get( 'data_ref' )][filter.get( 'meta_key' )][filter.get( 'value' )], filter.find( 'options' ).findall( 'option' ) ) else: recurse_option_elems( self.options, elem.find( 'options' ).findall( 'option' ) ) @@ -974,8 +977,9 @@ dataset = dataset.dataset if dataset: for meta_key, meta_dict in filter_value.iteritems(): - if dataset.metadata.spec[meta_key].param.to_string( dataset.metadata.get( meta_key ) ) == meta_dict['value']: - options.extend( meta_dict['options'] ) + check_meta_val = dataset.metadata.spec[meta_key].param.to_string( dataset.metadata.get( meta_key ) ) + if check_meta_val in meta_dict: + options.extend( meta_dict[check_meta_val] ) return options return self.options diff -r c37de7a983e7 -r 742fa2afcad9 scripts/tools/annotation_profiler/README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/tools/annotation_profiler/README.txt Fri Apr 23 11:14:26 2010 -0400 @@ -0,0 +1,54 @@ +This file explains how to create annotation indexes for the annotation profiler tool. Annotation profiler indexes are an exceedingly simple binary format, +containing no header information and consisting of an ordered linear list of (start,stop encoded individually as '<I') regions which are covered by a UCSC table partitioned +by chromosome name. Genomic regions are merged by overlap / direct adjacency (e.g. a table having ranges of: 1-10, 6-12, 12-20 and 25-28 results in two merged ranges of: 1-20 and 25-28). + +Files are arranged like: +/profiled_annotations/DBKEY/TABLE_NAME/ + CHROMOSOME_NAME.covered + CHROMOSOME_NAME.total_coverage + CHROMOSOME_NAME.total_regions +/profiled_annotations/DBKEY/ + DBKEY_tables.xml + chromosomes.txt + profiled_info.txt + + +where CHROMOSOME_NAME.covered is the binary file, CHROMOSOME_NAME.total_coverage is a text file containing the integer count of bases covered by the +table and CHROMOSOME_NAME.total_regions contains the integer count of the number of regions found in CHROMOSOME_NAME.covered + +DBKEY_tables.xml should be appended to the annotation profile available table configuration file (tool-data/annotation_profiler_options.xml). +The DBKEY should also be added as a new line to the annotation profiler valid builds file (annotation_profiler_valid_builds.txt). +The output (/profiled_annotations/DBKEY) should be made available as GALAXY_ROOT/tool-data/annotation_profiler/DBKEY. + +profiled_info.txt contains info on the generated annotations, separated by lines with tab-delimited label,value pairs: + profiler_version - the version of the build_profile_indexes.py script that was used to generate the profiled data + dbkey - the dbkey used for the run + chromosomes - contains the names and lengths of chromosomes that were used to parse single-chromosome tables (tables divided into individual files by chromosome) + dump_time - the declared dump time of the database, taken from trackDb.txt.gz + profiled_time - seconds since epoch in utc for when the database dump was profiled + database_hash - a md5 hex digest of all the profiled table info + + +Typical usage includes: + +python build_profile_indexes.py -d hg19 -i /ucsc_data/hg19/database/ > hg19.txt + +where the genome build is hg19 and /ucsc_data/hg19/database/ contains the downloaded database dump from UCSC (e.g. obtained by rsync: rsync -avzP rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/ /ucsc_data/hg19/database/). + + + +By default, chromosome names come from a file named 'chromInfo.txt.gz' found in the input directory, with FTP used as a backup. +When FTP is used to obtain the names of chromosomes from UCSC for a particular genome build, alternate ftp sites and paths can be specified by using the --ftp_site and --ftp_path attributes. +Chromosome names can instead be provided on the commandline via the --chromosomes option, which accepts a comma separated list of:ChromName1[=length],ChromName2[=length],... + + + + usage = "usage: %prog options" + parser = OptionParser( usage=usage ) + parser.add_option( '-d', '--dbkey', dest='dbkey', default='hg18', help='dbkey to process' ) + parser.add_option( '-i', '--input_dir', dest='input_dir', default=os.path.join( 'golden_path','%s', 'database' ), help='Input Directory' ) + parser.add_option( '-o', '--output_dir', dest='output_dir', default=os.path.join( 'profiled_annotations','%s' ), help='Output Directory' ) + parser.add_option( '-c', '--chromosomes', dest='chromosomes', default='', help='Comma separated list of: ChromName1[=length],ChromName2[=length],...' ) + parser.add_option( '-b', '--bitset_size', dest='bitset_size', default=DEFAULT_BITSET_SIZE, type='int', help='Default BitSet size; overridden by sizes specified in chromInfo.txt.gz or by --chromosomes' ) + parser.add_option( '-f', '--ftp_site', dest='ftp_site', default='hgdownload.cse.ucsc.edu', help='FTP site; used for chromosome info when chromInfo.txt.gz method fails' ) + parser.add_option( '-p', '--ftp_path', dest='ftp_path', default='/goldenPath/%s/chromosomes/', help='FTP Path; used for chromosome info when chromInfo.txt.gz method fails' ) diff -r c37de7a983e7 -r 742fa2afcad9 scripts/tools/annotation_profiler/build_profile_indexes.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/tools/annotation_profiler/build_profile_indexes.py Fri Apr 23 11:14:26 2010 -0400 @@ -0,0 +1,338 @@ +#!/usr/bin/env python +#Dan Blankenberg + +VERSION = '1.0.0' # version of this script + +from optparse import OptionParser +import os, gzip, struct, time +from ftplib import FTP #do we want a diff method than using FTP to determine Chrom Names, eg use local copy + +#import md5 from hashlib; if python2.4 or less, use old md5 +try: + from hashlib import md5 +except ImportError: + from md5 import new as md5 + +#import BitSet from bx-python, try using eggs and package resources, fall back to any local installation +try: + from galaxy import eggs + import pkg_resources + pkg_resources.require( "bx-python" ) +except: pass #Maybe there is a local installation available +from bx.bitset import BitSet + +#Define constants +STRUCT_FMT = '<I' +STRUCT_SIZE = struct.calcsize( STRUCT_FMT ) +DEFAULT_BITSET_SIZE = 300000000 +CHUNK_SIZE = 1024 + +#Headers used to parse .sql files to determine column indexes for chromosome name, start and end +alias_spec = { + 'chromCol' : [ 'chrom' , 'CHROMOSOME' , 'CHROM', 'Chromosome Name', 'tName' ], + 'startCol' : [ 'start' , 'START', 'chromStart', 'txStart', 'Start Position (bp)', 'tStart', 'genoStart' ], + 'endCol' : [ 'end' , 'END' , 'STOP', 'chromEnd', 'txEnd', 'End Position (bp)', 'tEnd', 'genoEnd' ], +} + +#Headers used to parse trackDb.txt.gz +#TODO: these should be parsed directly from trackDb.sql +trackDb_headers = ["tableName", "shortLabel", "type", "longLabel", "visibility", "priority", "colorR", "colorG", "colorB", "altColorR", "altColorG", "altColorB", "useScore", "private", "restrictCount", "restrictList", "url", "html", "grp", "canPack", "settings"] + +def get_columns( filename ): + input_sql = open( filename ).read() + input_sql = input_sql.split( 'CREATE TABLE ' )[1].split( ';' )[0] + input_sql = input_sql.split( ' (', 1 ) + table_name = input_sql[0].strip().strip( '`' ) + input_sql = [ split.strip().split( ' ' )[0].strip().strip( '`' ) for split in input_sql[1].rsplit( ')', 1 )[0].strip().split( '\n' ) ] + print input_sql + chrom_col = None + start_col = None + end_col = None + for col_name in alias_spec['chromCol']: + for i, header_name in enumerate( input_sql ): + if col_name == header_name: + chrom_col = i + break + if chrom_col is not None: + break + + for col_name in alias_spec['startCol']: + for i, header_name in enumerate( input_sql ): + if col_name == header_name: + start_col = i + break + if start_col is not None: + break + + for col_name in alias_spec['endCol']: + for i, header_name in enumerate( input_sql ): + if col_name == header_name: + end_col = i + break + if end_col is not None: + break + + return table_name, chrom_col, start_col, end_col + + +def create_grouping_xml( input_dir, output_dir, dbkey ): + output_filename = os.path.join( output_dir, '%s_tables.xml' % dbkey ) + def load_groups( file_name = 'grp.txt.gz' ): + groups = {} + for line in gzip.open( os.path.join( input_dir, file_name ) ): + fields = line.split( '\t' ) + groups[fields[0]] = { 'desc': fields[1], 'priority': fields[2] } + return groups + f = gzip.open( os.path.join( input_dir, 'trackDb.txt.gz' ) ) + out = open( output_filename, 'wb' ) + tables = {} + cur_buf = '' + while True: + line = f.readline() + if not line: break + #remove new lines + line = line.rstrip( '\n\r' ) + line = line.replace( '\\\t', ' ' ) #replace escaped tabs with space + cur_buf += "%s\n" % line.rstrip( '\\' ) + if line.endswith( '\\' ): + continue #line is wrapped, next line + #all fields should be loaded now... + fields = cur_buf.split( '\t' ) + cur_buf = '' #reset buffer + assert len( fields ) == len( trackDb_headers ), 'Failed Parsing trackDb.txt.gz; fields: %s' % fields + table_name = fields[ 0 ] + tables[ table_name ] = {} + for field_name, field_value in zip( trackDb_headers, fields ): + tables[ table_name ][ field_name ] = field_value + #split settings fields into dict + fields = fields[-1].split( '\n' ) + tables[ table_name ][ 'settings' ] = {} + for field in fields: + setting_fields = field.split( ' ', 1 ) + setting_name = setting_value = setting_fields[ 0 ] + if len( setting_fields ) > 1: + setting_value = setting_fields[ 1 ] + if setting_name or setting_value: + tables[ table_name ][ 'settings' ][ setting_name ] = setting_value + #Load Groups + groups = load_groups() + in_groups = {} + for table_name, values in tables.iteritems(): + if os.path.exists( os.path.join( output_dir, table_name ) ): + group = values['grp'] + if group not in in_groups: + in_groups[group]={} + #***NAME CHANGE***, 'subTrack' no longer exists as a setting...use 'parent' instead + #subTrack = values.get('settings', {} ).get( 'subTrack', table_name ) + subTrack = values.get('settings', {} ).get( 'parent', table_name ).split( ' ' )[0] #need to split, because could be e.g. 'trackgroup on' + if subTrack not in in_groups[group]: + in_groups[group][subTrack]=[] + in_groups[group][subTrack].append( table_name ) + + assigned_tables = [] + out.write( """<filter type="data_meta" data_ref="input1" meta_key="dbkey" value="%s">\n""" % ( dbkey ) ) + out.write( " <options>\n" ) + for group, subTracks in sorted( in_groups.iteritems() ): + out.write( """ <option name="%s" value="group-%s">\n""" % ( groups[group]['desc'], group ) ) + for sub_name, sub_tracks in subTracks.iteritems(): + if len( sub_tracks ) > 1: + out.write( """ <option name="%s" value="subtracks-%s">\n""" % ( sub_name, sub_name ) ) + sub_tracks.sort() + for track in sub_tracks: + track_label = track + if "$" not in tables[track]['shortLabel']: + track_label = tables[track]['shortLabel'] + out.write( """ <option name="%s" value="%s"/>\n""" % ( track_label, track ) ) + assigned_tables.append( track ) + out.write( " </option>\n" ) + else: + track = sub_tracks[0] + track_label = track + if "$" not in tables[track]['shortLabel']: + track_label = tables[track]['shortLabel'] + out.write( """ <option name="%s" value="%s"/>\n""" % ( track_label, track ) ) + assigned_tables.append( track ) + out.write( " </option>\n" ) + unassigned_tables = list( sorted( [ table_dir for table_dir in os.listdir( output_dir ) if table_dir not in assigned_tables and os.path.isdir( os.path.join( output_dir, table_dir ) ) ] ) ) + if unassigned_tables: + out.write( """ <option name="Uncategorized Tables" value="group-trackDbUnassigned">\n""" ) + for table_name in unassigned_tables: + out.write( """ <option name="%s" value="%s"/>\n""" % ( table_name, table_name ) ) + out.write( " </option>\n" ) + out.write( " </options>\n" ) + out.write( """</filter>\n""" ) + out.close() + +def write_database_dump_info( input_dir, output_dir, dbkey, chrom_lengths, default_bitset_size ): + #generate hash for profiled table directories + #sort directories off output root (files in output root not hashed, including the profiler_info.txt file) + #sort files in each directory and hash file contents + profiled_hash = md5() + for table_dir in sorted( [ table_dir for table_dir in os.listdir( output_dir ) if os.path.isdir( os.path.join( output_dir, table_dir ) ) ] ): + for filename in sorted( os.listdir( os.path.join( output_dir, table_dir ) ) ): + f = open( os.path.join( output_dir, table_dir, filename ), 'rb' ) + while True: + hash_chunk = f.read( CHUNK_SIZE ) + if not hash_chunk: + break + profiled_hash.update( hash_chunk ) + profiled_hash = profiled_hash.hexdigest() + + #generate hash for input dir + #sort directories off input root + #sort files in each directory and hash file contents + database_hash = md5() + for dirpath, dirnames, filenames in sorted( os.walk( input_dir ) ): + for filename in sorted( filenames ): + f = open( os.path.join( input_dir, dirpath, filename ), 'rb' ) + while True: + hash_chunk = f.read( CHUNK_SIZE ) + if not hash_chunk: + break + database_hash.update( hash_chunk ) + database_hash = database_hash.hexdigest() + + #write out info file + out = open( os.path.join( output_dir, 'profiler_info.txt' ), 'wb' ) + out.write( 'dbkey\t%s\n' % ( dbkey ) ) + out.write( 'chromosomes\t%s\n' % ( ','.join( [ '%s=%s' % ( chrom_name, chrom_len ) for chrom_name, chrom_len in chrom_lengths.iteritems() ] ) ) ) + out.write( 'bitset_size\t%s\n' % ( default_bitset_size ) ) + for line in open( os.path.join( input_dir, 'trackDb.sql' ) ): + line = line.strip() + if line.startswith( '-- Dump completed on ' ): + line = line[ len( '-- Dump completed on ' ): ] + out.write( 'dump_time\t%s\n' % ( line ) ) + break + out.write( 'dump_hash\t%s\n' % ( database_hash ) ) + out.write( 'profiler_time\t%s\n' % ( time.time() ) ) + out.write( 'profiler_hash\t%s\n' % ( profiled_hash ) ) + out.write( 'profiler_version\t%s\n' % ( VERSION ) ) + out.write( 'profiler_struct_format\t%s\n' % ( STRUCT_FMT ) ) + out.write( 'profiler_struct_size\t%s\n' % ( STRUCT_SIZE ) ) + out.close() + +def __main__(): + usage = "usage: %prog options" + parser = OptionParser( usage=usage ) + parser.add_option( '-d', '--dbkey', dest='dbkey', default='hg18', help='dbkey to process' ) + parser.add_option( '-i', '--input_dir', dest='input_dir', default=os.path.join( 'golden_path','%s', 'database' ), help='Input Directory' ) + parser.add_option( '-o', '--output_dir', dest='output_dir', default=os.path.join( 'profiled_annotations','%s' ), help='Output Directory' ) + parser.add_option( '-c', '--chromosomes', dest='chromosomes', default='', help='Comma separated list of: ChromName1[=length],ChromName2[=length],...' ) + parser.add_option( '-b', '--bitset_size', dest='bitset_size', default=DEFAULT_BITSET_SIZE, type='int', help='Default BitSet size; overridden by sizes specified in chromInfo.txt.gz or by --chromosomes' ) + parser.add_option( '-f', '--ftp_site', dest='ftp_site', default='hgdownload.cse.ucsc.edu', help='FTP site; used for chromosome info when chromInfo.txt.gz method fails' ) + parser.add_option( '-p', '--ftp_path', dest='ftp_path', default='/goldenPath/%s/chromosomes/', help='FTP Path; used for chromosome info when chromInfo.txt.gz method fails' ) + + ( options, args ) = parser.parse_args() + + input_dir = options.input_dir + if '%' in input_dir: + input_dir = input_dir % options.dbkey + assert os.path.exists( input_dir ), 'Input directory does not exist' + output_dir = options.output_dir + if '%' in output_dir: + output_dir = output_dir % options.dbkey + assert not os.path.exists( output_dir ), 'Output directory already exists' + os.makedirs( output_dir ) + ftp_path = options.ftp_path + if '%' in ftp_path: + ftp_path = ftp_path % options.dbkey + + #Get chromosome names and lengths + chrom_lengths = {} + if options.chromosomes: + for chrom in options.chromosomes.split( ',' ): + fields = chrom.split( '=' ) + chrom = fields[0] + if len( fields ) > 1: + chrom_len = int( fields[1] ) + else: + chrom_len = options.bitset_size + chrom_lengths[ chrom ] = chrom_len + chroms = chrom_lengths.keys() + print 'Chrom info taken from command line option.' + else: + try: + for line in gzip.open( os.path.join( input_dir, 'chromInfo.txt.gz' ) ): + fields = line.strip().split( '\t' ) + chrom_lengths[ fields[0] ] = int( fields[ 1 ] ) + chroms = chrom_lengths.keys() + print 'Chrom info taken from chromInfo.txt.gz.' + except Exception, e: + print 'Error loading chrom info from chromInfo.txt.gz, trying FTP method.' + chrom_lengths = {} #zero out chrom_lengths + chroms = [] + ftp = FTP( options.ftp_site ) + ftp.login() + for name in ftp.nlst( ftp_path ): + if name.endswith( '.fa.gz' ): + chroms.append( name.split( '/' )[-1][ :-len( '.fa.gz' ) ] ) + ftp.close() + for chrom in chroms: + chrom_lengths[ chrom ] = options.bitset_size + #sort chroms by length of name, decending; necessary for when table names start with chrom name + chroms = list( reversed( [ chrom for chrom_len, chrom in sorted( [ ( len( chrom ), chrom ) for chrom in chroms ] ) ] ) ) + + #parse tables from local files + #loop through directory contents, if file ends in '.sql', process table + for filename in os.listdir( input_dir ): + if filename.endswith ( '.sql' ): + base_filename = filename[ 0:-len( '.sql' ) ] + table_out_dir = os.path.join( output_dir, base_filename ) + #some tables are chromosome specific, lets strip off the chrom name + for chrom in chroms: + if base_filename.startswith( "%s_" % chrom ): + #found chromosome + table_out_dir = os.path.join( output_dir, base_filename[len( "%s_" % chrom ):] ) + break + #create table dir + if not os.path.exists( table_out_dir ): + os.mkdir( table_out_dir ) #table dir may already exist in the case of single chrom tables + print "Created table dir (%s)." % table_out_dir + else: + print "Table dir (%s) already exists." % table_out_dir + #find column assignments + table_name, chrom_col, start_col, end_col = get_columns( "%s.sql" % os.path.join( input_dir, base_filename ) ) + if chrom_col is None or start_col is None or end_col is None: + print "Table %s (%s) does not appear to have a chromosome, a start, or a stop." % ( table_name, "%s.sql" % os.path.join( input_dir, base_filename ) ) + if not os.listdir( table_out_dir ): + print "Removing empty table (%s) directory (%s)." % ( table_name, table_out_dir ) + os.rmdir( table_out_dir ) + continue + #build bitsets from table + bitset_dict = {} + for line in gzip.open( '%s.txt.gz' % os.path.join( input_dir, base_filename ) ): + fields = line.strip().split( '\t' ) + chrom = fields[ chrom_col ] + start = int( fields[ start_col ] ) + end = int( fields[ end_col ] ) + if chrom not in bitset_dict: + bitset_dict[ chrom ] = BitSet( chrom_lengths.get( chrom, options.bitset_size ) ) + bitset_dict[ chrom ].set_range( start, end - start ) + #write bitsets as profiled annotations + for chrom_name, chrom_bits in bitset_dict.iteritems(): + out = open( os.path.join( table_out_dir, '%s.covered' % chrom_name ), 'wb' ) + end = 0 + total_regions = 0 + total_coverage = 0 + max_size = chrom_lengths.get( chrom_name, options.bitset_size ) + while True: + start = chrom_bits.next_set( end ) + if start >= max_size: + break + end = chrom_bits.next_clear( start ) + out.write( struct.pack( STRUCT_FMT, start ) ) + out.write( struct.pack( STRUCT_FMT, end ) ) + total_regions += 1 + total_coverage += end - start + if end >= max_size: + break + out.close() + open( os.path.join( table_out_dir, '%s.total_regions' % chrom_name ), 'wb' ).write( str( total_regions ) ) + open( os.path.join( table_out_dir, '%s.total_coverage' % chrom_name ), 'wb' ).write( str( total_coverage ) ) + + #create xml + create_grouping_xml( input_dir, output_dir, options.dbkey ) + #create database dump info file, for database version control + write_database_dump_info( input_dir, output_dir, options.dbkey, chrom_lengths, options.bitset_size ) + +if __name__ == "__main__": __main__() diff -r c37de7a983e7 -r 742fa2afcad9 tool-data/annotation_profiler_options.xml.sample --- a/tool-data/annotation_profiler_options.xml.sample Thu Apr 22 21:11:17 2010 -0400 +++ b/tool-data/annotation_profiler_options.xml.sample Fri Apr 23 11:14:26 2010 -0400 @@ -1,4 +1,4 @@ -<filter type="meta_key" name="dbkey" value="hg18"> +<filter type="data_meta" data_ref="input1" meta_key="dbkey" value="hg18"> <options> <option name="Mapping and Sequencing Tracks" value="group-map"> <option name="STS Markers" value="stsMap"/> diff -r c37de7a983e7 -r 742fa2afcad9 tools/annotation_profiler/annotation_profiler.xml --- a/tools/annotation_profiler/annotation_profiler.xml Thu Apr 22 21:11:17 2010 -0400 +++ b/tools/annotation_profiler/annotation_profiler.xml Fri Apr 23 11:14:26 2010 -0400 @@ -1,6 +1,6 @@ <tool id="Annotation_Profiler_0" name="Profile Annotations" Version="1.0.0"> <description>for a set of genomic intervals</description> - <command interpreter="python">annotation_profiler_for_interval.py -i $input1 -c ${input1.metadata.chromCol} -s ${input1.metadata.startCol} -e ${input1.metadata.endCol} -o $out_file1 $keep_empty -p ${GALAXY_DATA_INDEX_DIR}/annotation_profiler/$dbkey $summary -l ${chromInfo} -b 3 -t $table_names</command> + <command interpreter="python">annotation_profiler_for_interval.py -i $input1 -c ${input1.metadata.chromCol} -s ${input1.metadata.startCol} -e ${input1.metadata.endCol} -o $out_file1 $keep_empty -p ${GALAXY_DATA_INDEX_DIR}/annotation_profiler/$dbkey $summary -b 3 -t $table_names</command> <inputs> <param format="interval" name="input1" type="data" label="Choose Intervals"> <validator type="dataset_metadata_in_file" filename="annotation_profiler_valid_builds.txt" metadata_name="dbkey" metadata_column="0" message="Profiling is not currently available for this species."/> @@ -41,7 +41,7 @@ <help> **What it does** -Takes an input set of intervals and for each interval determines the base coverage of the interval by a set of features (tables) available from UCSC. +Takes an input set of intervals and for each interval determines the base coverage of the interval by a set of features (tables) available from UCSC. Genomic regions from the input feature data have been merged by overlap / direct adjacency (e.g. a table having ranges of: 1-10, 6-12, 12-20 and 25-28 results in two merged ranges of: 1-20 and 25-28). By default, this tool will check the coverage of your intervals against all available features; you may, however, choose to select only those tables that you want to include. Selecting a section heading will effectively cause all of its children to be selected. diff -r c37de7a983e7 -r 742fa2afcad9 tools/annotation_profiler/annotation_profiler_for_interval.py --- a/tools/annotation_profiler/annotation_profiler_for_interval.py Thu Apr 22 21:11:17 2010 -0400 +++ b/tools/annotation_profiler/annotation_profiler_for_interval.py Fri Apr 23 11:14:26 2010 -0400 @@ -18,12 +18,13 @@ assert sys.version_info[:2] >= ( 2, 4 ) class CachedRangesInFile: - fmt = '<I' - fmt_size = struct.calcsize( fmt ) - def __init__( self, filename ): + DEFAULT_STRUCT_FORMAT = '<I' + def __init__( self, filename, profiler_info ): self.file_size = os.stat( filename ).st_size self.file = open( filename, 'rb' ) self.filename = filename + self.fmt = profiler_info.get( 'profiler_struct_format', self.DEFAULT_STRUCT_FORMAT ) + self.fmt_size = int( profiler_info.get( 'profiler_struct_size', struct.calcsize( self.fmt ) ) ) self.length = int( self.file_size / self.fmt_size / 2 ) self._cached_ranges = [ None for i in xrange( self.length ) ] def __getitem__( self, i ): @@ -43,9 +44,9 @@ return self.length class RegionCoverage: - def __init__( self, filename_base ): + def __init__( self, filename_base, profiler_info ): try: - self._coverage = CachedRangesInFile( "%s.covered" % filename_base ) + self._coverage = CachedRangesInFile( "%s.covered" % filename_base, profiler_info ) except Exception, e: #print "Error loading coverage file %s: %s" % ( "%s.covered" % filename_base, e ) self._coverage = [] @@ -89,12 +90,14 @@ return coverage, region_count, start_index class CachedCoverageReader: - def __init__( self, base_file_path, buffer = 10, table_names = None ): + def __init__( self, base_file_path, buffer = 10, table_names = None, profiler_info = None ): self._base_file_path = base_file_path self._buffer = buffer #number of chromosomes to keep in memory at a time self._coverage = {} - if table_names is None: table_names = os.listdir( self._base_file_path ) + if table_names is None: table_names = [ table_dir for table_dir in os.listdir( self._base_file_path ) if os.path.isdir( os.path.join( self._base_file_path, table_dir ) ) ] for tablename in table_names: self._coverage[tablename] = {} + if profiler_info is None: profiler_info = {} + self._profiler_info = profiler_info def iter_table_coverage_by_region( self, chrom, start, end ): for tablename, coverage, regions in self.iter_table_coverage_regions_by_region( chrom, start, end ): yield tablename, coverage @@ -107,7 +110,7 @@ if len( chromosomes ) >= self._buffer: #randomly remove one chromosome from this table del chromosomes[ chromosomes.keys().pop( random.randint( 0, self._buffer - 1 ) ) ] - chromosomes[chrom] = RegionCoverage( os.path.join ( self._base_file_path, tablename, chrom ) ) + chromosomes[chrom] = RegionCoverage( os.path.join ( self._base_file_path, tablename, chrom ), self._profiler_info ) coverage, regions, index = chromosomes[chrom].get_coverage_regions_index_overlap( start, end ) yield tablename, coverage, regions, index @@ -240,19 +243,35 @@ print "%s has max length of %s, exceeded by %s%s." % ( chrom, chrom_lengths.get( chrom ), ", ".join( map( str, regions[:3] ) ), extra_region_info ) class ChromosomeLengths: - def __init__( self, filename ): + def __init__( self, profiler_info ): self.chroms = {} - try: - for line in open( filename ): - try: - fields = line.strip().split( "\t" ) - self.chroms[fields[0]] = int( fields[1] ) - except: - continue - except: - pass + self.default_bitset_size = int( profiler_info.get( 'bitset_size', bx.bitset.MAX ) ) + chroms = profiler_info.get( 'chromosomes', None ) + if chroms: + for chrom in chroms.split( ',' ): + for fields in chrom.rsplit( '=', 1 ): + if len( fields ) == 2: + self.chroms[ fields[0] ] = int( fields[1] ) + else: + self.chroms[ fields[0] ] = self.default_bitset_size def get( self, name ): - return self.chroms.get( name, bx.bitset.MAX ) + return self.chroms.get( name, self.default_bitset_size ) + +def parse_profiler_info( filename ): + profiler_info = {} + try: + for line in open( filename ): + fields = line.rstrip( '\n\r' ).split( '\t', 1 ) + if len( fields ) == 2: + if fields[0] in profiler_info: + if not isinstance( profiler_info[ fields[0] ], list ): + profiler_info[ fields[0] ] = [ profiler_info[ fields[0] ] ] + profiler_info[ fields[0] ].append( fields[1] ) + else: + profiler_info[ fields[0] ] = fields[1] + except: + pass #likely missing file + return profiler_info def __main__(): parser = optparse.OptionParser() @@ -294,16 +313,10 @@ help='Path to profiled data for this organism' ) parser.add_option( - '-l','--lengths', - dest='lengths', - type='str',default='test-data/shared/ucsc/hg18.len', - help='Path to chromosome lengths data for this organism' - ) - parser.add_option( '-t','--table_names', dest='table_names', type='str',default='None', - help='Path to profiled data for this organism' + help='Table names requested' ) parser.add_option( '-i','--input', @@ -327,14 +340,19 @@ options, args = parser.parse_args() + #get profiler_info + profiler_info = parse_profiler_info( os.path.join( options.path, 'profiler_info.txt' ) ) + table_names = options.table_names.split( "," ) if table_names == ['None']: table_names = None - coverage_reader = CachedCoverageReader( options.path, buffer = options.buffer, table_names = table_names ) + coverage_reader = CachedCoverageReader( options.path, buffer = options.buffer, table_names = table_names, profiler_info = profiler_info ) if options.summary: - profile_summary( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader, ChromosomeLengths( options.lengths ) ) + profile_summary( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader, ChromosomeLengths( profiler_info ) ) else: profile_per_interval( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader ) + #print out data version info + print 'Data version (%s:%s:%s)' % ( profiler_info.get( 'dbkey', 'unknown' ), profiler_info.get( 'profiler_hash', 'unknown' ), profiler_info.get( 'dump_time', 'unknown' ) ) if __name__ == "__main__": __main__()