2 new commits in galaxy-central:

https://bitbucket.org/galaxy/galaxy-central/commits/c559805d44df/
Changeset:   c559805d44df
User:        davebgx
Date:        2014-05-19 19:01:46
Summary:     Migrate 22 tools from the distribution to the tool shed.
Affected #:  67 files

diff -r 1fae433a54eaaab6392da9e9a8eec94a3b568c14 -r c559805d44df064f98dfbc94b59edba7be01adb9 lib/tool_shed/galaxy_install/migrate/versions/0011_tools.py
--- /dev/null
+++ b/lib/tool_shed/galaxy_install/migrate/versions/0011_tools.py
@@ -0,0 +1,62 @@
+"""
+The following tools have been eliminated from the distribution:
+
+1: Profile Annotations for a set of genomic intervals
+2: Polymorphism of the Reads
+3: Coverage of the Reads in wiggle format
+4: Canonical Correlation Analysis
+5: Convert Color Space to Nucleotides
+6: Compute sequence length
+7: Concatenate FASTA alignment by species
+8: Filter sequences by length
+9: FASTA-to-Tabular converter
+10: FASTQSOLEXA-to-FASTA-QUAL extracts sequences and quality scores from FASTQSOLEXA data
+11: Kernel Canonical Correlation Analysis
+12: Kernel Principal Component Analysis
+13: Format mapping data as UCSC custom track
+14: Megablast compare short reads against htgs, nt, and wgs databases
+15: Parse blast XML output
+16: Principal Component Analysis
+17: RMAP for Solexa Short Reads Alignment
+18: RMAPQ for Solexa Short Reads Alignment with Quality Scores
+19: Histogram of high quality score reads
+20: Build base quality distribution
+21: Select high quality segments
+22: Tabular-to-FASTA
+
+The tools are now available in the repositories respectively:
+
+1: annotation_profiler
+2: blat_coverage_report
+3: blat_mapping
+4: canonical_correlation_analysis
+5: convert_solid_color2nuc
+6: fasta_compute_length
+7: fasta_concatenate_by_species
+8: fasta_filter_by_length
+9: fasta_to_tabular
+10: fastqsolexa_to_fasta_qual
+11: kernel_canonical_correlation_analysis
+12: kernel_principal_component_analysis
+13: mapping_to_ucsc
+14: megablast_wrapper
+15: megablast_xml_parser
+16: principal_component_analysis
+17: rmap
+18: rmapq
+19: short_reads_figure_high_quality_length
+20: short_reads_figure_score
+21: short_reads_trim_seq
+22: tabular_to_fasta
+
+from the main Galaxy tool shed at http://toolshed.g2.bx.psu.edu
+and will be installed into your local Galaxy instance at the
+location discussed above by running the following command.
+
+"""
+
+def upgrade( migrate_engine ):
+    print __doc__
+
+def downgrade( migrate_engine ):
+    pass

diff -r 1fae433a54eaaab6392da9e9a8eec94a3b568c14 -r c559805d44df064f98dfbc94b59edba7be01adb9 scripts/migrate_tools/0011_tools.sh
--- /dev/null
+++ b/scripts/migrate_tools/0011_tools.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+cd `dirname $0`/../..
+python ./scripts/migrate_tools/migrate_tools.py 0011_tools.xml $@
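The pieces above fit together simply: the shell wrapper changes to the Galaxy root and hands the 0011_tools.xml manifest to migrate_tools.py, while the numbered versions/0011_tools.py module exposes the upgrade()/downgrade() hooks that migration runners in the sqlalchemy-migrate style conventionally call. As a hypothetical sketch of that hook contract only (not Galaxy's actual runner), loading and dispatching such a module looks roughly like:

    import imp

    def run_migration_script( script_path, migrate_engine, direction='upgrade' ):
        # Load a numbered migration module (e.g. 0011_tools.py) by path and
        # call its upgrade() or downgrade() hook with the migration engine.
        module = imp.load_source( 'galaxy_tool_migration', script_path )
        getattr( module, direction )( migrate_engine )

    # 0011_tools.py only prints its docstring on upgrade, so no real engine is
    # needed for this illustration.
    run_migration_script( 'lib/tool_shed/galaxy_install/migrate/versions/0011_tools.py', None )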
id="mapToUCSC" version="1.0.0" file="metag_tools/mapping_to_ucsc.xml" /> + </repository> + <repository changeset_revision="dc7b4acb3fa6" owner="devteam" name="megablast_wrapper" description="Megablast compare short reads against htgs, nt, and wgs databases"> + <tool file="metag_tools/megablast_wrapper.xml" id="megablast_wrapper" version="1.2.0" /> + </repository> + <repository changeset_revision="03ca082aeb2e" owner="devteam" name="megablast_xml_parser" description="Parse blast XML output"> + <tool file="metag_tools/megablast_xml_parser.xml" id="megablast_xml_parser" version="1.0.0" /> + </repository> + <repository changeset_revision="f568051cdf2e" owner="devteam" name="principal_component_analysis" description="Principal Component Analysis"> + <tool file="multivariate_stats/pca.xml" id="pca1" version="1.0.2" /> + </repository> + <repository changeset_revision="ee49255302d8" owner="devteam" name="rmap" description="RMAP for Solexa Short Reads Alignment"> + <tool id="rmap_wrapper" version="1.0.0" file="metag_tools/rmap_wrapper.xml" /> + </repository> + <repository changeset_revision="f6e5bb5aa2f5" owner="devteam" name="rmapq" description="RMAPQ for Solexa Short Reads Alignment with Quality Scores"> + <tool id="rmapq_wrapper" version="1.0.0" file="metag_tools/rmapq_wrapper.xml" /> + </repository> + <repository changeset_revision="556ceed24699" owner="devteam" name="short_reads_figure_high_quality_length" description="Histogram of high quality score reads"> + <tool id="hist_high_quality_score" version="1.0.0" file="metag_tools/short_reads_figure_high_quality_length.xml" /> + </repository> + <repository changeset_revision="b52b9c7aabd9" owner="devteam" name="short_reads_figure_score" description="Build base quality distribution"> + <tool file="metag_tools/short_reads_figure_score.xml" id="quality_score_distribution" version="1.0.2" /> + </repository> + <repository changeset_revision="f17a1585733b" owner="devteam" name="short_reads_trim_seq" description="Select high quality segments"> + <tool file="metag_tools/short_reads_trim_seq.xml" id="trim_reads" version="1.0.0" /> + </repository> + <repository changeset_revision="0b4e36026794" owner="devteam" name="tabular_to_fasta" description="Tabular-to-FASTA"> + <tool file="fasta_tools/tabular_to_fasta.xml" id="tab2fasta" version="1.1.0" /> + </repository> +</toolshed> \ No newline at end of file diff -r 1fae433a54eaaab6392da9e9a8eec94a3b568c14 -r c559805d44df064f98dfbc94b59edba7be01adb9 scripts/tools/annotation_profiler/README.txt --- a/scripts/tools/annotation_profiler/README.txt +++ /dev/null @@ -1,54 +0,0 @@ -This file explains how to create annotation indexes for the annotation profiler tool. Annotation profiler indexes are an exceedingly simple binary format, -containing no header information and consisting of an ordered linear list of (start,stop encoded individually as '<I') regions which are covered by a UCSC table partitioned -by chromosome name. Genomic regions are merged by overlap / direct adjacency (e.g. a table having ranges of: 1-10, 6-12, 12-20 and 25-28 results in two merged ranges of: 1-20 and 25-28). 
diff -r 1fae433a54eaaab6392da9e9a8eec94a3b568c14 -r c559805d44df064f98dfbc94b59edba7be01adb9 scripts/tools/annotation_profiler/README.txt
--- a/scripts/tools/annotation_profiler/README.txt
+++ /dev/null
@@ -1,54 +0,0 @@
-This file explains how to create annotation indexes for the annotation profiler tool. Annotation profiler indexes are an exceedingly simple binary format,
-containing no header information and consisting of an ordered linear list of (start,stop encoded individually as '<I') regions which are covered by a UCSC table partitioned
-by chromosome name. Genomic regions are merged by overlap / direct adjacency (e.g. a table having ranges of: 1-10, 6-12, 12-20 and 25-28 results in two merged ranges of: 1-20 and 25-28).
-
-Files are arranged like:
-/profiled_annotations/DBKEY/TABLE_NAME/
-    CHROMOSOME_NAME.covered
-    CHROMOSOME_NAME.total_coverage
-    CHROMOSOME_NAME.total_regions
-/profiled_annotations/DBKEY/
-    DBKEY_tables.xml
-    chromosomes.txt
-    profiled_info.txt
-
-where CHROMOSOME_NAME.covered is the binary file, CHROMOSOME_NAME.total_coverage is a text file containing the integer count of bases covered by the
-table, and CHROMOSOME_NAME.total_regions contains the integer count of the number of regions found in CHROMOSOME_NAME.covered.
-
-DBKEY_tables.xml should be appended to the annotation profiler available table configuration file (tool-data/annotation_profiler_options.xml).
-The DBKEY should also be added as a new line to the annotation profiler valid builds file (annotation_profiler_valid_builds.txt).
-The output (/profiled_annotations/DBKEY) should be made available as GALAXY_ROOT/tool-data/annotation_profiler/DBKEY.
-
-profiled_info.txt contains info on the generated annotations, separated by lines with tab-delimited label,value pairs:
-    profiler_version - the version of the build_profile_indexes.py script that was used to generate the profiled data
-    dbkey           - the dbkey used for the run
-    chromosomes     - contains the names and lengths of chromosomes that were used to parse single-chromosome tables (tables divided into individual files by chromosome)
-    dump_time       - the declared dump time of the database, taken from trackDb.txt.gz
-    profiled_time   - seconds since epoch in UTC for when the database dump was profiled
-    database_hash   - an md5 hex digest of all the profiled table info
-
-Typical usage includes:
-
-    python build_profile_indexes.py -d hg19 -i /ucsc_data/hg19/database/ > hg19.txt
-
-where the genome build is hg19 and /ucsc_data/hg19/database/ contains the downloaded database dump from UCSC
-(e.g. obtained by rsync: rsync -avzP rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/ /ucsc_data/hg19/database/).
-
-By default, chromosome names come from a file named 'chromInfo.txt.gz' found in the input directory, with FTP used as a backup.
-When FTP is used to obtain the names of chromosomes from UCSC for a particular genome build, alternate ftp sites and paths can be specified by using the --ftp_site and --ftp_path attributes.
-Chromosome names can instead be provided on the commandline via the --chromosomes option, which accepts a comma separated list of: ChromName1[=length],ChromName2[=length],...
-
-    usage = "usage: %prog options"
-    parser = OptionParser( usage=usage )
-    parser.add_option( '-d', '--dbkey', dest='dbkey', default='hg18', help='dbkey to process' )
-    parser.add_option( '-i', '--input_dir', dest='input_dir', default=os.path.join( 'golden_path','%s', 'database' ), help='Input Directory' )
-    parser.add_option( '-o', '--output_dir', dest='output_dir', default=os.path.join( 'profiled_annotations','%s' ), help='Output Directory' )
-    parser.add_option( '-c', '--chromosomes', dest='chromosomes', default='', help='Comma separated list of: ChromName1[=length],ChromName2[=length],...' )
-    parser.add_option( '-b', '--bitset_size', dest='bitset_size', default=DEFAULT_BITSET_SIZE, type='int', help='Default BitSet size; overridden by sizes specified in chromInfo.txt.gz or by --chromosomes' )
-    parser.add_option( '-f', '--ftp_site', dest='ftp_site', default='hgdownload.cse.ucsc.edu', help='FTP site; used for chromosome info when chromInfo.txt.gz method fails' )
-    parser.add_option( '-p', '--ftp_path', dest='ftp_path', default='/goldenPath/%s/chromosomes/', help='FTP Path; used for chromosome info when chromInfo.txt.gz method fails' )
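Since the README describes the .covered files as nothing more than consecutive little-endian unsigned ints in (start, stop) pairs, a minimal reader is easy to sketch. This is an illustration of the documented format, not a tool that shipped with the profiler, and the example path is hypothetical:

    import struct

    STRUCT_FMT = '<I'  # little-endian unsigned int, as in build_profile_indexes.py
    STRUCT_SIZE = struct.calcsize( STRUCT_FMT )

    def read_covered_regions( path ):
        # Return the ordered (start, stop) region pairs from a CHROMOSOME_NAME.covered file.
        regions = []
        f = open( path, 'rb' )
        while True:
            data = f.read( STRUCT_SIZE * 2 )
            if len( data ) < STRUCT_SIZE * 2:
                break
            start = struct.unpack( STRUCT_FMT, data[ :STRUCT_SIZE ] )[0]
            stop = struct.unpack( STRUCT_FMT, data[ STRUCT_SIZE: ] )[0]
            regions.append( ( start, stop ) )
        f.close()
        return regions

    # Hypothetical path following the layout described above:
    for start, stop in read_covered_regions( 'profiled_annotations/hg19/knownGene/chr22.covered' ):
        print '%i-%i' % ( start, stop )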
diff -r 1fae433a54eaaab6392da9e9a8eec94a3b568c14 -r c559805d44df064f98dfbc94b59edba7be01adb9 scripts/tools/annotation_profiler/build_profile_indexes.py
--- a/scripts/tools/annotation_profiler/build_profile_indexes.py
+++ /dev/null
@@ -1,338 +0,0 @@
-#!/usr/bin/env python
-#Dan Blankenberg
-
-VERSION = '1.0.0' # version of this script
-
-from optparse import OptionParser
-import os, gzip, struct, time
-from ftplib import FTP #do we want a different method than using FTP to determine chrom names, e.g. use a local copy
-
-#import md5 from hashlib; if python2.4 or less, use old md5
-try:
-    from hashlib import md5
-except ImportError:
-    from md5 import new as md5
-
-#import BitSet from bx-python, try using eggs and package resources, fall back to any local installation
-try:
-    from galaxy import eggs
-    import pkg_resources
-    pkg_resources.require( "bx-python" )
-except:
-    pass #Maybe there is a local installation available
-from bx.bitset import BitSet
-
-#Define constants
-STRUCT_FMT = '<I'
-STRUCT_SIZE = struct.calcsize( STRUCT_FMT )
-DEFAULT_BITSET_SIZE = 300000000
-CHUNK_SIZE = 1024
-
-#Headers used to parse .sql files to determine column indexes for chromosome name, start and end
-alias_spec = {
-    'chromCol' : [ 'chrom' , 'CHROMOSOME' , 'CHROM', 'Chromosome Name', 'tName' ],
-    'startCol' : [ 'start' , 'START', 'chromStart', 'txStart', 'Start Position (bp)', 'tStart', 'genoStart' ],
-    'endCol'   : [ 'end' , 'END' , 'STOP', 'chromEnd', 'txEnd', 'End Position (bp)', 'tEnd', 'genoEnd' ],
-}
-
-#Headers used to parse trackDb.txt.gz
-#TODO: these should be parsed directly from trackDb.sql
-trackDb_headers = [ "tableName", "shortLabel", "type", "longLabel", "visibility", "priority", "colorR", "colorG", "colorB", "altColorR", "altColorG", "altColorB", "useScore", "private", "restrictCount", "restrictList", "url", "html", "grp", "canPack", "settings" ]
-
-def get_columns( filename ):
-    input_sql = open( filename ).read()
-    input_sql = input_sql.split( 'CREATE TABLE ' )[1].split( ';' )[0]
-    input_sql = input_sql.split( ' (', 1 )
-    table_name = input_sql[0].strip().strip( '`' )
-    input_sql = [ split.strip().split( ' ' )[0].strip().strip( '`' ) for split in input_sql[1].rsplit( ')', 1 )[0].strip().split( '\n' ) ]
-    print input_sql
-    chrom_col = None
-    start_col = None
-    end_col = None
-    for col_name in alias_spec['chromCol']:
-        for i, header_name in enumerate( input_sql ):
-            if col_name == header_name:
-                chrom_col = i
-                break
-        if chrom_col is not None:
-            break
-
-    for col_name in alias_spec['startCol']:
-        for i, header_name in enumerate( input_sql ):
-            if col_name == header_name:
-                start_col = i
-                break
-        if start_col is not None:
-            break
-
-    for col_name in alias_spec['endCol']:
-        for i, header_name in enumerate( input_sql ):
-            if col_name == header_name:
-                end_col = i
-                break
-        if end_col is not None:
-            break
-
-    return table_name, chrom_col, start_col, end_col
-
-
-def create_grouping_xml( input_dir, output_dir, dbkey ):
-    output_filename = os.path.join( output_dir, '%s_tables.xml' % dbkey )
-    def load_groups( file_name = 'grp.txt.gz' ):
-        groups = {}
-        for line in gzip.open( os.path.join( input_dir, file_name ) ):
-            fields = line.split( '\t' )
-            groups[fields[0]] = { 'desc': fields[1], 'priority': fields[2] }
-        return groups
-    f = gzip.open( os.path.join( input_dir, 'trackDb.txt.gz' ) )
-    out = open( output_filename, 'wb' )
-    tables = {}
-    cur_buf = ''
-    while True:
-        line = f.readline()
-        if not line:
-            break
-        #remove new lines
-        line = line.rstrip( '\n\r' )
-        line = line.replace( '\\\t', ' ' ) #replace escaped tabs with space
-        cur_buf += "%s\n" % line.rstrip( '\\' )
-        if line.endswith( '\\' ):
-            continue #line is wrapped, next line
-        #all fields should be loaded now...
-        fields = cur_buf.split( '\t' )
-        cur_buf = '' #reset buffer
-        assert len( fields ) == len( trackDb_headers ), 'Failed Parsing trackDb.txt.gz; fields: %s' % fields
-        table_name = fields[ 0 ]
-        tables[ table_name ] = {}
-        for field_name, field_value in zip( trackDb_headers, fields ):
-            tables[ table_name ][ field_name ] = field_value
-        #split settings fields into dict
-        fields = fields[-1].split( '\n' )
-        tables[ table_name ][ 'settings' ] = {}
-        for field in fields:
-            setting_fields = field.split( ' ', 1 )
-            setting_name = setting_value = setting_fields[ 0 ]
-            if len( setting_fields ) > 1:
-                setting_value = setting_fields[ 1 ]
-            if setting_name or setting_value:
-                tables[ table_name ][ 'settings' ][ setting_name ] = setting_value
-    #Load Groups
-    groups = load_groups()
-    in_groups = {}
-    for table_name, values in tables.iteritems():
-        if os.path.exists( os.path.join( output_dir, table_name ) ):
-            group = values['grp']
-            if group not in in_groups:
-                in_groups[group] = {}
-            #***NAME CHANGE***, 'subTrack' no longer exists as a setting...use 'parent' instead
-            #subTrack = values.get('settings', {} ).get( 'subTrack', table_name )
-            subTrack = values.get('settings', {} ).get( 'parent', table_name ).split( ' ' )[0] #need to split, because could be e.g. 'trackgroup on'
-            if subTrack not in in_groups[group]:
-                in_groups[group][subTrack] = []
-            in_groups[group][subTrack].append( table_name )
-
-    assigned_tables = []
-    out.write( """<filter type="data_meta" data_ref="input1" meta_key="dbkey" value="%s">\n""" % ( dbkey ) )
-    out.write( "  <options>\n" )
-    for group, subTracks in sorted( in_groups.iteritems() ):
-        out.write( """    <option name="%s" value="group-%s">\n""" % ( groups[group]['desc'], group ) )
-        for sub_name, sub_tracks in subTracks.iteritems():
-            if len( sub_tracks ) > 1:
-                out.write( """      <option name="%s" value="subtracks-%s">\n""" % ( sub_name, sub_name ) )
-                sub_tracks.sort()
-                for track in sub_tracks:
-                    track_label = track
-                    if "$" not in tables[track]['shortLabel']:
-                        track_label = tables[track]['shortLabel']
-                    out.write( """        <option name="%s" value="%s"/>\n""" % ( track_label, track ) )
-                    assigned_tables.append( track )
-                out.write( "      </option>\n" )
-            else:
-                track = sub_tracks[0]
-                track_label = track
-                if "$" not in tables[track]['shortLabel']:
-                    track_label = tables[track]['shortLabel']
-                out.write( """      <option name="%s" value="%s"/>\n""" % ( track_label, track ) )
-                assigned_tables.append( track )
-        out.write( "    </option>\n" )
-    unassigned_tables = list( sorted( [ table_dir for table_dir in os.listdir( output_dir ) if table_dir not in assigned_tables and os.path.isdir( os.path.join( output_dir, table_dir ) ) ] ) )
-    if unassigned_tables:
-        out.write( """    <option name="Uncategorized Tables" value="group-trackDbUnassigned">\n""" )
-        for table_name in unassigned_tables:
-            out.write( """      <option name="%s" value="%s"/>\n""" % ( table_name, table_name ) )
-        out.write( "    </option>\n" )
-    out.write( "  </options>\n" )
-    out.write( """</filter>\n""" )
-    out.close()
-
-def write_database_dump_info( input_dir, output_dir, dbkey, chrom_lengths, default_bitset_size ):
-    #generate hash for profiled table directories
-    #sort directories off output root (files in output root not hashed, including the profiler_info.txt file)
-    #sort files in each directory and hash file contents
-    profiled_hash = md5()
-    for table_dir in sorted( [ table_dir for table_dir in os.listdir( output_dir ) if os.path.isdir( os.path.join( output_dir, table_dir ) ) ] ):
-        for filename in sorted( os.listdir( os.path.join( output_dir, table_dir ) ) ):
-            f = open( os.path.join( output_dir, table_dir, filename ), 'rb' )
-            while True:
-                hash_chunk = f.read( CHUNK_SIZE )
-                if not hash_chunk:
-                    break
-                profiled_hash.update( hash_chunk )
-    profiled_hash = profiled_hash.hexdigest()
-
-    #generate hash for input dir
-    #sort directories off input root
-    #sort files in each directory and hash file contents
-    database_hash = md5()
-    for dirpath, dirnames, filenames in sorted( os.walk( input_dir ) ):
-        for filename in sorted( filenames ):
-            f = open( os.path.join( input_dir, dirpath, filename ), 'rb' )
-            while True:
-                hash_chunk = f.read( CHUNK_SIZE )
-                if not hash_chunk:
-                    break
-                database_hash.update( hash_chunk )
-    database_hash = database_hash.hexdigest()
-
-    #write out info file
-    out = open( os.path.join( output_dir, 'profiler_info.txt' ), 'wb' )
-    out.write( 'dbkey\t%s\n' % ( dbkey ) )
-    out.write( 'chromosomes\t%s\n' % ( ','.join( [ '%s=%s' % ( chrom_name, chrom_len ) for chrom_name, chrom_len in chrom_lengths.iteritems() ] ) ) )
-    out.write( 'bitset_size\t%s\n' % ( default_bitset_size ) )
-    for line in open( os.path.join( input_dir, 'trackDb.sql' ) ):
-        line = line.strip()
-        if line.startswith( '-- Dump completed on ' ):
-            line = line[ len( '-- Dump completed on ' ): ]
-            out.write( 'dump_time\t%s\n' % ( line ) )
-            break
-    out.write( 'dump_hash\t%s\n' % ( database_hash ) )
-    out.write( 'profiler_time\t%s\n' % ( time.time() ) )
-    out.write( 'profiler_hash\t%s\n' % ( profiled_hash ) )
-    out.write( 'profiler_version\t%s\n' % ( VERSION ) )
-    out.write( 'profiler_struct_format\t%s\n' % ( STRUCT_FMT ) )
-    out.write( 'profiler_struct_size\t%s\n' % ( STRUCT_SIZE ) )
-    out.close()
-
-def __main__():
-    usage = "usage: %prog options"
-    parser = OptionParser( usage=usage )
-    parser.add_option( '-d', '--dbkey', dest='dbkey', default='hg18', help='dbkey to process' )
-    parser.add_option( '-i', '--input_dir', dest='input_dir', default=os.path.join( 'golden_path','%s', 'database' ), help='Input Directory' )
-    parser.add_option( '-o', '--output_dir', dest='output_dir', default=os.path.join( 'profiled_annotations','%s' ), help='Output Directory' )
-    parser.add_option( '-c', '--chromosomes', dest='chromosomes', default='', help='Comma separated list of: ChromName1[=length],ChromName2[=length],...' )
-    parser.add_option( '-b', '--bitset_size', dest='bitset_size', default=DEFAULT_BITSET_SIZE, type='int', help='Default BitSet size; overridden by sizes specified in chromInfo.txt.gz or by --chromosomes' )
-    parser.add_option( '-f', '--ftp_site', dest='ftp_site', default='hgdownload.cse.ucsc.edu', help='FTP site; used for chromosome info when chromInfo.txt.gz method fails' )
-    parser.add_option( '-p', '--ftp_path', dest='ftp_path', default='/goldenPath/%s/chromosomes/', help='FTP Path; used for chromosome info when chromInfo.txt.gz method fails' )
-
-    ( options, args ) = parser.parse_args()
-
-    input_dir = options.input_dir
-    if '%' in input_dir:
-        input_dir = input_dir % options.dbkey
-    assert os.path.exists( input_dir ), 'Input directory does not exist'
-    output_dir = options.output_dir
-    if '%' in output_dir:
-        output_dir = output_dir % options.dbkey
-    assert not os.path.exists( output_dir ), 'Output directory already exists'
-    os.makedirs( output_dir )
-    ftp_path = options.ftp_path
-    if '%' in ftp_path:
-        ftp_path = ftp_path % options.dbkey
-
-    #Get chromosome names and lengths
-    chrom_lengths = {}
-    if options.chromosomes:
-        for chrom in options.chromosomes.split( ',' ):
-            fields = chrom.split( '=' )
-            chrom = fields[0]
-            if len( fields ) > 1:
-                chrom_len = int( fields[1] )
-            else:
-                chrom_len = options.bitset_size
-            chrom_lengths[ chrom ] = chrom_len
-        chroms = chrom_lengths.keys()
-        print 'Chrom info taken from command line option.'
-    else:
-        try:
-            for line in gzip.open( os.path.join( input_dir, 'chromInfo.txt.gz' ) ):
-                fields = line.strip().split( '\t' )
-                chrom_lengths[ fields[0] ] = int( fields[ 1 ] )
-            chroms = chrom_lengths.keys()
-            print 'Chrom info taken from chromInfo.txt.gz.'
-        except Exception, e:
-            print 'Error loading chrom info from chromInfo.txt.gz, trying FTP method.'
-            chrom_lengths = {} #zero out chrom_lengths
-            chroms = []
-            ftp = FTP( options.ftp_site )
-            ftp.login()
-            for name in ftp.nlst( ftp_path ):
-                if name.endswith( '.fa.gz' ):
-                    chroms.append( name.split( '/' )[-1][ :-len( '.fa.gz' ) ] )
-            ftp.close()
-            for chrom in chroms:
-                chrom_lengths[ chrom ] = options.bitset_size
-    #sort chroms by length of name, descending; necessary for when table names start with chrom name
-    chroms = list( reversed( [ chrom for chrom_len, chrom in sorted( [ ( len( chrom ), chrom ) for chrom in chroms ] ) ] ) )
-
-    #parse tables from local files
-    #loop through directory contents, if file ends in '.sql', process table
-    for filename in os.listdir( input_dir ):
-        if filename.endswith( '.sql' ):
-            base_filename = filename[ 0:-len( '.sql' ) ]
-            table_out_dir = os.path.join( output_dir, base_filename )
-            #some tables are chromosome specific, let's strip off the chrom name
-            for chrom in chroms:
-                if base_filename.startswith( "%s_" % chrom ):
-                    #found chromosome
-                    table_out_dir = os.path.join( output_dir, base_filename[len( "%s_" % chrom ):] )
-                    break
-            #create table dir
-            if not os.path.exists( table_out_dir ):
-                os.mkdir( table_out_dir ) #table dir may already exist in the case of single chrom tables
-                print "Created table dir (%s)." % table_out_dir
-            else:
-                print "Table dir (%s) already exists." % table_out_dir
-            #find column assignments
-            table_name, chrom_col, start_col, end_col = get_columns( "%s.sql" % os.path.join( input_dir, base_filename ) )
-            if chrom_col is None or start_col is None or end_col is None:
-                print "Table %s (%s) does not appear to have a chromosome, a start, or a stop." % ( table_name, "%s.sql" % os.path.join( input_dir, base_filename ) )
-                if not os.listdir( table_out_dir ):
-                    print "Removing empty table (%s) directory (%s)." % ( table_name, table_out_dir )
-                    os.rmdir( table_out_dir )
-                continue
-            #build bitsets from table
-            bitset_dict = {}
-            for line in gzip.open( '%s.txt.gz' % os.path.join( input_dir, base_filename ) ):
-                fields = line.strip().split( '\t' )
-                chrom = fields[ chrom_col ]
-                start = int( fields[ start_col ] )
-                end = int( fields[ end_col ] )
-                if chrom not in bitset_dict:
-                    bitset_dict[ chrom ] = BitSet( chrom_lengths.get( chrom, options.bitset_size ) )
-                bitset_dict[ chrom ].set_range( start, end - start )
-            #write bitsets as profiled annotations
-            for chrom_name, chrom_bits in bitset_dict.iteritems():
-                out = open( os.path.join( table_out_dir, '%s.covered' % chrom_name ), 'wb' )
-                end = 0
-                total_regions = 0
-                total_coverage = 0
-                max_size = chrom_lengths.get( chrom_name, options.bitset_size )
-                while True:
-                    start = chrom_bits.next_set( end )
-                    if start >= max_size:
-                        break
-                    end = chrom_bits.next_clear( start )
-                    out.write( struct.pack( STRUCT_FMT, start ) )
-                    out.write( struct.pack( STRUCT_FMT, end ) )
-                    total_regions += 1
-                    total_coverage += end - start
-                    if end >= max_size:
-                        break
-                out.close()
-                open( os.path.join( table_out_dir, '%s.total_regions' % chrom_name ), 'wb' ).write( str( total_regions ) )
-                open( os.path.join( table_out_dir, '%s.total_coverage' % chrom_name ), 'wb' ).write( str( total_coverage ) )
-
-    #create xml
-    create_grouping_xml( input_dir, output_dir, options.dbkey )
-    #create database dump info file, for database version control
-    write_database_dump_info( input_dir, output_dir, options.dbkey, chrom_lengths, options.bitset_size )
-
-if __name__ == "__main__": __main__()
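The merge-by-overlap/adjacency behavior the README promises falls out of the BitSet representation used above: every covered base is a set bit, so overlapping and adjacent ranges collapse into single runs of bits, and next_set/next_clear walk the merged runs. A minimal standalone sketch of that step, reusing only the bx-python calls that appear in the deleted script (it assumes bx-python is installed):

    from bx.bitset import BitSet

    SIZE = 100
    bits = BitSet( SIZE )
    # The README's example table: ranges 1-10, 6-12, 12-20 and 25-28.
    for start, end in [ ( 1, 10 ), ( 6, 12 ), ( 12, 20 ), ( 25, 28 ) ]:
        bits.set_range( start, end - start )

    # Walk runs of set bits exactly as build_profile_indexes.py does.
    end = 0
    merged = []
    while True:
        start = bits.next_set( end )
        if start >= SIZE:
            break
        end = bits.next_clear( start )
        merged.append( ( start, end ) )
        if end >= SIZE:
            break
    print merged # expected: [(1, 20), (25, 28)], matching the README's example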
diff -r 1fae433a54eaaab6392da9e9a8eec94a3b568c14 -r c559805d44df064f98dfbc94b59edba7be01adb9 static/images/blat_mapping_example.png
Binary file static/images/blat_mapping_example.png has changed

diff -r 1fae433a54eaaab6392da9e9a8eec94a3b568c14 -r c559805d44df064f98dfbc94b59edba7be01adb9 static/images/dualcolorcode.png
Binary file static/images/dualcolorcode.png has changed

diff -r 1fae433a54eaaab6392da9e9a8eec94a3b568c14 -r c559805d44df064f98dfbc94b59edba7be01adb9 static/images/short_reads_boxplot.png
Binary file static/images/short_reads_boxplot.png has changed

diff -r 1fae433a54eaaab6392da9e9a8eec94a3b568c14 -r c559805d44df064f98dfbc94b59edba7be01adb9 test-data/annotation_profiler_1.out
--- a/test-data/annotation_profiler_1.out
+++ /dev/null
@@ -1,9 +0,0 @@
-chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + multiz17way 1700000 1
-chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + mrna 1476531 12
-chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + multiz28way 1700000 1
-chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + refGene 1247808 15
-chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + knownAlt 14617 57
-chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + affyGnf1h 16218 2
-chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + snp126 8224 7262
-chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + acembly 1532618 20
-chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + knownGene 1282789 18

diff -r 1fae433a54eaaab6392da9e9a8eec94a3b568c14 -r c559805d44df064f98dfbc94b59edba7be01adb9 test-data/annotation_profiler_2.out
--- a/test-data/annotation_profiler_2.out
+++ /dev/null
@@ -1,10 +0,0 @@
-#tableName tableChromosomeCoverage tableChromosomeCount tableRegionCoverage tableRegionCount allIntervalCount allIntervalSize allCoverage allTableRegionsOverlaped allIntervalsOverlapingTable nrIntervalCount nrIntervalSize nrCoverage nrTableRegionsOverlaped nrIntervalsOverlapingTable
-multiz17way 1232617592 115 107496500 7 25 2178864 2178864 25 25 24 2178828 2178828 7 24
-mrna 610115393 8453 53577685 617 25 2178864 1904380 38 24 24 2178828 1904344 33 23
-multiz28way 1233785185 143 107466479 10 25 2178864 2178864 25 25 24 2178828 2178828 8 24
-refGene 496767116 7324 46112187 488 25 2178864 1677947 30 23 24 2178828 1677911 27 22
-knownAlt 8647368 20213 766619 1630 25 2178864 5612 31 11 24 2178828 5612 31 11
-affyGnf1h 24034558 3995 2446754 307 25 2178864 191851 9 6 24 2178828 191851 9 6
-snp126 5297125 4456213 382226 331523 25 2178864 9205 7074 25 24 2178828 9205 7074 24
-acembly 710938193 13800 63146381 938 25 2178864 1903560 35 24 24 2178828 1903524 30 23
-knownGene 555770538 7921 50317496 558 25 2178864 1822985 30 23 24 2178828 1822949 27 22

This diff is so big that we needed to truncate the remainder.

https://bitbucket.org/galaxy/galaxy-central/commits/175bacc005a1/
Changeset:   175bacc005a1
User:        davebgx
Date:        2014-05-19 19:02:11
Summary:     Remove migrated tools from the tool conf files.
Affected #:  2 files

diff -r c559805d44df064f98dfbc94b59edba7be01adb9 -r 175bacc005a1437bfeedc8c76fe757467c08a870 tool_conf.xml.main
--- a/tool_conf.xml.main
+++ b/tool_conf.xml.main
@@ -51,13 +51,6 @@
     <tool file="filters/wig_to_bigwig.xml" />
     <tool file="filters/bed_to_bigbed.xml" />
   </section>
-  <section id="fasta_manipulation" name="FASTA manipulation">
-    <tool file="fasta_tools/fasta_compute_length.xml" />
-    <tool file="fasta_tools/fasta_filter_by_length.xml" />
-    <tool file="fasta_tools/fasta_concatenate_by_species.xml" />
-    <tool file="fasta_tools/fasta_to_tabular.xml" />
-    <tool file="fasta_tools/tabular_to_fasta.xml" />
-  </section>
   <section id="filter" name="Filter and Sort">
     <tool file="stats/filtering.xml" />
     <tool file="filters/sorter.xml" />
@@ -97,9 +90,6 @@
     <tool file="filters/wiggle_to_simple.xml" />
     <tool file="stats/aggregate_binned_scores_in_intervals.xml" />
   </section>
-  <section id="bxops" name="Operate on Genomic Intervals">
-    <tool file="annotation_profiler/annotation_profiler.xml" />
-  </section>
   <section id="stats" name="Statistics">
     <tool file="stats/gsummary.xml" />
     <tool file="filters/uniq.xml" />
@@ -116,12 +106,6 @@
     <tool file="maf/vcf_to_maf_customtrack.xml" />
     <tool file="mutation/visualize.xml" />
   </section>
-  <section id="multVar" name="Multivariate Analysis">
-    <tool file="multivariate_stats/pca.xml" />
-    <tool file="multivariate_stats/cca.xml" />
-    <tool file="multivariate_stats/kpca.xml" />
-    <tool file="multivariate_stats/kcca.xml" />
-  </section>
   <section id="hgv" name="Phenotype Association">
     <tool file="evolution/codingSnps.xml" />
     <tool file="evolution/add_scores.xml" />
@@ -141,8 +125,6 @@
   <section id="cshl_library_information" name="NGS: QC and manipulation">
     <label id="illumina" text="Illumina data" />
     <label id="454" text="Roche-454 data" />
-    <tool file="metag_tools/short_reads_figure_score.xml" />
-    <tool file="metag_tools/short_reads_trim_seq.xml" />
     <label id="solid" text="AB-SOLiD data" />
     <tool file="next_gen_conversion/solid2fastq.xml" />
     <tool file="solid_tools/solid_qual_stats.xml" />
@@ -153,8 +135,6 @@
   <section id="ngs_mapping" name="NGS: Mapping">
     <label id="illumina" text="Illumina" />
     <label id="roche_454" text="Roche-454" />
-    <tool file="metag_tools/megablast_wrapper.xml" />
-    <tool file="metag_tools/megablast_xml_parser.xml" />
     <label id="ab_solid" text="AB-SOLiD" />
   </section>
   <section id="samtools" name="NGS: SAM Tools">

diff -r c559805d44df064f98dfbc94b59edba7be01adb9 -r 175bacc005a1437bfeedc8c76fe757467c08a870 tool_conf.xml.sample
--- a/tool_conf.xml.sample
+++ b/tool_conf.xml.sample
@@ -68,13 +68,11 @@
     <tool file="filters/axt_to_fasta.xml" />
     <tool file="filters/axt_to_lav.xml" />
     <tool file="filters/bed2gff.xml" />
-    <tool file="fasta_tools/fasta_to_tabular.xml" />
     <tool file="filters/gff2bed.xml" />
     <tool file="filters/lav_to_bed.xml" />
     <tool file="maf/maf_to_bed.xml" />
     <tool file="maf/maf_to_interval.xml" />
     <tool file="maf/maf_to_fasta.xml" />
-    <tool file="fasta_tools/tabular_to_fasta.xml" />
     <tool file="filters/wiggle_to_simple.xml" />
     <tool file="filters/sff_extractor.xml" />
     <tool file="filters/gtf2bedgraph.xml" />
@@ -105,9 +103,6 @@
     <tool file="filters/wiggle_to_simple.xml" />
     <tool file="stats/aggregate_binned_scores_in_intervals.xml" />
   </section>
-  <section id="bxops" name="Operate on Genomic Intervals">
-    <tool file="annotation_profiler/annotation_profiler.xml" />
-  </section>
   <section id="stats" name="Statistics">
     <tool file="stats/gsummary.xml" />
     <tool file="filters/uniq.xml" />
@@ -130,12 +125,6 @@
     <tool file="maf/vcf_to_maf_customtrack.xml" />
     <tool file="mutation/visualize.xml" />
   </section>
-  <section id="multVar" name="Multivariate Analysis">
-    <tool file="multivariate_stats/pca.xml" />
-    <tool file="multivariate_stats/cca.xml" />
-    <tool file="multivariate_stats/kpca.xml" />
-    <tool file="multivariate_stats/kcca.xml" />
-  </section>
   <section id="hyphy" name="Evolution">
     <tool file="evolution/codingSnps.xml" />
     <tool file="evolution/add_scores.xml" />
@@ -144,15 +133,6 @@
     <tool file="meme/meme.xml" />
     <tool file="meme/fimo.xml" />
   </section>
-  <section id="clustal" name="Multiple Alignments">
-  </section>
-  <section id="fasta_manipulation" name="FASTA manipulation">
-    <tool file="fasta_tools/fasta_compute_length.xml" />
-    <tool file="fasta_tools/fasta_filter_by_length.xml" />
-    <tool file="fasta_tools/fasta_concatenate_by_species.xml" />
-    <tool file="fasta_tools/fasta_to_tabular.xml" />
-    <tool file="fasta_tools/tabular_to_fasta.xml" />
-  </section>
   <section id="NGS_QC" name="NGS: QC and manipulation">
     <label id="fastqcsambam" text="FastQC: fastq/sam/bam" />
@@ -160,9 +140,6 @@
     <label id="illumina" text="Illumina fastq" />
     <label id="454" text="Roche-454 data" />
-    <tool file="metag_tools/short_reads_figure_score.xml" />
-    <tool file="metag_tools/short_reads_trim_seq.xml" />
-
     <label id="solid" text="AB-SOLiD data" />
     <tool file="next_gen_conversion/solid2fastq.xml" />
     <tool file="solid_tools/solid_qual_stats.xml" />
@@ -185,8 +162,6 @@
   -->
   <section id="solexa_tools" name="NGS: Mapping">
     <tool file="sr_mapping/bfast_wrapper.xml" />
-    <tool file="metag_tools/megablast_wrapper.xml" />
-    <tool file="metag_tools/megablast_xml_parser.xml" />
     <tool file="sr_mapping/PerM.xml" />
     <tool file="sr_mapping/srma_wrapper.xml" />
     <tool file="sr_mapping/mosaik.xml" />

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving this
because you have the service enabled, addressing the recipient of this email.