1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/changeset/849fcfda099e/ changeset: 849fcfda099e user: natefoo date: 2012-04-09 19:13:44 summary: Miller Lab's Genome Diversity tools moved to the Tool Shed. affected #: 14 files diff -r 0cab089a6ad7305c3ec968c39a7032b640376363 -r 849fcfda099e3fb55a0cad7e0e478142930d2e8e datatypes_conf.xml.sample --- a/datatypes_conf.xml.sample +++ b/datatypes_conf.xml.sample @@ -171,7 +171,6 @@ <display file="rviewer/vcf.xml" inherit="True"/></datatype><datatype extension="bcf" type="galaxy.datatypes.binary:Binary" subclass="True"/> - <datatype extension="wsf" type="galaxy.datatypes.wsf:SnpFile" display_in_upload="true"/><datatype extension="velvet" type="galaxy.datatypes.assembly:Velvet" display_in_upload="false"/><datatype extension="wig" type="galaxy.datatypes.interval:Wiggle" display_in_upload="true"><converter file="wig_to_bigwig_converter.xml" target_datatype="bigwig"/> diff -r 0cab089a6ad7305c3ec968c39a7032b640376363 -r 849fcfda099e3fb55a0cad7e0e478142930d2e8e lib/galaxy/datatypes/registry.py --- a/lib/galaxy/datatypes/registry.py +++ b/lib/galaxy/datatypes/registry.py @@ -2,7 +2,7 @@ Provides mapping between extensions and datatypes, mime-types, etc. """ import os, sys, tempfile, threading, logging -import data, tabular, interval, images, sequence, qualityscore, genetics, xml, coverage, tracks, chrominfo, binary, assembly, ngsindex, wsf +import data, tabular, interval, images, sequence, qualityscore, genetics, xml, coverage, tracks, chrominfo, binary, assembly, ngsindex import galaxy.util from galaxy.util.odict import odict from display_applications.application import DisplayApplication diff -r 0cab089a6ad7305c3ec968c39a7032b640376363 -r 849fcfda099e3fb55a0cad7e0e478142930d2e8e lib/galaxy/datatypes/wsf.py --- a/lib/galaxy/datatypes/wsf.py +++ /dev/null @@ -1,156 +0,0 @@ -""" -SnpFile datatype -""" - -import re -import data -from galaxy import util -from galaxy.datatypes.sniff import * -from galaxy.datatypes.tabular import Tabular -from galaxy.datatypes import metadata -from galaxy.datatypes.metadata import MetadataElement - -class SnpFile( Tabular ): - """ Webb's SNP file format """ - file_ext = 'wsf' - species_regex = re.compile('species=(\S+)') - MetadataElement( name="species", desc="species", default='', no_value='', visible=False, readonly=True ) - MetadataElement( name="scaffold", desc="scaffold column", param=metadata.ColumnParameter, default=0 ) - MetadataElement( name="pos", desc="pos column", param=metadata.ColumnParameter, default=0 ) - MetadataElement( name="ref", desc="ref column", param=metadata.ColumnParameter, default=0 ) - MetadataElement( name="rPos", desc="rPos column", param=metadata.ColumnParameter, default=0 ) - MetadataElement( name="labels", desc="Number of labels", default=0, no_value=0, visible=False, readonly=True ) - MetadataElement( name="label_for_column", desc="Mapping from column to label", default=[], no_value=[], visible=False, readonly=True ) - MetadataElement( name="columns_with_label", desc="Mapping from label to columns", param=metadata.DictParameter, default={}, no_value={}, visible=False, readonly=True ) - MetadataElement( name="column_headers", desc="Column headers", default=[], no_value=[], visible=False, readonly=True ) - - - def set_meta( self, dataset, overwrite = True, **kwd ): - Tabular.set_meta( self, dataset, overwrite=overwrite, max_data_lines=None, **kwd ) - # these two if statements work around a potential bug in metadata.py - if dataset.metadata.labels is None or 
dataset.metadata.labels == dataset.metadata.spec['labels'].no_value: - self._set_column_labels_metadata( dataset ) - if dataset.metadata.column_headers is None or dataset.metadata.column_headers == dataset.metadata.spec['column_headers'].no_value: - self._set_column_headers_metadata( dataset ) - self._set_columnParameter_metadata( dataset ) - - - def _set_column_labels_metadata( self, dataset ): - def build_map_from_label_to_comma_separated_column_list( labels ): - map = {} - for index, label in enumerate( labels ): - map.setdefault( label, [] ).append( index ) - - for label in map: - map[label] = ','.join( [ str( index + 1 ) for index in map[label] ] ) - return map - - def strip_list_elements( list ): - return [ element.strip() for element in list ] - - def initial_comment_lines_of_dataset( dataset ): - comment_lines = [] - if dataset.has_data(): - try: - fh = open( dataset.file_name, 'r' ) - for line in fh: - if not line.startswith('#'): - break - line = line[1:] - line = line.rstrip( '\r\n' ) - if line: - comment_lines.append( line ) - fh.close() - except: - pass - return comment_lines - - def set_metadata_from_comment_lines( dataset ): - labels = [] - comment_lines = initial_comment_lines_of_dataset( dataset ) - - for line in comment_lines: - match = SnpFile.species_regex.match( line ) - if match: - dataset.metadata.species = match.group(1) - continue - elems = line.split( '\t' ) - if len(elems) > 1: - labels = strip_list_elements( elems ) - - dataset.metadata.labels = len( labels ) - dataset.metadata.label_for_column = labels[:] - if labels: - dataset.metadata.label_for_column.insert(0, '') - dataset.metadata.columns_with_label = build_map_from_label_to_comma_separated_column_list( labels ) - - set_metadata_from_comment_lines( dataset ) - - - def _set_column_headers_metadata( self, dataset ): - if dataset.metadata.labels < dataset.metadata.columns: - column_headers = dataset.metadata.label_for_column[1:] + [ '' ] * ( dataset.metadata.columns - dataset.metadata.labels ) - else: - column_headers = dataset.metadata.label_for_column[1:dataset.metadata.columns+1] - - dataset.metadata.column_headers = column_headers - - - def _set_columnParameter_metadata( self, dataset ): - def unique_column_number_or_zero( string ): - try: - val = int( string ) - except: - val = 0 - return val - - for name in self._metadata_columnParameter_names( dataset ): - if name in dataset.metadata.columns_with_label: - if dataset.metadata.columns_with_label[name]: - column = unique_column_number_or_zero( dataset.metadata.columns_with_label[name] ) - if column: - setattr( dataset.metadata, name, column ) - - - def _metadata_columnParameter_names( self, dataset ): - for name, spec in dataset.metadata.spec.items(): - if isinstance( spec.param, metadata.ColumnParameter ): - yield name - - - def set_peek( self, dataset, line_count=None, is_multi_byte=False ): - super(Tabular, self).set_peek( dataset, line_count=line_count, is_multi_byte=is_multi_byte, skipchars=[ '#' ]) - - - def make_html_table( self, dataset, skipchars=[ '#' ] ): - """Create HTML table, used for displaying peek""" - def table_header_values( dataset ): - headers = dataset.metadata.column_headers[:] - for name in self._metadata_columnParameter_names( dataset ): - col = getattr( dataset.metadata, name ) - assert col <= dataset.metadata.columns, Exception( 'ColumnParameter %s %d > %d columns for dataset %s.' 
% ( name, col, dataset.metadata.columns, dataset.id ) ) - if col > 0: - headers[ col - 1 ] = name - return headers - - def table_headers( dataset ): - out = [ '<tr>' ] - headers = table_header_values( dataset ) - for index, header in enumerate( headers ): - column = index + 1 - if header: - out.append( "<th>%d.%s</th>" % ( column, header ) ) - else: - out.append( "<th>%d.</th>" % column ) - out.append( '</tr>' ) - return out - - try: - out = ['<table cellspacing="0" cellpadding="3">'] - out.extend( table_headers( dataset ) ) - out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) ) - out.append( '</table>' ) - out = "".join( out ) - except Exception, exc: - out = "Can't create peek %s" % exc - return out diff -r 0cab089a6ad7305c3ec968c39a7032b640376363 -r 849fcfda099e3fb55a0cad7e0e478142930d2e8e tool_conf.xml.sample --- a/tool_conf.xml.sample +++ b/tool_conf.xml.sample @@ -482,12 +482,6 @@ <tool file="phenotype_association/vcf2pgSnp.xml" /><tool file="phenotype_association/dividePgSnpAlleles.xml" /></section> - <section name="Genome Diversity" id="gd"> - <tool file="genome_diversity/extract_primers.xml" /> - <tool file="genome_diversity/select_snps.xml" /> - <tool file="genome_diversity/select_restriction_enzymes.xml" /> - <tool file="genome_diversity/extract_flanking_dna.xml" /> - </section><section name="VCF Tools" id="vcf_tools"><tool file="vcf_tools/intersect.xml" /><tool file="vcf_tools/annotate.xml" /> diff -r 0cab089a6ad7305c3ec968c39a7032b640376363 -r 849fcfda099e3fb55a0cad7e0e478142930d2e8e tools/genome_diversity/cdblib.py --- a/tools/genome_diversity/cdblib.py +++ /dev/null @@ -1,230 +0,0 @@ -#!/usr/bin/env python2.5 - -''' -Manipulate DJB's Constant Databases. These are 2 level disk-based hash tables -that efficiently handle many keys, while remaining space-efficient. - - http://cr.yp.to/cdb.html - -When generated databases are only used with Python code, consider using hash() -rather than djb_hash() for a tidy speedup. -''' - -from _struct import Struct -from itertools import chain - - -def py_djb_hash(s): - '''Return the value of DJB's hash function for the given 8-bit string.''' - h = 5381 - for c in s: - h = (((h << 5) + h) ^ ord(c)) & 0xffffffff - return h - -try: - from _cdblib import djb_hash -except ImportError: - djb_hash = py_djb_hash - -read_2_le4 = Struct('<LL').unpack -write_2_le4 = Struct('<LL').pack - - -class Reader(object): - '''A dictionary-like object for reading a Constant Database accessed - through a string or string-like sequence, such as mmap.mmap().''' - - def __init__(self, data, hashfn=djb_hash): - '''Create an instance reading from a sequence and using hashfn to hash - keys.''' - if len(data) < 2048: - raise IOError('CDB too small') - - self.data = data - self.hashfn = hashfn - - self.index = [read_2_le4(data[i:i+8]) for i in xrange(0, 2048, 8)] - self.table_start = min(p[0] for p in self.index) - # Assume load load factor is 0.5 like official CDB. - self.length = sum(p[1] >> 1 for p in self.index) - - def iteritems(self): - '''Like dict.iteritems(). 
Items are returned in insertion order.''' - pos = 2048 - while pos < self.table_start: - klen, dlen = read_2_le4(self.data[pos:pos+8]) - pos += 8 - - key = self.data[pos:pos+klen] - pos += klen - - data = self.data[pos:pos+dlen] - pos += dlen - - yield key, data - - def items(self): - '''Like dict.items().''' - return list(self.iteritems()) - - def iterkeys(self): - '''Like dict.iterkeys().''' - return (p[0] for p in self.iteritems()) - __iter__ = iterkeys - - def itervalues(self): - '''Like dict.itervalues().''' - return (p[1] for p in self.iteritems()) - - def keys(self): - '''Like dict.keys().''' - return [p[0] for p in self.iteritems()] - - def values(self): - '''Like dict.values().''' - return [p[1] for p in self.iteritems()] - - def __getitem__(self, key): - '''Like dict.__getitem__().''' - value = self.get(key) - if value is None: - raise KeyError(key) - return value - - def has_key(self, key): - '''Return True if key exists in the database.''' - return self.get(key) is not None - __contains__ = has_key - - def __len__(self): - '''Return the number of records in the database.''' - return self.length - - def gets(self, key): - '''Yield values for key in insertion order.''' - # Truncate to 32 bits and remove sign. - h = self.hashfn(key) & 0xffffffff - start, nslots = self.index[h & 0xff] - - if nslots: - end = start + (nslots << 3) - slot_off = start + (((h >> 8) % nslots) << 3) - - for pos in chain(xrange(slot_off, end, 8), - xrange(start, slot_off, 8)): - rec_h, rec_pos = read_2_le4(self.data[pos:pos+8]) - - if not rec_h: - break - elif rec_h == h: - klen, dlen = read_2_le4(self.data[rec_pos:rec_pos+8]) - rec_pos += 8 - - if self.data[rec_pos:rec_pos+klen] == key: - rec_pos += klen - yield self.data[rec_pos:rec_pos+dlen] - - def get(self, key, default=None): - '''Get the first value for key, returning default if missing.''' - # Avoid exception catch when handling default case; much faster. 
- return chain(self.gets(key), (default,)).next() - - def getint(self, key, default=None, base=0): - '''Get the first value for key converted it to an int, returning - default if missing.''' - value = self.get(key, default) - if value is not default: - return int(value, base) - return value - - def getints(self, key, base=0): - '''Yield values for key in insertion order after converting to int.''' - return (int(v, base) for v in self.gets(key)) - - def getstring(self, key, default=None, encoding='utf-8'): - '''Get the first value for key decoded as unicode, returning default if - not found.''' - value = self.get(key, default) - if value is not default: - return value.decode(encoding) - return value - - def getstrings(self, key, encoding='utf-8'): - '''Yield values for key in insertion order after decoding as - unicode.''' - return (v.decode(encoding) for v in self.gets(key)) - - -class Writer(object): - '''Object for building new Constant Databases, and writing them to a - seekable file-like object.''' - - def __init__(self, fp, hashfn=djb_hash): - '''Create an instance writing to a file-like object, using hashfn to - hash keys.''' - self.fp = fp - self.hashfn = hashfn - - fp.write('\x00' * 2048) - self._unordered = [[] for i in xrange(256)] - - def put(self, key, value=''): - '''Write a string key/value pair to the output file.''' - assert type(key) is str and type(value) is str - - pos = self.fp.tell() - self.fp.write(write_2_le4(len(key), len(value))) - self.fp.write(key) - self.fp.write(value) - - h = self.hashfn(key) & 0xffffffff - self._unordered[h & 0xff].append((h, pos)) - - def puts(self, key, values): - '''Write more than one value for the same key to the output file. - Equivalent to calling put() in a loop.''' - for value in values: - self.put(key, value) - - def putint(self, key, value): - '''Write an integer as a base-10 string associated with the given key - to the output file.''' - self.put(key, str(value)) - - def putints(self, key, values): - '''Write zero or more integers for the same key to the output file. - Equivalent to calling putint() in a loop.''' - self.puts(key, (str(value) for value in values)) - - def putstring(self, key, value, encoding='utf-8'): - '''Write a unicode string associated with the given key to the output - file after encoding it as UTF-8 or the given encoding.''' - self.put(key, unicode.encode(value, encoding)) - - def putstrings(self, key, values, encoding='utf-8'): - '''Write zero or more unicode strings to the output file. Equivalent to - calling putstring() in a loop.''' - self.puts(key, (unicode.encode(value, encoding) for value in values)) - - def finalize(self): - '''Write the final hash tables to the output file, and write out its - index. 
The output file remains open upon return.''' - index = [] - for tbl in self._unordered: - length = len(tbl) << 1 - ordered = [(0, 0)] * length - for pair in tbl: - where = (pair[0] >> 8) % length - for i in chain(xrange(where, length), xrange(0, where)): - if not ordered[i][0]: - ordered[i] = pair - break - - index.append((self.fp.tell(), length)) - for pair in ordered: - self.fp.write(write_2_le4(*pair)) - - self.fp.seek(0) - for pair in index: - self.fp.write(write_2_le4(*pair)) - self.fp = None # prevent double finalize() diff -r 0cab089a6ad7305c3ec968c39a7032b640376363 -r 849fcfda099e3fb55a0cad7e0e478142930d2e8e tools/genome_diversity/extract_flanking_dna.py --- a/tools/genome_diversity/extract_flanking_dna.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python2.5 - -import os -import sys -from optparse import OptionParser -import genome_diversity as gd - -def main_function( parse_arguments=None ): - if parse_arguments is None: - parse_arguments = lambda arguments: ( None, arguments ) - def main_decorator( to_decorate ): - def decorated_main( arguments=None ): - if arguments is None: - arguments = sys.argv - options, arguments = parse_arguments( arguments ) - rc = 1 - try: - rc = to_decorate( options, arguments ) - except Exception, err: - sys.stderr.write( 'ERROR: %s\n' % str( err ) ) - traceback.print_exc() - finally: - sys.exit( rc ) - return decorated_main - return main_decorator - -def parse_arguments( arguments ): - parser = OptionParser() - parser.add_option('--input', - type='string', dest='input', - help='file of selected SNPs') - parser.add_option('--output', - type='string', dest='output', - help='output file') - parser.add_option('--snps_loc', - type='string', dest='snps_loc', - help='snps .loc file') - parser.add_option('--scaffold_col', - type="int", dest='scaffold_col', - help='scaffold column in the input file') - parser.add_option('--pos_col', - type="int", dest='pos_col', - help='position column in the input file') - parser.add_option('--output_format', - type="string", dest='output_format', - help='output format, fasta or primer3') - parser.add_option('--species', - type="string", dest='species', - help='species') - return parser.parse_args( arguments[1:] ) - - -@main_function( parse_arguments ) -def main( options, arguments ): - if not options.input: - raise RuntimeError( 'missing --input option' ) - if not options.output: - raise RuntimeError( 'missing --output option' ) - if not options.snps_loc: - raise RuntimeError( 'missing --snps_loc option' ) - if not options.scaffold_col: - raise RuntimeError( 'missing --scaffold_col option' ) - if not options.pos_col: - raise RuntimeError( 'missing --pos_col option' ) - if not options.output_format: - raise RuntimeError( 'missing --output_format option' ) - if not options.species: - raise RuntimeError( 'missing --species option' ) - - snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) ) - - out_fh = gd._openfile( options.output, 'w' ) - - snpcalls_file = gd.get_filename_from_loc( options.species, options.snps_loc ) - file_root, file_ext = os.path.splitext( snpcalls_file ) - snpcalls_index_file = file_root + ".cdb" - snpcalls = gd.SnpcallsFile( data_file=snpcalls_file, index_file=snpcalls_index_file ) - - while snps.next(): - seq, pos = snps.get_seq_pos() - flanking_dna = snpcalls.get_flanking_dna( sequence=seq, position=pos, format=options.output_format ) - if flanking_dna: - out_fh.write( flanking_dna ) - - out_fh.close() - -if __name__ == "__main__": - main() - 
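The command-line tools removed in this changeset (extract_flanking_dna.py above, plus extract_primers.py and select_restriction_enzymes.py further down) share one lookup pattern: each species' data file is paired with a .cdb index of the same root name that maps a "scaffold position" key to a byte offset in the data file, and the IndexedFile class in genome_diversity.py (removed later in this diff) resolves those keys through cdblib.Reader. A minimal Python 2 sketch of that convention, assuming a snpcalls-style data file whose first two tab-separated columns are scaffold and position, and using made-up file names::

    import os
    import cdblib

    def build_offset_index(data_filename):
        '''Index a snpcalls-style file by a "scaffold position" key.'''
        index_filename = os.path.splitext(data_filename)[0] + '.cdb'
        data_fh = open(data_filename, 'r')
        index_fh = open(index_filename, 'wb')
        # IndexedFile reads these indexes with the builtin hash(), so the
        # same hash function is used here when writing.
        writer = cdblib.Writer(index_fh, hash)
        while True:
            offset = data_fh.tell()
            line = data_fh.readline()
            if not line:
                break
            if line.startswith('#'):
                continue
            elems = line.rstrip('\r\n').split('\t')
            if len(elems) >= 2:
                # Key format matches PrimersFile/SnpcallsFile: "seq pos".
                writer.putint('%s %s' % (elems[0], elems[1]), offset)
        writer.finalize()
        index_fh.close()
        data_fh.close()
        return index_filename

    def lookup_line(data_filename, index_filename, sequence, position):
        '''Return the data line for one SNP, or None if it is not indexed.'''
        reader = cdblib.Reader(open(index_filename, 'rb').read(), hash)
        key = '%s %s' % (str(sequence), str(position))
        if key not in reader:
            return None
        data_fh = open(data_filename, 'r')
        data_fh.seek(reader.getint(key))
        return data_fh.readline()

This only sketches the indexing convention; the per-species data files themselves are resolved through gd.snps.loc and gd.primers.loc, as shown in the tool wrappers below.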
diff -r 0cab089a6ad7305c3ec968c39a7032b640376363 -r 849fcfda099e3fb55a0cad7e0e478142930d2e8e tools/genome_diversity/extract_flanking_dna.xml --- a/tools/genome_diversity/extract_flanking_dna.xml +++ /dev/null @@ -1,93 +0,0 @@ -<tool id="gd_extract_flanking_dna" name="Extract" version="1.0.0"> - <description>DNA flanking chosen SNPs</description> - - <command interpreter="python2.5"> - extract_flanking_dna.py "--input=$input" "--output=$output" "--snps_loc=${GALAXY_DATA_INDEX_DIR}/gd.snps.loc" - #if $override_metadata.choice == "0": - "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}" - #else - "--scaffold_col=$scaf_col" "--pos_col=$pos_col" "--species=$species" - #end if - "--output_format=$output_format" - </command> - - <inputs> - <param format="tabular" name="input" type="data" label="Selected SNPS dataset"/> - <param name="output_format" type="select" format="integer" label="output format"> - <option value="fasta" selected="true">FastA format</option> - <option value="primer3">Primer3 input</option> - </param> - <conditional name="override_metadata"> - <param name="choice" type="select" format="integer" label="choose columns"> - <option value="0" selected="true">No, get columns from metadata</option> - <option value="1" >Yes, choose columns</option> - </param> - <when value="0"> - <!-- no options --> - </when> - <when value="1"> - <param name="scaf_col" type="data_column" data_ref="input" numerical="false" label="Column with scaffold"/> - <param name="pos_col" type="data_column" data_ref="input" numerical="true" label="Column with position"/> - <param name="species" type="select" label="Choose species"> - <options from_file="gd.species.txt"> - <column name="name" index="1"/> - <column name="value" index="0"/> - </options> - </param> - </when> - </conditional> - </inputs> - - <outputs> - <data format="txt" name="output"/> - </outputs> - - <tests> - <test> - <param name="input" value="gd.sample.wsf" ftype="wsf"/> - <param name="output_format" value="primer3"/> - <param name="choice" value="0"/> - <output name="output" file="gd.extract_flanking_dna.txt"/> - </test> - </tests> - - <help> -**What it does** - - It reports a DNA segment containing each SNP, with up to 200 nucleotides on - either side of the SNP position, which is indicated by "n". Fewer nucleotides - are reported if the SNP is near an end of the assembled genome fragment. - ------ - -**Example** - -- input file:: - - chr2_75111355_75112576 314 A C L F chr2 75111676 C F 15 4 53 2 9 48 Y 96 0.369 0.355 0.396 0 - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr10_7434473_7435447 524 T C S S chr10 7435005 T S 11 5 90 14 0 69 Y 626 0.066 0.406 0.727 0 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr15_64470252_64471048 89 G A Y Y chr15 64470341 G Y 5 6 109 14 0 69 Y 312 0.247 0.998 0.393 0 - chr18_48070585_48071386 514 C T E K chr18 48071100 T K 7 7 46 14 0 69 Y 2 0.200 0.032 0.163 0 - chr18_50154905_50155664 304 A G Y C chr18 50155208 A Y 4 2 17 5 1 22 Y 8 0.022 0.996 0.128 0 - chr18_57379354_57380496 315 C T V V chr18 57379669 G V 11 0 60 9 6 62 Y 726 0.118 0.048 0.014 1 - chr19_14240610_14242055 232 C T A V chr19 14240840 C A 18 8 56 15 5 42 Y 73 0.003 0.153 0.835 0 - chr19_39866997_39874915 3117 C T P P chr19 39870110 C P 3 7 65 14 2 32 Y 6 0.321 0.911 0.462 4 - etc. 
- -- output file:: - - > chr2_75111355_75112576 314 A C - TATCTTCATTTTTATTATAGACTCTCTGAACCAATTTGCCCTGAGGCAGACTTTTTAAAGTACTGTGTAATGTATGAAGTCCTTCTGCTCAAGCAAATCATTGGCATGAAAACAGTTGCAAACTTATTGTGAGAGAAGAGTCCAAGAGTTTTAACAGTCTGTAAGTATATAGCCTGTGAGTTTGATTTCCTTCTTGTTTTTnTTCCAGAAACATGATCAGGGGCAAGTTCTATTGGATATAGTCTTCAAGCATCTTGATTTGACTGAGCGTGACTATTTTGGTTTGCAGTTGACTGACGATTCCACTGATAACCCAGTAAGTTTAAGCTGTTGTCTTTCATTGTCATTGCAATTTTTCTGTCTTTATACTAGGTCCTTTCTGATTTACATTGTTCACTGATT - > chr8_93901796_93905612 2471 A C - GCTGCCGCTGGATTTACTTCTGCTTGGGTCGAGAGCGGGCTGGATGGGTGAAGAGTGGGCTCCCCGGCCCCTGACCAGGCAGGTGCAGACAAGTCGGAAGAAGGCCCGCCGCATCTCCTTGCTGGCCAGCGTGTAGATGACGGGGTTCATGGCAGAGTTGAGCACGGCCAGCACGATGAACCACTGGGCCTTGAACAGGATnGCGCACTCCTTCACCTTGCAGGCCACATCCACAAGGAAAAGGATGAAGAGTGGGGACCAGCAGGCGATGAACACGCTCACCACGATCACCACGGTCCGCAGCAGGGCCATGGACCGCTCTGAGTTGTGCGGGCTGGCCACCCTGCGGCTGCTGGACTTCACCAGGAAGTAGATGCGTGCGTACAGGATCACGATGGTCAC - > chr10_7434473_7435447 524 T C - ATTATTAACAGAAACATTTCTTTTTCATTACCCAGGGGTTACACTGGTCGTTGATGTTAATCAGTTTTTGGAGAAGGAGAAGCAAAGTGATATTTTGTCTGTTCTGAAGCCTGCCGTTGGTAATACAAATGACGTAATCCCTGAATGTGCTGACAGGTACCATGACGCCCTGGCAAAAGCAAAAGAGCAAAAATCTAGAAGnGGTAAGCATCTTCACTGTTTAGCACAAATTAAATAGCACTTTGAATATGATGATTTCTGTGGTATTGTGTTATCTTACTTTTGAGACAAATAATCGCTTTCAAATGAATATTTCTGAATGTTTGTCATCTCTGGCAAGGAAATTTTTTAGTGTTTCTTTTCCTTTTTTGTCTTTTGGAAATCTGTGATTAACTTGGTGGC - > chr14_80021455_80022064 138 G A - ACCCAGGGATCAAACCCAGGTCTCCCGCATTGCAGGCGGATTCTTTACTGTCTGAGCCTCCAGGGAAGCCCTCGGGGCTGAAGGGATGGTTATGAAGGTGAGAAACAGGGGCCACCTGTCCCCAAGGTACCTTGCGACnTGCCATCTGCGCTCCACCAGTAAATGGACGTCTTCGATCCTTCTGTTGTTGGCGTAGTGCAAACGTTTGGGAAGGTGCTGTTTCAAGTAAGGCTTAAAGTGCTGGTCTGGTTTTTTACACTGAAATATAAATGGACATTGGATTTTGCAATGGAGAGTCTTCTAGAAGAGTCCAAGACATTCTCTCCAGAAAGCTGAAGG - > chr15_64470252_64471048 89 G A - TGTGTGTGTGTGTGTGTGTGTGTGCCTGTGTCTGTACATGCACACCACGTGGCCTCACCCAGTGCCCTCAGCTCCATGGTGATGTCCACnTAGCCGTGCTCCGCGCTGTAGTACATGGCCTCCTGGAGGGCCTTGGTGCGCGTCCGGCTCAGGCGCATGGGCCCCTCGCTGCCGCTGCCCTGGCTGGATGCATCGCTCTCTTCCACGCCCTCAGCCAGGATCTCCTCCAGGGACAGCACATCTGCTTTGGCCTGCTGTGGCTGAGTCAGGAGCTTCCTCAGGACGTTCCT - etc. 
- </help> -</tool> diff -r 0cab089a6ad7305c3ec968c39a7032b640376363 -r 849fcfda099e3fb55a0cad7e0e478142930d2e8e tools/genome_diversity/extract_primers.py --- a/tools/genome_diversity/extract_primers.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python2.5 - -import os -import sys -from optparse import OptionParser -import genome_diversity as gd - -def main_function( parse_arguments=None ): - if parse_arguments is None: - parse_arguments = lambda arguments: ( None, arguments ) - def main_decorator( to_decorate ): - def decorated_main( arguments=None ): - if arguments is None: - arguments = sys.argv - options, arguments = parse_arguments( arguments ) - rc = 1 - try: - rc = to_decorate( options, arguments ) - except Exception, err: - sys.stderr.write( 'ERROR: %s\n' % str( err ) ) - traceback.print_exc() - finally: - sys.exit( rc ) - return decorated_main - return main_decorator - -def parse_arguments( arguments ): - parser = OptionParser() - parser.add_option('--input', - type='string', dest='input', - help='file of selected SNPs') - parser.add_option('--output', - type='string', dest='output', - help='output file') - parser.add_option('--primers_loc', - type='string', dest='primers_loc', - help='primers .loc file') - parser.add_option('--scaffold_col', - type="int", dest='scaffold_col', - help='scaffold column in the input file') - parser.add_option('--pos_col', - type="int", dest='pos_col', - help='position column in the input file') - parser.add_option('--species', - type="string", dest='species', - help='species') - return parser.parse_args( arguments[1:] ) - - -@main_function( parse_arguments ) -def main( options, arguments ): - if not options.input: - raise RuntimeError( 'missing --input option' ) - if not options.output: - raise RuntimeError( 'missing --output option' ) - if not options.primers_loc: - raise RuntimeError( 'missing --primers_loc option' ) - if not options.scaffold_col: - raise RuntimeError( 'missing --scaffold_col option' ) - if not options.pos_col: - raise RuntimeError( 'missing --pos_col option' ) - if not options.species: - raise RuntimeError( 'missing --species option' ) - - snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) ) - - out_fh = gd._openfile( options.output, 'w' ) - - primer_data_file = gd.get_filename_from_loc( options.species, options.primers_loc ) - file_root, file_ext = os.path.splitext( primer_data_file ) - primer_index_file = file_root + ".cdb" - primers = gd.PrimersFile( data_file=primer_data_file, index_file=primer_index_file ) - - while snps.next(): - seq, pos = snps.get_seq_pos() - primer = primers.get_entry( seq, pos ) - if primer: - out_fh.write( primer ) - - out_fh.close() - -if __name__ == "__main__": - main() - diff -r 0cab089a6ad7305c3ec968c39a7032b640376363 -r 849fcfda099e3fb55a0cad7e0e478142930d2e8e tools/genome_diversity/extract_primers.xml --- a/tools/genome_diversity/extract_primers.xml +++ /dev/null @@ -1,90 +0,0 @@ -<tool id="gd_extract_primers" name="Extract primers" version="1.0.0"> - <description>for selected SNPs</description> - - <command interpreter="python2.5"> - extract_primers.py "--input=$input" "--output=$output" "--primers_loc=${GALAXY_DATA_INDEX_DIR}/gd.primers.loc" - #if $override_metadata.choice == "0": - "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}" - #else - "--scaffold_col=$scaf_col" "--pos_col=$pos_col" "--species=$species" - #end if - </command> - - <inputs> - <param format="tabular" 
name="input" type="data" label="Selected SNPS dataset"/> - <conditional name="override_metadata"> - <param name="choice" type="select" format="integer" label="choose columns"> - <option value="0" selected="true">No, get columns from metadata</option> - <option value="1" >Yes, choose columns</option> - </param> - <when value="0"> - <!-- no options --> - </when> - <when value="1"> - <param name="scaf_col" type="data_column" data_ref="input" numerical="false" label="Column with scaffold"/> - <param name="pos_col" type="data_column" data_ref="input" numerical="true" label="Column with position"/> - <param name="species" type="select" label="Choose species"> - <options from_file="gd.species.txt"> - <column name="name" index="1"/> - <column name="value" index="0"/> - </options> - </param> - </when> - </conditional> - </inputs> - - <outputs> - <data format="txt" name="output"/> - </outputs> - - <tests> - <test> - <param name="input" value="gd.sample.wsf" ftype="wsf"/> - <param name="choice" value="0"/> - <output name="output" file="gd.extract_primers.txt"/> - </test> - </tests> - - - <help> -**What it does** - - This tool extracts primers for SNPs in the dataset using the Primer3 program. - The first line of output for a given SNP reports the name of the assembled - contig, the SNP's position in the contig, the two variant nucleotides, and - Primer3's "pair penalty". The next line, if not blank, names restriction - enzymes (from the user-adjustable list) that differentially cut at that - site, but do not cut at any other position between and including the - primer positions. The next lines show the SNP's flanking regions, with - the SNP position indicated by "n", including the primer positions and an - additional 3 nucleotides. - ------ - -**Example** - -- input file:: - - chr5_30800874_30802049 734 G A chr5 30801606 A 24 0 99 4 11 97 Y 496 0.502 0.033 0.215 6 - chr8_55117827_55119487 994 A G chr8 55118815 G 25 0 102 4 11 96 Y 22 0.502 0.025 2.365 1 - chr9_100484836_100485311 355 C T chr9 100485200 T 27 0 108 6 17 100 Y 190 0.512 0.880 2.733 4 - chr12_3635530_3637738 2101 T C chr12 3637630 T 25 0 102 4 13 93 Y 169 0.554 0.024 0.366 4 - -- output file:: - - chr5_30800874_30802049 734 G A 0.352964 - BglII,MboI,Sau3AI,Tru9I,XhoII - 1 CTGAAGGTGAGCAGGATTCAGGAGACAGAAAACAAAGCCCAGGCCTGCCCAAGGTGGAAA - >>>>>>>>>>>>>>>>>>>> - - 61 AGTCTAACAACTCGCCCTCTGCTTAnATCTGAGACTCACAGGGATAATAACACACTTGGT - - - 21 CAAGGAATAAACTAGATATTATTCACTCCTCTAGAAGGCTGCCAGGAAAATTGCCTGACT - <<<<<<< - - 181 TGAACCTTGGCTCTGA - <<<<<<<<<<<<< - etc. 
- </help> -</tool> diff -r 0cab089a6ad7305c3ec968c39a7032b640376363 -r 849fcfda099e3fb55a0cad7e0e478142930d2e8e tools/genome_diversity/genome_diversity.py --- a/tools/genome_diversity/genome_diversity.py +++ /dev/null @@ -1,266 +0,0 @@ -#!/usr/bin/env python2.5 - -import sys -import cdblib - -def _openfile( filename=None, mode='r' ): - try: - fh = open( filename, mode ) - except IOError, err: - raise RuntimeError( "can't open file: %s\n" % str( err ) ) - return fh - -def get_filename_from_loc( species=None, filename=None ): - fh = _openfile( filename ) - for line in fh: - if line and not line.startswith( '#' ): - line = line.rstrip( '\r\n' ) - if line: - elems = line.split( '\t' ) - if len( elems ) >= 2 and elems[0] == species: - return elems[1] - return None - - -class SnpFile( object ): - def __init__( self, filename=None, seq_col=1, pos_col=2, ref_seq_col=7, ref_pos_col=8 ): - self.filename = filename - self.fh = _openfile( filename ) - self.seq_col = seq_col - self.pos_col = pos_col - self.ref_seq_col = ref_seq_col - self.ref_pos_col = ref_pos_col - self.elems = None - self.line = None - self.comments = [] - - def next( self ): - while self.fh: - try: - self.line = self.fh.next() - except StopIteration: - self.line = None - self.elems = None - return None - if self.line: - self.line = self.line.rstrip( '\r\n' ) - if self.line: - if self.line.startswith( '#' ): - self.comments.append( self.line ) - else: - self.elems = self.line.split( '\t' ) - return 1 - - def get_seq_pos( self ): - if self.elems: - return self.elems[ self.seq_col - 1 ], self.elems[ self.pos_col - 1 ] - else: - return None, None - - def get_ref_seq_pos( self ): - if self.elems: - return self.elems[ self.ref_seq_seq - 1 ], self.elems[ self.ref_pos_col - 1 ] - else: - return None, None - - -class IndexedFile( object ): - - def __init__( self, data_file=None, index_file=None ): - self.data_file = data_file - self.index_file = index_file - self.data_fh = _openfile( data_file ) - self.index_fh = _openfile( index_file ) - self._reader = cdblib.Reader( self.index_fh.read(), hash ) - - def get_indexed_line( self, key=None ): - line = None - if key in self._reader: - offset = self._reader.getint( key ) - self.data_fh.seek( offset ) - try: - line = self.data_fh.next() - except StopIteration: - raise RuntimeError( 'index file out of sync for %s' % key ) - return line - -class PrimersFile( IndexedFile ): - def get_primer_header( self, sequence=None, position=None ): - key = "%s %s" % ( str( sequence ), str( position ) ) - header = self.get_indexed_line( key ) - if header: - if header.startswith( '>' ): - elems = header.split() - if len( elems ) < 3: - raise RuntimeError( 'short primers header for %s' % key ) - if sequence != elems[1] or str( position ) != elems[2]: - raise RuntimeError( 'primers index for %s finds %s %s' % ( key, elems[1], elems[2] ) ) - else: - raise RuntimeError( 'primers index out of sync for %s' % key ) - return header - - def get_entry( self, sequence=None, position=None ): - entry = self.get_primer_header( sequence, position ) - if entry: - while self.data_fh: - try: - line = self.data_fh.next() - except StopIteration: - break - if line.startswith( '>' ): - break - entry += line - return entry - - def get_enzymes( self, sequence=None, position=None ): - entry = self.get_primer_header( sequence, position ) - enzyme_list = [] - if entry: - try: - line = self.data_fh.next() - except StopIteration: - raise RuntimeError( 'primers entry for %s %s is truncated' % ( str( sequence ), str( position ) ) ) - if 
line.startswith( '>' ): - raise RuntimeError( 'primers entry for %s %s is truncated' % ( str( sequence ), str( position ) ) ) - line.rstrip( '\r\n' ) - if line: - enzymes = line.split( ',' ) - for enzyme in enzymes: - enzyme = enzyme.strip() - if enzyme: - enzyme_list.append( enzyme ) - return enzyme_list - -class SnpcallsFile( IndexedFile ): - def get_snp_seq( self, sequence=None, position=None ): - key = "%s %s" % ( str( sequence ), str( position ) ) - line = self.get_indexed_line( key ) - if line: - elems = line.split( '\t' ) - if len (elems) < 3: - raise RuntimeError( 'short snpcalls line for %s' % key ) - if sequence != elems[0] or str( position ) != elems[1]: - raise RuntimeError( 'snpcalls index for %s finds %s %s' % ( key, elems[0], elems[1] ) ) - return elems[2] - else: - return None - - def get_flanking_dna( self, sequence=None, position=None, format='fasta' ): - if format != 'fasta' and format != 'primer3': - raise RuntimeError( 'invalid format for flanking dna: %s' % str( format ) ) - seq = self.get_snp_seq( sequence, position ) - if seq: - p = seq.find('[') - if p == -1: - raise RuntimeError( 'snpcalls entry for %s %s missing left bracket: %s' % ( str( sequence ), str( position ), seq ) ) - q = seq.find(']', p + 1) - if q == -1: - raise RuntimeError( 'snpcalls entry for %s %s missing right bracket: %s' % ( str( sequence ), str( position ), seq ) ) - q += 1 - - if format == 'fasta': - flanking_seq = '> ' - else: - flanking_seq = 'SEQUENCE_ID=' - - flanking_seq += "%s %s %s %s\n" % ( str( sequence ), str( position ), seq[p+1], seq[p+3] ) - - if format == 'primer3': - flanking_seq += 'SEQUENCE_TEMPLATE=' - - flanking_seq += "%sn%s\n" % ( seq[0:p], seq[q:] ) - - if format == 'primer3': - flanking_seq += "SEQUENCE_TARGET=%d,11\n=\n" % ( p - 5 ) - - return flanking_seq - else: - return None - - - -class LocationFile( object ): - def __init__(self, filename): - self.build_map(filename) - - def build_map(self, filename): - self.map = {} - self.open_file(filename) - for line in self.read_lines(): - elems = line.split('\t', 1) - if len(elems) == 2: - self.map[ elems[0].strip() ] = elems[1].strip() - self.close_file() - - def read_lines(self): - for line in self.fh: - if not line.startswith('#'): - line = line.rstrip('\r\n') - yield line - - def open_file(self, filename): - self.filename = filename - try: - self.fh = open(filename, 'r') - except IOError, err: - print >> sys.stderr, "Error opening location file '%s': %s" % (filename, str(err)) - sys.exit(1) - - def close_file(self): - self.fh.close() - - def loc_file( self, key ): - if key in self.map: - return self.map[key] - else: - print >> sys.stderr, "'%s' does not appear in location file '%s'" % (key, self.filename) - sys.exit(1) - -class ChrLens( object ): - def __init__( self, location_file, species ): - self.chrlen_loc = LocationFile( location_file ) - self.chrlen_filename = self.chrlen_loc.loc_file( species ) - self.build_map() - - def build_map(self): - self.map = {} - self.open_file(self.chrlen_filename) - for line in self.read_lines(): - elems = line.split('\t', 1) - if len(elems) == 2: - chrom = elems[0].strip() - chrom_len_text = elems[1].strip() - try: - chrom_len = int( chrom_len_text ) - except ValueError: - print >> sys.stderr, "Bad length '%s' for chromosome '%s' in '%s'" % (chrom_len_text, chrom, self.chrlen_filename) - self.map[ chrom ] = chrom_len - self.close_file() - - def read_lines(self): - for line in self.fh: - if not line.startswith('#'): - line = line.rstrip('\r\n') - yield line - - def open_file(self, 
filename): - self.filename = filename - try: - self.fh = open(filename, 'r') - except IOError, err: - print >> sys.stderr, "Error opening chromosome length file '%s': %s" % (filename, str(err)) - sys.exit(1) - - def close_file(self): - self.fh.close() - - def length( self, key ): - if key in self.map: - return self.map[key] - else: - return None - - def __iter__( self ): - for chrom in self.map: - yield chrom - diff -r 0cab089a6ad7305c3ec968c39a7032b640376363 -r 849fcfda099e3fb55a0cad7e0e478142930d2e8e tools/genome_diversity/select_restriction_enzymes.py --- a/tools/genome_diversity/select_restriction_enzymes.py +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python2.5 - -import os -import sys -from optparse import OptionParser -import genome_diversity as gd - -def main_function( parse_arguments=None ): - if parse_arguments is None: - parse_arguments = lambda arguments: ( None, arguments ) - def main_decorator( to_decorate ): - def decorated_main( arguments=None ): - if arguments is None: - arguments = sys.argv - options, arguments = parse_arguments( arguments ) - rc = 1 - try: - rc = to_decorate( options, arguments ) - except Exception, err: - sys.stderr.write( 'ERROR: %s\n' % str( err ) ) - traceback.print_exc() - finally: - sys.exit( rc ) - return decorated_main - return main_decorator - -def parse_arguments( arguments ): - parser = OptionParser() - parser.add_option('--input', - type='string', dest='input', - help='file of selected SNPs') - parser.add_option('--output', - type='string', dest='output', - help='output file') - parser.add_option('--primers_loc', - type='string', dest='primers_loc', - help='primers .loc file') - parser.add_option('--scaffold_col', - type="int", dest='scaffold_col', - help='scaffold column in the input file') - parser.add_option('--pos_col', - type="int", dest='pos_col', - help='position column in the input file') - parser.add_option('--enzyme_list', - type="string", dest='enzyme_list_string', - help='comma separated list of enzymes') - parser.add_option('--species', - type="string", dest='species', - help='species') - return parser.parse_args( arguments[1:] ) - - -@main_function( parse_arguments ) -def main( options, arguments ): - if not options.input: - raise RuntimeError( 'missing --input option' ) - if not options.output: - raise RuntimeError( 'missing --output option' ) - if not options.primers_loc: - raise RuntimeError( 'missing --primers_loc option' ) - if not options.scaffold_col: - raise RuntimeError( 'missing --scaffold_col option' ) - if not options.pos_col: - raise RuntimeError( 'missing --pos_col option' ) - if not options.enzyme_list_string: - raise RuntimeError( 'missing --enzyme_list option' ) - if not options.species: - raise RuntimeError( 'missing --species option' ) - - snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) ) - - out_fh = gd._openfile( options.output, 'w' ) - - enzyme_dict = {} - for enzyme in options.enzyme_list_string.split( ',' ): - enzyme = enzyme.strip() - if enzyme: - enzyme_dict[enzyme] = 1 - - primer_data_file = gd.get_filename_from_loc( options.species, options.primers_loc ) - file_root, file_ext = os.path.splitext( primer_data_file ) - primer_index_file = file_root + ".cdb" - primers = gd.PrimersFile( data_file=primer_data_file, index_file=primer_index_file ) - - comments_printed = False - - while snps.next(): - seq, pos = snps.get_seq_pos() - enzyme_list = primers.get_enzymes( seq, pos ) - for enzyme in enzyme_list: - if enzyme in enzyme_dict: - if not 
comments_printed: - for comment in snps.comments: - out_fh.write( "%s\n" % comment ) - comments_printed = True - out_fh.write( "%s\n" % snps.line ) - break - - out_fh.close() - -if __name__ == "__main__": - main() - diff -r 0cab089a6ad7305c3ec968c39a7032b640376363 -r 849fcfda099e3fb55a0cad7e0e478142930d2e8e tools/genome_diversity/select_restriction_enzymes.xml --- a/tools/genome_diversity/select_restriction_enzymes.xml +++ /dev/null @@ -1,90 +0,0 @@ -<tool id="gd_select_restriction_enzymes" name="Specify" version="1.0.0"> - <description>a set of restriction enzymes</description> - - <command interpreter="python2.5"> - select_restriction_enzymes.py "--input=$input" "--output=$output" "--primers_loc=${GALAXY_DATA_INDEX_DIR}/gd.primers.loc" - #if $override_metadata.choice == "0": - "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}" - #else - "--scaffold_col=$scaf_col" "--pos_col=$pos_col" "--species=$species" - #end if - "--enzyme_list=$enzymes" - </command> - - <inputs> - <param format="tabular" name="input" type="data" label="Selected SNPS dataset"/> - <conditional name="override_metadata"> - <param name="choice" type="select" format="integer" label="choose columns"> - <option value="0" selected="true">No, get columns from metadata</option> - <option value="1" >Yes, choose columns</option> - </param> - <when value="0"> - <!-- no options --> - </when> - <when value="1"> - <param name="scaf_col" type="data_column" data_ref="input" numerical="false" label="Column with scaffold"/> - <param name="pos_col" type="data_column" data_ref="input" numerical="true" label="Column with position"/> - <param name="species" type="select" label="Choose species"> - <options from_file="gd.species.txt"> - <column name="name" index="1"/> - <column name="value" index="0"/> - </options> - </param> - </when> - </conditional> - - <param name="enzymes" type="select" display="checkboxes" multiple="true" label="Choose enzymes"> - <options from_file="gd.restriction_enzymes.txt"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </inputs> - - <outputs> - <data format="wsf" name="output" metadata_source="input"/> - </outputs> - - <tests> - <test> - <param name="input" value="gd.sample.wsf" ftype="wsf"/> - <param name="choice" value="0"/> - <param name="enzymes" value="BanI,BstOI,Hsp92II"/> - <output name="output" file="gd.select_restriction_enzymes.wsf"/> - </test> - </tests> - - <help> -**What it does** - - It selects the SNPs that are differentially cut by at least one of the - specified restriction enzymes. The enzymes are required to cut the amplified - segment (for the specified PCR primers) only at the SNP. 
- ------ - -**Example** - -- input file:: - - chr2_75111355_75112576 314 A C L F chr2 75111676 C F 15 4 53 2 9 48 Y 96 0.369 0.355 0.396 0 - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr10_7434473_7435447 524 T C S S chr10 7435005 T S 11 5 90 14 0 69 Y 626 0.066 0.406 0.727 0 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr15_64470252_64471048 89 G A Y Y chr15 64470341 G Y 5 6 109 14 0 69 Y 312 0.247 0.998 0.393 0 - chr18_48070585_48071386 514 C T E K chr18 48071100 T K 7 7 46 14 0 69 Y 2 0.200 0.032 0.163 0 - chr18_50154905_50155664 304 A G Y C chr18 50155208 A Y 4 2 17 5 1 22 Y 8 0.022 0.996 0.128 0 - chr18_57379354_57380496 315 C T V V chr18 57379669 G V 11 0 60 9 6 62 Y 726 0.118 0.048 0.014 1 - chr19_14240610_14242055 232 C T A V chr19 14240840 C A 18 8 56 15 5 42 Y 73 0.003 0.153 0.835 0 - chr19_39866997_39874915 3117 C T P P chr19 39870110 C P 3 7 65 14 2 32 Y 6 0.321 0.911 0.462 4 - etc. - -- output file:: - - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr18_57379354_57380496 315 C T V V chr18 57379669 G V 11 0 60 9 6 62 Y 726 0.118 0.048 0.014 1 - chr19_39866997_39874915 3117 C T P P chr19 39870110 C P 3 7 65 14 2 32 Y 6 0.321 0.911 0.462 4 - etc. - </help> -</tool> diff -r 0cab089a6ad7305c3ec968c39a7032b640376363 -r 849fcfda099e3fb55a0cad7e0e478142930d2e8e tools/genome_diversity/select_snps.py --- a/tools/genome_diversity/select_snps.py +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env python - -import sys -import math -from optparse import OptionParser -import genome_diversity as gd - -def main_function(parse_arguments=None): - if parse_arguments is None: - parse_arguments = lambda arguments: (None, arguments) - def main_decorator(to_decorate): - def decorated_main(arguments=None): - if arguments is None: - arguments = sys.argv - options, arguments = parse_arguments(arguments) - sys.exit(to_decorate(options, arguments)) - return decorated_main - return main_decorator - -def parse_arguments(arguments): - parser = OptionParser() - parser.add_option('--input', dest='input') - parser.add_option('--output', dest='output') - parser.add_option('--chrlens_loc', dest='chrlens_loc') - parser.add_option('--num_snps', dest='num_snps') - parser.add_option('--ref_chrom_col', dest='ref_chrom_col') - parser.add_option('--ref_pos_col', dest='ref_pos_col') - parser.add_option('--species', dest='species') - return parser.parse_args(arguments[1:]) - -@main_function(parse_arguments) -def main(options, arguments): - - ref_chrom_idx = to_int( options.ref_chrom_col ) -1 - ref_pos_idx = to_int( options.ref_pos_col ) -1 - - if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx): - print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset." 
- sys.exit(1) - - chrlens = gd.ChrLens( options.chrlens_loc, options.species ) - - total_len = 0 - for chrom in chrlens: - total_len += chrlens.length(chrom) - - total_requested = int( options.num_snps ) - lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx ) - selected = select_snps( data, total_len, total_requested ) - out_data = fix_selection_and_order_like_input(data, selected, total_requested) - write_selected_snps( options.output, out_data, lines, comments ) - -def to_int( value ): - try: - int_value = int( value ) - except ValueError: - int_value = 0 - return int_value - -def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ): - fh = open( filename, 'r' ) - if (chrom_idx >= pos_idx): - needed = chrom_idx + 1 - else: - needed = pos_idx + 1 - lines = [] - data = [] - comments = [] - line_idx = 0 - line_num = 0 - for line in fh: - line_num += 1 - line = line.rstrip('\r\n') - if line: - if line.startswith('#'): - comments.append(line) - else: - elems = line.split('\t') - if len(elems) >= needed: - chrom = elems[chrom_idx] - try: - pos = int(elems[pos_idx]) - except ValueError: - sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) ) - sys.exit(1) - lines.append(line) - chrom_sort = chrom.lstrip('chr') - data.append( [chrom_sort, chrom, pos, line_num, line_idx] ) - line_idx += 1 - fh.close() - data = sorted( data, key=lambda x: (x[0], x[2]) ) - return lines, data, comments - -def select_snps( data, total_len, requested ): - old_chrom = None - next_print = 0 - selected = [] - space = total_len / requested - for data_idx, datum in enumerate( data ): - chrom = datum[1] - pos = datum[2] - if chrom != old_chrom: - old_chrom = chrom - next_print = 0 - if pos >= next_print: - selected.append(data_idx) - next_print += space - return selected - -def fix_selection_and_order_like_input(data, selected, requested): - total_selected = len( selected ) - a = float( total_selected ) / requested - b = a / 2 - - idx_list = [] - for i in range( requested ): - idx = int( math.ceil( i * a + b ) - 1 ) - idx_list.append( idx ) - - out_data = [] - - for i, data_idx in enumerate(selected): - if total_selected > requested: - if i in idx_list: - out_data.append(data[data_idx]) - else: - out_data.append(data[data_idx]) - - out_data = sorted( out_data, key=lambda x: x[3] ) - - return out_data - -def write_selected_snps( filename, data, lines, comments ): - fh = open( filename, 'w' ) - - for comment in comments: - fh.write("%s\n" % comment ) - - for datum in data: - line_idx = datum[4] - fh.write("%s\n" % lines[line_idx]) - - fh.close() - -if __name__ == "__main__": - main() - - diff -r 0cab089a6ad7305c3ec968c39a7032b640376363 -r 849fcfda099e3fb55a0cad7e0e478142930d2e8e tools/genome_diversity/select_snps.xml --- a/tools/genome_diversity/select_snps.xml +++ /dev/null @@ -1,87 +0,0 @@ -<tool id="gd_select_snps" name="Select" version="1.0.0"> - <description>a specified number of SNPs</description> - - <command interpreter="python"> - select_snps.py "--input=$input" "--output=$output" "--chrlens_loc=${GALAXY_DATA_INDEX_DIR}/gd.chrlens.loc" "--num_snps=$num_snps" - #if $override_metadata.choice == "0": - "--ref_chrom_col=${input.metadata.ref}" "--ref_pos_col=${input.metadata.rPos}" "--species=${input.metadata.species}" - #else - "--ref_chrom_col=$ref_col" "--ref_pos_col=$rpos_col" "--species=$species" - #end if - </command> - - <inputs> - <param format="tabular" name="input" type="data" label="Selected 
SNPS dataset"/> - <param name="num_snps" type="integer" value="10" optional="false" min="1" label="Number of SNPs"/> - <conditional name="override_metadata"> - <param name="choice" type="select" format="integer" label="choose columns"> - <option value="0" selected="true">No, get columns from metadata</option> - <option value="1" >Yes, choose columns</option> - </param> - <when value="0"> - <!-- no options --> - </when> - <when value="1"> - <param name="ref_col" type="data_column" data_ref="input" numerical="false" label="Column with reference chromosome"/> - <param name="rpos_col" type="data_column" data_ref="input" numerical="true" label="Column with reference position"/> - <param name="species" type="select" label="Choose species"> - <options from_file="gd.species.txt"> - <column name="name" index="1"/> - <column name="value" index="0"/> - </options> - </param> - </when> - </conditional> - </inputs> - - <outputs> - <data format="wsf" name="output" metadata_source="input"/> - </outputs> - - <tests> - <test> - <param name="input" value="gd.sample.wsf" ftype="wsf"/> - <param name="num_snps" value="5"/> - <param name="choice" value="0"/> - <output name="output" file="gd.select_snps.wsf"/> - </test> - </tests> - - - <help> -**What it does** - - It attempts to select a specified number of SNPs from the dataset, making them - approximately uniformly spaced relative to the reference genome. The number - actually selected may be slightly more than the specified number. - ------ - -**Example** - -- input file:: - - chr2_75111355_75112576 314 A C L F chr2 75111676 C F 15 4 53 2 9 48 Y 96 0.369 0.355 0.396 0 - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr10_7434473_7435447 524 T C S S chr10 7435005 T S 11 5 90 14 0 69 Y 626 0.066 0.406 0.727 0 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr15_64470252_64471048 89 G A Y Y chr15 64470341 G Y 5 6 109 14 0 69 Y 312 0.247 0.998 0.393 0 - chr18_48070585_48071386 514 C T E K chr18 48071100 T K 7 7 46 14 0 69 Y 2 0.200 0.032 0.163 0 - chr18_50154905_50155664 304 A G Y C chr18 50155208 A Y 4 2 17 5 1 22 Y 8 0.022 0.996 0.128 0 - chr18_57379354_57380496 315 C T V V chr18 57379669 G V 11 0 60 9 6 62 Y 726 0.118 0.048 0.014 1 - chr19_14240610_14242055 232 C T A V chr19 14240840 C A 18 8 56 15 5 42 Y 73 0.003 0.153 0.835 0 - chr19_39866997_39874915 3117 C T P P chr19 39870110 C P 3 7 65 14 2 32 Y 6 0.321 0.911 0.462 4 - etc. - -- output file:: - - chr2_75111355_75112576 314 A C L F chr2 75111676 C F 15 4 53 2 9 48 Y 96 0.369 0.355 0.396 0 - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr10_7434473_7435447 524 T C S S chr10 7435005 T S 11 5 90 14 0 69 Y 626 0.066 0.406 0.727 0 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr15_64470252_64471048 89 G A Y Y chr15 64470341 G Y 5 6 109 14 0 69 Y 312 0.247 0.998 0.393 0 - chr18_48070585_48071386 514 C T E K chr18 48071100 T K 7 7 46 14 0 69 Y 2 0.200 0.032 0.163 0 - chr19_14240610_14242055 232 C T A V chr19 14240840 C A 18 8 56 15 5 42 Y 73 0.003 0.153 0.835 0 - etc. - </help> -</tool> Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.