# HG changeset patch -- Bitbucket.org # Project galaxy-dist # URL http://bitbucket.org/galaxy/galaxy-dist/overview # User jeremy goecks <jeremy.goecks@emory.edu> # Date 1286396635 14400 # Node ID 9cada14f87657b951ccc9a76af5de03263ba72d9 # Parent 75a3e2a75cdaf8872ef3cafd745524dab2872962 Make VCF (variant call format) a Galaxy datatype and enable very basic VCF support in trackster. VCF datatype is sniffable and can be converted to summary tree and interval index. In trackster, VCF files are represented as single-base pair feature tracks. --- a/lib/galaxy/datatypes/tabular.py +++ b/lib/galaxy/datatypes/tabular.py @@ -11,6 +11,7 @@ from galaxy import util from cgi import escape from galaxy.datatypes import metadata from galaxy.datatypes.metadata import MetadataElement +import galaxy_utils.sequence.vcf from sniff import * log = logging.getLogger(__name__) @@ -459,3 +460,18 @@ class ElandMulti( Tabular ): def sniff( self, filename ): return False + +class Vcf( Tabular ): + file_ext = 'vcf' + + def sniff( self, filename ): + try: + # If reader can read and parse file, it's VCF. + for line in list( galaxy_utils.sequence.vcf.Reader( open( filename ) ) ): + pass + return True + except: + return False + + def get_track_type( self ): + return "FeatureTrack", {"data": "interval_index", "index": "summary_tree"} --- a/lib/galaxy/visualization/tracks/data/summary_tree.py +++ b/lib/galaxy/visualization/tracks/data/summary_tree.py @@ -19,6 +19,10 @@ class SummaryTreeDataProvider( object ): if st is None: st = summary_tree_from_file( self.dataset.file_name ) CACHE[filename] = st + + # If chrom is not found in blocks, try removing the first three + # characters (e.g. 'chr') and see if that works. This enables the + # provider to handle chrome names defined as chrXXX and as XXX. if chrom in st.chrom_blocks: pass elif chrom[3:] in st.chrom_blocks: --- a/lib/galaxy/visualization/tracks/data/interval_index.py +++ b/lib/galaxy/visualization/tracks/data/interval_index.py @@ -8,6 +8,7 @@ Payload format: [ uid (offset), start, e import pkg_resources; pkg_resources.require( "bx-python" ) from bx.interval_index_file import Indexes from galaxy.datatypes.interval import Bed, Gff +from galaxy.datatypes.tabular import Vcf MAX_VALS = 5000 # only display first MAX_VALS features @@ -18,13 +19,19 @@ class IntervalIndexDataProvider( object def get_data( self, chrom, start, end, **kwargs ): start, end = int(start), int(end) - chrom = str(chrom) source = open( self.original_dataset.file_name ) index = Indexes( self.converted_dataset.file_name ) results = [] count = 0 message = None + # If chrom is not found in indexes, try removing the first three + # characters (e.g. 'chr') and see if that works. This enables the + # provider to handle chrome names defined as chrXXX and as XXX. + chrom = str(chrom) + if chrom not in index.indexes and chrom[3:] in index.indexes: + chrom = chrom[3:] + for start, end, offset in index.find(chrom, start, end): if count >= MAX_VALS: message = "Only the first %s features are being displayed." % MAX_VALS @@ -34,6 +41,7 @@ class IntervalIndexDataProvider( object feature = source.readline().split() payload = [ offset, start, end ] # TODO: can we use column metadata to fill out payload? + # TODO: use function to set payload data if "no_detail" not in kwargs: length = len(feature) if isinstance( self.original_dataset.datatype, Gff ): @@ -58,7 +66,10 @@ class IntervalIndexDataProvider( object block_starts = [ int(n) for n in feature[11].split(',') if n != '' ] blocks = zip(block_sizes, block_starts) payload.append( [ (start + block[1], start + block[1] + block[0]) for block in blocks] ) - + elif isinstance( self.original_dataset.datatype, Vcf ): + # VCF dataset. + payload.append( feature[2] ) # name + results.append(payload) return { 'data': results, 'message': message } --- a/datatypes_conf.xml.sample +++ b/datatypes_conf.xml.sample @@ -111,6 +111,10 @@ <datatype extension="tabular" type="galaxy.datatypes.tabular:Tabular" display_in_upload="true"/><datatype extension="txt" type="galaxy.datatypes.data:Text" display_in_upload="true"/><datatype extension="blastxml" type="galaxy.datatypes.xml:BlastXml" display_in_upload="true"/> + <datatype extension="vcf" type="galaxy.datatypes.tabular:Vcf" display_in_upload="true"> + <converter file="vcf_to_interval_index_converter.xml" target_datatype="interval_index"/> + <converter file="vcf_to_summary_tree_converter.xml" target_datatype="summary_tree"/> + </datatype><datatype extension="velvet" type="galaxy.datatypes.assembly:Velvet" display_in_upload="false"/><datatype extension="wig" type="galaxy.datatypes.interval:Wiggle" display_in_upload="true"><converter file="wiggle_to_array_tree_converter.xml" target_datatype="array_tree"/> @@ -285,6 +289,7 @@ <sniffer type="galaxy.datatypes.tabular:Pileup"/><sniffer type="galaxy.datatypes.interval:Interval"/><sniffer type="galaxy.datatypes.tabular:Sam"/> + <sniffer type="galaxy.datatypes.tabular:Vcf"/><!-- Keep this commented until the sniff method in the assembly.py module is fixed to not read the entire file. --- a/lib/galaxy/web/controllers/tracks.py +++ b/lib/galaxy/web/controllers/tracks.py @@ -9,7 +9,7 @@ is redirected to the browser interface, """ -import math, re, logging, glob, pkg_resources +import re, pkg_resources pkg_resources.require( "bx-python" ) from bx.seq.twobit import TwoBitFile @@ -17,7 +17,7 @@ from galaxy import model from galaxy.util.json import to_json_string, from_json_string from galaxy.web.base.controller import * from galaxy.web.framework import simplejson -from galaxy.web.framework.helpers import time_ago, grids +from galaxy.web.framework.helpers import grids from galaxy.util.bunch import Bunch from galaxy.visualization.tracks.data.array_tree import ArrayTreeDataProvider --- /dev/null +++ b/lib/galaxy/datatypes/converters/vcf_to_interval_index_converter.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python + +""" +Convert from VCF file to interval index file. +""" + +from __future__ import division + +import optparse +from galaxy import eggs +import pkg_resources; pkg_resources.require( "bx-python" ) +import galaxy_utils.sequence.vcf +from bx.interval_index_file import Indexes + +def main(): + # Read options, args. + parser = optparse.OptionParser() + (options, args) = parser.parse_args() + in_file, out_file = args + + # Do conversion. + index = Indexes() + reader = galaxy_utils.sequence.vcf.Reader( open( in_file ) ) + offset = reader.metadata_len + for vcf_line in reader: + # VCF format provides a chrom and 1-based position for each variant. + # IntervalIndex expects 0-based coordinates. + index.add( vcf_line.chrom, vcf_line.pos-1, vcf_line.pos, offset ) + offset += len( vcf_line.raw_line ) + + index.write( open( out_file, "w" ) ) + +if __name__ == "__main__": + main() + --- /dev/null +++ b/lib/galaxy/datatypes/converters/vcf_to_summary_tree_converter.xml @@ -0,0 +1,14 @@ +<tool id="CONVERTER_vcf_to_summary_tree_0" name="Convert VCF to Summary Tree" version="1.0.0" hidden="true"> + <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description> + <command interpreter="python">vcf_to_summary_tree_converter.py $input1 $output1</command> + <inputs> + <page> + <param format="vcf" name="input1" type="data" label="Choose VCF file"/> + </page> + </inputs> + <outputs> + <data format="summary_tree" name="output1"/> + </outputs> + <help> + </help> +</tool> --- /dev/null +++ b/lib/galaxy/datatypes/converters/vcf_to_summary_tree_converter.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +""" +Convert from VCF file to summary tree file. + +usage: %prog in_file out_file +""" +from __future__ import division + +import optparse +import galaxy_utils.sequence.vcf +from galaxy.visualization.tracks.summary import SummaryTree + +def main(): + # Read options, args. + parser = optparse.OptionParser() + (options, args) = parser.parse_args() + in_file, out_file = args + + # Do conversion. + st = SummaryTree(block_size=25, levels=6, draw_cutoff=150, detail_cutoff=30) + for line in list( galaxy_utils.sequence.vcf.Reader( open( in_file ) ) ): + # VCF format provides a chrom and 1-based position for each variant. + # SummaryTree expects 0-based coordinates. + st.insert_range( line.chrom, long( line.pos-1 ), long( line.pos ) ) + + st.write(out_file) + +if __name__ == "__main__": + main() --- /dev/null +++ b/lib/galaxy/datatypes/converters/vcf_to_interval_index_converter.xml @@ -0,0 +1,14 @@ +<tool id="CONVERTER_vcf_to_interval_index_0" name="Convert VCF to Interval Index" version="1.0.0" hidden="true"> + <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description> + <command interpreter="python">vcf_to_interval_index_converter.py $input1 $output1</command> + <inputs> + <page> + <param format="vcf" name="input1" type="data" label="Choose VCF file"/> + </page> + </inputs> + <outputs> + <data format="interval_index" name="output1"/> + </outputs> + <help> + </help> +</tool> --- a/lib/galaxy_utils/sequence/vcf.py +++ b/lib/galaxy_utils/sequence/vcf.py @@ -23,6 +23,8 @@ class VariantCall33( VariantCall ): required_header_length = len( required_header_fields ) def __init__( self, vcf_line, metadata, sample_names ): + # Raw line is needed for indexing file. + self.raw_line = vcf_line self.line = vcf_line.rstrip( '\n\r' ) self.metadata = metadata self.sample_names = sample_names @@ -59,10 +61,14 @@ class Reader( object ): self.vcf_file = fh self.metadata = {} self.header_fields = None + self.metadata_len = 0 self.sample_names = [] self.vcf_class = None + + # Read file metadata. while True: line = self.vcf_file.readline() + self.metadata_len += len( line ) assert line, 'Invalid VCF file provided.' line = line.rstrip( '\r\n' ) if self.vcf_class and line.startswith( self.vcf_class.header_startswith ):