[galaxy-commits] galaxy-dist commit 9cada14f8765: Make VCF (variant call format) a Galaxy datatype and enable very basic VCF support in trackster. VCF datatype is sniffable and can be converted to summary tree and interval index. In trackster, VCF files are represented as single-base pair feature tracks.

20 Nov 2010

# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User jeremy goecks <jeremy.goecks@emory.edu>
# Date 1286396635 14400
# Node ID 9cada14f87657b951ccc9a76af5de03263ba72d9
# Parent  75a3e2a75cdaf8872ef3cafd745524dab2872962
Make VCF (variant call format) a Galaxy datatype and enable very basic VCF support in trackster. VCF datatype is sniffable and can be converted to summary tree and interval index. In trackster, VCF files are represented as single-base pair feature tracks.

--- a/lib/galaxy/datatypes/tabular.py
+++ b/lib/galaxy/datatypes/tabular.py
@@ -11,6 +11,7 @@ from galaxy import util
 from cgi import escape
 from galaxy.datatypes import metadata
 from galaxy.datatypes.metadata import MetadataElement
+import galaxy_utils.sequence.vcf
 from sniff import *
 
 log = logging.getLogger(__name__)
@@ -459,3 +460,18 @@ class ElandMulti( Tabular ):
     
     def sniff( self, filename ):
         return False
+        
+class Vcf( Tabular ):
+    file_ext = 'vcf'
+    
+    def sniff( self, filename ):
+        try:
+            # If reader can read and parse file, it's VCF.
+            for line in list( galaxy_utils.sequence.vcf.Reader( open( filename ) ) ):
+                pass
+            return True
+        except:
+            return False
+        
+    def get_track_type( self ):
+        return "FeatureTrack", {"data": "interval_index", "index": "summary_tree"}

--- a/lib/galaxy/visualization/tracks/data/summary_tree.py
+++ b/lib/galaxy/visualization/tracks/data/summary_tree.py
@@ -19,6 +19,10 @@ class SummaryTreeDataProvider( object ):
         if st is None:
             st = summary_tree_from_file( self.dataset.file_name )
             CACHE[filename] = st
+            
+        # If chrom is not found in blocks, try removing the first three 
+        # characters (e.g. 'chr') and see if that works. This enables the
+        # provider to handle chrome names defined as chrXXX and as XXX.
         if chrom in st.chrom_blocks:
             pass
         elif chrom[3:] in st.chrom_blocks:

--- a/lib/galaxy/visualization/tracks/data/interval_index.py
+++ b/lib/galaxy/visualization/tracks/data/interval_index.py
@@ -8,6 +8,7 @@ Payload format: [ uid (offset), start, e
 import pkg_resources; pkg_resources.require( "bx-python" )
 from bx.interval_index_file import Indexes
 from galaxy.datatypes.interval import Bed, Gff
+from galaxy.datatypes.tabular import Vcf
 
 MAX_VALS = 5000 # only display first MAX_VALS features
 
@@ -18,13 +19,19 @@ class IntervalIndexDataProvider( object 
     
     def get_data( self, chrom, start, end, **kwargs ):
         start, end = int(start), int(end)
-        chrom = str(chrom)
         source = open( self.original_dataset.file_name )
         index = Indexes( self.converted_dataset.file_name )
         results = []
         count = 0
         message = None
         
+        # If chrom is not found in indexes, try removing the first three 
+        # characters (e.g. 'chr') and see if that works. This enables the
+        # provider to handle chrome names defined as chrXXX and as XXX.
+        chrom = str(chrom)
+        if chrom not in index.indexes and chrom[3:] in index.indexes:
+            chrom = chrom[3:]
+        
         for start, end, offset in index.find(chrom, start, end):
             if count >= MAX_VALS:
                 message = "Only the first %s features are being displayed." % MAX_VALS
@@ -34,6 +41,7 @@ class IntervalIndexDataProvider( object 
             feature = source.readline().split()
             payload = [ offset, start, end ]
             # TODO: can we use column metadata to fill out payload?
+            # TODO: use function to set payload data
             if "no_detail" not in kwargs:
                 length = len(feature)
                 if isinstance( self.original_dataset.datatype, Gff ):
@@ -58,7 +66,10 @@ class IntervalIndexDataProvider( object 
                         block_starts = [ int(n) for n in feature[11].split(',') if n != '' ]
                         blocks = zip(block_sizes, block_starts)
                         payload.append( [ (start + block[1], start + block[1] + block[0]) for block in blocks] )
-
+                elif isinstance( self.original_dataset.datatype, Vcf ):
+                    # VCF dataset.
+                    payload.append( feature[2] ) # name
+                    
             results.append(payload)
         
         return { 'data': results, 'message': message }

--- a/datatypes_conf.xml.sample
+++ b/datatypes_conf.xml.sample
@@ -111,6 +111,10 @@
         <datatype extension="tabular" type="galaxy.datatypes.tabular:Tabular" display_in_upload="true"/><datatype extension="txt" type="galaxy.datatypes.data:Text" display_in_upload="true"/><datatype extension="blastxml" type="galaxy.datatypes.xml:BlastXml" display_in_upload="true"/>
+        <datatype extension="vcf" type="galaxy.datatypes.tabular:Vcf" display_in_upload="true">
+            <converter file="vcf_to_interval_index_converter.xml" target_datatype="interval_index"/>
+            <converter file="vcf_to_summary_tree_converter.xml" target_datatype="summary_tree"/>    
+        </datatype><datatype extension="velvet" type="galaxy.datatypes.assembly:Velvet" display_in_upload="false"/><datatype extension="wig" type="galaxy.datatypes.interval:Wiggle" display_in_upload="true"><converter file="wiggle_to_array_tree_converter.xml" target_datatype="array_tree"/>
@@ -285,6 +289,7 @@
         <sniffer type="galaxy.datatypes.tabular:Pileup"/><sniffer type="galaxy.datatypes.interval:Interval"/><sniffer type="galaxy.datatypes.tabular:Sam"/>
+        <sniffer type="galaxy.datatypes.tabular:Vcf"/><!--
         Keep this commented until the sniff method in the assembly.py
         module is fixed to not read the entire file.

--- a/lib/galaxy/web/controllers/tracks.py
+++ b/lib/galaxy/web/controllers/tracks.py
@@ -9,7 +9,7 @@ is redirected to the browser interface, 
 
 """
 
-import math, re, logging, glob, pkg_resources
+import re, pkg_resources
 pkg_resources.require( "bx-python" )
 
 from bx.seq.twobit import TwoBitFile
@@ -17,7 +17,7 @@ from galaxy import model
 from galaxy.util.json import to_json_string, from_json_string
 from galaxy.web.base.controller import *
 from galaxy.web.framework import simplejson
-from galaxy.web.framework.helpers import time_ago, grids
+from galaxy.web.framework.helpers import grids
 from galaxy.util.bunch import Bunch
 
 from galaxy.visualization.tracks.data.array_tree import ArrayTreeDataProvider

--- /dev/null
+++ b/lib/galaxy/datatypes/converters/vcf_to_interval_index_converter.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+
+"""
+Convert from VCF file to interval index file.
+"""
+
+from __future__ import division
+
+import optparse
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+import galaxy_utils.sequence.vcf
+from bx.interval_index_file import Indexes
+
+def main():
+    # Read options, args.
+    parser = optparse.OptionParser()
+    (options, args) = parser.parse_args()
+    in_file, out_file = args    
+    
+    # Do conversion.
+    index = Indexes()
+    reader = galaxy_utils.sequence.vcf.Reader( open( in_file ) ) 
+    offset = reader.metadata_len
+    for vcf_line in reader:
+        # VCF format provides a chrom and 1-based position for each variant. 
+        # IntervalIndex expects 0-based coordinates.
+        index.add( vcf_line.chrom, vcf_line.pos-1, vcf_line.pos, offset )
+        offset += len( vcf_line.raw_line )
+            
+    index.write( open( out_file, "w" ) )
+
+if __name__ == "__main__": 
+    main()
+    

--- /dev/null
+++ b/lib/galaxy/datatypes/converters/vcf_to_summary_tree_converter.xml
@@ -0,0 +1,14 @@
+<tool id="CONVERTER_vcf_to_summary_tree_0" name="Convert VCF to Summary Tree" version="1.0.0" hidden="true">
+  <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description>
+  <command interpreter="python">vcf_to_summary_tree_converter.py $input1 $output1</command>
+  <inputs>
+    <page>
+        <param format="vcf" name="input1" type="data" label="Choose VCF file"/>
+    </page>
+   </inputs>
+  <outputs>
+    <data format="summary_tree" name="output1"/>
+  </outputs>
+  <help>
+  </help>
+</tool>

--- /dev/null
+++ b/lib/galaxy/datatypes/converters/vcf_to_summary_tree_converter.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+
+"""
+Convert from VCF file to summary tree file.
+
+usage: %prog in_file out_file
+"""
+from __future__ import division
+
+import optparse
+import galaxy_utils.sequence.vcf
+from galaxy.visualization.tracks.summary import SummaryTree
+
+def main():
+    # Read options, args.
+    parser = optparse.OptionParser()
+    (options, args) = parser.parse_args()
+    in_file, out_file = args
+        
+    # Do conversion.
+    st = SummaryTree(block_size=25, levels=6, draw_cutoff=150, detail_cutoff=30)
+    for line in list( galaxy_utils.sequence.vcf.Reader( open( in_file ) ) ):
+        # VCF format provides a chrom and 1-based position for each variant. 
+        # SummaryTree expects 0-based coordinates.
+        st.insert_range( line.chrom, long( line.pos-1 ), long( line.pos ) )
+    
+    st.write(out_file)
+
+if __name__ == "__main__": 
+    main()

--- /dev/null
+++ b/lib/galaxy/datatypes/converters/vcf_to_interval_index_converter.xml
@@ -0,0 +1,14 @@
+<tool id="CONVERTER_vcf_to_interval_index_0" name="Convert VCF to Interval Index" version="1.0.0" hidden="true">
+  <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description>
+  <command interpreter="python">vcf_to_interval_index_converter.py $input1 $output1</command>
+  <inputs>
+    <page>
+        <param format="vcf" name="input1" type="data" label="Choose VCF file"/>
+    </page>
+   </inputs>
+  <outputs>
+    <data format="interval_index" name="output1"/>
+  </outputs>
+  <help>
+  </help>
+</tool>

--- a/lib/galaxy_utils/sequence/vcf.py
+++ b/lib/galaxy_utils/sequence/vcf.py
@@ -23,6 +23,8 @@ class VariantCall33( VariantCall ):
     required_header_length = len( required_header_fields )
     
     def __init__( self, vcf_line, metadata, sample_names ):
+        # Raw line is needed for indexing file.
+        self.raw_line = vcf_line
         self.line = vcf_line.rstrip( '\n\r' )
         self.metadata = metadata
         self.sample_names = sample_names
@@ -59,10 +61,14 @@ class Reader( object ):
         self.vcf_file = fh
         self.metadata = {}
         self.header_fields = None
+        self.metadata_len = 0
         self.sample_names = []
         self.vcf_class = None
+        
+        # Read file metadata.
         while True:
             line = self.vcf_file.readline()
+            self.metadata_len += len( line )
             assert line, 'Invalid VCF file provided.'
             line = line.rstrip( '\r\n' )
             if self.vcf_class and line.startswith( self.vcf_class.header_startswith ):

    

commits-noreply＠bitbucket.org