[galaxy-dev] [hg] galaxy 3591: Add bed6 and bed12 datatypes, which are subclasses of bedstrict

16 Apr 2010

details:   http://www.bx.psu.edu/hg/galaxy/rev/40e8c99829e0
changeset: 3591:40e8c99829e0
user:      Dan Blankenberg <dan@bx.psu.edu>
date:      Thu Apr 01 13:08:48 2010 -0400
description:
Add bed6 and bed12 datatypes, which are subclasses of bedstrict; converters are available to turn any interval datatype into these types. Change GeneTrack Indexer tool and converter to use bed6 as input.

diffstat:

 datatypes_conf.xml.sample                                          |   7 +-
 display_applications/genetrack.xml                                 |   2 +-
 lib/galaxy/datatypes/converters/bed_to_genetrack_converter.xml     |   4 +-
 lib/galaxy/datatypes/converters/interval_to_bed12_converter.xml    |  15 +
 lib/galaxy/datatypes/converters/interval_to_bed6_converter.xml     |  15 +
 lib/galaxy/datatypes/converters/interval_to_bedstrict_converter.py |  81 +++++++--
 lib/galaxy/datatypes/interval.py                                   |  10 +
 tools/genetrack/genetrack_indexer.xml                              |   4 +-
 8 files changed, 111 insertions(+), 27 deletions(-)

diffs (261 lines):

diff -r fa70c688cda1 -r 40e8c99829e0 datatypes_conf.xml.sample

--- a/datatypes_conf.xml.sample	Thu Apr 01 10:56:59 2010 -0400
+++ b/datatypes_conf.xml.sample	Thu Apr 01 13:08:48 2010 -0400
@@ -13,11 +13,14 @@
             <converter file="interval_to_coverage.xml" target_datatype="coverage"/>
             <converter file="bed_to_interval_index_converter.xml" target_datatype="interval_index"/>
             <converter file="bed_to_summary_tree_converter.xml" target_datatype="summary_tree"/>
-            <converter file="bed_to_genetrack_converter.xml" target_datatype="genetrack"/>
             <!-- <display file="ucsc/interval_as_bed.xml" /> -->
             <display file="genetrack.xml" />
         </datatype>
         <datatype extension="bedstrict" type="galaxy.datatypes.interval:BedStrict" />
+        <datatype extension="bed6" type="galaxy.datatypes.interval:Bed6">
+            <converter file="bed_to_genetrack_converter.xml" target_datatype="genetrack"/>
+        </datatype>
+        <datatype extension="bed12" type="galaxy.datatypes.interval:Bed12" />
         <datatype extension="binseq.zip" type="galaxy.datatypes.binary:Binseq" mimetype="application/zip" display_in_upload="true"/>
         <datatype extension="len" type="galaxy.datatypes.chrominfo:ChromInfo" display_in_upload="true">
             <!-- no converters yet -->
@@ -49,6 +52,8 @@
         <datatype extension="interval" type="galaxy.datatypes.interval:Interval" display_in_upload="true">
             <converter file="interval_to_bed_converter.xml" target_datatype="bed"/>
             <converter file="interval_to_bedstrict_converter.xml" target_datatype="bedstrict"/>
+            <converter file="interval_to_bed6_converter.xml" target_datatype="bed6"/>
+            <converter file="interval_to_bed12_converter.xml" target_datatype="bed12"/>
             <indexer file="interval_awk.xml" />
             <!-- <display file="ucsc/interval_as_bed.xml" inherit="True" /> -->
             <display file="genetrack.xml" inherit="True"/>
diff -r fa70c688cda1 -r 40e8c99829e0 display_applications/genetrack.xml
--- a/display_applications/genetrack.xml	Thu Apr 01 10:56:59 2010 -0400
+++ b/display_applications/genetrack.xml	Thu Apr 01 13:08:48 2010 -0400
@@ -1,7 +1,7 @@
 <display id="genetrack_interval" version="1.0.0" name="view in">
     <link id="genetrack" name="GeneTrack">
         <url target_frame="galaxy_main">http://genetrack.g2.bx.psu.edu/galaxy?filename=${encoded_filename.qp}&hashkey=${hash_key.qp}&input=${qp(str($genetrack_file.id))}&GALAXY_URL=${galaxy_url.qp}</url>
-        <param type="data" name="bed_file" viewable="False" format="bed,genetrack"/> <!-- for now, we'll explicitly take care of the multi-step conversion; walk genetrack datatype down as a conversion of genetrack to genetrack doesn't exist and would likely be pointless -->
+        <param type="data" name="bed_file" viewable="False" format="bed6,genetrack"/> <!-- for now, we'll explicitly take care of the multi-step conversion; walk genetrack datatype down as a conversion of genetrack to genetrack doesn't exist and would likely be pointless -->
         <param type="data" dataset="bed_file" name="genetrack_file" format="genetrack" viewable="False" />
         <param type="template" name="galaxy_url" strip="True" >
             ${BASE_URL}/tool_runner?tool_id=predict2genetrack
diff -r fa70c688cda1 -r 40e8c99829e0 lib/galaxy/datatypes/converters/bed_to_genetrack_converter.xml
--- a/lib/galaxy/datatypes/converters/bed_to_genetrack_converter.xml	Thu Apr 01 10:56:59 2010 -0400
+++ b/lib/galaxy/datatypes/converters/bed_to_genetrack_converter.xml	Thu Apr 01 13:08:48 2010 -0400
@@ -1,4 +1,4 @@
-<tool id="CONVERTER_bed_to_genetrack_0" name="Convert BED to GeneTrack Index" version="1.0.0">
+<tool id="CONVERTER_bed_to_genetrack_0" name="Convert BED to GeneTrack Index" version="1.0.1">
 <!-- FIXME: THIS IS ALMOST 1:1 COPY OF THE SAME FUNCTIONED TOOL - ALLOW REGULAR TOOLS TO MASCARADE AS CONVERTERS 
 Using a shift of 0, but tool allows specifying...
 -->
@@ -6,7 +6,7 @@
   <command interpreter="python">bed_to_genetrack_converter.py -i $input1 -o $output1 -s 0 -v 0 -f BED -x</command>
   <inputs>
     <page>
-        <param format="bed" name="input1" type="data" label="Choose BED file"/>
+        <param format="bed6" name="input1" type="data" label="Choose BED file"/>
     </page>
    </inputs>
   <outputs>
diff -r fa70c688cda1 -r 40e8c99829e0 lib/galaxy/datatypes/converters/interval_to_bed12_converter.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/galaxy/datatypes/converters/interval_to_bed12_converter.xml	Thu Apr 01 13:08:48 2010 -0400
@@ -0,0 +1,15 @@
+<tool id="CONVERTER_interval_to_bed12_0" name="Convert Genomic Intervals To Strict BED12">
+  <!--  <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description> -->
+  <!-- Used on the metadata edit page. -->
+  <command interpreter="python">interval_to_bedstrict_converter.py $output1 $input1 ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} ${input1.metadata.strandCol} ${input1.metadata.nameCol} ${input1.extension} 12</command>
+  <inputs>
+    <page>
+      <param format="interval" name="input1" type="data" label="Choose intervals"/>
+    </page>
+   </inputs>
+  <outputs>
+    <data format="bed12" name="output1"/>
+  </outputs>
+  <help>
+  </help>
+</tool>
diff -r fa70c688cda1 -r 40e8c99829e0 lib/galaxy/datatypes/converters/interval_to_bed6_converter.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/galaxy/datatypes/converters/interval_to_bed6_converter.xml	Thu Apr 01 13:08:48 2010 -0400
@@ -0,0 +1,15 @@
+<tool id="CONVERTER_interval_to_bed6_0" name="Convert Genomic Intervals To Strict BED6">
+  <!--  <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description> -->
+  <!-- Used on the metadata edit page. -->
+  <command interpreter="python">interval_to_bedstrict_converter.py $output1 $input1 ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} ${input1.metadata.strandCol} ${input1.metadata.nameCol} ${input1.extension} 6</command>
+  <inputs>
+    <page>
+      <param format="interval" name="input1" type="data" label="Choose intervals"/>
+    </page>
+   </inputs>
+  <outputs>
+    <data format="bed6" name="output1"/>
+  </outputs>
+  <help>
+  </help>
+</tool>
diff -r fa70c688cda1 -r 40e8c99829e0 lib/galaxy/datatypes/converters/interval_to_bedstrict_converter.py
--- a/lib/galaxy/datatypes/converters/interval_to_bedstrict_converter.py	Thu Apr 01 10:56:59 2010 -0400
+++ b/lib/galaxy/datatypes/converters/interval_to_bedstrict_converter.py	Thu Apr 01 13:08:48 2010 -0400
@@ -12,6 +12,27 @@
     sys.stderr.write( msg )
     sys.exit()
 
+def force_bed_field_count( fields, region_count, force_num_columns ):
+    if force_num_columns >= 4 and len( fields ) < 4:
+        fields.append( 'region_%i' % ( region_count ) )
+    if force_num_columns >= 5 and len( fields ) < 5:
+        fields.append( '0' )
+    if force_num_columns >= 6 and len( fields ) < 6:
+        fields.append( '+' )
+    if force_num_columns >= 7 and len( fields ) < 7:
+        fields.append( fields[1] )
+    if force_num_columns >= 8 and len( fields ) < 8:
+        fields.append( fields[2] )
+    if force_num_columns >= 9 and len( fields ) < 9:
+        fields.append( '0' )
+    if force_num_columns >= 10 and len( fields ) < 10:
+        fields.append( '0' )
+    if force_num_columns >= 11 and len( fields ) < 11:
+        fields.append( ',' )
+    if force_num_columns >= 12 and len( fields ) < 12:
+        fields.append( ',' )
+    return fields[:force_num_columns]
+
 def __main__():
     output_name = sys.argv[1]
     input_name = sys.argv[2]
@@ -39,6 +60,10 @@
         extension = sys.argv[8]
     except:
         extension = 'interval' #default extension
+    try:
+        force_num_columns = int( sys.argv[9] )
+    except:
+        force_num_columns = None
     
     skipped_lines = 0
     first_skipped_line = None
@@ -47,40 +72,52 @@
     #does file already conform to bed strict?
     #if so, we want to keep extended columns, otherwise we'll create a generic 6 column bed file
     strict_bed = True
-    if extension in [ 'bed', 'bedstrict' ] and ( chromCol, startCol, endCol, nameCol, strandCol ) == ( 0, 1, 2, 3, 5 ):
+    if extension in [ 'bed', 'bedstrict', 'bed6', 'bed12' ] and ( chromCol, startCol, endCol) == ( 0, 1, 2) and ( nameCol < 0 or nameCol == 3 ) and ( strandCol < 0 or strandCol == 5 ):
         for count, line in enumerate( open( input_name ) ):
-            line = line.strip()
+            line = line.rstrip( '\n\r' )
             if line == "" or line.startswith("#"):
                 skipped_lines += 1
                 if first_skipped_line is None:
                     first_skipped_line = count + 1
                 continue
             fields = line.split('\t')
+            assert len( fields ) >= 3, 'A BED file requires at least 3 columns' #we can't fix this
             try:
                 if len(fields) > 12:
                     strict_bed = False
                     break
-                if len(fields) > 6:
-                    int(fields[6])
-                    if len(fields) > 7:
-                        int(fields[7])
-                        if len(fields) > 8:
-                            if int(fields[8]) != 0:
-                                strict_bed = False
-                                break
-                            if len(fields) > 9:
-                                int(fields[9])
-                                if len(fields) > 10:
-                                    fields2 = fields[10].rstrip(",").split(",") #remove trailing comma and split on comma
-                                    for field in fields2: 
-                                        int(field)
-                                    if len(fields) > 11:
-                                        fields2 = fields[11].rstrip(",").split(",") #remove trailing comma and split on comma
+                #name (fields[3]) can be anything, no verification needed
+                if len( fields ) > 4:
+                    float( fields[4] ) #score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). 
+                    if len( fields ) > 5:
+                        assert fields[5] in [ '+', '-' ], 'Invalid strand' #strand - Defines the strand - either '+' or '-'. 
+                        if len( fields ) > 6:
+                            int( fields[6] ) #thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays). 
+                            if len( fields ) > 7:
+                                int( fields[7] ) #thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays). 
+                                if len( fields ) > 8:  
+                                    if fields[8] != '0': #itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser.
+                                        fields2 = fields[8].split( ',' )
+                                        assert len( fields2 ) == 3, 'RGB value must be 0 or have length of 3'
                                         for field in fields2:
-                                            int(field)
+                                            int( field ) #rgb values are integers
+                                    if len( fields ) > 9:
+                                        int( fields[9] ) #blockCount - The number of blocks (exons) in the BED line. 
+                                        if len( fields ) > 10:
+                                            if fields[10] != ',': #blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. 
+                                                fields2 = fields[10].rstrip( "," ).split( "," ) #remove trailing comma and split on comma
+                                                for field in fields2: 
+                                                    int( field )
+                                            if len( fields ) > 11:
+                                                if fields[11] != ',': #blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. 
+                                                    fields2 = fields[11].rstrip( "," ).split( "," ) #remove trailing comma and split on comma
+                                                    for field in fields2:
+                                                        int( field )
             except: 
                 strict_bed = False
                 break
+            if force_num_columns is not None and len( fields ) != force_num_columns:
+                line = '\t'.join( force_bed_field_count( fields, count, force_num_columns ) )
             out.write( "%s\n" % line )
     else:
         strict_bed = False
@@ -100,8 +137,10 @@
             except:
                 name = "region_%i" % count
             try:
-                
-                out.write( "%s\t%i\t%i\t%s\t%i\t%s\n" %  ( region.chrom, region.start, region.end, name, 0, region.strand ) )
+                fields = map( str, [ region.chrom, region.start, region.end, name, 0, region.strand ] )
+                if force_num_columns is not None and len( fields ) != force_num_columns:
+                    fields = force_bed_field_count( fields, count, force_num_columns )
+                out.write( "%s\n" % '\t'.join( fields ) )
             except:
                 skipped_lines += 1
                 if first_skipped_line is None:
diff -r fa70c688cda1 -r 40e8c99829e0 lib/galaxy/datatypes/interval.py
--- a/lib/galaxy/datatypes/interval.py	Thu Apr 01 10:56:59 2010 -0400
+++ b/lib/galaxy/datatypes/interval.py	Thu Apr 01 13:08:48 2010 -0400
@@ -536,6 +536,16 @@
     def sniff( self, filename ):
         return False #NOTE: This would require aggressively validating the entire file
 
+class Bed6( BedStrict ):
+    """Tab delimited data in strict BED format - no non-standard columns allowed; column count forced to 6"""
+
+    file_ext = "bed6"
+
+class Bed12( BedStrict ):
+    """Tab delimited data in strict BED format - no non-standard columns allowed; column count forced to 12"""
+
+    file_ext = "bed12"
+
 class _RemoteCallMixin:
     def _get_remote_call_url( self, redirect_url, site_name, dataset, type, app, base_url ):
         """Retrieve the URL to call out to an external site and retrieve data.
diff -r fa70c688cda1 -r 40e8c99829e0 tools/genetrack/genetrack_indexer.xml
--- a/tools/genetrack/genetrack_indexer.xml	Thu Apr 01 10:56:59 2010 -0400
+++ b/tools/genetrack/genetrack_indexer.xml	Thu Apr 01 13:08:48 2010 -0400
@@ -1,4 +1,4 @@
-<tool id="bed2genetrack" name="GeneTrack indexer">
+<tool id="bed2genetrack" name="GeneTrack indexer" version="1.0.1">
   
   <description>on a BED file</description>
 
@@ -8,7 +8,7 @@
     
   <inputs>
     
-    <param format="bed" name="input" type="data" help="Input data">
+    <param format="bed6" name="input" type="data" help="Input data">
       <label>Select input bed file</label>
     </param>

    

[galaxy-dev] [hg] galaxy 3591: Add bed6 and bed12 datatypes, which are subclasses of bedstrict

Greg Von Kuster