galaxy-dist commit 2b8161827380: Move gff_util.py from galaxy/tools/util to galaxy/datatypes/util because gff_utils are now used by the framework as well as tools.

20 Nov 2010

# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User jeremy goecks <jeremy.goecks@emory.edu>
# Date 1289752184 18000
# Node ID 2b81618273808135efd9409f953161481944fec2
# Parent  98009db17cb2b21db626fc9c185a8fd22109a4cd
Move gff_util.py from galaxy/tools/util to galaxy/datatypes/util because gff_utils are now used by the framework as well as tools.

--- a/tools/new_operations/gops_subtract.py
+++ b/tools/new_operations/gops_subtract.py
@@ -21,7 +21,7 @@ from bx.intervals.io import *
 from bx.intervals.operations.subtract import *
 from bx.cookbook import doc_optparse
 from galaxy.tools.util.galaxyops import *
-from galaxy.tools.util.gff_util import *
+from galaxy.datatypes.util.gff_util import *
 
 assert sys.version_info[:2] >= ( 2, 4 )
 

--- a/tools/new_operations/gops_intersect.py
+++ b/tools/new_operations/gops_intersect.py
@@ -21,7 +21,7 @@ from bx.intervals.io import *
 from bx.intervals.operations.intersect import *
 from bx.cookbook import doc_optparse
 from galaxy.tools.util.galaxyops import *
-from galaxy.tools.util.gff_util import *
+from galaxy.datatypes.util.gff_util import *
 
 assert sys.version_info[:2] >= ( 2, 4 )
 

--- a/tools/extract/extract_genomic_dna.py
+++ b/tools/extract/extract_genomic_dna.py
@@ -15,7 +15,7 @@ from bx.cookbook import doc_optparse
 import bx.seq.nib
 import bx.seq.twobit
 from galaxy.tools.util.galaxyops import *
-from galaxy.tools.util.gff_util import *
+from galaxy.datatypes.util.gff_util import *
 
 assert sys.version_info[:2] >= ( 2, 4 )
     

--- a/lib/galaxy/datatypes/converters/interval_to_interval_index_converter.py
+++ b/lib/galaxy/datatypes/converters/interval_to_interval_index_converter.py
@@ -14,7 +14,7 @@ from galaxy import eggs
 import pkg_resources; pkg_resources.require( "bx-python" )
 from galaxy.visualization.tracks.summary import *
 from bx.cookbook import doc_optparse
-from galaxy.tools.util.gff_util import convert_gff_coords_to_bed
+from galaxy.datatypes.util.gff_util import convert_gff_coords_to_bed
 from bx.interval_index_file import Indexes
 
 def main():

--- a/tools/filters/gff/gff_filter_by_feature_count.py
+++ b/tools/filters/gff/gff_filter_by_feature_count.py
@@ -7,7 +7,7 @@ Usage:
 """
 import sys
 from galaxy import eggs
-from galaxy.tools.util.gff_util import parse_gff_attributes
+from galaxy.datatypes.util.gff_util import parse_gff_attributes
 
 assert sys.version_info[:2] >= ( 2, 4 )
 

--- a/tools/filters/gff_to_bed_converter.py
+++ b/tools/filters/gff_to_bed_converter.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 import sys
 from galaxy import eggs
-from galaxy.tools.util.gff_util import parse_gff_attributes
+from galaxy.datatypes.util.gff_util import parse_gff_attributes
 
 assert sys.version_info[:2] >= ( 2, 4 )
 

--- /dev/null
+++ b/lib/galaxy/datatypes/util/gff_util.py
@@ -0,0 +1,226 @@
+"""
+Provides utilities for working with GFF files.
+"""
+
+from bx.intervals.io import *
+
+class GFFInterval( GenomicInterval ):
+    """ 
+    A GFF interval, including attributes. If file is strictly a GFF file,
+    the only attribute is 'group'.
+    """
+    def __init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, default_strand, \
+                  fix_strand=False, raw_line='' ):
+        GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
+                                  default_strand, fix_strand=fix_strand )
+        self.raw_line = raw_line
+        self.attributes = parse_gff_attributes( fields[8] )
+                
+class GFFFeature( GenomicInterval ):
+    """
+    A GFF feature, which can include multiple intervals.
+    """
+    def __init__( self, reader, chrom_col, start_col, end_col, strand_col, default_strand, \
+                  fix_strand=False, intervals=[] ):
+        GenomicInterval.__init__( self, reader, intervals[0].fields, chrom_col, start_col, end_col, \
+                                  strand_col, default_strand, fix_strand=fix_strand )
+        self.intervals = intervals
+        # Use intervals to set feature attributes.
+        for interval in self.intervals:
+            # Error checking.
+            if interval.chrom != self.chrom:
+                raise ValueError( "interval chrom does not match self chrom: %i != %i" % \
+                                  ( interval.chrom, self.chrom ) )
+            if interval.strand != self.strand:
+                raise ValueError( "interval strand does not match self strand: %s != %s" % \
+                                  ( interval.strand, self.strand ) )
+            # Set start, end of interval.
+            if interval.start < self.start:
+                self.start = interval.start
+            if interval.end > self.end:
+                self.end = interval.end
+                
+class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ):
+    """ 
+    Reader wrapper that reads GFF intervals/lines and automatically converts
+    them to BED format. 
+    """
+    
+    def parse_row( self, line ):
+        # HACK: this should return a GFF interval, but bx-python operations 
+        # require GenomicInterval objects and subclasses will not work.
+        interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
+                                    self.end_col, self.strand_col, self.default_strand, \
+                                    fix_strand=self.fix_strand )
+        interval = convert_gff_coords_to_bed( interval )
+        return interval
+
+class GFFReaderWrapper( NiceReaderWrapper ):
+    """
+    Reader wrapper for GFF files.
+    
+    Wrapper has two major functions:
+    (1) group entries for GFF file (via group column), GFF3 (via id attribute ), 
+        or GTF (via gene_id/transcript id);
+    (2) convert coordinates from GFF format--starting and ending coordinates 
+        are 1-based, closed--to the 'traditional'/BED interval format--0 based, 
+        half-open. This is useful when using GFF files as inputs to tools that 
+        expect traditional interval format.
+    """
+    
+    def __init__( self, reader, **kwargs ):
+        """
+        Create wrapper. Defaults are group_entries=False and 
+        convert_coords_to_bed=True to support backward compatibility.
+        """
+        NiceReaderWrapper.__init__( self, reader, **kwargs )
+        self.group_entries = kwargs.get( 'group_entries', False )
+        self.convert_coords_to_bed = kwargs.get( 'convert_coords_to_bed', True )
+        self.last_line = None
+        self.cur_offset = 0
+        self.seed_interval = None
+    
+    def parse_row( self, line ):
+        interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
+                                self.end_col, self.strand_col, self.default_strand, \
+                                fix_strand=self.fix_strand, raw_line=line )
+        if self.convert_coords_to_bed:
+            interval = convert_gff_coords_to_bed( interval )
+        return interval
+        
+    def next( self ):
+        """ Returns next GFFFeature. """
+        
+        #
+        # Helper function.
+        #
+        
+        def handle_parse_error( parse_error ):
+            """ Actions to take when ParseError found. """
+            if self.outstream:
+               if self.print_delegate and hasattr(self.print_delegate,"__call__"):
+                   self.print_delegate( self.outstream, e, self )
+            self.skipped += 1
+            # no reason to stuff an entire bad file into memory
+            if self.skipped < 10:
+               self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
+               
+        #
+        # Get next GFFFeature
+        # 
+
+        # If there is no seed interval, set one. Also, if there are no more 
+        # intervals to read, this is where iterator dies.
+        if not self.seed_interval:
+            while not self.seed_interval:
+                try:
+                    self.seed_interval = GenomicIntervalReader.next( self )
+                except ParseError, e:
+                    handle_parse_error( e )
+    
+        # Initialize feature name from seed.
+        feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
+        feature_id = self.seed_interval.attributes.get( 'id', None ) # For GFF3
+        feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None ) # For GTF
+        feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) # For GTF
+
+        # Read all intervals associated with seed.
+        feature_intervals = []
+        feature_intervals.append( self.seed_interval )
+        while True:
+            try:
+                interval = GenomicIntervalReader.next( self )
+            except StopIteration, e:
+                # No more intervals to read, but last feature needs to be 
+                # returned.
+                interval = None
+                break
+            except ParseError, e:
+                handle_parse_error( e )
+            
+            # If interval not associated with feature, break.
+            group = interval.attributes.get( 'group', None )
+            if group and feature_group != group:
+                break
+            id = interval.attributes.get( 'id', None )
+            if id and feature_id != id:
+                break
+            gene_id = interval.attributes.get( 'gene_id', None )
+            transcript_id = interval.attributes.get( 'transcript_id', None )
+            if transcript_id and transcript_id != feature_transcript_id and gene_id and \
+               gene_id != feature_gene_id:
+                break
+    
+            # Interval associated with feature.
+            feature_intervals.append( interval )
+   
+        # Last interval read is the seed for the next interval.
+        self.seed_interval = interval
+    
+        # Return GFF feature with all intervals.    
+        return GFFFeature( self, self.chrom_col, self.start_col, self.end_col, self.strand_col, \
+                           self.default_strand, fix_strand=self.fix_strand, \
+                           intervals=feature_intervals )
+        
+
+def convert_bed_coords_to_gff( interval ):
+    """
+    Converts an interval object's coordinates from BED format to GFF format. 
+    Accepted object types include GenomicInterval and list (where the first 
+    element in the list is the interval's start, and the second element is 
+    the interval's end).
+    """
+    if type( interval ) is GenomicInterval:
+        interval.start += 1
+    elif type ( interval ) is list:
+        interval[ 0 ] += 1
+    return interval
+    
+def convert_gff_coords_to_bed( interval ):
+    """
+    Converts an interval object's coordinates from GFF format to BED format. 
+    Accepted object types include GenomicInterval and list (where the first
+    element in the list is the interval's start, and the second element is 
+    the interval's end).
+    """
+    if type( interval ) is GenomicInterval:
+        interval.start -= 1
+    elif type ( interval ) is list:
+        interval[ 0 ] -= 1
+    return interval
+    
+def parse_gff_attributes( attr_str ):
+    """
+    Parses a GFF/GTF attribute string and returns a dictionary of name-value 
+    pairs. The general format for a GFF3 attributes string is 
+        name1=value1;name2=value2
+    The general format for a GTF attribute string is 
+        name1 "value1" ; name2 "value2"
+    The general format for a GFF attribute string is a single string that
+    denotes the interval's group; in this case, method returns a dictionary 
+    with a single key-value pair, and key name is 'group'
+    """    
+    attributes_list = attr_str.split(";")
+    attributes = {}
+    for name_value_pair in attributes_list:
+        # Try splitting by space and, if necessary, by '=' sign.
+        pair = name_value_pair.strip().split(" ")
+        if len( pair ) == 1:
+            pair = name_value_pair.strip().split("=")
+        if len( pair ) == 1:
+            # Could not split for some reason -- raise exception?
+            continue
+        if pair == '':
+            continue
+        name = pair[0].strip()
+        if name == '':
+            continue
+        # Need to strip double quote from values
+        value = pair[1].strip(" \"")
+        attributes[ name ] = value
+        
+    if len( attributes ) == 0:
+        # Could not split attributes string, so entire string must be 
+        # 'group' attribute. This is the case for strictly GFF files.
+        attributes['group'] = attr_str
+    return attributes

--- a/lib/galaxy/datatypes/converters/gff_to_interval_index_converter.py
+++ b/lib/galaxy/datatypes/converters/gff_to_interval_index_converter.py
@@ -12,7 +12,7 @@ from __future__ import division
 import sys, fileinput
 from galaxy import eggs
 import pkg_resources; pkg_resources.require( "bx-python" )
-from galaxy.tools.util.gff_util import *
+from galaxy.datatypes.util.gff_util import *
 from bx.interval_index_file import Indexes
 
 def main():

--- a/lib/galaxy/datatypes/converters/interval_to_summary_tree_converter.py
+++ b/lib/galaxy/datatypes/converters/interval_to_summary_tree_converter.py
@@ -14,7 +14,7 @@ import pkg_resources; pkg_resources.requ
 from galaxy.visualization.tracks.summary import *
 from bx.intervals.io import *
 from bx.cookbook import doc_optparse
-from galaxy.tools.util.gff_util import *
+from galaxy.datatypes.util.gff_util import *
 
 def main():
     # Read options, args.

--- a/tools/new_operations/flanking_features.py
+++ b/tools/new_operations/flanking_features.py
@@ -18,7 +18,7 @@ from bx.cookbook import doc_optparse
 from galaxy.tools.util.galaxyops import *
 from bx.intervals.io import *
 from bx.intervals.operations import quicksect
-from galaxy.tools.util.gff_util import *
+from galaxy.datatypes.util.gff_util import *
 
 assert sys.version_info[:2] >= ( 2, 4 )
 

--- a/lib/galaxy/tools/util/gff_util.py
+++ /dev/null
@@ -1,226 +0,0 @@
-"""
-Provides utilities for working with GFF files.
-"""
-
-from bx.intervals.io import *
-
-class GFFInterval( GenomicInterval ):
-    """ 
-    A GFF interval, including attributes. If file is strictly a GFF file,
-    only attribute is 'group.'
-    """
-    def __init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, default_strand, \
-                  fix_strand=False, raw_line='' ):
-        GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
-                                  default_strand, fix_strand=fix_strand )
-        self.raw_line = raw_line
-        self.attributes = parse_gff_attributes( fields[8] )
-                
-class GFFFeature( GenomicInterval ):
-    """
-    A GFF feature, which can include multiple intervals.
-    """
-    def __init__( self, reader, chrom_col, start_col, end_col, strand_col, default_strand, \
-                  fix_strand=False, intervals=[] ):
-        GenomicInterval.__init__( self, reader, intervals[0].fields, chrom_col, start_col, end_col, \
-                                  strand_col, default_strand, fix_strand=fix_strand )
-        self.intervals = intervals
-        # Use intervals to set feature attributes.
-        for interval in self.intervals:
-            # Error checking.
-            if interval.chrom != self.chrom:
-                raise ValueError( "interval chrom does not match self chrom: %i != %i" % \
-                                  ( interval.chrom, self.chrom ) )
-            if interval.strand != self.strand:
-                raise ValueError( "interval strand does not match self strand: %s != %s" % \
-                                  ( interval.strand, self.strand ) )
-            # Set start, end of interval.
-            if interval.start < self.start:
-                self.start = interval.start
-            if interval.end > self.end:
-                self.end = interval.end
-                
-class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ):
-    """ 
-    Reader wrapper that reads GFF intervals/lines and automatically converts
-    them to BED format. 
-    """
-    
-    def parse_row( self, line ):
-        # HACK: this should return a GFF interval, but bx-python operations 
-        # require GenomicInterval objects and subclasses will not work.
-        interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
-                                    self.end_col, self.strand_col, self.default_strand, \
-                                    fix_strand=self.fix_strand )
-        interval = convert_gff_coords_to_bed( interval )
-        return interval
-
-class GFFReaderWrapper( NiceReaderWrapper ):
-    """
-    Reader wrapper for GFF files.
-    
-    Wrapper has two major functions:
-    (1) group entries for GFF file (via group column), GFF3 (via id attribute ), 
-        or GTF (via gene_id/transcript id);
-    (2) convert coordinates from GFF format--starting and ending coordinates 
-        are 1-based, closed--to the 'traditional'/BED interval format--0 based, 
-        half-open. This is useful when using GFF files as inputs to tools that 
-        expect traditional interval format.
-    """
-    
-    def __init__( self, reader, **kwargs ):
-        """
-        Create wrapper. Defaults are group_entries=False and 
-        convert_coords_to_bed=True to support backward compatibility.
-        """
-        NiceReaderWrapper.__init__( self, reader, **kwargs )
-        self.group_entries = kwargs.get( 'group_entries', False )
-        self.convert_coords_to_bed = kwargs.get( 'convert_coords_to_bed', True )
-        self.last_line = None
-        self.cur_offset = 0
-        self.seed_interval = None
-    
-    def parse_row( self, line ):
-        interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
-                                self.end_col, self.strand_col, self.default_strand, \
-                                fix_strand=self.fix_strand, raw_line=line )
-        if self.convert_coords_to_bed:
-            interval = convert_gff_coords_to_bed( interval )
-        return interval
-        
-    def next( self ):
-        """ Returns next GFFFeature. """
-        
-        #
-        # Helper function.
-        #
-        
-        def handle_parse_error( parse_error ):
-            """ Actions to take when ParseError found. """
-            if self.outstream:
-               if self.print_delegate and hasattr(self.print_delegate,"__call__"):
-                   self.print_delegate( self.outstream, e, self )
-            self.skipped += 1
-            # no reason to stuff an entire bad file into memmory
-            if self.skipped < 10:
-               self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
-               
-        #
-        # Get next GFFFeature
-        # 
-
-        # If there is no seed interval, set one. Also, if there are no more 
-        # intervals to read, this is where iterator dies.
-        if not self.seed_interval:
-            while not self.seed_interval:
-                try:
-                    self.seed_interval = GenomicIntervalReader.next( self )
-                except ParseError, e:
-                    handle_parse_error( e )
-    
-        # Initialize feature name from seed.
-        feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
-        feature_id = self.seed_interval.attributes.get( 'id', None ) # For GFF3
-        feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None ) # For GTF
-        feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) # For GTF
-
-        # Read all intervals associated with seed.
-        feature_intervals = []
-        feature_intervals.append( self.seed_interval )
-        while True:
-            try:
-                interval = GenomicIntervalReader.next( self )
-            except StopIteration, e:
-                # No more intervals to read, but last feature needs to be 
-                # returned.
-                interval = None
-                break
-            except ParseError, e:
-                handle_parse_error( e )
-            
-            # If interval not associated with feature, break.
-            group = interval.attributes.get( 'group', None )
-            if group and feature_group != group:
-                break
-            id = interval.attributes.get( 'id', None )
-            if id and feature_id != id:
-                break
-            gene_id = interval.attributes.get( 'gene_id', None )
-            transcript_id = interval.attributes.get( 'transcript_id', None )
-            if transcript_id and transcript_id != feature_transcript_id and gene_id and \
-               gene_id != feature_gene_id:
-                break
-    
-            # Interval associated with feature.
-            feature_intervals.append( interval )
-   
-        # Last interval read is the seed for the next interval.
-        self.seed_interval = interval
-    
-        # Return GFF feature with all intervals.    
-        return GFFFeature( self, self.chrom_col, self.start_col, self.end_col, self.strand_col, \
-                           self.default_strand, fix_strand=self.fix_strand, \
-                           intervals=feature_intervals )
-        
-
-def convert_bed_coords_to_gff( interval ):
-    """
-    Converts an interval object's coordinates from BED format to GFF format. 
-    Accepted object types include GenomicInterval and list (where the first 
-    element in the list is the interval's start, and the second element is 
-    the interval's end).
-    """
-    if type( interval ) is GenomicInterval:
-        interval.start += 1
-    elif type ( interval ) is list:
-        interval[ 0 ] += 1
-    return interval
-    
-def convert_gff_coords_to_bed( interval ):
-    """
-    Converts an interval object's coordinates from GFF format to BED format. 
-    Accepted object types include GenomicInterval and list (where the first
-    element in the list is the interval's start, and the second element is 
-    the interval's end).
-    """
-    if type( interval ) is GenomicInterval:
-        interval.start -= 1
-    elif type ( interval ) is list:
-        interval[ 0 ] -= 1
-    return interval
-    
-def parse_gff_attributes( attr_str ):
-    """
-    Parses a GFF/GTF attribute string and returns a dictionary of name-value 
-    pairs. The general format for a GFF3 attributes string is 
-        name1=value1;name2=value2
-    The general format for a GTF attribute string is 
-        name1 "value1" ; name2 "value2"
-    The general format for a GFF attribute string is a single string that
-    denotes the interval's group; in this case, method returns a dictionary 
-    with a single key-value pair, and key name is 'group'
-    """    
-    attributes_list = attr_str.split(";")
-    attributes = {}
-    for name_value_pair in attributes_list:
-        # Try splitting by space and, if necessary, by '=' sign.
-        pair = name_value_pair.strip().split(" ")
-        if len( pair ) == 1:
-            pair = name_value_pair.strip().split("=")
-        if len( pair ) == 1:
-            # Could not split for some reason -- raise exception?
-            continue
-        if pair == '':
-            continue
-        name = pair[0].strip()
-        if name == '':
-            continue
-        # Need to strip double quote from values
-        value = pair[1].strip(" \"")
-        attributes[ name ] = value
-        
-    if len( attributes ) == 0:
-        # Could not split attributes string, so entire string must be 
-        # 'group' attribute. This is the case for strictly GFF files.
-        attributes['group'] = attr_str
-    return attributes

    

commits-noreply＠bitbucket.org

tags

participants (1)