[galaxy-commits] commit/galaxy-central: jgoecks: Extend extract_genomic_dna tool to support GFF/GTF features and custom genomes. For GFF/GTF files, tool provides option to return sequence data for each feature's intervals, which are split across lines in GFF/GTF files. Tool also now accepts sequence data from a history item, enabling the extraction of data from custom genomes. New functional tests added to cover new functionality.

10 Feb 2011

1 new changeset in galaxy-central:

http://bitbucket.org/galaxy/galaxy-central/changeset/c2d187cf4428/
changeset:   r5042:c2d187cf4428
user:        jgoecks
date:        2011-02-10 23:19:40
summary:     Extend extract_genomic_dna tool to support GFF/GTF features and custom genomes. For GFF/GTF files, tool provides option to return sequence data for each feature's intervals, which are split across lines in GFF/GTF files. Tool also now accepts sequence data from a history item, enabling the extraction of data from custom genomes. New functional tests added to cover new functionality.
affected #:  5 files (5.5 KB)

--- a/lib/galaxy/datatypes/util/gff_util.py	Thu Feb 10 15:53:06 2011 -0500
+++ b/lib/galaxy/datatypes/util/gff_util.py	Thu Feb 10 17:19:40 2011 -0500
@@ -257,4 +257,18 @@
         # Could not split attributes string, so entire string must be 
         # 'group' attribute. This is the case for strictly GFF files.
         attributes['group'] = attr_str
-    return attributes
\ No newline at end of file
+    return attributes
+    
+def gff_attributes_to_str( attrs, gff_format ):
+    """
+    Convert GFF attributes to string. Supported formats are GFF3, GTF.
+
+    attrs is a dictionary mapping attribute names to values. gff_format
+    selects how each pair is rendered: 'GTF' produces name "value" and
+    'GFF3' produces name=value. Rendered pairs are joined with " ; ".
+
+    Raises ValueError if gff_format is not one of the supported formats,
+    rather than failing later on an unbound format string.
+    """
+    if gff_format == 'GTF':
+        format_string = '%s "%s"'
+    elif gff_format == 'GFF3':
+        format_string = '%s=%s'
+    else:
+        # Fail early and explicitly; previously an unknown format left
+        # format_string undefined.
+        raise ValueError( "Unsupported GFF format: '%s'" % gff_format )
+    attrs_strs = [ format_string % ( name, value ) for name, value in attrs.items() ]
+    return " ; ".join( attrs_strs )
+    
\ No newline at end of file


--- a/tools/extract/extract_genomic_dna.py	Thu Feb 10 15:53:06 2011 -0500
+++ b/tools/extract/extract_genomic_dna.py	Thu Feb 10 17:19:40 2011 -0500
@@ -5,17 +5,19 @@
     -d, --dbkey=N: Genome build of input file
     -o, --output_format=N: the data type of the output file
     -g, --GALAXY_DATA_INDEX_DIR=N: the directory containing alignseq.loc
+    -I, --interpret_features: if true, complete features are interpreted when input is GFF 
+    -F, --fasta=<genomic_sequences>: genomic sequences to use for extraction
     -G, --gff: input and output file, when it is interval, coordinates are treated as GFF format (1-based, closed) rather than 'traditional' 0-based, half-open format.
 """
 from galaxy import eggs
 import pkg_resources
 pkg_resources.require( "bx-python" )
-import sys, string, os, re
+import sys, string, os, re, tempfile, subprocess
 from bx.cookbook import doc_optparse
 import bx.seq.nib
 import bx.seq.twobit
 from galaxy.tools.util.galaxyops import *
-from galaxy.datatypes.util.gff_util import *
+from galaxy.datatypes.util import gff_util
 
 assert sys.version_info[:2] >= ( 2, 4 )
     
@@ -44,16 +46,20 @@
                 seq_path = fields[2].strip()
                 break
     return seq_path
-
         
 def __main__():
+    #
+    # Parse options, args.
+    #
     options, args = doc_optparse.parse( __doc__ )
     try:
         chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
         dbkey = options.dbkey
         output_format = options.output_format
         gff_format = options.gff
+        interpret_features = options.interpret_features
         GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
+        fasta_file = options.fasta
         input_filename, output_filename = args
     except:
         doc_optparse.exception()
@@ -62,11 +68,49 @@
     strand = None
     nibs = {}
     twobits = {}
-    seq_path = check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR )
-    if not os.path.exists( seq_path ):
-        # If this occurs, we need to fix the metadata validator.
-        stop_err( "No sequences are available for '%s', request them by reporting this error." % dbkey )
+        
+    #
+    # Set path to sequence data.
+    #
+    if fasta_file:
+        # Need to create 2bit file from fasta file.
+        try:
+            seq_path = tempfile.NamedTemporaryFile( dir="." ).name
+            cmd = "faToTwoBit %s %s" % ( fasta_file, seq_path )
+        
+            tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
+            tmp_stderr = open( tmp_name, 'wb' )
+            proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
+            returncode = proc.wait()
+            tmp_stderr.close()
 
+            # Get stderr, allowing for case where it's very large.
+            tmp_stderr = open( tmp_name, 'rb' )
+            stderr = ''
+            buffsize = 1048576
+            try:
+                while True:
+                    stderr += tmp_stderr.read( buffsize )
+                    if not stderr or len( stderr ) % buffsize != 0:
+                        break
+            except OverflowError:
+                pass
+            tmp_stderr.close()
+
+            # Error checking.
+            if returncode != 0:
+                raise Exception, stderr
+        except Exception, e:
+            stop_err( 'Error running faToTwoBit. ' + str( e ) )
+    else:
+        seq_path = check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR )
+        if not os.path.exists( seq_path ):
+            # If this occurs, we need to fix the metadata validator.
+            stop_err( "No sequences are available for '%s', request them by reporting this error." % dbkey )
+    
+    #
+    # Fetch sequences.
+    #
     skipped_lines = 0
     first_invalid_line = 0
     invalid_line = ''
@@ -74,106 +118,134 @@
     warnings = []
     warning = ''
     twobitfile = None
-     
-    for i, line in enumerate( open( input_filename ) ):
-        line = line.rstrip( '\r\n' )
-        if line and not line.startswith( "#" ):
-            fields = line.split( '\t' )
+    file_iterator = open( input_filename )
+    if gff_format and interpret_features:
+        file_iterator = gff_util.GFFReaderWrapper( file_iterator, fix_strand=True )
+    for i, feature in enumerate( file_iterator ):
+        if gff_format and interpret_features:
+            # Processing features.
+            gff_util.convert_gff_coords_to_bed( feature )
+            chrom = feature.chrom
+            start = feature.start
+            end = feature.end
+            strand = feature.strand
+        else:
+            # Processing lines, either interval or GFF format.
+            line = feature.rstrip( '\r\n' )
+            if line and not line.startswith( "#" ):
+                fields = line.split( '\t' )
+                try:
+                    chrom = fields[chrom_col]
+                    start = int( fields[start_col] )
+                    end = int( fields[end_col] )
+                    if gff_format:
+                        start, end = gff_util.convert_gff_coords_to_bed( [start, end] )
+                    if includes_strand_col:
+                        strand = fields[strand_col]
+                except:
+                    warning = "Invalid chrom, start or end column values. "
+                    warnings.append( warning )
+                    skipped_lines += 1
+                    if not invalid_line:
+                        first_invalid_line = i + 1
+                        invalid_line = line
+                    continue
+                if start > end:
+                    warning = "Invalid interval, start '%d' > end '%d'.  " % ( start, end )
+                    warnings.append( warning )
+                    skipped_lines += 1
+                    if not invalid_line:
+                        first_invalid_line = i + 1
+                        invalid_line = line
+                    continue
+
+                if strand not in ['+', '-']:
+                    strand = '+'
+                sequence = ''
+
+        # Open sequence file and get sequence for feature/interval. 
+        if seq_path and os.path.exists( "%s/%s.nib" % ( seq_path, chrom ) ):
+            # TODO: improve support for GFF-nib interaction.
+            if chrom in nibs:
+                nib = nibs[chrom]
+            else:
+                nibs[chrom] = nib = bx.seq.nib.NibFile( file( "%s/%s.nib" % ( seq_path, chrom ) ) )
             try:
-                chrom = fields[chrom_col]
-                start = int( fields[start_col] )
-                end = int( fields[end_col] )
-                if gff_format:
-                    start, end = convert_gff_coords_to_bed( [start, end] )
-                if includes_strand_col:
-                    strand = fields[strand_col]
+                sequence = nib.get( start, end-start )
             except:
-                warning = "Invalid chrom, start or end column values. "
+                warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " %( start, end-start, dbkey )
                 warnings.append( warning )
                 skipped_lines += 1
                 if not invalid_line:
                     first_invalid_line = i + 1
                     invalid_line = line
                 continue
-            if start > end:
-                warning = "Invalid interval, start '%d' > end '%d'.  " % ( start, end )
+        elif seq_path and os.path.isfile( seq_path ):
+            if not(twobitfile):
+                twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) )
+            try:
+                if options.gff and interpret_features:
+                    # Create sequence from intervals within a feature.
+                    sequence = ''
+                    for interval in feature.intervals:
+                        sequence += twobitfile[interval.chrom][interval.start:interval.end]
+                else:
+                    sequence = twobitfile[chrom][start:end]
+            except:
+                warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " %( start, end-start, dbkey )
                 warnings.append( warning )
                 skipped_lines += 1
                 if not invalid_line:
                     first_invalid_line = i + 1
                     invalid_line = line
                 continue
+        else:
+            warning = "Chromosome by name '%s' was not found for build '%s'. " % ( chrom, dbkey )
+            warnings.append( warning )
+            skipped_lines += 1
+            if not invalid_line:
+                first_invalid_line = i + 1
+                invalid_line = line
+            continue
+        if sequence == '':
+            warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " \
+                        % ( chrom, start, end, dbkey )
+            warnings.append( warning )
+            skipped_lines += 1
+            if not invalid_line:
+                first_invalid_line = i + 1
+                invalid_line = line
+            continue
+        if includes_strand_col and strand == "-":
+            sequence = reverse_complement( sequence )
 
-            if strand not in ['+', '-']:
-                strand = '+'
-            sequence = ''
-
-            if seq_path and os.path.exists( "%s/%s.nib" % ( seq_path, chrom ) ):
-                if chrom in nibs:
-                    nib = nibs[chrom]
-                else:
-                    nibs[chrom] = nib = bx.seq.nib.NibFile( file( "%s/%s.nib" % ( seq_path, chrom ) ) )
-                try:
-                    sequence = nib.get( start, end-start )
-                except:
-                    warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " %( start, end-start, dbkey )
-                    warnings.append( warning )
-                    skipped_lines += 1
-                    if not invalid_line:
-                        first_invalid_line = i + 1
-                        invalid_line = line
-                    continue
-            elif seq_path and os.path.isfile( seq_path ):
-                if not(twobitfile):
-                    twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) )
-                try:
-                    sequence = twobitfile[chrom][start:end]
-                except:
-                    warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " %( start, end-start, dbkey )
-                    warnings.append( warning )
-                    skipped_lines += 1
-                    if not invalid_line:
-                        first_invalid_line = i + 1
-                        invalid_line = line
-                    continue
+        if output_format == "fasta" :
+            l = len( sequence )        
+            c = 0
+            if gff_format:
+                start, end = gff_util.convert_bed_coords_to_gff( [ start, end ] )
+            fields = [dbkey, str( chrom ), str( start ), str( end ), strand]
+            meta_data = "_".join( fields )
+            fout.write( ">%s\n" % meta_data )
+            while c < l:
+                b = min( c + 50, l )
+                fout.write( "%s\n" % str( sequence[c:b] ) )
+                c = b
+        else: # output_format == "interval"
+            if interpret_features:
+                # TODO: need better GFF Reader to capture all information needed
+                # to produce this line.
+                meta_data = "\t".join( 
+                                [feature.chrom, "galaxy_extract_genomic_dna", "interval", \
+                                 str( feature.start ), str( feature.end ), feature.score, feature.strand,
+                                 ".", gff_util.gff_attributes_to_str( feature.attributes, "GTF" ) ] )
             else:
-                warning = "Chromosome by name '%s' was not found for build '%s'. " % ( chrom, dbkey )
-                warnings.append( warning )
-                skipped_lines += 1
-                if not invalid_line:
-                    first_invalid_line = i + 1
-                    invalid_line = line
-                continue
-            if sequence == '':
-                warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " %( chrom, start, end, dbkey )
-                warnings.append( warning )
-                skipped_lines += 1
-                if not invalid_line:
-                    first_invalid_line = i + 1
-                    invalid_line = line
-                continue
-            if includes_strand_col and strand == "-":
-                sequence = reverse_complement( sequence )
-
-            if output_format == "fasta" :
-                l = len( sequence )        
-                c = 0
-                if gff_format:
-                    start, end = convert_bed_coords_to_gff( [ start, end ] )
-                fields = [dbkey, str( chrom ), str( start ), str( end ), strand]
-                meta_data = "_".join( fields )
-                fout.write( ">%s\n" % meta_data )
-                while c < l:
-                    b = min( c + 50, l )
-                    fout.write( "%s\n" % str( sequence[c:b] ) )
-                    c = b
-            else: # output_format == "interval"
                 meta_data = "\t".join( fields )
-                if gff_format:
-                    format_str = "%s seq \"%s\";\n"
-                else:
-                    format_str = "%s\t%s\n"
-                fout.write( format_str % ( meta_data, str( sequence ) ) )
+            if gff_format:
+                format_str = "%s seq \"%s\";\n"
+            else:
+                format_str = "%s\t%s\n"
+            fout.write( format_str % ( meta_data, str( sequence ) ) )
 
     fout.close()
 


--- a/tools/extract/extract_genomic_dna.xml	Thu Feb 10 15:53:06 2011 -0500
+++ b/tools/extract/extract_genomic_dna.xml	Thu Feb 10 17:19:40 2011 -0500
@@ -1,57 +1,112 @@
 <tool id="Extract genomic DNA 1" name="Extract Genomic DNA" version="2.2.1"><description>using coordinates from assembled/unassembled genomes</description><command interpreter="python">
-      extract_genomic_dna.py $input $out_file1 -d $dbkey -o $out_format -g ${GALAXY_DATA_INDEX_DIR}
+      extract_genomic_dna.py $input $out_file1 -o $out_format -d $dbkey 
+      
+      #if str( $interpret_features ) == "yes":
+        -I
+      #end if
+      
+      ## Columns to use in input file.
       #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
         -1 1,4,5,7 --gff
       #else:
         -1 ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol}
       #end if
+            
+      #if $seq_source.index_source == "cached":
+        ## Genomic data from cache.
+        -g ${GALAXY_DATA_INDEX_DIR}
+      #else:
+        ## Genomic data from history.
+        -F $seq_source.ref_file
+      #end if
   </command><inputs>
-    <param format="interval,gff" name="input" type="data" label="Fetch sequences corresponding to Query">
-        <validator type="unspecified_build" />
-        <validator type="dataset_metadata_in_file" filename="alignseq.loc" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." line_startswith="seq" />
-    </param>
-	<param name="out_format" type="select" label="Output data type">
-    	<option value="fasta">FASTA</option>
-    	<option value="interval">Interval</option>
-	</param>
+      <param format="interval,gff" name="input" type="data" label="Fetch sequences for intervals in">
+          <validator type="unspecified_build" />
+      </param>
+      <param name="interpret_features" type="select" label="Interpret features when possible" help="Only meaningful for GFF, GTF datasets.">
+          <option value="yes">Yes</option>
+          <option value="no">No</option>
+      </param>
+      <conditional name="seq_source">
+          <param name="index_source" type="select" label="Source for Genomic Data">
+              <option value="cached">Locally cached</option>
+              <option value="history">History</option>
+          </param>
+          <when value="cached">
+          </when>
+          <when value="history">
+              <param name="ref_file" type="data" format="fasta" label="Using reference file" />
+          </when>
+      </conditional>
+	  <param name="out_format" type="select" label="Output data type">
+    	  <option value="fasta">FASTA</option>
+    	  <option value="interval">Interval</option>
+	  </param></inputs><outputs>
-    <data format="input" name="out_file1" metadata_source="input">
-      <change_format>
-        <when input="out_format" value="fasta" format="fasta" />
-      </change_format>
-    </data>
+      <data format="input" name="out_file1" metadata_source="input">
+          <change_format>
+              <when input="out_format" value="fasta" format="fasta" />
+          </change_format>
+      </data></outputs><tests><test><param name="input" value="1.bed" dbkey="hg17" ftype="bed" />
+      <param name="interpret_features" value="yes"/>
+      <param name="index_source" value="cached"/><param name="out_format" value="fasta"/><output name="out_file1" file="extract_genomic_dna_out1.fasta" /></test><test><param name="input" value="droPer1.bed" dbkey="droPer1" ftype="bed" />
+      <param name="interpret_features" value="yes"/>
+      <param name="index_source" value="cached"/><param name="out_format" value="fasta"/><output name="out_file1" file="extract_genomic_dna_out2.fasta" /></test><test><param name="input" value="1.bed" dbkey="hg17" ftype="bed" />
+      <param name="interpret_features" value="no"/>
+      <param name="index_source" value="cached"/><param name="out_format" value="interval"/><output name="out_file1" file="extract_genomic_dna_out3.interval" /></test><!-- Test GFF file support. --><test><param name="input" value="gff_filter_by_attribute_out1.gff" dbkey="mm9" ftype="gff" />
+      <param name="interpret_features" value="no"/>
+      <param name="index_source" value="cached"/><param name="out_format" value="interval"/><output name="out_file1" file="extract_genomic_dna_out4.gff" /></test><test><param name="input" value="gff_filter_by_attribute_out1.gff" dbkey="mm9" ftype="gff" />
+      <param name="interpret_features" value="no"/><param name="out_format" value="fasta"/>
+      <param name="index_source" value="cached"/><output name="out_file1" file="extract_genomic_dna_out5.fasta" /></test>
+    <!-- Test custom sequences support and GFF feature interpretation. -->
+    <test>
+      <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
+      <param name="interpret_features" value="no"/>
+      <param name="index_source" value="history"/>
+      <param name="ref_file" value="tophat_in1.fasta"/>
+      <param name="out_format" value="fasta"/>
+      <output name="out_file1" file="extract_genomic_dna_out6.fasta" />
+    </test>
+    <test>
+      <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
+      <param name="interpret_features" value="yes"/>
+      <param name="index_source" value="history"/>
+      <param name="ref_file" value="tophat_in1.fasta"/>
+      <param name="out_format" value="fasta"/>
+      <output name="out_file1" file="extract_genomic_dna_out7.fasta" />
+    </test></tests><help>

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving
this because the commit notification service is enabled for the
recipient address of this email.

    

Bitbucket