# HG changeset patch
# Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User jeremy goecks <jeremy.goecks(a)emory.edu>
# Date 1277306218 14400
# Node ID 752cb3e325374381bdd0db6b9163b12ba88b7151
# Parent 03eb69c6e92ea5472b7627bd9990327f7546a8a7
Improvements to GOPS intersect: (a) preserve metadata for interval inputs; (b) allow an
arbitrary mix of interval and GFF inputs; and (c) update the functional tests to cover the
new functionality.
--- a/tools/new_operations/intersect.xml
+++ b/tools/new_operations/intersect.xml
@@ -1,131 +1,110 @@
<tool id="gops_intersect_1"
name="Intersect"><description>the intervals of two
queries</description><command interpreter="python">gops_intersect.py
- #if $inputs.type == "Interval":
- $inputs.interval_input1 $inputs.interval_input2 $output
- -1
${inputs.interval_input1.metadata.chromCol},${inputs.interval_input1.metadata.startCol},${inputs.interval_input1.metadata.endCol},${inputs.interval_input1.metadata.strandCol}
- -2
${inputs.interval_input2.metadata.chromCol},${inputs.interval_input2.metadata.startCol},${inputs.interval_input2.metadata.endCol},${inputs.interval_input2.metadata.strandCol}
- $inputs.interval_returntype
- #else
- $inputs.gff_input1 $inputs.gff_input2 $output
- ## TODO: can we use metadata like above to set these columns rather than
hardcode them?
- -1 1,4,5,7
- -2 1,4,5,7
- --gff
- $inputs.gff_returntype
+ $input1 $input2 $output
+
+ #if isinstance( $input1.datatype,
$__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
+ -1 1,4,5,7 --gff1
+ #else:
+ -1
${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol}
#end if
- -m $min
+
+ #if isinstance( $input2.datatype,
$__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
+ -2 1,4,5,7 --gff2
+ #else:
+ -2
${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol}
+ #end if
+
+ -m $min $returntype
</command><inputs>
- <conditional name="inputs">
- <param name="type" type="select" label="File Format
to Use">
- <option value="Interval">Interval</option>
- <option value="GFF">GFF</option>
- </param>
- <when value="Interval">
- <param name="interval_returntype" type="select"
label="Return" help="(see figure below)">
- <option value="">Overlapping Intervals</option>
- <option value="-p">Overlapping pieces of
Intervals</option>
- </param>
- <param format="interval" name="interval_input1"
type="data" help="First query">
- <label>of</label>
- </param>
- <param format="interval" name="interval_input2"
type="data" help="Second query">
- <label>that intersect</label>
- </param>
- </when>
- <when value="GFF">
- <param name="gff_returntype" type="select"
label="Return" help="(see figure below)">
- <option value="">Overlapping Intervals</option>
- <option value="-p">Overlapping pieces of
Intervals</option>
- </param>
- <param format="gff" name="gff_input1"
type="data" help="First query">
- <label>of</label>
- </param>
- <param format="gff" name="gff_input2"
type="data" help="Second query">
- <label>that intersect</label>
- </param>
- </when>
- </conditional>
+ <param name="returntype" type="select"
label="Return" help="(see figure below)">
+ <option value="">Overlapping Intervals</option>
+ <option value="-p">Overlapping pieces of
Intervals</option>
+ </param>
+ <param format="interval,gff" name="input1"
type="data" help="First query">
+ <label>of</label>
+ </param>
+ <param format="interval,gff" name="input2"
type="data" help="Second query">
+ <label>that intersect</label>
+ </param><param name="min" size="4"
type="integer" value="1" help="(bp)"><label>for at
least</label></param></inputs><outputs>
- <data format="input" name="output">
- #if inputs.type == "Interval":
- metadata_source="inputs.interval_input1"
- #else:
- metadata_source="inputs.gff_input1"
- #end if
- </data>
+ <data format="input" name="output"
metadata_source="input1"/></outputs><code
file="operation_filter.py"/><tests><test><param
name="type" value="Interval"/>
- <param name="interval_input1" value="1.bed" />
- <param name="interval_input2" value="2.bed" />
+ <param name="input1" value="1.bed" />
+ <param name="input2" value="2.bed" /><param
name="min" value="1" />
- <param name="interval_returntype" value="" />
+ <param name="returntype" value="" /><output
name="output" file="gops_intersect_out.bed"
/></test><test><param name="type"
value="Interval"/>
- <param name="interval_input1" value="1.bed" />
- <param name="interval_input2" value="2_mod.bed"
ftype="interval"/>
+ <param name="input1" value="1.bed" />
+ <param name="input2" value="2_mod.bed"
ftype="interval"/><param name="min" value="1" />
- <param name="interval_returntype" value="" />
+ <param name="returntype" value="" /><output
name="output" file="gops_intersect_diffCols.bed"
/></test><test>
- <param name="type" value="Interval"/>
- <param name="interval_input1" value="1.bed" />
- <param name="interval_input2" value="2_mod.bed"
ftype="interval"/>
+ <param name="input1" value="1.bed" />
+ <param name="input2" value="2_mod.bed"
ftype="interval"/><param name="min" value="1" />
- <param name="interval_returntype" value="Overlapping pieces of
Intervals" />
+ <param name="returntype" value="Overlapping pieces of
Intervals" /><output name="output"
file="gops_intersect_p_diffCols.bed" /></test><test>
- <param name="type" value="Interval"/>
- <param name="interval_input1" value="1.bed" />
- <param name="interval_input2" value="2.bed" />
+ <param name="input1" value="1.bed" />
+ <param name="input2" value="2.bed" /><param
name="min" value="10" />
- <param name="interval_returntype" value="Overlapping pieces of
Intervals" />
+ <param name="returntype" value="Overlapping pieces of
Intervals" /><output name="output"
file="gops_intersect_p_out.bed" /></test><test>
- <param name="type" value="Interval"/>
- <param name="interval_input1" value="gops_bigint.interval"
ftype="interval" />
- <param name="interval_input2" value="gops_bigint2.interval"
ftype="interval" />
+ <param name="input1" value="gops_bigint.interval"
ftype="interval" />
+ <param name="input2" value="gops_bigint2.interval"
ftype="interval" /><param name="min" value="1" />
- <param name="interval_returntype" value="" />
+ <param name="returntype" value="" /><output
name="output" file="gops_intersect_bigint_out.interval"
/></test><test>
- <param name="type" value="Interval"/>
- <param name="interval_input1" value="gops_bigint2.interval"
ftype="interval" />
- <param name="interval_input2" value="gops_bigint.interval"
ftype="interval" />
+ <param name="input1" value="gops_bigint2.interval"
ftype="interval" />
+ <param name="input2" value="gops_bigint.interval"
ftype="interval" /><param name="min" value="1" />
- <param name="interval_returntype" value="" />
+ <param name="returntype" value="" /><output
name="output" file="gops_intersect_bigint_out.interval"
/></test><test>
- <param name="type" value="Interval"/>
- <param name="interval_input1" value="12.bed"
ftype="bed" />
- <param name="interval_input2" value="1.bed"
ftype="bed" />
+ <param name="input1" value="12.bed" ftype="bed"
/>
+ <param name="input2" value="1.bed" ftype="bed"
/><param name="min" value="1" />
- <param name="interval_returntype" value="" />
+ <param name="returntype" value="" /><output
name="output" file="gops_intersect_no_strand_out.bed"
/></test>
+ <!-- Intersect two GFF files. --><test>
- <param name="type" value="GFF"/>
- <param name="gff_input1" value="gops_subtract_in1.gff"
/>
- <param name="gff_input2" value="gops_subtract_in2.gff"
/>
+ <param name="input1" value="gops_subtract_in1.gff" />
+ <param name="input2" value="gops_subtract_in2.gff"
/><param name="min" value="1" />
- <param name="gff_returntype" value="" />
+ <param name="returntype" value="" /><output
name="output" file="gops_intersect_out2.gff" /></test>
+ <!-- Intersect GFF file and bed file. -->
+ <test>
+ <param name="input1" value="gops_subtract_in1.gff" />
+ <param name="input2" value="gops_subtract_in2.bed" />
+ <param name="min" value="1" />
+ <param name="returntype" value="" />
+ <output name="output" file="gops_intersect_out2.gff" />
+ </test>
+
</tests><help>
--- a/tools/new_operations/subtract.xml
+++ b/tools/new_operations/subtract.xml
@@ -3,7 +3,6 @@
<command interpreter="python">gops_subtract.py
$input1 $input2 $output
- ##if $input1.ext in ['gff','gtf','gff3']:
#if isinstance( $input1.datatype,
$__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
-1 1,4,5,7 --gff1
#else:
--- a/tools/new_operations/gops_intersect.py
+++ b/tools/new_operations/gops_intersect.py
@@ -1,13 +1,14 @@
#!/usr/bin/env python
"""
-Find regions of first bed file that overlap regions in a second bed file
+Find regions of first interval/GFF file that overlap regions in a second interval/GFF
file
usage: %prog bed_file_1 bed_file_2 out_file
-1, --cols1=N,N,N,N: Columns for start, end, strand in first file
-2, --cols2=N,N,N,N: Columns for start, end, strand in second file
-m, --mincols=N: Require this much overlap (default 1bp)
-p, --pieces: just print pieces of second set (after padding)
- -G, --gff: inputs are GFF format, meaning start and end coordinates are 1-based,
closed interval
+ -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based,
closed interval
+ -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based,
closed interval
"""
from galaxy import eggs
import pkg_resources
@@ -34,24 +35,29 @@ def main():
chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
if options.mincols: mincols = int( options.mincols )
pieces = bool( options.pieces )
- gff_format = bool( options.gff )
+ in1_gff_format = bool( options.gff1 )
+ in2_gff_format = bool( options.gff2 )
in_fname, in2_fname, out_fname = args
except:
doc_optparse.exception()
- # Set reader to handle either GFF or default format.
- if gff_format:
- reader_wrapper = GFFReaderWrapper
+ # Set readers to handle either GFF or default format.
+ if in1_gff_format:
+ in1_reader_wrapper = GFFReaderWrapper
else:
- reader_wrapper = NiceReaderWrapper
+ in1_reader_wrapper = NiceReaderWrapper
+ if in2_gff_format:
+ in2_reader_wrapper = GFFReaderWrapper
+ else:
+ in2_reader_wrapper = NiceReaderWrapper
- g1 = reader_wrapper( fileinput.FileInput( in_fname ),
+ g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
chrom_col=chr_col_1,
start_col=start_col_1,
end_col=end_col_1,
strand_col=strand_col_1,
fix_strand=True )
- g2 = reader_wrapper( fileinput.FileInput( in2_fname ),
+ g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
chrom_col=chr_col_2,
start_col=start_col_2,
end_col=end_col_2,
@@ -63,7 +69,7 @@ def main():
try:
for line in intersect( [g1,g2], pieces=pieces, mincols=mincols ):
if type( line ) == GenomicInterval:
- if gff_format:
+ if in1_gff_format:
line = convert_to_gff_coordinates( line )
out_file.write( "%s\n" % "\t".join( line.fields ) )
else: