# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User jeremy goecks <jeremy.goecks(a)emory.edu>
# Date 1278618202 14400
# Node ID 2d046444998edb1c4a5126897990bddd25de69f5
# Parent 41089d12cdd9eaaeb8e75e09157d412ccf935eae
Enable the 'extract genomic DNA' tool to accept and produce GFF files, and add
functional tests for this feature.
--- a/tools/extract/extract_genomic_dna.xml
+++ b/tools/extract/extract_genomic_dna.xml
@@ -1,20 +1,27 @@
<tool id="Extract genomic DNA 1" name="Extract Genomic DNA"
version="2.2.1"><description>using coordinates from
assembled/unassembled genomes</description>
- <command interpreter="python">extract_genomic_dna.py $input $out_file1
-1
${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol}
-d $dbkey -o $out_format -g ${GALAXY_DATA_INDEX_DIR}</command>
+ <command interpreter="python">
+ extract_genomic_dna.py $input $out_file1 -d $dbkey -o $out_format -g
${GALAXY_DATA_INDEX_DIR}
+ #if isinstance( $input.datatype,
$__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
+ -1 1,4,5,7 --gff
+ #else:
+ -1
${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol}
+ #end if
+ </command><inputs>
- <param format="interval" name="input" type="data"
label="Fetch sequences corresponding to Query">
- <validator type="unspecified_build" />
- <validator type="dataset_metadata_in_file"
filename="alignseq.loc" metadata_name="dbkey"
metadata_column="1" message="Sequences are not currently available for the
specified build." line_startswith="seq" />
+ <param format="interval,gff" name="input"
type="data" label="Fetch sequences corresponding to Query">
+ <validator type="unspecified_build" />
+ <validator type="dataset_metadata_in_file"
filename="alignseq.loc" metadata_name="dbkey"
metadata_column="1" message="Sequences are not currently available for the
specified build." line_startswith="seq" /></param><param
name="out_format" type="select" label="Output data
type">
- <option value="fasta">FASTA</option>
- <option value="interval">Interval</option>
+ <option value="fasta">FASTA</option>
+ <option
value="interval">Interval</option></param></inputs><outputs>
- <data format="fasta" name="out_file1"
metadata_source="input">
+ <data format="input" name="out_file1"
metadata_source="input"><change_format>
- <when input="out_format" value="interval"
format="interval" />
+ <when input="out_format" value="fasta"
format="fasta" /></change_format></data></outputs>
@@ -34,6 +41,17 @@
<param name="out_format" value="interval"/><output
name="out_file1" file="extract_genomic_dna_out3.interval"
/></test>
+ <!-- Test GFF file support. -->
+ <test>
+ <param name="input" value="gff_filtering_out1.gff"
dbkey="mm9" ftype="gff" />
+ <param name="out_format" value="interval"/>
+ <output name="out_file1" file="extract_genomic_dna_out4.gff"
/>
+ </test>
+ <test>
+ <param name="input" value="gff_filtering_out1.gff"
dbkey="mm9" ftype="gff" />
+ <param name="out_format" value="fasta"/>
+ <output name="out_file1"
file="extract_genomic_dna_out5.fasta" />
+ </test></tests><help>
@@ -90,7 +108,7 @@ Extracting sequences with **FASTA** outp
CACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCAC
ACACG
-Extrracting sequences with **Interval** output data type returns::
+Extracting sequences with **Interval** output data type returns::
chr7 127475281 127475310 NM_000230 0 +
GTAGGAATCGCAGCGCCAGCGGTTGCAAG
chr7 127485994 127486166 NM_000230 0 +
GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG
--- /dev/null
+++ b/test-data/extract_genomic_dna_out5.fasta
@@ -0,0 +1,258 @@
+>mm9_chr10_62044836_62045189_+
+AATTACAAGATCGACACACCAAGATAGGCAGATCCATGGTTGGTTTTACT
+TTGTAAATCTAAAAGTATGTTGGAAAACGATGCAATGAATTCTTATCCTT
+TTTCAAAATGAAGAATTTGTGATGGTTAGTGGACAGTTCAGAAGCCTCTC
+TGCAAGAAAGGGGGCGCTGAGAAGTGGTAAAAAAAGGAAGGAAGCACTCG
+GGCTTTGTCAGCAGGGTGGACCCTGGGGTCCACAGTGGGAACAGTCCCTT
+CTGGCCTCTACTCACTGACCAAACGCTTTACTAAAACTCCGCTTCTGGCC
+TCTGTTGCCACCTCCTGGTCGCTGTCCTCGGAAGTTTCTACTTCCTCCTC
+GCT
+>mm9_chr10_75372918_75373002_+
+GCGTCTCGCAGCTTCTGCCCGTCGATCTCCATGTCGAGCCGGATGGGCAC
+CAGCACCTCAGGCTGTGACGCATTCTCATGGATC
+>mm9_chr10_80362427_80363292_-
+ATGACGGACAAGTGTTTCCGGAAGTGCATCGGGAAGCCCGGGGGCTCCTT
+GGATAACTCGGAGCAGGTGAGACATCTCGGGAACCCGGGGTGGTGAGGGG
+CGCGGGGTCAGGAGCGTCTAGGAGGTTGAGAGATGTGCGCGTGCGCGGCC
+TCTAGCCTTAGCTACTGAGGAAGTTGTGCGCGTGCGCGGGGTGAGGACCC
+GGCTTCTGTGCCTAGATCGGTGCAGCCTTCATGGGTGATCCTCGGGTCGT
+GTGACCGTCAGTCAGGGATCCCCCTCCACGCTTTGCAGAAATGCATCGCC
+ATGTGCATGGACCGCTACATGGACGCCTGGAATACCGTGTCCCGCGCCTA
+CAACTCTCGACTGCAGCGGGAACGAGCCAACATGTGACCGGGACCTGTGC
+CTCGGGACACCGTGCTTATGGTCTGAACTGTTTTCCCTGCCAGTTAGGGT
+GTCTCCTCCTAGCCGCCCTGAAGTCTGGCAGCATGGAGGGCTTGGGGATC
+GAGGCCTCTCCCCTGGGTTGCTGCGTCCAGCTCAATCTCAGAAGAGAGTG
+AGGACCCGACAGAGCACAGGGATCTGGCTGGCCCCACTGACCTGTGACCT
+CAGGAGAGCAGGCCAATAAATCGCTGCTGGGGCAGTAAAGCAGGCGTGTC
+ACCTCACTGCTTCAGGTCCCTTCCCCTGAGTAGGCCCAGACCTCCCAGGG
+TATCTTTCCCCTTGGGGTCAGTGGGCTGCTGGCTCTCAGGGAATTCGGAG
+CATGATCTCAGGTGTTTGGTCATCCCGGGGAGACCAGCCGAGGTTAAGAA
+GCAAGGCTTCATGTagccttcacctatcatgcatgaggcccagggtgctg
+accttaactctgaat
+>mm9_chr11_7904564_7904642_+
+CATCTTCTATTTGAGCCTCCATCCAGGCACCTCTGAAACAAAGGTGCACT
+CACTGCATGTCCACTTGTCACAGGAGCC
+>mm9_chr11_78140155_78140259_+
+CTGCTTGCTAATTTTCTCTCTTGGGATCAGGGGGACGTGAACTCCAGCCC
+TGACTCGTGCTCCTTATGCTCTGAGTACATAGCAAATAAATGAGAGCAAA
+ACAC
+>mm9_chr11_105616461_105616737_+
+TAGGTGTAATAGTGGAAAACAATAGTTTTTAAACTTCAGAGTCCAGGGCT
+GTAACTCAGTAGTAACAGTGTTCTCTAAGTATGTTATTCTTCCTCTACAT
+GCTGAAATTTTTCATATTTGGAGCATTCACTGTTCCATGTATCAGTAAAT
+TATATTGTGAGCTGTCATCATATCTAAGCACCATATTGAATATTTTTCAT
+GATTAAAATTTGTTGAAACAACAATTCTATGACCGAAAAAAGCAAGGCTT
+TGTAAATAACATGTTTGTTACTAGTA
+>mm9_chr12_30701761_30702509_+
+TGTGGAGTGTACTTATATGATCCCTATGCTGATAGGATTACCTTCCTAGA
+CATAGCTAGACGCAAAGCCACATGTGTAAGGCTGCTGAGCAAAGACAGCA
+TCCCAGCATGGGTGTGTTCACGGTGGATTCACCACGTTGCATATGTAAAG
+TGGTCCCCTTGGCTTACCCTTCACTTTGCTCATGAGATTCAGAAGCTGGT
+GGTCCAGCAGGGGTGAGCATTTGTGAAATAGTAAGCTGAACTTAGTGGTG
+AGATTTCAGAACAGACTTCTGTGAAGTAAGAGATGTAACCATGCATCTAA
+AATCAGATGGCCGTGTAACTGCTCGGGCATAGAAATGGTGGGAGAACCTG
+TCCTGGGTACCTGGCATTTCACATGAGCCCAGGGATATGTCTTGTGCCAA
+GGCACACAAGTGTCCATGGACTTGGACAGGTGCCAAGGGTTTTTGTCTCT
+GTTCCTATGTGGGAGGCTGGCTGTGATTTACATTAATTTCTGTATTTCAA
+ACGAAGATGTCTGCAGATCTCCATTTTGATGTTACAGCCTCATTGCCCAG
+GCAGTGGGCAGTGCCCAGACACCCTTTCTGACTAGCCACTGCATTGGGCT
+TCTGTGATTCAAAGTAGTGTATATATTTATTTACTTCTCTGACTGTGGCC
+AACAGCCAAATGCCATTTTATGTTCCTTGTATTCAGTCCATTACCAAAGA
+GGTGTTTGCACTTTGTAATGATACCTTTCAGTTCAAATAAAAGGACCA
+>mm9_chr13_49159495_49159569_+
+ttttcttttggattacttgatttttttttatttgatcttatttatgatga
+ttttgagtacatttttgaacagtt
+>mm9_chr13_100200303_100200330_+
+TCTCATATGAATAGCCACCCTCTTCTG
+>mm9_chr14_31949102_31949152_+
+GGATGCTATCCGCGATGTGCATGTAAAGGGCCTCATGTACCAGTGGATCG
+>mm9_chr14_67604226_67604668_+
+TTCACCGTGAGAGTTTTCTCCATTTCACTCTTCACTGTGCTGTTCTCTGT
+GCCGCTTTCCTCTTGACTTATAAACATCTGAGCCAGTTTTCAATAAACTT
+AAAACGAAGCCTGCTTCTCATCCCAAATTGTAAACAGGAATAAAGCTTTT
+TAAACCTTATCTTAAATTTTAACTTTGTTGAATTCTGCTTTGTGATAGGA
+CAATCTGTTTCACCCAACAAGAATCTGTGTAGGAGGATGAACATCCCGCA
+TGTTGGAGCTGCAAATCAGCACTGTACAAGCTCACTGATGGACAGCTGTT
+CTGTGATGTATTCCATGATTTTACTAATACTTTCAAAAATGGCAAAACTA
+ACTTCAGTTTTAATGTTGAAAGAAAATCATAAATGTTCCCATAGTTCAAT
+GGCACTGTCGATGAAACTGCTACTGAATTTAGAGAGAAAACG
+>mm9_chr14_75165581_75165744_+
+ggccctgggatgataTAACAGAAGAGTCTAAAGGAGGCTTCTGAGATGTG
+CAGTAGGAAAGCCTGGCACATAATAGGTTATTATCTAAATCCCTTCACTA
+CTCTTCAAAGACAGCAGGATGCCTCTGCTCCCATGTTTTATCTCTACTTA
+TGTGGAATTTATG
+>mm9_chr16_57154026_57154067_+
+GTTGAGGTTTATTTAAGTAAAATGATTTTTTAAAAAAGCAA
+>mm9_chr16_74862301_74862560_+
+GCATTGGCAGCAGATATTGGTACCCAGTGGCACTGCAGAGTACTTACAAT
+CAGGACTCGCTACTGTGCTTCATTCTGCTTTTCTCTCTGCTTCTATTACA
+GTTAAAGTGTTGCTAATTATAGAAACTCTCTGTTTATTGAACCTCGGTGT
+TAAGAAAAACTTGTAATCTTCAGATATGATCCGAAAGATTCCCAAACAAA
+TGTAACAAGGTCCACTTTTGTAGCCCTTTCTACCAGAAcactggttatca
+acctgtggg
+>mm9_chr16_98168778_98168914_+
+CCTATTTATTTCACTAAACATCTGCCTGCTAGCTGAGATAAACATTCTCT
+AAAAAACTGTTTACTGCAAAAAGTGATTACTGTTTTTTATTAGTTTCTTA
+GCATTTGAAATAGTTACATGAATGGAAGGATAGAGT
+>mm9_chr17_8483211_8483268_+
+AGACTTGTCAACAGCTCACCCAATGATGGAACTGAGGCTGCCCCTCAAGT
+GGCCAGA
+>mm9_chr17_30355790_30355913_+
+atctcatacccataagctcagaactcggggtggtaacataggaggactgc
+catgagtgtgactaacctgggctataggaggaggatctaccttaagcaaa
+tgaCCAACAAAACTAACAAGCTC
+>mm9_chr18_39571717_39571880_+
+TATAACATTCCATAAATGTACAATAATCTATTTTTGAGAAGCTCATTTTG
+AAACTTAACACTGTCATTGATAATCTTCAAGTGGTATTTCTTAGGCACCA
+TAAATTTCACATCCAGCTGGGTTACAATTATTTTAAAGTACTTTGAGACC
+AATTTAAACCATT
+>mm9_chr19_17633087_17633203_+
+TGGGAAATGAACTGCATGGCAATGAACCCCAGGGAATTTGGTGGTTAATT
+GTCTAAGGATAAGGACATCAGTTTTGTCTTTTGCATCACTGTGACCTTTG
+CCTCTAATTGTATAGA
+>mm9_chr19_41997623_41997859_+
+gctacacaacgactcacatagagggaagcaggcacacatcagataaaaca
+cAAAAGGATGGGTTGGTGATGGGCATAGTTAATGAGGGCCACTAGGTAAA
+TACACCTGATCCAAAAGTCACGCTACTACTTAGATTCTTCTCTCTGCTAA
+AGACAACAGAAgacatgttagccatgcttgtaatccctgcattggggaga
+tggagtcagaaatatcactgcaagttcacccaatag
+>mm9_chr19_56516514_56516684_+
+TGTATTCATTCACTATTCACTGATTTGTCAGATCATCCATCCACACAGGT
+GCTGAAGAGTAACCCATTTCACTTTGTATACAAGATAATGTTTTTGTACT
+TCAAATACATCTGGAATTCTTTCAAATATTCCAAGATTTTTTTTTTTTCT
+GAATAATCTTTGGTTACCTC
+>mm9_chr2_4543773_4543977_+
+gagccatttctccagccccTTTATGTGGAATATTAACAAGAGAAGACAAC
+ATAAAATGACTTACCATGCTGTGTGGCCTAACAGTGGATGAAGAATGAGT
+GATTTGGGCATTTCTGATAGTATTTATAAAGAAGACTTTTATGACCAAAC
+CACATGTCACAGTAGGGATTTGCTGCACATCTTATGAGAGTTTCTTCTTT
+GTCA
+>mm9_chr2_30200330_30200938_+
+CGCACACAAAGGATTTATTTGCCAGAGAGCAAGCAGACAGGCAGAGGTCA
+GAATGTTAGTTAGAAACTGAAGGAATGACTGCTGTAGCCACTGTGCCCAG
+CCAGAGCCATGAGGGAAGTGGGAGGCAGCACTTGGTGCTGCTGCTCTGGC
+TGACCCTTCTGGTTTCCTGCCACACTCCTAGCCCTGCCTGTGTGCTGCTG
+TCCCCCTCAACCTTCCACAGCCAGAAGGCAGATGTTCTTTCATGCCAAGA
+GCATCCATCCCCAGCATATCCTGGGCCCATGGTGGTGTCAAATGTAGTGA
+CCCTTCTGCCTTAAGGGAGCTGGGAAGCCTGGGGTGTGCAGGGTTGCAGG
+TCAGAAGCAGGACTAGCAGAGGGGCCTGGGGCCATTCTGTCTTGTGGGCT
+CTTTAATAGCTGAATGACGGGCACAGCCAGAAAAGGGTTAGGTCCCTTAT
+CCTAAGCAGCTCTGTGGCCAGCAGACGACTCTAAGTGGCAGAGCCTGGGA
+AGGGGCTGCTTAGCTGAGAAGTTCCAGGTAGGTGACAGGAACCTTGCCCT
+TCTTGTTGCCTCTCTCACCAATGAGCCAGTCGGGATCCATGCCTGGCAGG
+CTGTAGAC
+>mm9_chr2_106644219_106644341_+
+attcttaaggtaaatacctaggagtgatgtaacccagtcatagggaagaa
+ctacttttaatttgttgagcaacccccaacctgattttgacacaggtttg
+agtagtttacacttctactaac
+>mm9_chr2_125388930_125389219_+
+AGAGCACACAGCACATCACTTAGGCCTCCAACATTAAGGCAGCGCAAGTG
+CCTCAAGTAACTGAGAATACTTTACTCAGATACAAGGGTATCAAAAACAT
+GAGAACTGGCAGGAAGACCTCACAATGGTTTGTTAGCATCAAGTATTACC
+ATCCAGTTTCCTGTTTAAATAGTAATTAATGACTATTCTGAAATAAGGCA
+AATAATTACTCAAGCGGGCTGTCAAAGCCACTATCCTGTTGGCTGGGCAT
+CGGAGCAGTTAACTTTATCAAAGGCTTCTGACACAATGA
+>mm9_chr3_130936638_130936898_+
+CGAGGCTGCAGGCTGCAAATGTTCCCAGGCAGGCAAGACCTCACGTCCTA
+CTGGCTGCTGCCCTTGGGTGCATCTGTAGGCCCCGTGGCTCCTGCCCCTG
+GGGTTCAACACCGATAAACATAGAATACTCATTTTCAGAAGACCTGAGGG
+AATGAGTCTAAGCAACGCTTTTTACAAAAAGTGGCAAGGTTCAGGAAAAA
+AAAAAAAAAAGATGTTGCTCCAAGGCACCAAGGGTGTAATTTTTTTTCAG
+AAAAAGTCAG
+>mm9_chr3_136592670_136592771_+
+TGTCAGCCCATCACATTTTAGTGACAACAGTCATAGCCTTTATTTTCAGA
+TGACTTTCCTCTAAAACCACTGTCTATGAGTTGCCCCCCAAAACTCAAAA
+A
+>mm9_chr3_152861373_152861508_+
+ATCAAAAGCGACATGCAAGCATCTTGCTCTCACCACAGATCACTGAGACA
+TTAAGAGTGACGTCTCTTGAACTGTTGGCACGCCTAAGTTATTTCAGCAT
+TTCTTGCTCAGCAGTTGTTCTCTTGGCTTCCTCTG
+>mm9_chr4_13715309_13715630_+
+AACACATGGCCACATCATGTGATATTTTCAAAACACTTACACATAGCTTT
+GAGAAGGTCCCTGCAGGAATGATCCATCCTCTCACAGTTGGCCCATTTTT
+TAACAGCATATCTGCATTTTCCATTTAGGAGAGCTATATATTATTAGCTT
+ACATTTTTGGGTAGTAAAACAGTGCATTGCTGATTGTAAAACATGGACTT
+TATTATCTGCTGAAAATTGATTTGGCATTTATAGCCACTGTGTATTAGAC
+TGTTTTTCTGTTTTTAACATCAATGCTTAAAAGCGATGATTTGTGTTTaa
+aaaaattaaaaaaataaaata
+>mm9_chr4_147515028_147515097_+
+GCTGACGTGCTCTCCGAGTTCCTGGAGGTGGCCGTGCACCTGATTCTCTA
+TGTGCGCGAGGTCTACCCG
+>mm9_chr5_3949521_3949685_+
+AGTCCCAACCACCCCCTTGTTTAATGTATAACTTTCTGAAATGGGAGCGT
+TAGAATGGATTAAAATGGTTGGTAGGTGGTTGGATCACCAACCAAGACCA
+GAAATAGAGGGGTAGGCTGCTCAGGAGAGTATTGGGAGGGTAGCTATTAT
+TTGCATTTTGTGCT
+>mm9_chr5_68089693_68089831_+
+CAATGATAGAGAAGACTAAAATAAAAGCAGGCATGCTGGCACAAGCGACA
+GAAGGAAAAAGCCTCACCCGGCCCTGTTTGAGGCCACTCCTGGTGGCTCC
+TTTTCCAAGGACCATGCGGTCAAGCCTCTGAGTTGTTC
+>mm9_chr5_122819525_122819619_+
+CTTTAGAAAAGATGCATCTGTCATTGATTTAGGGATATGAATTGTTTGGA
+TTTGAGTAGTTTTCCATAACTCCTGCAGTTTGGCAATGTGTGCG
+>mm9_chr5_145619547_145619710_+
+CGGCGTTCTGAAAACTGTGCTCCGGGATGAGATCATTGCTTGGCACAAAA
+AGACACAGGAGGACACTTCCTCTCCACTGTCGGCCGCAGGGCAGCCTGAG
+AACATGGACAGCCAGCAGCTGGTTTCCTTAGTTCAGAAAGCCGTCACTGC
+CATCATGACCCGC
+>mm9_chr6_83928983_83929105_+
+ACAGGAACCATTATTTACATTTAATTTGGATGAATTTGTTACTGTGGATG
+AAGTCATAGAAGAAGTAAATCCTTCTCAAGCCAAGCAGAATCCATTAAAA
+GGAAAAAGAAAGGAAGCCCTCA
+>mm9_chr6_118857948_118858148_+
+CCAGGCTTGCTAGTTGGTGCAGTTAGCTACATCTCAGGACAGAGACAAGG
+TACTCTGAGCTCCCCTTGAACTGCCACACAAGCTGTCTCCTGGATGCCAA
+GCAGAGAAACCTGGAGACAACAATCATCATACTCAAAACCAGGATCTCTT
+TCTTAAGACTTTTGTATTTTGTCCCAGCCCTAACCCTGAGTTCTGCTGAA
+>mm9_chr7_85554209_85554343_+
+GTGAAACATCATGCTTCTGCATCAAGTTATTAGTGGGAAACCTGTAAAAG
+TTGACATTGAATGCTGATAACAAATTACTTTCATCCTGTCTCATAATGAA
+TCCTACATCAAGACAAGGCAAGTGAGAAAGAGGG
+>mm9_chr7_104055490_104055589_+
+ACATTTCTCCTCTCTTGGGGGAGCGCATCTCCTTGGGTGTGTCCACATCC
+GCCCCTAGGTACCCAGTGTGATGTGAGACACGAGTGTCTGTGCTAACTT
+>mm9_chr8_9970397_9970545_+
+AGTCTTCACCAAAATTAAGTCTCAGCTAACTTAAAAGTTGCAAGGATTTT
+TTTCAATAAAATTAATATCTTAAGTGTTTGGTGTTTAGATGATTCTCTCT
+CAACTTCCCCCACATTATCAAAAAACATTTGATGAACCTTAAAAACTC
+>mm9_chr9_20449845_20449932_+
+CCAGCACCGATGACACCATCGGCGACTTGAAGAAACTGATAGCTGCTCAA
+ACTGGCACCCGCTGGAACAAGATCGTTCTTAAAAAGT
+>mm9_chr9_107445869_107445930_+
+CAAGCAGAAGCTGGTGCCCATCATGACCATCCTGCTGGAAGAGCTGAATG
+CCTCCGGCCGC
+>mm9_chr9_120860475_120860606_+
+CTGCCATTGTACGCACCATGCAGAATACAAATGATGTAGAGACAGCTCGT
+TGTACTGCTGGGACTCTGCACAACCTTTCTCACCACCGCGAGGGCTTGCT
+GGCCATCTTTAAGTCTGGTGGCATCCCAGCG
+>mm9_chrX_10274056_10274087_+
+ACTTCGCTGTCATCATTTGTACAAACTCTTT
+>mm9_chrX_39881430_39881678_+
+AGCTAAAAAGAGTCCTTTTCTGACAGAAAGGCTGGACTTCTCCTTTTCAC
+CGTTTCTCTTACTGATGCTTTTGCCAGAAGAACAGTAAAGATTTAGACAC
+TGTCATGATTCATACACGTAAAATATTTTTCAAGGACACAATCTGATATA
+CTAACATTTATTTAAGAGGTTAAAGTCCACCACTAAATCTAAGGAAAGAT
+TTTTAACTGCCAAACACATTTCCTTTGACAAATAATGTAAGATGACAA
+>mm9_chrX_148249671_148249713_+
+AATGCTAGTATGAACAGTGGGAGGAATGAGCAAAATGTTACA
+>mm9_chrX_148481504_148482455_+
+CGCCACAACCTGCTACAGGCCTGTAAGATGCAGGACATCAAACTGCCACT
+GTCAAAGGGCACCATGGATGATATTAGTCAGGAAGAAGTGAGTATTATGG
+TGGGTGGTAGGAGTCATCTATGAATATTTAACCAGTAATGGGAGATTACA
+GATGGCCAGGAAGGGCAGGCAACAGATAGGACCACATAGAGTTGTGAGGG
+GCATAAAGATGGATGCAGAAGAAATGTGGCAAGGTGGAAGTAGTGAAGTC
+AGGCTTTGGTATGAGAGAGACATTGATTTGAGAGGAGAGCTGCAAGCCAG
+TGAGTACTCAGAAAGACCAAGAATGGGTCATTAATCTTAAGGATTTGAGC
+TCTTAGCTGCAGCAGATACTGGGCATGGGTAGGAGTGAGAATTGAGGAGC
+AGAGGAAGATGGGAAACTGGAGAACCTAAGGAGACTGATAGCTTAGCTGC
+AGTAAGGGAGGTTGGCCAGAAGAGGGTTGGGTAGGGGACTCAGCAAGGCA
+GAACTAAGGAAGCTTAGGTGGAGGGGAAGGAACAACATCTGAGCAACTAA
+AGCACTCTATCAACTGGAAGTGCAAGATGGTAGTGAGGGGTGGACAGGTG
+TAACTGAGTAACTCTTTGTAGGTAGCCTTTCAGTTTAATTCAGTAAAATA
+TTTTGAACACTAGTATTCCAGATACTGGTAGGCCATGACTTAACCATTCC
+TAATGTTAATCTCAGCTGTGCTAGCTGAGCTTGTGTTCACATTAGACATG
+AAGAAACTTAGTAAAAGGTAGAGCCCAGTTTTCGGTTTGGACCTTCCTGT
+TGGCCTCTGCTTCCGTGCCATCTAGCAAAGGAGTTCCTAATCTCTAGAGG
+GATACAAATGACTAGTCTGCTCCATCTGCCTCTTCCAACATTGCAGGGTA
+GCTCCCAGGGAGAAGAGTCAGTGAGTGGTTCCCAGAGAACATCCAGTATC
+T
--- a/tools/new_operations/gops_intersect.py
+++ b/tools/new_operations/gops_intersect.py
@@ -70,7 +70,7 @@ def main():
for line in intersect( [g1,g2], pieces=pieces, mincols=mincols ):
if type( line ) == GenomicInterval:
if in1_gff_format:
- line = convert_to_gff_coordinates( line )
+ line = convert_bed_coords_to_gff( line )
out_file.write( "%s\n" % "\t".join( line.fields ) )
else:
out_file.write( "%s\n" % line )
--- a/tools/extract/extract_genomic_dna.py
+++ b/tools/extract/extract_genomic_dna.py
@@ -5,6 +5,7 @@ usage: %prog $input $out_file1
-d, --dbkey=N: Genome build of input file
-o, --output_format=N: the data type of the output file
-g, --GALAXY_DATA_INDEX_DIR=N: the directory containing alignseq.loc
+ -G, --gff: input and output file, when it is interval, coordinates are treated as GFF
format (1-based, closed) rather than 'traditional' 0-based, half-open format.
"""
from galaxy import eggs
import pkg_resources
@@ -14,6 +15,7 @@ from bx.cookbook import doc_optparse
import bx.seq.nib
import bx.seq.twobit
from galaxy.tools.util.galaxyops import *
+from galaxy.tools.util.gff_util import *
assert sys.version_info[:2] >= ( 2, 4 )
@@ -50,6 +52,7 @@ def __main__():
chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
dbkey = options.dbkey
output_format = options.output_format
+ gff_format = options.gff
GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
input_filename, output_filename = args
except:
@@ -80,6 +83,8 @@ def __main__():
chrom = fields[chrom_col]
start = int( fields[start_col] )
end = int( fields[end_col] )
+ if gff_format:
+ start, end = convert_gff_coords_to_bed( [start, end] )
if includes_strand_col:
strand = fields[strand_col]
except:
@@ -162,7 +167,11 @@ def __main__():
c = b
else: # output_format == "interval"
meta_data = "\t".join( fields )
- fout.write( "%s\t%s\n" % ( meta_data, str( sequence ) ) )
+ if gff_format:
+ format_str = "%s seq \"%s\";\n"
+ else:
+ format_str = "%s\t%s\n"
+ fout.write( format_str % ( meta_data, str( sequence ) ) )
fout.close()
--- a/tools/new_operations/gops_subtract.py
+++ b/tools/new_operations/gops_subtract.py
@@ -71,7 +71,7 @@ def main():
for line in subtract( [g1,g2], pieces=pieces, mincols=mincols ):
if type( line ) is GenomicInterval:
if in1_gff_format:
- line = convert_to_gff_coordinates( line )
+ line = convert_bed_coords_to_gff( line )
out_file.write( "%s\n" % "\t".join( line.fields ) )
else:
out_file.write( "%s\n" % line )
--- /dev/null
+++ b/test-data/extract_genomic_dna_out4.gff
@@ -0,0 +1,46 @@
+chr10 Cufflinks transcript 62044837 62045189 1000 . . gene_id "CUFF.23531";
transcript_id "CUFF.23531.1"; FPKM "19.5178121606"; frac
"1.000000"; conf_lo "9.264456"; conf_hi "29.771168"; cov
"1.108611"; seq
"AATTACAAGATCGACACACCAAGATAGGCAGATCCATGGTTGGTTTTACTTTGTAAATCTAAAAGTATGTTGGAAAACGATGCAATGAATTCTTATCCTTTTTCAAAATGAAGAATTTGTGATGGTTAGTGGACAGTTCAGAAGCCTCTCTGCAAGAAAGGGGGCGCTGAGAAGTGGTAAAAAAAGGAAGGAAGCACTCGGGCTTTGTCAGCAGGGTGGACCCTGGGGTCCACAGTGGGAACAGTCCCTTCTGGCCTCTACTCACTGACCAAACGCTTTACTAAAACTCCGCTTCTGGCCTCTGTTGCCACCTCCTGGTCGCTGTCCTCGGAAGTTTCTACTTCCTCCTCGCT";
+chr10 Cufflinks transcript 75372919 75373002 1000 . . gene_id "CUFF.24985";
transcript_id "CUFF.24985.1"; FPKM "124.4970510798"; frac
"1.000000"; conf_lo "71.411330"; conf_hi "177.582772"; cov
"7.071429"; seq
"GCGTCTCGCAGCTTCTGCCCGTCGATCTCCATGTCGAGCCGGATGGGCACCAGCACCTCAGGCTGTGACGCATTCTCATGGATC";
+chr10 Cufflinks transcript 80362428 80363292 1000 - . gene_id "CUFF.26065";
transcript_id "CUFF.26065.1"; FPKM "43.6170921216"; frac
"1.000000"; conf_lo "32.260169"; conf_hi "54.974016"; cov
"2.477449"; seq
"ATGACGGACAAGTGTTTCCGGAAGTGCATCGGGAAGCCCGGGGGCTCCTTGGATAACTCGGAGCAGGTGAGACATCTCGGGAACCCGGGGTGGTGAGGGGCGCGGGGTCAGGAGCGTCTAGGAGGTTGAGAGATGTGCGCGTGCGCGGCCTCTAGCCTTAGCTACTGAGGAAGTTGTGCGCGTGCGCGGGGTGAGGACCCGGCTTCTGTGCCTAGATCGGTGCAGCCTTCATGGGTGATCCTCGGGTCGTGTGACCGTCAGTCAGGGATCCCCCTCCACGCTTTGCAGAAATGCATCGCCATGTGCATGGACCGCTACATGGACGCCTGGAATACCGTGTCCCGCGCCTACAACTCTCGACTGCAGCGGGAACGAGCCAACATGTGACCGGGACCTGTGCCTCGGGACACCGTGCTTATGGTCTGAACTGTTTTCCCTGCCAGTTAGGGTGTCTCCTCCTAGCCGCCCTGAAGTCTGGCAGCATGGAGGGCTTGGGGATCGAGGCCTCTCCCCTGGGTTGCTGCGTCCAGCTCAATCTCAGAAGAGAGTGAGGACCCGACAGAGCACAGGGATCTGGCTGGCCCCACTGACCTGTGACCTCAGGAGAGCAGGCCAATAAATCGCTGCTGGGGCAGTAAAGCAGGCGTGTCACCTCACTGCTTCAGGTCCCTTCCCCTGAGTAGGCCCAGACCTCCCAGGGTATCTTTCCCCTTGGGGTCAGTGGGCTGCTGGCTCTCAGGGAATTCGGAGCATGATCTCAGGTGTTTGGTCATCCCGGGGA
GACCAGCCGAGGTTAAGAAGCAAGGCTTCATGTagccttcacctatcatgcatgaggcccagggtgctgaccttaactctgaat";
+chr11 Cufflinks transcript 7904565 7904642 1000 . . gene_id "CUFF.33508";
transcript_id "CUFF.33508.1"; FPKM "61.6484988869"; frac
"1.000000"; conf_lo "22.882428"; conf_hi "100.414569"; cov
"3.501633"; seq
"CATCTTCTATTTGAGCCTCCATCCAGGCACCTCTGAAACAAAGGTGCACTCACTGCATGTCCACTTGTCACAGGAGCC";
+chr11 Cufflinks exon 78140156 78140259 1000 . . gene_id "CUFF.43148";
transcript_id "CUFF.43148.1"; exon_number "1"; FPKM
"54.8483511750"; frac "1.000000"; conf_lo "23.181641";
conf_hi "86.515061"; cov "3.115385"; seq
"CTGCTTGCTAATTTTCTCTCTTGGGATCAGGGGGACGTGAACTCCAGCCCTGACTCGTGCTCCTTATGCTCTGAGTACATAGCAAATAAATGAGAGCAAAACAC";
+chr11 Cufflinks exon 105616462 105616737 1000 . . gene_id "CUFF.48385";
transcript_id "CUFF.48385.1"; exon_number "1"; FPKM
"18.9452034252"; frac "1.000000"; conf_lo "7.520816";
conf_hi "30.369591"; cov "1.076087"; seq
"TAGGTGTAATAGTGGAAAACAATAGTTTTTAAACTTCAGAGTCCAGGGCTGTAACTCAGTAGTAACAGTGTTCTCTAAGTATGTTATTCTTCCTCTACATGCTGAAATTTTTCATATTTGGAGCATTCACTGTTCCATGTATCAGTAAATTATATTGTGAGCTGTCATCATATCTAAGCACCATATTGAATATTTTTCATGATTAAAATTTGTTGAAACAACAATTCTATGACCGAAAAAAGCAAGGCTTTGTAAATAACATGTTTGTTACTAGTA";
+chr12 Cufflinks exon 30701762 30702509 1000 . . gene_id "CUFF.53897";
transcript_id "CUFF.53897.1"; exon_number "1"; FPKM
"48.9333329111"; frac "1.000000"; conf_lo "37.780391";
conf_hi "60.086275"; cov "2.779412"; seq
"TGTGGAGTGTACTTATATGATCCCTATGCTGATAGGATTACCTTCCTAGACATAGCTAGACGCAAAGCCACATGTGTAAGGCTGCTGAGCAAAGACAGCATCCCAGCATGGGTGTGTTCACGGTGGATTCACCACGTTGCATATGTAAAGTGGTCCCCTTGGCTTACCCTTCACTTTGCTCATGAGATTCAGAAGCTGGTGGTCCAGCAGGGGTGAGCATTTGTGAAATAGTAAGCTGAACTTAGTGGTGAGATTTCAGAACAGACTTCTGTGAAGTAAGAGATGTAACCATGCATCTAAAATCAGATGGCCGTGTAACTGCTCGGGCATAGAAATGGTGGGAGAACCTGTCCTGGGTACCTGGCATTTCACATGAGCCCAGGGATATGTCTTGTGCCAAGGCACACAAGTGTCCATGGACTTGGACAGGTGCCAAGGGTTTTTGTCTCTGTTCCTATGTGGGAGGCTGGCTGTGATTTACATTAATTTCTGTATTTCAAACGAAGATGTCTGCAGATCTCCATTTTGATGTTACAGCCTCATTGCCCAGGCAGTGGGCAGTGCCCAGACACCCTTTCTGACTAGCCACTGCATTGGGCTTCTGTGATTCAAAGTAGTGTATATATTTATTTACTTCTCTGACTGTGGCCAACAGCCAAATGCCATTTTATGTTCCTTGTATTCAGTCCATTACCAAAGAGGTGTTTGCACTTTGTAATGATACCTTTCAGTTCAAATAAAAGGACCA";
+chr13 Cufflinks exon 49159496 49159569 1000 . . gene_id "CUFF.67788";
transcript_id "CUFF.67788.1"; exon_number "1"; FPKM
"44.9657653777"; frac "1.000000"; conf_lo "10.974842";
conf_hi "78.956689"; cov "2.554054"; seq
"ttttcttttggattacttgatttttttttatttgatcttatttatgatgattttgagtacatttttgaacagtt";
+chr13 Cufflinks transcript 100200304 100200330 1000 . . gene_id "CUFF.73108";
transcript_id "CUFF.73108.1"; FPKM "123.2395051093"; frac
"1.000000"; conf_lo "30.079196"; conf_hi "216.399814"; cov
"7.000000"; seq "TCTCATATGAATAGCCACCCTCTTCTG";
+chr14 Cufflinks transcript 31949103 31949152 1000 . . gene_id "CUFF.77316";
transcript_id "CUFF.77316.1"; FPKM "85.5634278330"; frac
"1.000000"; conf_lo "28.521143"; conf_hi "142.605713"; cov
"4.860000"; seq "GGATGCTATCCGCGATGTGCATGTAAAGGGCCTCATGTACCAGTGGATCG";
+chr14 Cufflinks exon 67604227 67604668 1000 . . gene_id "CUFF.81446";
transcript_id "CUFF.81446.1"; exon_number "1"; FPKM
"123.6776546104"; frac "1.000000"; conf_lo "100.611653";
conf_hi "146.743656"; cov "7.024887"; seq
"TTCACCGTGAGAGTTTTCTCCATTTCACTCTTCACTGTGCTGTTCTCTGTGCCGCTTTCCTCTTGACTTATAAACATCTGAGCCAGTTTTCAATAAACTTAAAACGAAGCCTGCTTCTCATCCCAAATTGTAAACAGGAATAAAGCTTTTTAAACCTTATCTTAAATTTTAACTTTGTTGAATTCTGCTTTGTGATAGGACAATCTGTTTCACCCAACAAGAATCTGTGTAGGAGGATGAACATCCCGCATGTTGGAGCTGCAAATCAGCACTGTACAAGCTCACTGATGGACAGCTGTTCTGTGATGTATTCCATGATTTTACTAATACTTTCAAAAATGGCAAAACTAACTTCAGTTTTAATGTTGAAAGAAAATCATAAATGTTCCCATAGTTCAATGGCACTGTCGATGAAACTGCTACTGAATTTAGAGAGAAAACG";
+chr14 Cufflinks exon 75165582 75165744 1000 . . gene_id "CUFF.82088";
transcript_id "CUFF.82088.1"; exon_number "1"; FPKM
"20.4139057543"; frac "1.000000"; conf_lo "4.982443";
conf_hi "35.845368"; cov "1.159509"; seq
"ggccctgggatgataTAACAGAAGAGTCTAAAGGAGGCTTCTGAGATGTGCAGTAGGAAAGCCTGGCACATAATAGGTTATTATCTAAATCCCTTCACTACTCTTCAAAGACAGCAGGATGCCTCTGCTCCCATGTTTTATCTCTACTTATGTGGAATTTATG";
+chr16 Cufflinks transcript 57154027 57154067 1000 . . gene_id "CUFF.103364";
transcript_id "CUFF.103364.1"; FPKM "162.3154457537"; frac
"1.000000"; conf_lo "75.554191"; conf_hi "249.076701"; cov
"9.219512"; seq "GTTGAGGTTTATTTAAGTAAAATGATTTTTTAAAAAAGCAA";
+chr16 Cufflinks exon 74862302 74862560 1000 . . gene_id "CUFF.105450";
transcript_id "CUFF.105450.1"; exon_number "1"; FPKM
"11.0120241741"; frac "1.000000"; conf_lo "2.020744";
conf_hi "20.003304"; cov "0.625483"; seq
"GCATTGGCAGCAGATATTGGTACCCAGTGGCACTGCAGAGTACTTACAATCAGGACTCGCTACTGTGCTTCATTCTGCTTTTCTCTCTGCTTCTATTACAGTTAAAGTGTTGCTAATTATAGAAACTCTCTGTTTATTGAACCTCGGTGTTAAGAAAAACTTGTAATCTTCAGATATGATCCGAAAGATTCCCAAACAAATGTAACAAGGTCCACTTTTGTAGCCCTTTCTACCAGAAcactggttatcaacctgtggg";
+chr16 Cufflinks transcript 98168779 98168914 1000 . . gene_id "CUFF.107834";
transcript_id "CUFF.107834.1"; FPKM "24.4666664555"; frac
"1.000000"; conf_lo "5.971605"; conf_hi "42.961728"; cov
"1.389706"; seq
"CCTATTTATTTCACTAAACATCTGCCTGCTAGCTGAGATAAACATTCTCTAAAAAACTGTTTACTGCAAAAAGTGATTACTGTTTTTTATTAGTTTCTTAGCATTTGAAATAGTTACATGAATGGAAGGATAGAGT";
+chr17 Cufflinks exon 8483212 8483268 1000 . . gene_id "CUFF.108498";
transcript_id "CUFF.108498.1"; exon_number "1"; FPKM
"50.0370923000"; frac "1.000000"; conf_lo "9.181978";
conf_hi "90.892207"; cov "2.842105"; seq
"AGACTTGTCAACAGCTCACCCAATGATGGAACTGAGGCTGCCCCTCAAGTGGCCAGA";
+chr17 Cufflinks exon 30355791 30355913 1000 . . gene_id "CUFF.111759";
transcript_id "CUFF.111759.1"; exon_number "1"; FPKM
"19.3232673516"; frac "1.000000"; conf_lo "2.040012";
conf_hi "36.606523"; cov "1.097561"; seq
"atctcatacccataagctcagaactcggggtggtaacataggaggactgccatgagtgtgactaacctgggctataggaggaggatctaccttaagcaaatgaCCAACAAAACTAACAAGCTC";
+chr18 Cufflinks transcript 39571718 39571880 1000 . . gene_id "CUFF.123569";
transcript_id "CUFF.123569.1"; FPKM "20.4139057543"; frac
"1.000000"; conf_lo "4.982443"; conf_hi "35.845368"; cov
"1.159509"; seq
"TATAACATTCCATAAATGTACAATAATCTATTTTTGAGAAGCTCATTTTGAAACTTAACACTGTCATTGATAATCTTCAAGTGGTATTTCTTAGGCACCATAAATTTCACATCCAGCTGGGTTACAATTATTTTAAAGTACTTTGAGACCAATTTAAACCATT";
+chr19 Cufflinks exon 17633088 17633203 1000 . . gene_id "CUFF.131333";
transcript_id "CUFF.131333.1"; exon_number "1"; FPKM
"20.4893265884"; frac "1.000000"; conf_lo "2.163116";
conf_hi "38.815537"; cov "1.163793"; seq
"TGGGAAATGAACTGCATGGCAATGAACCCCAGGGAATTTGGTGGTTAATTGTCTAAGGATAAGGACATCAGTTTTGTCTTTTGCATCACTGTGACCTTTGCCTCTAATTGTATAGA";
+chr19 Cufflinks transcript 41997624 41997859 1000 . . gene_id "CUFF.133569";
transcript_id "CUFF.133569.1"; FPKM "28.1988698132"; frac
"1.000000"; conf_lo "13.125940"; conf_hi "43.271800"; cov
"1.601695"; seq
"gctacacaacgactcacatagagggaagcaggcacacatcagataaaacacAAAAGGATGGGTTGGTGATGGGCATAGTTAATGAGGGCCACTAGGTAAATACACCTGATCCAAAAGTCACGCTACTACTTAGATTCTTCTCTCTGCTAAAGACAACAGAAgacatgttagccatgcttgtaatccctgcattggggagatggagtcagaaatatcactgcaagttcacccaatag";
+chr19 Cufflinks exon 56516515 56516684 1000 . . gene_id "CUFF.135203";
transcript_id "CUFF.135203.1"; exon_number "1"; FPKM
"33.5542854247"; frac "1.000000"; conf_lo "14.181710";
conf_hi "52.926861"; cov "1.905882"; seq
"TGTATTCATTCACTATTCACTGATTTGTCAGATCATCCATCCACACAGGTGCTGAAGAGTAACCCATTTCACTTTGTATACAAGATAATGTTTTTGTACTTCAAATACATCTGGAATTCTTTCAAATATTCCAAGATTTTTTTTTTTTCTGAATAATCTTTGGTTACCTC";
+chr2 Cufflinks transcript 4543774 4543977 1000 . . gene_id "CUFF.136435";
transcript_id "CUFF.136435.1"; FPKM "37.2825393608"; frac
"1.000000"; conf_lo "18.641270"; conf_hi "55.923809"; cov
"2.117647"; seq
"gagccatttctccagccccTTTATGTGGAATATTAACAAGAGAAGACAACATAAAATGACTTACCATGCTGTGTGGCCTAACAGTGGATGAAGAATGAGTGATTTGGGCATTTCTGATAGTATTTATAAAGAAGACTTTTATGACCAAACCACATGTCACAGTAGGGATTTGCTGCACATCTTATGAGAGTTTCTTCTTTGTCA";
+chr2 Cufflinks transcript 30200331 30200938 1000 . . gene_id "CUFF.140289";
transcript_id "CUFF.140289.1"; FPKM "100.0741846001"; frac
"1.000000"; conf_lo "82.383401"; conf_hi "117.764968"; cov
"5.684211"; seq
"CGCACACAAAGGATTTATTTGCCAGAGAGCAAGCAGACAGGCAGAGGTCAGAATGTTAGTTAGAAACTGAAGGAATGACTGCTGTAGCCACTGTGCCCAGCCAGAGCCATGAGGGAAGTGGGAGGCAGCACTTGGTGCTGCTGCTCTGGCTGACCCTTCTGGTTTCCTGCCACACTCCTAGCCCTGCCTGTGTGCTGCTGTCCCCCTCAACCTTCCACAGCCAGAAGGCAGATGTTCTTTCATGCCAAGAGCATCCATCCCCAGCATATCCTGGGCCCATGGTGGTGTCAAATGTAGTGACCCTTCTGCCTTAAGGGAGCTGGGAAGCCTGGGGTGTGCAGGGTTGCAGGTCAGAAGCAGGACTAGCAGAGGGGCCTGGGGCCATTCTGTCTTGTGGGCTCTTTAATAGCTGAATGACGGGCACAGCCAGAAAAGGGTTAGGTCCCTTATCCTAAGCAGCTCTGTGGCCAGCAGACGACTCTAAGTGGCAGAGCCTGGGAAGGGGCTGCTTAGCTGAGAAGTTCCAGGTAGGTGACAGGAACCTTGCCCTTCTTGTTGCCTCTCTCACCAATGAGCCAGTCGGGATCCATGCCTGGCAGGCTGTAGAC";
+chr2 Cufflinks transcript 106644220 106644341 1000 . . gene_id "CUFF.148977";
transcript_id "CUFF.148977.1"; FPKM "27.2743167045"; frac
"1.000000"; conf_lo "6.656871"; conf_hi "47.891762"; cov
"1.549180"; seq
"attcttaaggtaaatacctaggagtgatgtaacccagtcatagggaagaactacttttaatttgttgagcaacccccaacctgattttgacacaggtttgagtagtttacacttctactaac";
+chr2 Cufflinks exon 125388931 125389219 1000 . . gene_id "CUFF.151331";
transcript_id "CUFF.151331.1"; exon_number "1"; FPKM
"23.0274507817"; frac "1.000000"; conf_lo "10.718761";
conf_hi "35.336141"; cov "1.307958"; seq
"AGAGCACACAGCACATCACTTAGGCCTCCAACATTAAGGCAGCGCAAGTGCCTCAAGTAACTGAGAATACTTTACTCAGATACAAGGGTATCAAAAACATGAGAACTGGCAGGAAGACCTCACAATGGTTTGTTAGCATCAAGTATTACCATCCAGTTTCCTGTTTAAATAGTAATTAATGACTATTCTGAAATAAGGCAAATAATTACTCAAGCGGGCTGTCAAAGCCACTATCCTGTTGGCTGGGCATCGGAGCAGTTAACTTTATCAAAGGCTTCTGACACAATGA";
+chr3 Cufflinks transcript 130936639 130936898 1000 . . gene_id "CUFF.171349";
transcript_id "CUFF.171349.1"; FPKM "20.1110620975"; frac
"1.000000"; conf_lo "7.983635"; conf_hi "32.238489"; cov
"1.142308"; seq
"CGAGGCTGCAGGCTGCAAATGTTCCCAGGCAGGCAAGACCTCACGTCCTACTGGCTGCTGCCCTTGGGTGCATCTGTAGGCCCCGTGGCTCCTGCCCCTGGGGTTCAACACCGATAAACATAGAATACTCATTTTCAGAAGACCTGAGGGAATGAGTCTAAGCAACGCTTTTTACAAAAAGTGGCAAGGTTCAGGAAAAAAAAAAAAAAAGATGTTGCTCCAAGGCACCAAGGGTGTAATTTTTTTTCAGAAAAAGTCAG";
+chr3 Cufflinks exon 136592671 136592771 1000 . . gene_id "CUFF.171861";
transcript_id "CUFF.171861.1"; exon_number "1"; FPKM
"32.9452142371"; frac "1.000000"; conf_lo "8.040973";
conf_hi "57.849455"; cov "1.871287"; seq
"TGTCAGCCCATCACATTTTAGTGACAACAGTCATAGCCTTTATTTTCAGATGACTTTCCTCTAAAACCACTGTCTATGAGTTGCCCCCCAAAACTCAAAAA";
+chr3 Cufflinks transcript 152861374 152861508 1000 . . gene_id "CUFF.173007";
transcript_id "CUFF.173007.1"; FPKM "24.6479010219"; frac
"1.000000"; conf_lo "6.015839"; conf_hi "43.279963"; cov
"1.400000"; seq
"ATCAAAAGCGACATGCAAGCATCTTGCTCTCACCACAGATCACTGAGACATTAAGAGTGACGTCTCTTGAACTGTTGGCACGCCTAAGTTATTTCAGCATTTCTTGCTCAGCAGTTGTTCTCTTGGCTTCCTCTG";
+chr4 Cufflinks exon 13715310 13715630 1000 . . gene_id "CUFF.174817";
transcript_id "CUFF.174817.1"; exon_number "1"; FPKM
"19.2510308382"; frac "1.000000"; conf_lo "8.572480";
conf_hi "29.929581"; cov "1.093458"; seq
"AACACATGGCCACATCATGTGATATTTTCAAAACACTTACACATAGCTTTGAGAAGGTCCCTGCAGGAATGATCCATCCTCTCACAGTTGGCCCATTTTTTAACAGCATATCTGCATTTTCCATTTAGGAGAGCTATATATTATTAGCTTACATTTTTGGGTAGTAAAACAGTGCATTGCTGATTGTAAAACATGGACTTTATTATCTGCTGAAAATTGATTTGGCATTTATAGCCACTGTGTATTAGACTGTTTTTCTGTTTTTAACATCAATGCTTAAAAGCGATGATTTGTGTTTaaaaaaattaaaaaaataaaata";
+chr4 Cufflinks exon 147515029 147515097 1000 . . gene_id "CUFF.190627";
transcript_id "CUFF.190627.1"; exon_number "1"; FPKM
"34.4458244094"; frac "1.000000"; conf_lo "3.636542";
conf_hi "65.255106"; cov "1.956522"; seq
"GCTGACGTGCTCTCCGAGTTCCTGGAGGTGGCCGTGCACCTGATTCTCTATGTGCGCGAGGTCTACCCG";
+chr5 Cufflinks exon 3949522 3949685 1000 . . gene_id "CUFF.192485";
transcript_id "CUFF.192485.1"; exon_number "1"; FPKM
"23.1879208220"; frac "1.000000"; conf_lo "6.791585";
conf_hi "39.584257"; cov "1.317073"; seq
"AGTCCCAACCACCCCCTTGTTTAATGTATAACTTTCTGAAATGGGAGCGTTAGAATGGATTAAAATGGTTGGTAGGTGGTTGGATCACCAACCAAGACCAGAAATAGAGGGGTAGGCTGCTCAGGAGAGTATTGGGAGGGTAGCTATTATTTGCATTTTGTGCT";
+chr5 Cufflinks transcript 68089694 68089831 1000 . . gene_id "CUFF.199409";
transcript_id "CUFF.199409.1"; FPKM "17.2229122047"; frac
"1.000000"; conf_lo "1.818271"; conf_hi "32.627553"; cov
"0.978261"; seq
"CAATGATAGAGAAGACTAAAATAAAAGCAGGCATGCTGGCACAAGCGACAGAAGGAAAAAGCCTCACCCGGCCCTGTTTGAGGCCACTCCTGGTGGCTCCTTTTCCAAGGACCATGCGGTCAAGCCTCTGAGTTGTTC";
+chr5 Cufflinks exon 122819526 122819619 1000 . . gene_id "CUFF.205487";
transcript_id "CUFF.205487.1"; exon_number "1"; FPKM
"25.2486782797"; frac "1.000000"; conf_lo "2.649470";
conf_hi "47.847887"; cov "1.434124"; seq
"CTTTAGAAAAGATGCATCTGTCATTGATTTAGGGATATGAATTGTTTGGATTTGAGTAGTTTTCCATAACTCCTGCAGTTTGGCAATGTGTGCG";
+chr5 Cufflinks transcript 145619548 145619710 1000 . . gene_id "CUFF.209965";
transcript_id "CUFF.209965.1"; FPKM "40.8278115086"; frac
"1.000000"; conf_lo "19.004428"; conf_hi "62.651195"; cov
"2.319018"; seq
"CGGCGTTCTGAAAACTGTGCTCCGGGATGAGATCATTGCTTGGCACAAAAAGACACAGGAGGACACTTCCTCTCCACTGTCGGCCGCAGGGCAGCCTGAGAACATGGACAGCCAGCAGCTGGTTTCCTTAGTTCAGAAAGCCGTCACTGCCATCATGACCCGC";
+chr6 Cufflinks exon 83928984 83929105 1000 . . gene_id "CUFF.219317";
transcript_id "CUFF.219317.1"; exon_number "1"; FPKM
"46.7559714935"; frac "1.000000"; conf_lo "19.761399";
conf_hi "73.750544"; cov "2.655738"; seq
"ACAGGAACCATTATTTACATTTAATTTGGATGAATTTGTTACTGTGGATGAAGTCATAGAAGAAGTAAATCCTTCTCAAGCCAAGCAGAATCCATTAAAAGGAAAAAGAAAGGAAGCCCTCA";
+chr6 Cufflinks exon 118857949 118858148 1000 . . gene_id "CUFF.223543";
transcript_id "CUFF.223543.1"; exon_number "1"; FPKM
"19.0140950740"; frac "1.000000"; conf_lo "5.569100";
conf_hi "32.459091"; cov "1.080000"; seq
"CCAGGCTTGCTAGTTGGTGCAGTTAGCTACATCTCAGGACAGAGACAAGGTACTCTGAGCTCCCCTTGAACTGCCACACAAGCTGTCTCCTGGATGCCAAGCAGAGAAACCTGGAGACAACAATCATCATACTCAAAACCAGGATCTCTTTCTTAAGACTTTTGTATTTTGTCCCAGCCCTAACCCTGAGTTCTGCTGAA";
+chr7 Cufflinks transcript 85554210 85554343 1000 . . gene_id "CUFF.235778";
transcript_id "CUFF.235778.1"; FPKM "17.7370289869"; frac
"1.000000"; conf_lo "1.872548"; conf_hi "33.601510"; cov
"1.007463"; seq
"GTGAAACATCATGCTTCTGCATCAAGTTATTAGTGGGAAACCTGTAAAAGTTGACATTGAATGCTGATAACAAATTACTTTCATCCTGTCTCATAATGAATCCTACATCAAGACAAGGCAAGTGAGAAAGAGGG";
+chr7 Cufflinks exon 104055491 104055589 1000 . . gene_id "CUFF.238474";
transcript_id "CUFF.238474.1"; exon_number "1"; FPKM
"28.8092349606"; frac "1.000000"; conf_lo "5.286593";
conf_hi "52.331877"; cov "1.636364"; seq
"ACATTTCTCCTCTCTTGGGGGAGCGCATCTCCTTGGGTGTGTCCACATCCGCCCCTAGGTACCCAGTGTGATGTGAGACACGAGTGTCTGTGCTAACTT";
+chr8 Cufflinks exon 9970398 9970545 1000 . . gene_id "CUFF.245320";
transcript_id "CUFF.245320.1"; exon_number "1"; FPKM
"22.4828826889"; frac "1.000000"; conf_lo "5.487421";
conf_hi "39.478345"; cov "1.277027"; seq
"AGTCTTCACCAAAATTAAGTCTCAGCTAACTTAAAAGTTGCAAGGATTTTTTTCAATAAAATTAATATCTTAAGTGTTTGGTGTTTAGATGATTCTCTCTCAACTTCCCCCACATTATCAAAAAACATTTGATGAACCTTAAAAACTC";
+chr9 Cufflinks transcript 20449846 20449932 1000 . . gene_id "CUFF.260747";
transcript_id "CUFF.260747.1"; FPKM "234.9313045507"; frac
"1.000000"; conf_lo "163.275950"; conf_hi "306.586659"; cov
"13.344091"; seq
"CCAGCACCGATGACACCATCGGCGACTTGAAGAAACTGATAGCTGCTCAAACTGGCACCCGCTGGAACAAGATCGTTCTTAAAAAGT";
+chr9 Cufflinks exon 107445870 107445930 1000 . . gene_id "CUFF.272761";
transcript_id "CUFF.272761.1"; exon_number "1"; FPKM
"38.9633095779"; frac "1.000000"; conf_lo "4.113466";
conf_hi "73.813153"; cov "2.213115"; seq
"CAAGCAGAAGCTGGTGCCCATCATGACCATCCTGCTGGAAGAGCTGAATGCCTCCGGCCGC";
+chr9 Cufflinks transcript 120860476 120860606 1000 . . gene_id "CUFF.275115";
transcript_id "CUFF.275115.1"; FPKM "25.4005086867"; frac
"1.000000"; conf_lo "6.199529"; conf_hi "44.601488"; cov
"1.442748"; seq
"CTGCCATTGTACGCACCATGCAGAATACAAATGATGTAGAGACAGCTCGTTGTACTGCTGGGACTCTGCACAACCTTTCTCACCACCGCGAGGGCTTGCTGGCCATCTTTAAGTCTGGTGGCATCCCAGCG";
+chrX Cufflinks exon 10274057 10274087 1000 . . gene_id "CUFF.276147";
transcript_id "CUFF.276147.1"; exon_number "1"; FPKM
"99.5432248142"; frac "1.000000"; conf_lo "21.405127";
conf_hi "177.681323"; cov "5.654052"; seq
"ACTTCGCTGTCATCATTTGTACAAACTCTTT";
+chrX Cufflinks transcript 39881431 39881678 1000 . . gene_id "CUFF.277419";
transcript_id "CUFF.277419.1"; FPKM "42.1683560109"; frac
"1.000000"; conf_lo "24.187709"; conf_hi "60.149003"; cov
"2.395161"; seq
"AGCTAAAAAGAGTCCTTTTCTGACAGAAAGGCTGGACTTCTCCTTTTCACCGTTTCTCTTACTGATGCTTTTGCCAGAAGAACAGTAAAGATTTAGACACTGTCATGATTCATACACGTAAAATATTTTTCAAGGACACAATCTGATATACTAACATTTATTTAAGAGGTTAAAGTCCACCACTAAATCTAAGGAAAGATTTTTAACTGCCAAACACATTTCCTTTGACAAATAATGTAAGATGACAA";
+chrX Cufflinks transcript 148249672 148249713 1000 . . gene_id "CUFF.282847";
transcript_id "CUFF.282847.1"; FPKM "56.5895686726"; frac
"1.000000"; conf_lo "5.974320"; conf_hi "107.204818"; cov
"3.214286"; seq "AATGCTAGTATGAACAGTGGGAGGAATGAGCAAAATGTTACA";
+chrX Cufflinks transcript 148481505 148482455 1000 + . gene_id "CUFF.282965";
transcript_id "CUFF.282965.1"; FPKM "40.1706233958"; frac
"1.000000"; conf_lo "16.978103"; conf_hi "63.363144"; cov
"2.281690"; seq
"CGCCACAACCTGCTACAGGCCTGTAAGATGCAGGACATCAAACTGCCACTGTCAAAGGGCACCATGGATGATATTAGTCAGGAAGAAGTGAGTATTATGGTGGGTGGTAGGAGTCATCTATGAATATTTAACCAGTAATGGGAGATTACAGATGGCCAGGAAGGGCAGGCAACAGATAGGACCACATAGAGTTGTGAGGGGCATAAAGATGGATGCAGAAGAAATGTGGCAAGGTGGAAGTAGTGAAGTCAGGCTTTGGTATGAGAGAGACATTGATTTGAGAGGAGAGCTGCAAGCCAGTGAGTACTCAGAAAGACCAAGAATGGGTCATTAATCTTAAGGATTTGAGCTCTTAGCTGCAGCAGATACTGGGCATGGGTAGGAGTGAGAATTGAGGAGCAGAGGAAGATGGGAAACTGGAGAACCTAAGGAGACTGATAGCTTAGCTGCAGTAAGGGAGGTTGGCCAGAAGAGGGTTGGGTAGGGGACTCAGCAAGGCAGAACTAAGGAAGCTTAGGTGGAGGGGAAGGAACAACATCTGAGCAACTAAAGCACTCTATCAACTGGAAGTGCAAGATGGTAGTGAGGGGTGGACAGGTGTAACTGAGTAACTCTTTGTAGGTAGCCTTTCAGTTTAATTCAGTAAAATATTTTGAACACTAGTATTCCAGATACTGGTAGGCCATGACTTAACCATTCCTAATGTTAATCTCAGCTGTGCTAGCTGAGCTTGTGTTCACATTAGACATGAAGAAACTTAGTAAAAGGTAGAGCCCAG
TTTTCGGTTTGGACCTTCCTGTTGGCCTCTGCTTCCGTGCCATCTAGCAAAGGAGTTCCTAATCTCTAGAGGGATACAAATGACTAGTCTGCTCCATCTGCCTCTTCCAACATTGCAGGGTAGCTCCCAGGGAGAAGAGTCAGTGAGTGGTTCCCAGAGAACATCCAGTATCT";
--- a/lib/galaxy/tools/util/gff_util.py
+++ b/lib/galaxy/tools/util/gff_util.py
@@ -6,23 +6,37 @@ from bx.intervals.io import NiceReaderWr
class GFFReaderWrapper( NiceReaderWrapper ):
"""
- Reader wrapper converts GFF format--starting and ending coordinates are 1-based,
closed--to the 'traditional' interval format--0 based,
- half-open. This is useful when using GFF files as inputs to tools that expect
traditional interval format.
+ Reader wrapper converts GFF format--starting and ending coordinates are 1-based,
closed--to the
+ 'traditional'/BED interval format--0 based, half-open. This is useful when
using GFF files as inputs
+ to tools that expect traditional interval format.
"""
def parse_row( self, line ):
- interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col,
self.start_col, self.end_col, self.strand_col, self.default_strand,
fix_strand=self.fix_strand )
- # Change from 1-based to 0-based format.
- interval.start -= 1
- # Add 1 to end to move from closed to open format for end coordinate.
- interval.end += 1
+ interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col,
self.start_col, self.end_col, \
+ self.strand_col, self.default_strand,
fix_strand=self.fix_strand )
+ interval = convert_gff_coords_to_bed( interval )
return interval
-def convert_to_gff_coordinates( interval ):
+def convert_bed_coords_to_gff( interval ):
"""
- Converts a GenomicInterval's coordinates to GFF format.
+ Converts an interval object's coordinates from BED format to GFF format. Accepted
object types include
+ GenomicInterval and list (where the first element in the list is the interval's
start, and the second
+ element is the interval's end).
"""
if type( interval ) is GenomicInterval:
interval.start += 1
- interval.end -= 1
- return interval
+ elif type ( interval ) is list:
+ interval[ 0 ] += 1
return interval
+
+def convert_gff_coords_to_bed( interval ):
+ """
+ Converts an interval object's coordinates from GFF format to BED format. Accepted
object types include
+ GenomicInterval and list (where the first element in the list is the interval's
start, and the second
+ element is the interval's end).
+ """
+ if type( interval ) is GenomicInterval:
+ interval.start -= 1
+ elif type ( interval ) is list:
+ interval[ 0 ] -= 1
+ return interval
+