# HG changeset patch -- Bitbucket.org # Project galaxy-dist # URL http://bitbucket.org/galaxy/galaxy-dist/overview # User jeremy goecks <jeremy.goecks@emory.edu> # Date 1278618202 14400 # Node ID 2d046444998edb1c4a5126897990bddd25de69f5 # Parent 41089d12cdd9eaaeb8e75e09157d412ccf935eae Enable 'extract genomic DNA' tool to accept and produce GFF files and added functional tests for this feature. --- a/tools/extract/extract_genomic_dna.xml +++ b/tools/extract/extract_genomic_dna.xml @@ -1,20 +1,27 @@ <tool id="Extract genomic DNA 1" name="Extract Genomic DNA" version="2.2.1"><description>using coordinates from assembled/unassembled genomes</description> - <command interpreter="python">extract_genomic_dna.py $input $out_file1 -1 ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol} -d $dbkey -o $out_format -g ${GALAXY_DATA_INDEX_DIR}</command> + <command interpreter="python"> + extract_genomic_dna.py $input $out_file1 -d $dbkey -o $out_format -g ${GALAXY_DATA_INDEX_DIR} + #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): + -1 1,4,5,7 --gff + #else: + -1 ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol} + #end if + </command><inputs> - <param format="interval" name="input" type="data" label="Fetch sequences corresponding to Query"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="alignseq.loc" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." line_startswith="seq" /> + <param format="interval,gff" name="input" type="data" label="Fetch sequences corresponding to Query"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="alignseq.loc" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." 
line_startswith="seq" /></param><param name="out_format" type="select" label="Output data type"> - <option value="fasta">FASTA</option> - <option value="interval">Interval</option> + <option value="fasta">FASTA</option> + <option value="interval">Interval</option></param></inputs><outputs> - <data format="fasta" name="out_file1" metadata_source="input"> + <data format="input" name="out_file1" metadata_source="input"><change_format> - <when input="out_format" value="interval" format="interval" /> + <when input="out_format" value="fasta" format="fasta" /></change_format></data></outputs> @@ -34,6 +41,17 @@ <param name="out_format" value="interval"/><output name="out_file1" file="extract_genomic_dna_out3.interval" /></test> + <!-- Test GFF file support. --> + <test> + <param name="input" value="gff_filtering_out1.gff" dbkey="mm9" ftype="gff" /> + <param name="out_format" value="interval"/> + <output name="out_file1" file="extract_genomic_dna_out4.gff" /> + </test> + <test> + <param name="input" value="gff_filtering_out1.gff" dbkey="mm9" ftype="gff" /> + <param name="out_format" value="fasta"/> + <output name="out_file1" file="extract_genomic_dna_out5.fasta" /> + </test></tests><help> @@ -90,7 +108,7 @@ Extracting sequences with **FASTA** outp CACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCAC ACACG -Extrracting sequences with **Interval** output data type returns:: +Extracting sequences with **Interval** output data type returns:: chr7 127475281 127475310 NM_000230 0 + GTAGGAATCGCAGCGCCAGCGGTTGCAAG chr7 127485994 127486166 NM_000230 0 + GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG --- /dev/null +++ b/test-data/extract_genomic_dna_out5.fasta @@ -0,0 +1,258 @@ +>mm9_chr10_62044836_62045189_+ +AATTACAAGATCGACACACCAAGATAGGCAGATCCATGGTTGGTTTTACT +TTGTAAATCTAAAAGTATGTTGGAAAACGATGCAATGAATTCTTATCCTT +TTTCAAAATGAAGAATTTGTGATGGTTAGTGGACAGTTCAGAAGCCTCTC 
+TGCAAGAAAGGGGGCGCTGAGAAGTGGTAAAAAAAGGAAGGAAGCACTCG +GGCTTTGTCAGCAGGGTGGACCCTGGGGTCCACAGTGGGAACAGTCCCTT +CTGGCCTCTACTCACTGACCAAACGCTTTACTAAAACTCCGCTTCTGGCC +TCTGTTGCCACCTCCTGGTCGCTGTCCTCGGAAGTTTCTACTTCCTCCTC +GCT +>mm9_chr10_75372918_75373002_+ +GCGTCTCGCAGCTTCTGCCCGTCGATCTCCATGTCGAGCCGGATGGGCAC +CAGCACCTCAGGCTGTGACGCATTCTCATGGATC +>mm9_chr10_80362427_80363292_- +ATGACGGACAAGTGTTTCCGGAAGTGCATCGGGAAGCCCGGGGGCTCCTT +GGATAACTCGGAGCAGGTGAGACATCTCGGGAACCCGGGGTGGTGAGGGG +CGCGGGGTCAGGAGCGTCTAGGAGGTTGAGAGATGTGCGCGTGCGCGGCC +TCTAGCCTTAGCTACTGAGGAAGTTGTGCGCGTGCGCGGGGTGAGGACCC +GGCTTCTGTGCCTAGATCGGTGCAGCCTTCATGGGTGATCCTCGGGTCGT +GTGACCGTCAGTCAGGGATCCCCCTCCACGCTTTGCAGAAATGCATCGCC +ATGTGCATGGACCGCTACATGGACGCCTGGAATACCGTGTCCCGCGCCTA +CAACTCTCGACTGCAGCGGGAACGAGCCAACATGTGACCGGGACCTGTGC +CTCGGGACACCGTGCTTATGGTCTGAACTGTTTTCCCTGCCAGTTAGGGT +GTCTCCTCCTAGCCGCCCTGAAGTCTGGCAGCATGGAGGGCTTGGGGATC +GAGGCCTCTCCCCTGGGTTGCTGCGTCCAGCTCAATCTCAGAAGAGAGTG +AGGACCCGACAGAGCACAGGGATCTGGCTGGCCCCACTGACCTGTGACCT +CAGGAGAGCAGGCCAATAAATCGCTGCTGGGGCAGTAAAGCAGGCGTGTC +ACCTCACTGCTTCAGGTCCCTTCCCCTGAGTAGGCCCAGACCTCCCAGGG +TATCTTTCCCCTTGGGGTCAGTGGGCTGCTGGCTCTCAGGGAATTCGGAG +CATGATCTCAGGTGTTTGGTCATCCCGGGGAGACCAGCCGAGGTTAAGAA +GCAAGGCTTCATGTagccttcacctatcatgcatgaggcccagggtgctg +accttaactctgaat +>mm9_chr11_7904564_7904642_+ +CATCTTCTATTTGAGCCTCCATCCAGGCACCTCTGAAACAAAGGTGCACT +CACTGCATGTCCACTTGTCACAGGAGCC +>mm9_chr11_78140155_78140259_+ +CTGCTTGCTAATTTTCTCTCTTGGGATCAGGGGGACGTGAACTCCAGCCC +TGACTCGTGCTCCTTATGCTCTGAGTACATAGCAAATAAATGAGAGCAAA +ACAC +>mm9_chr11_105616461_105616737_+ +TAGGTGTAATAGTGGAAAACAATAGTTTTTAAACTTCAGAGTCCAGGGCT +GTAACTCAGTAGTAACAGTGTTCTCTAAGTATGTTATTCTTCCTCTACAT +GCTGAAATTTTTCATATTTGGAGCATTCACTGTTCCATGTATCAGTAAAT +TATATTGTGAGCTGTCATCATATCTAAGCACCATATTGAATATTTTTCAT +GATTAAAATTTGTTGAAACAACAATTCTATGACCGAAAAAAGCAAGGCTT +TGTAAATAACATGTTTGTTACTAGTA +>mm9_chr12_30701761_30702509_+ +TGTGGAGTGTACTTATATGATCCCTATGCTGATAGGATTACCTTCCTAGA +CATAGCTAGACGCAAAGCCACATGTGTAAGGCTGCTGAGCAAAGACAGCA 
+TCCCAGCATGGGTGTGTTCACGGTGGATTCACCACGTTGCATATGTAAAG +TGGTCCCCTTGGCTTACCCTTCACTTTGCTCATGAGATTCAGAAGCTGGT +GGTCCAGCAGGGGTGAGCATTTGTGAAATAGTAAGCTGAACTTAGTGGTG +AGATTTCAGAACAGACTTCTGTGAAGTAAGAGATGTAACCATGCATCTAA +AATCAGATGGCCGTGTAACTGCTCGGGCATAGAAATGGTGGGAGAACCTG +TCCTGGGTACCTGGCATTTCACATGAGCCCAGGGATATGTCTTGTGCCAA +GGCACACAAGTGTCCATGGACTTGGACAGGTGCCAAGGGTTTTTGTCTCT +GTTCCTATGTGGGAGGCTGGCTGTGATTTACATTAATTTCTGTATTTCAA +ACGAAGATGTCTGCAGATCTCCATTTTGATGTTACAGCCTCATTGCCCAG +GCAGTGGGCAGTGCCCAGACACCCTTTCTGACTAGCCACTGCATTGGGCT +TCTGTGATTCAAAGTAGTGTATATATTTATTTACTTCTCTGACTGTGGCC +AACAGCCAAATGCCATTTTATGTTCCTTGTATTCAGTCCATTACCAAAGA +GGTGTTTGCACTTTGTAATGATACCTTTCAGTTCAAATAAAAGGACCA +>mm9_chr13_49159495_49159569_+ +ttttcttttggattacttgatttttttttatttgatcttatttatgatga +ttttgagtacatttttgaacagtt +>mm9_chr13_100200303_100200330_+ +TCTCATATGAATAGCCACCCTCTTCTG +>mm9_chr14_31949102_31949152_+ +GGATGCTATCCGCGATGTGCATGTAAAGGGCCTCATGTACCAGTGGATCG +>mm9_chr14_67604226_67604668_+ +TTCACCGTGAGAGTTTTCTCCATTTCACTCTTCACTGTGCTGTTCTCTGT +GCCGCTTTCCTCTTGACTTATAAACATCTGAGCCAGTTTTCAATAAACTT +AAAACGAAGCCTGCTTCTCATCCCAAATTGTAAACAGGAATAAAGCTTTT +TAAACCTTATCTTAAATTTTAACTTTGTTGAATTCTGCTTTGTGATAGGA +CAATCTGTTTCACCCAACAAGAATCTGTGTAGGAGGATGAACATCCCGCA +TGTTGGAGCTGCAAATCAGCACTGTACAAGCTCACTGATGGACAGCTGTT +CTGTGATGTATTCCATGATTTTACTAATACTTTCAAAAATGGCAAAACTA +ACTTCAGTTTTAATGTTGAAAGAAAATCATAAATGTTCCCATAGTTCAAT +GGCACTGTCGATGAAACTGCTACTGAATTTAGAGAGAAAACG +>mm9_chr14_75165581_75165744_+ +ggccctgggatgataTAACAGAAGAGTCTAAAGGAGGCTTCTGAGATGTG +CAGTAGGAAAGCCTGGCACATAATAGGTTATTATCTAAATCCCTTCACTA +CTCTTCAAAGACAGCAGGATGCCTCTGCTCCCATGTTTTATCTCTACTTA +TGTGGAATTTATG +>mm9_chr16_57154026_57154067_+ +GTTGAGGTTTATTTAAGTAAAATGATTTTTTAAAAAAGCAA +>mm9_chr16_74862301_74862560_+ +GCATTGGCAGCAGATATTGGTACCCAGTGGCACTGCAGAGTACTTACAAT +CAGGACTCGCTACTGTGCTTCATTCTGCTTTTCTCTCTGCTTCTATTACA +GTTAAAGTGTTGCTAATTATAGAAACTCTCTGTTTATTGAACCTCGGTGT +TAAGAAAAACTTGTAATCTTCAGATATGATCCGAAAGATTCCCAAACAAA +TGTAACAAGGTCCACTTTTGTAGCCCTTTCTACCAGAAcactggttatca 
+acctgtggg +>mm9_chr16_98168778_98168914_+ +CCTATTTATTTCACTAAACATCTGCCTGCTAGCTGAGATAAACATTCTCT +AAAAAACTGTTTACTGCAAAAAGTGATTACTGTTTTTTATTAGTTTCTTA +GCATTTGAAATAGTTACATGAATGGAAGGATAGAGT +>mm9_chr17_8483211_8483268_+ +AGACTTGTCAACAGCTCACCCAATGATGGAACTGAGGCTGCCCCTCAAGT +GGCCAGA +>mm9_chr17_30355790_30355913_+ +atctcatacccataagctcagaactcggggtggtaacataggaggactgc +catgagtgtgactaacctgggctataggaggaggatctaccttaagcaaa +tgaCCAACAAAACTAACAAGCTC +>mm9_chr18_39571717_39571880_+ +TATAACATTCCATAAATGTACAATAATCTATTTTTGAGAAGCTCATTTTG +AAACTTAACACTGTCATTGATAATCTTCAAGTGGTATTTCTTAGGCACCA +TAAATTTCACATCCAGCTGGGTTACAATTATTTTAAAGTACTTTGAGACC +AATTTAAACCATT +>mm9_chr19_17633087_17633203_+ +TGGGAAATGAACTGCATGGCAATGAACCCCAGGGAATTTGGTGGTTAATT +GTCTAAGGATAAGGACATCAGTTTTGTCTTTTGCATCACTGTGACCTTTG +CCTCTAATTGTATAGA +>mm9_chr19_41997623_41997859_+ +gctacacaacgactcacatagagggaagcaggcacacatcagataaaaca +cAAAAGGATGGGTTGGTGATGGGCATAGTTAATGAGGGCCACTAGGTAAA +TACACCTGATCCAAAAGTCACGCTACTACTTAGATTCTTCTCTCTGCTAA +AGACAACAGAAgacatgttagccatgcttgtaatccctgcattggggaga +tggagtcagaaatatcactgcaagttcacccaatag +>mm9_chr19_56516514_56516684_+ +TGTATTCATTCACTATTCACTGATTTGTCAGATCATCCATCCACACAGGT +GCTGAAGAGTAACCCATTTCACTTTGTATACAAGATAATGTTTTTGTACT +TCAAATACATCTGGAATTCTTTCAAATATTCCAAGATTTTTTTTTTTTCT +GAATAATCTTTGGTTACCTC +>mm9_chr2_4543773_4543977_+ +gagccatttctccagccccTTTATGTGGAATATTAACAAGAGAAGACAAC +ATAAAATGACTTACCATGCTGTGTGGCCTAACAGTGGATGAAGAATGAGT +GATTTGGGCATTTCTGATAGTATTTATAAAGAAGACTTTTATGACCAAAC +CACATGTCACAGTAGGGATTTGCTGCACATCTTATGAGAGTTTCTTCTTT +GTCA +>mm9_chr2_30200330_30200938_+ +CGCACACAAAGGATTTATTTGCCAGAGAGCAAGCAGACAGGCAGAGGTCA +GAATGTTAGTTAGAAACTGAAGGAATGACTGCTGTAGCCACTGTGCCCAG +CCAGAGCCATGAGGGAAGTGGGAGGCAGCACTTGGTGCTGCTGCTCTGGC +TGACCCTTCTGGTTTCCTGCCACACTCCTAGCCCTGCCTGTGTGCTGCTG +TCCCCCTCAACCTTCCACAGCCAGAAGGCAGATGTTCTTTCATGCCAAGA +GCATCCATCCCCAGCATATCCTGGGCCCATGGTGGTGTCAAATGTAGTGA +CCCTTCTGCCTTAAGGGAGCTGGGAAGCCTGGGGTGTGCAGGGTTGCAGG +TCAGAAGCAGGACTAGCAGAGGGGCCTGGGGCCATTCTGTCTTGTGGGCT 
+CTTTAATAGCTGAATGACGGGCACAGCCAGAAAAGGGTTAGGTCCCTTAT +CCTAAGCAGCTCTGTGGCCAGCAGACGACTCTAAGTGGCAGAGCCTGGGA +AGGGGCTGCTTAGCTGAGAAGTTCCAGGTAGGTGACAGGAACCTTGCCCT +TCTTGTTGCCTCTCTCACCAATGAGCCAGTCGGGATCCATGCCTGGCAGG +CTGTAGAC +>mm9_chr2_106644219_106644341_+ +attcttaaggtaaatacctaggagtgatgtaacccagtcatagggaagaa +ctacttttaatttgttgagcaacccccaacctgattttgacacaggtttg +agtagtttacacttctactaac +>mm9_chr2_125388930_125389219_+ +AGAGCACACAGCACATCACTTAGGCCTCCAACATTAAGGCAGCGCAAGTG +CCTCAAGTAACTGAGAATACTTTACTCAGATACAAGGGTATCAAAAACAT +GAGAACTGGCAGGAAGACCTCACAATGGTTTGTTAGCATCAAGTATTACC +ATCCAGTTTCCTGTTTAAATAGTAATTAATGACTATTCTGAAATAAGGCA +AATAATTACTCAAGCGGGCTGTCAAAGCCACTATCCTGTTGGCTGGGCAT +CGGAGCAGTTAACTTTATCAAAGGCTTCTGACACAATGA +>mm9_chr3_130936638_130936898_+ +CGAGGCTGCAGGCTGCAAATGTTCCCAGGCAGGCAAGACCTCACGTCCTA +CTGGCTGCTGCCCTTGGGTGCATCTGTAGGCCCCGTGGCTCCTGCCCCTG +GGGTTCAACACCGATAAACATAGAATACTCATTTTCAGAAGACCTGAGGG +AATGAGTCTAAGCAACGCTTTTTACAAAAAGTGGCAAGGTTCAGGAAAAA +AAAAAAAAAAGATGTTGCTCCAAGGCACCAAGGGTGTAATTTTTTTTCAG +AAAAAGTCAG +>mm9_chr3_136592670_136592771_+ +TGTCAGCCCATCACATTTTAGTGACAACAGTCATAGCCTTTATTTTCAGA +TGACTTTCCTCTAAAACCACTGTCTATGAGTTGCCCCCCAAAACTCAAAA +A +>mm9_chr3_152861373_152861508_+ +ATCAAAAGCGACATGCAAGCATCTTGCTCTCACCACAGATCACTGAGACA +TTAAGAGTGACGTCTCTTGAACTGTTGGCACGCCTAAGTTATTTCAGCAT +TTCTTGCTCAGCAGTTGTTCTCTTGGCTTCCTCTG +>mm9_chr4_13715309_13715630_+ +AACACATGGCCACATCATGTGATATTTTCAAAACACTTACACATAGCTTT +GAGAAGGTCCCTGCAGGAATGATCCATCCTCTCACAGTTGGCCCATTTTT +TAACAGCATATCTGCATTTTCCATTTAGGAGAGCTATATATTATTAGCTT +ACATTTTTGGGTAGTAAAACAGTGCATTGCTGATTGTAAAACATGGACTT +TATTATCTGCTGAAAATTGATTTGGCATTTATAGCCACTGTGTATTAGAC +TGTTTTTCTGTTTTTAACATCAATGCTTAAAAGCGATGATTTGTGTTTaa +aaaaattaaaaaaataaaata +>mm9_chr4_147515028_147515097_+ +GCTGACGTGCTCTCCGAGTTCCTGGAGGTGGCCGTGCACCTGATTCTCTA +TGTGCGCGAGGTCTACCCG +>mm9_chr5_3949521_3949685_+ +AGTCCCAACCACCCCCTTGTTTAATGTATAACTTTCTGAAATGGGAGCGT +TAGAATGGATTAAAATGGTTGGTAGGTGGTTGGATCACCAACCAAGACCA +GAAATAGAGGGGTAGGCTGCTCAGGAGAGTATTGGGAGGGTAGCTATTAT 
+TTGCATTTTGTGCT +>mm9_chr5_68089693_68089831_+ +CAATGATAGAGAAGACTAAAATAAAAGCAGGCATGCTGGCACAAGCGACA +GAAGGAAAAAGCCTCACCCGGCCCTGTTTGAGGCCACTCCTGGTGGCTCC +TTTTCCAAGGACCATGCGGTCAAGCCTCTGAGTTGTTC +>mm9_chr5_122819525_122819619_+ +CTTTAGAAAAGATGCATCTGTCATTGATTTAGGGATATGAATTGTTTGGA +TTTGAGTAGTTTTCCATAACTCCTGCAGTTTGGCAATGTGTGCG +>mm9_chr5_145619547_145619710_+ +CGGCGTTCTGAAAACTGTGCTCCGGGATGAGATCATTGCTTGGCACAAAA +AGACACAGGAGGACACTTCCTCTCCACTGTCGGCCGCAGGGCAGCCTGAG +AACATGGACAGCCAGCAGCTGGTTTCCTTAGTTCAGAAAGCCGTCACTGC +CATCATGACCCGC +>mm9_chr6_83928983_83929105_+ +ACAGGAACCATTATTTACATTTAATTTGGATGAATTTGTTACTGTGGATG +AAGTCATAGAAGAAGTAAATCCTTCTCAAGCCAAGCAGAATCCATTAAAA +GGAAAAAGAAAGGAAGCCCTCA +>mm9_chr6_118857948_118858148_+ +CCAGGCTTGCTAGTTGGTGCAGTTAGCTACATCTCAGGACAGAGACAAGG +TACTCTGAGCTCCCCTTGAACTGCCACACAAGCTGTCTCCTGGATGCCAA +GCAGAGAAACCTGGAGACAACAATCATCATACTCAAAACCAGGATCTCTT +TCTTAAGACTTTTGTATTTTGTCCCAGCCCTAACCCTGAGTTCTGCTGAA +>mm9_chr7_85554209_85554343_+ +GTGAAACATCATGCTTCTGCATCAAGTTATTAGTGGGAAACCTGTAAAAG +TTGACATTGAATGCTGATAACAAATTACTTTCATCCTGTCTCATAATGAA +TCCTACATCAAGACAAGGCAAGTGAGAAAGAGGG +>mm9_chr7_104055490_104055589_+ +ACATTTCTCCTCTCTTGGGGGAGCGCATCTCCTTGGGTGTGTCCACATCC +GCCCCTAGGTACCCAGTGTGATGTGAGACACGAGTGTCTGTGCTAACTT +>mm9_chr8_9970397_9970545_+ +AGTCTTCACCAAAATTAAGTCTCAGCTAACTTAAAAGTTGCAAGGATTTT +TTTCAATAAAATTAATATCTTAAGTGTTTGGTGTTTAGATGATTCTCTCT +CAACTTCCCCCACATTATCAAAAAACATTTGATGAACCTTAAAAACTC +>mm9_chr9_20449845_20449932_+ +CCAGCACCGATGACACCATCGGCGACTTGAAGAAACTGATAGCTGCTCAA +ACTGGCACCCGCTGGAACAAGATCGTTCTTAAAAAGT +>mm9_chr9_107445869_107445930_+ +CAAGCAGAAGCTGGTGCCCATCATGACCATCCTGCTGGAAGAGCTGAATG +CCTCCGGCCGC +>mm9_chr9_120860475_120860606_+ +CTGCCATTGTACGCACCATGCAGAATACAAATGATGTAGAGACAGCTCGT +TGTACTGCTGGGACTCTGCACAACCTTTCTCACCACCGCGAGGGCTTGCT +GGCCATCTTTAAGTCTGGTGGCATCCCAGCG +>mm9_chrX_10274056_10274087_+ +ACTTCGCTGTCATCATTTGTACAAACTCTTT +>mm9_chrX_39881430_39881678_+ +AGCTAAAAAGAGTCCTTTTCTGACAGAAAGGCTGGACTTCTCCTTTTCAC 
+CGTTTCTCTTACTGATGCTTTTGCCAGAAGAACAGTAAAGATTTAGACAC +TGTCATGATTCATACACGTAAAATATTTTTCAAGGACACAATCTGATATA +CTAACATTTATTTAAGAGGTTAAAGTCCACCACTAAATCTAAGGAAAGAT +TTTTAACTGCCAAACACATTTCCTTTGACAAATAATGTAAGATGACAA +>mm9_chrX_148249671_148249713_+ +AATGCTAGTATGAACAGTGGGAGGAATGAGCAAAATGTTACA +>mm9_chrX_148481504_148482455_+ +CGCCACAACCTGCTACAGGCCTGTAAGATGCAGGACATCAAACTGCCACT +GTCAAAGGGCACCATGGATGATATTAGTCAGGAAGAAGTGAGTATTATGG +TGGGTGGTAGGAGTCATCTATGAATATTTAACCAGTAATGGGAGATTACA +GATGGCCAGGAAGGGCAGGCAACAGATAGGACCACATAGAGTTGTGAGGG +GCATAAAGATGGATGCAGAAGAAATGTGGCAAGGTGGAAGTAGTGAAGTC +AGGCTTTGGTATGAGAGAGACATTGATTTGAGAGGAGAGCTGCAAGCCAG +TGAGTACTCAGAAAGACCAAGAATGGGTCATTAATCTTAAGGATTTGAGC +TCTTAGCTGCAGCAGATACTGGGCATGGGTAGGAGTGAGAATTGAGGAGC +AGAGGAAGATGGGAAACTGGAGAACCTAAGGAGACTGATAGCTTAGCTGC +AGTAAGGGAGGTTGGCCAGAAGAGGGTTGGGTAGGGGACTCAGCAAGGCA +GAACTAAGGAAGCTTAGGTGGAGGGGAAGGAACAACATCTGAGCAACTAA +AGCACTCTATCAACTGGAAGTGCAAGATGGTAGTGAGGGGTGGACAGGTG +TAACTGAGTAACTCTTTGTAGGTAGCCTTTCAGTTTAATTCAGTAAAATA +TTTTGAACACTAGTATTCCAGATACTGGTAGGCCATGACTTAACCATTCC +TAATGTTAATCTCAGCTGTGCTAGCTGAGCTTGTGTTCACATTAGACATG +AAGAAACTTAGTAAAAGGTAGAGCCCAGTTTTCGGTTTGGACCTTCCTGT +TGGCCTCTGCTTCCGTGCCATCTAGCAAAGGAGTTCCTAATCTCTAGAGG +GATACAAATGACTAGTCTGCTCCATCTGCCTCTTCCAACATTGCAGGGTA +GCTCCCAGGGAGAAGAGTCAGTGAGTGGTTCCCAGAGAACATCCAGTATC +T --- a/tools/new_operations/gops_intersect.py +++ b/tools/new_operations/gops_intersect.py @@ -70,7 +70,7 @@ def main(): for line in intersect( [g1,g2], pieces=pieces, mincols=mincols ): if type( line ) == GenomicInterval: if in1_gff_format: - line = convert_to_gff_coordinates( line ) + line = convert_bed_coords_to_gff( line ) out_file.write( "%s\n" % "\t".join( line.fields ) ) else: out_file.write( "%s\n" % line ) --- a/tools/extract/extract_genomic_dna.py +++ b/tools/extract/extract_genomic_dna.py @@ -5,6 +5,7 @@ usage: %prog $input $out_file1 -d, --dbkey=N: Genome build of input file -o, --output_format=N: the data type of the output file -g, --GALAXY_DATA_INDEX_DIR=N: the 
directory containing alignseq.loc + -G, --gff: input and output file, when it is interval, coordinates are treated as GFF format (1-based, closed) rather than 'traditional' 0-based, half-open format. """ from galaxy import eggs import pkg_resources @@ -14,6 +15,7 @@ from bx.cookbook import doc_optparse import bx.seq.nib import bx.seq.twobit from galaxy.tools.util.galaxyops import * +from galaxy.tools.util.gff_util import * assert sys.version_info[:2] >= ( 2, 4 ) @@ -50,6 +52,7 @@ def __main__(): chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols ) dbkey = options.dbkey output_format = options.output_format + gff_format = options.gff GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR input_filename, output_filename = args except: @@ -80,6 +83,8 @@ def __main__(): chrom = fields[chrom_col] start = int( fields[start_col] ) end = int( fields[end_col] ) + if gff_format: + start, end = convert_gff_coords_to_bed( [start, end] ) if includes_strand_col: strand = fields[strand_col] except: @@ -162,7 +167,11 @@ def __main__(): c = b else: # output_format == "interval" meta_data = "\t".join( fields ) - fout.write( "%s\t%s\n" % ( meta_data, str( sequence ) ) ) + if gff_format: + format_str = "%s seq \"%s\";\n" + else: + format_str = "%s\t%s\n" + fout.write( format_str % ( meta_data, str( sequence ) ) ) fout.close() --- a/tools/new_operations/gops_subtract.py +++ b/tools/new_operations/gops_subtract.py @@ -71,7 +71,7 @@ def main(): for line in subtract( [g1,g2], pieces=pieces, mincols=mincols ): if type( line ) is GenomicInterval: if in1_gff_format: - line = convert_to_gff_coordinates( line ) + line = convert_bed_coords_to_gff( line ) out_file.write( "%s\n" % "\t".join( line.fields ) ) else: out_file.write( "%s\n" % line ) --- /dev/null +++ b/test-data/extract_genomic_dna_out4.gff @@ -0,0 +1,46 @@ +chr10 Cufflinks transcript 62044837 62045189 1000 . . 
gene_id "CUFF.23531"; transcript_id "CUFF.23531.1"; FPKM "19.5178121606"; frac "1.000000"; conf_lo "9.264456"; conf_hi "29.771168"; cov "1.108611"; seq "AATTACAAGATCGACACACCAAGATAGGCAGATCCATGGTTGGTTTTACTTTGTAAATCTAAAAGTATGTTGGAAAACGATGCAATGAATTCTTATCCTTTTTCAAAATGAAGAATTTGTGATGGTTAGTGGACAGTTCAGAAGCCTCTCTGCAAGAAAGGGGGCGCTGAGAAGTGGTAAAAAAAGGAAGGAAGCACTCGGGCTTTGTCAGCAGGGTGGACCCTGGGGTCCACAGTGGGAACAGTCCCTTCTGGCCTCTACTCACTGACCAAACGCTTTACTAAAACTCCGCTTCTGGCCTCTGTTGCCACCTCCTGGTCGCTGTCCTCGGAAGTTTCTACTTCCTCCTCGCT"; +chr10 Cufflinks transcript 75372919 75373002 1000 . . gene_id "CUFF.24985"; transcript_id "CUFF.24985.1"; FPKM "124.4970510798"; frac "1.000000"; conf_lo "71.411330"; conf_hi "177.582772"; cov "7.071429"; seq "GCGTCTCGCAGCTTCTGCCCGTCGATCTCCATGTCGAGCCGGATGGGCACCAGCACCTCAGGCTGTGACGCATTCTCATGGATC"; +chr10 Cufflinks transcript 80362428 80363292 1000 - . gene_id "CUFF.26065"; transcript_id "CUFF.26065.1"; FPKM "43.6170921216"; frac "1.000000"; conf_lo "32.260169"; conf_hi "54.974016"; cov "2.477449"; seq "ATGACGGACAAGTGTTTCCGGAAGTGCATCGGGAAGCCCGGGGGCTCCTTGGATAACTCGGAGCAGGTGAGACATCTCGGGAACCCGGGGTGGTGAGGGGCGCGGGGTCAGGAGCGTCTAGGAGGTTGAGAGATGTGCGCGTGCGCGGCCTCTAGCCTTAGCTACTGAGGAAGTTGTGCGCGTGCGCGGGGTGAGGACCCGGCTTCTGTGCCTAGATCGGTGCAGCCTTCATGGGTGATCCTCGGGTCGTGTGACCGTCAGTCAGGGATCCCCCTCCACGCTTTGCAGAAATGCATCGCCATGTGCATGGACCGCTACATGGACGCCTGGAATACCGTGTCCCGCGCCTACAACTCTCGACTGCAGCGGGAACGAGCCAACATGTGACCGGGACCTGTGCCTCGGGACACCGTGCTTATGGTCTGAACTGTTTTCCCTGCCAGTTAGGGTGTCTCCTCCTAGCCGCCCTGAAGTCTGGCAGCATGGAGGGCTTGGGGATCGAGGCCTCTCCCCTGGGTTGCTGCGTCCAGCTCAATCTCAGAAGAGAGTGAGGACCCGACAGAGCACAGGGATCTGGCTGGCCCCACTGACCTGTGACCTCAGGAGAGCAGGCCAATAAATCGCTGCTGGGGCAGTAAAGCAGGCGTGTCACCTCACTGCTTCAGGTCCCTTCCCCTGAGTAGGCCCAGACCTCCCAGGGTATCTTTCCCCTTGGGGTCAGTGGGCTGCTGGCTCTCAGGGAATTCGGAGCATGATCTCAGGTGTTTGGTCATCCCGGGGA GACCAGCCGAGGTTAAGAAGCAAGGCTTCATGTagccttcacctatcatgcatgaggcccagggtgctgaccttaactctgaat"; +chr11 Cufflinks transcript 7904565 7904642 1000 . . 
gene_id "CUFF.33508"; transcript_id "CUFF.33508.1"; FPKM "61.6484988869"; frac "1.000000"; conf_lo "22.882428"; conf_hi "100.414569"; cov "3.501633"; seq "CATCTTCTATTTGAGCCTCCATCCAGGCACCTCTGAAACAAAGGTGCACTCACTGCATGTCCACTTGTCACAGGAGCC"; +chr11 Cufflinks exon 78140156 78140259 1000 . . gene_id "CUFF.43148"; transcript_id "CUFF.43148.1"; exon_number "1"; FPKM "54.8483511750"; frac "1.000000"; conf_lo "23.181641"; conf_hi "86.515061"; cov "3.115385"; seq "CTGCTTGCTAATTTTCTCTCTTGGGATCAGGGGGACGTGAACTCCAGCCCTGACTCGTGCTCCTTATGCTCTGAGTACATAGCAAATAAATGAGAGCAAAACAC"; +chr11 Cufflinks exon 105616462 105616737 1000 . . gene_id "CUFF.48385"; transcript_id "CUFF.48385.1"; exon_number "1"; FPKM "18.9452034252"; frac "1.000000"; conf_lo "7.520816"; conf_hi "30.369591"; cov "1.076087"; seq "TAGGTGTAATAGTGGAAAACAATAGTTTTTAAACTTCAGAGTCCAGGGCTGTAACTCAGTAGTAACAGTGTTCTCTAAGTATGTTATTCTTCCTCTACATGCTGAAATTTTTCATATTTGGAGCATTCACTGTTCCATGTATCAGTAAATTATATTGTGAGCTGTCATCATATCTAAGCACCATATTGAATATTTTTCATGATTAAAATTTGTTGAAACAACAATTCTATGACCGAAAAAAGCAAGGCTTTGTAAATAACATGTTTGTTACTAGTA"; +chr12 Cufflinks exon 30701762 30702509 1000 . . 
gene_id "CUFF.53897"; transcript_id "CUFF.53897.1"; exon_number "1"; FPKM "48.9333329111"; frac "1.000000"; conf_lo "37.780391"; conf_hi "60.086275"; cov "2.779412"; seq "TGTGGAGTGTACTTATATGATCCCTATGCTGATAGGATTACCTTCCTAGACATAGCTAGACGCAAAGCCACATGTGTAAGGCTGCTGAGCAAAGACAGCATCCCAGCATGGGTGTGTTCACGGTGGATTCACCACGTTGCATATGTAAAGTGGTCCCCTTGGCTTACCCTTCACTTTGCTCATGAGATTCAGAAGCTGGTGGTCCAGCAGGGGTGAGCATTTGTGAAATAGTAAGCTGAACTTAGTGGTGAGATTTCAGAACAGACTTCTGTGAAGTAAGAGATGTAACCATGCATCTAAAATCAGATGGCCGTGTAACTGCTCGGGCATAGAAATGGTGGGAGAACCTGTCCTGGGTACCTGGCATTTCACATGAGCCCAGGGATATGTCTTGTGCCAAGGCACACAAGTGTCCATGGACTTGGACAGGTGCCAAGGGTTTTTGTCTCTGTTCCTATGTGGGAGGCTGGCTGTGATTTACATTAATTTCTGTATTTCAAACGAAGATGTCTGCAGATCTCCATTTTGATGTTACAGCCTCATTGCCCAGGCAGTGGGCAGTGCCCAGACACCCTTTCTGACTAGCCACTGCATTGGGCTTCTGTGATTCAAAGTAGTGTATATATTTATTTACTTCTCTGACTGTGGCCAACAGCCAAATGCCATTTTATGTTCCTTGTATTCAGTCCATTACCAAAGAGGTGTTTGCACTTTGTAATGATACCTTTCAGTTCAAATAAAAGGACCA"; +chr13 Cufflinks exon 49159496 49159569 1000 . . gene_id "CUFF.67788"; transcript_id "CUFF.67788.1"; exon_number "1"; FPKM "44.9657653777"; frac "1.000000"; conf_lo "10.974842"; conf_hi "78.956689"; cov "2.554054"; seq "ttttcttttggattacttgatttttttttatttgatcttatttatgatgattttgagtacatttttgaacagtt"; +chr13 Cufflinks transcript 100200304 100200330 1000 . . gene_id "CUFF.73108"; transcript_id "CUFF.73108.1"; FPKM "123.2395051093"; frac "1.000000"; conf_lo "30.079196"; conf_hi "216.399814"; cov "7.000000"; seq "TCTCATATGAATAGCCACCCTCTTCTG"; +chr14 Cufflinks transcript 31949103 31949152 1000 . . gene_id "CUFF.77316"; transcript_id "CUFF.77316.1"; FPKM "85.5634278330"; frac "1.000000"; conf_lo "28.521143"; conf_hi "142.605713"; cov "4.860000"; seq "GGATGCTATCCGCGATGTGCATGTAAAGGGCCTCATGTACCAGTGGATCG"; +chr14 Cufflinks exon 67604227 67604668 1000 . . 
gene_id "CUFF.81446"; transcript_id "CUFF.81446.1"; exon_number "1"; FPKM "123.6776546104"; frac "1.000000"; conf_lo "100.611653"; conf_hi "146.743656"; cov "7.024887"; seq "TTCACCGTGAGAGTTTTCTCCATTTCACTCTTCACTGTGCTGTTCTCTGTGCCGCTTTCCTCTTGACTTATAAACATCTGAGCCAGTTTTCAATAAACTTAAAACGAAGCCTGCTTCTCATCCCAAATTGTAAACAGGAATAAAGCTTTTTAAACCTTATCTTAAATTTTAACTTTGTTGAATTCTGCTTTGTGATAGGACAATCTGTTTCACCCAACAAGAATCTGTGTAGGAGGATGAACATCCCGCATGTTGGAGCTGCAAATCAGCACTGTACAAGCTCACTGATGGACAGCTGTTCTGTGATGTATTCCATGATTTTACTAATACTTTCAAAAATGGCAAAACTAACTTCAGTTTTAATGTTGAAAGAAAATCATAAATGTTCCCATAGTTCAATGGCACTGTCGATGAAACTGCTACTGAATTTAGAGAGAAAACG"; +chr14 Cufflinks exon 75165582 75165744 1000 . . gene_id "CUFF.82088"; transcript_id "CUFF.82088.1"; exon_number "1"; FPKM "20.4139057543"; frac "1.000000"; conf_lo "4.982443"; conf_hi "35.845368"; cov "1.159509"; seq "ggccctgggatgataTAACAGAAGAGTCTAAAGGAGGCTTCTGAGATGTGCAGTAGGAAAGCCTGGCACATAATAGGTTATTATCTAAATCCCTTCACTACTCTTCAAAGACAGCAGGATGCCTCTGCTCCCATGTTTTATCTCTACTTATGTGGAATTTATG"; +chr16 Cufflinks transcript 57154027 57154067 1000 . . gene_id "CUFF.103364"; transcript_id "CUFF.103364.1"; FPKM "162.3154457537"; frac "1.000000"; conf_lo "75.554191"; conf_hi "249.076701"; cov "9.219512"; seq "GTTGAGGTTTATTTAAGTAAAATGATTTTTTAAAAAAGCAA"; +chr16 Cufflinks exon 74862302 74862560 1000 . . gene_id "CUFF.105450"; transcript_id "CUFF.105450.1"; exon_number "1"; FPKM "11.0120241741"; frac "1.000000"; conf_lo "2.020744"; conf_hi "20.003304"; cov "0.625483"; seq "GCATTGGCAGCAGATATTGGTACCCAGTGGCACTGCAGAGTACTTACAATCAGGACTCGCTACTGTGCTTCATTCTGCTTTTCTCTCTGCTTCTATTACAGTTAAAGTGTTGCTAATTATAGAAACTCTCTGTTTATTGAACCTCGGTGTTAAGAAAAACTTGTAATCTTCAGATATGATCCGAAAGATTCCCAAACAAATGTAACAAGGTCCACTTTTGTAGCCCTTTCTACCAGAAcactggttatcaacctgtggg"; +chr16 Cufflinks transcript 98168779 98168914 1000 . . 
gene_id "CUFF.107834"; transcript_id "CUFF.107834.1"; FPKM "24.4666664555"; frac "1.000000"; conf_lo "5.971605"; conf_hi "42.961728"; cov "1.389706"; seq "CCTATTTATTTCACTAAACATCTGCCTGCTAGCTGAGATAAACATTCTCTAAAAAACTGTTTACTGCAAAAAGTGATTACTGTTTTTTATTAGTTTCTTAGCATTTGAAATAGTTACATGAATGGAAGGATAGAGT"; +chr17 Cufflinks exon 8483212 8483268 1000 . . gene_id "CUFF.108498"; transcript_id "CUFF.108498.1"; exon_number "1"; FPKM "50.0370923000"; frac "1.000000"; conf_lo "9.181978"; conf_hi "90.892207"; cov "2.842105"; seq "AGACTTGTCAACAGCTCACCCAATGATGGAACTGAGGCTGCCCCTCAAGTGGCCAGA"; +chr17 Cufflinks exon 30355791 30355913 1000 . . gene_id "CUFF.111759"; transcript_id "CUFF.111759.1"; exon_number "1"; FPKM "19.3232673516"; frac "1.000000"; conf_lo "2.040012"; conf_hi "36.606523"; cov "1.097561"; seq "atctcatacccataagctcagaactcggggtggtaacataggaggactgccatgagtgtgactaacctgggctataggaggaggatctaccttaagcaaatgaCCAACAAAACTAACAAGCTC"; +chr18 Cufflinks transcript 39571718 39571880 1000 . . gene_id "CUFF.123569"; transcript_id "CUFF.123569.1"; FPKM "20.4139057543"; frac "1.000000"; conf_lo "4.982443"; conf_hi "35.845368"; cov "1.159509"; seq "TATAACATTCCATAAATGTACAATAATCTATTTTTGAGAAGCTCATTTTGAAACTTAACACTGTCATTGATAATCTTCAAGTGGTATTTCTTAGGCACCATAAATTTCACATCCAGCTGGGTTACAATTATTTTAAAGTACTTTGAGACCAATTTAAACCATT"; +chr19 Cufflinks exon 17633088 17633203 1000 . . gene_id "CUFF.131333"; transcript_id "CUFF.131333.1"; exon_number "1"; FPKM "20.4893265884"; frac "1.000000"; conf_lo "2.163116"; conf_hi "38.815537"; cov "1.163793"; seq "TGGGAAATGAACTGCATGGCAATGAACCCCAGGGAATTTGGTGGTTAATTGTCTAAGGATAAGGACATCAGTTTTGTCTTTTGCATCACTGTGACCTTTGCCTCTAATTGTATAGA"; +chr19 Cufflinks transcript 41997624 41997859 1000 . . 
gene_id "CUFF.133569"; transcript_id "CUFF.133569.1"; FPKM "28.1988698132"; frac "1.000000"; conf_lo "13.125940"; conf_hi "43.271800"; cov "1.601695"; seq "gctacacaacgactcacatagagggaagcaggcacacatcagataaaacacAAAAGGATGGGTTGGTGATGGGCATAGTTAATGAGGGCCACTAGGTAAATACACCTGATCCAAAAGTCACGCTACTACTTAGATTCTTCTCTCTGCTAAAGACAACAGAAgacatgttagccatgcttgtaatccctgcattggggagatggagtcagaaatatcactgcaagttcacccaatag"; +chr19 Cufflinks exon 56516515 56516684 1000 . . gene_id "CUFF.135203"; transcript_id "CUFF.135203.1"; exon_number "1"; FPKM "33.5542854247"; frac "1.000000"; conf_lo "14.181710"; conf_hi "52.926861"; cov "1.905882"; seq "TGTATTCATTCACTATTCACTGATTTGTCAGATCATCCATCCACACAGGTGCTGAAGAGTAACCCATTTCACTTTGTATACAAGATAATGTTTTTGTACTTCAAATACATCTGGAATTCTTTCAAATATTCCAAGATTTTTTTTTTTTCTGAATAATCTTTGGTTACCTC"; +chr2 Cufflinks transcript 4543774 4543977 1000 . . gene_id "CUFF.136435"; transcript_id "CUFF.136435.1"; FPKM "37.2825393608"; frac "1.000000"; conf_lo "18.641270"; conf_hi "55.923809"; cov "2.117647"; seq "gagccatttctccagccccTTTATGTGGAATATTAACAAGAGAAGACAACATAAAATGACTTACCATGCTGTGTGGCCTAACAGTGGATGAAGAATGAGTGATTTGGGCATTTCTGATAGTATTTATAAAGAAGACTTTTATGACCAAACCACATGTCACAGTAGGGATTTGCTGCACATCTTATGAGAGTTTCTTCTTTGTCA"; +chr2 Cufflinks transcript 30200331 30200938 1000 . . 
gene_id "CUFF.140289"; transcript_id "CUFF.140289.1"; FPKM "100.0741846001"; frac "1.000000"; conf_lo "82.383401"; conf_hi "117.764968"; cov "5.684211"; seq "CGCACACAAAGGATTTATTTGCCAGAGAGCAAGCAGACAGGCAGAGGTCAGAATGTTAGTTAGAAACTGAAGGAATGACTGCTGTAGCCACTGTGCCCAGCCAGAGCCATGAGGGAAGTGGGAGGCAGCACTTGGTGCTGCTGCTCTGGCTGACCCTTCTGGTTTCCTGCCACACTCCTAGCCCTGCCTGTGTGCTGCTGTCCCCCTCAACCTTCCACAGCCAGAAGGCAGATGTTCTTTCATGCCAAGAGCATCCATCCCCAGCATATCCTGGGCCCATGGTGGTGTCAAATGTAGTGACCCTTCTGCCTTAAGGGAGCTGGGAAGCCTGGGGTGTGCAGGGTTGCAGGTCAGAAGCAGGACTAGCAGAGGGGCCTGGGGCCATTCTGTCTTGTGGGCTCTTTAATAGCTGAATGACGGGCACAGCCAGAAAAGGGTTAGGTCCCTTATCCTAAGCAGCTCTGTGGCCAGCAGACGACTCTAAGTGGCAGAGCCTGGGAAGGGGCTGCTTAGCTGAGAAGTTCCAGGTAGGTGACAGGAACCTTGCCCTTCTTGTTGCCTCTCTCACCAATGAGCCAGTCGGGATCCATGCCTGGCAGGCTGTAGAC"; +chr2 Cufflinks transcript 106644220 106644341 1000 . . gene_id "CUFF.148977"; transcript_id "CUFF.148977.1"; FPKM "27.2743167045"; frac "1.000000"; conf_lo "6.656871"; conf_hi "47.891762"; cov "1.549180"; seq "attcttaaggtaaatacctaggagtgatgtaacccagtcatagggaagaactacttttaatttgttgagcaacccccaacctgattttgacacaggtttgagtagtttacacttctactaac"; +chr2 Cufflinks exon 125388931 125389219 1000 . . gene_id "CUFF.151331"; transcript_id "CUFF.151331.1"; exon_number "1"; FPKM "23.0274507817"; frac "1.000000"; conf_lo "10.718761"; conf_hi "35.336141"; cov "1.307958"; seq "AGAGCACACAGCACATCACTTAGGCCTCCAACATTAAGGCAGCGCAAGTGCCTCAAGTAACTGAGAATACTTTACTCAGATACAAGGGTATCAAAAACATGAGAACTGGCAGGAAGACCTCACAATGGTTTGTTAGCATCAAGTATTACCATCCAGTTTCCTGTTTAAATAGTAATTAATGACTATTCTGAAATAAGGCAAATAATTACTCAAGCGGGCTGTCAAAGCCACTATCCTGTTGGCTGGGCATCGGAGCAGTTAACTTTATCAAAGGCTTCTGACACAATGA"; +chr3 Cufflinks transcript 130936639 130936898 1000 . . 
gene_id "CUFF.171349"; transcript_id "CUFF.171349.1"; FPKM "20.1110620975"; frac "1.000000"; conf_lo "7.983635"; conf_hi "32.238489"; cov "1.142308"; seq "CGAGGCTGCAGGCTGCAAATGTTCCCAGGCAGGCAAGACCTCACGTCCTACTGGCTGCTGCCCTTGGGTGCATCTGTAGGCCCCGTGGCTCCTGCCCCTGGGGTTCAACACCGATAAACATAGAATACTCATTTTCAGAAGACCTGAGGGAATGAGTCTAAGCAACGCTTTTTACAAAAAGTGGCAAGGTTCAGGAAAAAAAAAAAAAAAGATGTTGCTCCAAGGCACCAAGGGTGTAATTTTTTTTCAGAAAAAGTCAG"; +chr3 Cufflinks exon 136592671 136592771 1000 . . gene_id "CUFF.171861"; transcript_id "CUFF.171861.1"; exon_number "1"; FPKM "32.9452142371"; frac "1.000000"; conf_lo "8.040973"; conf_hi "57.849455"; cov "1.871287"; seq "TGTCAGCCCATCACATTTTAGTGACAACAGTCATAGCCTTTATTTTCAGATGACTTTCCTCTAAAACCACTGTCTATGAGTTGCCCCCCAAAACTCAAAAA"; +chr3 Cufflinks transcript 152861374 152861508 1000 . . gene_id "CUFF.173007"; transcript_id "CUFF.173007.1"; FPKM "24.6479010219"; frac "1.000000"; conf_lo "6.015839"; conf_hi "43.279963"; cov "1.400000"; seq "ATCAAAAGCGACATGCAAGCATCTTGCTCTCACCACAGATCACTGAGACATTAAGAGTGACGTCTCTTGAACTGTTGGCACGCCTAAGTTATTTCAGCATTTCTTGCTCAGCAGTTGTTCTCTTGGCTTCCTCTG"; +chr4 Cufflinks exon 13715310 13715630 1000 . . gene_id "CUFF.174817"; transcript_id "CUFF.174817.1"; exon_number "1"; FPKM "19.2510308382"; frac "1.000000"; conf_lo "8.572480"; conf_hi "29.929581"; cov "1.093458"; seq "AACACATGGCCACATCATGTGATATTTTCAAAACACTTACACATAGCTTTGAGAAGGTCCCTGCAGGAATGATCCATCCTCTCACAGTTGGCCCATTTTTTAACAGCATATCTGCATTTTCCATTTAGGAGAGCTATATATTATTAGCTTACATTTTTGGGTAGTAAAACAGTGCATTGCTGATTGTAAAACATGGACTTTATTATCTGCTGAAAATTGATTTGGCATTTATAGCCACTGTGTATTAGACTGTTTTTCTGTTTTTAACATCAATGCTTAAAAGCGATGATTTGTGTTTaaaaaaattaaaaaaataaaata"; +chr4 Cufflinks exon 147515029 147515097 1000 . . gene_id "CUFF.190627"; transcript_id "CUFF.190627.1"; exon_number "1"; FPKM "34.4458244094"; frac "1.000000"; conf_lo "3.636542"; conf_hi "65.255106"; cov "1.956522"; seq "GCTGACGTGCTCTCCGAGTTCCTGGAGGTGGCCGTGCACCTGATTCTCTATGTGCGCGAGGTCTACCCG"; +chr5 Cufflinks exon 3949522 3949685 1000 . . 
gene_id "CUFF.192485"; transcript_id "CUFF.192485.1"; exon_number "1"; FPKM "23.1879208220"; frac "1.000000"; conf_lo "6.791585"; conf_hi "39.584257"; cov "1.317073"; seq "AGTCCCAACCACCCCCTTGTTTAATGTATAACTTTCTGAAATGGGAGCGTTAGAATGGATTAAAATGGTTGGTAGGTGGTTGGATCACCAACCAAGACCAGAAATAGAGGGGTAGGCTGCTCAGGAGAGTATTGGGAGGGTAGCTATTATTTGCATTTTGTGCT"; +chr5 Cufflinks transcript 68089694 68089831 1000 . . gene_id "CUFF.199409"; transcript_id "CUFF.199409.1"; FPKM "17.2229122047"; frac "1.000000"; conf_lo "1.818271"; conf_hi "32.627553"; cov "0.978261"; seq "CAATGATAGAGAAGACTAAAATAAAAGCAGGCATGCTGGCACAAGCGACAGAAGGAAAAAGCCTCACCCGGCCCTGTTTGAGGCCACTCCTGGTGGCTCCTTTTCCAAGGACCATGCGGTCAAGCCTCTGAGTTGTTC"; +chr5 Cufflinks exon 122819526 122819619 1000 . . gene_id "CUFF.205487"; transcript_id "CUFF.205487.1"; exon_number "1"; FPKM "25.2486782797"; frac "1.000000"; conf_lo "2.649470"; conf_hi "47.847887"; cov "1.434124"; seq "CTTTAGAAAAGATGCATCTGTCATTGATTTAGGGATATGAATTGTTTGGATTTGAGTAGTTTTCCATAACTCCTGCAGTTTGGCAATGTGTGCG"; +chr5 Cufflinks transcript 145619548 145619710 1000 . . gene_id "CUFF.209965"; transcript_id "CUFF.209965.1"; FPKM "40.8278115086"; frac "1.000000"; conf_lo "19.004428"; conf_hi "62.651195"; cov "2.319018"; seq "CGGCGTTCTGAAAACTGTGCTCCGGGATGAGATCATTGCTTGGCACAAAAAGACACAGGAGGACACTTCCTCTCCACTGTCGGCCGCAGGGCAGCCTGAGAACATGGACAGCCAGCAGCTGGTTTCCTTAGTTCAGAAAGCCGTCACTGCCATCATGACCCGC"; +chr6 Cufflinks exon 83928984 83929105 1000 . . gene_id "CUFF.219317"; transcript_id "CUFF.219317.1"; exon_number "1"; FPKM "46.7559714935"; frac "1.000000"; conf_lo "19.761399"; conf_hi "73.750544"; cov "2.655738"; seq "ACAGGAACCATTATTTACATTTAATTTGGATGAATTTGTTACTGTGGATGAAGTCATAGAAGAAGTAAATCCTTCTCAAGCCAAGCAGAATCCATTAAAAGGAAAAAGAAAGGAAGCCCTCA"; +chr6 Cufflinks exon 118857949 118858148 1000 . . 
gene_id "CUFF.223543"; transcript_id "CUFF.223543.1"; exon_number "1"; FPKM "19.0140950740"; frac "1.000000"; conf_lo "5.569100"; conf_hi "32.459091"; cov "1.080000"; seq "CCAGGCTTGCTAGTTGGTGCAGTTAGCTACATCTCAGGACAGAGACAAGGTACTCTGAGCTCCCCTTGAACTGCCACACAAGCTGTCTCCTGGATGCCAAGCAGAGAAACCTGGAGACAACAATCATCATACTCAAAACCAGGATCTCTTTCTTAAGACTTTTGTATTTTGTCCCAGCCCTAACCCTGAGTTCTGCTGAA"; +chr7 Cufflinks transcript 85554210 85554343 1000 . . gene_id "CUFF.235778"; transcript_id "CUFF.235778.1"; FPKM "17.7370289869"; frac "1.000000"; conf_lo "1.872548"; conf_hi "33.601510"; cov "1.007463"; seq "GTGAAACATCATGCTTCTGCATCAAGTTATTAGTGGGAAACCTGTAAAAGTTGACATTGAATGCTGATAACAAATTACTTTCATCCTGTCTCATAATGAATCCTACATCAAGACAAGGCAAGTGAGAAAGAGGG"; +chr7 Cufflinks exon 104055491 104055589 1000 . . gene_id "CUFF.238474"; transcript_id "CUFF.238474.1"; exon_number "1"; FPKM "28.8092349606"; frac "1.000000"; conf_lo "5.286593"; conf_hi "52.331877"; cov "1.636364"; seq "ACATTTCTCCTCTCTTGGGGGAGCGCATCTCCTTGGGTGTGTCCACATCCGCCCCTAGGTACCCAGTGTGATGTGAGACACGAGTGTCTGTGCTAACTT"; +chr8 Cufflinks exon 9970398 9970545 1000 . . gene_id "CUFF.245320"; transcript_id "CUFF.245320.1"; exon_number "1"; FPKM "22.4828826889"; frac "1.000000"; conf_lo "5.487421"; conf_hi "39.478345"; cov "1.277027"; seq "AGTCTTCACCAAAATTAAGTCTCAGCTAACTTAAAAGTTGCAAGGATTTTTTTCAATAAAATTAATATCTTAAGTGTTTGGTGTTTAGATGATTCTCTCTCAACTTCCCCCACATTATCAAAAAACATTTGATGAACCTTAAAAACTC"; +chr9 Cufflinks transcript 20449846 20449932 1000 . . gene_id "CUFF.260747"; transcript_id "CUFF.260747.1"; FPKM "234.9313045507"; frac "1.000000"; conf_lo "163.275950"; conf_hi "306.586659"; cov "13.344091"; seq "CCAGCACCGATGACACCATCGGCGACTTGAAGAAACTGATAGCTGCTCAAACTGGCACCCGCTGGAACAAGATCGTTCTTAAAAAGT"; +chr9 Cufflinks exon 107445870 107445930 1000 . . 
gene_id "CUFF.272761"; transcript_id "CUFF.272761.1"; exon_number "1"; FPKM "38.9633095779"; frac "1.000000"; conf_lo "4.113466"; conf_hi "73.813153"; cov "2.213115"; seq "CAAGCAGAAGCTGGTGCCCATCATGACCATCCTGCTGGAAGAGCTGAATGCCTCCGGCCGC"; +chr9 Cufflinks transcript 120860476 120860606 1000 . . gene_id "CUFF.275115"; transcript_id "CUFF.275115.1"; FPKM "25.4005086867"; frac "1.000000"; conf_lo "6.199529"; conf_hi "44.601488"; cov "1.442748"; seq "CTGCCATTGTACGCACCATGCAGAATACAAATGATGTAGAGACAGCTCGTTGTACTGCTGGGACTCTGCACAACCTTTCTCACCACCGCGAGGGCTTGCTGGCCATCTTTAAGTCTGGTGGCATCCCAGCG"; +chrX Cufflinks exon 10274057 10274087 1000 . . gene_id "CUFF.276147"; transcript_id "CUFF.276147.1"; exon_number "1"; FPKM "99.5432248142"; frac "1.000000"; conf_lo "21.405127"; conf_hi "177.681323"; cov "5.654052"; seq "ACTTCGCTGTCATCATTTGTACAAACTCTTT"; +chrX Cufflinks transcript 39881431 39881678 1000 . . gene_id "CUFF.277419"; transcript_id "CUFF.277419.1"; FPKM "42.1683560109"; frac "1.000000"; conf_lo "24.187709"; conf_hi "60.149003"; cov "2.395161"; seq "AGCTAAAAAGAGTCCTTTTCTGACAGAAAGGCTGGACTTCTCCTTTTCACCGTTTCTCTTACTGATGCTTTTGCCAGAAGAACAGTAAAGATTTAGACACTGTCATGATTCATACACGTAAAATATTTTTCAAGGACACAATCTGATATACTAACATTTATTTAAGAGGTTAAAGTCCACCACTAAATCTAAGGAAAGATTTTTAACTGCCAAACACATTTCCTTTGACAAATAATGTAAGATGACAA"; +chrX Cufflinks transcript 148249672 148249713 1000 . . gene_id "CUFF.282847"; transcript_id "CUFF.282847.1"; FPKM "56.5895686726"; frac "1.000000"; conf_lo "5.974320"; conf_hi "107.204818"; cov "3.214286"; seq "AATGCTAGTATGAACAGTGGGAGGAATGAGCAAAATGTTACA"; +chrX Cufflinks transcript 148481505 148482455 1000 + . 
gene_id "CUFF.282965"; transcript_id "CUFF.282965.1"; FPKM "40.1706233958"; frac "1.000000"; conf_lo "16.978103"; conf_hi "63.363144"; cov "2.281690"; seq "CGCCACAACCTGCTACAGGCCTGTAAGATGCAGGACATCAAACTGCCACTGTCAAAGGGCACCATGGATGATATTAGTCAGGAAGAAGTGAGTATTATGGTGGGTGGTAGGAGTCATCTATGAATATTTAACCAGTAATGGGAGATTACAGATGGCCAGGAAGGGCAGGCAACAGATAGGACCACATAGAGTTGTGAGGGGCATAAAGATGGATGCAGAAGAAATGTGGCAAGGTGGAAGTAGTGAAGTCAGGCTTTGGTATGAGAGAGACATTGATTTGAGAGGAGAGCTGCAAGCCAGTGAGTACTCAGAAAGACCAAGAATGGGTCATTAATCTTAAGGATTTGAGCTCTTAGCTGCAGCAGATACTGGGCATGGGTAGGAGTGAGAATTGAGGAGCAGAGGAAGATGGGAAACTGGAGAACCTAAGGAGACTGATAGCTTAGCTGCAGTAAGGGAGGTTGGCCAGAAGAGGGTTGGGTAGGGGACTCAGCAAGGCAGAACTAAGGAAGCTTAGGTGGAGGGGAAGGAACAACATCTGAGCAACTAAAGCACTCTATCAACTGGAAGTGCAAGATGGTAGTGAGGGGTGGACAGGTGTAACTGAGTAACTCTTTGTAGGTAGCCTTTCAGTTTAATTCAGTAAAATATTTTGAACACTAGTATTCCAGATACTGGTAGGCCATGACTTAACCATTCCTAATGTTAATCTCAGCTGTGCTAGCTGAGCTTGTGTTCACATTAGACATGAAGAAACTTAGTAAAAGGTAGAGCCCAG TTTTCGGTTTGGACCTTCCTGTTGGCCTCTGCTTCCGTGCCATCTAGCAAAGGAGTTCCTAATCTCTAGAGGGATACAAATGACTAGTCTGCTCCATCTGCCTCTTCCAACATTGCAGGGTAGCTCCCAGGGAGAAGAGTCAGTGAGTGGTTCCCAGAGAACATCCAGTATCT"; --- a/lib/galaxy/tools/util/gff_util.py +++ b/lib/galaxy/tools/util/gff_util.py @@ -6,23 +6,37 @@ from bx.intervals.io import NiceReaderWr class GFFReaderWrapper( NiceReaderWrapper ): """ - Reader wrapper converts GFF format--starting and ending coordinates are 1-based, closed--to the 'traditional' interval format--0 based, - half-open. This is useful when using GFF files as inputs to tools that expect traditional interval format. + Reader wrapper converts GFF format--starting and ending coordinates are 1-based, closed--to the + 'traditional'/BED interval format--0 based, half-open. This is useful when using GFF files as inputs + to tools that expect traditional interval format. 
""" def parse_row( self, line ): - interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, self.end_col, self.strand_col, self.default_strand, fix_strand=self.fix_strand ) - # Change from 1-based to 0-based format. - interval.start -= 1 - # Add 1 to end to move from closed to open format for end coordinate. - interval.end += 1 + interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, self.end_col, \ + self.strand_col, self.default_strand, fix_strand=self.fix_strand ) + interval = convert_gff_coords_to_bed( interval ) return interval -def convert_to_gff_coordinates( interval ): +def convert_bed_coords_to_gff( interval ): """ - Converts a GenomicInterval's coordinates to GFF format. + Converts an interval object's coordinates from BED format to GFF format. Accepted object types include + GenomicInterval and list (where the first element in the list is the interval's start, and the second + element is the interval's end). """ if type( interval ) is GenomicInterval: interval.start += 1 - interval.end -= 1 - return interval + elif type ( interval ) is list: + interval[ 0 ] += 1 return interval + +def convert_gff_coords_to_bed( interval ): + """ + Converts an interval object's coordinates from GFF format to BED format. Accepted object types include + GenomicInterval and list (where the first element in the list is the interval's start, and the second + element is the interval's end). + """ + if type( interval ) is GenomicInterval: + interval.start -= 1 + elif type ( interval ) is list: + interval[ 0 ] -= 1 + return interval +