details: http://www.bx.psu.edu/hg/galaxy/rev/1b30f5fa152b changeset: 3681:1b30f5fa152b user: jeremy goecks <jeremy.goecks@emory.edu> date: Thu Apr 22 16:05:08 2010 -0400 description: GTF to BEDGraph converter. diffstat: test-data/gtf2bedgraph_in.gtf | 100 ++++++++++++++++++++++++++++ test-data/gtf2bedgraph_out.bedgraph | 101 +++++++++++++++++++++++++++++ tool_conf.xml.sample | 1 + tools/filters/gtf2bedgraph.xml | 79 ++++++++++++++++++++++ tools/filters/gtf_to_bedgraph_converter.py | 73 ++++++++++++++++++++ 5 files changed, 354 insertions(+), 0 deletions(-) diffs (381 lines): diff -r 3445ca17a4c5 -r 1b30f5fa152b test-data/gtf2bedgraph_in.gtf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/gtf2bedgraph_in.gtf Thu Apr 22 16:05:08 2010 -0400 @@ -0,0 +1,100 @@ +chr1 Cufflinks exon 36425950 36426026 1000 - . gene_id "uc007aqa.1"; transcript_id "uc007aqa.1"; exon_number "21"; FPKM "4.8386844109"; frac "0.515875"; conf_lo "0.000000"; conf_hi "9.779040"; cov "0.274837"; +chr1 Cufflinks exon 46891972 46892996 1000 - . gene_id "uc007axc.1"; transcript_id "uc007axc.1"; exon_number "9"; FPKM "8.4688567539"; frac "1.000000"; conf_lo "6.667227"; conf_hi "10.270487"; cov "0.481031"; +chr1 Cufflinks exon 71654478 71654594 1000 - . gene_id "uc007bkb.1"; transcript_id "uc007bkb.1"; exon_number "4"; FPKM "0.4686878995"; frac "0.186704"; conf_lo "0.300747"; conf_hi "0.636629"; cov "0.026621"; +chr1 Cufflinks transcript 72629845 72679706 1000 + . gene_id "uc007bks.1"; transcript_id "uc007bks.1"; FPKM "4.0695297327"; frac "1.000000"; conf_lo "2.473329"; conf_hi "5.665731"; cov "0.231149"; +chr1 Cufflinks exon 75531753 75532000 1000 + . gene_id "uc007bpt.1"; transcript_id "uc007bpt.1"; exon_number "24"; FPKM "3.6392661141"; frac "1.000000"; conf_lo "2.391008"; conf_hi "4.887524"; cov "0.206710"; +chr1 Cufflinks exon 123389482 123389564 1000 + . 
gene_id "uc007cju.1"; transcript_id "uc007cju.1"; exon_number "20"; FPKM "0.9948773061"; frac "1.000000"; conf_lo "0.105032"; conf_hi "1.884723"; cov "0.056509"; +chr1 Cufflinks exon 129625990 129626119 1000 + . gene_id "uc007ckv.1"; transcript_id "uc007ckv.1"; exon_number "1"; FPKM "0.0003267777"; frac "0.004692"; conf_lo "0.000000"; conf_hi "0.000915"; cov "0.000019"; +chr1 Cufflinks exon 132059397 132059512 1000 + . gene_id "uc007clw.1"; transcript_id "uc007clw.1"; exon_number "7"; FPKM "0.2051423010"; frac "0.886787"; conf_lo "0.000000"; conf_hi "0.509199"; cov "0.011652"; +chr1 Cufflinks exon 175865141 175865308 1000 - . gene_id "uc007dsf.1"; transcript_id "uc007dsf.1"; exon_number "5"; FPKM "0.6544444010"; frac "1.000000"; conf_lo "0.068952"; conf_hi "1.239936"; cov "0.037172"; +chr10 Cufflinks transcript 7399380 7400956 1000 - . gene_id "uc007eie.1"; transcript_id "uc007eie.1"; FPKM "2.1099978681"; frac "1.000000"; conf_lo "0.514989"; conf_hi "3.705006"; cov "0.119848"; +chr10 Cufflinks exon 79784826 79784954 1000 - . gene_id "uc007gcr.1"; transcript_id "uc007gcr.1"; exon_number "2"; FPKM "1.2054582676"; frac "1.000000"; conf_lo "0.000000"; conf_hi "2.597402"; cov "0.068470"; +chr10 Cufflinks exon 79820729 79820836 1000 + . gene_id "uc007gcy.1"; transcript_id "uc007gcy.1"; exon_number "2"; FPKM "1.8177911161"; frac "1.000000"; conf_lo "0.532419"; conf_hi "3.103164"; cov "0.103250"; +chr10 Cufflinks transcript 105907395 106369573 1000 + . gene_id "uc007gyr.1"; transcript_id "uc007gyr.1"; FPKM "4.2493607936"; frac "0.247216"; conf_lo "3.727223"; conf_hi "4.771499"; cov "0.241364"; +chr10 Cufflinks exon 119487061 119487172 1000 + . gene_id "uc007hep.1"; transcript_id "uc007hep.1"; exon_number "10"; FPKM "4.3105966126"; frac "0.341843"; conf_lo "3.127417"; conf_hi "5.493776"; cov "0.244842"; +chr11 Cufflinks exon 29097093 29097209 1000 + . 
gene_id "uc007igs.1"; transcript_id "uc007igs.1"; exon_number "7"; FPKM "4.2530782301"; frac "1.000000"; conf_lo "2.700074"; conf_hi "5.806083"; cov "0.241575"; +chr11 Cufflinks exon 69404158 69404264 1000 + . gene_id "uc007jqm.1"; transcript_id "uc007jqm.1"; exon_number "10"; FPKM "18.7450971965"; frac "0.685277"; conf_lo "11.773851"; conf_hi "25.716343"; cov "1.064721"; +chr11 Cufflinks transcript 98249986 98261804 1000 - . gene_id "uc007lgh.1"; transcript_id "uc007lgh.1"; FPKM "2.1571271227"; frac "1.000000"; conf_lo "0.856331"; conf_hi "3.457924"; cov "0.122525"; +chr11 Cufflinks exon 102210141 102211681 1000 - . gene_id "uc007lrp.1"; transcript_id "uc007lrp.1"; exon_number "1"; FPKM "0.8688186006"; frac "1.000000"; conf_lo "0.254471"; conf_hi "1.483166"; cov "0.049349"; +chr11 Cufflinks transcript 105926400 105927243 1000 - . gene_id "uc007lya.1"; transcript_id "uc007lya.1"; FPKM "3.6706747247"; frac "1.000000"; conf_lo "0.000000"; conf_hi "8.861793"; cov "0.208494"; +chr11 Cufflinks exon 106633966 106634066 1000 - . gene_id "uc007lzm.1"; transcript_id "uc007lzm.1"; exon_number "2"; FPKM "2.4729108195"; frac "0.689555"; conf_lo "0.805433"; conf_hi "4.140389"; cov "0.140461"; +chr11 Cufflinks exon 120472427 120472492 1000 - . gene_id "uc007mtq.1"; transcript_id "uc007mtq.1"; exon_number "3"; FPKM "10.2380258574"; frac "0.356865"; conf_lo "4.499395"; conf_hi "15.976656"; cov "0.581520"; +chr12 Cufflinks exon 100112717 100112852 1000 - . gene_id "uc007orn.1"; transcript_id "uc007orn.1"; exon_number "39"; FPKM "1.8669402513"; frac "0.154118"; conf_lo "1.707295"; conf_hi "2.026586"; cov "0.106042"; +chr13 Cufflinks exon 8889564 8891614 1000 + . gene_id "uc007pkn.1"; transcript_id "uc007pkn.1"; exon_number "5"; FPKM "9.4402522582"; frac "1.000000"; conf_lo "6.745038"; conf_hi "12.135466"; cov "0.536206"; +chr13 Cufflinks exon 13756207 13756380 1000 + . 
gene_id "uc007pmj.1"; transcript_id "uc007pmj.1"; exon_number "18"; FPKM "0.0574771218"; frac "0.793101"; conf_lo "0.000000"; conf_hi "0.140705"; cov "0.003265"; +chr13 Cufflinks exon 93243918 93244083 1000 - . gene_id "uc007rkp.1"; transcript_id "uc007rkp.1"; exon_number "4"; FPKM "6.9802111138"; frac "1.000000"; conf_lo "3.858566"; conf_hi "10.101856"; cov "0.396476"; +chr14 Cufflinks exon 13130096 13130170 1000 + . gene_id "uc007sfq.1"; transcript_id "uc007sfq.1"; exon_number "4"; FPKM "4.0381928600"; frac "1.000000"; conf_lo "2.254366"; conf_hi "5.822020"; cov "0.229369"; +chr14 Cufflinks exon 32036106 32036250 1000 + . gene_id "uc007sxe.1"; transcript_id "uc007sxe.1"; exon_number "10"; FPKM "0.1289615781"; frac "1.000000"; conf_lo "0.000000"; conf_hi "0.386885"; cov "0.007325"; +chr14 Cufflinks exon 56517080 56517223 1000 - . gene_id "uc007ubd.1"; transcript_id "uc007ubd.1"; exon_number "2"; FPKM "15.7683764379"; frac "0.548796"; conf_lo "8.949920"; conf_hi "22.586833"; cov "0.895643"; +chr14 Cufflinks exon 62950942 62951013 1000 + . gene_id "uc007ugl.1"; transcript_id "uc007ugl.1"; exon_number "1"; FPKM "10.1138803585"; frac "1.000000"; conf_lo "6.480867"; conf_hi "13.746893"; cov "0.574468"; +chr14 Cufflinks exon 66479007 66479052 1000 + . gene_id "uc007ujq.1"; transcript_id "uc007ujq.1"; exon_number "8"; FPKM "14.3011267395"; frac "1.000000"; conf_lo "10.806805"; conf_hi "17.795448"; cov "0.812304"; +chr14 Cufflinks exon 70961619 70961783 1000 + . gene_id "uc007uoj.1"; transcript_id "uc007uoj.1"; exon_number "7"; FPKM "2.0814553995"; frac "1.000000"; conf_lo "1.231705"; conf_hi "2.931206"; cov "0.118227"; +chr14 Cufflinks exon 96679222 96679434 1000 - . gene_id "uc007uuq.1"; transcript_id "uc007uuq.1"; exon_number "7"; FPKM "1.7614342028"; frac "1.000000"; conf_lo "0.851833"; conf_hi "2.671035"; cov "0.100049"; +chr14 Cufflinks exon 99504388 99504488 1000 + . 
gene_id "uc007uvc.1"; transcript_id "uc007uvc.1"; exon_number "3"; FPKM "3.1573312214"; frac "0.277705"; conf_lo "2.620155"; conf_hi "3.694508"; cov "0.179336"; +chr15 Cufflinks exon 12777808 12777962 1000 + . gene_id "uc007vic.1"; transcript_id "uc007vic.1"; exon_number "6"; FPKM "12.7118803258"; frac "0.653301"; conf_lo "7.807708"; conf_hi "17.616053"; cov "0.722034"; +chr15 Cufflinks exon 28200049 28200282 1000 + . gene_id "uc007vjy.1"; transcript_id "uc007vjy.1"; exon_number "19"; FPKM "0.0608801712"; frac "1.000000"; conf_lo "0.000000"; conf_hi "0.146978"; cov "0.003458"; +chr15 Cufflinks exon 34434714 34434889 1000 + . gene_id "uc007vlv.1"; transcript_id "uc007vlv.1"; exon_number "4"; FPKM "2.1698982510"; frac "1.000000"; conf_lo "1.049368"; conf_hi "3.290429"; cov "0.123250"; +chr15 Cufflinks transcript 51709056 51716160 1000 + . gene_id "uc007vrc.1"; transcript_id "uc007vrc.1"; FPKM "5.0213279245"; frac "1.000000"; conf_lo "3.187798"; conf_hi "6.854858"; cov "0.285211"; +chr15 Cufflinks exon 54880182 54880296 1000 - . gene_id "uc007vrt.1"; transcript_id "uc007vrt.1"; exon_number "14"; FPKM "9.7267082384"; frac "1.000000"; conf_lo "7.809774"; conf_hi "11.643643"; cov "0.552477"; +chr15 Cufflinks exon 59176893 59177072 1000 - . gene_id "uc007vxs.1"; transcript_id "uc007vxs.1"; exon_number "11"; FPKM "4.5392702144"; frac "1.000000"; conf_lo "2.723562"; conf_hi "6.354978"; cov "0.257830"; +chr15 Cufflinks exon 76426650 76426779 1000 - . gene_id "uc007wla.1"; transcript_id "uc007wla.1"; exon_number "3"; FPKM "3.5730073595"; frac "0.230550"; conf_lo "2.576136"; conf_hi "4.569879"; cov "0.202947"; +chr15 Cufflinks exon 76533504 76533613 1000 + . gene_id "uc007wlt.1"; transcript_id "uc007wlt.1"; exon_number "4"; FPKM "3.3395072810"; frac "0.491112"; conf_lo "2.499197"; conf_hi "4.179818"; cov "0.189684"; +chr15 Cufflinks exon 88963183 88963261 1000 - . 
gene_id "uc007xfl.1"; transcript_id "uc007xfl.1"; exon_number "3"; FPKM "1.5871531781"; frac "1.000000"; conf_lo "0.291248"; conf_hi "2.883058"; cov "0.090150"; +chr15 Cufflinks exon 102455470 102455519 1000 - . gene_id "uc007xwk.1"; transcript_id "uc007xwk.1"; exon_number "13"; FPKM "0.2873090379"; frac "0.161741"; conf_lo "0.099159"; conf_hi "0.475459"; cov "0.016319"; +chr16 Cufflinks transcript 3979123 3982204 1000 - . gene_id "uc007xzf.1"; transcript_id "uc007xzf.1"; FPKM "4.1992546925"; frac "0.467884"; conf_lo "2.835257"; conf_hi "5.563252"; cov "0.238518"; +chr15 Cufflinks exon 102313591 102313719 1000 + . gene_id "uc007xvy.2"; transcript_id "uc007xvy.2"; exon_number "7"; FPKM "37.5792165910"; frac "0.297738"; conf_lo "34.688492"; conf_hi "40.469941"; cov "2.134498"; +chr16 Cufflinks exon 4608598 4608818 1000 + . gene_id "uc007xzw.1"; transcript_id "uc007xzw.1"; exon_number "2"; FPKM "5.7793602049"; frac "1.000000"; conf_lo "4.036818"; conf_hi "7.521903"; cov "0.328267"; +chr16 Cufflinks exon 20541820 20541939 1000 + . gene_id "uc007ypy.1"; transcript_id "uc007ypy.1"; exon_number "7"; FPKM "68.0268643583"; frac "1.000000"; conf_lo "60.085498"; conf_hi "75.968231"; cov "3.863924"; +chr17 Cufflinks transcript 24857054 24858867 1000 + . gene_id "uc008axy.1"; transcript_id "uc008axy.1"; FPKM "22.0141466642"; frac "1.000000"; conf_lo "15.369306"; conf_hi "28.658988"; cov "1.250403"; +chr17 Cufflinks exon 25379604 25380686 1000 - . gene_id "uc008bah.1"; transcript_id "uc008bah.1"; exon_number "1"; FPKM "1.7458387165"; frac "0.226783"; conf_lo "1.488719"; conf_hi "2.002959"; cov "0.099164"; +chr17 Cufflinks exon 27159196 27159462 1000 + . gene_id "uc008bfe.1"; transcript_id "uc008bfe.1"; exon_number "2"; FPKM "1.7334774900"; frac "0.118977"; conf_lo "1.272113"; conf_hi "2.194842"; cov "0.098461"; +chr18 Cufflinks exon 34787707 34787836 1000 + . 
gene_id "uc008ela.1"; transcript_id "uc008ela.1"; exon_number "7"; FPKM "5.0638001964"; frac "0.237331"; conf_lo "4.342098"; conf_hi "5.785503"; cov "0.287624"; +chr18 Cufflinks exon 61371052 61371250 1000 - . gene_id "uc008fbu.1"; transcript_id "uc008fbu.1"; exon_number "4"; FPKM "0.1230526474"; frac "1.000000"; conf_lo "0.000000"; conf_hi "0.369158"; cov "0.006989"; +chr18 Cufflinks exon 61167370 61167501 1000 - . gene_id "uc008fbi.1"; transcript_id "uc008fbi.1"; exon_number "12"; FPKM "2.4172869897"; frac "1.000000"; conf_lo "1.244731"; conf_hi "3.589843"; cov "0.137302"; +chr18 Cufflinks exon 86630592 86630719 1000 + . gene_id "uc008fuz.1"; transcript_id "uc008fuz.1"; exon_number "6"; FPKM "2.2892787327"; frac "1.000000"; conf_lo "1.065608"; conf_hi "3.512950"; cov "0.130031"; +chr19 Cufflinks exon 5603634 5603715 1000 - . gene_id "uc008gea.1"; transcript_id "uc008gea.1"; exon_number "2"; FPKM "2.1837193523"; frac "0.163446"; conf_lo "1.715120"; conf_hi "2.652319"; cov "0.124035"; +chr2 Cufflinks exon 28404475 28404676 1000 + . gene_id "uc008iyn.1"; transcript_id "uc008iyn.1"; exon_number "15"; FPKM "10.9087431164"; frac "0.368384"; conf_lo "4.356515"; conf_hi "17.460972"; cov "0.619616"; +chr2 Cufflinks exon 29770254 29770439 1000 + . gene_id "uc008jal.1"; transcript_id "uc008jal.1"; exon_number "12"; FPKM "7.2973656902"; frac "0.685974"; conf_lo "5.778526"; conf_hi "8.816206"; cov "0.414490"; +chr2 Cufflinks exon 30002172 30002382 1000 + . gene_id "uc008jbj.1"; transcript_id "uc008jbj.1"; exon_number "8"; FPKM "12.8769808138"; frac "1.000000"; conf_lo "10.220662"; conf_hi "15.533299"; cov "0.731412"; +chr2 Cufflinks exon 32076600 32076704 1000 + . gene_id "uc008jeo.1"; transcript_id "uc008jeo.1"; exon_number "21"; FPKM "43.8860660433"; frac "0.911093"; conf_lo "40.407190"; conf_hi "47.364942"; cov "2.492727"; +chr2 Cufflinks exon 32546710 32546774 1000 - . 
gene_id "uc008jgm.1"; transcript_id "uc008jgm.1"; exon_number "12"; FPKM "8.1366623064"; frac "1.000000"; conf_lo "5.496780"; conf_hi "10.776544"; cov "0.462162"; +chr2 Cufflinks exon 35574280 35574458 1000 + . gene_id "uc008jkv.1"; transcript_id "uc008jkv.1"; exon_number "6"; FPKM "2.0012109810"; frac "0.141121"; conf_lo "1.688896"; conf_hi "2.313526"; cov "0.113669"; +chr2 Cufflinks exon 117127697 117127757 1000 - . gene_id "uc008lrl.1"; transcript_id "uc008lrl.1"; exon_number "14"; FPKM "1.6760710643"; frac "0.685093"; conf_lo "1.109659"; conf_hi "2.242483"; cov "0.095201"; +chr2 Cufflinks exon 122435405 122435623 1000 - . gene_id "uc008maw.1"; transcript_id "uc008maw.1"; exon_number "8"; FPKM "10.5679023498"; frac "1.000000"; conf_lo "7.636894"; conf_hi "13.498911"; cov "0.600257"; +chr2 Cufflinks exon 130265172 130265261 1000 + . gene_id "uc008mja.1"; transcript_id "uc008mja.1"; exon_number "9"; FPKM "3.6318426438"; frac "0.287992"; conf_lo "2.815837"; conf_hi "4.447848"; cov "0.206289"; +chr2 Cufflinks exon 152702303 152702428 1000 + . gene_id "uc008ngq.1"; transcript_id "uc008ngq.1"; exon_number "7"; FPKM "2.5312142816"; frac "0.526901"; conf_lo "1.108909"; conf_hi "3.953519"; cov "0.143773"; +chr2 Cufflinks exon 158262739 158262887 1000 + . gene_id "uc008nqh.1"; transcript_id "uc008nqh.1"; exon_number "8"; FPKM "5.0001206267"; frac "1.000000"; conf_lo "3.934091"; conf_hi "6.066150"; cov "0.284007"; +chr2 Cufflinks exon 178152211 178152296 1000 + . gene_id "uc008ohq.1"; transcript_id "uc008ohq.1"; exon_number "3"; FPKM "1.6796903776"; frac "1.000000"; conf_lo "0.491970"; conf_hi "2.867411"; cov "0.095406"; +chr3 Cufflinks exon 97500913 97501137 1000 - . gene_id "uc008qpe.1"; transcript_id "uc008qpe.1"; exon_number "7"; FPKM "4.1738869883"; frac "0.398377"; conf_lo "3.671923"; conf_hi "4.675851"; cov "0.237077"; +chr3 Cufflinks exon 101987874 101987902 1000 - . 
gene_id "uc008qrt.1"; transcript_id "uc008qrt.1"; exon_number "4"; FPKM "0.6428024028"; frac "1.000000"; conf_lo "0.000000"; conf_hi "1.551862"; cov "0.036511"; +chr3 Cufflinks exon 127258214 127258303 1000 + . gene_id "uc008rhf.1"; transcript_id "uc008rhf.1"; exon_number "2"; FPKM "0.4060755145"; frac "0.353557"; conf_lo "0.120085"; conf_hi "0.692066"; cov "0.023065"; +chr3 Cufflinks exon 144790795 144790854 1000 + . gene_id "uc008rqi.1"; transcript_id "uc008rqi.1"; exon_number "5"; FPKM "1.1258808773"; frac "0.289434"; conf_lo "0.699104"; conf_hi "1.552658"; cov "0.063950"; +chr4 Cufflinks exon 17978869 17981846 1000 + . gene_id "uc008sbv.1"; transcript_id "uc008sbv.1"; exon_number "5"; FPKM "0.6623587694"; frac "0.585087"; conf_lo "0.270053"; conf_hi "1.054665"; cov "0.037622"; +chr4 Cufflinks exon 21711840 21711940 1000 + . gene_id "uc008scz.1"; transcript_id "uc008scz.1"; exon_number "5"; FPKM "0.9584930367"; frac "0.150841"; conf_lo "0.742054"; conf_hi "1.174932"; cov "0.054442"; +chr4 Cufflinks exon 108353507 108353731 1000 - . gene_id "uc008ubn.1"; transcript_id "uc008ubn.1"; exon_number "9"; FPKM "2.6286767383"; frac "1.000000"; conf_lo "1.111010"; conf_hi "4.146344"; cov "0.149309"; +chr4 Cufflinks exon 131325668 131325803 1000 - . gene_id "uc008vab.1"; transcript_id "uc008vab.1"; exon_number "2"; FPKM "4.0813960015"; frac "1.000000"; conf_lo "2.890730"; conf_hi "5.272062"; cov "0.231823"; +chr4 Cufflinks exon 153530641 153530927 1000 + . gene_id "uc008wbi.1"; transcript_id "uc008wbi.1"; exon_number "12"; FPKM "21.2412511761"; frac "1.000000"; conf_lo "16.217040"; conf_hi "26.265462"; cov "1.206502"; +chr5 Cufflinks exon 3631589 3631765 1000 + . gene_id "uc008whf.1"; transcript_id "uc008whf.1"; exon_number "19"; FPKM "4.6386616700"; frac "0.517324"; conf_lo "3.723563"; conf_hi "5.553760"; cov "0.263476"; +chr5 Cufflinks exon 3992046 3992138 1000 + . 
gene_id "uc008wid.1"; transcript_id "uc008wid.1"; exon_number "15"; FPKM "23.3742995121"; frac "0.874843"; conf_lo "21.278988"; conf_hi "25.469611"; cov "1.327659"; +chr5 Cufflinks exon 34223636 34223836 1000 + . gene_id "uc008xbk.1"; transcript_id "uc008xbk.1"; exon_number "12"; FPKM "4.1101744570"; frac "0.642098"; conf_lo "2.270677"; conf_hi "5.949672"; cov "0.233458"; +chr5 Cufflinks exon 115734400 115734621 1000 + . gene_id "uc008zdo.1"; transcript_id "uc008zdo.1"; exon_number "3"; FPKM "15.3221708908"; frac "1.000000"; conf_lo "11.506469"; conf_hi "19.137873"; cov "0.870299"; +chr5 Cufflinks exon 137807769 137808016 1000 + . gene_id "uc009aci.1"; transcript_id "uc009aci.1"; exon_number "12"; FPKM "0.7189248975"; frac "0.533440"; conf_lo "0.000000"; conf_hi "1.543846"; cov "0.040835"; +chr6 Cufflinks transcript 17015149 17055825 1000 + . gene_id "uc009azi.1"; transcript_id "uc009azi.1"; FPKM "12.3429992456"; frac "1.000000"; conf_lo "9.242902"; conf_hi "15.443097"; cov "0.701082"; +chr6 Cufflinks exon 15361026 15361102 1000 + . gene_id "uc009ayz.1"; transcript_id "uc009ayz.1"; exon_number "14"; FPKM "4.1692596952"; frac "0.281345"; conf_lo "2.894471"; conf_hi "5.444049"; cov "0.236814"; +chr6 Cufflinks exon 115576309 115576426 1000 - . gene_id "uc009dix.1"; transcript_id "uc009dix.1"; exon_number "8"; FPKM "34.7320589881"; frac "0.628311"; conf_lo "31.195284"; conf_hi "38.268834"; cov "1.972780"; +chr6 Cufflinks exon 117820274 117822784 1000 + . gene_id "uc009dld.1"; transcript_id "uc009dld.1"; exon_number "3"; FPKM "8.2141924772"; frac "1.000000"; conf_lo "5.778655"; conf_hi "10.649730"; cov "0.466566"; +chr6 Cufflinks exon 121331667 121331759 1000 - . gene_id "uc009don.1"; transcript_id "uc009don.1"; exon_number "4"; FPKM "0.9373248338"; frac "0.255959"; conf_lo "0.597786"; conf_hi "1.276864"; cov "0.053240"; +chr6 Cufflinks exon 134837648 134837803 1000 - . 
gene_id "uc009ekw.1"; transcript_id "uc009ekw.1"; exon_number "2"; FPKM "3.4342255434"; frac "0.337007"; conf_lo "2.099159"; conf_hi "4.769292"; cov "0.195064"; +chr7 Cufflinks exon 19628774 19628924 1000 + . gene_id "uc009fkg.1"; transcript_id "uc009fkg.1"; exon_number "14"; FPKM "3.4380795645"; frac "0.240903"; conf_lo "2.901335"; conf_hi "3.974824"; cov "0.195283"; +chr7 Cufflinks transcript 51739887 51740783 1000 + . gene_id "uc009gpo.1"; transcript_id "uc009gpo.1"; FPKM "3.5875651083"; frac "1.000000"; conf_lo "0.658330"; conf_hi "6.516800"; cov "0.203774"; +chr7 Cufflinks exon 53085965 53086159 1000 - . gene_id "uc009gxj.1"; transcript_id "uc009gxj.1"; exon_number "6"; FPKM "6.4200658663"; frac "0.543693"; conf_lo "4.666748"; conf_hi "8.173383"; cov "0.364660"; +chr7 Cufflinks exon 77546982 77547077 1000 + . gene_id "uc009hnk.1"; transcript_id "uc009hnk.1"; exon_number "7"; FPKM "0.4622078998"; frac "0.405823"; conf_lo "0.077413"; conf_hi "0.847003"; cov "0.026253"; +chr7 Cufflinks exon 82788205 82788350 1000 + . gene_id "uc009hwu.1"; transcript_id "uc009hwu.1"; exon_number "1"; FPKM "0.6859341657"; frac "0.576962"; conf_lo "0.055268"; conf_hi "1.316600"; cov "0.038961"; +chr7 Cufflinks exon 85984891 85985078 1000 - . gene_id "uc009hxo.1"; transcript_id "uc009hxo.1"; exon_number "3"; FPKM "3.0017741434"; frac "1.000000"; conf_lo "1.397258"; conf_hi "4.606290"; cov "0.170501"; +chr7 Cufflinks exon 148509981 148510078 1000 - . gene_id "uc009kkn.1"; transcript_id "uc009kkn.1"; exon_number "11"; FPKM "32.9197864125"; frac "1.000000"; conf_lo "27.375094"; conf_hi "38.464479"; cov "1.869843"; +chr9 Cufflinks exon 15330072 15330148 1000 + . gene_id "uc009ogc.1"; transcript_id "uc009ogc.1"; exon_number "2"; FPKM "1.0060367764"; frac "1.000000"; conf_lo "0.000000"; conf_hi "2.428788"; cov "0.057143"; +chr9 Cufflinks transcript 21069743 21078812 1000 + . 
gene_id "uc009okt.1"; transcript_id "uc009okt.1"; FPKM "7.9134805855"; frac "0.623402"; conf_lo "5.930640"; conf_hi "9.896321"; cov "0.449485"; +chr9 Cufflinks exon 57867100 57867303 1000 + . gene_id "uc009pwa.1"; transcript_id "uc009pwa.1"; exon_number "4"; FPKM "0.5359102332"; frac "1.000000"; conf_lo "0.000000"; conf_hi "1.293802"; cov "0.030440"; +chr9 Cufflinks exon 49314958 49315758 1000 - . gene_id "uc009pje.1"; transcript_id "uc009pje.1"; exon_number "2"; FPKM "156.0206032233"; frac "0.793945"; conf_lo "147.369898"; conf_hi "164.671308"; cov "8.861965"; +chr9 Cufflinks exon 106815438 106815604 1000 - . gene_id "uc009rkv.1"; transcript_id "uc009rkv.1"; exon_number "12"; FPKM "5.4023275754"; frac "1.000000"; conf_lo "4.337713"; conf_hi "6.466942"; cov "0.306852"; +chr9 Cufflinks exon 119703054 119703292 1000 - . gene_id "uc009sbk.1"; transcript_id "uc009sbk.1"; exon_number "15"; FPKM "0.0814657030"; frac "1.000000"; conf_lo "0.000000"; conf_hi "0.244397"; cov "0.004627"; \ No newline at end of file diff -r 3445ca17a4c5 -r 1b30f5fa152b test-data/gtf2bedgraph_out.bedgraph --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/gtf2bedgraph_out.bedgraph Thu Apr 22 16:05:08 2010 -0400 @@ -0,0 +1,101 @@ +chr1 36425949 36426026 4.8386844109 +chr1 46891971 46892996 8.4688567539 +chr1 71654477 71654594 0.4686878995 +chr1 72629844 72679706 4.0695297327 +chr1 75531752 75532000 3.6392661141 +chr1 123389481 123389564 0.9948773061 +chr1 129625989 129626119 0.0003267777 +chr1 132059396 132059512 0.2051423010 +chr1 175865140 175865308 0.6544444010 +chr10 7399379 7400956 2.1099978681 +chr10 79784825 79784954 1.2054582676 +chr10 79820728 79820836 1.8177911161 +chr10 105907394 106369573 4.2493607936 +chr10 119487060 119487172 4.3105966126 +chr11 29097092 29097209 4.2530782301 +chr11 69404157 69404264 18.7450971965 +chr11 98249985 98261804 2.1571271227 +chr11 102210140 102211681 0.8688186006 +chr11 105926399 105927243 3.6706747247 +chr11 106633965 106634066 2.4729108195 
+chr11 120472426 120472492 10.2380258574 +chr12 100112716 100112852 1.8669402513 +chr13 8889563 8891614 9.4402522582 +chr13 13756206 13756380 0.0574771218 +chr13 93243917 93244083 6.9802111138 +chr14 13130095 13130170 4.0381928600 +chr14 32036105 32036250 0.1289615781 +chr14 56517079 56517223 15.7683764379 +chr14 62950941 62951013 10.1138803585 +chr14 66479006 66479052 14.3011267395 +chr14 70961618 70961783 2.0814553995 +chr14 96679221 96679434 1.7614342028 +chr14 99504387 99504488 3.1573312214 +chr15 12777807 12777962 12.7118803258 +chr15 28200048 28200282 0.0608801712 +chr15 34434713 34434889 2.1698982510 +chr15 51709055 51716160 5.0213279245 +chr15 54880181 54880296 9.7267082384 +chr15 59176892 59177072 4.5392702144 +chr15 76426649 76426779 3.5730073595 +chr15 76533503 76533613 3.3395072810 +chr15 88963182 88963261 1.5871531781 +chr15 102313590 102313719 37.5792165910 +chr15 102455469 102455519 0.2873090379 +chr16 3979122 3982204 4.1992546925 +chr16 4608597 4608818 5.7793602049 +chr16 20541819 20541939 68.0268643583 +chr17 24857053 24858867 22.0141466642 +chr17 25379603 25380686 1.7458387165 +chr17 27159195 27159462 1.7334774900 +chr18 34787706 34787836 5.0638001964 +chr18 61167369 61167501 2.4172869897 +chr18 61371051 61371250 0.1230526474 +chr18 86630591 86630719 2.2892787327 +chr19 5603633 5603715 2.1837193523 +chr2 28404474 28404676 10.9087431164 +chr2 29770253 29770439 7.2973656902 +chr2 30002171 30002382 12.8769808138 +chr2 32076599 32076704 43.8860660433 +chr2 32546709 32546774 8.1366623064 +chr2 35574279 35574458 2.0012109810 +chr2 117127696 117127757 1.6760710643 +chr2 122435404 122435623 10.5679023498 +chr2 130265171 130265261 3.6318426438 +chr2 152702302 152702428 2.5312142816 +chr2 158262738 158262887 5.0001206267 +chr2 178152210 178152296 1.6796903776 +chr3 97500912 97501137 4.1738869883 +chr3 101987873 101987902 0.6428024028 +chr3 127258213 127258303 0.4060755145 +chr3 144790794 144790854 1.1258808773 +chr4 17978868 17981846 0.6623587694 +chr4 
21711839 21711940 0.9584930367 +chr4 108353506 108353731 2.6286767383 +chr4 131325667 131325803 4.0813960015 +chr4 153530640 153530927 21.2412511761 +chr5 3631588 3631765 4.6386616700 +chr5 3992045 3992138 23.3742995121 +chr5 34223635 34223836 4.1101744570 +chr5 115734399 115734621 15.3221708908 +chr5 137807768 137808016 0.7189248975 +chr6 15361025 15361102 4.1692596952 +chr6 17015148 17055825 12.3429992456 +chr6 115576308 115576426 34.7320589881 +chr6 117820273 117822784 8.2141924772 +chr6 121331666 121331759 0.9373248338 +chr6 134837647 134837803 3.4342255434 +chr7 19628773 19628924 3.4380795645 +chr7 51739886 51740783 3.5875651083 +chr7 53085964 53086159 6.4200658663 +chr7 77546981 77547077 0.4622078998 +chr7 82788204 82788350 0.6859341657 +chr7 85984890 85985078 3.0017741434 +chr7 148509980 148510078 32.9197864125 +chr9 15330071 15330148 1.0060367764 +chr9 21069742 21078812 7.9134805855 +chr9 49314957 49315758 156.0206032233 +chr9 57867099 57867303 0.5359102332 +chr9 106815437 106815604 5.4023275754 +chr9 119703053 119703292 0.0814657030 +track type=bedGraph diff -r 3445ca17a4c5 -r 1b30f5fa152b tool_conf.xml.sample --- a/tool_conf.xml.sample Thu Apr 22 09:35:44 2010 -0400 +++ b/tool_conf.xml.sample Thu Apr 22 16:05:08 2010 -0400 @@ -79,6 +79,7 @@ <tool file="fastx_toolkit/fastq_to_fasta.xml" /> <tool file="filters/wiggle_to_simple.xml" /> <tool file="filters/sff_extractor.xml" /> + <tool file="filters/gtf2bedgraph.xml" /> </section> <section name="Extract Features" id="features"> <tool file="filters/ucsc_gene_bed_to_exon_bed.xml" /> diff -r 3445ca17a4c5 -r 1b30f5fa152b tools/filters/gtf2bedgraph.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/filters/gtf2bedgraph.xml Thu Apr 22 16:05:08 2010 -0400 @@ -0,0 +1,79 @@ +<tool id="gtf2bedgraph" name="GTF-to-BEDGraph"> + <description>converter</description> + <command interpreter="python">gtf_to_bedgraph_converter.py $input $out_file1 $attribute_name</command> + <inputs> + <param format="gtf" name="input" 
type="data" label="Convert this query"/> + <param name="attribute_name" type="text" label="Attribute to Use for Value"/> + </inputs> + <outputs> + <data format="bedgraph" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input" value="gtf2bedgraph_in.gtf" ftype="gtf"/> + <param name="attribute_name" value="FPKM"/> + <output name="out_file1" file="gtf2bedgraph_out.bedgraph" ftype="bedgraph"/> + </test> + </tests> + <help> + +**What it does** + +This tool converts data from GTF format to BEDGraph format (scroll down for format description). + +-------- + +**Example** + +The following data in GFF format:: + + chr22 GeneA enhancer 10000000 10001000 500 + . gene_id "GeneA"; transcript_id "TranscriptAlpha"; FPKM "2.75"; frac "1.000000"; + chr22 GeneA promoter 10010000 10010100 900 + . gene_id "GeneA"; transcript_id "TranscriptsAlpha"; FPKM "2.25"; frac "1.000000"; + +using the attribute name 'FPKM' will be converted to BEDGraph (**note** that 1 is subtracted from the start coordinate):: + + + chr22 9999999 10001000 2.75 + chr22 10009999 10010100 2.25 + +------ + +.. class:: infomark + +**About formats** + +**GTF format** Gene Transfer Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GTF lines have nine tab-separated fields:: + + 1. seqname - Must be a chromosome or scaffold. + 2. source - The program that generated this feature. + 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. end - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. 
#!/usr/bin/env python
"""
Convert a GTF file to a sorted BEDGraph file.

Usage: gtf_to_bedgraph_converter.py <input.gtf> <output.bedgraph> <attribute_name>

The value column of every BEDGraph record is taken from the GTF attribute
named <attribute_name> (e.g. FPKM).
"""
import os, sys, tempfile
import subprocess

assert sys.version_info[:2] >= ( 2, 4 )

# The track definition line must precede the track data in a bedGraph file.
TRACK_LINE = "track type=bedGraph\n"


def _parse_gtf_attributes( attributes_field ):
    """
    Parse the GTF attributes column into a dict of name -> value.

    Attributes are separated by ';' and each one is a name followed by a
    value; surrounding double quotes are stripped from the value. Segments
    that do not hold both a name and a value (such as the empty segment after
    the final ';') are ignored. When a name occurs more than once the last
    occurrence wins.
    """
    attributes = {}
    for name_value_pair in attributes_field.split( ";" ):
        # Split on the first run of whitespace only, so that quoted values
        # containing spaces are kept whole.
        pair = name_value_pair.strip().split( None, 1 )
        if len( pair ) != 2:
            continue
        attributes[ pair[0] ] = pair[1].strip( " \"" )
    return attributes


def _gtf_line_to_bedgraph( line, attribute_name ):
    """
    Return the BEDGraph record (newline-terminated) for a single GTF line.

    GTF format: chrom, source, name, chromStart, chromEnd, score, strand, frame, attributes.
    BedGraph format: chrom, chromStart, chromEnd, value.

    Raises IndexError when the line has fewer than nine tab-separated columns,
    ValueError when the start coordinate is not an integer and KeyError when
    the line has no attribute called attribute_name.
    """
    elems = line.split( '\t' )
    start = int( elems[3] ) - 1 # GTF coordinates are 1-based, BedGraph are 0-based.
    value = _parse_gtf_attributes( elems[8] )[ attribute_name ]
    return "%s\t%d\t%s\t%s\n" % ( elems[0], start, elems[4], value )


def _write_records( in_file, out_file, attribute_name ):
    """
    Write one BEDGraph record to out_file for every convertible line of in_file.

    Returns ( converted_lines, skipped_lines, first_skipped_line ), where
    skipped lines are blank, comment or invalid lines and first_skipped_line
    is the 1-based number of the first of them (0 when nothing was skipped).
    """
    converted_lines = 0
    skipped_lines = 0
    first_skipped_line = 0
    for i, line in enumerate( in_file ):
        line = line.rstrip( '\r\n' )
        record = None
        if line and not line.startswith( '#' ):
            try:
                record = _gtf_line_to_bedgraph( line, attribute_name )
            except ( IndexError, KeyError, ValueError ):
                # Invalid line: record stays None and is counted as skipped below.
                record = None
        if record is None:
            skipped_lines += 1
            if not first_skipped_line:
                first_skipped_line = i + 1
        else:
            out_file.write( record )
            converted_lines += 1
    return converted_lines, skipped_lines, first_skipped_line


def convert_gtf_to_bedgraph( input_name, output_name, attribute_name ):
    """
    Convert the GTF file input_name to the BEDGraph file output_name.

    The output starts with the track definition line, followed by the track
    data sorted by chromosome name and then numerically by chromosome start.
    Sorting is delegated to the external 'sort' program so that inputs larger
    than memory can be handled.

    Returns ( converted_lines, skipped_lines, first_skipped_line ).
    Raises IOError/OSError when a file cannot be accessed or 'sort' cannot be
    started, and RuntimeError when 'sort' exits with a non-zero status.
    """
    # mkstemp creates the file and hands back an open descriptor, so the name
    # cannot be claimed by another process between creation and use.
    tmp_fd, tmp_name = tempfile.mkstemp()
    try:
        # Write unsorted track data to the temporary file.
        tmp_file = os.fdopen( tmp_fd, 'w' )
        try:
            in_file = open( input_name )
            try:
                counts = _write_records( in_file, tmp_file, attribute_name )
            finally:
                in_file.close()
        finally:
            tmp_file.close()

        out_file = open( output_name, 'w' )
        try:
            # The track definition line is written directly to the output and
            # flushed before 'sort' appends to the same descriptor; sorting it
            # together with the data would move it after the 'chr...' records.
            out_file.write( TRACK_LINE )
            out_file.flush()
            # Argument list and no shell: file names are never interpreted by a shell.
            return_code = subprocess.call( [ 'sort', '-k1,1', '-k2,2n', tmp_name ], stdout=out_file )
        finally:
            out_file.close()
        if return_code != 0:
            raise RuntimeError( "sort exited with status %d" % return_code )
    finally:
        os.remove( tmp_name )
    return counts


def __main__():
    """
    Command line entry point.

    Reads the input file name, output file name and attribute name from
    sys.argv[1:4], runs the conversion and prints a one-line summary. On
    failure the error is written to stderr and the exit status is 1.
    """
    # Read parms.
    input_name = sys.argv[1]
    output_name = sys.argv[2]
    attribute_name = sys.argv[3]

    # Do conversion.
    try:
        converted_lines, skipped_lines, first_skipped_line = convert_gtf_to_bedgraph( input_name, output_name, attribute_name )
    except ( EnvironmentError, RuntimeError ):
        # sys.exc_info() keeps this handler valid on every supported Python version.
        sys.stderr.write( "%s\n" % sys.exc_info()[1] )
        sys.exit( 1 )

    info_msg = "%i lines converted to BEDGraph. " % converted_lines
    if skipped_lines > 0:
        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % ( skipped_lines, first_skipped_line )
    sys.stdout.write( info_msg + "\n" )

if __name__ == "__main__": __main__()