details: http://www.bx.psu.edu/hg/galaxy/rev/dfaa18960944 changeset: 3558:dfaa18960944 user: Greg Von Kuster <greg@bx.psu.edu> date: Tue Mar 23 13:53:21 2010 -0400 description: Add the codingSnps tool. diffstat: test-data/codingSnps_input1.interval | 21 + test-data/codingSnps_input2.bed | 20 + test-data/codingSnps_output.interval | 21 + tool-data/codingSnps.loc.sample | 25 + tool_conf.xml.sample | 1 + tools/evolution/codingSnps.pl | 541 +++++++++++++++++++++++++++++++++++ tools/evolution/codingSnps.xml | 89 +++++ tools/evolution/codingSnps_filter.py | 43 ++ 8 files changed, 761 insertions(+), 0 deletions(-) diffs (799 lines): diff -r a67ca0795efb -r dfaa18960944 test-data/codingSnps_input1.interval --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/codingSnps_input1.interval Tue Mar 23 13:53:21 2010 -0400 @@ -0,0 +1,21 @@ +chr1 1415785 1415786 C C S N C N C N C +chr1 1420777 1420778 C C Y N C N C N C +chr1 1469195 1469196 A G R R G G G G G +chr1 1541789 1541790 T C Y Y N Y N T N +chr1 1548654 1548655 T T C N C N N N N +chr1 1549089 1549090 T C A A T A N A N +chr1 1551984 1551985 C C Y N C N N N N +chr1 1571349 1571350 G A R N N N N N N +chr1 1571354 1571355 G G R N N N N N N +chr1 1589750 1589751 A G G N G N G N G +chr1 1675899 1675900 G A C C G C G C N +chr1 1714578 1714579 G G R N A N G N G +chr1 1837838 1837839 T C Y N T N T N C +chr1 1839388 1839389 A A T T A T A T A +chr1 1839389 1839390 T T Y N T N T N C +chr1 1839603 1839604 G G R N G N G N R +chr1 1843968 1843969 A G G G R R A A G +chr1 1844405 1844406 C C Y Y C C N C Y +chr1 1876878 1876879 A G G N N N N N N +chr1 1886192 1886193 G N R N G N N N N +chr1 1890091 1890092 T C Y Y C C N C N diff -r a67ca0795efb -r dfaa18960944 test-data/codingSnps_input2.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/codingSnps_input2.bed Tue Mar 23 13:53:21 2010 -0400 @@ -0,0 +1,20 @@ +chr1 67051161 67163158 ENST00000371026 0 - 67052400 67163102 0 17 1290,157,227,99,122,158,152,87,203,195,156,140,157,113,185,175,226, 0,9470,13929,14921,20694,21100,22735,24819,27578,34593,49256,58479,61890,78263,80338,92310,111771, +chr1 67075869 67163055 ENST00000371023 0 - 67075923 67162979 0 10 198,203,195,156,140,157,113,185,175,123, 0,2870,9885,24548,33771,37182,53555,55630,67602,87063, +chr1 1843259 1849228 ENST00000378596 0 - 1843585 1847183 0 10 428,135,161,133,155,130,156,120,56,1278, 0,628,1024,1406,1814,2430,3822,4267,4496,4691, +chr1 1874611 1925136 ENST00000270720 0 - 1876876 1912255 0 18 2538,165,90,67,119,174,156,199,158,103,111,174,105,99,144,85,86,137, 0,3307,5807,6651,10436,11575,13058,15322,17346,18667,19624,20712,32073,33624,35199,35576,37577,50388, +chr1 1838889 1840572 ENST00000310991 0 - 1839180 1840564 0 5 572,179,42,44,92, 0,662,1302,1454,1591, +chr1 1838888 1840096 ENST00000378602 0 - 1839180 1839855 0 2 573,545, 0,663, +chr1 1672537 1699769 ENST00000344463 0 - 1674202 1686705 0 14 1822,83,158,100,155,103,86,155,204,143,169,84,219,182, 0,2328,2812,3045,3305,4135,5019,5264,5868,7862,8495,10713,13989,27050, +chr1 1836125 1838593 ENST00000307786 0 + 1836579 1838492 0 6 481,51,175,145,101,148, 0,858,1614,1925,2147,2320, +chr1 1672530 1701368 ENST00000378625 0 - 1674202 1686705 0 14 1829,83,158,100,155,103,86,155,204,143,169,84,219,165, 0,2335,2819,3052,3312,4142,5026,5271,5875,7869,8502,10720,13996,28673, +chr1 1672537 1699769 ENST00000341426 0 - 1674202 1686705 0 12 1822,83,158,100,155,103,86,106,130,84,219,182, 0,2328,2812,3045,3305,4135,5019,5264,5942,10713,13989,27050, +chr1 1540746 1555847 ENST00000378710 0 + 1540873 1555773 0 19 130,107,311,172,107,195,143,105,163,134,149,157,161,127,234,179,182,66,313, 0,1004,7885,8270,9291,9487,9782,11146,11333,11570,11792,12169,12515,12769,12985,13629,13881,14135,14788, +chr1 1672530 1700089 ENST00000341991 0 - 1674202 1686705 0 12 1829,83,158,100,155,103,86,106,130,84,219,45, 0,2335,2819,3052,3312,4142,5026,5271,5949,10720,13996,27514, +chr1 1540746 1555847 ENST00000355826 0 + 1540873 1555773 0 20 130,107,311,172,107,195,143,108,105,163,134,149,157,161,127,234,179,182,66,313, 0,1004,7885,8270,9291,9487,9782,10042,11146,11333,11570,11792,12169,12515,12769,12985,13629,13881,14135,14788, +chr1 1836125 1838593 ENST00000378604 0 + 1836579 1838492 0 5 481,175,145,101,148, 0,1614,1925,2147,2320, +chr1 1540746 1555847 ENST00000357882 0 + 1540873 1555773 0 19 130,107,311,172,107,143,108,105,163,134,149,157,161,127,234,179,182,66,313, 0,1004,7885,8270,9291,9782,10042,11146,11333,11570,11792,12169,12515,12769,12985,13629,13881,14135,14788, +chr1 1540746 1555847 ENST00000357882 0 + 1540873 1555773 0 19 130,107,311,172,107,143,108,105,163,134,149,157,161,127,234,179,182,66,313, 0,1004,7885,8270,9291,9782,10042,11146,11333,11570,11792,12169,12515,12769,12985,13629,13881,14135,14788, +chr1 67075991 67163158 ENST00000395250 0 - 67075991 67113089 0 11 27,45,203,195,156,140,157,113,185,175,226, 0,31,2748,9763,24426,33649,37060,53433,55508,67480,86941, +chr1 201326404 201403155 ENST00000309502 0 + 201364592 201401651 0 6 81,18,100,155,398,2144, 0,1039,37175,37624,38131,74607, +chr1 8335052 8508585 ENST00000377464 0 - 8337733 8508430 0 17 2715,181,147,721,223,1379,114,162,200,93,163,81,99,100,125,49,181, 0,3013,3694,5790,7358,7706,9357,10277,11650,12340,13406,70321,113519,142657,144999,156220,173352, +chr1 8335054 8800286 ENST00000400908 0 - 8337733 8638943 0 23 2713,181,147,721,223,1379,114,162,200,93,163,81,99,100,125,49,105,97,106,126,71,469,481, 0,3011,3692,5788,7356,7704,9355,10275,11648,12338,13404,70319,113517,142655,144997,156218,188805,204066,205009,262152,271901,303564,464751, diff -r a67ca0795efb -r dfaa18960944 test-data/codingSnps_output.interval --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/codingSnps_output.interval Tue Mar 23 13:53:21 2010 -0400 @@ -0,0 +1,21 @@ +chr1 1541789 1541790 C/T ENST00000355826 Phe:Leu/Phe 15 T +chr1 1541789 1541790 C/T ENST00000357882 Phe:Leu/Phe 15 T +chr1 1541789 1541790 C/T ENST00000357882 Phe:Leu/Phe 15 T +chr1 1541789 1541790 C/T ENST00000378710 Phe:Leu/Phe 15 T +chr1 1548654 1548655 C ENST00000355826 Met:Thr 45 T +chr1 1548654 1548655 C ENST00000357882 Met:Thr 45 T +chr1 1548654 1548655 C ENST00000357882 Met:Thr 45 T +chr1 1548654 1548655 C ENST00000378710 Met:Thr 45 T +chr1 1675899 1675900 C ENST00000341991 Asn:Lys 262 G +chr1 1675899 1675900 C ENST00000378625 Asn:Lys 407 G +chr1 1675899 1675900 C ENST00000341426 Asn:Lys 262 G +chr1 1675899 1675900 C ENST00000344463 Asn:Lys 407 G +chr1 1837838 1837839 C/T ENST00000307786 Trp:Arg/Trp 60 T +chr1 1837838 1837839 C/T ENST00000378604 Trp:Arg/Trp 43 T +chr1 1839388 1839389 T ENST00000378602 Met:Lys 126 A +chr1 1839388 1839389 T ENST00000310991 Met:Lys 141 A +chr1 1839389 1839390 C/T ENST00000378602 Met:Val/Met 126 T +chr1 1839389 1839390 C/T ENST00000310991 Met:Val/Met 141 T +chr1 1844405 1844406 C/T ENST00000378596 Glu:Glu/Lys 187 C +chr1 1876878 1876879 G ENST00000270720 Stop:Gln 763 A +chr1 1890091 1890092 C/T ENST00000270720 Ile:Val/Ile 363 T diff -r a67ca0795efb -r dfaa18960944 tool-data/codingSnps.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/codingSnps.loc.sample Tue Mar 23 13:53:21 2010 -0400 @@ -0,0 +1,25 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of nib genome files for use with codingSnps. You will +#need to supply these files and then create a codingSnps.loc file +#similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The codingSnps.loc +#file has this format (white space characters are TAB characters): +# +#<build> <file_path> +# +#So, for example, if your codingSnps.loc began like this: +# +#hg18 /afs/bx.psu.edu/depot/data/genome/hg18/ + +# +#then your /afs/bx.psu.edu/depot/data/genome/hg18/ directory +#would need to contain the following 2bit file: +# +# +#-rw-r--r-- 1 g2data g2data 807604784 Dec 8 13:21 hg18.2bit + +#Your codingSnps.loc file should include an entry per line for +#each file you have stored that you want to be available. Note that +#your files should all have the extension '2bit'. + +hg18 /afs/bx.psu.edu/depot/data/genome/hg18 diff -r a67ca0795efb -r dfaa18960944 tool_conf.xml.sample --- a/tool_conf.xml.sample Tue Mar 23 13:15:56 2010 -0400 +++ b/tool_conf.xml.sample Tue Mar 23 13:53:21 2010 -0400 @@ -161,6 +161,7 @@ <tool file="hyphy/hyphy_nj_tree_wrapper.xml" /> <tool file="hyphy/hyphy_dnds_wrapper.xml" /> <tool file="evolution/mutate_snp_codon.xml" /> + <tool file="evolution/codingSnps.xml" /> </section> <section name="Metagenomic analyses" id="tax_manipulation"> <tool file="taxonomy/gi2taxonomy.xml" /> diff -r a67ca0795efb -r dfaa18960944 tools/evolution/codingSnps.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/evolution/codingSnps.pl Tue Mar 23 13:53:21 2010 -0400 @@ -0,0 +1,541 @@ +#!/usr/bin/perl -w +use strict; + +######################################################################### +# codingSnps.pl +# This takes a bed file with the names being / separated nts +# and a gene bed file with cds start and stop. +# It then checks for changes in coding regions, reporting +# those that cause a frameshift or substitution in the amino acid. +######################################################################### + +my $uniq = 0; # flag for whether want uniq positions +my $syn = 0; # flag for if want synonomous changes rather than non-syn +my $seqFlag = "2bit"; # flag to set sequence type 2bit|nib +my $nibDir; # directory containg data +my $nibTag; # tag for directory above + +################################################################################ +# Parse command line arguments # +################################################################################ + +# make sure we have enough arguments +if (!@ARGV or scalar @ARGV < 3) { + print STDERR "Usage: codingSnps.pl snps.bed genes.bed locfile.loc [chr=# start=# end=# snp=#] output_file\n"; + exit; +} + +# get first three command line arguments +my $snpFile = shift @ARGV; +my $geneFile = shift @ARGV; +my $locFile = shift @ARGV; + +# read $locFile to get $nibDir (ignoring commets) +# FIXME: the last entry is the one you get +open(LF, "< $locFile") || die "open($locFile): $!\n"; +while(<LF>) { + s/#.*$//; + s/(?:^\s+|\s+$)//g; + next if (/^$/); + + # $tag and $path are set for each "valid" line in the file + ($nibTag, $nibDir) = split(/\t/); +} +close(LF); + +# bed like columns in default positions +my $col0 = 0; +my $col1 = 1; +my $col2 = 2; +my $col3 = 3; + +# get column positions for chr, start, end, snp +# column positions 1 based coming in (for Galaxy) +foreach (@ARGV) { + if (/^chr=(\d+)$/) { + $col0 = $1 - 1; + } elsif (/^start=(\d+)$/) { + $col1 = $1 - 1; + } elsif (/^end=(\d+)$/) { + $col2 = $1 - 1; + } elsif (/^snp=(\d+)$/) { + $col3 = $1 - 1; + } +} + +# make sure the column positions are sane +if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) { + print STDERR "ERROR column numbers are given with origin 1\n"; + exit 1; +} + +# get the output_file from the command line arguments +my $outFile = $ARGV[$#ARGV]; + +open(OUTFILE, "> $outFile") || die "open($outFile): $!\n"; + +################################################################################ +# Initialization # +################################################################################ + +my @genes; #bed lines for genes, sorted by chrom and start +my %chrSt; #index in array where each chrom starts +my %codon; #hash of codon amino acid conversions +my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom +my $ignoreN = 1; #skip N + +my %amb = ( +"R" => "A/G", +"Y" => "C/T", +"S" => "C/G", +"W" => "A/T", +"K" => "G/T", +"M" => "A/C", +"B" => "C/G/T", +"D" => "A/G/T", +"H" => "A/C/T", +"V" => "A/C/G", +"N" => "A/C/G/T" +); + +fill_codon(); + +################################################################################ +# Main # +################################################################################ + +open(FH, "cat $geneFile | sort -k1,1 -k2,2n |") + or die "Couldn't open and sort $geneFile, $!\n"; +my $i = 0; +while(<FH>) { + chomp; + if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; } + push(@genes, "$_"); + my @f = split(/\t/); + if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; } + $i++; +} +close FH or die "Couldn't close $geneFile, $!\n"; + +if ($ends) { print STDERR "TESTING using block ends rather than sizes\n"; } + +#open snps sorted as well +my $s1 = $col0 + 1; #sort order is origin 1 +my $s2 = $col1 + 1; +open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |") + or die "Couldn't open and sort $snpFile, $!\n"; +$i = 0; +my @g; #one genes fields, should be used repeatedly +my %done; +while(<FH>) { + chomp; + my @s = split(/\t/); #SNP fields + if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; } + my $size = $#s; + if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) { + print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n"; + exit 1; + } + if ($s[$col1] =~ /\D/) { + print STDERR "ERROR the start point must be an integer not $s[$col1]\n"; + exit 1; + } + if ($s[$col2] =~ /\D/) { + print STDERR "ERROR the start point must be an integer not $s[$col2]\n"; + exit 1; + } + if ($s[$col3] eq 'N' && $ignoreN) { next; } + if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; } + if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row + $i = $chrSt{$s[$col0]}; + @g = split(/\t/, $genes[$i]); + }elsif (!@g) { + next; #no gene for this chrom + }elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new chrom + $i = $chrSt{$s[$col0]}; + @g = split(/\t/, $genes[$i]); + }elsif ($s[$col0] ne $g[0]) { + next; #no gene for this chrom + }elsif ($s[$col1] < $g[1] && $i == $chrSt{$s[$col0]}) { + next; #before any genes + }elsif ($s[$col1] > $g[2] && ($i == $#genes or $genes[$i+1] !~ $s[$col0])) { + next; #after all genes on chr + }else { + while ($s[$col1] > $g[2] && $i < $#genes) { + $i++; + @g = split(/\t/, $genes[$i]); + if ($s[$col0] ne $g[0]) { last; } #end of gene + } + if ($s[$col0] ne $g[0] or $s[$col1] < $g[1] or $s[$col1] > $g[2]) { + next; #no overlap with genes + } + } + + processSnp(\@s, \@g); + if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { next; } + + my $k = $i + 1; #check for more genes without losing data of first + if ($k <= $#genes) { + my @g2 = split(/\t/, $genes[$k]); + while (@g2 && $k <= $#genes) { + @g2 = split(/\t/, $genes[$k]); + if ($s[$col0] ne $g2[0]) { + undef @g2; + last; #not same chrom + }else { + while ($s[$col1] > $g2[2] && $k <= $#genes) { + $k++; + @g2 = split(/\t/, $genes[$k]); + if ($s[$col0] ne $g2[0]) { last; } #end of chrom + } + if ($s[$col0] ne $g2[0] or $s[$col1] < $g2[1] or $s[$col1] > $g2[2]) { + undef @g2; + last; #no overlap with more genes + } + processSnp(\@s, \@g2); + if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { last; } + } + $k++; + } + } +} +close FH or die "Couldn't close $snpFile, $!\n"; +close(OUTFILE) || die "close($outFile): $!\n"; + +exit; + +################################################################################ +# Subroutines # +################################################################################ + +sub processSnp { + my $sref = shift; + my $gref = shift; + #overlaps gene, but maybe not coding seq + #inside cds + if ($sref->[$col1] + 1 < $gref->[6] or $sref->[$col2] > $gref->[7]) { + return; #outside of coding + } + #now check exon + my $i = 0; + my @st = split(/,/, $gref->[11]); + my @size = split(/,/, $gref->[10]); + if (scalar @st ne $gref->[9]) { die "bad gene $gref->[3]\n"; } + my @pos; + my $in = 0; + for($i = 0; $i < $gref->[9]; $i++) { + my $sta = $gref->[1] + $st[$i] + 1; #1 based position + my $end = $sta + $size[$i] - 1; # + if ($ends) { $end = $size[$i]; $sta = $st[$i] + 1; } #ends instead of sizes + if ($end < $gref->[6]) { next; } #utr only + if ($sta > $gref->[7]) { next; } #utr only + #shorten to coding only + if ($sta < $gref->[6]) { $sta = $gref->[6] + 1; } + if ($end > $gref->[7]) { $end = $gref->[7]; } + if ($sref->[$col1] + 1 >= $sta && $sref->[$col2] <= $end) { $in = 1; } + elsif ($sref->[$col1] == $sref->[$col2] && $sref->[$col2] <= $end && $sref->[$col2] >= $sta) { $in = 1; } + push(@pos, ($sta .. $end)); #add exon worth of positions + } + #@pos has coding positions for whole gene (chr coors), + #and $in has whether we need to continue + if (!$in) { return; } #not in coding exon + if ((scalar @pos) % 3 != 0) { return; } #partial gene? not even codons + if ($sref->[$col3] =~ /^-+\/[ACTG]+$/ or $sref->[$col3] =~ /^[ACTG]+\/-+$/ or + $sref->[$col3] =~ /^-+$/) { #indel or del + my $copy = $sref->[$col3]; + my $c = ($copy =~ tr/-//); + if ($c % 3 == 0) { return; } #not frameshift + #handle bed4 to bed4 + 4 (pgSnp) + print OUTFILE "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; + #if ($sref->[4]) { print "\t$sref->[4]"; } + #if ($sref->[5]) { print "\t$sref->[5]"; } + #if ($sref->[6]) { print "\t$sref->[6]"; } + #if ($sref->[7]) { print "\t$sref->[7]"; } + print OUTFILE "\t$gref->[3]\tframeshift\n"; + $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; + return; + }elsif ($sref->[$col1] == $sref->[$col2]) { #insertion + my $copy = $sref->[$col3]; + my $c = ($copy =~ tr/\[ACTG]+//); + if ($c % 3 == 0) { return; } #not frameshift + #handle bed4 to bed4 + 4 (pgSnp) + print OUTFILE "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; + #if ($sref->[4]) { print "\t$sref->[4]"; } + #if ($sref->[5]) { print "\t$sref->[5]"; } + #if ($sref->[6]) { print "\t$sref->[6]"; } + #if ($sref->[7]) { print "\t$sref->[7]"; } + print OUTFILE "\t$gref->[3]\tframeshift\n"; + $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; + return; + } + #check for amino acid substitutions + my $s = $sref->[$col1] + 1; + my $e = $sref->[$col2]; + my $len = $sref->[$col2] - $sref->[$col1]; + if ($gref->[5] eq '-') { + @pos = reverse(@pos); + my $t = $s; + $s = $e; + $e = $t; + } + $i = 0; + my $found = 0; + foreach (@pos) { + if ($s == $_) { + $found = 1; + last; + } + $i++; + } + if ($found) { + my $fs = $i; #keep original start index + #have index where substitution starts + my $cp = $i % 3; + $i -= $cp; #i is now first position in codon + my $cdNum = int($i / 3) + 1; + my $ls = $i; + if (!defined $ls) { die "ERROR not defined ls for $fs $sref->[$col2]\n"; } + if (!@pos) { die "ERROR not defined array pos\n"; } + if (!defined $pos[$ls]) { die "ERROR not defined pos at $ls\n"; } + if (!defined $e) { die "ERROR not defined e for $pos[0] $pos[1] $pos[2]\n"; } + while ($ls <= $#pos && $pos[$ls] ne $e) { + $ls++; + } + my $i2 = $ls + (2 - ($ls % 3)); + if ($i2 > $#pos) { return; } #not a full codon, partial gene? + + if ($i2 - $i < 2) { die "not a full codon positions $i to $i2 for $sref->[3]\n"; } + my $oldnts = getnts($sref->[$col0], @pos[$i..$i2]); + if (!$oldnts) { die "Failed to get sequence for $sref->[$col0] $pos[$i] .. $pos[$i2]\n"; } + my @vars = split(/\//, $sref->[$col3]); + if ($gref->[5] eq '-') { #complement oldnts and revcomp vars + $oldnts = compl($oldnts); + $oldnts = join('', (reverse(split(/ */, $oldnts)))); + foreach (@vars) { + $_ = reverse(split(/ */)); + $_ = compl($_); + } + } + my $r = $fs - $i; #difference in old indexes gives new index + my @newnts; + my $changed = ''; + foreach my $v (@vars) { + my @new = split(/ */, $oldnts); + $changed = splice(@new, $r, $len, split(/ */, $v)); + #should only change single nt + push(@newnts, join("", @new)); + } + #now compute amino acids + my $oldaa = getaa($oldnts); + my @newaa; + my $change = 0; #flag for if there is a change + foreach my $v (@newnts) { + my $t = getaa($v); + if ($t ne $oldaa) { $change = 1; } + push(@newaa, $t); + } + if (!$change && $syn) { + print OUTFILE "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; + print OUTFILE "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n"; + return; + }elsif ($syn) { return; } #only want synonymous changes + if (!$change) { return; } #no change in amino acids +#if (abs($pos[$i] - $pos[$i2]) > 200) { +#print STDERR "TESTING found mutation at splice site $sref->[0]\t$sref->[1]\t$sref->[2]\n"; +#print STDERR "old $oldaa, new ", join(', ', @newaa), "\n"; +#print STDERR "oldnt $oldnts, strand $gref->[5]\n"; +#exit; +#} + print OUTFILE "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; + #if (defined $sref->[4]) { print "\t$sref->[4]"; } + #if (defined $sref->[5]) { print "\t$sref->[5]"; } + #if (defined $sref->[6]) { print "\t$sref->[6]"; } + #if (defined $sref->[7]) { print "\t$sref->[7]"; } + if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref + print OUTFILE "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n"; + $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; + } +} + +sub getnts { + my $chr = shift; + my @pos = @_; #list of positions not necessarily in order + #list may be reversed or have gaps(introns), at least 3 bps + my $seq = ''; + if (scalar @pos < 3) { die "too small region for $chr $pos[0]\n"; } + if ($pos[0] < $pos[1]) { #not reversed + my $s = $pos[0]; + for(my $i = 1; $i <= $#pos; $i++) { + if ($pos[$i] == $pos[$i-1] + 1) { next; } + if ($seqFlag eq '2bit') { + $seq .= fetchSeq2bit($chr, $s, $pos[$i-1]); + }else { + $seq .= fetchSeqNib($chr, $s, $pos[$i-1]); + } + $s = $pos[$i]; + } + if (length $seq != scalar @pos) { #still need to fetch seq +#if (abs($pos[$#pos]-$pos[0]) > 200) { +#print STDERR "TESTING have split codon $chr $pos[0] $pos[$#pos]\n"; +#exit; +#} + if ($seqFlag eq '2bit') { + $seq .= fetchSeq2bit($chr, $s, $pos[$#pos]); + }else { + $seq .= fetchSeqNib($chr, $s, $pos[$#pos]); + } + } + }else { #reversed + my $s = $pos[$#pos]; + for(my $i = $#pos -1; $i >= 0; $i--) { + if ($pos[$i] == $pos[$i+1] + 1) { next; } + if ($seqFlag eq '2bit') { + $seq .= fetchSeq2bit($chr, $s, $pos[$i+1]); + }else { + $seq .= fetchSeqNib($chr, $s, $pos[$i+1]); + } + $s = $pos[$i]; + } + if (length $seq != scalar @pos) { #still need to fetch seq +#if (abs($pos[$#pos]-$pos[0]) > 200) { +#print STDERR "TESTING have split codon $pos[0] .. $pos[$#pos]\n"; +#} + if ($seqFlag eq '2bit') { + $seq .= fetchSeq2bit($chr, $s, $pos[0]); + }else { + $seq .= fetchSeqNib($chr, $s, $pos[0]); + } + } + } +} + +sub fetchSeq2bit { + my $chr = shift; + my $st = shift; + my $end = shift; + my $strand = '+'; + $st--; #change to UCSC numbering + open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir/$nibTag.2bit stdout |") or + die "Couldn't run twoBitToFa, $!\n"; + my $seq = ''; + while (<BIT>) { + chomp; + if (/^>/) { next; } #header + $seq .= $_; + } + close BIT or die "Couldn't finish nibFrag on $chr $st $end, $!\n"; + return $seq; +} + +sub fetchSeqNib { + my $chr = shift; + my $st = shift; + my $end = shift; + my $strand = '+'; + $st--; #change to UCSC numbering + open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n"; + my $seq = ''; + while (<NIB>) { + chomp; + if (/^>/) { next; } #header + $seq .= $_; + } + close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n"; + return $seq; +} + +sub compl { + my $nts = shift; + my $comp = ''; + foreach my $n (split(/ */, $nts)) { + if ($n eq 'A') { $comp .= 'T'; } + elsif ($n eq 'T') { $comp .= 'A'; } + elsif ($n eq 'C') { $comp .= 'G'; } + elsif ($n eq 'G') { $comp .= 'C'; } + elsif ($n eq 'N') { $comp .= 'N'; } + elsif ($n eq '-') { $comp .= '-'; } #deletion + else { die "Couldn't do complement of $n for $nts\n"; } + } + return $comp; +} + +sub getaa { + my $nts = shift; #in multiples of 3 + my $aa = ''; + my @n = split(/ */, $nts); + while (@n) { + my @t = splice(@n, 0, 3); + my $n = uc(join("", @t)); + if (!exists $codon{$n}) { $aa .= 'N'; next; } + $aa .= $codon{$n}; + } + return $aa; +} + +sub fill_codon { + $codon{GCA} = 'Ala'; + $codon{GCC} = 'Ala'; + $codon{GCG} = 'Ala'; + $codon{GCT} = 'Ala'; + $codon{CGG} = 'Arg'; + $codon{CGT} = 'Arg'; + $codon{CGC} = 'Arg'; + $codon{AGA} = 'Arg'; + $codon{AGG} = 'Arg'; + $codon{CGA} = 'Arg'; + $codon{AAC} = 'Asn'; + $codon{AAT} = 'Asn'; + $codon{GAC} = 'Asp'; + $codon{GAT} = 'Asp'; + $codon{TGC} = 'Cys'; + $codon{TGT} = 'Cys'; + $codon{CAG} = 'Gln'; + $codon{CAA} = 'Gln'; + $codon{GAA} = 'Glu'; + $codon{GAG} = 'Glu'; + $codon{GGG} = 'Gly'; + $codon{GGA} = 'Gly'; + $codon{GGC} = 'Gly'; + $codon{GGT} = 'Gly'; + $codon{CAC} = 'His'; + $codon{CAT} = 'His'; + $codon{ATA} = 'Ile'; + $codon{ATT} = 'Ile'; + $codon{ATC} = 'Ile'; + $codon{CTA} = 'Leu'; + $codon{CTC} = 'Leu'; + $codon{CTG} = 'Leu'; + $codon{CTT} = 'Leu'; + $codon{TTG} = 'Leu'; + $codon{TTA} = 'Leu'; + $codon{AAA} = 'Lys'; + $codon{AAG} = 'Lys'; + $codon{ATG} = 'Met'; + $codon{TTC} = 'Phe'; + $codon{TTT} = 'Phe'; + $codon{CCT} = 'Pro'; + $codon{CCA} = 'Pro'; + $codon{CCC} = 'Pro'; + $codon{CCG} = 'Pro'; + $codon{TCA} = 'Ser'; + $codon{AGC} = 'Ser'; + $codon{AGT} = 'Ser'; + $codon{TCC} = 'Ser'; + $codon{TCT} = 'Ser'; + $codon{TCG} = 'Ser'; + $codon{TGA} = 'Stop'; + $codon{TAG} = 'Stop'; + $codon{TAA} = 'Stop'; + $codon{ACT} = 'Thr'; + $codon{ACA} = 'Thr'; + $codon{ACC} = 'Thr'; + $codon{ACG} = 'Thr'; + $codon{TGG} = 'Trp'; + $codon{TAT} = 'Tyr'; + $codon{TAC} = 'Tyr'; + $codon{GTC} = 'Val'; + $codon{GTA} = 'Val'; + $codon{GTG} = 'Val'; + $codon{GTT} = 'Val'; +} + diff -r a67ca0795efb -r dfaa18960944 tools/evolution/codingSnps.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/evolution/codingSnps.xml Tue Mar 23 13:53:21 2010 -0400 @@ -0,0 +1,89 @@ +<tool id="codingSnps" name="Amino-acid changes"> + <description>caused by a set of SNPs</description> + <command interpreter="perl"> + codingSnps.pl $input1 $input2 ${GALAXY_DATA_INDEX_DIR}/codingSnps.loc chr=${input1.metadata.chromCol} start=${input1.metadata.startCol} end=${input1.metadata.endCol} snp=$col1 $out_file1 + </command> + <inputs> + <param format="interval" name="input1" type="data" label="SNPs"/> + <param format="interval" name="input2" type="data" label="genes"/> + <param name="col1" label="Column with SNPs" type="data_column" data_ref="input1" /> + </inputs> + <outputs> + <data format="input" name="out_file1" /> + </outputs> + <code file="codingSnps_filter.py"></code> + <requirements> + <requirement type="binary">twoBitToFa</requirement> + </requirements> + <tests> + <test> + <param name="input1" value="codingSnps_input1.interval" dbkey="hg18" /> + <param name="input2" value="codingSnps_input2.bed" dbkey="hg18" /> + <param name="col1" value="6" /> + <output name="output" file="codingSnps_output.interval" /> + </test> + </tests> + + <help> +This tool identifies which SNPs create amino-acid changes in the specified coding regions. + +**Example** + +- first input file, with SNPs:: + + chr22 14440426 14440427 C/T + chr22 14494851 14494852 A/G + chr22 14494911 14494912 A/T + chr22 14550435 14550436 A/G + chr22 14611956 14611957 G/T + chr22 14612076 14612077 A/G + chr22 14668537 14668538 C + chr22 14668703 14668704 A/T + chr22 14668775 14668776 G + chr22 14680074 14680075 A/T + etc. + + alternatively indicating polymorphisms using ambiguous-nucleotide symbols: + + chr22 14440426 14440427 Y + chr22 14494851 14494852 R + chr22 14494911 14494912 W + chr22 14550435 14550436 R + chr22 14611956 14611957 K + chr22 14612076 14612077 R + chr22 14668537 14668538 C + chr22 14668703 14668704 W + chr22 14668775 14668776 G + chr22 14680074 14680075 W + etc. + +- second input file, with UCSC annotations for human genes:: + + chr22 14504263 14572999 uc002zkr.2 0 - 14504263 14504263 0 5 710,91,136,138,94, 0,38133,62547,62901,68642, + chr22 14527995 14572999 uc010gqo.1 0 - 14527995 14527995 0 4 3826,91,136,94, 0,14401,38815,44910, + chr22 14542065 14552264 uc002zkt.2 0 + 14542065 14542065 0 3 323,88,313, 0,2416,9886, + chr22 14559619 14561004 uc002zku.2 0 - 14559619 14559619 0 1 1385, 0, + chr22 14567164 14572999 uc002zkv.2 0 - 14567164 14567164 0 5 138,112,115,111,94, 0,1867,2099,3516,5741, + chr22 14620243 14620281 uc002zkw.1 0 - 14620243 14620243 0 1 38, 0, + chr22 14620300 14620339 uc002zkx.1 0 - 14620300 14620300 0 1 39, 0, + chr22 14621086 14621125 uc002zky.1 0 + 14621086 14621086 0 1 39, 0, + chr22 14622000 14622030 uc002zkz.1 0 - 14622000 14622000 0 1 30, 0, + chr22 14623380 14623414 uc002zla.1 0 - 14623380 14623380 0 1 34, 0, + etc. + +- output file, showing non-synonymous substitutions in coding regions:: + + chr22 15452482 15452483 G uc002zlp.1 Trp:Arg 320 A + chr22 15644564 15644565 T uc002zlv.1 His:Asn 442 G + chr22 15645123 15645124 C uc002zlv.1 Phe:Leu 255 A + chr22 15645193 15645194 A/G uc002zlv.1 Pro:Leu/Pro 232 G + chr22 15660821 15660822 A/G uc002zlv.1 Thr:Met/Thr 143 G + chr22 15969208 15969209 C/T uc002zly.1 Ala:Ala/Val 367 C + chr22 15969208 15969209 C/T uc010gqt.1 Ala:Ala/Val 315 C + chr22 15999075 15999076 C/G uc002zmd.1 Arg:Arg/Ser 180 C + chr22 15999075 15999076 C/G uc002zme.1 Arg:Arg/Ser 161 C + chr22 15999075 15999076 C/G uc002zmf.1 Arg:Arg/Ser 369 C + etc. + +</help> +</tool> diff -r a67ca0795efb -r dfaa18960944 tools/evolution/codingSnps_filter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/evolution/codingSnps_filter.py Tue Mar 23 13:53:21 2010 -0400 @@ -0,0 +1,43 @@ +# runs after the job (and after the default post-filter) +import os +from galaxy import eggs +from galaxy import jobs +from galaxy.tools.parameters import DataToolParameter +# Older py compatibility +try: + set() +except: + from sets import Set as set + +def validate_input( trans, error_map, param_values, page_param_map ): + dbkeys = set() + data_param_names = set() + data_params = 0 + for name, param in page_param_map.iteritems(): + if isinstance( param, DataToolParameter ): + # for each dataset parameter + if param_values.get(name, None) != None: + dbkeys.add( param_values[name].dbkey ) + data_params += 1 + # check meta data + try: + param = param_values[name] + startCol = int( param.metadata.startCol ) + endCol = int( param.metadata.endCol ) + chromCol = int( param.metadata.chromCol ) + if param.metadata.strandCol is not None: + strandCol = int ( param.metadata.strandCol ) + else: + strandCol = 0 + except: + error_msg = "The attributes of this dataset are not properly set. " + \ + "Click the pencil icon in the history item to set the chrom, start, end and strand columns." + error_map[name] = error_msg + data_param_names.add( name ) + if len( dbkeys ) > 1: + for name in data_param_names: + error_map[name] = "All datasets must belong to same genomic build, " \ + "this dataset is linked to build '%s'" % param_values[name].dbkey + if data_params != len(data_param_names): + for name in data_param_names: + error_map[name] = "A dataset of the appropriate type is required"