[hg] galaxy 3558: Add the codingSnps tool.

24 Mar 2010

details:   http://www.bx.psu.edu/hg/galaxy/rev/dfaa18960944
changeset: 3558:dfaa18960944
user:      Greg Von Kuster <greg@bx.psu.edu>
date:      Tue Mar 23 13:53:21 2010 -0400
description:
Add the codingSnps tool.

diffstat:

 test-data/codingSnps_input1.interval |   21 +
 test-data/codingSnps_input2.bed      |   20 +
 test-data/codingSnps_output.interval |   21 +
 tool-data/codingSnps.loc.sample      |   25 +
 tool_conf.xml.sample                 |    1 +
 tools/evolution/codingSnps.pl        |  541 +++++++++++++++++++++++++++++++++++
 tools/evolution/codingSnps.xml       |   89 +++++
 tools/evolution/codingSnps_filter.py |   43 ++
 8 files changed, 761 insertions(+), 0 deletions(-)

diffs (799 lines):

diff -r a67ca0795efb -r dfaa18960944 test-data/codingSnps_input1.interval

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/codingSnps_input1.interval	Tue Mar 23 13:53:21 2010 -0400
@@ -0,0 +1,21 @@
+chr1	1415785	1415786	C	C	S	N	C	N	C	N	C
+chr1	1420777	1420778	C	C	Y	N	C	N	C	N	C
+chr1	1469195	1469196	A	G	R	R	G	G	G	G	G
+chr1	1541789	1541790	T	C	Y	Y	N	Y	N	T	N
+chr1	1548654	1548655	T	T	C	N	C	N	N	N	N
+chr1	1549089	1549090	T	C	A	A	T	A	N	A	N
+chr1	1551984	1551985	C	C	Y	N	C	N	N	N	N
+chr1	1571349	1571350	G	A	R	N	N	N	N	N	N
+chr1	1571354	1571355	G	G	R	N	N	N	N	N	N
+chr1	1589750	1589751	A	G	G	N	G	N	G	N	G
+chr1	1675899	1675900	G	A	C	C	G	C	G	C	N
+chr1	1714578	1714579	G	G	R	N	A	N	G	N	G
+chr1	1837838	1837839	T	C	Y	N	T	N	T	N	C
+chr1	1839388	1839389	A	A	T	T	A	T	A	T	A
+chr1	1839389	1839390	T	T	Y	N	T	N	T	N	C
+chr1	1839603	1839604	G	G	R	N	G	N	G	N	R
+chr1	1843968	1843969	A	G	G	G	R	R	A	A	G
+chr1	1844405	1844406	C	C	Y	Y	C	C	N	C	Y
+chr1	1876878	1876879	A	G	G	N	N	N	N	N	N
+chr1	1886192	1886193	G	N	R	N	G	N	N	N	N
+chr1	1890091	1890092	T	C	Y	Y	C	C	N	C	N
diff -r a67ca0795efb -r dfaa18960944 test-data/codingSnps_input2.bed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/codingSnps_input2.bed	Tue Mar 23 13:53:21 2010 -0400
@@ -0,0 +1,20 @@
+chr1	67051161	67163158	ENST00000371026	0	-	67052400	67163102	0	17	1290,157,227,99,122,158,152,87,203,195,156,140,157,113,185,175,226,	0,9470,13929,14921,20694,21100,22735,24819,27578,34593,49256,58479,61890,78263,80338,92310,111771,
+chr1	67075869	67163055	ENST00000371023	0	-	67075923	67162979	0	10	198,203,195,156,140,157,113,185,175,123,	0,2870,9885,24548,33771,37182,53555,55630,67602,87063,
+chr1	1843259	1849228	ENST00000378596	0	-	1843585	1847183	0	10	428,135,161,133,155,130,156,120,56,1278,	0,628,1024,1406,1814,2430,3822,4267,4496,4691,
+chr1	1874611	1925136	ENST00000270720	0	-	1876876	1912255	0	18	2538,165,90,67,119,174,156,199,158,103,111,174,105,99,144,85,86,137,	0,3307,5807,6651,10436,11575,13058,15322,17346,18667,19624,20712,32073,33624,35199,35576,37577,50388,
+chr1	1838889	1840572	ENST00000310991	0	-	1839180	1840564	0	5	572,179,42,44,92,	0,662,1302,1454,1591,
+chr1	1838888	1840096	ENST00000378602	0	-	1839180	1839855	0	2	573,545,	0,663,
+chr1	1672537	1699769	ENST00000344463	0	-	1674202	1686705	0	14	1822,83,158,100,155,103,86,155,204,143,169,84,219,182,	0,2328,2812,3045,3305,4135,5019,5264,5868,7862,8495,10713,13989,27050,
+chr1	1836125	1838593	ENST00000307786	0	+	1836579	1838492	0	6	481,51,175,145,101,148,	0,858,1614,1925,2147,2320,
+chr1	1672530	1701368	ENST00000378625	0	-	1674202	1686705	0	14	1829,83,158,100,155,103,86,155,204,143,169,84,219,165,	0,2335,2819,3052,3312,4142,5026,5271,5875,7869,8502,10720,13996,28673,
+chr1	1672537	1699769	ENST00000341426	0	-	1674202	1686705	0	12	1822,83,158,100,155,103,86,106,130,84,219,182,	0,2328,2812,3045,3305,4135,5019,5264,5942,10713,13989,27050,
+chr1	1540746	1555847	ENST00000378710	0	+	1540873	1555773	0	19	130,107,311,172,107,195,143,105,163,134,149,157,161,127,234,179,182,66,313,	0,1004,7885,8270,9291,9487,9782,11146,11333,11570,11792,12169,12515,12769,12985,13629,13881,14135,14788,
+chr1	1672530	1700089	ENST00000341991	0	-	1674202	1686705	0	12	1829,83,158,100,155,103,86,106,130,84,219,45,	0,2335,2819,3052,3312,4142,5026,5271,5949,10720,13996,27514,
+chr1	1540746	1555847	ENST00000355826	0	+	1540873	1555773	0	20	130,107,311,172,107,195,143,108,105,163,134,149,157,161,127,234,179,182,66,313,	0,1004,7885,8270,9291,9487,9782,10042,11146,11333,11570,11792,12169,12515,12769,12985,13629,13881,14135,14788,
+chr1	1836125	1838593	ENST00000378604	0	+	1836579	1838492	0	5	481,175,145,101,148,	0,1614,1925,2147,2320,
+chr1	1540746	1555847	ENST00000357882	0	+	1540873	1555773	0	19	130,107,311,172,107,143,108,105,163,134,149,157,161,127,234,179,182,66,313,	0,1004,7885,8270,9291,9782,10042,11146,11333,11570,11792,12169,12515,12769,12985,13629,13881,14135,14788,
+chr1	1540746	1555847	ENST00000357882	0	+	1540873	1555773	0	19	130,107,311,172,107,143,108,105,163,134,149,157,161,127,234,179,182,66,313,	0,1004,7885,8270,9291,9782,10042,11146,11333,11570,11792,12169,12515,12769,12985,13629,13881,14135,14788,
+chr1	67075991	67163158	ENST00000395250	0	-	67075991	67113089	0	11	27,45,203,195,156,140,157,113,185,175,226,	0,31,2748,9763,24426,33649,37060,53433,55508,67480,86941,
+chr1	201326404	201403155	ENST00000309502	0	+	201364592	201401651	0	6	81,18,100,155,398,2144,	0,1039,37175,37624,38131,74607,
+chr1	8335052	8508585	ENST00000377464	0	-	8337733	8508430	0	17	2715,181,147,721,223,1379,114,162,200,93,163,81,99,100,125,49,181,	0,3013,3694,5790,7358,7706,9357,10277,11650,12340,13406,70321,113519,142657,144999,156220,173352,
+chr1	8335054	8800286	ENST00000400908	0	-	8337733	8638943	0	23	2713,181,147,721,223,1379,114,162,200,93,163,81,99,100,125,49,105,97,106,126,71,469,481,	0,3011,3692,5788,7356,7704,9355,10275,11648,12338,13404,70319,113517,142655,144997,156218,188805,204066,205009,262152,271901,303564,464751,
diff -r a67ca0795efb -r dfaa18960944 test-data/codingSnps_output.interval
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/codingSnps_output.interval	Tue Mar 23 13:53:21 2010 -0400
@@ -0,0 +1,21 @@
+chr1	1541789	1541790	C/T	ENST00000355826	Phe:Leu/Phe	15	T
+chr1	1541789	1541790	C/T	ENST00000357882	Phe:Leu/Phe	15	T
+chr1	1541789	1541790	C/T	ENST00000357882	Phe:Leu/Phe	15	T
+chr1	1541789	1541790	C/T	ENST00000378710	Phe:Leu/Phe	15	T
+chr1	1548654	1548655	C	ENST00000355826	Met:Thr	45	T
+chr1	1548654	1548655	C	ENST00000357882	Met:Thr	45	T
+chr1	1548654	1548655	C	ENST00000357882	Met:Thr	45	T
+chr1	1548654	1548655	C	ENST00000378710	Met:Thr	45	T
+chr1	1675899	1675900	C	ENST00000341991	Asn:Lys	262	G
+chr1	1675899	1675900	C	ENST00000378625	Asn:Lys	407	G
+chr1	1675899	1675900	C	ENST00000341426	Asn:Lys	262	G
+chr1	1675899	1675900	C	ENST00000344463	Asn:Lys	407	G
+chr1	1837838	1837839	C/T	ENST00000307786	Trp:Arg/Trp	60	T
+chr1	1837838	1837839	C/T	ENST00000378604	Trp:Arg/Trp	43	T
+chr1	1839388	1839389	T	ENST00000378602	Met:Lys	126	A
+chr1	1839388	1839389	T	ENST00000310991	Met:Lys	141	A
+chr1	1839389	1839390	C/T	ENST00000378602	Met:Val/Met	126	T
+chr1	1839389	1839390	C/T	ENST00000310991	Met:Val/Met	141	T
+chr1	1844405	1844406	C/T	ENST00000378596	Glu:Glu/Lys	187	C
+chr1	1876878	1876879	G	ENST00000270720	Stop:Gln	763	A
+chr1	1890091	1890092	C/T	ENST00000270720	Ile:Val/Ile	363	T
diff -r a67ca0795efb -r dfaa18960944 tool-data/codingSnps.loc.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/codingSnps.loc.sample	Tue Mar 23 13:53:21 2010 -0400
@@ -0,0 +1,25 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of nib genome files for use with codingSnps.  You will 
+#need to supply these files and then create a codingSnps.loc file 
+#similar to this one (store it in this directory) that points to 
+#the directories in which those files are stored. The codingSnps.loc 
+#file has this format (white space characters are TAB characters):
+#
+#<build>	<file_path>
+#
+#So, for example, if your codingSnps.loc began like this:
+#
+#hg18	/afs/bx.psu.edu/depot/data/genome/hg18/
+
+#
+#then your /afs/bx.psu.edu/depot/data/genome/hg18/ directory
+#would need to contain the following 2bit file:
+#
+#
+#-rw-r--r--   1 g2data   g2data   807604784 Dec  8 13:21 hg18.2bit
+
+#Your codingSnps.loc file should include an entry per line for 
+#each file you have stored that you want to be available. Note that 
+#your files should all have the extension '2bit'. 
+
+hg18	/afs/bx.psu.edu/depot/data/genome/hg18
diff -r a67ca0795efb -r dfaa18960944 tool_conf.xml.sample
--- a/tool_conf.xml.sample	Tue Mar 23 13:15:56 2010 -0400
+++ b/tool_conf.xml.sample	Tue Mar 23 13:53:21 2010 -0400
@@ -161,6 +161,7 @@
     <tool file="hyphy/hyphy_nj_tree_wrapper.xml" />
     <tool file="hyphy/hyphy_dnds_wrapper.xml" />
     <tool file="evolution/mutate_snp_codon.xml" />
+    <tool file="evolution/codingSnps.xml" />
  </section>
  <section name="Metagenomic analyses" id="tax_manipulation">
     <tool file="taxonomy/gi2taxonomy.xml" />
diff -r a67ca0795efb -r dfaa18960944 tools/evolution/codingSnps.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/evolution/codingSnps.pl	Tue Mar 23 13:53:21 2010 -0400
@@ -0,0 +1,541 @@
+#!/usr/bin/perl -w 
+use strict;
+
+#########################################################################
+#	codingSnps.pl
+#	This takes a bed file with the names being / separated nts
+#	and a gene bed file with cds start and stop.
+#	It then checks for changes in coding regions, reporting
+#	those that cause a frameshift or substitution in the amino acid.
+#########################################################################
+
+my $uniq    = 0;	# flag for whether want uniq positions
+my $syn     = 0;	# flag for if want synonomous changes rather than non-syn
+my $seqFlag = "2bit";	# flag to set sequence type 2bit|nib
+my $nibDir;		# directory containg data
+my $nibTag;		# tag for directory above
+
+################################################################################
+# Parse command line arguments                                                 #
+################################################################################
+
+# make sure we have enough arguments
+if (!@ARGV or scalar @ARGV < 3) {
+   print STDERR "Usage: codingSnps.pl snps.bed genes.bed locfile.loc [chr=# start=# end=# snp=#] output_file\n";
+   exit;
+}
+
+# get first three command line arguments
+my $snpFile  = shift @ARGV;
+my $geneFile = shift @ARGV;
+my $locFile  = shift @ARGV;
+
+# read $locFile to get $nibDir (ignoring commets)
+# FIXME: the last entry is the one you get
+open(LF, "< $locFile") || die "open($locFile): $!\n";
+while(<LF>) {
+	s/#.*$//;
+	s/(?:^\s+|\s+$)//g;
+	next if (/^$/);
+
+	# $tag and $path are set for each "valid" line in the file
+	($nibTag, $nibDir) = split(/\t/);
+}
+close(LF);
+
+# bed like columns in default positions
+my $col0 = 0;
+my $col1 = 1;
+my $col2 = 2;
+my $col3 = 3;
+
+# get column positions for chr, start, end, snp
+# column positions 1 based coming in (for Galaxy)
+foreach (@ARGV) {
+	if (/^chr=(\d+)$/) {
+		$col0 = $1 - 1;
+	} elsif (/^start=(\d+)$/) {
+		$col1 = $1 - 1;
+	} elsif (/^end=(\d+)$/) {
+		$col2 = $1 - 1;
+	} elsif (/^snp=(\d+)$/) {
+		$col3 = $1 - 1;
+	}
+}
+
+# make sure the column positions are sane
+if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) {
+   print STDERR "ERROR column numbers are given with origin 1\n";
+   exit 1;
+}
+
+# get the output_file from the command line arguments
+my $outFile = $ARGV[$#ARGV];
+
+open(OUTFILE, "> $outFile") || die "open($outFile): $!\n";
+
+################################################################################
+# Initialization                                                               #
+################################################################################
+
+my @genes; #bed lines for genes, sorted by chrom and start
+my %chrSt; #index in array where each chrom starts
+my %codon; #hash of codon amino acid conversions
+my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom
+my $ignoreN = 1; #skip N
+
+my %amb = (
+"R" => "A/G",
+"Y" => "C/T",
+"S" => "C/G",
+"W" => "A/T",
+"K" => "G/T",
+"M" => "A/C",
+"B" => "C/G/T",
+"D" => "A/G/T",
+"H" => "A/C/T",
+"V" => "A/C/G",
+"N" => "A/C/G/T"
+);
+
+fill_codon();
+
+################################################################################
+# Main                                                                         #
+################################################################################
+
+open(FH, "cat $geneFile | sort -k1,1 -k2,2n |") 
+   or die "Couldn't open and sort $geneFile, $!\n";
+my $i = 0;
+while(<FH>) {
+   chomp;
+   if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; }
+   push(@genes, "$_");
+   my @f = split(/\t/);
+   if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; }
+   $i++;
+}
+close FH or die "Couldn't close $geneFile, $!\n";
+
+if ($ends) { print STDERR "TESTING using block ends rather than sizes\n"; }
+
+#open snps sorted as well
+my $s1 = $col0 + 1; #sort order is origin 1
+my $s2 = $col1 + 1; 
+open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |")
+   or die "Couldn't open and sort $snpFile, $!\n";
+$i = 0;
+my @g; #one genes fields, should be used repeatedly
+my %done;
+while(<FH>) {
+   chomp;
+   my @s = split(/\t/); #SNP fields
+   if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; }
+   my $size = $#s;
+   if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) {
+      print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n";
+      exit 1;
+   }
+   if ($s[$col1] =~ /\D/) { 
+      print STDERR "ERROR the start point must be an integer not $s[$col1]\n";
+      exit 1;
+   }
+   if ($s[$col2] =~ /\D/) {
+      print STDERR "ERROR the start point must be an integer not $s[$col2]\n";
+      exit 1;
+   }
+   if ($s[$col3] eq 'N' && $ignoreN) { next; }
+   if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; }
+   if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row
+      $i = $chrSt{$s[$col0]};
+      @g = split(/\t/, $genes[$i]);
+   }elsif (!@g) { 
+      next; #no gene for this chrom
+   }elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new chrom 
+      $i = $chrSt{$s[$col0]};
+      @g = split(/\t/, $genes[$i]);
+   }elsif ($s[$col0] ne $g[0]) {
+      next; #no gene for this chrom
+   }elsif ($s[$col1] < $g[1] && $i == $chrSt{$s[$col0]}) {
+      next; #before any genes
+   }elsif ($s[$col1] > $g[2] && ($i == $#genes or $genes[$i+1] !~ $s[$col0])) {
+      next; #after all genes on chr
+   }else {
+      while ($s[$col1] > $g[2] && $i < $#genes) {
+         $i++;
+         @g = split(/\t/, $genes[$i]);
+         if ($s[$col0] ne $g[0]) { last; } #end of gene
+      }
+      if ($s[$col0] ne $g[0] or $s[$col1] < $g[1] or $s[$col1] > $g[2]) {
+         next; #no overlap with genes
+      }
+   }
+
+   processSnp(\@s, \@g);
+   if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { next; }
+
+   my $k = $i + 1; #check for more genes without losing data of first
+   if ($k <= $#genes) {
+      my @g2 = split(/\t/, $genes[$k]);
+      while (@g2 && $k <= $#genes) {
+         @g2 = split(/\t/, $genes[$k]);
+         if ($s[$col0] ne $g2[0]) {
+            undef @g2;
+            last; #not same chrom
+         }else {
+            while ($s[$col1] > $g2[2] && $k <= $#genes) {
+               $k++;
+               @g2 = split(/\t/, $genes[$k]);
+               if ($s[$col0] ne $g2[0]) { last; } #end of chrom
+            }
+            if ($s[$col0] ne $g2[0] or $s[$col1] < $g2[1] or $s[$col1] > $g2[2]) {
+               undef @g2;
+               last; #no overlap with more genes
+            }
+            processSnp(\@s, \@g2);
+            if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { last; }
+         }      
+         $k++;
+      }
+   }
+}
+close FH or die "Couldn't close $snpFile, $!\n";
+close(OUTFILE) || die "close($outFile): $!\n";
+
+exit;
+
+################################################################################
+# Subroutines                                                                  #
+################################################################################
+
+sub processSnp {
+   my $sref = shift;
+   my $gref = shift;
+   #overlaps gene, but maybe not coding seq
+   #inside cds
+   if ($sref->[$col1] + 1 < $gref->[6] or $sref->[$col2] > $gref->[7]) {
+      return; #outside of coding 
+   }
+   #now check exon
+   my $i = 0;
+   my @st = split(/,/, $gref->[11]);
+   my @size = split(/,/, $gref->[10]);
+   if (scalar @st ne $gref->[9]) { die "bad gene $gref->[3]\n"; }
+   my @pos;
+   my $in = 0;
+   for($i = 0; $i < $gref->[9]; $i++) {
+      my $sta = $gref->[1] + $st[$i] + 1; #1 based position
+      my $end = $sta + $size[$i] - 1; #
+      if ($ends) { $end = $size[$i]; $sta = $st[$i] + 1; } #ends instead of sizes
+      if ($end < $gref->[6]) { next; } #utr only
+      if ($sta > $gref->[7]) { next; } #utr only
+      #shorten to coding only
+      if ($sta < $gref->[6]) { $sta = $gref->[6] + 1; }
+      if ($end > $gref->[7]) { $end = $gref->[7]; }
+      if ($sref->[$col1] + 1 >= $sta && $sref->[$col2] <= $end) { $in = 1; }
+      elsif ($sref->[$col1] == $sref->[$col2] && $sref->[$col2] <= $end && $sref->[$col2] >= $sta) { $in = 1; }
+      push(@pos, ($sta .. $end)); #add exon worth of positions
+   }
+   #@pos has coding positions for whole gene (chr coors), 
+   #and $in has whether we need to continue
+   if (!$in) { return; } #not in coding exon
+   if ((scalar @pos) % 3 != 0) { return; } #partial gene? not even codons
+   if ($sref->[$col3] =~ /^-+\/[ACTG]+$/ or $sref->[$col3] =~ /^[ACTG]+\/-+$/ or
+       $sref->[$col3] =~ /^-+$/) { #indel or del
+      my $copy = $sref->[$col3];
+      my $c = ($copy =~ tr/-//);
+      if ($c % 3 == 0) { return; } #not frameshift 
+      #handle bed4 to bed4 + 4 (pgSnp)
+      print OUTFILE "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
+      #if ($sref->[4]) { print "\t$sref->[4]"; }
+      #if ($sref->[5]) { print "\t$sref->[5]"; }
+      #if ($sref->[6]) { print "\t$sref->[6]"; }
+      #if ($sref->[7]) { print "\t$sref->[7]"; }
+      print OUTFILE "\t$gref->[3]\tframeshift\n";
+      $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
+      return;
+   }elsif ($sref->[$col1] == $sref->[$col2]) { #insertion
+      my $copy = $sref->[$col3];
+      my $c = ($copy =~ tr/\[ACTG]+//);
+      if ($c % 3 == 0) { return; } #not frameshift
+      #handle bed4 to bed4 + 4 (pgSnp)
+      print OUTFILE "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
+      #if ($sref->[4]) { print "\t$sref->[4]"; }
+      #if ($sref->[5]) { print "\t$sref->[5]"; }
+      #if ($sref->[6]) { print "\t$sref->[6]"; }
+      #if ($sref->[7]) { print "\t$sref->[7]"; }
+      print OUTFILE "\t$gref->[3]\tframeshift\n";
+      $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
+      return;
+   }
+   #check for amino acid substitutions
+   my $s = $sref->[$col1] + 1;
+   my $e = $sref->[$col2];
+   my $len = $sref->[$col2] - $sref->[$col1];
+   if ($gref->[5] eq '-') { 
+      @pos = reverse(@pos); 
+      my $t = $s;
+      $s = $e;
+      $e = $t;
+   }
+   $i = 0;
+   my $found = 0;
+   foreach (@pos) {
+      if ($s == $_) {
+         $found = 1;
+         last;
+      }
+      $i++;
+   }
+   if ($found) {
+      my $fs = $i; #keep original start index
+      #have index where substitution starts
+      my $cp = $i % 3; 
+      $i -= $cp; #i is now first position in codon
+      my $cdNum = int($i / 3) + 1;
+      my $ls = $i;
+      if (!defined $ls) { die "ERROR not defined ls for $fs $sref->[$col2]\n"; }
+      if (!@pos) { die "ERROR not defined array pos\n"; }
+      if (!defined $pos[$ls]) { die "ERROR not defined pos at $ls\n"; }
+      if (!defined $e) { die "ERROR not defined e for $pos[0] $pos[1] $pos[2]\n"; }
+      while ($ls <= $#pos && $pos[$ls] ne $e) { 
+         $ls++; 
+      }
+      my $i2 = $ls + (2 - ($ls % 3));
+      if ($i2 > $#pos) { return; } #not a full codon, partial gene?
+
+      if ($i2 - $i < 2) { die "not a full codon positions $i to $i2 for $sref->[3]\n"; }
+      my $oldnts = getnts($sref->[$col0], @pos[$i..$i2]);
+      if (!$oldnts) { die "Failed to get sequence for $sref->[$col0] $pos[$i] .. $pos[$i2]\n"; }
+      my @vars = split(/\//, $sref->[$col3]);
+      if ($gref->[5] eq '-') { #complement oldnts and revcomp vars
+         $oldnts = compl($oldnts);
+         $oldnts = join('', (reverse(split(/ */, $oldnts))));
+         foreach (@vars) {
+            $_ = reverse(split(/ */));
+            $_ = compl($_);
+         }
+      }
+      my $r = $fs - $i; #difference in old indexes gives new index
+      my @newnts;
+      my $changed = '';
+      foreach my $v (@vars) {
+         my @new = split(/ */, $oldnts);
+         $changed = splice(@new, $r, $len, split(/ */, $v));
+         #should only change single nt
+         push(@newnts, join("", @new));
+      }
+      #now compute amino acids
+      my $oldaa = getaa($oldnts);
+      my @newaa;
+      my $change = 0; #flag for if there is a change
+      foreach my $v (@newnts) {
+         my $t = getaa($v);
+         if ($t ne $oldaa) { $change = 1; }
+         push(@newaa, $t);
+      }
+      if (!$change && $syn) { 
+          print OUTFILE "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
+          print OUTFILE "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n";
+          return;
+      }elsif ($syn) { return; } #only want synonymous changes
+      if (!$change) { return; } #no change in amino acids
+#if (abs($pos[$i] - $pos[$i2]) > 200) {
+#print STDERR "TESTING found mutation at splice site $sref->[0]\t$sref->[1]\t$sref->[2]\n";
+#print STDERR "old $oldaa, new ", join(', ', @newaa), "\n";
+#print STDERR "oldnt $oldnts, strand $gref->[5]\n";
+#exit;
+#}
+      print OUTFILE "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
+      #if (defined $sref->[4]) { print "\t$sref->[4]"; }
+      #if (defined $sref->[5]) { print "\t$sref->[5]"; }
+      #if (defined $sref->[6]) { print "\t$sref->[6]"; }
+      #if (defined $sref->[7]) { print "\t$sref->[7]"; }
+      if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref
+      print OUTFILE "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n";
+      $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
+   }
+}
+
+sub getnts {
+   my $chr = shift;
+   my @pos = @_; #list of positions not necessarily in order
+   #list may be reversed or have gaps(introns), at least 3 bps
+   my $seq = '';
+   if (scalar @pos < 3) { die "too small region for $chr $pos[0]\n"; }
+   if ($pos[0] < $pos[1]) { #not reversed
+      my $s = $pos[0];
+      for(my $i = 1; $i <= $#pos; $i++) {
+         if ($pos[$i] == $pos[$i-1] + 1) { next; }
+         if ($seqFlag eq '2bit') { 
+            $seq .= fetchSeq2bit($chr, $s, $pos[$i-1]);
+         }else {
+            $seq .= fetchSeqNib($chr, $s, $pos[$i-1]);
+         }
+         $s = $pos[$i];
+      }
+      if (length $seq != scalar @pos) { #still need to fetch seq
+#if (abs($pos[$#pos]-$pos[0]) > 200) {
+#print STDERR "TESTING have split codon $chr $pos[0] $pos[$#pos]\n";
+#exit;
+#}
+         if ($seqFlag eq '2bit') {
+            $seq .= fetchSeq2bit($chr, $s, $pos[$#pos]);
+         }else {
+            $seq .= fetchSeqNib($chr, $s, $pos[$#pos]);
+         }
+      }
+   }else { #reversed
+      my $s = $pos[$#pos];
+      for(my $i = $#pos -1; $i >= 0; $i--) {
+         if ($pos[$i] == $pos[$i+1] + 1) { next; }
+         if ($seqFlag eq '2bit') {
+            $seq .= fetchSeq2bit($chr, $s, $pos[$i+1]);
+         }else {
+            $seq .= fetchSeqNib($chr, $s, $pos[$i+1]);
+         }
+         $s = $pos[$i];
+      }
+      if (length $seq != scalar @pos) { #still need to fetch seq
+#if (abs($pos[$#pos]-$pos[0]) > 200) {
+#print STDERR "TESTING have split codon $pos[0] .. $pos[$#pos]\n";
+#}
+         if ($seqFlag eq '2bit') {
+            $seq .= fetchSeq2bit($chr, $s, $pos[0]);
+         }else {
+            $seq .= fetchSeqNib($chr, $s, $pos[0]);
+         }
+      }
+   }
+}
+
+sub fetchSeq2bit {
+   my $chr = shift;
+   my $st = shift;
+   my $end = shift;
+   my $strand = '+';
+   $st--; #change to UCSC numbering
+   open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir/$nibTag.2bit stdout |") or
+      die "Couldn't run twoBitToFa, $!\n";
+   my $seq = '';
+   while (<BIT>) {
+      chomp;
+      if (/^>/) { next; } #header
+      $seq .= $_;
+   }
+   close BIT or die "Couldn't finish nibFrag on $chr $st $end, $!\n";
+   return $seq;
+}
+
+sub fetchSeqNib {
+   my $chr = shift;
+   my $st = shift;
+   my $end = shift;
+   my $strand = '+';
+   $st--; #change to UCSC numbering
+   open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n";
+   my $seq = '';
+   while (<NIB>) {
+      chomp;
+      if (/^>/) { next; } #header
+      $seq .= $_;
+   }
+   close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n";
+   return $seq;
+}
+
+sub compl {
+   my $nts = shift;
+   my $comp = '';
+   foreach my $n (split(/ */, $nts)) {
+      if ($n eq 'A') { $comp .= 'T'; }
+      elsif ($n eq 'T') { $comp .= 'A'; }
+      elsif ($n eq 'C') { $comp .= 'G'; }
+      elsif ($n eq 'G') { $comp .= 'C'; }
+      elsif ($n eq 'N') { $comp .= 'N'; }
+      elsif ($n eq '-') { $comp .= '-'; } #deletion
+      else { die "Couldn't do complement of $n for $nts\n"; }
+   }
+   return $comp;
+}
+
+sub getaa {
+   my $nts = shift;  #in multiples of 3
+   my $aa = '';
+   my @n = split(/ */, $nts);
+   while (@n) {
+      my @t = splice(@n, 0, 3);
+      my $n = uc(join("", @t));
+      if (!exists $codon{$n}) { $aa .= 'N'; next; }
+      $aa .= $codon{$n};
+   }
+   return $aa;
+}
+
+sub fill_codon {
+	$codon{GCA} = 'Ala';
+	$codon{GCC} = 'Ala';
+	$codon{GCG} = 'Ala';
+	$codon{GCT} = 'Ala';
+	$codon{CGG} = 'Arg';
+	$codon{CGT} = 'Arg';
+	$codon{CGC} = 'Arg';
+	$codon{AGA} = 'Arg';
+	$codon{AGG} = 'Arg';
+	$codon{CGA} = 'Arg';
+	$codon{AAC} = 'Asn';
+	$codon{AAT} = 'Asn';
+	$codon{GAC} = 'Asp';
+	$codon{GAT} = 'Asp';
+	$codon{TGC} = 'Cys';
+	$codon{TGT} = 'Cys';
+	$codon{CAG} = 'Gln';
+	$codon{CAA} = 'Gln';
+	$codon{GAA} = 'Glu';
+	$codon{GAG} = 'Glu';
+	$codon{GGG} = 'Gly';
+	$codon{GGA} = 'Gly';
+	$codon{GGC} = 'Gly';
+	$codon{GGT} = 'Gly';
+	$codon{CAC} = 'His';
+	$codon{CAT} = 'His';
+	$codon{ATA} = 'Ile';
+	$codon{ATT} = 'Ile';
+	$codon{ATC} = 'Ile';
+	$codon{CTA} = 'Leu';
+	$codon{CTC} = 'Leu';
+	$codon{CTG} = 'Leu';
+	$codon{CTT} = 'Leu';
+	$codon{TTG} = 'Leu';
+	$codon{TTA} = 'Leu';
+	$codon{AAA} = 'Lys';
+	$codon{AAG} = 'Lys';
+	$codon{ATG} = 'Met';
+	$codon{TTC} = 'Phe';
+	$codon{TTT} = 'Phe';
+	$codon{CCT} = 'Pro';
+	$codon{CCA} = 'Pro';
+	$codon{CCC} = 'Pro';
+	$codon{CCG} = 'Pro';
+	$codon{TCA} = 'Ser';
+	$codon{AGC} = 'Ser';
+	$codon{AGT} = 'Ser';
+	$codon{TCC} = 'Ser';
+	$codon{TCT} = 'Ser';
+	$codon{TCG} = 'Ser';
+	$codon{TGA} = 'Stop';
+	$codon{TAG} = 'Stop';
+	$codon{TAA} = 'Stop';
+	$codon{ACT} = 'Thr';
+	$codon{ACA} = 'Thr';
+	$codon{ACC} = 'Thr';
+	$codon{ACG} = 'Thr';
+	$codon{TGG} = 'Trp';
+	$codon{TAT} = 'Tyr';
+	$codon{TAC} = 'Tyr';
+	$codon{GTC} = 'Val';
+	$codon{GTA} = 'Val';
+	$codon{GTG} = 'Val';
+	$codon{GTT} = 'Val';
+}
+	
diff -r a67ca0795efb -r dfaa18960944 tools/evolution/codingSnps.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/evolution/codingSnps.xml	Tue Mar 23 13:53:21 2010 -0400
@@ -0,0 +1,89 @@
+<tool id="codingSnps" name="Amino-acid changes">
+  <description>caused by a set of SNPs</description>
+  <command interpreter="perl">
+    codingSnps.pl $input1 $input2 ${GALAXY_DATA_INDEX_DIR}/codingSnps.loc chr=${input1.metadata.chromCol} start=${input1.metadata.startCol} end=${input1.metadata.endCol} snp=$col1 $out_file1
+  </command>
+  <inputs>
+    <param format="interval" name="input1" type="data" label="SNPs"/>
+    <param format="interval" name="input2" type="data" label="genes"/>
+    <param name="col1" label="Column with SNPs" type="data_column" data_ref="input1" />
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" />
+  </outputs>
+  <code file="codingSnps_filter.py"></code>
+  <requirements>
+    <requirement type="binary">twoBitToFa</requirement>
+  </requirements>
+  <tests>
+    <test>
+	<param name="input1" value="codingSnps_input1.interval" dbkey="hg18" />
+	<param name="input2" value="codingSnps_input2.bed" dbkey="hg18" />
+	<param name="col1" value="6" />
+	<output name="output" file="codingSnps_output.interval" />
+    </test>
+  </tests>
+
+  <help>
+This tool identifies which SNPs create amino-acid changes in the specified coding regions.
+
+**Example**
+
+- first input file, with SNPs::
+
+    chr22       14440426        14440427        C/T
+    chr22       14494851        14494852        A/G
+    chr22       14494911        14494912        A/T
+    chr22       14550435        14550436        A/G
+    chr22       14611956        14611957        G/T
+    chr22       14612076        14612077        A/G
+    chr22       14668537        14668538        C
+    chr22       14668703        14668704        A/T
+    chr22       14668775        14668776        G
+    chr22       14680074        14680075        A/T
+    etc.
+
+    alternatively indicating polymorphisms using ambiguous-nucleotide symbols:
+
+    chr22       14440426        14440427        Y
+    chr22       14494851        14494852        R
+    chr22       14494911        14494912        W
+    chr22       14550435        14550436        R
+    chr22       14611956        14611957        K
+    chr22       14612076        14612077        R
+    chr22       14668537        14668538        C
+    chr22       14668703        14668704        W
+    chr22       14668775        14668776        G
+    chr22       14680074        14680075        W
+    etc.
+
+- second input file, with UCSC annotations for human genes::
+
+    chr22   14504263        14572999        uc002zkr.2      0       -       14504263        14504263        0       5       710,91,136,138,94,      0,38133,62547,62901,68642,
+    chr22   14527995        14572999        uc010gqo.1      0       -       14527995        14527995        0       4       3826,91,136,94, 0,14401,38815,44910,
+    chr22   14542065        14552264        uc002zkt.2      0       +       14542065        14542065        0       3       323,88,313,     0,2416,9886,
+    chr22   14559619        14561004        uc002zku.2      0       -       14559619        14559619        0       1       1385,   0,
+    chr22   14567164        14572999        uc002zkv.2      0       -       14567164        14567164        0       5       138,112,115,111,94,     0,1867,2099,3516,5741,
+    chr22   14620243        14620281        uc002zkw.1      0       -       14620243        14620243        0       1       38,     0,
+    chr22   14620300        14620339        uc002zkx.1      0       -       14620300        14620300        0       1       39,     0,
+    chr22   14621086        14621125        uc002zky.1      0       +       14621086        14621086        0       1       39,     0,
+    chr22   14622000        14622030        uc002zkz.1      0       -       14622000        14622000        0       1       30,     0,
+    chr22   14623380        14623414        uc002zla.1      0       -       14623380        14623380        0       1       34,     0,
+    etc.
+
+- output file, showing non-synonymous substitutions in coding regions::
+
+    chr22    15452482        15452483        G       uc002zlp.1      Trp:Arg            320     A
+    chr22    15644564        15644565        T       uc002zlv.1      His:Asn            442     G
+    chr22    15645123        15645124        C       uc002zlv.1      Phe:Leu            255     A
+    chr22    15645193        15645194        A/G     uc002zlv.1      Pro:Leu/Pro        232     G
+    chr22    15660821        15660822        A/G     uc002zlv.1      Thr:Met/Thr        143     G
+    chr22    15969208        15969209        C/T     uc002zly.1      Ala:Ala/Val        367     C
+    chr22    15969208        15969209        C/T     uc010gqt.1      Ala:Ala/Val        315     C
+    chr22    15999075        15999076        C/G     uc002zmd.1      Arg:Arg/Ser        180     C
+    chr22    15999075        15999076        C/G     uc002zme.1      Arg:Arg/Ser        161     C
+    chr22    15999075        15999076        C/G     uc002zmf.1      Arg:Arg/Ser        369     C
+    etc.
+
+</help>
+</tool>
diff -r a67ca0795efb -r dfaa18960944 tools/evolution/codingSnps_filter.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/evolution/codingSnps_filter.py	Tue Mar 23 13:53:21 2010 -0400
@@ -0,0 +1,43 @@
+# runs after the job (and after the default post-filter)
+import os
+from galaxy import eggs
+from galaxy import jobs
+from galaxy.tools.parameters import DataToolParameter
+# Older py compatibility
+try:
+    set()
+except:
+    from sets import Set as set
+
+def validate_input( trans, error_map, param_values, page_param_map ):
+    dbkeys = set()
+    data_param_names = set()
+    data_params = 0
+    for name, param in page_param_map.iteritems():
+        if isinstance( param, DataToolParameter ):
+            # for each dataset parameter
+            if param_values.get(name, None) != None:
+                dbkeys.add( param_values[name].dbkey )
+                data_params += 1
+                # check meta data
+                try:
+                    param = param_values[name]
+                    startCol = int( param.metadata.startCol )
+                    endCol = int( param.metadata.endCol )
+                    chromCol = int( param.metadata.chromCol )
+                    if param.metadata.strandCol is not None:
+                        strandCol = int ( param.metadata.strandCol )
+                    else:
+                        strandCol = 0
+                except:
+                    error_msg = "The attributes of this dataset are not properly set. " + \
+                    "Click the pencil icon in the history item to set the chrom, start, end and strand columns."
+                    error_map[name] = error_msg
+            data_param_names.add( name )
+    if len( dbkeys ) > 1:
+        for name in data_param_names:
+            error_map[name] = "All datasets must belong to same genomic build, " \
+                "this dataset is linked to build '%s'" % param_values[name].dbkey
+    if data_params != len(data_param_names):
+        for name in data_param_names:
+            error_map[name] = "A dataset of the appropriate type is required"

    

Greg Von Kuster

tags

participants (1)