details: http://www.bx.psu.edu/hg/galaxy/rev/4c95f1a101f1 changeset: 3560:4c95f1a101f1 user: Greg Von Kuster <greg@bx.psu.edu> date: Wed Mar 24 14:14:58 2010 -0400 description: Next rev of codingSnps tool. diffstat: tools/evolution/codingSnps.pl | 268 ++++++++++++++++++---------------------- tools/evolution/codingSnps.xml | 2 +- 2 files changed, 124 insertions(+), 146 deletions(-) diffs (423 lines): diff -r bc2e61c2dac1 -r 4c95f1a101f1 tools/evolution/codingSnps.pl --- a/tools/evolution/codingSnps.pl Tue Mar 23 15:43:58 2010 -0400 +++ b/tools/evolution/codingSnps.pl Wed Mar 24 14:14:58 2010 -0400 @@ -9,75 +9,32 @@ # those that cause a frameshift or substitution in the amino acid. ######################################################################### -my $uniq = 0; # flag for whether want uniq positions -my $syn = 0; # flag for if want synonomous changes rather than non-syn -my $seqFlag = "2bit"; # flag to set sequence type 2bit|nib -my $nibDir; # directory containg data -my $nibTag; # tag for directory above - -################################################################################ -# Parse command line arguments # -################################################################################ - -# make sure we have enough arguments if (!@ARGV or scalar @ARGV < 3) { - print STDERR "Usage: codingSnps.pl snps.bed genes.bed locfile.loc [chr=# start=# end=# snp=#] output_file\n"; + print "Usage: codingSnps.pl snps.bed genes.bed (/dir/nib/|Galaxy build= loc=) [chr=# start=# end=# snp=#] > codingSnps.txt\n"; exit; } - -# get first three command line arguments -my $snpFile = shift @ARGV; +my $uniq = 0; #flag for whether want uniq positions +my $syn = 0; #flag for if want synonomous changes rather than non-syn +my $snpFile = shift @ARGV; my $geneFile = shift @ARGV; -my $locFile = shift @ARGV; - -# read $locFile to get $nibDir (ignoring commets) -# FIXME: the last entry is the one you get -open(LF, "< $locFile") || die "open($locFile): $!\n"; -while(<LF>) { - s/#.*$//; - s/(?:^\s+|\s+$)//g; - next if (/^$/); - - # $tag and $path are set for each "valid" line in the file - ($nibTag, $nibDir) = split(/\t/); -} -close(LF); - -# bed like columns in default positions -my $col0 = 0; +my $nibDir = shift @ARGV; +if ($nibDir eq 'Galaxy') { getGalaxyInfo(); } +my $seqFlag = "2bit"; #flag to set sequence type 2bit|nib +my $col0 = 0; #bed like columns in default positions my $col1 = 1; my $col2 = 2; my $col3 = 3; - -# get column positions for chr, start, end, snp -# column positions 1 based coming in (for Galaxy) +#column positions 1 based coming in (for Galaxy) foreach (@ARGV) { - if (/^chr=(\d+)$/) { - $col0 = $1 - 1; - } elsif (/^start=(\d+)$/) { - $col1 = $1 - 1; - } elsif (/^end=(\d+)$/) { - $col2 = $1 - 1; - } elsif (/^snp=(\d+)$/) { - $col3 = $1 - 1; - } + if (/chr=(\d+)/) { $col0 = $1 -1; } + elsif (/start=(\d+)/) { $col1 = $1 -1; } + elsif (/end=(\d+)/) { $col2 = $1 -1; } + elsif (/snp=(\d+)/) { $col3 = $1 -1; } } - -# make sure the column positions are sane if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) { print STDERR "ERROR column numbers are given with origin 1\n"; exit 1; } - -# get the output_file from the command line arguments -my $outFile = $ARGV[$#ARGV]; - -open(OUTFILE, "> $outFile") || die "open($outFile): $!\n"; - -################################################################################ -# Initialization # -################################################################################ - my @genes; #bed lines for genes, sorted by chrom and start my %chrSt; #index in array where each chrom starts my %codon; #hash of codon amino acid conversions @@ -97,13 +54,7 @@ "V" => "A/C/G", "N" => "A/C/G/T" ); - fill_codon(); - -################################################################################ -# Main # -################################################################################ - open(FH, "cat $geneFile | sort -k1,1 -k2,2n |") or die "Couldn't open and sort $geneFile, $!\n"; my $i = 0; @@ -117,7 +68,7 @@ } close FH or die "Couldn't close $geneFile, $!\n"; -if ($ends) { print STDERR "TESTING using block ends rather than sizes\n"; } +if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; } #open snps sorted as well my $s1 = $col0 + 1; #sort order is origin 1 @@ -129,6 +80,7 @@ my %done; while(<FH>) { chomp; + if (/^\s*#/) { next; } #comment my @s = split(/\t/); #SNP fields if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; } my $size = $#s; @@ -200,14 +152,10 @@ } } close FH or die "Couldn't close $snpFile, $!\n"; -close(OUTFILE) || die "close($outFile): $!\n"; exit; -################################################################################ -# Subroutines # -################################################################################ - +######################################################################## sub processSnp { my $sref = shift; my $gref = shift; @@ -220,7 +168,7 @@ my $i = 0; my @st = split(/,/, $gref->[11]); my @size = split(/,/, $gref->[10]); - if (scalar @st ne $gref->[9]) { die "bad gene $gref->[3]\n"; } + if (scalar @st ne $gref->[9]) { return; } #cant do this gene #die "bad gene $gref->[3]\n"; } my @pos; my $in = 0; for($i = 0; $i < $gref->[9]; $i++) { @@ -246,12 +194,12 @@ my $c = ($copy =~ tr/-//); if ($c % 3 == 0) { return; } #not frameshift #handle bed4 to bed4 + 4 (pgSnp) - print OUTFILE "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; + print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; #if ($sref->[4]) { print "\t$sref->[4]"; } #if ($sref->[5]) { print "\t$sref->[5]"; } #if ($sref->[6]) { print "\t$sref->[6]"; } #if ($sref->[7]) { print "\t$sref->[7]"; } - print OUTFILE "\t$gref->[3]\tframeshift\n"; + print "\t$gref->[3]\tframeshift\n"; $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; return; }elsif ($sref->[$col1] == $sref->[$col2]) { #insertion @@ -259,12 +207,12 @@ my $c = ($copy =~ tr/\[ACTG]+//); if ($c % 3 == 0) { return; } #not frameshift #handle bed4 to bed4 + 4 (pgSnp) - print OUTFILE "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; + print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; #if ($sref->[4]) { print "\t$sref->[4]"; } #if ($sref->[5]) { print "\t$sref->[5]"; } #if ($sref->[6]) { print "\t$sref->[6]"; } #if ($sref->[7]) { print "\t$sref->[7]"; } - print OUTFILE "\t$gref->[3]\tframeshift\n"; + print "\t$gref->[3]\tframeshift\n"; $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; return; } @@ -310,6 +258,7 @@ my @vars = split(/\//, $sref->[$col3]); if ($gref->[5] eq '-') { #complement oldnts and revcomp vars $oldnts = compl($oldnts); + if (!$oldnts) { return; } #skip this one $oldnts = join('', (reverse(split(/ */, $oldnts)))); foreach (@vars) { $_ = reverse(split(/ */)); @@ -320,6 +269,7 @@ my @newnts; my $changed = ''; foreach my $v (@vars) { + if (!$v or length($v) != 1) { return; } #only simple changes my @new = split(/ */, $oldnts); $changed = splice(@new, $r, $len, split(/ */, $v)); #should only change single nt @@ -335,8 +285,8 @@ push(@newaa, $t); } if (!$change && $syn) { - print OUTFILE "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; - print OUTFILE "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n"; + print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; + print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n"; return; }elsif ($syn) { return; } #only want synonymous changes if (!$change) { return; } #no change in amino acids @@ -346,13 +296,14 @@ #print STDERR "oldnt $oldnts, strand $gref->[5]\n"; #exit; #} - print OUTFILE "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; + print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; #if (defined $sref->[4]) { print "\t$sref->[4]"; } #if (defined $sref->[5]) { print "\t$sref->[5]"; } #if (defined $sref->[6]) { print "\t$sref->[6]"; } #if (defined $sref->[7]) { print "\t$sref->[7]"; } if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref - print OUTFILE "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n"; + if (!$changed) { return; } #skip this one + print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n"; $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; } } @@ -415,7 +366,7 @@ my $end = shift; my $strand = '+'; $st--; #change to UCSC numbering - open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir/$nibTag.2bit stdout |") or + open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir stdout |") or die "Couldn't run twoBitToFa, $!\n"; my $seq = ''; while (<BIT>) { @@ -423,7 +374,7 @@ if (/^>/) { next; } #header $seq .= $_; } - close BIT or die "Couldn't finish nibFrag on $chr $st $end, $!\n"; + close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n"; return $seq; } @@ -454,7 +405,7 @@ elsif ($n eq 'G') { $comp .= 'C'; } elsif ($n eq 'N') { $comp .= 'N'; } elsif ($n eq '-') { $comp .= '-'; } #deletion - else { die "Couldn't do complement of $n for $nts\n"; } + else { $comp = undef; } } return $comp; } @@ -473,69 +424,96 @@ } sub fill_codon { - $codon{GCA} = 'Ala'; - $codon{GCC} = 'Ala'; - $codon{GCG} = 'Ala'; - $codon{GCT} = 'Ala'; - $codon{CGG} = 'Arg'; - $codon{CGT} = 'Arg'; - $codon{CGC} = 'Arg'; - $codon{AGA} = 'Arg'; - $codon{AGG} = 'Arg'; - $codon{CGA} = 'Arg'; - $codon{AAC} = 'Asn'; - $codon{AAT} = 'Asn'; - $codon{GAC} = 'Asp'; - $codon{GAT} = 'Asp'; - $codon{TGC} = 'Cys'; - $codon{TGT} = 'Cys'; - $codon{CAG} = 'Gln'; - $codon{CAA} = 'Gln'; - $codon{GAA} = 'Glu'; - $codon{GAG} = 'Glu'; - $codon{GGG} = 'Gly'; - $codon{GGA} = 'Gly'; - $codon{GGC} = 'Gly'; - $codon{GGT} = 'Gly'; - $codon{CAC} = 'His'; - $codon{CAT} = 'His'; - $codon{ATA} = 'Ile'; - $codon{ATT} = 'Ile'; - $codon{ATC} = 'Ile'; - $codon{CTA} = 'Leu'; - $codon{CTC} = 'Leu'; - $codon{CTG} = 'Leu'; - $codon{CTT} = 'Leu'; - $codon{TTG} = 'Leu'; - $codon{TTA} = 'Leu'; - $codon{AAA} = 'Lys'; - $codon{AAG} = 'Lys'; - $codon{ATG} = 'Met'; - $codon{TTC} = 'Phe'; - $codon{TTT} = 'Phe'; - $codon{CCT} = 'Pro'; - $codon{CCA} = 'Pro'; - $codon{CCC} = 'Pro'; - $codon{CCG} = 'Pro'; - $codon{TCA} = 'Ser'; - $codon{AGC} = 'Ser'; - $codon{AGT} = 'Ser'; - $codon{TCC} = 'Ser'; - $codon{TCT} = 'Ser'; - $codon{TCG} = 'Ser'; - $codon{TGA} = 'Stop'; - $codon{TAG} = 'Stop'; - $codon{TAA} = 'Stop'; - $codon{ACT} = 'Thr'; - $codon{ACA} = 'Thr'; - $codon{ACC} = 'Thr'; - $codon{ACG} = 'Thr'; - $codon{TGG} = 'Trp'; - $codon{TAT} = 'Tyr'; - $codon{TAC} = 'Tyr'; - $codon{GTC} = 'Val'; - $codon{GTA} = 'Val'; - $codon{GTG} = 'Val'; - $codon{GTT} = 'Val'; +$codon{GCA} = 'Ala'; +$codon{GCC} = 'Ala'; +$codon{GCG} = 'Ala'; +$codon{GCT} = 'Ala'; +$codon{CGG} = 'Arg'; +$codon{CGT} = 'Arg'; +$codon{CGC} = 'Arg'; +$codon{AGA} = 'Arg'; +$codon{AGG} = 'Arg'; +$codon{CGA} = 'Arg'; +$codon{AAC} = 'Asn'; +$codon{AAT} = 'Asn'; +$codon{GAC} = 'Asp'; +$codon{GAT} = 'Asp'; +$codon{TGC} = 'Cys'; +$codon{TGT} = 'Cys'; +$codon{CAG} = 'Gln'; +$codon{CAA} = 'Gln'; +$codon{GAA} = 'Glu'; +$codon{GAG} = 'Glu'; +$codon{GGG} = 'Gly'; +$codon{GGA} = 'Gly'; +$codon{GGC} = 'Gly'; +$codon{GGT} = 'Gly'; +$codon{CAC} = 'His'; +$codon{CAT} = 'His'; +$codon{ATA} = 'Ile'; +$codon{ATT} = 'Ile'; +$codon{ATC} = 'Ile'; +$codon{CTA} = 'Leu'; +$codon{CTC} = 'Leu'; +$codon{CTG} = 'Leu'; +$codon{CTT} = 'Leu'; +$codon{TTG} = 'Leu'; +$codon{TTA} = 'Leu'; +$codon{AAA} = 'Lys'; +$codon{AAG} = 'Lys'; +$codon{ATG} = 'Met'; +$codon{TTC} = 'Phe'; +$codon{TTT} = 'Phe'; +$codon{CCT} = 'Pro'; +$codon{CCA} = 'Pro'; +$codon{CCC} = 'Pro'; +$codon{CCG} = 'Pro'; +$codon{TCA} = 'Ser'; +$codon{AGC} = 'Ser'; +$codon{AGT} = 'Ser'; +$codon{TCC} = 'Ser'; +$codon{TCT} = 'Ser'; +$codon{TCG} = 'Ser'; +$codon{TGA} = 'Stop'; +$codon{TAG} = 'Stop'; +$codon{TAA} = 'Stop'; +$codon{ACT} = 'Thr'; +$codon{ACA} = 'Thr'; +$codon{ACC} = 'Thr'; +$codon{ACG} = 'Thr'; +$codon{TGG} = 'Trp'; +$codon{TAT} = 'Tyr'; +$codon{TAC} = 'Tyr'; +$codon{GTC} = 'Val'; +$codon{GTA} = 'Val'; +$codon{GTG} = 'Val'; +$codon{GTT} = 'Val'; } - + +sub getGalaxyInfo { + my $build; + my $locFile; + foreach (@ARGV) { + if (/build=(.*)/) { $build = $1; } + elsif (/loc=(.*)/) { $locFile = $1; } + } + if (!$build or !$locFile) { + print STDERR "ERROR missing build or locfile for Galaxy input\n"; + exit 1; + } + # read $locFile to get $nibDir (ignoring commets) + open(LF, "< $locFile") || die "open($locFile): $!\n"; + while(<LF>) { + s/#.*$//; + s/(?:^\s+|\s+$)//g; + next if (/^$/); + + my @t = split(/\t/); + if ($t[0] eq $build) { $nibDir = $t[1]; } + } + close(LF); + if ($nibDir eq 'Galaxy') { + print STDERR "Failed to find sequence directory in locfile $locFile\n"; + } + $nibDir .= "/$build.2bit"; #we want full path and filename +} diff -r bc2e61c2dac1 -r 4c95f1a101f1 tools/evolution/codingSnps.xml --- a/tools/evolution/codingSnps.xml Tue Mar 23 15:43:58 2010 -0400 +++ b/tools/evolution/codingSnps.xml Wed Mar 24 14:14:58 2010 -0400 @@ -1,7 +1,7 @@ <tool id="codingSnps" name="Amino-acid changes"> <description>caused by a set of SNPs</description> <command interpreter="perl"> - codingSnps.pl $input1 $input2 ${GALAXY_DATA_INDEX_DIR}/codingSnps.loc chr=${input1.metadata.chromCol} start=${input1.metadata.startCol} end=${input1.metadata.endCol} snp=$col1 $out_file1 + codingSnps.pl $input1 $input2 Galaxy build=${input1.metadata.dbkey} loc=${GALAXY_DATA_INDEX_DIR}/codingSnps.loc chr=${input1.metadata.chromCol} start=${input1.metadata.startCol} end=${input1.metadata.endCol} snp=$col1 > $out_file1 </command> <inputs> <param format="interval" name="input1" type="data" label="SNPs"/>