details: http://www.bx.psu.edu/hg/galaxy/rev/b25489f4fb26 changeset: 2695:b25489f4fb26 user: Dan Blankenberg <dan@bx.psu.edu> date: Tue Sep 15 12:04:14 2009 -0400 description: Add a new tool, Mutate by SNP codon, which will take codon position and sequence information that is joined to SNP data and create a 'mutated codon'. 6 file(s) affected in this change: test-data/mutate_snp_codon_in.interval test-data/mutate_snp_codon_out.interval tool_conf.xml.main tool_conf.xml.sample tools/evolution/mutate_snp_codon.py tools/evolution/mutate_snp_codon.xml diffs (193 lines): diff -r 990231e77b88 -r b25489f4fb26 test-data/mutate_snp_codon_in.interval --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mutate_snp_codon_in.interval Tue Sep 15 12:04:14 2009 -0400 @@ -0,0 +1,6 @@ +chr1 58995 58998 NM_001005484 0 + GAA GAA Glu GAA 1177632 28.96 0 2787607 0.422452662804 585 chr1 58996 58997 rs1638318 0 + A A A/G genomic single by-submitter 0 0 unknown exact 3 +chr1 59289 59292 NM_001005484 0 + TTT TTT Phe TTT 714298 17.57 0 1538990 0.464134269878 585 chr1 59290 59291 rs71245814 0 + T T G/T genomic single unknown 0 0 unknown exact 3 +chr1 59313 59316 NM_001005484 0 + AAG AAG Lys AAG 1295568 31.86 0 2289189 0.565950648898 585 chr1 59315 59316 rs2854682 0 - G G C/T genomic single by-submitter 0 0 unknown exact 3 +chr1 59373 59376 NM_001005484 0 + ACA ACA Thr ACA 614523 15.11 0 2162384 0.284187729839 585 chr1 59373 59374 rs2691305 0 - A A C/T genomic single unknown 0 0 unknown exact 3 +chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs2531266 0 + G G C/G genomic single by-submitter 0 0 unknown exact 3 +chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs55874132 0 + G G C/G genomic single unknown 0 0 coding-synon exact 1 diff -r 990231e77b88 -r b25489f4fb26 test-data/mutate_snp_codon_out.interval --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/test-data/mutate_snp_codon_out.interval Tue Sep 15 12:04:14 2009 -0400 @@ -0,0 +1,6 @@ +chr1 58995 58998 NM_001005484 0 + GAA GAA Glu GAA 1177632 28.96 0 2787607 0.422452662804 585 chr1 58996 58997 rs1638318 0 + A A A/G genomic single by-submitter 0 0 unknown exact 3 GGA +chr1 59289 59292 NM_001005484 0 + TTT TTT Phe TTT 714298 17.57 0 1538990 0.464134269878 585 chr1 59290 59291 rs71245814 0 + T T G/T genomic single unknown 0 0 unknown exact 3 TGT +chr1 59313 59316 NM_001005484 0 + AAG AAG Lys AAG 1295568 31.86 0 2289189 0.565950648898 585 chr1 59315 59316 rs2854682 0 - G G C/T genomic single by-submitter 0 0 unknown exact 3 AAA +chr1 59373 59376 NM_001005484 0 + ACA ACA Thr ACA 614523 15.11 0 2162384 0.284187729839 585 chr1 59373 59374 rs2691305 0 - A A C/T genomic single unknown 0 0 unknown exact 3 GCA +chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs2531266 0 + G G C/G genomic single by-submitter 0 0 unknown exact 3 GCC +chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs55874132 0 + G G C/G genomic single unknown 0 0 coding-synon exact 1 GCC diff -r 990231e77b88 -r b25489f4fb26 tool_conf.xml.main --- a/tool_conf.xml.main Tue Sep 15 11:28:50 2009 -0400 +++ b/tool_conf.xml.main Tue Sep 15 12:04:14 2009 -0400 @@ -132,10 +132,11 @@ <tool file="regVariation/best_regression_subsets.xml" /> <tool file="regVariation/rcve.xml" /> </section> - <section name="Evolution: HyPhy" id="hyphy"> + <section name="Evolution" id="hyphy"> <tool file="hyphy/hyphy_branch_lengths_wrapper.xml" /> <tool file="hyphy/hyphy_nj_tree_wrapper.xml" /> <tool file="hyphy/hyphy_dnds_wrapper.xml" /> + <tool file="evolution/mutate_snp_codon.xml" /> </section> <section name="Metagenomic analyses" id="tax_manipulation"> <tool file="taxonomy/gi2taxonomy.xml" /> diff -r 990231e77b88 -r b25489f4fb26 tool_conf.xml.sample --- a/tool_conf.xml.sample Tue Sep 15 11:28:50 2009 -0400 +++ 
b/tool_conf.xml.sample Tue Sep 15 12:04:14 2009 -0400 @@ -152,10 +152,11 @@ <tool file="regVariation/best_regression_subsets.xml" /> <tool file="regVariation/rcve.xml" /> </section> - <section name="Evolution: HyPhy" id="hyphy"> + <section name="Evolution" id="hyphy"> <tool file="hyphy/hyphy_branch_lengths_wrapper.xml" /> <tool file="hyphy/hyphy_nj_tree_wrapper.xml" /> <tool file="hyphy/hyphy_dnds_wrapper.xml" /> + <tool file="evolution/mutate_snp_codon.xml" /> </section> <section name="Metagenomic analyses" id="tax_manipulation"> <tool file="taxonomy/gi2taxonomy.xml" /> diff -r 990231e77b88 -r b25489f4fb26 tools/evolution/mutate_snp_codon.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/evolution/mutate_snp_codon.py Tue Sep 15 12:04:14 2009 -0400 @@ -0,0 +1,73 @@ +#!/usr/bin/env python +""" +Script to mutate SNP codons. +Dan Blankenberg +""" + +import sys, string + +def strandify( fields, column ): + strand = '+' + if column >= 0 and column < len( fields ): + strand = fields[ column ] + if strand not in [ '+', '-' ]: + strand = '+' + return strand + +def main(): + # parse command line + input_file = sys.argv[1] + out = open( sys.argv[2], 'wb+' ) + codon_chrom_col = int( sys.argv[3] ) - 1 + codon_start_col = int( sys.argv[4] ) - 1 + codon_end_col = int( sys.argv[5] ) - 1 + codon_strand_col = int( sys.argv[6] ) - 1 + codon_seq_col = int( sys.argv[7] ) - 1 + + snp_chrom_col = int( sys.argv[8] ) - 1 + snp_start_col = int( sys.argv[9] ) - 1 + snp_end_col = int( sys.argv[10] ) - 1 + snp_strand_col = int( sys.argv[11] ) - 1 + snp_observed_col = int( sys.argv[12] ) - 1 + + max_field_index = max( codon_chrom_col, codon_start_col, codon_end_col, codon_strand_col, codon_seq_col, snp_chrom_col, snp_start_col, snp_end_col, snp_strand_col, snp_observed_col ) + + DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) + skipped_lines = 0 + for line in open( input_file ): + line = line.rstrip( '\n\r' ) + if line and not line.startswith( '#' ): + fields = line.split( '\t' 
) + if max_field_index >= len( fields ): + skipped_lines += 1 + continue + codon_chrom = fields[codon_chrom_col] + codon_start = int( fields[codon_start_col] ) + codon_end = int( fields[codon_end_col] ) + codon_strand = strandify( fields, codon_strand_col ) + codon_seq = fields[codon_seq_col].upper() + + snp_chrom = fields[snp_chrom_col] + snp_start = int( fields[snp_start_col] ) + snp_end = int( fields[snp_end_col] ) + snp_strand = strandify( fields, snp_strand_col ) + snp_observed = fields[snp_observed_col].split( '/' ) + + for observed in snp_observed: + #Extract DNA on neg strand codons will have positions reversed relative to interval positions; i.e. position 0 == position 2 + offset = snp_start - codon_start + if codon_strand == '-': + offset = 2 - offset + assert offset >= 0 and offset <= 2, ValueError( 'Impossible offset determined: %s' % offset ) + + if codon_strand != snp_strand: + #if our SNP is on a different strand than our codon, take complement of provided observed SNP base + observed = observed.translate( DNA_COMP ) + snp_codon = [ char for char in codon_seq ] + snp_codon[offset] = observed.upper() + snp_codon = ''.join( snp_codon ) + + if codon_seq != snp_codon: #only output when we actually have a different codon + out.write( "%s\t%s\n" % ( line, snp_codon ) ) + +if __name__ == "__main__": main() diff -r 990231e77b88 -r b25489f4fb26 tools/evolution/mutate_snp_codon.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/evolution/mutate_snp_codon.xml Tue Sep 15 12:04:14 2009 -0400 @@ -0,0 +1,60 @@ +<tool id="mutate_snp_codon_1" name="Mutate Codons" version="1.0.0"> + <description>with SNPs</description> + <command interpreter="python">mutate_snp_codon.py $input1 $output1 ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} ${input1.metadata.strandCol} $codon_seq_col $snp_chrom_col $snp_start_col $snp_end_col $snp_strand_col $snp_observed_col</command> + <inputs> + <param name="input1" type="data" format="interval" 
label="Interval file with joined SNPs" optional="False" help="The interval metadata for this file should be set for the codon positions."/> + <param name="codon_seq_col" label="Codon Sequence column" type="data_column" data_ref="input1" /> + <param name="snp_chrom_col" label="SNP chromosome column" type="data_column" data_ref="input1" /> + <param name="snp_start_col" label="SNP start column" type="data_column" data_ref="input1" /> + <param name="snp_end_col" label="SNP end column" type="data_column" data_ref="input1" /> + <param name="snp_strand_col" label="SNP strand column" type="data_column" data_ref="input1" /> + <param name="snp_observed_col" label="SNP observed column" type="data_column" data_ref="input1" /> + </inputs> + <outputs> + <data name="output1" format="interval" metadata_source="input1"/> + </outputs> + <tests> + <test> + <param name="input1" value="mutate_snp_codon_in.interval"/> + <param name="codon_seq_col" value="8"/> + <param name="snp_chrom_col" value="17"/> + <param name="snp_start_col" value="18"/> + <param name="snp_end_col" value="19"/> + <param name="snp_strand_col" value="22"/> + <param name="snp_observed_col" value="25"/> + <output name="output1" file="mutate_snp_codon_out.interval" /> + </test> + </tests> + <help> +This tool takes an interval file as input. This input should contain a set of codon locations and corresponding DNA sequence (such as from the *Extract Genomic DNA* tool) joined to SNP locations with observed values (such as *all fields from selected table* from the snp130 table of hg18 at the UCSC Table browser). This interval file should have the metadata (chromosome, start, end, strand) set for the columns containing the locations of the codons. The user needs to specify the columns containing the sequence for the codon as well as the genomic positions and observed values (values should be split by '/') for the SNP data as tool input; SNP positions and sequence substitutions must have a length of exactly 1. 
Only genomic intervals which yield a different sequence string are output. All sequence characters are converted to uppercase during processing. + + For example, using these settings: + + * **metadata** **chromosome**, **start**, **end** and **strand** set to **1**, **2**, **3** and **6**, respectively + * **Codon Sequence column** set to **c8** + * **SNP chromosome column** set to **c17** + * **SNP start column** set to **c18** + * **SNP end column** set to **c19** + * **SNP strand column** set to **c22** + * **SNP observed column** set to **c25** + + with the following input:: + + chr1 58995 58998 NM_001005484 0 + GAA GAA Glu GAA 1177632 28.96 0 2787607 0.422452662804 585 chr1 58996 58997 rs1638318 0 + A A A/G genomic single by-submitter 0 0 unknown exact 3 + chr1 59289 59292 NM_001005484 0 + TTT TTT Phe TTT 714298 17.57 0 1538990 0.464134269878 585 chr1 59290 59291 rs71245814 0 + T T G/T genomic single unknown 0 0 unknown exact 3 + chr1 59313 59316 NM_001005484 0 + AAG AAG Lys AAG 1295568 31.86 0 2289189 0.565950648898 585 chr1 59315 59316 rs2854682 0 - G G C/T genomic single by-submitter 0 0 unknown exact 3 + chr1 59373 59376 NM_001005484 0 + ACA ACA Thr ACA 614523 15.11 0 2162384 0.284187729839 585 chr1 59373 59374 rs2691305 0 - A A C/T genomic single unknown 0 0 unknown exact 3 + chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs2531266 0 + G G C/G genomic single by-submitter 0 0 unknown exact 3 + chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs55874132 0 + G G C/G genomic single unknown 0 0 coding-synon exact 1 + + + will produce:: + + chr1 58995 58998 NM_001005484 0 + GAA GAA Glu GAA 1177632 28.96 0 2787607 0.422452662804 585 chr1 58996 58997 rs1638318 0 + A A A/G genomic single by-submitter 0 0 unknown exact 3 GGA + chr1 59289 59292 NM_001005484 0 + TTT TTT Phe TTT 714298 17.57 0 1538990 0.464134269878 585 chr1 59290 59291 rs71245814 0 
+ T T G/T genomic single unknown 0 0 unknown exact 3 TGT + chr1 59313 59316 NM_001005484 0 + AAG AAG Lys AAG 1295568 31.86 0 2289189 0.565950648898 585 chr1 59315 59316 rs2854682 0 - G G C/T genomic single by-submitter 0 0 unknown exact 3 AAA + chr1 59373 59376 NM_001005484 0 + ACA ACA Thr ACA 614523 15.11 0 2162384 0.284187729839 585 chr1 59373 59374 rs2691305 0 - A A C/T genomic single unknown 0 0 unknown exact 3 GCA + chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs2531266 0 + G G C/G genomic single by-submitter 0 0 unknown exact 3 GCC + chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs55874132 0 + G G C/G genomic single unknown 0 0 coding-synon exact 1 GCC + </help> +</tool>