details: http://www.bx.psu.edu/hg/galaxy/rev/2f4807d6c38c changeset: 3578:2f4807d6c38c user: Dan Blankenberg <dan@bx.psu.edu> date: Mon Mar 29 14:54:23 2010 -0400 description: Add ability for reverse complement in e.g. FASTQ Manipulation tool to handle ambiguity codes. diffstat: lib/galaxy_utils/sequence/transform.py | 7 +++---- test-data/misc_dna_as_sanger_rev_comp_1.fastqsanger | 16 ++++++++++++++++ test-data/misc_dna_original_sanger.fastqsanger | 16 ++++++++++++++++ test-data/misc_rna_as_sanger_rev_comp_1.fastqsanger | 16 ++++++++++++++++ test-data/misc_rna_original_sanger.fastqsanger | 16 ++++++++++++++++ tools/fastq/fastq_manipulation.xml | 20 ++++++++++++++++++++ 6 files changed, 87 insertions(+), 4 deletions(-) diffs (127 lines): diff -r d34c1a8df003 -r 2f4807d6c38c lib/galaxy_utils/sequence/transform.py --- a/lib/galaxy_utils/sequence/transform.py Mon Mar 29 14:33:23 2010 -0400 +++ b/lib/galaxy_utils/sequence/transform.py Mon Mar 29 14:54:23 2010 -0400 @@ -2,10 +2,9 @@ #Contains methods to tranform sequence strings import string -#FIXME: This should Handle ambiguity codes... -#Translation table for reverse Complement -DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" ) -RNA_COMPLEMENT = string.maketrans( "ACGUacgu", "UGCAugca" ) +#Translation table for reverse Complement, with ambiguity codes +DNA_COMPLEMENT = string.maketrans( "ACGTRYKMBDHVacgtrykmbdhv", "TGCAYRMKVHDBtgcayrmkvhdb" ) +RNA_COMPLEMENT = string.maketrans( "ACGURYKMBDHVacgurykmbdhv", "UGCAYRMKVHDBugcayrmkvhdb" ) #Translation table for DNA <--> RNA DNA_TO_RNA = string.maketrans( "Tt", "Uu" ) RNA_TO_DNA = string.maketrans( "Uu", "Tt" ) diff -r d34c1a8df003 -r 2f4807d6c38c test-data/misc_dna_as_sanger_rev_comp_1.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/misc_dna_as_sanger_rev_comp_1.fastqsanger Mon Mar 29 14:54:23 2010 -0400 @@ -0,0 +1,16 @@ +@FAKE0007 Original version has lower case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order) +TACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT ++ +IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"! +@FAKE0008 Original version has mixed case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order) +cgCTatgAcgCTatgAcgCTatgAcgCTatgAcgCTatgAc ++ +IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"! +@FAKE0009 Original version has lower case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order) +actgactgactgactgactgactgactgactgactgactga ++ +IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"! +@FAKE0010 Original version has mixed case ambiguous DNA and PHRED scores of 40, 30, 20, 10 (cycled) +NHBVDMKSWRYGATCnhbvdmkswrygatc ++ +?I+5?I+5?I+5?I+5?I+5?I+5?I+5?I diff -r d34c1a8df003 -r 2f4807d6c38c test-data/misc_dna_original_sanger.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/misc_dna_original_sanger.fastqsanger Mon Mar 29 14:54:23 2010 -0400 @@ -0,0 +1,16 @@ +@FAKE0007 Original version has lower case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order) +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTA ++ +!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI +@FAKE0008 Original version has mixed case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order) +gTcatAGcgTcatAGcgTcatAGcgTcatAGcgTcatAGcg ++ +!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI +@FAKE0009 Original version has lower case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order) +tcagtcagtcagtcagtcagtcagtcagtcagtcagtcagt ++ +!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI +@FAKE0010 Original version has mixed case ambiguous DNA and PHRED scores of 40, 30, 20, 10 (cycled) +gatcrywsmkhbvdnGATCRYWSMKHBVDN ++ +I?5+I?5+I?5+I?5+I?5+I?5+I?5+I? diff -r d34c1a8df003 -r 2f4807d6c38c test-data/misc_rna_as_sanger_rev_comp_1.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/misc_rna_as_sanger_rev_comp_1.fastqsanger Mon Mar 29 14:54:23 2010 -0400 @@ -0,0 +1,16 @@ +@FAKE0011 Original version has lower case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order) +UACGUACGUACGUACGUACGUACGUACGUACGUACGUACGU ++ +IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"! +@FAKE0012 Original version has mixed case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order) +cgCUaugAcgCUaugAcgCUaugAcgCUaugAcgCUaugAc ++ +IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"! +@FAKE0013 Original version has lower case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order) +acugacugacugacugacugacugacugacugacugacuga ++ +IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"! +@FAKE0014 Original version has mixed case ambiguous RNA with PHRED scores from 35 to 40 inclusive (cycled) +NHBVDMKSWRYGAUCnhbvdmkswrygauc ++ +IHGFEDIHGFEDIHGFEDIHGFEDIHGFED diff -r d34c1a8df003 -r 2f4807d6c38c test-data/misc_rna_original_sanger.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/misc_rna_original_sanger.fastqsanger Mon Mar 29 14:54:23 2010 -0400 @@ -0,0 +1,16 @@ +@FAKE0011 Original version has lower case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order) +ACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUA ++ +!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI +@FAKE0012 Original version has mixed case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order) +gUcauAGcgUcauAGcgUcauAGcgUcauAGcgUcauAGcg ++ +!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI +@FAKE0013 Original version has lower case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order) +ucagucagucagucagucagucagucagucagucagucagu ++ +!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI +@FAKE0014 Original version has mixed case ambiguous RNA with PHRED scores from 35 to 40 inclusive (cycled) +gaucrywsmkhbvdnGAUCRYWSMKHBVDN ++ +DEFGHIDEFGHIDEFGHIDEFGHIDEFGHI diff -r d34c1a8df003 -r 2f4807d6c38c tools/fastq/fastq_manipulation.xml --- a/tools/fastq/fastq_manipulation.xml Mon Mar 29 14:33:23 2010 -0400 +++ b/tools/fastq/fastq_manipulation.xml Mon Mar 29 14:54:23 2010 -0400 @@ -331,6 +331,26 @@ <param name="manipulation_selector" value="rev_comp" /> <output name="output_file" file="sanger_full_range_rev_comp.fastqsanger" /> </test> + <!-- match all and rev comp, with ambiguous DNA --> + <test> + <param name="input_file" value="misc_dna_original_sanger.fastqsanger" ftype="fastqsanger" /> + <param name="match_type_selector" value="identifier" /> + <param name="match_selector" value="regex" /> + <param name="match_by" value=".*" /> + <param name="manipulation_type_selector" value="sequence" /> + <param name="manipulation_selector" value="rev_comp" /> + <output name="output_file" file="misc_dna_as_sanger_rev_comp_1.fastqsanger" /> + </test> + <!-- match all and rev comp, with ambiguous RNA --> + <test> + <param name="input_file" value="misc_rna_original_sanger.fastqsanger" ftype="fastqsanger" /> + <param name="match_type_selector" value="identifier" /> + <param name="match_selector" value="regex" /> + <param name="match_by" value=".*" /> + <param name="manipulation_type_selector" value="sequence" /> + <param name="manipulation_selector" value="rev_comp" /> + <output name="output_file" file="misc_rna_as_sanger_rev_comp_1.fastqsanger" /> + </test> <!-- match first seq and rev comp --> <test> <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />