details: http://www.bx.psu.edu/hg/galaxy/rev/2f4807d6c38c
changeset: 3578:2f4807d6c38c
user: Dan Blankenberg <dan(a)bx.psu.edu>
date: Mon Mar 29 14:54:23 2010 -0400
description:
Add the ability for reverse complement (e.g. in the FASTQ Manipulation tool) to handle ambiguity codes.
diffstat:
lib/galaxy_utils/sequence/transform.py | 7 +++----
test-data/misc_dna_as_sanger_rev_comp_1.fastqsanger | 16 ++++++++++++++++
test-data/misc_dna_original_sanger.fastqsanger | 16 ++++++++++++++++
test-data/misc_rna_as_sanger_rev_comp_1.fastqsanger | 16 ++++++++++++++++
test-data/misc_rna_original_sanger.fastqsanger | 16 ++++++++++++++++
tools/fastq/fastq_manipulation.xml | 20 ++++++++++++++++++++
6 files changed, 87 insertions(+), 4 deletions(-)
diffs (127 lines):
diff -r d34c1a8df003 -r 2f4807d6c38c lib/galaxy_utils/sequence/transform.py
--- a/lib/galaxy_utils/sequence/transform.py Mon Mar 29 14:33:23 2010 -0400
+++ b/lib/galaxy_utils/sequence/transform.py Mon Mar 29 14:54:23 2010 -0400
@@ -2,10 +2,9 @@
#Contains methods to tranform sequence strings
import string
-#FIXME: This should Handle ambiguity codes...
-#Translation table for reverse Complement
-DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
-RNA_COMPLEMENT = string.maketrans( "ACGUacgu", "UGCAugca" )
+#Translation table for reverse Complement, with ambiguity codes
+DNA_COMPLEMENT = string.maketrans( "ACGTRYKMBDHVacgtrykmbdhv", "TGCAYRMKVHDBtgcayrmkvhdb" )
+RNA_COMPLEMENT = string.maketrans( "ACGURYKMBDHVacgurykmbdhv", "UGCAYRMKVHDBugcayrmkvhdb" )
#Translation table for DNA <--> RNA
DNA_TO_RNA = string.maketrans( "Tt", "Uu" )
RNA_TO_DNA = string.maketrans( "Uu", "Tt" )
diff -r d34c1a8df003 -r 2f4807d6c38c test-data/misc_dna_as_sanger_rev_comp_1.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/misc_dna_as_sanger_rev_comp_1.fastqsanger Mon Mar 29 14:54:23 2010 -0400
@@ -0,0 +1,16 @@
+@FAKE0007 Original version has lower case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order)
+TACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
++
+IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@FAKE0008 Original version has mixed case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order)
+cgCTatgAcgCTatgAcgCTatgAcgCTatgAcgCTatgAc
++
+IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@FAKE0009 Original version has lower case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order)
+actgactgactgactgactgactgactgactgactgactga
++
+IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@FAKE0010 Original version has mixed case ambiguous DNA and PHRED scores of 40, 30, 20, 10 (cycled)
+NHBVDMKSWRYGATCnhbvdmkswrygatc
++
+?I+5?I+5?I+5?I+5?I+5?I+5?I+5?I
diff -r d34c1a8df003 -r 2f4807d6c38c test-data/misc_dna_original_sanger.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/misc_dna_original_sanger.fastqsanger Mon Mar 29 14:54:23 2010 -0400
@@ -0,0 +1,16 @@
+@FAKE0007 Original version has lower case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order)
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTA
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
+@FAKE0008 Original version has mixed case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order)
+gTcatAGcgTcatAGcgTcatAGcgTcatAGcgTcatAGcg
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
+@FAKE0009 Original version has lower case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order)
+tcagtcagtcagtcagtcagtcagtcagtcagtcagtcagt
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
+@FAKE0010 Original version has mixed case ambiguous DNA and PHRED scores of 40, 30, 20, 10 (cycled)
+gatcrywsmkhbvdnGATCRYWSMKHBVDN
++
+I?5+I?5+I?5+I?5+I?5+I?5+I?5+I?
diff -r d34c1a8df003 -r 2f4807d6c38c test-data/misc_rna_as_sanger_rev_comp_1.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/misc_rna_as_sanger_rev_comp_1.fastqsanger Mon Mar 29 14:54:23 2010 -0400
@@ -0,0 +1,16 @@
+@FAKE0011 Original version has lower case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order)
+UACGUACGUACGUACGUACGUACGUACGUACGUACGUACGU
++
+IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@FAKE0012 Original version has mixed case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order)
+cgCUaugAcgCUaugAcgCUaugAcgCUaugAcgCUaugAc
++
+IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@FAKE0013 Original version has lower case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order)
+acugacugacugacugacugacugacugacugacugacuga
++
+IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@FAKE0014 Original version has mixed case ambiguous RNA with PHRED scores from 35 to 40 inclusive (cycled)
+NHBVDMKSWRYGAUCnhbvdmkswrygauc
++
+IHGFEDIHGFEDIHGFEDIHGFEDIHGFED
diff -r d34c1a8df003 -r 2f4807d6c38c test-data/misc_rna_original_sanger.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/misc_rna_original_sanger.fastqsanger Mon Mar 29 14:54:23 2010 -0400
@@ -0,0 +1,16 @@
+@FAKE0011 Original version has lower case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order)
+ACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUA
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
+@FAKE0012 Original version has mixed case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order)
+gUcauAGcgUcauAGcgUcauAGcgUcauAGcgUcauAGcg
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
+@FAKE0013 Original version has lower case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order)
+ucagucagucagucagucagucagucagucagucagucagu
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
+@FAKE0014 Original version has mixed case ambiguous RNA with PHRED scores from 35 to 40 inclusive (cycled)
+gaucrywsmkhbvdnGAUCRYWSMKHBVDN
++
+DEFGHIDEFGHIDEFGHIDEFGHIDEFGHI
diff -r d34c1a8df003 -r 2f4807d6c38c tools/fastq/fastq_manipulation.xml
--- a/tools/fastq/fastq_manipulation.xml Mon Mar 29 14:33:23 2010 -0400
+++ b/tools/fastq/fastq_manipulation.xml Mon Mar 29 14:54:23 2010 -0400
@@ -331,6 +331,26 @@
<param name="manipulation_selector" value="rev_comp" />
<output name="output_file" file="sanger_full_range_rev_comp.fastqsanger" />
</test>
+ <!-- match all and rev comp, with ambiguous DNA -->
+ <test>
+ <param name="input_file" value="misc_dna_original_sanger.fastqsanger" ftype="fastqsanger" />
+ <param name="match_type_selector" value="identifier" />
+ <param name="match_selector" value="regex" />
+ <param name="match_by" value=".*" />
+ <param name="manipulation_type_selector" value="sequence" />
+ <param name="manipulation_selector" value="rev_comp" />
+ <output name="output_file" file="misc_dna_as_sanger_rev_comp_1.fastqsanger" />
+ </test>
+ <!-- match all and rev comp, with ambiguous RNA -->
+ <test>
+ <param name="input_file" value="misc_rna_original_sanger.fastqsanger" ftype="fastqsanger" />
+ <param name="match_type_selector" value="identifier" />
+ <param name="match_selector" value="regex" />
+ <param name="match_by" value=".*" />
+ <param name="manipulation_type_selector" value="sequence" />
+ <param name="manipulation_selector" value="rev_comp" />
+ <output name="output_file" file="misc_rna_as_sanger_rev_comp_1.fastqsanger" />
+ </test>
<!-- match first seq and rev comp -->
<test>
<param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />