galaxy-dev
Threads by month
- ----- 2025 -----
- July
- June
- May
- April
- March
- February
- January
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2020 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2019 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2018 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2017 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2016 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2015 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2014 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2013 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2012 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2011 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2010 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2009 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2008 -----
- December
- November
- October
- September
- August
- 10008 discussions
details: http://www.bx.psu.edu/hg/galaxy/rev/b2a9827178e2
changeset: 1519:b2a9827178e2
user: Dan Blankenberg <dan(a)bx.psu.edu>
date: Fri Sep 19 12:27:20 2008 -0400
description:
Update GMAJ tool interface.
1 file(s) affected in this change:
tools/visualization/GMAJ.xml
diffs (117 lines):
diff -r 0f735b21dc12 -r b2a9827178e2 tools/visualization/GMAJ.xml
--- a/tools/visualization/GMAJ.xml Thu Sep 18 16:48:29 2008 -0400
+++ b/tools/visualization/GMAJ.xml Fri Sep 19 12:27:20 2008 -0400
@@ -3,7 +3,10 @@
<command interpreter="python">GMAJ.py $out_file1 $maf_input $gmaj_file $filenames_file</command>
<inputs>
<param name="maf_input" type="data" format="maf" label="Alignment File" optional="False"/>
- <param name="refseq" label="Reference Sequence" value="" type="text" help="Leave empty to allow interactive selection."/>
+ <param name="refseq" label="Reference Sequence" type="select">
+ <option value="first" selected="true">First sequence in each block</option>
+ <option value="any">Any sequence</option>
+ </param>
<repeat name="annotations" title="Annotations">
<conditional name="annotation_style">
<param name="style" type="select" label="Annotation Style" help="If your data is not in a style similar to what is available from Galaxy (and the UCSC table browser), choose 'Basic'.">
@@ -11,7 +14,7 @@
<option value="basic">Basic</option>
</param>
<when value="galaxy">
- <param name="species" type="select" label="Species of Annotation" multiple="False">
+ <param name="species" type="select" label="Species" multiple="False">
<options>
<filter type="data_meta" ref="maf_input" key="species" />
</options>
@@ -21,7 +24,6 @@
<param name="underlays_file" type="data" format="bed,gff" label="Underlays File" optional="True"/>
<param name="repeats_file" type="data" format="bed,gff" label="Repeats File" optional="True"/>
<param name="links_file" type="data" format="bed,gff" label="Links File" optional="True"/>
- <param name="offset" label="Offset" value="0" type="integer"/>
</when>
<when value="basic">
<param name="seq_name" label="Full Sequence Name" value="" type="text">
@@ -44,6 +46,7 @@
<option name="Skipping unsupported paragraph (maf_paragraph)" value="maf_paragraph"/>
<option name="Skipping all reconstruction scores: no species specified (recon_noseq)" value="recon_noseq"/>
<option name="Skipping reconstruction scores in blocks with missing row (recon_missing)" value="recon_missing"/>
+ <option name="The first row in some blocks is not the specified reference sequence (refseq_not_first)" value="refseq_not_first"/>
<option name="Skipping extra MAF File (unused_maf)" value="unused_maf"/>
</option>
<option name="Annotation Files" value="annotations">
@@ -71,12 +74,15 @@
</option>
<option name="Red Flags" value="red">
<option name="Sequence name in annotation file does not match name in MAF (seqname_mismatch)" value="seqname_mismatch"/>
- <option name="BED Start or end < 0 (bed_coord)" value="bed_coord"/>
- <option name="GFF Start or end < 1 (gff_coord)" value="gff_coord"/>
+ <option name="BED start or end < 0 (bed_coord)" value="bed_coord"/>
+ <option name="GFF start or end < 1 (gff_coord)" value="gff_coord"/>
<option name="Missing item name for URL substitution (url_subst)" value="url_subst"/>
</option>
</option>
<option name="Miscellaneous" value="miscellaneous">
+ <option name="No refseq specified; assuming 'first' (default_refseq)" value="default_refseq"/>
+ <option name="One or more bundle entries are not used in parameters file(unused_entry)" value="unused_entry"/>
+ <option name="Skipping blocks for export where reference sequence is hidden or all gaps (export_skip)" value="export_skip"/>
<option name="Possible parse error: token ends with an escaped quote (escaped_quote)" value="escaped_quote"/>
<option name="Draggable panel dividers will not be sticky (no_sticky)" value="no_sticky"/>
</option>
@@ -89,11 +95,7 @@
title = "Galaxy: $maf_input.name"
alignfile = input.maf
-#if $refseq.value:
refseq = $refseq
-#else:
-refseq = any
-#end if
tabext = .bed .gff .gtf
#if $nowarn.value:
nowarn = $nowarn
@@ -102,36 +104,35 @@
#set $seq_count = 0
#for $annotation_count, $annotation in $enumerate( $annotations ):
#if $annotation.annotation_style.style == "galaxy":
-#if $maf_input.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.metadata.species_chromosomes and $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]:
-#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
-#set $aliases = [ " %s" % $chrom for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
+#if $maf_input.dataset.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.dataset.metadata.species_chromosomes and $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]:
+#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
#else:
#set $seq_names = [$annotation.annotation_style['species']]
-#set $aliases = [""]
#end if
#else:
#set $seq_names = [$annotation.annotation_style['seq_name']]
-#set $aliases = [""]
#end if
-#for $seq_name, $alias in $zip( $seq_names, $aliases ):
+#for $seq_name in $seq_names:
seq ${seq_count}:
seqname = $seq_name
#if $annotation.annotation_style['exons_file'].dataset:
-exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}$alias
+exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}
#end if
#if $annotation.annotation_style['repeats_file'].dataset:
-repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}$alias
+repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}
#end if
#if $annotation.annotation_style['links_file'].dataset:
-links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}$alias
+links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}
#end if
#if $annotation.annotation_style['underlays_file'].dataset:
-underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}$alias
+underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}
#end if
#if $annotation.annotation_style['highlights_file'].dataset:
-highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}$alias
+highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}
#end if
+#if $annotation.annotation_style.style == "basic":
offset = $annotation.annotation_style['offset']
+#end if
#set $seq_count = $seq_count + 1
#end for
1
0

[hg] galaxy 1520: Fix a bug in shrimp_wrapper and add a tool for...
by greg@scofield.bx.psu.edu 22 Sep '08
by greg@scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/9ef55e79068b
changeset: 1520:9ef55e79068b
user: wychung
date: Fri Sep 19 12:02:13 2008 -0400
description:
Fix a bug in shrimp_wrapper and add a tool for splitting paired-end reads.
Update datatype/fastqsolexa so the number of sequences is correct.
7 file(s) affected in this change:
lib/galaxy/datatypes/sequence.py
test-data/split_paired_reads_test1.fastq
test-data/split_paired_reads_test1.out1
tool_conf.xml.sample
tools/metag_tools/shrimp_wrapper.py
tools/metag_tools/split_paired_reads.py
tools/metag_tools/split_paired_reads.xml
diffs (216 lines):
diff -r 0f735b21dc12 -r 9ef55e79068b lib/galaxy/datatypes/sequence.py
--- a/lib/galaxy/datatypes/sequence.py Thu Sep 18 16:48:29 2008 -0400
+++ b/lib/galaxy/datatypes/sequence.py Fri Sep 19 12:02:13 2008 -0400
@@ -98,8 +98,8 @@
dataset.peek = data.get_file_peek( dataset.file_name )
count = size = 0
bases_regexp = re.compile("^[NGTAC]*$")
- for line in file( dataset.file_name ):
- if line and line[0] == "@":
+ for i, line in enumerate(file( dataset.file_name )):
+ if line and line[0] == "@" and i % 4 == 0:
count += 1
elif bases_regexp.match(line):
line = line.strip()
diff -r 0f735b21dc12 -r 9ef55e79068b test-data/split_paired_reads_test1.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/split_paired_reads_test1.fastq Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,21 @@
+@HWI-EAS91_1_30788AAXX:7:21:1542:1758
+GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
++HWI-EAS91_1_30788AAXX:7:21:1542:1758
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+@HWI-EAS91_1_30788AAXX:7:22:1621:462
+ATAATGGCTATTATTGTGGGGGGGATGATGCTGGAAACTAGCCCCAATATCAATCCTATATCAAATCTCACC
++HWI-EAS91_1_30788AAXX:7:22:1621:462
+hhhhhhhhhhhhQAhh@hhhhNhhhfhMbCIScC?hhJhhhhChhhJhhhRhhKhePhc\KhhV\KhXhJhh
+@HWI-EAS91_1_30788AAXX:7:45:408:807
+TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTTATGAGTGCTAGGATCAGGATGGAGAGGATTAGGGCT
++HWI-EAS91_1_30788AAXX:7:45:408:807
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hhhZh`hhhhhRXhhYh
+@HWI-EAS91_1_30788AAXX:7:49:654:1439
+CTAACTCTATTTATTGTATTTCAACTAAAAATCTCATAGGTTTATTGATAGTTGTGTTGTTGGTGTAAATGG
++HWI-EAS91_1_30788AAXX:7:49:654:1439
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhdhh_hG\XhU@
+@HWI-EAS91_1_30788AAXX:7:64:947:234
+TATCAAAAAAGAATATAATCTGAATCAACACTACAACCTATTAGTGTGTAGAATAGGAAGTAGAGGCCTGCG
++HWI-EAS91_1_30788AAXX:7:64:947:234
+hhhhhhhhhhhhhhhhhhhhhhhRhhehhahhhhhJhhhhhhhh^hPhWfhhhhThWUhhfhh_hhNIVPUd
+
diff -r 0f735b21dc12 -r 9ef55e79068b test-data/split_paired_reads_test1.out1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/split_paired_reads_test1.out1 Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,20 @@
+@HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC
++HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:22:1621:462/1
+ATAATGGCTATTATTGTGGGGGGGATGATGCTGGAA
++HWI-EAS91_1_30788AAXX:7:22:1621:462/1
+hhhhhhhhhhhhQAhh@hhhhNhhhfhMbCIScC?h
+@HWI-EAS91_1_30788AAXX:7:45:408:807/1
+TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTT
++HWI-EAS91_1_30788AAXX:7:45:408:807/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:49:654:1439/1
+CTAACTCTATTTATTGTATTTCAACTAAAAATCTCA
++HWI-EAS91_1_30788AAXX:7:49:654:1439/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:64:947:234/1
+TATCAAAAAAGAATATAATCTGAATCAACACTACAA
++HWI-EAS91_1_30788AAXX:7:64:947:234/1
+hhhhhhhhhhhhhhhhhhhhhhhRhhehhahhhhhJ
diff -r 0f735b21dc12 -r 9ef55e79068b tool_conf.xml.sample
--- a/tool_conf.xml.sample Thu Sep 18 16:48:29 2008 -0400
+++ b/tool_conf.xml.sample Fri Sep 19 12:02:13 2008 -0400
@@ -274,6 +274,7 @@
<tool file="metag_tools/short_reads_figure_high_quality_length.xml" />
<tool file="metag_tools/short_reads_trim_seq.xml" />
<tool file="metag_tools/blat_coverage_report.xml" />
+ <tool file="metag_tools/split_paired_reads.xml" />
</section>
<section name="Short Read Mapping" id="solexa_tools">
<tool file="metag_tools/shrimp_wrapper.xml" />
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/shrimp_wrapper.py
--- a/tools/metag_tools/shrimp_wrapper.py Thu Sep 18 16:48:29 2008 -0400
+++ b/tools/metag_tools/shrimp_wrapper.py Fri Sep 19 12:02:13 2008 -0400
@@ -162,6 +162,7 @@
readname, endindex = line[1:].split('/')
else:
score = line
+
if score: # the last one
if hits.has_key(readname):
if len(hits[readname]) == hit_per_read:
@@ -182,8 +183,9 @@
match_count = 0
if hit_per_read == 1:
- matches = [ hits[readkey]['1'] ]
- match_count = 1
+ if len(hits[readkey]['1']) == 1:
+ matches = [ hits[readkey]['1'] ]
+ match_count = 1
else:
end1_data = hits[readkey]['1']
end2_data = hits[readkey]['2']
@@ -591,6 +593,7 @@
if os.path.exists(query_qual_end2): os.remove(query_qual_end2)
if os.path.exists(shrimp_log): os.remove(shrimp_log)
+
if __name__ == '__main__': __main__()
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/split_paired_reads.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/split_paired_reads.py Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,46 @@
+#! /usr/bin/python
+
+"""
+Split Solexa paired end reads
+"""
+
+import os, sys
+
+if __name__ == '__main__':
+
+ infile = sys.argv[1]
+ outfile_end1 = open(sys.argv[2], 'w')
+ outfile_end2 = open(sys.argv[3], 'w')
+
+ for i, line in enumerate(file(infile)):
+ line = line.rstrip()
+ if not line or line.startswith('#'): continue
+
+ end1 = ''
+ end2 = ''
+
+ line_index = i % 4
+
+ if line_index == 0:
+ end1 = line + '/1'
+ end2 = line + '/2'
+
+ elif line_index == 1:
+ seq_len = len(line)/2
+ end1 = line[0:seq_len]
+ end2 = line[seq_len:]
+
+ elif line_index == 2:
+ end1 = line + '/1'
+ end2 = line + '/2'
+
+ else:
+ qual_len = len(line)/2
+ end1 = line[0:qual_len]
+ end2 = line[qual_len:]
+
+ outfile_end1.write('%s\n' %(end1))
+ outfile_end2.write('%s\n' %(end2))
+
+ outfile_end1.close()
+ outfile_end2.close()
\ No newline at end of file
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/split_paired_reads.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/split_paired_reads.xml Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,56 @@
+<tool id="split_paired_reads" name="Split" version="1.0.0">
+ <description>paired-end reads into two ends</description>
+ <command interpreter="python">
+ split_paired_reads.py $input $output1 $output2
+ </command>
+ <inputs>
+ <param name="input" type="data" format="fastqsolexa" label="Your paired-end file" />
+ </inputs>
+ <outputs>
+ <data name="output1" format="fastqsolexa"/>
+ <data name="output2" format="fastqsolexa"/>
+ </outputs>
+ <tests>
+ <test>
+ <param name="input" value="split_paired_reads_test1.fastq" ftype="fastqsolexa" />
+ <output name="output1" file="split_paired_reads_test1.out1" fype="fastqsolexa" />
+ </test>
+ </tests>
+<help>
+
+**What it does**
+
+This tool splits a single paired-end file in half and returns two files with each ends.
+
+-----
+
+**Input formats**
+
+A multiple-fastq file, for example::
+
+ @HWI-EAS91_1_30788AAXX:7:21:1542:1758
+ GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+ +HWI-EAS91_1_30788AAXX:7:21:1542:1758
+ hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+
+-----
+
+**Outputs**
+
+One end::
+
+ @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+ GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC
+ +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+ hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+
+The other end::
+
+ @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+ GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+ +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+ hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+</help>
+</tool>
1
0

[hg] galaxy 1521: Merge with b2a9827178e28d93e2a978f64033a556a72...
by greg@scofield.bx.psu.edu 22 Sep '08
by greg@scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/618210a97e62
changeset: 1521:618210a97e62
user: wychung
date: Fri Sep 19 12:34:51 2008 -0400
description:
Merge with b2a9827178e28d93e2a978f64033a556a72b4c51
0 file(s) affected in this change:
diffs (117 lines):
diff -r 9ef55e79068b -r 618210a97e62 tools/visualization/GMAJ.xml
--- a/tools/visualization/GMAJ.xml Fri Sep 19 12:02:13 2008 -0400
+++ b/tools/visualization/GMAJ.xml Fri Sep 19 12:34:51 2008 -0400
@@ -3,7 +3,10 @@
<command interpreter="python">GMAJ.py $out_file1 $maf_input $gmaj_file $filenames_file</command>
<inputs>
<param name="maf_input" type="data" format="maf" label="Alignment File" optional="False"/>
- <param name="refseq" label="Reference Sequence" value="" type="text" help="Leave empty to allow interactive selection."/>
+ <param name="refseq" label="Reference Sequence" type="select">
+ <option value="first" selected="true">First sequence in each block</option>
+ <option value="any">Any sequence</option>
+ </param>
<repeat name="annotations" title="Annotations">
<conditional name="annotation_style">
<param name="style" type="select" label="Annotation Style" help="If your data is not in a style similar to what is available from Galaxy (and the UCSC table browser), choose 'Basic'.">
@@ -11,7 +14,7 @@
<option value="basic">Basic</option>
</param>
<when value="galaxy">
- <param name="species" type="select" label="Species of Annotation" multiple="False">
+ <param name="species" type="select" label="Species" multiple="False">
<options>
<filter type="data_meta" ref="maf_input" key="species" />
</options>
@@ -21,7 +24,6 @@
<param name="underlays_file" type="data" format="bed,gff" label="Underlays File" optional="True"/>
<param name="repeats_file" type="data" format="bed,gff" label="Repeats File" optional="True"/>
<param name="links_file" type="data" format="bed,gff" label="Links File" optional="True"/>
- <param name="offset" label="Offset" value="0" type="integer"/>
</when>
<when value="basic">
<param name="seq_name" label="Full Sequence Name" value="" type="text">
@@ -44,6 +46,7 @@
<option name="Skipping unsupported paragraph (maf_paragraph)" value="maf_paragraph"/>
<option name="Skipping all reconstruction scores: no species specified (recon_noseq)" value="recon_noseq"/>
<option name="Skipping reconstruction scores in blocks with missing row (recon_missing)" value="recon_missing"/>
+ <option name="The first row in some blocks is not the specified reference sequence (refseq_not_first)" value="refseq_not_first"/>
<option name="Skipping extra MAF File (unused_maf)" value="unused_maf"/>
</option>
<option name="Annotation Files" value="annotations">
@@ -71,12 +74,15 @@
</option>
<option name="Red Flags" value="red">
<option name="Sequence name in annotation file does not match name in MAF (seqname_mismatch)" value="seqname_mismatch"/>
- <option name="BED Start or end < 0 (bed_coord)" value="bed_coord"/>
- <option name="GFF Start or end < 1 (gff_coord)" value="gff_coord"/>
+ <option name="BED start or end < 0 (bed_coord)" value="bed_coord"/>
+ <option name="GFF start or end < 1 (gff_coord)" value="gff_coord"/>
<option name="Missing item name for URL substitution (url_subst)" value="url_subst"/>
</option>
</option>
<option name="Miscellaneous" value="miscellaneous">
+ <option name="No refseq specified; assuming 'first' (default_refseq)" value="default_refseq"/>
+ <option name="One or more bundle entries are not used in parameters file(unused_entry)" value="unused_entry"/>
+ <option name="Skipping blocks for export where reference sequence is hidden or all gaps (export_skip)" value="export_skip"/>
<option name="Possible parse error: token ends with an escaped quote (escaped_quote)" value="escaped_quote"/>
<option name="Draggable panel dividers will not be sticky (no_sticky)" value="no_sticky"/>
</option>
@@ -89,11 +95,7 @@
title = "Galaxy: $maf_input.name"
alignfile = input.maf
-#if $refseq.value:
refseq = $refseq
-#else:
-refseq = any
-#end if
tabext = .bed .gff .gtf
#if $nowarn.value:
nowarn = $nowarn
@@ -102,36 +104,35 @@
#set $seq_count = 0
#for $annotation_count, $annotation in $enumerate( $annotations ):
#if $annotation.annotation_style.style == "galaxy":
-#if $maf_input.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.metadata.species_chromosomes and $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]:
-#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
-#set $aliases = [ " %s" % $chrom for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
+#if $maf_input.dataset.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.dataset.metadata.species_chromosomes and $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]:
+#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
#else:
#set $seq_names = [$annotation.annotation_style['species']]
-#set $aliases = [""]
#end if
#else:
#set $seq_names = [$annotation.annotation_style['seq_name']]
-#set $aliases = [""]
#end if
-#for $seq_name, $alias in $zip( $seq_names, $aliases ):
+#for $seq_name in $seq_names:
seq ${seq_count}:
seqname = $seq_name
#if $annotation.annotation_style['exons_file'].dataset:
-exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}$alias
+exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}
#end if
#if $annotation.annotation_style['repeats_file'].dataset:
-repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}$alias
+repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}
#end if
#if $annotation.annotation_style['links_file'].dataset:
-links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}$alias
+links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}
#end if
#if $annotation.annotation_style['underlays_file'].dataset:
-underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}$alias
+underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}
#end if
#if $annotation.annotation_style['highlights_file'].dataset:
-highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}$alias
+highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}
#end if
+#if $annotation.annotation_style.style == "basic":
offset = $annotation.annotation_style['offset']
+#end if
#set $seq_count = $seq_count + 1
#end for
1
0

[hg] galaxy 1518: Add a wrapper for metadata inside of DatasetFi...
by greg@scofield.bx.psu.edu 22 Sep '08
by greg@scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/0f735b21dc12
changeset: 1518:0f735b21dc12
user: Dan Blankenberg <dan(a)bx.psu.edu>
date: Thu Sep 18 16:48:29 2008 -0400
description:
Add a wrapper for metadata inside of DatasetFilenameWrapper to allow proper string substitution in
commandline and templates.
2 file(s) affected in this change:
lib/galaxy/datatypes/metadata.py
lib/galaxy/tools/__init__.py
diffs (56 lines):
diff -r 1d326855ba89 -r 0f735b21dc12 lib/galaxy/datatypes/metadata.py
--- a/lib/galaxy/datatypes/metadata.py Thu Sep 18 15:41:23 2008 -0400
+++ b/lib/galaxy/datatypes/metadata.py Thu Sep 18 16:48:29 2008 -0400
@@ -211,6 +211,9 @@
elif not isinstance(value, list):
MetadataParameter.__setattr__(self, name, [value])
+ def __iter__( self ):
+ return iter( self.value )
+
def __str__(self):
if self.value in [None, []]:
return str(self.spec.no_value)
diff -r 1d326855ba89 -r 0f735b21dc12 lib/galaxy/tools/__init__.py
--- a/lib/galaxy/tools/__init__.py Thu Sep 18 15:41:23 2008 -0400
+++ b/lib/galaxy/tools/__init__.py Thu Sep 18 16:48:29 2008 -0400
@@ -1177,6 +1177,31 @@
Wraps a dataset so that __str__ returns the filename, but all other
attributes are accessible.
"""
+
+ class MetadataWrapper:
+ """
+ Wraps a Metadata Collection to return MetadataParameters wrapped according to the metadata spec.
+ Methods implemented to match behavior of a Metadata Collection.
+ """
+ def __init__( self, metadata ):
+ self.metadata = metadata
+ def __getattr__( self, name ):
+ rval = self.metadata.get( name, None )
+ if name in self.metadata.spec:
+ rval = self.metadata.spec[name].wrap( rval, self.metadata.parent )
+ return rval
+ def __nonzero__( self ):
+ return self.metadata.__nonzero__()
+ def __iter__( self ):
+ return self.metadata.__iter__()
+ def get( self, key, default=None ):
+ try:
+ return getattr( self, key )
+ except:
+ return default
+ def items( self ):
+ return iter( [ ( k, self.get( k ) ) for k, v in self.metadata.items() ] )
+
def __init__( self, dataset, datatypes_registry = None, tool = None, name = None ):
if not dataset:
try:
@@ -1187,6 +1212,7 @@
self.dataset = NoneDataset( datatypes_registry = datatypes_registry, ext = ext )
else:
self.dataset = dataset
+ self.metadata = self.MetadataWrapper( dataset.metadata )
def __str__( self ):
return self.dataset.file_name
def __getattr__( self, key ):
1
0

[hg] galaxy 1522: Adding a new set of tools to perform multiple...
by greg@scofield.bx.psu.edu 22 Sep '08
by greg@scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/05974294cbf1
changeset: 1522:05974294cbf1
user: guru
date: Sat Sep 20 18:14:24 2008 -0400
description:
Adding a new set of tools to perform multiple linear regression analysis.
9 file(s) affected in this change:
test-data/rcve_out.dat
test-data/reg_inp.tab
tool_conf.xml.sample
tools/regVariation/best_regression_subsets.py
tools/regVariation/best_regression_subsets.xml
tools/regVariation/linear_regression.py
tools/regVariation/linear_regression.xml
tools/regVariation/rcve.py
tools/regVariation/rcve.xml
diffs (700 lines):
diff -r 618210a97e62 -r 05974294cbf1 test-data/rcve_out.dat
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rcve_out.dat Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,8 @@
+#Model R-sq RCVE_Terms RCVE_Value
+2 3 4 0.3997 - -
+3 4 0.3319 2 0.1697
+2 4 0.2974 3 0.2561
+2 3 0.3985 4 0.0031
+4 0.1226 2 3 0.6934
+3 0.2733 2 4 0.3164
+2 0.2972 3 4 0.2564
diff -r 618210a97e62 -r 05974294cbf1 test-data/reg_inp.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/reg_inp.tab Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,100 @@
+2.04 2.01 1070 5
+2.56 3.40 1254 6
+3.75 3.68 1466 6
+1.10 1.54 706 4
+3.00 3.32 1160 5
+0.05 0.33 756 3
+1.38 0.36 1058 2
+1.50 1.97 1008 7
+1.38 2.03 1104 4
+4.01 2.05 1200 7
+1.50 2.13 896 7
+1.29 1.34 848 3
+1.90 1.51 958 5
+3.11 3.12 1246 6
+1.92 2.14 1106 4
+0.81 2.60 790 5
+1.01 1.90 954 4
+3.66 3.06 1500 6
+2.00 1.60 1046 5
+2.05 1.96 1054 4
+2.60 1.96 1198 6
+2.55 1.56 940 3
+0.38 1.60 456 6
+2.48 1.92 1150 7
+2.74 3.09 636 6
+1.77 0.78 744 5
+1.61 2.12 644 5
+0.99 1.85 842 3
+1.62 1.78 852 5
+2.03 1.03 1170 3
+3.50 3.44 1034 10
+3.18 2.42 1202 5
+2.39 1.74 1018 5
+1.48 1.89 1180 5
+1.54 1.43 952 3
+1.57 1.64 1038 4
+2.46 2.69 1090 6
+2.42 1.79 694 5
+2.11 2.72 1096 6
+2.04 2.15 1114 5
+1.68 2.22 1256 6
+1.64 1.55 1208 5
+2.41 2.34 820 6
+2.10 2.92 1222 4
+1.40 2.10 1120 5
+2.03 1.64 886 4
+1.99 2.83 1126 7
+2.24 1.76 1158 4
+0.45 1.81 676 6
+2.31 2.68 1214 7
+2.41 2.55 1136 6
+2.56 2.70 1264 6
+2.50 1.66 1116 3
+2.92 2.23 1292 4
+2.35 2.01 604 5
+2.82 1.24 854 6
+1.80 1.95 814 6
+1.29 1.73 778 3
+1.68 1.08 800 2
+3.44 3.46 1424 7
+1.90 3.01 950 6
+2.06 0.54 1056 3
+3.30 3.20 956 8
+1.80 1.50 1352 5
+2.00 1.71 852 5
+1.68 1.99 1168 5
+1.94 2.76 970 6
+0.97 1.56 776 4
+1.12 1.78 854 6
+1.31 1.32 1232 5
+1.68 0.87 1140 6
+3.09 1.75 1084 4
+1.87 1.41 954 2
+2.00 2.77 1000 4
+2.39 1.78 1084 4
+1.50 1.34 1058 4
+1.82 1.52 816 5
+1.80 2.97 1146 7
+2.01 1.75 1000 6
+1.88 1.64 856 4
+1.64 1.80 798 4
+2.42 3.37 1324 6
+0.22 1.15 704 6
+2.31 1.72 1222 5
+0.95 2.27 948 6
+1.99 2.85 1182 8
+1.86 2.21 1000 6
+1.79 1.94 910 6
+3.02 4.25 1374 9
+1.85 1.83 1014 6
+1.98 2.75 1420 7
+2.15 1.71 400 6
+1.46 2.20 998 7
+2.29 2.13 776 6
+2.39 2.38 1134 7
+1.80 1.64 772 4
+2.64 1.87 1304 6
+2.08 2.53 1212 4
+0.70 1.78 818 6
+0.89 1.20 864 2
\ No newline at end of file
diff -r 618210a97e62 -r 05974294cbf1 tool_conf.xml.sample
--- a/tool_conf.xml.sample Fri Sep 19 12:34:51 2008 -0400
+++ b/tool_conf.xml.sample Sat Sep 20 18:14:24 2008 -0400
@@ -128,6 +128,11 @@
<tool file="regVariation/getIndels_2way.xml" />
<tool file="regVariation/getIndels_3way.xml" />
<tool file="regVariation/getIndelRates_3way.xml" />
+ </section>
+ <section name="Multiple regression" id="multReg">
+ <tool file="regVariation/linear_regression.xml" />
+ <tool file="regVariation/best_regression_subsets.xml" />
+ <tool file="regVariation/rcve.xml" />
</section>
<section name="Evolution: HyPhy" id="hyphy">
<tool file="hyphy/hyphy_branch_lengths_wrapper.xml" />
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/best_regression_subsets.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/best_regression_subsets.py Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):
+ sys.stderr.write(msg)
+ sys.exit()
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+outfile2 = sys.argv[5]
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+
+for i, line in enumerate( file ( infile )):
+ line = line.rstrip('\r\n')
+ if len( line )>0 and not line.startswith( '#' ):
+ elems = line.split( '\t' )
+ break
+ if i == 30:
+ break # Hopefully we'll never get here...
+
+if len( elems )<1:
+ stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+ x_cols[k] = int(col)-1
+ x_vals.append([])
+
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+ if line and not line.startswith( '#' ):
+ try:
+ fields = line.split("\t")
+ try:
+ yval = float(fields[y_col])
+ except Exception, ey:
+ yval = r('NA')
+ y_vals.append(yval)
+ for k,col in enumerate(x_cols):
+ try:
+ xval = float(fields[col])
+ except Exception, ex:
+ xval = r('NA')
+ x_vals[k].append(xval)
+ except:
+ pass
+
+response_term = ""
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+r.library("leaps")
+
+set_default_mode(NO_CONVERSION)
+try:
+ leaps = r.regsubsets(r("y ~ x"), data= r.na_exclude(dat))
+except RException, rex:
+ stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
+set_default_mode(BASIC_CONVERSION)
+
+summary = r.summary(leaps)
+tot = len(x_vals)
+pattern = "["
+for i in range(tot):
+ pattern = pattern + 'c' + str(int(x_cols[int(i)]) + 1) + ' '
+pattern = pattern.strip() + ']'
+print >>fout, "#Vars\t%s\tR-sq\tAdj. R-sq\tC-p\tbic" %(pattern)
+for ind,item in enumerate(summary['outmat']):
+ print >>fout, "%s\t%s\t%s\t%s\t%s\t%s" %(str(item).count('*'), item, summary['rsq'][ind], summary['adjr2'][ind], summary['cp'][ind], summary['bic'][ind])
+
+
+r.pdf( outfile2, 8, 8 )
+r.plot(leaps, scale="Cp", main="Best subsets using Cp Criterion")
+r.plot(leaps, scale="r2", main="Best subsets using R-sq Criterion")
+r.plot(leaps, scale="adjr2", main="Best subsets using Adjusted R-sq Criterion")
+r.plot(leaps, scale="bic", main="Best subsets using bic Criterion")
+
+r.dev_off()
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/best_regression_subsets.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/best_regression_subsets.xml Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,64 @@
+<tool id="BestSubsetsRegression1" name="Perform Best-subsets Regression">
+ <description> </description>
+ <command interpreter="python">
+ best_regression_subsets.py
+ $input1
+ $response_col
+ $predictor_cols
+ $out_file1
+ $out_file2
+ 1>/dev/null
+ 2>/dev/null
+ </command>
+ <inputs>
+ <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/>
+ <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
+ <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" />
+ </inputs>
+ <outputs>
+ <data format="input" name="out_file1" metadata_source="input1" />
+ <data format="pdf" name="out_file2" />
+ </outputs>
+ <requirements>
+ <requirement type="python-module">rpy</requirement>
+ </requirements>
+ <tests>
+ <!-- Testing this tool will not be possible because this tool produces a pdf output file.
+ -->
+ </tests>
+ <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses the 'regsubsets' function from R statistical package for regression subset selection. It outputs two files, one containing a table with the best subsets and the corresponding summary statistics, and the other containing the graphical representation of the results.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- This tool currently treats all predictor and response variables as continuous variables.
+
+- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis.
+
+- The 6 columns in the output are described below:
+
+ - Column 1 (Vars): denotes the number of variables in the model
+ - Column 2 ([c2 c3 c4...]): represents a list of the user-selected predictor variables (full model). An asterisk denotes the presence of the corresponding predictor variable in the selected model.
+ - Column 3 (R-sq): the fraction of variance explained by the model
+ - Column 4 (Adj. R-sq): the above R-squared statistic adjusted, penalizing for higher number of predictors (p)
+ - Column 5 (Cp): Mallow's Cp statistics
+ - Column 6 (bic): Bayesian Information Criterion.
+
+
+ </help>
+</tool>
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/linear_regression.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/linear_regression.py Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):
+ sys.stderr.write(msg)
+ sys.exit()
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+outfile2 = sys.argv[5]
+
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+
+for i, line in enumerate( file ( infile )):
+ line = line.rstrip('\r\n')
+ if len( line )>0 and not line.startswith( '#' ):
+ elems = line.split( '\t' )
+ break
+ if i == 30:
+ break # Hopefully we'll never get here...
+
+if len( elems )<1:
+ stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+ x_cols[k] = int(col)-1
+ x_vals.append([])
+
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+ if line and not line.startswith( '#' ):
+ try:
+ fields = line.split("\t")
+ try:
+ yval = float(fields[y_col])
+ except:
+ yval = r('NA')
+ y_vals.append(yval)
+ for k,col in enumerate(x_cols):
+ try:
+ xval = float(fields[col])
+ except:
+ xval = r('NA')
+ x_vals[k].append(xval)
+ except:
+ pass
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+set_default_mode(NO_CONVERSION)
+try:
+ linear_model = r.lm(r("y ~ x"), data = r.na_exclude(dat))
+except RException, rex:
+ stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.")
+set_default_mode(BASIC_CONVERSION)
+
+coeffs=linear_model.as_py()['coefficients']
+yintercept= coeffs['(Intercept)']
+print >>fout, "Y-intercept\t%s" %(yintercept)
+summary = r.summary(linear_model)
+
+co = summary.get('coefficients', 'NA')
+"""
+if len(co) != len(x_vals)+1:
+ stop_err("Stopped performing linear regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.")
+"""
+print >>fout, "p-value (Y-intercept)\t%s" %(co[0][3])
+
+if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable
+ try:
+ slope = coeffs['x']
+ except:
+ slope = 'NA'
+ try:
+ pval = co[1][3]
+ except:
+ pval = 'NA'
+ print >>fout, "Slope (c%d)\t%s" %(x_cols[0]+1,slope)
+ print >>fout, "p-value (c%d)\t%s" %(x_cols[0]+1,pval)
+else: #Multiple regression case with >1 predictors
+ ind=1
+ while ind < len(coeffs.keys()):
+ print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,coeffs['x'+str(ind)])
+ try:
+ pval = co[ind][3]
+ except:
+ pval = 'NA'
+ print >>fout, "p-value (c%d)\t%s" %(x_cols[ind-1]+1,pval)
+ ind+=1
+
+print >>fout, "R-squared\t%s" %(summary.get('r.squared','NA'))
+print >>fout, "Adjusted R-squared\t%s" %(summary.get('adj.r.squared','NA'))
+print >>fout, "F-statistic\t%s" %(summary.get('fstatistic','NA'))
+print >>fout, "Sigma\t%s" %(summary.get('sigma','NA'))
+
+r.pdf( outfile2, 8, 8 )
+if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable
+ sub_title = "Slope = %s; Y-int = %s" %(slope,yintercept)
+ r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression")
+ r.abline(a=yintercept, b=slope, col="red")
+else:
+ r.pairs(dat, main="Scatterplot Matrix", col="blue")
+
+r.plot(linear_model)
+r.dev_off()
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/linear_regression.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/linear_regression.xml Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,62 @@
+<tool id="LinearRegression1" name="Perform Linear Regression">
+ <description> </description>
+ <command interpreter="python">
+ linear_regression.py
+ $input1
+ $response_col
+ $predictor_cols
+ $out_file1
+ $out_file2
+ 1>/dev/null
+ </command>
+ <inputs>
+ <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/>
+ <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
+ <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" />
+ </inputs>
+ <outputs>
+ <data format="input" name="out_file1" metadata_source="input1" />
+ <data format="pdf" name="out_file2" />
+ </outputs>
+ <requirements>
+ <requirement type="python-module">rpy</requirement>
+ </requirements>
+ <tests>
+ <!-- Testing this tool will not be possible because this tool produces a pdf output file.
+ -->
+ </tests>
+ <help>
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses the 'lm' function from R statistical package to perform linear regression on the input data. It outputs two files, one containing the summary statistics of the performed regression, and the other containing diagnostic plots to check whether model assumptions are satisfied.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- This tool currently treats all predictor and response variables as continuous variables.
+
+- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis.
+
+- The summary statistics in the output are described below:
+
+ - sigma: the square root of the estimated variance of the random error (standard error of the residuals)
+ - R-squared: the fraction of variance explained by the model
+ - Adjusted R-squared: the above R-squared statistic adjusted, penalizing for the number of the predictors (p)
+ - p-value: p-value for the t-test of the null hypothesis that the corresponding slope is equal to zero against the two-sided alternative.
+
+
+ </help>
+</tool>
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/rcve.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/rcve.py Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):
+ sys.stderr.write(msg)
+ sys.exit()
+
+def sscombs(s):
+ if len(s) == 1:
+ return [s]
+ else:
+ ssc = sscombs(s[1:])
+ return [s[0]] + [s[0]+comb for comb in ssc] + ssc
+
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+
+for i, line in enumerate( file ( infile )):
+ line = line.rstrip('\r\n')
+ if len( line )>0 and not line.startswith( '#' ):
+ elems = line.split( '\t' )
+ break
+ if i == 30:
+ break # Hopefully we'll never get here...
+
+if len( elems )<1:
+ stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+ x_cols[k] = int(col)-1
+ x_vals.append([])
+ """
+ try:
+ float( elems[x_cols[k]] )
+ except:
+ try:
+ msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] )
+ except:
+ msg = "This operation cannot be performed on non-numeric data."
+ stop_err( msg )
+ """
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+ if line and not line.startswith( '#' ):
+ try:
+ fields = line.split("\t")
+ try:
+ yval = float(fields[y_col])
+ except Exception, ey:
+ yval = r('NA')
+ #print >>sys.stderr, "ey = %s" %ey
+ y_vals.append(yval)
+ for k,col in enumerate(x_cols):
+ try:
+ xval = float(fields[col])
+ except Exception, ex:
+ xval = r('NA')
+ #print >>sys.stderr, "ex = %s" %ex
+ x_vals[k].append(xval)
+ except:
+ pass
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+set_default_mode(NO_CONVERSION)
+try:
+ full = r.lm(r("y ~ x"), data= r.na_exclude(dat)) #full model includes all the predictor variables specified by the user
+except RException, rex:
+ stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
+set_default_mode(BASIC_CONVERSION)
+
+summary = r.summary(full)
+fullr2 = summary.get('r.squared','NA')
+
+if fullr2 == 'NA':
+ stop_error("Error in linear regression")
+
+if len(x_vals) < 10:
+ s = ""
+ for ch in range(len(x_vals)):
+ s += str(ch)
+else:
+ stop_err("This tool only works with less than 10 predictors.")
+
+print >>fout, "#Model\tR-sq\tRCVE_Terms\tRCVE_Value"
+all_combos = sorted(sscombs(s), key=len)
+all_combos.reverse()
+for j,cols in enumerate(all_combos):
+ #if len(cols) == len(s): #Same as the full model above
+ # continue
+ if len(cols) == 1:
+ x_vals1 = x_vals[int(cols)]
+ else:
+ x_v = []
+ for col in cols:
+ x_v.append(x_vals[int(col)])
+ x_vals1 = numpy.asarray(x_v).transpose()
+ dat= r.list(x=array(x_vals1), y=y_vals)
+ set_default_mode(NO_CONVERSION)
+ red = r.lm(r("y ~ x"), data= dat) #Reduced model
+ set_default_mode(BASIC_CONVERSION)
+ summary = r.summary(red)
+ redr2 = summary.get('r.squared','NA')
+ try:
+ rcve = (float(fullr2)-float(redr2))/float(fullr2)
+ except:
+ rcve = 'NA'
+ col_str = ""
+ for col in cols:
+ col_str = col_str + str(int(x_cols[int(col)]) + 1) + " "
+ col_str.strip()
+ rcve_col_str = ""
+ for col in s:
+ if col not in cols:
+ rcve_col_str = rcve_col_str + str(int(x_cols[int(col)]) + 1) + " "
+ rcve_col_str.strip()
+ if len(cols) == len(s): #full model
+ rcve_col_str = "-"
+ rcve = "-"
+ try:
+ redr2 = "%.4f" %(float(redr2))
+ except:
+ pass
+ try:
+ rcve = "%.4f" %(float(rcve))
+ except:
+ pass
+ print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,rcve_col_str,rcve)
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/rcve.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/rcve.xml Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,68 @@
+<tool id="rcve1" name="Compute RCVE" version="1.0.0">
+ <description> </description>
+ <command interpreter="python">
+ rcve.py
+ $input1
+ $response_col
+ $predictor_cols
+ $out_file1
+ 1>/dev/null
+ </command>
+ <inputs>
+ <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/>
+ <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
+ <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" />
+ </inputs>
+ <outputs>
+ <data format="input" name="out_file1" metadata_source="input1" />
+ </outputs>
+ <requirements>
+ <requirement type="python-module">rpy</requirement>
+ </requirements>
+ <tests>
+ <!-- Test data with valid values -->
+ <test>
+ <param name="input1" value="reg_inp.tab"/>
+ <param name="response_col" value="1"/>
+ <param name="predictor_cols" value="2,3,4"/>
+ <output name="out_file1" file="rcve_out.dat"/>
+ </test>
+
+ </tests>
+ <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool computes the RCVE (Relative Contribution to Variance) for all possible variable subsets using the following formula:
+
+**RCVE(i) = [R-sq (full: 1,2,..,i..,p-1) - R-sq(without i: 1,2,...,p-1)] / R-sq (full: 1,2,..,i..,p-1)**,
+which denotes the case where the 'i'th predictor is dropped.
+
+
+In general,
+**RCVE(X+) = [R-sq (full: {X,X+}) - R-sq(reduced: {X})] / R-sq (full: {X,X+})**,
+where,
+
+- {X,X+} denotes the set of all predictors,
+- X+ is the set of predictors for which we compute RCVE (and therefore drop from the full model to obtain a reduced one),
+- {X} is the set of the predictors that are left in the reduced model after excluding {X+}
+
+
+The 4 columns in the output are described below:
+
+- Column 1 (Model): denotes the variables present in the model ({X})
+- Column 2 (R-sq): denotes the R-squared value corresponding to the model in Column 1
+- Column 3 (RCVE_Terms): denotes the variable/s for which RCVE is computed ({X+}). These are the variables that are absent in the reduced model in Column 1. A '-' in this column indicates that the model in Column 1 is the Full model.
+- Column 4 (RCVE): denotes the RCVE value corresponding to the variable/s in Column 3. A '-' in this column indicates that the model in Column 1 is the Full model.
+
+
+ </help>
+</tool>
1
0

[hg] galaxy 1510: Strip whitespace from columns in file for data...
by greg＠scofield.bx.psu.edu 22 Sep '08
by greg＠scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/f8e3770c23f6
changeset: 1510:f8e3770c23f6
user: Dan Blankenberg <dan(a)bx.psu.edu>
date: Tue Sep 16 14:10:53 2008 -0400
description:
Strip whitespace from columns in file for dataset_metadata_in_file validator.
1 file(s) affected in this change:
lib/galaxy/tools/parameters/validation.py
diffs (12 lines):
diff -r ec547440ec97 -r f8e3770c23f6 lib/galaxy/tools/parameters/validation.py
--- a/lib/galaxy/tools/parameters/validation.py Tue Sep 16 13:25:42 2008 -0400
+++ b/lib/galaxy/tools/parameters/validation.py Tue Sep 16 14:10:53 2008 -0400
@@ -247,7 +247,7 @@
if line_startswith is None or line.startswith( line_startswith ):
fields = line.split( '\t' )
if metadata_column < len( fields ):
- self.valid_values.append( fields[metadata_column] )
+ self.valid_values.append( fields[metadata_column].strip() )
def validate( self, value, history = None ):
if not value: return
if hasattr( value, "metadata" ):
1
0
details: http://www.bx.psu.edu/hg/galaxy/rev/f1da9b95549b
changeset: 1516:f1da9b95549b
user: Dan Blankenberg <dan(a)bx.psu.edu>
date: Thu Sep 18 15:24:51 2008 -0400
description:
Update to latest gmaj.
1 file(s) affected in this change:
static/gmaj/gmaj.jar
diffs (2 lines):
diff -r 280e8b68f845 -r f1da9b95549b static/gmaj/gmaj.jar
Binary file static/gmaj/gmaj.jar has changed
1
0

[hg] galaxy 1515: Forgot to update tool_conf.sample with the new...
by greg＠scofield.bx.psu.edu 22 Sep '08
by greg＠scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/280e8b68f845
changeset: 1515:280e8b68f845
user: guru
date: Wed Sep 17 17:14:59 2008 -0400
description:
Forgot to update tool_conf.sample with the new tool details.
1 file(s) affected in this change:
tool_conf.xml.sample
diffs (10 lines):
diff -r 33e06a98b6d8 -r 280e8b68f845 tool_conf.xml.sample
--- a/tool_conf.xml.sample Wed Sep 17 16:42:08 2008 -0400
+++ b/tool_conf.xml.sample Wed Sep 17 17:14:59 2008 -0400
@@ -281,5 +281,6 @@
<tool file="metag_tools/megablast_wrapper.xml" />
<tool file="metag_tools/megablast_xml_parser.xml" />
<tool file="metag_tools/blat_wrapper.xml" />
+ <tool file="metag_tools/mapping_to_ucsc.xml" />
</section>
</toolbox>
1
0

[hg] galaxy 1507: add SHRiMP mapper for short reads analysis.
by greg＠scofield.bx.psu.edu 22 Sep '08
by greg＠scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/842f1883cf53
changeset: 1507:842f1883cf53
user: wychung
date: Mon Sep 15 15:04:41 2008 -0400
description:
add SHRiMP mapper for short reads analysis.
6 file(s) affected in this change:
test-data/shrimp_phix_anc.fa
test-data/shrimp_wrapper_test1.fastq
test-data/shrimp_wrapper_test1.out1
tool_conf.xml.sample
tools/metag_tools/shrimp_wrapper.py
tools/metag_tools/shrimp_wrapper.xml
diffs (853 lines):
diff -r 26825f08d362 -r 842f1883cf53 test-data/shrimp_phix_anc.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/shrimp_phix_anc.fa Mon Sep 15 15:04:41 2008 -0400
@@ -0,0 +1,2 @@
+>PHIX174
+GAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTTTCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTTCGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCTTGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCGTCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTACGGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTACGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCaGAAGGAGTGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGCCCCTTACTT
GAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCATCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGACTCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTACTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAAGGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTTGGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACAACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGCTCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTTTCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGCATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCgTGATGTTATTTCTTCATTTGGAGGTAAAACCTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTTGATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGCCGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGACTAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTGTATGGCAACTTGCCGCCG
CGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGTTTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGAAGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGATTATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTTATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAACGCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGCTTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGTTCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTATATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTGTCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGCCTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTGAATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGCCGGGCAATAAtGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGTTTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTGCTATTGCTGGCGGTATTGCTTCTGCTC
TTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAAAGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCTGGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTGGTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGATAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTATCTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGGTTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGAGATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGACCAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTATGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCAAACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGACTTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTTCTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGATACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCGTCAGGATTGACACCCTCCCAATTGTATGTTTTCATG
CCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTTCTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTATTGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGCATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATGTTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGAATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGGGACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCCCTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATTGCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTaCTATTCAGCGTTTGATGAATGCAATGCGACAGGCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTTATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCGCAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGCCGTCTTCATTTCCATGCGGTGCAtTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCATCGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTT
GTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCA
diff -r 26825f08d362 -r 842f1883cf53 test-data/shrimp_wrapper_test1.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/shrimp_wrapper_test1.fastq Mon Sep 15 15:04:41 2008 -0400
@@ -0,0 +1,40 @@
+@HWI-EAS91_1_306UPAAXX:6:1:959:874
+GCGGGCTGCGACATAAAGCATACCGCCTGGGCGGCG
++HWI-EAS91_1_306UPAAXX:6:1:959:874
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:1630:1975
+GAAAGAAAATCAGCAACAGTGGCATCGATTTTACGG
++HWI-EAS91_1_306UPAAXX:6:1:1630:1975
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:770:994
+GCAGGCAGCGTGCTGCGAGTCTTTTCGAATGATAAG
++HWI-EAS91_1_306UPAAXX:6:1:770:994
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:1274:306
+GTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGC
++HWI-EAS91_1_306UPAAXX:6:1:1274:306
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh\h
+@HWI-EAS91_1_306UPAAXX:6:1:1339:209
+GTTTGGTCAGTTCCATCAACATCATAGCCAGATGCC
++HWI-EAS91_1_306UPAAXX:6:1:1339:209
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:203:1240
+GATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTA
++HWI-EAS91_1_306UPAAXX:6:1:203:1240
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:869:448
+GCTGGCCATCAGTTCGCGGATACCGGCGGCAAACAT
++HWI-EAS91_1_306UPAAXX:6:1:869:448
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhKhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:939:928
+GGAGGCCTCCAGCAATCTTGAACACTCATCCTTAAT
++HWI-EAS91_1_306UPAAXX:6:1:939:928
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:1756:1476
+GCGTAGAGGCTTTACTATTCAGCGTTTGATGAATGC
++HWI-EAS91_1_306UPAAXX:6:1:1756:1476
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:1528:181
+GGCTGGTCAGTATTTTACCAATGACCAAATCAAAGA
++HWI-EAS91_1_306UPAAXX:6:1:1528:181
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
diff -r 26825f08d362 -r 842f1883cf53 test-data/shrimp_wrapper_test1.out1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/shrimp_wrapper_test1.out1 Mon Sep 15 15:04:41 2008 -0400
@@ -0,0 +1,7 @@
+#FORMAT: readname contigname strand contigstart contigend readstart readend readlength score editstring
+>HWI-EAS91_1_306UPAAXX:6:1:1528:181 PHIX174 + 3644 3679 1 36 36 3600 36
+>HWI-EAS91_1_306UPAAXX:6:1:1756:1476 PHIX174 + 4505 4540 1 36 36 3600 36
+>HWI-EAS91_1_306UPAAXX:6:1:203:1240 PHIX174 + 310 345 1 36 36 3600 36
+>HWI-EAS91_1_306UPAAXX:6:1:1274:306 PHIX174 + 933 968 1 36 36 3600 36
+>HWI-EAS91_1_306UPAAXX:6:1:939:928 PHIX174 - 4458 4493 1 36 36 3600 36
+>HWI-EAS91_1_306UPAAXX:6:1:1339:209 PHIX174 - 1732 1767 1 36 36 3600 36
diff -r 26825f08d362 -r 842f1883cf53 tool_conf.xml.sample
--- a/tool_conf.xml.sample Sun Sep 14 14:58:50 2008 -0400
+++ b/tool_conf.xml.sample Mon Sep 15 15:04:41 2008 -0400
@@ -276,6 +276,7 @@
<tool file="metag_tools/blat_coverage_report.xml" />
</section>
<section name="Short Read Mapping" id="solexa_tools">
+ <tool file="metag_tools/shrimp_wrapper.xml" />
<tool file="sr_mapping/lastz_wrapper.xml" />
<tool file="metag_tools/megablast_wrapper.xml" />
<tool file="metag_tools/megablast_xml_parser.xml" />
diff -r 26825f08d362 -r 842f1883cf53 tools/metag_tools/shrimp_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/shrimp_wrapper.py Mon Sep 15 15:04:41 2008 -0400
@@ -0,0 +1,577 @@
+#! /usr/bin/python
+
+"""
+SHRiMP wrapper
+
+Inputs:
+ reference seq and reads
+
+Outputs:
+ table of 8 columns:
+ chrom ref_loc read_id read_loc ref_nuc read_nuc quality coverage
+ SHRiMP output
+
+Parameters:
+ -s Spaced Seed (default: 111111011111)
+ -n Seed Matches per Window (default: 2)
+ -t Seed Hit Taboo Length (default: 4)
+ -9 Seed Generation Taboo Length (default: 0)
+ -w Seed Window Length (default: 115.00%)
+ -o Maximum Hits per Read (default: 100)
+ -r Maximum Read Length (default: 1000)
+ -d Kmer Std. Deviation Limit (default: -1 [None])
+
+ -m S-W Match Value (default: 100)
+ -i S-W Mismatch Value (default: -150)
+ -g S-W Gap Open Penalty (Reference) (default: -400)
+ -q S-W Gap Open Penalty (Query) (default: -400)
+ -e S-W Gap Extend Penalty (Reference) (default: -70)
+ -f S-W Gap Extend Penalty (Query) (default: -70)
+ -h S-W Hit Threshold (default: 68.00%)
+
+Command:
+%rmapper -s spaced_seed -n seed_matches_per_window -t seed_hit_taboo_length -9 seed_generation_taboo_length -w seed_window_length -o max_hits_per_read -r max_read_length -d kmer -m sw_match_value -i sw_mismatch_value -g sw_gap_open_ref -q sw_gap_open_query -e sw_gap_ext_ref -f sw_gap_ext_query -h sw_hit_threshold <query> <target> > <output> 2> <log>
+
+SHRiMP output:
+>7:2:1147:982/1 chr3 + 36586562 36586595 2 35 36 2900 3G16G13
+>7:2:1147:982/1 chr3 + 95338194 95338225 4 35 36 2700 9T7C14
+>7:2:587:93/1 chr3 + 14913541 14913577 1 35 36 2960 19--16
+
+Testing:
+%python shrimp_wrapper.py single ~/Desktop/shrimp_wrapper/phix_anc.fa tmp tmp1 ~/Desktop/shrimp_wrapper/phix.10.solexa.fastq
+%python shrimp_wrapper.py paired ~/Desktop/shrimp_wrapper/eca_ref_chrMT.fa tmp tmp1 ~/Desktop/shrimp_wrapper/eca.5.solexa_1.fastq ~/Desktop/shrimp_wrapper/eca.5.solexa_2.fastq
+
+"""
+
+import os, sys, tempfile, os.path
+
+assert sys.version_info[:2] >= (2.4)
+
+def stop_err( msg ):
+
+ sys.stderr.write( "%s\n" % msg )
+ sys.exit()
+
+def reverse_complement(s):
+
+ complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":".", "-":"-"}
+ reversed_s = []
+ for i in s:
+ reversed_s.append(complement_dna[i])
+ reversed_s.reverse()
+ return "".join(reversed_s)
+
+def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read):
+
+ """
+ TODO: the cross-over error has not been addressed yet.
+ """
+
+ insertion_size = 600
+
+ all_score_file = score_files.split('&')
+
+ if len(all_score_file) != hit_per_read: stop_err('Un-equal number of files!')
+
+ temp_table_name = tempfile.NamedTemporaryFile().name
+ temp_table = open(temp_table_name, 'w')
+
+ outfile = open(table_outfile,'w')
+
+ # reference seq: not a single fasta seq
+ refseq = {}
+ chrom_cov = {}
+ seq = ''
+
+ for i, line in enumerate(file(ref_file)):
+ line = line.rstrip()
+ if not line or line.startswith('#'): continue
+
+ if line.startswith('>'):
+ if seq:
+ if refseq.has_key(title):
+ pass
+ else:
+ refseq[title] = seq
+ chrom_cov[title] = {}
+ seq = ''
+ title = line[1:]
+ else:
+ seq += line
+ if seq:
+ if not refseq.has_key(title):
+ refseq[title] = seq
+ chrom_cov[title] = {}
+
+ # find hits : one end and/or the other
+ hits = {}
+ for i, line in enumerate(file(result_file)):
+ line = line.rstrip()
+ if not line or line.startswith('#'): continue
+
+ #FORMAT: readname contigname strand contigstart contigend readstart readend readlength score editstring
+ fields = line.split('\t')
+ readname = fields[0][1:]
+ chrom = fields[1]
+ strand = fields[2]
+ chrom_start = int(fields[3]) - 1
+ chrom_end = int(fields[4])
+ read_start = fields[5]
+ read_end = fields[6]
+ read_len = fields[7]
+ score = fields[8]
+ editstring = fields[9]
+
+ if hit_per_read == 1:
+ endindex = '1'
+ else:
+ readname, endindex = readname.split('/')
+
+ if hits.has_key(readname):
+ if hits[readname].has_key(endindex):
+ hits[readname][endindex].append([strand, editstring, chrom_start, chrom_end, read_start, chrom])
+ else:
+ hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]]
+ else:
+ hits[readname] = {}
+ hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]]
+
+ # find score : one end and the other end
+ hits_score = {}
+ readname = ''
+ score = ''
+ for num_score_file in range(len(all_score_file)):
+ score_file = all_score_file[num_score_file]
+ for i, line in enumerate(file(score_file)):
+ line = line.rstrip()
+ if not line or line.startswith('#'): continue
+
+ if line.startswith('>'):
+ if score:
+ if hits.has_key(readname):
+ if len(hits[readname]) == hit_per_read:
+ if hits_score.has_key(readname):
+ if hits_score[readname].has_key(endindex):
+ pass
+ else:
+ hits_score[readname][endindex] = score
+ else:
+ hits_score[readname] = {}
+ hits_score[readname][endindex] = score
+ score = ''
+ if hit_per_read == 1:
+ readname = line[1:]
+ endindex = '1'
+ else:
+ readname, endindex = line[1:].split('/')
+ else:
+ score = line
+ if score: # the last one
+ if hits.has_key(readname):
+ if len(hits[readname]) == hit_per_read:
+ if hits_score.has_key(readname):
+ if hits_score[readname].has_key(endindex):
+ pass
+ else:
+ hits_score[readname][endindex] = score
+ else:
+ hits_score[readname] = {}
+ hits_score[readname][endindex] = score
+
+ # mutation call to all mappings
+ for readkey in hits.keys():
+ if len(hits[readkey]) != hit_per_read: continue
+
+ matches = []
+ match_count = 0
+
+ if hit_per_read == 1:
+ matches = [ hits[readkey]['1'] ]
+ match_count = 1
+ else:
+ end1_data = hits[readkey]['1']
+ end2_data = hits[readkey]['2']
+
+ for i, end1_hit in enumerate(end1_data):
+ crin_strand = {'+': False, '-': False}
+ crin_insertSize = {'+': False, '-': False}
+
+ crin_strand[end1_hit[0]] = True
+ crin_insertSize[end1_hit[0]] = int(end1_hit[2])
+
+ for j, end2_hit in enumerate(end2_data):
+ crin_strand[end2_hit[0]] = True
+ crin_insertSize[end2_hit[0]] = int(end2_hit[2])
+
+ if end1_hit[-1] != end2_hit[-1] : continue
+
+ if crin_strand['+'] and crin_strand['-']:
+ if (crin_insertSize['-'] - crin_insertSize['+']) <= insertion_size:
+ matches.append([end1_hit, end2_hit])
+ match_count += 1
+
+ if match_count == 1:
+ for x, end_data in enumerate(matches[0]):
+
+ end_strand, end_editstring, end_chr_start, end_chr_end, end_read_start, end_chrom = end_data
+ end_read_start = int(end_read_start) - 1
+
+ if end_strand == '-':
+ refsegment = reverse_complement(refseq[end_chrom][end_chr_start:end_chr_end])
+ else:
+ refsegment = refseq[end_chrom][end_chr_start:end_chr_end]
+
+ match_len = 0
+ editindex = 0
+ gap_read = 0
+
+ while editindex < len(end_editstring):
+ editchr = end_editstring[editindex]
+ chrA = ''
+ chrB = ''
+ locIndex = []
+ if editchr.isdigit():
+ editcode = ''
+ while editchr.isdigit() and editindex < len(end_editstring):
+ editcode += editchr
+ editindex += 1
+ if editindex < len(end_editstring): editchr = end_editstring[editindex]
+ for baseIndex in range(int(editcode)):
+ chrA += refsegment[match_len+baseIndex]
+ chrB = chrA
+ match_len += int(editcode)
+ elif editchr == 'x':
+ # crossover: inserted between the appropriate two bases
+ # Two sequencing errors: 4x15x6 (25 matches with 2 crossovers)
+ # Treated as errors in the reads; Do nothing.
+ editindex += 1
+
+ elif editchr.isalpha():
+ editcode = editchr
+ editindex += 1
+ chrA = refsegment[match_len]
+ chrB = editcode
+ match_len += len(editcode)
+
+ elif editchr == '-':
+ editcode = editchr
+ editindex += 1
+ chrA = refsegment[match_len]
+ chrB = editcode
+ match_len += len(editcode)
+ gap_read += 1
+
+ elif editchr == '(':
+ editcode = ''
+ while editchr != ')' and editindex < len(end_editstring):
+ if editindex < len(end_editstring): editchr = end_editstring[editindex]
+ editcode += editchr
+ editindex += 1
+ editcode = editcode[1:-1]
+ chrA = '-'*len(editcode)
+ chrB = editcode
+
+ else:
+ print 'Warning! Unknown symbols', editchr
+
+ if end_strand == '-':
+ chrA = reverse_complement(chrA)
+ chrB = reverse_complement(chrB)
+
+ pos_line = ''
+ rev_line = ''
+
+ for mappingIndex in range(len(chrA)):
+ # reference
+ chrAx = chrA[mappingIndex]
+ # read
+ chrBx = chrB[mappingIndex]
+
+ if chrAx and chrBx and chrBx.upper() != 'N':
+ if end_strand == '+':
+ chrom_loc = end_chr_start+match_len-len(chrA)+mappingIndex
+ read_loc = end_read_start+match_len-len(chrA)+mappingIndex-gap_read
+ if chrAx == '-': chrom_loc -= 1
+
+ if chrBx == '-':
+ scoreBx = '-1'
+ else:
+ scoreBx = hits_score[readkey][str(x+1)].split()[read_loc]
+
+ # 1-based on chrom_loc and read_loc
+ pos_line = pos_line + '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) + '\n'
+ else:
+ chrom_loc = end_chr_end-match_len+mappingIndex
+ read_loc = end_read_start+match_len-1-mappingIndex-gap_read
+ if chrAx == '-': chrom_loc -= 1
+
+ if chrBx == '-':
+ scoreBx = '-1'
+ else:
+ scoreBx = hits_score[readkey][str(x+1)].split()[read_loc]
+
+ # 1-based on chrom_loc and read_loc
+ rev_line = '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) +'\n' + rev_line
+
+ if chrom_cov.has_key(end_chrom):
+ if chrom_cov[end_chrom].has_key(chrom_loc):
+ chrom_cov[end_chrom][chrom_loc] += 1
+ else:
+ chrom_cov[end_chrom][chrom_loc] = 1
+ else:
+ chrom_cov[end_chrom] = {}
+ chrom_cov[end_chrom][chrom_loc] = 1
+
+ if pos_line: temp_table.write('%s\n' %(pos_line.rstrip('\r\n')))
+ if rev_line: temp_table.write('%s\n' %(rev_line.rstrip('\r\n')))
+
+ temp_table.close()
+
+ # chrom-wide coverage
+ for i, line in enumerate(open(temp_table_name)):
+ line = line.rstrip()
+ if not line or line.startswith('#'): continue
+
+ fields = line.split()
+ chrom = fields[0]
+ eachBp = int(fields[1])
+ readname = fields[2]
+
+ if hit_per_read == 1:
+ fields[2] = readname.split('/')[0]
+
+ if chrom_cov[chrom].has_key(eachBp):
+ outfile.write('%s\t%d\n' %('\t'.join(fields), chrom_cov[chrom][eachBp]))
+ else:
+ outfile.write('%s\t%d\n' %('\t'.join(fields), 0))
+
+ outfile.close()
+
+ if os.path.exists(temp_table_name): os.remove(temp_table_name)
+
+ return True
+
def convert_fastqsolexa_to_fasta_qual(infile_name, query_fasta, query_qual):
    """Split a fastqsolexa file into a FASTA sequence file and a quality file.

    Each 4-line fastq record (@title / bases / +title / scores) is written as
    '>title' plus the bases to *query_fasta*, and '>title' plus
    space-separated integer scores to *query_qual*.  The quality line may
    already contain integers (copied through unchanged) or one ASCII
    character per base, decoded with the Solexa offset of 64.

    Returns True on success; malformed input aborts via stop_err().
    """
    outfile_seq = open(query_fasta, 'w')
    outfile_score = open(query_qual, 'w')

    seq_title_startswith = ''
    qual_title_startswith = ''

    # Solexa/Illumina-1.x ASCII quality encoding offset.
    default_coding_value = 64
    fastq_block_lines = 0

    for i, line in enumerate(open(infile_name)):
        line = line.rstrip()
        if not line or line.startswith('#'):
            continue

        # Position within the current 4-line fastq record: 1, 2, 3, 0.
        fastq_block_lines = (fastq_block_lines + 1) % 4
        line_startswith = line[0:1]

        if fastq_block_lines == 1:
            # First line is @title_of_seq; every record must share the same
            # leading character (normally '@').
            if not seq_title_startswith:
                seq_title_startswith = line_startswith
            if line_startswith != seq_title_startswith:
                outfile_seq.close()
                outfile_score.close()
                stop_err('Invalid fastqsolexa format at line %d: %s.' % (i + 1, line))
            read_title = line[1:]
            outfile_seq.write('>%s\n' % line[1:])

        elif fastq_block_lines == 2:
            # Second line is the nucleotides.
            read_length = len(line)
            outfile_seq.write('%s\n' % line)

        elif fastq_block_lines == 3:
            # Third line is +title_of_qualityscore (the title may be omitted).
            if not qual_title_startswith:
                qual_title_startswith = line_startswith
            if line_startswith != qual_title_startswith:
                outfile_seq.close()
                outfile_score.close()
                stop_err('Invalid fastqsolexa format at line %d: %s.' % (i + 1, line))
            quality_title = line[1:]
            if quality_title and read_title != quality_title:
                outfile_seq.close()
                outfile_score.close()
                stop_err('Invalid fastqsolexa format at line %d: sequence title "%s" differs from score title "%s".' % (i + 1, read_title, quality_title))
            if not quality_title:
                # Bare '+': reuse the sequence title.
                outfile_score.write('>%s\n' % read_title)
            else:
                outfile_score.write('>%s\n' % line[1:])

        else:
            # Fourth line holds the quality scores: either whitespace-separated
            # integers or one ASCII character per base.  Peek at the first
            # token to decide which.
            try:
                int(line.split()[0])
                fastq_integer = True
            except ValueError:
                fastq_integer = False

            if fastq_integer:
                # Already numeric: copy through unchanged.
                qual = line
            else:
                # ASCII-encoded scores.
                quality_score_length = len(line)
                if quality_score_length == read_length + 1:
                    # One extra leading character encodes the offset itself.
                    qual_score_startswith = ord(line[0:1])
                    line = line[1:]
                elif quality_score_length == read_length:
                    qual_score_startswith = default_coding_value
                else:
                    outfile_seq.close()
                    outfile_score.close()
                    stop_err('Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % (i + 1, quality_score_length, read_length))
                # Single pass instead of quadratic string concatenation; the
                # trailing space matches the original output format.
                qual = ''.join('%d ' % (ord(char) - qual_score_startswith) for char in line)

            outfile_score.write('%s\n' % qual)

    outfile_seq.close()
    outfile_score.close()

    return True
+
def __main__():
    """Drive the SHRiMP letter-space pipeline.

    Converts fastqsolexa input to FASTA + quality files, runs rmapper-ls
    through the shell, then post-processes the hits with
    generate_sub_table().

    Positional argv layout: [1] 'single' or 'paired', [2] reference fasta,
    [3] SHRiMP raw output path, [4] table output path; then either the 15
    explicit SHRiMP parameters followed by the read file(s), or just the
    read file(s) when defaults are used (argv length distinguishes the two).
    """

    # I/O
    type_of_reads = sys.argv[1]   # single or paired
    input_target = sys.argv[2]    # fasta
    shrimp_outfile = sys.argv[3]  # shrimp output
    table_outfile = sys.argv[4]   # table output

    # SHRiMP parameters: total = 15
    # TODO: put threshold on each of these parameters
    # argv length 21 = single-end with full parameters, 22 = paired-end
    # with full parameters; anything else falls through to defaults.
    if len(sys.argv) == 21 or len(sys.argv) == 22:
        spaced_seed = sys.argv[5]
        seed_matches_per_window = sys.argv[6]
        seed_hit_taboo_length = sys.argv[7]
        seed_generation_taboo_length = sys.argv[8]
        seed_window_length = sys.argv[9]
        max_hits_per_read = sys.argv[10]
        max_read_length = sys.argv[11]
        kmer = sys.argv[12]
        sw_match_value = sys.argv[13]
        sw_mismatch_value = sys.argv[14]
        sw_gap_open_ref = sys.argv[15]
        sw_gap_open_query = sys.argv[16]
        sw_gap_ext_ref = sys.argv[17]
        sw_gap_ext_query = sys.argv[18]
        sw_hit_threshold = sys.argv[19]

        # Single-end parameters
        if type_of_reads == 'single':
            input_query = sys.argv[20]  # single-end
            hit_per_read = 1
            # NOTE(review): only the .name of the NamedTemporaryFile is kept;
            # the file object itself is discarded, which is race-prone.
            query_fasta = tempfile.NamedTemporaryFile().name
            query_qual = tempfile.NamedTemporaryFile().name
        else:  # Paired-end parameters
            input_query_end1 = sys.argv[20]  # paired-end
            input_query_end2 = sys.argv[21]
            hit_per_read = 2
            query_fasta_end1 = tempfile.NamedTemporaryFile().name
            query_fasta_end2 = tempfile.NamedTemporaryFile().name
            query_qual_end1 = tempfile.NamedTemporaryFile().name
            query_qual_end2 = tempfile.NamedTemporaryFile().name
    else:
        # Default SHRiMP parameter values (mirrors the tool XML defaults).
        spaced_seed = '111111011111'
        seed_matches_per_window = '2'
        seed_hit_taboo_length = '4'
        seed_generation_taboo_length = '0'
        seed_window_length = '115.0'
        max_hits_per_read = '100'
        max_read_length = '1000'
        kmer = '-1'
        sw_match_value = '100'
        sw_mismatch_value = '-150'
        sw_gap_open_ref = '-400'
        sw_gap_open_query = '-400'
        sw_gap_ext_ref = '-70'
        sw_gap_ext_query = '-70'
        sw_hit_threshold = '68.0'

        # Single-end parameters
        if type_of_reads == 'single':
            input_query = sys.argv[5]  # single-end
            hit_per_read = 1
            query_fasta = tempfile.NamedTemporaryFile().name
            query_qual = tempfile.NamedTemporaryFile().name
        else:  # Paired-end parameters
            input_query_end1 = sys.argv[5]  # paired-end
            input_query_end2 = sys.argv[6]
            hit_per_read = 2
            query_fasta_end1 = tempfile.NamedTemporaryFile().name
            query_fasta_end2 = tempfile.NamedTemporaryFile().name
            query_qual_end1 = tempfile.NamedTemporaryFile().name
            query_qual_end2 = tempfile.NamedTemporaryFile().name


    # temp file for shrimp log file
    shrimp_log = tempfile.NamedTemporaryFile().name

    # convert fastq to fasta and quality score files
    if type_of_reads == 'single':
        return_value = convert_fastqsolexa_to_fasta_qual(input_query, query_fasta, query_qual)
    else:
        return_value = convert_fastqsolexa_to_fasta_qual(input_query_end1, query_fasta_end1, query_qual_end1)
        return_value = convert_fastqsolexa_to_fasta_qual(input_query_end2, query_fasta_end2, query_qual_end2)

    # SHRiMP command: built as a shell string with redirections, so it must
    # go through os.system (a shell).
    if type_of_reads == 'single':
        command = ' '.join(['rmapper-ls', '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target, '>', shrimp_outfile, '2>', shrimp_log])

        # NOTE(review): os.system does not raise when the command exits
        # non-zero, so this except only catches failures to spawn a shell.
        try:
            os.system(command)
        except Exception, e:
            if os.path.exists(query_fasta): os.remove(query_fasta)
            if os.path.exists(query_qual): os.remove(query_qual)
            stop_err(str(e))

    else:
        # End 2 appends ('>>'/'2>>') to the files end 1 created ('>'/'2>').
        command_end1 = ' '.join(['rmapper-ls', '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target, '>', shrimp_outfile, '2>', shrimp_log])
        command_end2 = ' '.join(['rmapper-ls', '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target, '>>', shrimp_outfile, '2>>', shrimp_log])

        try:
            os.system(command_end1)
            os.system(command_end2)
        except Exception, e:
            if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1)
            if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2)
            if os.path.exists(query_qual_end1): os.remove(query_qual_end1)
            if os.path.exists(query_qual_end2): os.remove(query_qual_end2)
            stop_err(str(e))

    # convert to table
    if type_of_reads == 'single':
        return_value = generate_sub_table(shrimp_outfile, input_target, query_qual, table_outfile, hit_per_read)
    else:
        # Paired-end quality files are passed as one '&'-joined string;
        # presumably generate_sub_table splits on '&' -- confirm upstream.
        return_value = generate_sub_table(shrimp_outfile, input_target, query_qual_end1+'&'+query_qual_end2, table_outfile, hit_per_read)

    # remove temp. files
    if type_of_reads == 'single':
        if os.path.exists(query_fasta): os.remove(query_fasta)
        if os.path.exists(query_qual): os.remove(query_qual)
    else:
        if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1)
        if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2)
        if os.path.exists(query_qual_end1): os.remove(query_qual_end1)
        if os.path.exists(query_qual_end2): os.remove(query_qual_end2)

    if os.path.exists(shrimp_log): os.remove(shrimp_log)

if __name__ == '__main__': __main__()
+
diff -r 26825f08d362 -r 842f1883cf53 tools/metag_tools/shrimp_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/shrimp_wrapper.xml Mon Sep 15 15:04:41 2008 -0400
@@ -0,0 +1,196 @@
+<tool id="shrimp_wrapper" name="SHRiMP" version="1.0.0">
+ <description>SHort Read Mapping Package</description>
+ <command interpreter="python">
+ #if ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $input_query
+ #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 ${type_of_reads.input1} ${type_of_reads.input2}
+ #elif ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="full"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold $input_query
+ #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="full"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold ${type_of_reads.input1} ${type_of_reads.input2}
+ #end if
+ </command>
+ <inputs>
+ <page>
+ <param name="input_target" type="data" format="fasta" label="Reference sequence" />
+ <conditional name="type_of_reads">
+ <param name="single_or_paired" type="select" label="Single- or Paired-ends">
+ <option value="single">Single-end</option>
+ <option value="paired">Paired-end</option>
+ </param>
+ <when value="single">
+ <param name="input_query" type="data" format="fastqsolexa" label="Sequence file" />
+ </when>
+ <when value="paired">
+ <param name="input1" type="data" format="fastqsolexa" label="One end" />
+ <param name="input2" type="data" format="fastqsolexa" label="The other end" />
+ </when>
+ </conditional>
+ <conditional name="param">
+ <param name="skip_or_full" type="select" label="SHRiMP parameter selection">
+ <option value="skip">Default setting</option>
+ <option value="full">Full list</option>
+ </param>
+ <when value="skip" />
+ <when value="full">
+ <param name="spaced_seed" type="text" size="30" value="111111011111" label="Spaced Seed" />
+ <param name="seed_matches_per_window" type="integer" size="5" value="2" label="Seed Matches per Window" />
+ <param name="seed_hit_taboo_length" type="integer" size="5" value="4" label="Seed Hit Taboo Length" />
+ <param name="seed_generation_taboo_length" type="integer" size="5" value="0" label="Seed Generation Taboo Length" />
+ <param name="seed_window_length" type="float" size="10" value="115.0" label="Seed Window Length" help="in percentage"/>
+ <param name="max_hits_per_read" type="integer" size="10" value="100" label="Maximum Hits per Read" />
+ <param name="max_read_length" type="integer" size="10" value="1000" label="Maximum Read Length" />
+ <param name="kmer" type="integer" size="10" value="-1" label="Kmer Std. Deviation Limit" help="-1 as None"/>
+ <param name="sw_match_value" type="integer" size="10" value="100" label="S-W Match Value" />
+ <param name="sw_mismatch_value" type="integer" size="10" value="-150" label="S-W Mismatch Value" />
+ <param name="sw_gap_open_ref" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Reference)" />
+ <param name="sw_gap_open_query" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Query)" />
+ <param name="sw_gap_ext_ref" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Reference)" />
+ <param name="sw_gap_ext_query" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Query)" />
+ <param name="sw_hit_threshold" type="float" size="10" value="68.0" label="S-W Hit Threshold" help="in percentage"/>
+ </when>
+ </conditional>
+ </page>
+ </inputs>
+ <outputs>
+ <data name="output1" format="tabular"/>
+ <data name="output2" format="tabular"/>
+ </outputs>
+ <requirements>
+ <requirement type="binary">SHRiMP_rmapper</requirement>
+ </requirements>
+ <tests>
+ <test>
+ <param name="single_or_paired" value="single" />
+ <param name="skip_or_full" value="skip" />
+ <param name="input_target" value="shrimp_phix_anc.fa" ftype="fasta" />
+ <param name="input_query" value="shrimp_wrapper_test1.fastq" ftype="fastqsolexa"/>
+ <output name="output1" file="shrimp_wrapper_test1.out1" />
+ </test>
+ <!--
+ <test>
+ <param name="input1" value="shrimp_wrapper_test2_end1.fastq" ftype="fastqsolexa" />
+ <param name="input2" value="shrimp_wrapper_test2_end2.fastq" ftype="fastqsolexa" />
+ <param name="single_or_paired" value="paired" />
+ <param name="skip_or_full" value="skip" />
+ <param name="input_target" value="shrimp_eca_chrMT.fa" ftype="fasta" />
+ <output name="output1" file="shrimp_wrapper_test2.out1" />
+ </test>
+ <test>
+ <param name="single_or_paired" value="single" />
+ <param name="skip_or_full" value="full" />
+ <param name="input_target" value="shrimp_phix_anc.fa" ftype="fasta" />
+ <param name="input_query" value="shrimp_wrapper_test1.fastq" ftype="fastqsolexa"/>
+ <param name="spaced_seed" value="111111011111" />
+ <param name="seed_matches_per_window" value="2" />
+ <param name="seed_hit_taboo_length" value="4" />
+ <param name="seed_generation_taboo_length" value="0" />
+ <param name="seed_window_length" value="115.0" />
+ <param name="max_hits_per_read" value="100" />
+ <param name="max_read_length" value="1000" />
+ <param name="kmer" value="-1" />
+ <param name="sw_match_value" value="100" />
+ <param name="sw_mismatch_value" value="-150" />
+ <param name="sw_gap_open_ref" value="-400" />
+ <param name="sw_gap_open_query" value="-400" />
+ <param name="sw_gap_ext_ref" value="-70" />
+ <param name="sw_gap_ext_query" value="-70" />
+ <param name="sw_hit_threshold" value="68.0" />
+ <output name="output1" file="shrimp_wrapper_test1.out1" />
+ </test>
+ <test>
+ <param name="single_or_paired" value="paired" />
+ <param name="skip_or_full" value="full" />
+ <param name="input_target" value="shrimp_eca_chrMT.fa" ftype="fasta" />
+ <param name="spaced_seed" value="111111011111" />
+ <param name="seed_matches_per_window" value="2" />
+ <param name="seed_hit_taboo_length" value="4" />
+ <param name="seed_generation_taboo_length" value="0" />
+ <param name="seed_window_length" value="115.0" />
+ <param name="max_hits_per_read" value="100" />
+ <param name="max_read_length" value="1000" />
+ <param name="kmer" value="-1" />
+ <param name="sw_match_value" value="100" />
+ <param name="sw_mismatch_value" value="-150" />
+ <param name="sw_gap_open_ref" value="-400" />
+ <param name="sw_gap_open_query" value="-400" />
+ <param name="sw_gap_ext_ref" value="-70" />
+ <param name="sw_gap_ext_query" value="-70" />
+ <param name="sw_hit_threshold" value="68.0" />
+ <param name="input1" value="shrimp_wrapper_test2_end1.fastq" ftype="fastqsolexa"/>
+ <param name="input2" value="shrimp_wrapper_test2_end2.fastq" ftype="fastqsolexa"/>
+ <output name="output1" file="shrimp_wrapper_test2.out1" />
+ </test>
+ -->
+ </tests>
+<help>
+
+.. class:: warningmark
+
+Only nucleotide sequences as query.
+
+-----
+
+**What it does**
+
+Run SHRiMP on letter-space reads.
+
+-----
+
+**Example**
+
+- Input a multiple-fastq file like the following::
+
+ @seq1
+ TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTT
+ +seq1
+ hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+
+- Use default settings (for detail explanations, please see **Parameters** section)
+
+- Search against your own uploaded file, result will be in the following format::
+
+ +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+
+ | id | chrom | strand | t.start | t.end | q.start | q.end | length | score | editstring |
+ +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+
+ | >seq1 | chrMT | + | 14712 | 14747 | 1 | 36 | 36 | 3350 | 24T11 |
+ +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+
+
+- The result will be formatted Table::
+
+ +-------+---------+---------+----------+---------+----------+---------+----------+
+ | chrom | ref_loc | read_id | read_loc | ref_nuc | read_nuc | quality | coverage |
+ +-------+---------+---------+----------+---------+----------+---------+----------+
+ | chrMT | 14711 | seq1 | 0 | T | T | 40 | 1 |
+ | chrMT | 14712 | seq1 | 1 | A | A | 40 | 1 |
+ | chrMT | 14713 | seq1 | 2 | C | C | 40 | 1 |
+ +-------+---------+---------+----------+---------+----------+---------+----------+
+
+-----
+
+**Parameters**
+
+Parameter list with default value settings::
+
+ -s Spaced Seed (default: 111111011111)
+ -n Seed Matches per Window (default: 2)
+ -t Seed Hit Taboo Length (default: 4)
+ -9 Seed Generation Taboo Length (default: 0)
+ -w Seed Window Length (default: 115.00%)
+ -o Maximum Hits per Read (default: 100)
+ -r Maximum Read Length (default: 1000)
+ -d Kmer Std. Deviation Limit (default: -1 [None])
+
+ -m S-W Match Value (default: 100)
+ -i S-W Mismatch Value (default: -150)
+ -g S-W Gap Open Penalty (Reference) (default: -400)
+ -q S-W Gap Open Penalty (Query) (default: -400)
+ -e S-W Gap Extend Penalty (Reference) (default: -70)
+ -f S-W Gap Extend Penalty (Query) (default: -70)
+ -h S-W Hit Threshold (default: 68.00%)
+
+-----
+
+**Reference**
+
+ **SHRiMP**: Stephen M. Rumble, Michael Brudno, Phil Lacroute, Vladimir Yanovsky, Marc Fiume, Adrian Dalca. shrimp at cs dot toronto dot edu.
+
+</help>
+</tool>
1
0

[hg] galaxy 1509: Rewrote "Compare two queries" tool in Python.
by greg＠scofield.bx.psu.edu 22 Sep '08
by greg＠scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/eb941905fd70
changeset: 1509:eb941905fd70
user: guru
date: Tue Sep 16 14:09:16 2008 -0400
description:
Rewrote "Compare two queries" tool in Python.
2 file(s) affected in this change:
tools/filters/compare.xml
tools/filters/joinWrapper.py
diffs (68 lines):
diff -r ec547440ec97 -r eb941905fd70 tools/filters/compare.xml
--- a/tools/filters/compare.xml Tue Sep 16 13:25:42 2008 -0400
+++ b/tools/filters/compare.xml Tue Sep 16 14:09:16 2008 -0400
@@ -1,6 +1,6 @@
<tool id="comp1" name="Compare two Queries">
<description>to find common or distinct rows</description>
- <command interpreter="perl">joinWrapper.pl $input1 $input2 $field1 $field2 $mode "Y" $out_file1</command>
+ <command interpreter="python">joinWrapper.py $input1 $input2 $field1 $field2 $mode $out_file1</command>
<inputs>
<param format="tabular" name="input1" type="data" label="Compare"/>
<param name="field1" label="Using column" type="data_column" data_ref="input1" />
diff -r ec547440ec97 -r eb941905fd70 tools/filters/joinWrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/joinWrapper.py Tue Sep 16 14:09:16 2008 -0400
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+"""
+This tool provides the UNIX "join" functionality.
+"""
+import sys, os, tempfile
+
def stop_err(msg):
    """Report a fatal error on stderr and terminate the process."""
    sys.stderr.write(msg)
    raise SystemExit
+
def main():
    """Compare/join two tabular files on given columns via UNIX sort+join.

    argv: [1] file1, [2] file2, [3] 1-based join column in file1,
    [4] 1-based join column in file2, [5] mode ('V' = rows of file1 with
    no match in file2, anything else = matched rows), [6] output path.
    """
    infile1 = sys.argv[1]
    infile2 = sys.argv[2]
    field1 = int(sys.argv[3])
    field2 = int(sys.argv[4])
    mode =sys.argv[5]
    outfile = sys.argv[6]

    # Temp files for the sorted copies; the objects are kept alive so the
    # underlying files persist until this function returns.
    tmpfile1 = tempfile.NamedTemporaryFile()
    tmpfile2 = tempfile.NamedTemporaryFile()

    try:
        #Sort the two files based on specified fields
        # NOTE(review): os.system does not raise on a non-zero exit status,
        # so this except only catches failure to spawn a shell.
        os.system("sort -k %d -o %s %s" %(field1, tmpfile1.name, infile1))
        os.system("sort -k %d -o %s %s" %(field2, tmpfile2.name, infile2))
    except Exception, exc:
        stop_err( 'Initialization error -> %s' %str(exc) )

    # Build join's -o output format "1.1,1.2,...,1.N" from the column count
    # of the first non-blank line of sorted file1 (output = all file1 columns).
    option = ""
    for line in file(tmpfile1.name):
        line = line.strip()
        if line:
            elems = line.split('\t')
            for j in range(1,len(elems)+1):
                if j == 1:
                    option = "1.1"
                else:
                    option = option + ",1." + str(j)
            break

    # join emits space-separated fields; tr restores tabs.
    # NOTE(review): any field that itself contains a space gets split by
    # this tr -- confirm inputs are space-free tabular data.
    if mode == "V":
        cmdline = 'join -v 1 -o %s -1 %d -2 %d %s %s | tr " " "\t" > %s' %(option, field1, field2, tmpfile1.name, tmpfile2.name, outfile)
    else:
        cmdline = 'join -o %s -1 %d -2 %d %s %s | tr " " "\t" > %s' %(option, field1, field2, tmpfile1.name, tmpfile2.name, outfile)

    try:
        os.system(cmdline)
    except Exception, exj:
        stop_err('Error joining the two datasets -> %s' %str(exj))

if __name__ == "__main__":
    main()
1
0