galaxy-dev
Threads by month
- ----- 2025 -----
- July
- June
- May
- April
- March
- February
- January
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2020 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2019 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2018 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2017 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2016 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2015 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2014 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2013 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2012 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2011 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2010 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2009 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2008 -----
- December
- November
- October
- September
- August
- 10008 discussions
details: http://www.bx.psu.edu/hg/galaxy/rev/b2a9827178e2
changeset: 1519:b2a9827178e2
user: Dan Blankenberg <dan(a)bx.psu.edu>
date: Fri Sep 19 12:27:20 2008 -0400
description:
Update GMAJ tool interface.
1 file(s) affected in this change:
tools/visualization/GMAJ.xml
diffs (117 lines):
diff -r 0f735b21dc12 -r b2a9827178e2 tools/visualization/GMAJ.xml
--- a/tools/visualization/GMAJ.xml Thu Sep 18 16:48:29 2008 -0400
+++ b/tools/visualization/GMAJ.xml Fri Sep 19 12:27:20 2008 -0400
@@ -3,7 +3,10 @@
<command interpreter="python">GMAJ.py $out_file1 $maf_input $gmaj_file $filenames_file</command>
<inputs>
<param name="maf_input" type="data" format="maf" label="Alignment File" optional="False"/>
- <param name="refseq" label="Reference Sequence" value="" type="text" help="Leave empty to allow interactive selection."/>
+ <param name="refseq" label="Reference Sequence" type="select">
+ <option value="first" selected="true">First sequence in each block</option>
+ <option value="any">Any sequence</option>
+ </param>
<repeat name="annotations" title="Annotations">
<conditional name="annotation_style">
<param name="style" type="select" label="Annotation Style" help="If your data is not in a style similar to what is available from Galaxy (and the UCSC table browser), choose 'Basic'.">
@@ -11,7 +14,7 @@
<option value="basic">Basic</option>
</param>
<when value="galaxy">
- <param name="species" type="select" label="Species of Annotation" multiple="False">
+ <param name="species" type="select" label="Species" multiple="False">
<options>
<filter type="data_meta" ref="maf_input" key="species" />
</options>
@@ -21,7 +24,6 @@
<param name="underlays_file" type="data" format="bed,gff" label="Underlays File" optional="True"/>
<param name="repeats_file" type="data" format="bed,gff" label="Repeats File" optional="True"/>
<param name="links_file" type="data" format="bed,gff" label="Links File" optional="True"/>
- <param name="offset" label="Offset" value="0" type="integer"/>
</when>
<when value="basic">
<param name="seq_name" label="Full Sequence Name" value="" type="text">
@@ -44,6 +46,7 @@
<option name="Skipping unsupported paragraph (maf_paragraph)" value="maf_paragraph"/>
<option name="Skipping all reconstruction scores: no species specified (recon_noseq)" value="recon_noseq"/>
<option name="Skipping reconstruction scores in blocks with missing row (recon_missing)" value="recon_missing"/>
+ <option name="The first row in some blocks is not the specified reference sequence (refseq_not_first)" value="refseq_not_first"/>
<option name="Skipping extra MAF File (unused_maf)" value="unused_maf"/>
</option>
<option name="Annotation Files" value="annotations">
@@ -71,12 +74,15 @@
</option>
<option name="Red Flags" value="red">
<option name="Sequence name in annotation file does not match name in MAF (seqname_mismatch)" value="seqname_mismatch"/>
- <option name="BED Start or end < 0 (bed_coord)" value="bed_coord"/>
- <option name="GFF Start or end < 1 (gff_coord)" value="gff_coord"/>
+ <option name="BED start or end < 0 (bed_coord)" value="bed_coord"/>
+ <option name="GFF start or end < 1 (gff_coord)" value="gff_coord"/>
<option name="Missing item name for URL substitution (url_subst)" value="url_subst"/>
</option>
</option>
<option name="Miscellaneous" value="miscellaneous">
+ <option name="No refseq specified; assuming 'first' (default_refseq)" value="default_refseq"/>
+ <option name="One or more bundle entries are not used in parameters file(unused_entry)" value="unused_entry"/>
+ <option name="Skipping blocks for export where reference sequence is hidden or all gaps (export_skip)" value="export_skip"/>
<option name="Possible parse error: token ends with an escaped quote (escaped_quote)" value="escaped_quote"/>
<option name="Draggable panel dividers will not be sticky (no_sticky)" value="no_sticky"/>
</option>
@@ -89,11 +95,7 @@
title = "Galaxy: $maf_input.name"
alignfile = input.maf
-#if $refseq.value:
refseq = $refseq
-#else:
-refseq = any
-#end if
tabext = .bed .gff .gtf
#if $nowarn.value:
nowarn = $nowarn
@@ -102,36 +104,35 @@
#set $seq_count = 0
#for $annotation_count, $annotation in $enumerate( $annotations ):
#if $annotation.annotation_style.style == "galaxy":
-#if $maf_input.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.metadata.species_chromosomes and $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]:
-#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
-#set $aliases = [ " %s" % $chrom for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
+#if $maf_input.dataset.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.dataset.metadata.species_chromosomes and $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]:
+#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
#else:
#set $seq_names = [$annotation.annotation_style['species']]
-#set $aliases = [""]
#end if
#else:
#set $seq_names = [$annotation.annotation_style['seq_name']]
-#set $aliases = [""]
#end if
-#for $seq_name, $alias in $zip( $seq_names, $aliases ):
+#for $seq_name in $seq_names:
seq ${seq_count}:
seqname = $seq_name
#if $annotation.annotation_style['exons_file'].dataset:
-exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}$alias
+exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}
#end if
#if $annotation.annotation_style['repeats_file'].dataset:
-repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}$alias
+repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}
#end if
#if $annotation.annotation_style['links_file'].dataset:
-links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}$alias
+links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}
#end if
#if $annotation.annotation_style['underlays_file'].dataset:
-underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}$alias
+underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}
#end if
#if $annotation.annotation_style['highlights_file'].dataset:
-highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}$alias
+highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}
#end if
+#if $annotation.annotation_style.style == "basic":
offset = $annotation.annotation_style['offset']
+#end if
#set $seq_count = $seq_count + 1
#end for
1
0

[hg] galaxy 1520: Fix a bug in shrimp_wrapper and add a tool for...
by greg@scofield.bx.psu.edu 22 Sep '08
by greg@scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/9ef55e79068b
changeset: 1520:9ef55e79068b
user: wychung
date: Fri Sep 19 12:02:13 2008 -0400
description:
Fix a bug in shrimp_wrapper and add a tool for splitting paired-end reads.
Update datatype/fastqsolexa so the number of sequences is correct.
7 file(s) affected in this change:
lib/galaxy/datatypes/sequence.py
test-data/split_paired_reads_test1.fastq
test-data/split_paired_reads_test1.out1
tool_conf.xml.sample
tools/metag_tools/shrimp_wrapper.py
tools/metag_tools/split_paired_reads.py
tools/metag_tools/split_paired_reads.xml
diffs (216 lines):
diff -r 0f735b21dc12 -r 9ef55e79068b lib/galaxy/datatypes/sequence.py
--- a/lib/galaxy/datatypes/sequence.py Thu Sep 18 16:48:29 2008 -0400
+++ b/lib/galaxy/datatypes/sequence.py Fri Sep 19 12:02:13 2008 -0400
@@ -98,8 +98,8 @@
dataset.peek = data.get_file_peek( dataset.file_name )
count = size = 0
bases_regexp = re.compile("^[NGTAC]*$")
- for line in file( dataset.file_name ):
- if line and line[0] == "@":
+ for i, line in enumerate(file( dataset.file_name )):
+ if line and line[0] == "@" and i % 4 == 0:
count += 1
elif bases_regexp.match(line):
line = line.strip()
diff -r 0f735b21dc12 -r 9ef55e79068b test-data/split_paired_reads_test1.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/split_paired_reads_test1.fastq Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,21 @@
+@HWI-EAS91_1_30788AAXX:7:21:1542:1758
+GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
++HWI-EAS91_1_30788AAXX:7:21:1542:1758
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+@HWI-EAS91_1_30788AAXX:7:22:1621:462
+ATAATGGCTATTATTGTGGGGGGGATGATGCTGGAAACTAGCCCCAATATCAATCCTATATCAAATCTCACC
++HWI-EAS91_1_30788AAXX:7:22:1621:462
+hhhhhhhhhhhhQAhh@hhhhNhhhfhMbCIScC?hhJhhhhChhhJhhhRhhKhePhc\KhhV\KhXhJhh
+@HWI-EAS91_1_30788AAXX:7:45:408:807
+TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTTATGAGTGCTAGGATCAGGATGGAGAGGATTAGGGCT
++HWI-EAS91_1_30788AAXX:7:45:408:807
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hhhZh`hhhhhRXhhYh
+@HWI-EAS91_1_30788AAXX:7:49:654:1439
+CTAACTCTATTTATTGTATTTCAACTAAAAATCTCATAGGTTTATTGATAGTTGTGTTGTTGGTGTAAATGG
++HWI-EAS91_1_30788AAXX:7:49:654:1439
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhdhh_hG\XhU@
+@HWI-EAS91_1_30788AAXX:7:64:947:234
+TATCAAAAAAGAATATAATCTGAATCAACACTACAACCTATTAGTGTGTAGAATAGGAAGTAGAGGCCTGCG
++HWI-EAS91_1_30788AAXX:7:64:947:234
+hhhhhhhhhhhhhhhhhhhhhhhRhhehhahhhhhJhhhhhhhh^hPhWfhhhhThWUhhfhh_hhNIVPUd
+
diff -r 0f735b21dc12 -r 9ef55e79068b test-data/split_paired_reads_test1.out1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/split_paired_reads_test1.out1 Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,20 @@
+@HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC
++HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:22:1621:462/1
+ATAATGGCTATTATTGTGGGGGGGATGATGCTGGAA
++HWI-EAS91_1_30788AAXX:7:22:1621:462/1
+hhhhhhhhhhhhQAhh@hhhhNhhhfhMbCIScC?h
+@HWI-EAS91_1_30788AAXX:7:45:408:807/1
+TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTT
++HWI-EAS91_1_30788AAXX:7:45:408:807/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:49:654:1439/1
+CTAACTCTATTTATTGTATTTCAACTAAAAATCTCA
++HWI-EAS91_1_30788AAXX:7:49:654:1439/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:64:947:234/1
+TATCAAAAAAGAATATAATCTGAATCAACACTACAA
++HWI-EAS91_1_30788AAXX:7:64:947:234/1
+hhhhhhhhhhhhhhhhhhhhhhhRhhehhahhhhhJ
diff -r 0f735b21dc12 -r 9ef55e79068b tool_conf.xml.sample
--- a/tool_conf.xml.sample Thu Sep 18 16:48:29 2008 -0400
+++ b/tool_conf.xml.sample Fri Sep 19 12:02:13 2008 -0400
@@ -274,6 +274,7 @@
<tool file="metag_tools/short_reads_figure_high_quality_length.xml" />
<tool file="metag_tools/short_reads_trim_seq.xml" />
<tool file="metag_tools/blat_coverage_report.xml" />
+ <tool file="metag_tools/split_paired_reads.xml" />
</section>
<section name="Short Read Mapping" id="solexa_tools">
<tool file="metag_tools/shrimp_wrapper.xml" />
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/shrimp_wrapper.py
--- a/tools/metag_tools/shrimp_wrapper.py Thu Sep 18 16:48:29 2008 -0400
+++ b/tools/metag_tools/shrimp_wrapper.py Fri Sep 19 12:02:13 2008 -0400
@@ -162,6 +162,7 @@
readname, endindex = line[1:].split('/')
else:
score = line
+
if score: # the last one
if hits.has_key(readname):
if len(hits[readname]) == hit_per_read:
@@ -182,8 +183,9 @@
match_count = 0
if hit_per_read == 1:
- matches = [ hits[readkey]['1'] ]
- match_count = 1
+ if len(hits[readkey]['1']) == 1:
+ matches = [ hits[readkey]['1'] ]
+ match_count = 1
else:
end1_data = hits[readkey]['1']
end2_data = hits[readkey]['2']
@@ -591,6 +593,7 @@
if os.path.exists(query_qual_end2): os.remove(query_qual_end2)
if os.path.exists(shrimp_log): os.remove(shrimp_log)
+
if __name__ == '__main__': __main__()
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/split_paired_reads.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/split_paired_reads.py Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,46 @@
+#! /usr/bin/python
+
+"""
+Split Solexa paired end reads
+"""
+
+import os, sys
+
+if __name__ == '__main__':
+
+ infile = sys.argv[1]
+ outfile_end1 = open(sys.argv[2], 'w')
+ outfile_end2 = open(sys.argv[3], 'w')
+
+ for i, line in enumerate(file(infile)):
+ line = line.rstrip()
+ if not line or line.startswith('#'): continue
+
+ end1 = ''
+ end2 = ''
+
+ line_index = i % 4
+
+ if line_index == 0:
+ end1 = line + '/1'
+ end2 = line + '/2'
+
+ elif line_index == 1:
+ seq_len = len(line)/2
+ end1 = line[0:seq_len]
+ end2 = line[seq_len:]
+
+ elif line_index == 2:
+ end1 = line + '/1'
+ end2 = line + '/2'
+
+ else:
+ qual_len = len(line)/2
+ end1 = line[0:qual_len]
+ end2 = line[qual_len:]
+
+ outfile_end1.write('%s\n' %(end1))
+ outfile_end2.write('%s\n' %(end2))
+
+ outfile_end1.close()
+ outfile_end2.close()
\ No newline at end of file
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/split_paired_reads.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/split_paired_reads.xml Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,56 @@
+<tool id="split_paired_reads" name="Split" version="1.0.0">
+ <description>paired-end reads into two ends</description>
+ <command interpreter="python">
+ split_paired_reads.py $input $output1 $output2
+ </command>
+ <inputs>
+ <param name="input" type="data" format="fastqsolexa" label="Your paired-end file" />
+ </inputs>
+ <outputs>
+ <data name="output1" format="fastqsolexa"/>
+ <data name="output2" format="fastqsolexa"/>
+ </outputs>
+ <tests>
+ <test>
+ <param name="input" value="split_paired_reads_test1.fastq" ftype="fastqsolexa" />
+ <output name="output1" file="split_paired_reads_test1.out1" fype="fastqsolexa" />
+ </test>
+ </tests>
+<help>
+
+**What it does**
+
+This tool splits a single paired-end file in half and returns two files with each ends.
+
+-----
+
+**Input formats**
+
+A multiple-fastq file, for example::
+
+ @HWI-EAS91_1_30788AAXX:7:21:1542:1758
+ GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+ +HWI-EAS91_1_30788AAXX:7:21:1542:1758
+ hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+
+-----
+
+**Outputs**
+
+One end::
+
+ @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+ GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC
+ +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+ hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+
+The other end::
+
+ @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+ GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+ +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+ hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+</help>
+</tool>
1
0

[hg] galaxy 1521: Merge with b2a9827178e28d93e2a978f64033a556a72...
by greg@scofield.bx.psu.edu 22 Sep '08
by greg@scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/618210a97e62
changeset: 1521:618210a97e62
user: wychung
date: Fri Sep 19 12:34:51 2008 -0400
description:
Merge with b2a9827178e28d93e2a978f64033a556a72b4c51
0 file(s) affected in this change:
diffs (117 lines):
diff -r 9ef55e79068b -r 618210a97e62 tools/visualization/GMAJ.xml
--- a/tools/visualization/GMAJ.xml Fri Sep 19 12:02:13 2008 -0400
+++ b/tools/visualization/GMAJ.xml Fri Sep 19 12:34:51 2008 -0400
@@ -3,7 +3,10 @@
<command interpreter="python">GMAJ.py $out_file1 $maf_input $gmaj_file $filenames_file</command>
<inputs>
<param name="maf_input" type="data" format="maf" label="Alignment File" optional="False"/>
- <param name="refseq" label="Reference Sequence" value="" type="text" help="Leave empty to allow interactive selection."/>
+ <param name="refseq" label="Reference Sequence" type="select">
+ <option value="first" selected="true">First sequence in each block</option>
+ <option value="any">Any sequence</option>
+ </param>
<repeat name="annotations" title="Annotations">
<conditional name="annotation_style">
<param name="style" type="select" label="Annotation Style" help="If your data is not in a style similar to what is available from Galaxy (and the UCSC table browser), choose 'Basic'.">
@@ -11,7 +14,7 @@
<option value="basic">Basic</option>
</param>
<when value="galaxy">
- <param name="species" type="select" label="Species of Annotation" multiple="False">
+ <param name="species" type="select" label="Species" multiple="False">
<options>
<filter type="data_meta" ref="maf_input" key="species" />
</options>
@@ -21,7 +24,6 @@
<param name="underlays_file" type="data" format="bed,gff" label="Underlays File" optional="True"/>
<param name="repeats_file" type="data" format="bed,gff" label="Repeats File" optional="True"/>
<param name="links_file" type="data" format="bed,gff" label="Links File" optional="True"/>
- <param name="offset" label="Offset" value="0" type="integer"/>
</when>
<when value="basic">
<param name="seq_name" label="Full Sequence Name" value="" type="text">
@@ -44,6 +46,7 @@
<option name="Skipping unsupported paragraph (maf_paragraph)" value="maf_paragraph"/>
<option name="Skipping all reconstruction scores: no species specified (recon_noseq)" value="recon_noseq"/>
<option name="Skipping reconstruction scores in blocks with missing row (recon_missing)" value="recon_missing"/>
+ <option name="The first row in some blocks is not the specified reference sequence (refseq_not_first)" value="refseq_not_first"/>
<option name="Skipping extra MAF File (unused_maf)" value="unused_maf"/>
</option>
<option name="Annotation Files" value="annotations">
@@ -71,12 +74,15 @@
</option>
<option name="Red Flags" value="red">
<option name="Sequence name in annotation file does not match name in MAF (seqname_mismatch)" value="seqname_mismatch"/>
- <option name="BED Start or end < 0 (bed_coord)" value="bed_coord"/>
- <option name="GFF Start or end < 1 (gff_coord)" value="gff_coord"/>
+ <option name="BED start or end < 0 (bed_coord)" value="bed_coord"/>
+ <option name="GFF start or end < 1 (gff_coord)" value="gff_coord"/>
<option name="Missing item name for URL substitution (url_subst)" value="url_subst"/>
</option>
</option>
<option name="Miscellaneous" value="miscellaneous">
+ <option name="No refseq specified; assuming 'first' (default_refseq)" value="default_refseq"/>
+ <option name="One or more bundle entries are not used in parameters file(unused_entry)" value="unused_entry"/>
+ <option name="Skipping blocks for export where reference sequence is hidden or all gaps (export_skip)" value="export_skip"/>
<option name="Possible parse error: token ends with an escaped quote (escaped_quote)" value="escaped_quote"/>
<option name="Draggable panel dividers will not be sticky (no_sticky)" value="no_sticky"/>
</option>
@@ -89,11 +95,7 @@
title = "Galaxy: $maf_input.name"
alignfile = input.maf
-#if $refseq.value:
refseq = $refseq
-#else:
-refseq = any
-#end if
tabext = .bed .gff .gtf
#if $nowarn.value:
nowarn = $nowarn
@@ -102,36 +104,35 @@
#set $seq_count = 0
#for $annotation_count, $annotation in $enumerate( $annotations ):
#if $annotation.annotation_style.style == "galaxy":
-#if $maf_input.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.metadata.species_chromosomes and $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]:
-#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
-#set $aliases = [ " %s" % $chrom for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
+#if $maf_input.dataset.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.dataset.metadata.species_chromosomes and $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]:
+#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
#else:
#set $seq_names = [$annotation.annotation_style['species']]
-#set $aliases = [""]
#end if
#else:
#set $seq_names = [$annotation.annotation_style['seq_name']]
-#set $aliases = [""]
#end if
-#for $seq_name, $alias in $zip( $seq_names, $aliases ):
+#for $seq_name in $seq_names:
seq ${seq_count}:
seqname = $seq_name
#if $annotation.annotation_style['exons_file'].dataset:
-exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}$alias
+exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}
#end if
#if $annotation.annotation_style['repeats_file'].dataset:
-repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}$alias
+repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}
#end if
#if $annotation.annotation_style['links_file'].dataset:
-links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}$alias
+links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}
#end if
#if $annotation.annotation_style['underlays_file'].dataset:
-underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}$alias
+underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}
#end if
#if $annotation.annotation_style['highlights_file'].dataset:
-highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}$alias
+highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}
#end if
+#if $annotation.annotation_style.style == "basic":
offset = $annotation.annotation_style['offset']
+#end if
#set $seq_count = $seq_count + 1
#end for
1
0

[hg] galaxy 1518: Add a wrapper for metadata inside of DatasetFi...
by greg@scofield.bx.psu.edu 22 Sep '08
by greg@scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/0f735b21dc12
changeset: 1518:0f735b21dc12
user: Dan Blankenberg <dan(a)bx.psu.edu>
date: Thu Sep 18 16:48:29 2008 -0400
description:
Add a wrapper for metadata inside of DatasetFilenameWrapper to allow proper string substitution in
commandline and templates.
2 file(s) affected in this change:
lib/galaxy/datatypes/metadata.py
lib/galaxy/tools/__init__.py
diffs (56 lines):
diff -r 1d326855ba89 -r 0f735b21dc12 lib/galaxy/datatypes/metadata.py
--- a/lib/galaxy/datatypes/metadata.py Thu Sep 18 15:41:23 2008 -0400
+++ b/lib/galaxy/datatypes/metadata.py Thu Sep 18 16:48:29 2008 -0400
@@ -211,6 +211,9 @@
elif not isinstance(value, list):
MetadataParameter.__setattr__(self, name, [value])
+ def __iter__( self ):
+ return iter( self.value )
+
def __str__(self):
if self.value in [None, []]:
return str(self.spec.no_value)
diff -r 1d326855ba89 -r 0f735b21dc12 lib/galaxy/tools/__init__.py
--- a/lib/galaxy/tools/__init__.py Thu Sep 18 15:41:23 2008 -0400
+++ b/lib/galaxy/tools/__init__.py Thu Sep 18 16:48:29 2008 -0400
@@ -1177,6 +1177,31 @@
Wraps a dataset so that __str__ returns the filename, but all other
attributes are accessible.
"""
+
+ class MetadataWrapper:
+ """
+ Wraps a Metadata Collection to return MetadataParameters wrapped according to the metadata spec.
+ Methods implemented to match behavior of a Metadata Collection.
+ """
+ def __init__( self, metadata ):
+ self.metadata = metadata
+ def __getattr__( self, name ):
+ rval = self.metadata.get( name, None )
+ if name in self.metadata.spec:
+ rval = self.metadata.spec[name].wrap( rval, self.metadata.parent )
+ return rval
+ def __nonzero__( self ):
+ return self.metadata.__nonzero__()
+ def __iter__( self ):
+ return self.metadata.__iter__()
+ def get( self, key, default=None ):
+ try:
+ return getattr( self, key )
+ except:
+ return default
+ def items( self ):
+ return iter( [ ( k, self.get( k ) ) for k, v in self.metadata.items() ] )
+
def __init__( self, dataset, datatypes_registry = None, tool = None, name = None ):
if not dataset:
try:
@@ -1187,6 +1212,7 @@
self.dataset = NoneDataset( datatypes_registry = datatypes_registry, ext = ext )
else:
self.dataset = dataset
+ self.metadata = self.MetadataWrapper( dataset.metadata )
def __str__( self ):
return self.dataset.file_name
def __getattr__( self, key ):
1
0

[hg] galaxy 1522: Adding a new set of tools to perform multiple...
by greg@scofield.bx.psu.edu 22 Sep '08
by greg@scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/05974294cbf1
changeset: 1522:05974294cbf1
user: guru
date: Sat Sep 20 18:14:24 2008 -0400
description:
Adding a new set of tools to perform multiple linear regression analysis.
9 file(s) affected in this change:
test-data/rcve_out.dat
test-data/reg_inp.tab
tool_conf.xml.sample
tools/regVariation/best_regression_subsets.py
tools/regVariation/best_regression_subsets.xml
tools/regVariation/linear_regression.py
tools/regVariation/linear_regression.xml
tools/regVariation/rcve.py
tools/regVariation/rcve.xml
diffs (700 lines):
diff -r 618210a97e62 -r 05974294cbf1 test-data/rcve_out.dat
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rcve_out.dat Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,8 @@
+#Model R-sq RCVE_Terms RCVE_Value
+2 3 4 0.3997 - -
+3 4 0.3319 2 0.1697
+2 4 0.2974 3 0.2561
+2 3 0.3985 4 0.0031
+4 0.1226 2 3 0.6934
+3 0.2733 2 4 0.3164
+2 0.2972 3 4 0.2564
diff -r 618210a97e62 -r 05974294cbf1 test-data/reg_inp.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/reg_inp.tab Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,100 @@
+2.04 2.01 1070 5
+2.56 3.40 1254 6
+3.75 3.68 1466 6
+1.10 1.54 706 4
+3.00 3.32 1160 5
+0.05 0.33 756 3
+1.38 0.36 1058 2
+1.50 1.97 1008 7
+1.38 2.03 1104 4
+4.01 2.05 1200 7
+1.50 2.13 896 7
+1.29 1.34 848 3
+1.90 1.51 958 5
+3.11 3.12 1246 6
+1.92 2.14 1106 4
+0.81 2.60 790 5
+1.01 1.90 954 4
+3.66 3.06 1500 6
+2.00 1.60 1046 5
+2.05 1.96 1054 4
+2.60 1.96 1198 6
+2.55 1.56 940 3
+0.38 1.60 456 6
+2.48 1.92 1150 7
+2.74 3.09 636 6
+1.77 0.78 744 5
+1.61 2.12 644 5
+0.99 1.85 842 3
+1.62 1.78 852 5
+2.03 1.03 1170 3
+3.50 3.44 1034 10
+3.18 2.42 1202 5
+2.39 1.74 1018 5
+1.48 1.89 1180 5
+1.54 1.43 952 3
+1.57 1.64 1038 4
+2.46 2.69 1090 6
+2.42 1.79 694 5
+2.11 2.72 1096 6
+2.04 2.15 1114 5
+1.68 2.22 1256 6
+1.64 1.55 1208 5
+2.41 2.34 820 6
+2.10 2.92 1222 4
+1.40 2.10 1120 5
+2.03 1.64 886 4
+1.99 2.83 1126 7
+2.24 1.76 1158 4
+0.45 1.81 676 6
+2.31 2.68 1214 7
+2.41 2.55 1136 6
+2.56 2.70 1264 6
+2.50 1.66 1116 3
+2.92 2.23 1292 4
+2.35 2.01 604 5
+2.82 1.24 854 6
+1.80 1.95 814 6
+1.29 1.73 778 3
+1.68 1.08 800 2
+3.44 3.46 1424 7
+1.90 3.01 950 6
+2.06 0.54 1056 3
+3.30 3.20 956 8
+1.80 1.50 1352 5
+2.00 1.71 852 5
+1.68 1.99 1168 5
+1.94 2.76 970 6
+0.97 1.56 776 4
+1.12 1.78 854 6
+1.31 1.32 1232 5
+1.68 0.87 1140 6
+3.09 1.75 1084 4
+1.87 1.41 954 2
+2.00 2.77 1000 4
+2.39 1.78 1084 4
+1.50 1.34 1058 4
+1.82 1.52 816 5
+1.80 2.97 1146 7
+2.01 1.75 1000 6
+1.88 1.64 856 4
+1.64 1.80 798 4
+2.42 3.37 1324 6
+0.22 1.15 704 6
+2.31 1.72 1222 5
+0.95 2.27 948 6
+1.99 2.85 1182 8
+1.86 2.21 1000 6
+1.79 1.94 910 6
+3.02 4.25 1374 9
+1.85 1.83 1014 6
+1.98 2.75 1420 7
+2.15 1.71 400 6
+1.46 2.20 998 7
+2.29 2.13 776 6
+2.39 2.38 1134 7
+1.80 1.64 772 4
+2.64 1.87 1304 6
+2.08 2.53 1212 4
+0.70 1.78 818 6
+0.89 1.20 864 2
\ No newline at end of file
diff -r 618210a97e62 -r 05974294cbf1 tool_conf.xml.sample
--- a/tool_conf.xml.sample Fri Sep 19 12:34:51 2008 -0400
+++ b/tool_conf.xml.sample Sat Sep 20 18:14:24 2008 -0400
@@ -128,6 +128,11 @@
<tool file="regVariation/getIndels_2way.xml" />
<tool file="regVariation/getIndels_3way.xml" />
<tool file="regVariation/getIndelRates_3way.xml" />
+ </section>
+ <section name="Multiple regression" id="multReg">
+ <tool file="regVariation/linear_regression.xml" />
+ <tool file="regVariation/best_regression_subsets.xml" />
+ <tool file="regVariation/rcve.xml" />
</section>
<section name="Evolution: HyPhy" id="hyphy">
<tool file="hyphy/hyphy_branch_lengths_wrapper.xml" />
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/best_regression_subsets.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/best_regression_subsets.py Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):
+ sys.stderr.write(msg)
+ sys.exit()
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+outfile2 = sys.argv[5]
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+
+for i, line in enumerate( file ( infile )):
+ line = line.rstrip('\r\n')
+ if len( line )>0 and not line.startswith( '#' ):
+ elems = line.split( '\t' )
+ break
+ if i == 30:
+ break # Hopefully we'll never get here...
+
+if len( elems )<1:
+ stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+ x_cols[k] = int(col)-1
+ x_vals.append([])
+
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+ if line and not line.startswith( '#' ):
+ try:
+ fields = line.split("\t")
+ try:
+ yval = float(fields[y_col])
+ except Exception, ey:
+ yval = r('NA')
+ y_vals.append(yval)
+ for k,col in enumerate(x_cols):
+ try:
+ xval = float(fields[col])
+ except Exception, ex:
+ xval = r('NA')
+ x_vals[k].append(xval)
+ except:
+ pass
+
+response_term = ""
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+r.library("leaps")
+
+set_default_mode(NO_CONVERSION)
+try:
+ leaps = r.regsubsets(r("y ~ x"), data= r.na_exclude(dat))
+except RException, rex:
+ stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
+set_default_mode(BASIC_CONVERSION)
+
+summary = r.summary(leaps)
+tot = len(x_vals)
+pattern = "["
+for i in range(tot):
+ pattern = pattern + 'c' + str(int(x_cols[int(i)]) + 1) + ' '
+pattern = pattern.strip() + ']'
+print >>fout, "#Vars\t%s\tR-sq\tAdj. R-sq\tC-p\tbic" %(pattern)
+for ind,item in enumerate(summary['outmat']):
+ print >>fout, "%s\t%s\t%s\t%s\t%s\t%s" %(str(item).count('*'), item, summary['rsq'][ind], summary['adjr2'][ind], summary['cp'][ind], summary['bic'][ind])
+
+
+r.pdf( outfile2, 8, 8 )
+r.plot(leaps, scale="Cp", main="Best subsets using Cp Criterion")
+r.plot(leaps, scale="r2", main="Best subsets using R-sq Criterion")
+r.plot(leaps, scale="adjr2", main="Best subsets using Adjusted R-sq Criterion")
+r.plot(leaps, scale="bic", main="Best subsets using bic Criterion")
+
+r.dev_off()
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/best_regression_subsets.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/best_regression_subsets.xml Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,64 @@
+<tool id="BestSubsetsRegression1" name="Perform Best-subsets Regression">
+ <description> </description>
+ <command interpreter="python">
+ best_regression_subsets.py
+ $input1
+ $response_col
+ $predictor_cols
+ $out_file1
+ $out_file2
+ 1>/dev/null
+ 2>/dev/null
+ </command>
+ <inputs>
+ <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/>
+ <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
+ <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" />
+ </inputs>
+ <outputs>
+ <data format="input" name="out_file1" metadata_source="input1" />
+ <data format="pdf" name="out_file2" />
+ </outputs>
+ <requirements>
+ <requirement type="python-module">rpy</requirement>
+ </requirements>
+ <tests>
+ <!-- Testing this tool will not be possible because this tool produces a pdf output file.
+ -->
+ </tests>
+ <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses the 'regsubsets' function from R statistical package for regression subset selection. It outputs two files, one containing a table with the best subsets and the corresponding summary statistics, and the other containing the graphical representation of the results.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- This tool currently treats all predictor and response variables as continuous variables.
+
+- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis.
+
+- The 6 columns in the output are described below:
+
+ - Column 1 (Vars): denotes the number of variables in the model
+ - Column 2 ([c2 c3 c4...]): represents a list of the user-selected predictor variables (full model). An asterisk denotes the presence of the corresponding predictor variable in the selected model.
+ - Column 3 (R-sq): the fraction of variance explained by the model
+ - Column 4 (Adj. R-sq): the above R-squared statistic adjusted, penalizing for higher number of predictors (p)
+ - Column 5 (Cp): Mallow's Cp statistics
+ - Column 6 (bic): Bayesian Information Criterion.
+
+
+ </help>
+</tool>
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/linear_regression.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/linear_regression.py Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):
+ sys.stderr.write(msg)
+ sys.exit()
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+outfile2 = sys.argv[5]
+
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+
+for i, line in enumerate( file ( infile )):
+ line = line.rstrip('\r\n')
+ if len( line )>0 and not line.startswith( '#' ):
+ elems = line.split( '\t' )
+ break
+ if i == 30:
+ break # Hopefully we'll never get here...
+
+if len( elems )<1:
+ stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+ x_cols[k] = int(col)-1
+ x_vals.append([])
+
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+ if line and not line.startswith( '#' ):
+ try:
+ fields = line.split("\t")
+ try:
+ yval = float(fields[y_col])
+ except:
+ yval = r('NA')
+ y_vals.append(yval)
+ for k,col in enumerate(x_cols):
+ try:
+ xval = float(fields[col])
+ except:
+ xval = r('NA')
+ x_vals[k].append(xval)
+ except:
+ pass
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+set_default_mode(NO_CONVERSION)
+try:
+ linear_model = r.lm(r("y ~ x"), data = r.na_exclude(dat))
+except RException, rex:
+ stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.")
+set_default_mode(BASIC_CONVERSION)
+
+coeffs=linear_model.as_py()['coefficients']
+yintercept= coeffs['(Intercept)']
+print >>fout, "Y-intercept\t%s" %(yintercept)
+summary = r.summary(linear_model)
+
+co = summary.get('coefficients', 'NA')
+"""
+if len(co) != len(x_vals)+1:
+ stop_err("Stopped performing linear regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.")
+"""
+print >>fout, "p-value (Y-intercept)\t%s" %(co[0][3])
+
+if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable
+ try:
+ slope = coeffs['x']
+ except:
+ slope = 'NA'
+ try:
+ pval = co[1][3]
+ except:
+ pval = 'NA'
+ print >>fout, "Slope (c%d)\t%s" %(x_cols[0]+1,slope)
+ print >>fout, "p-value (c%d)\t%s" %(x_cols[0]+1,pval)
+else: #Multiple regression case with >1 predictors
+ ind=1
+ while ind < len(coeffs.keys()):
+ print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,coeffs['x'+str(ind)])
+ try:
+ pval = co[ind][3]
+ except:
+ pval = 'NA'
+ print >>fout, "p-value (c%d)\t%s" %(x_cols[ind-1]+1,pval)
+ ind+=1
+
+print >>fout, "R-squared\t%s" %(summary.get('r.squared','NA'))
+print >>fout, "Adjusted R-squared\t%s" %(summary.get('adj.r.squared','NA'))
+print >>fout, "F-statistic\t%s" %(summary.get('fstatistic','NA'))
+print >>fout, "Sigma\t%s" %(summary.get('sigma','NA'))
+
+r.pdf( outfile2, 8, 8 )
+if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable
+ sub_title = "Slope = %s; Y-int = %s" %(slope,yintercept)
+ r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression")
+ r.abline(a=yintercept, b=slope, col="red")
+else:
+ r.pairs(dat, main="Scatterplot Matrix", col="blue")
+
+r.plot(linear_model)
+r.dev_off()
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/linear_regression.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/linear_regression.xml Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,62 @@
+<tool id="LinearRegression1" name="Perform Linear Regression">
+ <description> </description>
+ <command interpreter="python">
+ linear_regression.py
+ $input1
+ $response_col
+ $predictor_cols
+ $out_file1
+ $out_file2
+ 1>/dev/null
+ </command>
+ <inputs>
+ <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/>
+ <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
+ <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" />
+ </inputs>
+ <outputs>
+ <data format="input" name="out_file1" metadata_source="input1" />
+ <data format="pdf" name="out_file2" />
+ </outputs>
+ <requirements>
+ <requirement type="python-module">rpy</requirement>
+ </requirements>
+ <tests>
+ <!-- Testing this tool will not be possible because this tool produces a pdf output file.
+ -->
+ </tests>
+ <help>
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses the 'lm' function from R statistical package to perform linear regression on the input data. It outputs two files, one containing the summary statistics of the performed regression, and the other containing diagnostic plots to check whether model assumptions are satisfied.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- This tool currently treats all predictor and response variables as continuous variables.
+
+- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis.
+
+- The summary statistics in the output are described below:
+
+ - sigma: the square root of the estimated variance of the random error (standard error of the residuals)
+ - R-squared: the fraction of variance explained by the model
+ - Adjusted R-squared: the above R-squared statistic adjusted, penalizing for the number of the predictors (p)
+ - p-value: p-value for the t-test of the null hypothesis that the corresponding slope is equal to zero against the two-sided alternative.
+
+
+ </help>
+</tool>
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/rcve.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/rcve.py Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):
+ sys.stderr.write(msg)
+ sys.exit()
+
+def sscombs(s):
+ if len(s) == 1:
+ return [s]
+ else:
+ ssc = sscombs(s[1:])
+ return [s[0]] + [s[0]+comb for comb in ssc] + ssc
+
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+
+for i, line in enumerate( file ( infile )):
+ line = line.rstrip('\r\n')
+ if len( line )>0 and not line.startswith( '#' ):
+ elems = line.split( '\t' )
+ break
+ if i == 30:
+ break # Hopefully we'll never get here...
+
+if len( elems )<1:
+ stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+ x_cols[k] = int(col)-1
+ x_vals.append([])
+ """
+ try:
+ float( elems[x_cols[k]] )
+ except:
+ try:
+ msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] )
+ except:
+ msg = "This operation cannot be performed on non-numeric data."
+ stop_err( msg )
+ """
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+ if line and not line.startswith( '#' ):
+ try:
+ fields = line.split("\t")
+ try:
+ yval = float(fields[y_col])
+ except Exception, ey:
+ yval = r('NA')
+ #print >>sys.stderr, "ey = %s" %ey
+ y_vals.append(yval)
+ for k,col in enumerate(x_cols):
+ try:
+ xval = float(fields[col])
+ except Exception, ex:
+ xval = r('NA')
+ #print >>sys.stderr, "ex = %s" %ex
+ x_vals[k].append(xval)
+ except:
+ pass
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+set_default_mode(NO_CONVERSION)
+try:
+ full = r.lm(r("y ~ x"), data= r.na_exclude(dat)) #full model includes all the predictor variables specified by the user
+except RException, rex:
+ stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
+set_default_mode(BASIC_CONVERSION)
+
+summary = r.summary(full)
+fullr2 = summary.get('r.squared','NA')
+
+if fullr2 == 'NA':
+ stop_error("Error in linear regression")
+
+if len(x_vals) < 10:
+ s = ""
+ for ch in range(len(x_vals)):
+ s += str(ch)
+else:
+ stop_err("This tool only works with less than 10 predictors.")
+
+print >>fout, "#Model\tR-sq\tRCVE_Terms\tRCVE_Value"
+all_combos = sorted(sscombs(s), key=len)
+all_combos.reverse()
+for j,cols in enumerate(all_combos):
+ #if len(cols) == len(s): #Same as the full model above
+ # continue
+ if len(cols) == 1:
+ x_vals1 = x_vals[int(cols)]
+ else:
+ x_v = []
+ for col in cols:
+ x_v.append(x_vals[int(col)])
+ x_vals1 = numpy.asarray(x_v).transpose()
+ dat= r.list(x=array(x_vals1), y=y_vals)
+ set_default_mode(NO_CONVERSION)
+ red = r.lm(r("y ~ x"), data= dat) #Reduced model
+ set_default_mode(BASIC_CONVERSION)
+ summary = r.summary(red)
+ redr2 = summary.get('r.squared','NA')
+ try:
+ rcve = (float(fullr2)-float(redr2))/float(fullr2)
+ except:
+ rcve = 'NA'
+ col_str = ""
+ for col in cols:
+ col_str = col_str + str(int(x_cols[int(col)]) + 1) + " "
+ col_str.strip()
+ rcve_col_str = ""
+ for col in s:
+ if col not in cols:
+ rcve_col_str = rcve_col_str + str(int(x_cols[int(col)]) + 1) + " "
+ rcve_col_str.strip()
+ if len(cols) == len(s): #full model
+ rcve_col_str = "-"
+ rcve = "-"
+ try:
+ redr2 = "%.4f" %(float(redr2))
+ except:
+ pass
+ try:
+ rcve = "%.4f" %(float(rcve))
+ except:
+ pass
+ print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,rcve_col_str,rcve)
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/rcve.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/rcve.xml Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,68 @@
+<tool id="rcve1" name="Compute RCVE" version="1.0.0">
+ <description> </description>
+ <command interpreter="python">
+ rcve.py
+ $input1
+ $response_col
+ $predictor_cols
+ $out_file1
+ 1>/dev/null
+ </command>
+ <inputs>
+ <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/>
+ <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
+ <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" />
+ </inputs>
+ <outputs>
+ <data format="input" name="out_file1" metadata_source="input1" />
+ </outputs>
+ <requirements>
+ <requirement type="python-module">rpy</requirement>
+ </requirements>
+ <tests>
+ <!-- Test data with valid values -->
+ <test>
+ <param name="input1" value="reg_inp.tab"/>
+ <param name="response_col" value="1"/>
+ <param name="predictor_cols" value="2,3,4"/>
+ <output name="out_file1" file="rcve_out.dat"/>
+ </test>
+
+ </tests>
+ <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool computes the RCVE (Relative Contribution to Variance) for all possible variable subsets using the following formula:
+
+**RCVE(i) = [R-sq (full: 1,2,..,i..,p-1) - R-sq(without i: 1,2,...,p-1)] / R-sq (full: 1,2,..,i..,p-1)**,
+which denotes the case where the 'i'th predictor is dropped.
+
+
+In general,
+**RCVE(X+) = [R-sq (full: {X,X+}) - R-sq(reduced: {X})] / R-sq (full: {X,X+})**,
+where,
+
+- {X,X+} denotes the set of all predictors,
+- X+ is the set of predictors for which we compute RCVE (and therefore drop from the full model to obtain a reduced one),
+- {X} is the set of the predictors that are left in the reduced model after excluding {X+}
+
+
+The 4 columns in the output are described below:
+
+- Column 1 (Model): denotes the variables present in the model ({X})
+- Column 2 (R-sq): denotes the R-squared value corresponding to the model in Column 1
+- Column 3 (RCVE_Terms): denotes the variable/s for which RCVE is computed ({X+}). These are the variables that are absent in the reduced model in Column 1. A '-' in this column indicates that the model in Column 1 is the Full model.
+- Column 4 (RCVE): denotes the RCVE value corresponding to the variable/s in Column 3. A '-' in this column indicates that the model in Column 1 is the Full model.
+
+
+ </help>
+</tool>
1
0

[hg] galaxy 1510: Strip whitespace from columns in file for data...
by greg＠scofield.bx.psu.edu 22 Sep '08
by greg＠scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/f8e3770c23f6
changeset: 1510:f8e3770c23f6
user: Dan Blankenberg <dan(a)bx.psu.edu>
date: Tue Sep 16 14:10:53 2008 -0400
description:
Strip whitespace from columns in file for dataset_metadata_in_file validator.
1 file(s) affected in this change:
lib/galaxy/tools/parameters/validation.py
diffs (12 lines):
diff -r ec547440ec97 -r f8e3770c23f6 lib/galaxy/tools/parameters/validation.py
--- a/lib/galaxy/tools/parameters/validation.py Tue Sep 16 13:25:42 2008 -0400
+++ b/lib/galaxy/tools/parameters/validation.py Tue Sep 16 14:10:53 2008 -0400
@@ -247,7 +247,7 @@
if line_startswith is None or line.startswith( line_startswith ):
fields = line.split( '\t' )
if metadata_column < len( fields ):
- self.valid_values.append( fields[metadata_column] )
+ self.valid_values.append( fields[metadata_column].strip() )
def validate( self, value, history = None ):
if not value: return
if hasattr( value, "metadata" ):
1
0
details: http://www.bx.psu.edu/hg/galaxy/rev/f1da9b95549b
changeset: 1516:f1da9b95549b
user: Dan Blankenberg <dan(a)bx.psu.edu>
date: Thu Sep 18 15:24:51 2008 -0400
description:
Update to latest gmaj.
1 file(s) affected in this change:
static/gmaj/gmaj.jar
diffs (2 lines):
diff -r 280e8b68f845 -r f1da9b95549b static/gmaj/gmaj.jar
Binary file static/gmaj/gmaj.jar has changed
1
0

[hg] galaxy 1515: Forgot to update tool_conf.sample with the new...
by greg＠scofield.bx.psu.edu 22 Sep '08
by greg＠scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/280e8b68f845
changeset: 1515:280e8b68f845
user: guru
date: Wed Sep 17 17:14:59 2008 -0400
description:
Forgot to update tool_conf.sample with the new tool details.
1 file(s) affected in this change:
tool_conf.xml.sample
diffs (10 lines):
diff -r 33e06a98b6d8 -r 280e8b68f845 tool_conf.xml.sample
--- a/tool_conf.xml.sample Wed Sep 17 16:42:08 2008 -0400
+++ b/tool_conf.xml.sample Wed Sep 17 17:14:59 2008 -0400
@@ -281,5 +281,6 @@
<tool file="metag_tools/megablast_wrapper.xml" />
<tool file="metag_tools/megablast_xml_parser.xml" />
<tool file="metag_tools/blat_wrapper.xml" />
+ <tool file="metag_tools/mapping_to_ucsc.xml" />
</section>
</toolbox>
1
0

[hg] galaxy 1507: add SHRiMP mapper for short reads analysis.
by greg＠scofield.bx.psu.edu 22 Sep '08
by greg＠scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/842f1883cf53
changeset: 1507:842f1883cf53
user: wychung
date: Mon Sep 15 15:04:41 2008 -0400
description:
add SHRiMP mapper for short reads analysis.
6 file(s) affected in this change:
test-data/shrimp_phix_anc.fa
test-data/shrimp_wrapper_test1.fastq
test-data/shrimp_wrapper_test1.out1
tool_conf.xml.sample
tools/metag_tools/shrimp_wrapper.py
tools/metag_tools/shrimp_wrapper.xml
diffs (853 lines):
diff -r 26825f08d362 -r 842f1883cf53 test-data/shrimp_phix_anc.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/shrimp_phix_anc.fa Mon Sep 15 15:04:41 2008 -0400
@@ -0,0 +1,2 @@
+>PHIX174
+GAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTTTCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTTCGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCTTGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCGTCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTACGGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTACGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCaGAAGGAGTGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGCCCCTTACTT
GAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCATCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGACTCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTACTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAAGGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTTGGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACAACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGCTCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTTTCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGCATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCgTGATGTTATTTCTTCATTTGGAGGTAAAACCTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTTGATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGCCGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGACTAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTGTATGGCAACTTGCCGCCG
CGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGTTTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGAAGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGATTATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTTATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAACGCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGCTTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGTTCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTATATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTGTCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGCCTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTGAATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGCCGGGCAATAAtGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGTTTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTGCTATTGCTGGCGGTATTGCTTCTGCTC
TTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAAAGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCTGGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTGGTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGATAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTATCTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGGTTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGAGATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGACCAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTATGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCAAACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGACTTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTTCTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGATACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCGTCAGGATTGACACCCTCCCAATTGTATGTTTTCATG
CCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTTCTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTATTGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGCATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATGTTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGAATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGGGACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCCCTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATTGCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTaCTATTCAGCGTTTGATGAATGCAATGCGACAGGCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTTATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCGCAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGCCGTCTTCATTTCCATGCGGTGCAtTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCATCGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTT
GTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCA
diff -r 26825f08d362 -r 842f1883cf53 test-data/shrimp_wrapper_test1.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/shrimp_wrapper_test1.fastq Mon Sep 15 15:04:41 2008 -0400
@@ -0,0 +1,40 @@
+@HWI-EAS91_1_306UPAAXX:6:1:959:874
+GCGGGCTGCGACATAAAGCATACCGCCTGGGCGGCG
++HWI-EAS91_1_306UPAAXX:6:1:959:874
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:1630:1975
+GAAAGAAAATCAGCAACAGTGGCATCGATTTTACGG
++HWI-EAS91_1_306UPAAXX:6:1:1630:1975
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:770:994
+GCAGGCAGCGTGCTGCGAGTCTTTTCGAATGATAAG
++HWI-EAS91_1_306UPAAXX:6:1:770:994
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:1274:306
+GTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGC
++HWI-EAS91_1_306UPAAXX:6:1:1274:306
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh\h
+@HWI-EAS91_1_306UPAAXX:6:1:1339:209
+GTTTGGTCAGTTCCATCAACATCATAGCCAGATGCC
++HWI-EAS91_1_306UPAAXX:6:1:1339:209
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:203:1240
+GATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTA
++HWI-EAS91_1_306UPAAXX:6:1:203:1240
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:869:448
+GCTGGCCATCAGTTCGCGGATACCGGCGGCAAACAT
++HWI-EAS91_1_306UPAAXX:6:1:869:448
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhKhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:939:928
+GGAGGCCTCCAGCAATCTTGAACACTCATCCTTAAT
++HWI-EAS91_1_306UPAAXX:6:1:939:928
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:1756:1476
+GCGTAGAGGCTTTACTATTCAGCGTTTGATGAATGC
++HWI-EAS91_1_306UPAAXX:6:1:1756:1476
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_306UPAAXX:6:1:1528:181
+GGCTGGTCAGTATTTTACCAATGACCAAATCAAAGA
++HWI-EAS91_1_306UPAAXX:6:1:1528:181
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
diff -r 26825f08d362 -r 842f1883cf53 test-data/shrimp_wrapper_test1.out1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/shrimp_wrapper_test1.out1 Mon Sep 15 15:04:41 2008 -0400
@@ -0,0 +1,7 @@
+#FORMAT: readname contigname strand contigstart contigend readstart readend readlength score editstring
+>HWI-EAS91_1_306UPAAXX:6:1:1528:181 PHIX174 + 3644 3679 1 36 36 3600 36
+>HWI-EAS91_1_306UPAAXX:6:1:1756:1476 PHIX174 + 4505 4540 1 36 36 3600 36
+>HWI-EAS91_1_306UPAAXX:6:1:203:1240 PHIX174 + 310 345 1 36 36 3600 36
+>HWI-EAS91_1_306UPAAXX:6:1:1274:306 PHIX174 + 933 968 1 36 36 3600 36
+>HWI-EAS91_1_306UPAAXX:6:1:939:928 PHIX174 - 4458 4493 1 36 36 3600 36
+>HWI-EAS91_1_306UPAAXX:6:1:1339:209 PHIX174 - 1732 1767 1 36 36 3600 36
diff -r 26825f08d362 -r 842f1883cf53 tool_conf.xml.sample
--- a/tool_conf.xml.sample Sun Sep 14 14:58:50 2008 -0400
+++ b/tool_conf.xml.sample Mon Sep 15 15:04:41 2008 -0400
@@ -276,6 +276,7 @@
<tool file="metag_tools/blat_coverage_report.xml" />
</section>
<section name="Short Read Mapping" id="solexa_tools">
+ <tool file="metag_tools/shrimp_wrapper.xml" />
<tool file="sr_mapping/lastz_wrapper.xml" />
<tool file="metag_tools/megablast_wrapper.xml" />
<tool file="metag_tools/megablast_xml_parser.xml" />
diff -r 26825f08d362 -r 842f1883cf53 tools/metag_tools/shrimp_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/shrimp_wrapper.py Mon Sep 15 15:04:41 2008 -0400
@@ -0,0 +1,577 @@
+#! /usr/bin/python
+
+"""
+SHRiMP wrapper
+
+Inputs:
+ reference seq and reads
+
+Outputs:
+ table of 8 columns:
+ chrom ref_loc read_id read_loc ref_nuc read_nuc quality coverage
+ SHRiMP output
+
+Parameters:
+ -s Spaced Seed (default: 111111011111)
+ -n Seed Matches per Window (default: 2)
+ -t Seed Hit Taboo Length (default: 4)
+ -9 Seed Generation Taboo Length (default: 0)
+ -w Seed Window Length (default: 115.00%)
+ -o Maximum Hits per Read (default: 100)
+ -r Maximum Read Length (default: 1000)
+ -d Kmer Std. Deviation Limit (default: -1 [None])
+
+ -m S-W Match Value (default: 100)
+ -i S-W Mismatch Value (default: -150)
+ -g S-W Gap Open Penalty (Reference) (default: -400)
+ -q S-W Gap Open Penalty (Query) (default: -400)
+ -e S-W Gap Extend Penalty (Reference) (default: -70)
+ -f S-W Gap Extend Penalty (Query) (default: -70)
+ -h S-W Hit Threshold (default: 68.00%)
+
+Command:
+%rmapper -s spaced_seed -n seed_matches_per_window -t seed_hit_taboo_length -9 seed_generation_taboo_length -w seed_window_length -o max_hits_per_read -r max_read_length -d kmer -m sw_match_value -i sw_mismatch_value -g sw_gap_open_ref -q sw_gap_open_query -e sw_gap_ext_ref -f sw_gap_ext_query -h sw_hit_threshold <query> <target> > <output> 2> <log>
+
+SHRiMP output:
+>7:2:1147:982/1 chr3 + 36586562 36586595 2 35 36 2900 3G16G13
+>7:2:1147:982/1 chr3 + 95338194 95338225 4 35 36 2700 9T7C14
+>7:2:587:93/1 chr3 + 14913541 14913577 1 35 36 2960 19--16
+
+Testing:
+%python shrimp_wrapper.py single ~/Desktop/shrimp_wrapper/phix_anc.fa tmp tmp1 ~/Desktop/shrimp_wrapper/phix.10.solexa.fastq
+%python shrimp_wrapper.py paired ~/Desktop/shrimp_wrapper/eca_ref_chrMT.fa tmp tmp1 ~/Desktop/shrimp_wrapper/eca.5.solexa_1.fastq ~/Desktop/shrimp_wrapper/eca.5.solexa_2.fastq
+
+"""
+
+import os, sys, tempfile, os.path
+
+assert sys.version_info[:2] >= (2.4)
+
+def stop_err( msg ):
+
+ sys.stderr.write( "%s\n" % msg )
+ sys.exit()
+
+def reverse_complement(s):
+
+ complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":".", "-":"-"}
+ reversed_s = []
+ for i in s:
+ reversed_s.append(complement_dna[i])
+ reversed_s.reverse()
+ return "".join(reversed_s)
+
+def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read):
+
+ """
+ TODO: the cross-over error has not been addressed yet.
+ """
+
+ insertion_size = 600
+
+ all_score_file = score_files.split('&')
+
+ if len(all_score_file) != hit_per_read: stop_err('Un-equal number of files!')
+
+ temp_table_name = tempfile.NamedTemporaryFile().name
+ temp_table = open(temp_table_name, 'w')
+
+ outfile = open(table_outfile,'w')
+
+ # reference seq: not a single fasta seq
+ refseq = {}
+ chrom_cov = {}
+ seq = ''
+
+ for i, line in enumerate(file(ref_file)):
+ line = line.rstrip()
+ if not line or line.startswith('#'): continue
+
+ if line.startswith('>'):
+ if seq:
+ if refseq.has_key(title):
+ pass
+ else:
+ refseq[title] = seq
+ chrom_cov[title] = {}
+ seq = ''
+ title = line[1:]
+ else:
+ seq += line
+ if seq:
+ if not refseq.has_key(title):
+ refseq[title] = seq
+ chrom_cov[title] = {}
+
+ # find hits : one end and/or the other
+ hits = {}
+ for i, line in enumerate(file(result_file)):
+ line = line.rstrip()
+ if not line or line.startswith('#'): continue
+
+ #FORMAT: readname contigname strand contigstart contigend readstart readend readlength score editstring
+ fields = line.split('\t')
+ readname = fields[0][1:]
+ chrom = fields[1]
+ strand = fields[2]
+ chrom_start = int(fields[3]) - 1
+ chrom_end = int(fields[4])
+ read_start = fields[5]
+ read_end = fields[6]
+ read_len = fields[7]
+ score = fields[8]
+ editstring = fields[9]
+
+ if hit_per_read == 1:
+ endindex = '1'
+ else:
+ readname, endindex = readname.split('/')
+
+ if hits.has_key(readname):
+ if hits[readname].has_key(endindex):
+ hits[readname][endindex].append([strand, editstring, chrom_start, chrom_end, read_start, chrom])
+ else:
+ hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]]
+ else:
+ hits[readname] = {}
+ hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]]
+
+ # find score : one end and the other end
+ hits_score = {}
+ readname = ''
+ score = ''
+ for num_score_file in range(len(all_score_file)):
+ score_file = all_score_file[num_score_file]
+ for i, line in enumerate(file(score_file)):
+ line = line.rstrip()
+ if not line or line.startswith('#'): continue
+
+ if line.startswith('>'):
+ if score:
+ if hits.has_key(readname):
+ if len(hits[readname]) == hit_per_read:
+ if hits_score.has_key(readname):
+ if hits_score[readname].has_key(endindex):
+ pass
+ else:
+ hits_score[readname][endindex] = score
+ else:
+ hits_score[readname] = {}
+ hits_score[readname][endindex] = score
+ score = ''
+ if hit_per_read == 1:
+ readname = line[1:]
+ endindex = '1'
+ else:
+ readname, endindex = line[1:].split('/')
+ else:
+ score = line
+ if score: # the last one
+ if hits.has_key(readname):
+ if len(hits[readname]) == hit_per_read:
+ if hits_score.has_key(readname):
+ if hits_score[readname].has_key(endindex):
+ pass
+ else:
+ hits_score[readname][endindex] = score
+ else:
+ hits_score[readname] = {}
+ hits_score[readname][endindex] = score
+
+ # mutation call to all mappings
+ for readkey in hits.keys():
+ if len(hits[readkey]) != hit_per_read: continue
+
+ matches = []
+ match_count = 0
+
+ if hit_per_read == 1:
+ matches = [ hits[readkey]['1'] ]
+ match_count = 1
+ else:
+ end1_data = hits[readkey]['1']
+ end2_data = hits[readkey]['2']
+
+ for i, end1_hit in enumerate(end1_data):
+ crin_strand = {'+': False, '-': False}
+ crin_insertSize = {'+': False, '-': False}
+
+ crin_strand[end1_hit[0]] = True
+ crin_insertSize[end1_hit[0]] = int(end1_hit[2])
+
+ for j, end2_hit in enumerate(end2_data):
+ crin_strand[end2_hit[0]] = True
+ crin_insertSize[end2_hit[0]] = int(end2_hit[2])
+
+ if end1_hit[-1] != end2_hit[-1] : continue
+
+ if crin_strand['+'] and crin_strand['-']:
+ if (crin_insertSize['-'] - crin_insertSize['+']) <= insertion_size:
+ matches.append([end1_hit, end2_hit])
+ match_count += 1
+
+ if match_count == 1:
+ for x, end_data in enumerate(matches[0]):
+
+ end_strand, end_editstring, end_chr_start, end_chr_end, end_read_start, end_chrom = end_data
+ end_read_start = int(end_read_start) - 1
+
+ if end_strand == '-':
+ refsegment = reverse_complement(refseq[end_chrom][end_chr_start:end_chr_end])
+ else:
+ refsegment = refseq[end_chrom][end_chr_start:end_chr_end]
+
+ match_len = 0
+ editindex = 0
+ gap_read = 0
+
+ while editindex < len(end_editstring):
+ editchr = end_editstring[editindex]
+ chrA = ''
+ chrB = ''
+ locIndex = []
+ if editchr.isdigit():
+ editcode = ''
+ while editchr.isdigit() and editindex < len(end_editstring):
+ editcode += editchr
+ editindex += 1
+ if editindex < len(end_editstring): editchr = end_editstring[editindex]
+ for baseIndex in range(int(editcode)):
+ chrA += refsegment[match_len+baseIndex]
+ chrB = chrA
+ match_len += int(editcode)
+ elif editchr == 'x':
+ # crossover: inserted between the appropriate two bases
+ # Two sequencing errors: 4x15x6 (25 matches with 2 crossovers)
+ # Treated as errors in the reads; Do nothing.
+ editindex += 1
+
+ elif editchr.isalpha():
+ editcode = editchr
+ editindex += 1
+ chrA = refsegment[match_len]
+ chrB = editcode
+ match_len += len(editcode)
+
+ elif editchr == '-':
+ editcode = editchr
+ editindex += 1
+ chrA = refsegment[match_len]
+ chrB = editcode
+ match_len += len(editcode)
+ gap_read += 1
+
+ elif editchr == '(':
+ editcode = ''
+ while editchr != ')' and editindex < len(end_editstring):
+ if editindex < len(end_editstring): editchr = end_editstring[editindex]
+ editcode += editchr
+ editindex += 1
+ editcode = editcode[1:-1]
+ chrA = '-'*len(editcode)
+ chrB = editcode
+
+ else:
+ print 'Warning! Unknown symbols', editchr
+
+ if end_strand == '-':
+ chrA = reverse_complement(chrA)
+ chrB = reverse_complement(chrB)
+
+ pos_line = ''
+ rev_line = ''
+
+ for mappingIndex in range(len(chrA)):
+ # reference
+ chrAx = chrA[mappingIndex]
+ # read
+ chrBx = chrB[mappingIndex]
+
+ if chrAx and chrBx and chrBx.upper() != 'N':
+ if end_strand == '+':
+ chrom_loc = end_chr_start+match_len-len(chrA)+mappingIndex
+ read_loc = end_read_start+match_len-len(chrA)+mappingIndex-gap_read
+ if chrAx == '-': chrom_loc -= 1
+
+ if chrBx == '-':
+ scoreBx = '-1'
+ else:
+ scoreBx = hits_score[readkey][str(x+1)].split()[read_loc]
+
+ # 1-based on chrom_loc and read_loc
+ pos_line = pos_line + '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) + '\n'
+ else:
+ chrom_loc = end_chr_end-match_len+mappingIndex
+ read_loc = end_read_start+match_len-1-mappingIndex-gap_read
+ if chrAx == '-': chrom_loc -= 1
+
+ if chrBx == '-':
+ scoreBx = '-1'
+ else:
+ scoreBx = hits_score[readkey][str(x+1)].split()[read_loc]
+
+ # 1-based on chrom_loc and read_loc
+ rev_line = '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) +'\n' + rev_line
+
+ if chrom_cov.has_key(end_chrom):
+ if chrom_cov[end_chrom].has_key(chrom_loc):
+ chrom_cov[end_chrom][chrom_loc] += 1
+ else:
+ chrom_cov[end_chrom][chrom_loc] = 1
+ else:
+ chrom_cov[end_chrom] = {}
+ chrom_cov[end_chrom][chrom_loc] = 1
+
+ if pos_line: temp_table.write('%s\n' %(pos_line.rstrip('\r\n')))
+ if rev_line: temp_table.write('%s\n' %(rev_line.rstrip('\r\n')))
+
+ temp_table.close()
+
+ # chrom-wide coverage
+ for i, line in enumerate(open(temp_table_name)):
+ line = line.rstrip()
+ if not line or line.startswith('#'): continue
+
+ fields = line.split()
+ chrom = fields[0]
+ eachBp = int(fields[1])
+ readname = fields[2]
+
+ if hit_per_read == 1:
+ fields[2] = readname.split('/')[0]
+
+ if chrom_cov[chrom].has_key(eachBp):
+ outfile.write('%s\t%d\n' %('\t'.join(fields), chrom_cov[chrom][eachBp]))
+ else:
+ outfile.write('%s\t%d\n' %('\t'.join(fields), 0))
+
+ outfile.close()
+
+ if os.path.exists(temp_table_name): os.remove(temp_table_name)
+
+ return True
+
def convert_fastqsolexa_to_fasta_qual(infile_name, query_fasta, query_qual):
    """Split a fastqsolexa file into a FASTA sequence file and a quality file.

    Each 4-line fastq record (@title / bases / +title / scores) is written as
    '>title' plus the bases to *query_fasta*, and '>title' plus
    space-separated integer scores to *query_qual*.  The quality line may
    already contain integers (copied through unchanged) or one ASCII
    character per base, decoded with the Solexa offset of 64.

    Returns True on success; malformed input aborts via stop_err().
    """
    outfile_seq = open(query_fasta, 'w')
    outfile_score = open(query_qual, 'w')

    seq_title_startswith = ''
    qual_title_startswith = ''

    # Solexa/Illumina-1.x ASCII quality encoding offset.
    default_coding_value = 64
    fastq_block_lines = 0

    for i, line in enumerate(open(infile_name)):
        line = line.rstrip()
        if not line or line.startswith('#'):
            continue

        # Position within the current 4-line fastq record: 1, 2, 3, 0.
        fastq_block_lines = (fastq_block_lines + 1) % 4
        line_startswith = line[0:1]

        if fastq_block_lines == 1:
            # First line is @title_of_seq; every record must share the same
            # leading character (normally '@').
            if not seq_title_startswith:
                seq_title_startswith = line_startswith
            if line_startswith != seq_title_startswith:
                outfile_seq.close()
                outfile_score.close()
                stop_err('Invalid fastqsolexa format at line %d: %s.' % (i + 1, line))
            read_title = line[1:]
            outfile_seq.write('>%s\n' % line[1:])

        elif fastq_block_lines == 2:
            # Second line is the nucleotides.
            read_length = len(line)
            outfile_seq.write('%s\n' % line)

        elif fastq_block_lines == 3:
            # Third line is +title_of_qualityscore (the title may be omitted).
            if not qual_title_startswith:
                qual_title_startswith = line_startswith
            if line_startswith != qual_title_startswith:
                outfile_seq.close()
                outfile_score.close()
                stop_err('Invalid fastqsolexa format at line %d: %s.' % (i + 1, line))
            quality_title = line[1:]
            if quality_title and read_title != quality_title:
                outfile_seq.close()
                outfile_score.close()
                stop_err('Invalid fastqsolexa format at line %d: sequence title "%s" differs from score title "%s".' % (i + 1, read_title, quality_title))
            if not quality_title:
                # Bare '+': reuse the sequence title.
                outfile_score.write('>%s\n' % read_title)
            else:
                outfile_score.write('>%s\n' % line[1:])

        else:
            # Fourth line holds the quality scores: either whitespace-separated
            # integers or one ASCII character per base.  Peek at the first
            # token to decide which.
            try:
                int(line.split()[0])
                fastq_integer = True
            except ValueError:
                fastq_integer = False

            if fastq_integer:
                # Already numeric: copy through unchanged.
                qual = line
            else:
                # ASCII-encoded scores.
                quality_score_length = len(line)
                if quality_score_length == read_length + 1:
                    # One extra leading character encodes the offset itself.
                    qual_score_startswith = ord(line[0:1])
                    line = line[1:]
                elif quality_score_length == read_length:
                    qual_score_startswith = default_coding_value
                else:
                    outfile_seq.close()
                    outfile_score.close()
                    stop_err('Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % (i + 1, quality_score_length, read_length))
                # Single pass instead of quadratic string concatenation; the
                # trailing space matches the original output format.
                qual = ''.join('%d ' % (ord(char) - qual_score_startswith) for char in line)

            outfile_score.write('%s\n' % qual)

    outfile_seq.close()
    outfile_score.close()

    return True
+
def __main__():
    """Drive the SHRiMP letter-space pipeline.

    Converts fastqsolexa input to FASTA + quality files, runs rmapper-ls
    through the shell, then post-processes the hits with
    generate_sub_table().

    Positional argv layout: [1] 'single' or 'paired', [2] reference fasta,
    [3] SHRiMP raw output path, [4] table output path; then either the 15
    explicit SHRiMP parameters followed by the read file(s), or just the
    read file(s) when defaults are used (argv length distinguishes the two).
    """

    # I/O
    type_of_reads = sys.argv[1]   # single or paired
    input_target = sys.argv[2]    # fasta
    shrimp_outfile = sys.argv[3]  # shrimp output
    table_outfile = sys.argv[4]   # table output

    # SHRiMP parameters: total = 15
    # TODO: put threshold on each of these parameters
    # argv length 21 = single-end with full parameters, 22 = paired-end
    # with full parameters; anything else falls through to defaults.
    if len(sys.argv) == 21 or len(sys.argv) == 22:
        spaced_seed = sys.argv[5]
        seed_matches_per_window = sys.argv[6]
        seed_hit_taboo_length = sys.argv[7]
        seed_generation_taboo_length = sys.argv[8]
        seed_window_length = sys.argv[9]
        max_hits_per_read = sys.argv[10]
        max_read_length = sys.argv[11]
        kmer = sys.argv[12]
        sw_match_value = sys.argv[13]
        sw_mismatch_value = sys.argv[14]
        sw_gap_open_ref = sys.argv[15]
        sw_gap_open_query = sys.argv[16]
        sw_gap_ext_ref = sys.argv[17]
        sw_gap_ext_query = sys.argv[18]
        sw_hit_threshold = sys.argv[19]

        # Single-end parameters
        if type_of_reads == 'single':
            input_query = sys.argv[20]  # single-end
            hit_per_read = 1
            # NOTE(review): only the .name of the NamedTemporaryFile is kept;
            # the file object itself is discarded, which is race-prone.
            query_fasta = tempfile.NamedTemporaryFile().name
            query_qual = tempfile.NamedTemporaryFile().name
        else:  # Paired-end parameters
            input_query_end1 = sys.argv[20]  # paired-end
            input_query_end2 = sys.argv[21]
            hit_per_read = 2
            query_fasta_end1 = tempfile.NamedTemporaryFile().name
            query_fasta_end2 = tempfile.NamedTemporaryFile().name
            query_qual_end1 = tempfile.NamedTemporaryFile().name
            query_qual_end2 = tempfile.NamedTemporaryFile().name
    else:
        # Default SHRiMP parameter values (mirrors the tool XML defaults).
        spaced_seed = '111111011111'
        seed_matches_per_window = '2'
        seed_hit_taboo_length = '4'
        seed_generation_taboo_length = '0'
        seed_window_length = '115.0'
        max_hits_per_read = '100'
        max_read_length = '1000'
        kmer = '-1'
        sw_match_value = '100'
        sw_mismatch_value = '-150'
        sw_gap_open_ref = '-400'
        sw_gap_open_query = '-400'
        sw_gap_ext_ref = '-70'
        sw_gap_ext_query = '-70'
        sw_hit_threshold = '68.0'

        # Single-end parameters
        if type_of_reads == 'single':
            input_query = sys.argv[5]  # single-end
            hit_per_read = 1
            query_fasta = tempfile.NamedTemporaryFile().name
            query_qual = tempfile.NamedTemporaryFile().name
        else:  # Paired-end parameters
            input_query_end1 = sys.argv[5]  # paired-end
            input_query_end2 = sys.argv[6]
            hit_per_read = 2
            query_fasta_end1 = tempfile.NamedTemporaryFile().name
            query_fasta_end2 = tempfile.NamedTemporaryFile().name
            query_qual_end1 = tempfile.NamedTemporaryFile().name
            query_qual_end2 = tempfile.NamedTemporaryFile().name


    # temp file for shrimp log file
    shrimp_log = tempfile.NamedTemporaryFile().name

    # convert fastq to fasta and quality score files
    if type_of_reads == 'single':
        return_value = convert_fastqsolexa_to_fasta_qual(input_query, query_fasta, query_qual)
    else:
        return_value = convert_fastqsolexa_to_fasta_qual(input_query_end1, query_fasta_end1, query_qual_end1)
        return_value = convert_fastqsolexa_to_fasta_qual(input_query_end2, query_fasta_end2, query_qual_end2)

    # SHRiMP command: built as a shell string with redirections, so it must
    # go through os.system (a shell).
    if type_of_reads == 'single':
        command = ' '.join(['rmapper-ls', '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target, '>', shrimp_outfile, '2>', shrimp_log])

        # NOTE(review): os.system does not raise when the command exits
        # non-zero, so this except only catches failures to spawn a shell.
        try:
            os.system(command)
        except Exception, e:
            if os.path.exists(query_fasta): os.remove(query_fasta)
            if os.path.exists(query_qual): os.remove(query_qual)
            stop_err(str(e))

    else:
        # End 2 appends ('>>'/'2>>') to the files end 1 created ('>'/'2>').
        command_end1 = ' '.join(['rmapper-ls', '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target, '>', shrimp_outfile, '2>', shrimp_log])
        command_end2 = ' '.join(['rmapper-ls', '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target, '>>', shrimp_outfile, '2>>', shrimp_log])

        try:
            os.system(command_end1)
            os.system(command_end2)
        except Exception, e:
            if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1)
            if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2)
            if os.path.exists(query_qual_end1): os.remove(query_qual_end1)
            if os.path.exists(query_qual_end2): os.remove(query_qual_end2)
            stop_err(str(e))

    # convert to table
    if type_of_reads == 'single':
        return_value = generate_sub_table(shrimp_outfile, input_target, query_qual, table_outfile, hit_per_read)
    else:
        # Paired-end quality files are passed as one '&'-joined string;
        # presumably generate_sub_table splits on '&' -- confirm upstream.
        return_value = generate_sub_table(shrimp_outfile, input_target, query_qual_end1+'&'+query_qual_end2, table_outfile, hit_per_read)

    # remove temp. files
    if type_of_reads == 'single':
        if os.path.exists(query_fasta): os.remove(query_fasta)
        if os.path.exists(query_qual): os.remove(query_qual)
    else:
        if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1)
        if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2)
        if os.path.exists(query_qual_end1): os.remove(query_qual_end1)
        if os.path.exists(query_qual_end2): os.remove(query_qual_end2)

    if os.path.exists(shrimp_log): os.remove(shrimp_log)

if __name__ == '__main__': __main__()
+
diff -r 26825f08d362 -r 842f1883cf53 tools/metag_tools/shrimp_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/shrimp_wrapper.xml Mon Sep 15 15:04:41 2008 -0400
@@ -0,0 +1,196 @@
+<tool id="shrimp_wrapper" name="SHRiMP" version="1.0.0">
+ <description>SHort Read Mapping Package</description>
+ <command interpreter="python">
+ #if ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $input_query
+ #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 ${type_of_reads.input1} ${type_of_reads.input2}
+ #elif ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="full"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold $input_query
+ #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="full"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold ${type_of_reads.input1} ${type_of_reads.input2}
+ #end if
+ </command>
+ <inputs>
+ <page>
+ <param name="input_target" type="data" format="fasta" label="Reference sequence" />
+ <conditional name="type_of_reads">
+ <param name="single_or_paired" type="select" label="Single- or Paired-ends">
+ <option value="single">Single-end</option>
+ <option value="paired">Paired-end</option>
+ </param>
+ <when value="single">
+ <param name="input_query" type="data" format="fastqsolexa" label="Sequence file" />
+ </when>
+ <when value="paired">
+ <param name="input1" type="data" format="fastqsolexa" label="One end" />
+ <param name="input2" type="data" format="fastqsolexa" label="The other end" />
+ </when>
+ </conditional>
+ <conditional name="param">
+ <param name="skip_or_full" type="select" label="SHRiMP parameter selection">
+ <option value="skip">Default setting</option>
+ <option value="full">Full list</option>
+ </param>
+ <when value="skip" />
+ <when value="full">
+ <param name="spaced_seed" type="text" size="30" value="111111011111" label="Spaced Seed" />
+ <param name="seed_matches_per_window" type="integer" size="5" value="2" label="Seed Matches per Window" />
+ <param name="seed_hit_taboo_length" type="integer" size="5" value="4" label="Seed Hit Taboo Length" />
+ <param name="seed_generation_taboo_length" type="integer" size="5" value="0" label="Seed Generation Taboo Length" />
+ <param name="seed_window_length" type="float" size="10" value="115.0" label="Seed Window Length" help="in percentage"/>
+ <param name="max_hits_per_read" type="integer" size="10" value="100" label="Maximum Hits per Read" />
+ <param name="max_read_length" type="integer" size="10" value="1000" label="Maximum Read Length" />
+ <param name="kmer" type="integer" size="10" value="-1" label="Kmer Std. Deviation Limit" help="-1 as None"/>
+ <param name="sw_match_value" type="integer" size="10" value="100" label="S-W Match Value" />
+ <param name="sw_mismatch_value" type="integer" size="10" value="-150" label="S-W Mismatch Value" />
+ <param name="sw_gap_open_ref" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Reference)" />
+ <param name="sw_gap_open_query" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Query)" />
+ <param name="sw_gap_ext_ref" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Reference)" />
+ <param name="sw_gap_ext_query" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Query)" />
+ <param name="sw_hit_threshold" type="float" size="10" value="68.0" label="S-W Hit Threshold" help="in percentage"/>
+ </when>
+ </conditional>
+ </page>
+ </inputs>
+ <outputs>
+ <data name="output1" format="tabular"/>
+ <data name="output2" format="tabular"/>
+ </outputs>
+ <requirements>
+ <requirement type="binary">SHRiMP_rmapper</requirement>
+ </requirements>
+ <tests>
+ <test>
+ <param name="single_or_paired" value="single" />
+ <param name="skip_or_full" value="skip" />
+ <param name="input_target" value="shrimp_phix_anc.fa" ftype="fasta" />
+ <param name="input_query" value="shrimp_wrapper_test1.fastq" ftype="fastqsolexa"/>
+ <output name="output1" file="shrimp_wrapper_test1.out1" />
+ </test>
+ <!--
+ <test>
+ <param name="input1" value="shrimp_wrapper_test2_end1.fastq" ftype="fastqsolexa" />
+ <param name="input2" value="shrimp_wrapper_test2_end2.fastq" ftype="fastqsolexa" />
+ <param name="single_or_paired" value="paired" />
+ <param name="skip_or_full" value="skip" />
+ <param name="input_target" value="shrimp_eca_chrMT.fa" ftype="fasta" />
+ <output name="output1" file="shrimp_wrapper_test2.out1" />
+ </test>
+ <test>
+ <param name="single_or_paired" value="single" />
+ <param name="skip_or_full" value="full" />
+ <param name="input_target" value="shrimp_phix_anc.fa" ftype="fasta" />
+ <param name="input_query" value="shrimp_wrapper_test1.fastq" ftype="fastqsolexa"/>
+ <param name="spaced_seed" value="111111011111" />
+ <param name="seed_matches_per_window" value="2" />
+ <param name="seed_hit_taboo_length" value="4" />
+ <param name="seed_generation_taboo_length" value="0" />
+ <param name="seed_window_length" value="115.0" />
+ <param name="max_hits_per_read" value="100" />
+ <param name="max_read_length" value="1000" />
+ <param name="kmer" value="-1" />
+ <param name="sw_match_value" value="100" />
+ <param name="sw_mismatch_value" value="-150" />
+ <param name="sw_gap_open_ref" value="-400" />
+ <param name="sw_gap_open_query" value="-400" />
+ <param name="sw_gap_ext_ref" value="-70" />
+ <param name="sw_gap_ext_query" value="-70" />
+ <param name="sw_hit_threshold" value="68.0" />
+ <output name="output1" file="shrimp_wrapper_test1.out1" />
+ </test>
+ <test>
+ <param name="single_or_paired" value="paired" />
+ <param name="skip_or_full" value="full" />
+ <param name="input_target" value="shrimp_eca_chrMT.fa" ftype="fasta" />
+ <param name="spaced_seed" value="111111011111" />
+ <param name="seed_matches_per_window" value="2" />
+ <param name="seed_hit_taboo_length" value="4" />
+ <param name="seed_generation_taboo_length" value="0" />
+ <param name="seed_window_length" value="115.0" />
+ <param name="max_hits_per_read" value="100" />
+ <param name="max_read_length" value="1000" />
+ <param name="kmer" value="-1" />
+ <param name="sw_match_value" value="100" />
+ <param name="sw_mismatch_value" value="-150" />
+ <param name="sw_gap_open_ref" value="-400" />
+ <param name="sw_gap_open_query" value="-400" />
+ <param name="sw_gap_ext_ref" value="-70" />
+ <param name="sw_gap_ext_query" value="-70" />
+ <param name="sw_hit_threshold" value="68.0" />
+ <param name="input1" value="shrimp_wrapper_test2_end1.fastq" ftype="fastqsolexa"/>
+ <param name="input2" value="shrimp_wrapper_test2_end2.fastq" ftype="fastqsolexa"/>
+ <output name="output1" file="shrimp_wrapper_test2.out1" />
+ </test>
+ -->
+ </tests>
+<help>
+
+.. class:: warningmark
+
+Only nucleotide sequences as query.
+
+-----
+
+**What it does**
+
+Run SHRiMP on letter-space reads.
+
+-----
+
+**Example**
+
+- Input a multiple-fastq file like the following::
+
+ @seq1
+ TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTT
+ +seq1
+ hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+
+- Use default settings (for detail explanations, please see **Parameters** section)
+
+- Search against your own uploaded file, result will be in the following format::
+
+ +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+
+ | id | chrom | strand | t.start | t.end | q.start | q.end | length | score | editstring |
+ +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+
+ | >seq1 | chrMT | + | 14712 | 14747 | 1 | 36 | 36 | 3350 | 24T11 |
+ +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+
+
+- The result will be formatted Table::
+
+ +-------+---------+---------+----------+---------+----------+---------+----------+
+ | chrom | ref_loc | read_id | read_loc | ref_nuc | read_nuc | quality | coverage |
+ +-------+---------+---------+----------+---------+----------+---------+----------+
+ | chrMT | 14711 | seq1 | 0 | T | T | 40 | 1 |
+ | chrMT | 14712 | seq1 | 1 | A | A | 40 | 1 |
+ | chrMT | 14713 | seq1 | 2 | C | C | 40 | 1 |
+ +-------+---------+---------+----------+---------+----------+---------+----------+
+
+-----
+
+**Parameters**
+
+Parameter list with default value settings::
+
+ -s Spaced Seed (default: 111111011111)
+ -n Seed Matches per Window (default: 2)
+ -t Seed Hit Taboo Length (default: 4)
+ -9 Seed Generation Taboo Length (default: 0)
+ -w Seed Window Length (default: 115.00%)
+ -o Maximum Hits per Read (default: 100)
+ -r Maximum Read Length (default: 1000)
+ -d Kmer Std. Deviation Limit (default: -1 [None])
+
+ -m S-W Match Value (default: 100)
+ -i S-W Mismatch Value (default: -150)
+ -g S-W Gap Open Penalty (Reference) (default: -400)
+ -q S-W Gap Open Penalty (Query) (default: -400)
+ -e S-W Gap Extend Penalty (Reference) (default: -70)
+ -f S-W Gap Extend Penalty (Query) (default: -70)
+ -h S-W Hit Threshold (default: 68.00%)
+
+-----
+
+**Reference**
+
+ **SHRiMP**: Stephen M. Rumble, Michael Brudno, Phil Lacroute, Vladimir Yanovsky, Marc Fiume, Adrian Dalca. shrimp at cs dot toronto dot edu.
+
+</help>
+</tool>
1
0

[hg] galaxy 1509: Rewrote "Compare two queries" tool in Python.
by greg＠scofield.bx.psu.edu 22 Sep '08
by greg＠scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/eb941905fd70
changeset: 1509:eb941905fd70
user: guru
date: Tue Sep 16 14:09:16 2008 -0400
description:
Rewrote "Compare two queries" tool in Python.
2 file(s) affected in this change:
tools/filters/compare.xml
tools/filters/joinWrapper.py
diffs (68 lines):
diff -r ec547440ec97 -r eb941905fd70 tools/filters/compare.xml
--- a/tools/filters/compare.xml Tue Sep 16 13:25:42 2008 -0400
+++ b/tools/filters/compare.xml Tue Sep 16 14:09:16 2008 -0400
@@ -1,6 +1,6 @@
<tool id="comp1" name="Compare two Queries">
<description>to find common or distinct rows</description>
- <command interpreter="perl">joinWrapper.pl $input1 $input2 $field1 $field2 $mode "Y" $out_file1</command>
+ <command interpreter="python">joinWrapper.py $input1 $input2 $field1 $field2 $mode $out_file1</command>
<inputs>
<param format="tabular" name="input1" type="data" label="Compare"/>
<param name="field1" label="Using column" type="data_column" data_ref="input1" />
diff -r ec547440ec97 -r eb941905fd70 tools/filters/joinWrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/joinWrapper.py Tue Sep 16 14:09:16 2008 -0400
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+"""
+This tool provides the UNIX "join" functionality.
+"""
+import sys, os, tempfile
+
def stop_err(msg):
    """Report a fatal error on stderr and terminate the process."""
    sys.stderr.write(msg)
    raise SystemExit
+
def main():
    """Compare/join two tabular files on given columns via UNIX sort+join.

    argv: [1] file1, [2] file2, [3] 1-based join column in file1,
    [4] 1-based join column in file2, [5] mode ('V' = rows of file1 with
    no match in file2, anything else = matched rows), [6] output path.
    """
    infile1 = sys.argv[1]
    infile2 = sys.argv[2]
    field1 = int(sys.argv[3])
    field2 = int(sys.argv[4])
    mode =sys.argv[5]
    outfile = sys.argv[6]

    # Temp files for the sorted copies; the objects are kept alive so the
    # underlying files persist until this function returns.
    tmpfile1 = tempfile.NamedTemporaryFile()
    tmpfile2 = tempfile.NamedTemporaryFile()

    try:
        #Sort the two files based on specified fields
        # NOTE(review): os.system does not raise on a non-zero exit status,
        # so this except only catches failure to spawn a shell.
        os.system("sort -k %d -o %s %s" %(field1, tmpfile1.name, infile1))
        os.system("sort -k %d -o %s %s" %(field2, tmpfile2.name, infile2))
    except Exception, exc:
        stop_err( 'Initialization error -> %s' %str(exc) )

    # Build join's -o output format "1.1,1.2,...,1.N" from the column count
    # of the first non-blank line of sorted file1 (output = all file1 columns).
    option = ""
    for line in file(tmpfile1.name):
        line = line.strip()
        if line:
            elems = line.split('\t')
            for j in range(1,len(elems)+1):
                if j == 1:
                    option = "1.1"
                else:
                    option = option + ",1." + str(j)
            break

    # join emits space-separated fields; tr restores tabs.
    # NOTE(review): any field that itself contains a space gets split by
    # this tr -- confirm inputs are space-free tabular data.
    if mode == "V":
        cmdline = 'join -v 1 -o %s -1 %d -2 %d %s %s | tr " " "\t" > %s' %(option, field1, field2, tmpfile1.name, tmpfile2.name, outfile)
    else:
        cmdline = 'join -o %s -1 %d -2 %d %s %s | tr " " "\t" > %s' %(option, field1, field2, tmpfile1.name, tmpfile2.name, outfile)

    try:
        os.system(cmdline)
    except Exception, exj:
        stop_err('Error joining the two datasets -> %s' %str(exj))

if __name__ == "__main__":
    main()
1
0