2 new commits in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/42cc410f02da/ Changeset: 42cc410f02da User: davebgx Date: 2014-07-28 17:58:52 Summary: Migrate tools from the distribution to the tool shed. Affected #: 33 files diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 lib/tool_shed/galaxy_install/migrate/versions/0012_tools.py --- /dev/null +++ b/lib/tool_shed/galaxy_install/migrate/versions/0012_tools.py @@ -0,0 +1,48 @@ +""" +The following tools have been eliminated from the distribution: + +1: Compute an expression on every row +2: Correlation for numeric columns +3: Count GFF Features +4: Filter on ambiguities in polymorphism datasets +5: Generate A Matrix for using PC and LDA +6: Histogram of a numeric column +7: Perform Linear Discriminant Analysis +8: Maximal Information-based Nonparametric Exploration +9: Pearson and apos Correlation between any two numeric columns +10: Convert from pgSnp to gd_snp +11: Draw ROC plot on "Perform LDA" output +12: Scatterplot of two numeric columns +13: snpFreq significant SNPs in case-control data +14: Build custom track for UCSC genome browser +15: VCF to pgSnp + +The tools are now available in the repositories respectively: + +1: column_maker +2: correlation +3: count_gff_features +4: dna_filtering +5: generate_pc_lda_matrix +6: histogram +7: lda_analysis +8: mine +9: pearson_correlation +10: pgsnp2gd_snp +11: plot_from_lda +12: scatterplot +13: snpfreq +14: ucsc_custom_track +15: vcf2pgsnp + +from the main Galaxy tool shed at http://toolshed.g2.bx.psu.edu +and will be installed into your local Galaxy instance at the +location discussed above by running the following command. + +""" + +def upgrade( migrate_engine ): + print __doc__ + +def downgrade( migrate_engine ): + pass diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 scripts/migrate_tools/0012_tools.sh --- /dev/null +++ b/scripts/migrate_tools/0012_tools.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cd `dirname $0`/../.. +python ./scripts/migrate_tools/migrate_tools.py 0012_tools.xml $@ diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 scripts/migrate_tools/0012_tools.xml --- /dev/null +++ b/scripts/migrate_tools/0012_tools.xml @@ -0,0 +1,48 @@ +<?xml version="1.0"?> +<toolshed name="toolshed.g2.bx.psu.edu"> + <repository changeset_revision="08a01b2ce4cd" owner="devteam" name="column_maker" description="Compute an expression on every row"> + <tool file="stats/column_maker.xml" id="Add_a_column1" version="1.1.0" /> + </repository> + <repository changeset_revision="24e01abf9e34" owner="devteam" name="correlation" description="Correlation for numeric columns"> + <tool file="stats/cor.xml" id="cor2" version="1.0.0" /> + </repository> + <repository changeset_revision="fabda887a71f" owner="devteam" name="count_gff_features" description="Count GFF Features"> + <tool file="stats/count_gff_features.xml" id="count_gff_features" version="0.1" /> + </repository> + <repository changeset_revision="a6f0d355b05f" owner="devteam" name="dna_filtering" description="Filter on ambiguities in polymorphism datasets"> + <tool file="stats/dna_filtering.xml" id="dna_filter" version="1.0.0" /> + </repository> + <repository changeset_revision="04cdbd00dcec" owner="devteam" name="generate_pc_lda_matrix" description="Generate A Matrix for using PC and LDA"> + <tool file="stats/generate_matrix_for_pca_lda.xml" id="generate_matrix_for_pca_and_lda1" version="1.0.0" /> + </repository> + <repository changeset_revision="6ff47de059a0" owner="devteam" name="histogram" description="Histogram of a numeric column"> + <tool file="plotting/histogram2.xml" id="histogram_rpy" version="1.0.3" /> + </repository> + <repository changeset_revision="f38763b52f33" owner="devteam" name="lda_analysis" description="Perform Linear Discriminant Analysis"> + <tool file="stats/lda_analy.xml" id="lda_analy1" version="1.0.1" /> + </repository> + <repository changeset_revision="783d91de9e6d" owner="devteam" name="mine" description="Maximal Information-based Nonparametric Exploration"> + <tool file="stats/MINE.xml" id="maximal_information_based_nonparametric_exploration" version="0.0.1" /> + </repository> + <repository changeset_revision="5ebbb889236a" owner="devteam" name="pearson_correlation" description="Pearson and apos Correlation between any two numeric columns"> + <tool file="stats/correlation.xml" id="Pearson_and_apos_Correlation1" version="1.0.0" /> + </repository> + <repository changeset_revision="d281062566f9" owner="devteam" name="pgsnp2gd_snp" description="Convert from pgSnp to gd_snp"> + <tool file="phenotype_association/pgSnp2gd_snp.xml" id="pgSnp2gd_snp" version="1.0.0" /> + </repository> + <repository changeset_revision="c5ab37076128" owner="devteam" name="plot_from_lda" description="Draw ROC plot on "Perform LDA" output"> + <tool file="stats/plot_from_lda.xml" id="plot_for_lda_output1" version="1.0.1" /> + </repository> + <repository changeset_revision="c12b0759203b" owner="devteam" name="scatterplot" description="Scatterplot of two numeric columns"> + <tool file="plotting/scatterplot.xml" id="scatterplot_rpy" version="1.0.0" /> + </repository> + <repository changeset_revision="72ea0d13dd66" owner="devteam" name="snpfreq" description="snpFreq significant SNPs in case-control data"> + <tool file="phenotype_association/snpFreq.xml" id="hgv_snpFreq" version="1.0.1" /> + </repository> + <repository changeset_revision="618e56c3109b" owner="devteam" name="ucsc_custom_track" description="Build custom track for UCSC genome browser"> + <tool file="visualization/build_ucsc_custom_track.xml" id="build_ucsc_custom_track_1" version="1.0.0" /> + </repository> + <repository changeset_revision="5fca46616675" owner="devteam" name="vcf2pgsnp" description="VCF to pgSnp"> + <tool file="phenotype_association/vcf2pgSnp.xml" id="vcf2pgSnp" version="1.0.0" /> + </repository> +</toolshed> \ No newline at end of file diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tool_conf.xml.sample --- a/tool_conf.xml.sample +++ b/tool_conf.xml.sample @@ -36,7 +36,6 @@ </section><section id="textutil" name="Text Manipulation"><tool file="filters/fixedValueColumn.xml" /> - <tool file="stats/column_maker.xml" /><tool file="filters/catWrapper.xml" /><tool file="filters/cutWrapper.xml" /><tool file="filters/mergeCols.xml" /> @@ -52,7 +51,6 @@ <tool file="filters/trimmer.xml" /><tool file="filters/wc_gnu.xml" /><tool file="filters/secure_hash_message_digest.xml" /> - <tool file="stats/dna_filtering.xml" /></section><section id="filter" name="Filter and Sort"><tool file="stats/filtering.xml" /> @@ -113,22 +111,11 @@ <section id="stats" name="Statistics"><tool file="stats/gsummary.xml" /><tool file="filters/uniq.xml" /> - <tool file="stats/cor.xml" /> - <tool file="stats/generate_matrix_for_pca_lda.xml" /> - <tool file="stats/lda_analy.xml" /> - <tool file="stats/plot_from_lda.xml" /> - <tool file="stats/MINE.xml" /> - - <label id="gff" text="GFF" /> - <tool file="stats/count_gff_features.xml" /></section><section id="plots" name="Graph/Display Data"> - <tool file="plotting/histogram2.xml" /> - <tool file="plotting/scatterplot.xml" /><tool file="plotting/bar_chart.xml" /><tool file="plotting/boxplot.xml" /><tool file="visualization/LAJ.xml" /> - <tool file="visualization/build_ucsc_custom_track.xml" /><tool file="maf/vcf_to_maf_customtrack.xml" /><tool file="mutation/visualize.xml" /></section> @@ -189,13 +176,11 @@ <tool file="phenotype_association/sift.xml" /><tool file="phenotype_association/linkToGProfile.xml" /><tool file="phenotype_association/linkToDavid.xml" /> - <tool file="phenotype_association/snpFreq.xml" /><tool file="phenotype_association/ldtools.xml" /><tool file="phenotype_association/pass.xml" /><tool file="phenotype_association/gpass.xml" /><tool file="phenotype_association/beam.xml" /><tool file="phenotype_association/lps.xml" /><tool file="phenotype_association/master2pg.xml" /> - <tool file="phenotype_association/vcf2pgSnp.xml" /></section></toolbox> diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/phenotype_association/pgSnp2gd_snp.pl --- a/tools/phenotype_association/pgSnp2gd_snp.pl +++ /dev/null @@ -1,208 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -#convert from pgSnp file to snp table (Webb format?) - -#snp table format: -#1. chr -#2. position (0 based) -#3. ref allele -#4. second allele -#5. overall quality -#foreach individual (6-9, 10-13, ...) -#a. count of allele in 3 -#b. count of allele in 4 -#c. genotype call (-1, or count of ref allele) -#d. quality of genotype call (quality of non-ref allele from masterVar) - -if (!@ARGV) { - print "usage: pgSnp2gd_snp.pl file.pgSnp[.gz|.bz2] [-tab=snpTable.txt -addColsOnly -build=hg19 -name=na -ref=#1based -chr=#1based ] > newSnpTable.txt\n"; - exit; -} - -my $in = shift @ARGV; -my $tab; -my $tabOnly; -my $build; -my $name; -my $ref; -my $binChr = 1; #position of chrom column, indicates if bin is added -foreach (@ARGV) { - if (/-tab=(.*)/) { $tab = $1; } - elsif (/-addColsOnly/) { $tabOnly = 1; } - elsif (/-build=(.*)/) { $build = $1; } - elsif (/-name=(.*)/) { $name = $1; } - elsif (/-ref=(\d+)/) { $ref = $1 - 1; } #go to index - elsif (/-chr=(\d+)/) { $binChr = $1; } -} - -if ($binChr == 2 && $ref) { $ref--; } #shift over by 1, we will delete bin -if ((!$tab or !$tabOnly) && !$ref) { - print "Error the reference allele must be in a column in the file if not just adding to a previous SNP table.\n"; - exit; -} - -#WARNING loads snp table in memory, this could take > 1G ram -my %old; -my $colcnt = 0; -my @head; -if ($tab) { - open(FH, $tab) or die "Couldn't open $tab, $!\n"; - while (<FH>) { - chomp; - if (/^#/) { push(@head, $_); next; } - my @f = split(/\t/); - $old{"$f[0]:$f[1]"} = join("\t", @f); - $colcnt = scalar @f; - } - close FH or die "Couldn't close $tab, $!\n"; -} - -if ($in =~ /.gz$/) { - open(FH, "zcat $in |") or die "Couldn't open $in, $!\n"; -}elsif ($in =~ /.bz2$/) { - open(FH, "bzcat $in |") or die "Couldn't open $in, $!\n"; -}else { - open(FH, $in) or die "Couldn't open $in, $!\n"; -} -prepHeader(); -if (@head) { #keep old header, add new? - print join("\n", @head), "\n"; -} -while (<FH>) { - chomp; - if (/^#/) { next; } - if (/^\s*$/) { next; } - my @f = split(/\t/); - if ($binChr == 2) { #must have a bin column prepended on the beginning - shift @f; #delete it - } - if (!$f[3]) { next; } #WHAT? most likely still zipped? - if ($f[4] > 2) { next; } #can only do cases of 2 alleles - if ($f[2] == $f[1] or $f[2] - $f[1] != 1) { next; } #no indels - if ($f[3] =~ /-/) { next; } #no indels - #if creating a new table need the reference allele in a column - if (%old && $old{"$f[0]:$f[1]"}) { - my @o = split(/\t/, $old{"$f[0]:$f[1]"}); - my $freq = 0; - my $freq2 = 0; - my $sc; - my $g = 1; #genotype == ref allele count - if ($f[4] == 1) { #should be homozygous - if ($f[3] eq $o[2]) { $g = 2; $freq = $f[5]; } - elsif ($f[3] eq $o[3]) { $g = 0; $freq2 = $f[5]; } - else { next; } #doesn't match either allele, skip - $sc = $f[6]; - }else { - my $a = 0; #index of a alleles, freq, scores - my $b = 1; #same for b - my @all = split(/\//, $f[3]); - if ($o[2] ne $all[0] && $o[2] ne $all[1]) { next; } #must match one - if ($o[3] ne $all[0] && $o[3] ne $all[1]) { next; } - if ($o[2] eq $all[1]) { #switch indexes - $a = 1; - $b = 0; - } - my @fr = split(/,/, $f[5]); - $freq = $fr[$a]; - $freq2 = $fr[$b]; - my @s = split(/,/, $f[6]); - $sc = $s[$b]; - } - #print old - print $old{"$f[0]:$f[1]"}; - #add new columns - print "\t$freq\t$freq2\t$g\t$sc\n"; - $old{"$f[0]:$f[1]"} = ''; - }elsif (!$tabOnly) { #new table, or don't have this SNP - #need reference allele - if ($f[3] !~ /$f[$ref]/ && $f[4] == 2) { next; } #no reference allele - my $freq = 0; - my $freq2 = 0; - my $sc; - my $g = 1; #genotype == ref allele count - my $alt; - if ($f[4] == 1) { #should be homozygous - if ($f[3] eq $f[$ref]) { $g = 2; $freq = $f[5]; $alt = 'N'; } - else { $g = 0; $freq2 = $f[5]; $alt = $f[3]; } #matches alternate - $sc = $f[6]; - }else { - my $a = 0; #index of a alleles, freq, scores - my $b = 1; #same for b - my @all = split(/\//, $f[3]); - if ($f[$ref] ne $all[0] && $f[$ref] ne $all[1]) { next; } #must match one - if ($f[$ref] eq $all[1]) { #switch indexes - $a = 1; - $b = 0; - } - my @fr = split(/,/, $f[5]); - $freq = $fr[$a]; - $freq2 = $fr[$b]; - my @s = split(/,/, $f[6]); - $sc = $s[$b]; - $alt = $all[$b]; - } - #print initial columns - print "$f[0]\t$f[1]\t$f[$ref]\t$alt\t-1"; - #pad for other individuals if needed - my $i = 5; - while ($i < $colcnt) { - print "\t-1\t-1\t-1\t-1"; - $i += 4; - } - #add new columns - print "\t$freq\t$freq2\t$g\t$sc\n"; - } -} -close FH or die "Couldn't close $in, $!\n"; - -#if adding to a snp table, now we need to finish those not in the latest set -foreach my $k (keys %old) { - if ($old{$k} ne '') { #not printed yet - print $old{$k}, "\t-1\t-1\t-1\t-1\n"; #plus blank for this one - } -} - -exit; - -#parse old header and add or create new -sub prepHeader { - if (!$build) { $build = 'hg19'; } #set default - my @cnames; - my @ind; - my $n; - if (@head) { #parse previous header - my $h = join("", @head); #may split between lines - if ($h =~ /"column_names":\[(.*?)\]/) { - my @t = split(/,/, $1); - foreach (@t) { s/"//g; } - @cnames = @t; - $n = $cnames[$#cnames]; - $n =~ s/Q//; - $n++; - } - if ($h =~ /"dbkey":"(.*?)"/) { $build = $1; } - if ($h =~ /"individuals":\[(.*)\]/) { - my $t = $1; - $t =~ s/\]\].*/]/; #remove if there is more categories - @ind = split(/,/, $t); - } - }else { #start new header - @cnames = ("chr", "pos", "A", "B", "Q"); - $n = 1; - } - #add current - if (!$name) { $name= 'na'; } - my $stcol = $colcnt + 1; - if ($stcol == 1) { $stcol = 6; } #move past initial columns - push(@ind, "[\"$name\",$stcol]"); - push(@cnames, "${n}A", "${n}B", "${n}G", "${n}Q"); - #reassign head - undef @head; - foreach (@cnames) { $_ = "\"$_\""; } #quote name - $head[0] = "#{\"column_names\":[" . join(",", @cnames) . "],"; - $head[1] = "#\"individuals\":[" . join(",", @ind) . "],"; - $head[2] = "#\"dbkey\":\"$build\",\"pos\":2,\"rPos\":2,\"ref\":1,\"scaffold\":1,\"species\":\"$build\"}"; -} -####End - diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/phenotype_association/pgSnp2gd_snp.xml --- a/tools/phenotype_association/pgSnp2gd_snp.xml +++ /dev/null @@ -1,97 +0,0 @@ -<tool id="pgSnp2gd_snp" name="pgSnp to gd_snp" hidden="false"> - <description>Convert from pgSnp to gd_snp</description> - <command interpreter="perl"> - #if $snptab.tab2 == "yes" - #if $snptab.colsOnly == "addColsOnly" #pgSnp2gd_snp.pl $input1 -tab=$snptab.input2 -name=$indName -build=${input1.metadata.dbkey} -addColsOnly -chr=${input1.metadata.chromCol} > $out_file1 - #else #pgSnp2gd_snp.pl $input1 -tab=$snptab.input2 -name=$indName -build=${input1.metadata.dbkey} -ref=${ref} -chr=${input1.metadata.chromCol} > $out_file1 - #end if - #else #pgSnp2gd_snp.pl $input1 -name=$indName -build=${input1.metadata.dbkey} -ref=${ref} -chr=${input1.metadata.chromCol} > $out_file1 - #end if - </command> - <inputs> - <param format="tab" name="input1" type="data" label="pgSnp dataset" /> - <conditional name="snptab"> - <param name="tab2" type="select" label="Append to gd_snp dataset in history"> - <option value="yes">yes</option> - <option value="no" selected="true">no</option> - </param> - <when value="yes"> - <param format="gd_snp" name="input2" type="data" label="gd_snp dataset" /> - <conditional name="needRef"> - <param name="colsOnly" type="select" label="Skip new SNPs"> - <option value="no" selected="true">no</option> - <option value="addColsOnly">yes</option> - </param> - <when value="no"> - <param name="ref" type="data_column" data_ref="input1" label="Column with reference allele" /> - </when> - <when value="addColsOnly"><!-- do nothing --> - </when> - </conditional> - </when> - <when value="no"> - <param name="ref" type="data_column" data_ref="input1" label="Column with reference allele" /> - </when> - </conditional> - <param name="indName" type="text" size="20" label="Label for new individual/group" value="na" /> - </inputs> - <outputs> - <data format="gd_snp" name="out_file1" /> - </outputs> - <tests> - <test> - <param name='input1' value='pgSnpTest.ref.txt' ftype='interval' /> - <param name='tab2' value='no' /> - <param name='ref' value='8' /> - <param name='indName' value='na' /> - <output name="output" file="pgSnp2snp_output.txt" /> - </test> - </tests> - - <help> - -**Dataset formats** - -The input dataset is of Galaxy datatype interval_, with the additional columns -required for pgSnp_ format. -Any further columns beyond those defined for pgSnp will be ignored. -The output dataset is a gd_snp_ table. (`Dataset missing?`_) - -.. _interval: ./static/formatHelp.html#interval -.. _pgSnp: ./static/formatHelp.html#pgSnp -.. _gd_snp: ./static/formatHelp.html#gd_snp -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -This tool converts a pgSnp dataset to gd_snp format, either starting a new -dataset or appending to an old one. When appending, -if any new SNPs appear only in the pgSnp file they can either be skipped entirely, or -backfilled with "-1" (meaning "unknown") for previous individuals/groups in the -input gd_snp dataset. -If any new SNPs are being added (either by creating a new table or by backfilling), -then an extra column with the reference allele must be supplied in the pgSnp dataset, -as shown in the example below. - ------ - -**Example** - -- input pgSnp file, with reference allele added:: - - chr1 1888681 1888682 C/T 2 4,3 0.8893,0.8453 T - chr1 3118325 3118326 T 1 8 0.8796 C - chr1 3211457 3211458 A/C 2 17,10 0.8610,0.8576 A - etc. - -- gd_snp output:: - - chr1 1888681 T C -1 3 4 1 0.8893 - chr1 3118325 C T -1 0 8 0 0.8796 - chr1 3211457 A C -1 17 10 1 0.8576 - etc. - -</help> -</tool> diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/phenotype_association/snpFreq.xml --- a/tools/phenotype_association/snpFreq.xml +++ /dev/null @@ -1,124 +0,0 @@ -<tool id="hgv_snpFreq" name="snpFreq" version="1.0.1"> - <description>significant SNPs in case-control data</description> - - <command interpreter="perl"> - snpFreq2.pl $inTypeCond.inType 0.05 $input $output - #if $inTypeCond.inType == "tab" - $inTypeCond.group1_1 $inTypeCond.group1_2 $inTypeCond.group1_3 - $inTypeCond.group2_1 $inTypeCond.group2_2 $inTypeCond.group2_3 0.05 - #else if $inTypeCond.inType == "snp" - $group1 $group2 - #end if - </command> - - <inputs> - <conditional name="inTypeCond"> - <param name="inType" type="select" label="Format of input" > - <option value="tab">Alleles pre-counted</option> - <option value="snp">SNP table</option> - </param> - <when value="tab"> - <param format="tabular" name="input" type="data" label="Dataset" /> - <param name="group1_1" label="Column with genotype 1 count for group 1" type="data_column" data_ref="input" /> - <param name="group1_2" label="Column with genotype 2 count for group 1" type="data_column" data_ref="input" /> - <param name="group1_3" label="Column with genotype 3 count for group 1" type="data_column" data_ref="input" /> - <param name="group2_1" label="Column with genotype 1 count for group 2" type="data_column" data_ref="input" /> - <param name="group2_2" label="Column with genotype 2 count for group 2" type="data_column" data_ref="input" /> - <param name="group2_3" label="Column with genotype 3 count for group 2" type="data_column" data_ref="input" /> - </when> - <when value="snp"> - <param format="snp" name="input" type="data" label="SNP Dataset" /> - <param format="ind" name="group1" type="data" label="Group 1" /> - <param format="ind" name="group2" type="data" label="Group 2" /> - </when> - </conditional> - </inputs> - - <outputs> - <data format="tabular" name="output" /> - </outputs> - - <requirements> - <requirement type="binary">R</requirement> - </requirements> - - <tests> - <test> - <param name="inType" value="tab" /> - <param name="input" ftype="tabular" value="snpFreqInput.txt" dbkey="hg18" /> - <param name="group1_1" value="4" /> - <param name="group1_2" value="5" /> - <param name="group1_3" value="6" /> - <param name="group2_1" value="7" /> - <param name="group2_2" value="8" /> - <param name="group2_3" value="9" /> - <output name="output" file="snpFreqTestOut.txt" /> - </test> - </tests> - - <help> - -**Dataset formats** - -The input is tabular_, with six columns of allele counts. The output is also tabular, -and includes all of the input data plus the additional columns described below. -(`Dataset missing?`_) - -.. _tabular: ${static_path}/formatHelp.html#tab -.. _Dataset missing?: ${static_path}/formatHelp.html - ------ - -**What it does** - -This tool performs a basic analysis of bi-allelic SNPs in case-control -data, using the R statistical environment and Fisher's exact test to -identify SNPs with a significant difference in the allele frequencies -between the two groups. R's "qvalue" package is used to correct for -multiple testing. - -The input file includes counts for each allele combination (AA aa Aa) -for each group at each SNP position. The assignment of codes (1 2 3) -to these genotypes is arbitrary, as long as it is consistent for both -groups. Any other input columns are ignored in the computation, but -are copied to the output. The output appends eight additional columns, -namely the minimum expected counts of the three genotypes for each -group, the p-value, and the q-value. - ------ - -**Example** - -- input file:: - - chr1 210 211 38 4 15 56 0 1 x - chr1 228 229 55 0 2 56 0 1 x - chr1 230 231 46 0 11 55 0 2 x - chr1 234 235 43 0 14 55 0 2 x - chr1 236 237 55 0 2 13 10 34 x - chr1 437 438 55 0 2 46 0 11 x - chr1 439 440 56 0 1 55 0 2 x - chr1 449 450 56 0 1 13 20 24 x - chr1 518 519 56 0 1 38 4 15 x - -Here the group 1 genotype counts are in columns 4 - 6, while those -for group 2 are in columns 7 - 9. - -Note that the "x" column has no meaning. It was added to this example -to show that extra columns can be included, and to make it easier -to see where the new columns are appended in the output. - -- output file:: - - chr1 210 211 38 4 15 56 0 1 x 47 2 8 47 2 8 1.50219088598917e-05 6.32501425679652e-06 - chr1 228 229 55 0 2 56 0 1 x 55.5 0 1.5 55.5 0 1.5 1 0.210526315789474 - chr1 230 231 46 0 11 55 0 2 x 50.5 0 6.5 50.5 0 6.5 0.0155644201009862 0.00409590002657532 - chr1 234 235 43 0 14 55 0 2 x 49 0 8 49 0 8 0.00210854461554067 0.000739840215979182 - chr1 236 237 55 0 2 13 10 34 x 34 5 18 34 5 18 6.14613878554783e-17 4.31307984950725e-17 - chr1 437 438 55 0 2 46 0 11 x 50.5 0 6.5 50.5 0 6.5 0.0155644201009862 0.00409590002657532 - chr1 439 440 56 0 1 55 0 2 x 55.5 0 1.5 55.5 0 1.5 1 0.210526315789474 - chr1 449 450 56 0 1 13 20 24 x 34.5 10 12.5 34.5 10 12.5 2.25757007974134e-18 2.37638955762246e-18 - chr1 518 519 56 0 1 38 4 15 x 47 2 8 47 2 8 1.50219088598917e-05 6.32501425679652e-06 - - </help> -</tool> diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/phenotype_association/snpFreq2.pl --- a/tools/phenotype_association/snpFreq2.pl +++ /dev/null @@ -1,196 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -#using large SNP tables (~1G) may require large memory ~15G in R - -#expected input: path to file, cols of counts (2 sets of 3), threshold -if (!@ARGV or scalar @ARGV != 11) { - if (!@ARGV or scalar @ARGV != 6) { #snpTable usage - print "usage for tab separated allele counts\n", - "snpFreq.pl inputType #threshold /path/to/snps.txt outfile <6 column numbers(1 based) with counts for alleles, first one group then another>\n"; - print "OR for SNP tables\n"; - print "usage snpFreq.pl inputType #threshold /path/to/snpTable.txt outfile group1File group2File\n"; - exit 1; - } -} - -#get and verify inputs -my ($file, $a1, $a2, $a3, $b1, $b2, $b3, $thresh, $outfile); -if ($ARGV[0] eq 'tab') { - shift @ARGV; - $thresh = shift @ARGV; - if ($thresh !~ /^\d*\.?\d+$/) { - print "Error the threshold must be a number. Got $thresh\n"; - exit 1; - }elsif ($thresh > .3) { - print "Error the threshold can not be greater than 0.3 got $thresh\n"; - exit 1; - } - $file = shift @ARGV; - $outfile = shift @ARGV; - $a1 = shift @ARGV; - if ($a1 =~ /\D/ or $a1 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $a1\n"; - exit 1; - } - $a2 = shift @ARGV; - if ($a2 =~ /\D/ or $a2 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $a2\n"; - exit 1; - } - $a3 = shift @ARGV; - if ($a3 =~ /\D/ or $a3 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $a3\n"; - exit 1; - } - $b1 = shift @ARGV; - if ($b1 =~ /\D/ or $b1 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $b1\n"; - exit 1; - } - $b2 = shift @ARGV; - if ($b2 =~ /\D/ or $b2 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $b2\n"; - exit 1; - } - $b3 = shift @ARGV; - if ($b3 =~ /\D/ or $b3 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $b3\n"; - exit 1; - } -}else { #snp table convert and assign variables - #snpTable.txt #threshold outfile workingdir - shift @ARGV; - $thresh = shift @ARGV; - if ($thresh !~ /^\d*\.?\d+$/) { - print "Error the threshold must be a number. Got $thresh\n"; - exit 1; - }elsif ($thresh > .3) { - print "Error the threshold can not be greater than 0.3 got $thresh\n"; - exit 1; - } - $file = shift @ARGV; - $outfile = shift @ARGV; - my $grpFile = shift @ARGV; - my @g1; - open(FH, $grpFile) or die "Couldn't open $grpFile, $!\n"; - while (<FH>) { - chomp; - if (/^(\d+)\s/) { push(@g1, $1); } - } - close FH or die "Couldn't close $grpFile, $!\n"; - $grpFile = shift @ARGV; - my @g2; - open(FH, $grpFile) or die "Couldn't open $grpFile, $!\n"; - while (<FH>) { - chomp; - if (/^(\d+)\s/) { push(@g2, $1); } - } - close FH or die "Couldn't close $grpFile, $!\n"; - if ($file =~ /.gz$/) { - open(FH, "zcat $file |") or die "Couldn't read $file, $!\n"; - }else { - open(FH, $file) or die "Couldn't read $file, $!\n"; - } - open(OUT, ">", "snpTable.txt") or die "Couldn't open snpTable.txt, $!\n"; - my $size; - while (<FH>) { - chomp; - if (/^#/) { next; } #header - my @f = split(/\t/); - $size = scalar @f; - my @gc1 = (0, 0, 0); - my @gc2 = (0, 0, 0); - foreach my $g (@g1) { - my $i = $g+1; #g is 1 based first col want 0 based snp call column - if ($i > $#f) { die "ERROR looking for index $i which is greater than the list $#f\n"; } - if ($f[$i] == -1 or $f[$i] == 2) { #treat unknown as ref - $gc1[0]++; - }elsif ($f[$i] == 1) { - $gc1[2]++; - }elsif ($f[$i] == 0) { - $gc1[1]++; - }else { die "Unexpected value for genotype $f[$i] in ", join(" ", @f), "\n"; } - } - foreach my $g (@g2) { - my $i = $g+1; #g is 1 based first col want 0 based snp call column - if ($f[$i] == -1 or $f[$i] == 2) { #treat unknown as ref - $gc2[0]++; - }elsif ($f[$i] == 1) { - $gc2[2]++; - }elsif ($f[$i] == 0) { - $gc2[1]++; - }else { die "Unexpected value for genotype $f[$i] in ", join(" ", @f), "\n"; } - } - print OUT join("\t", @f), "\t", join("\t", @gc1), "\t", join("\t", @gc2), - "\n"; - } - close FH or die "Couldn't close $file, $!\n"; - close OUT or die "Couldn't close snpTable.txt, $!\n"; - my $i = $size + 1; #next 1 based column after input data - $a1 = $i++; - $a2 = $i++; - $a3 = $i++; - $b1 = $i++; - $b2 = $i++; - $b3 = $i++; - $file = "snpTable.txt"; -} - -#run a fishers exact test (using R) on whole table -my $cmd = qq|options(warn=-1) - tab <- read.table('$file', sep="\t") - size <- length(tab[,1]) - width <- length(tab[1,]) - x <- 1:size - y <- matrix(data=0, nr=size, nc=6) - for(i in 1:size) { - m <- matrix(c(tab[i,$a1], tab[i,$b1], tab[i,$a2], tab[i,$b2], tab[i,$a3], tab[i,$b3]), nrow=2) - t <- fisher.test(m) - x[i] <- t\$p.value - if (x[i] >= 1) { - x[i] <- .999 - } - n <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3] + tab[i,$b1] + tab[i,$b2] + tab[i,$b3]) - n_a <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3]) - y[i,1] <- ((tab[i,$a1] + tab[i,$b1])*(n_a))/n - y[i,1] <- round(y[i,1],3) - y[i,2] <- ((tab[i,$a2] + tab[i,$b2])*(n_a))/n - y[i,2] <- round(y[i,2],3) - y[i,3] <- ((tab[i,$a3] + tab[i,$b3])*(n_a))/n - y[i,3] <- round(y[i,3],3) - n_b <- (tab[i,$b1] + tab[i,$b2] + tab[i,$b3]) - y[i,4] <- ((tab[i,$a1] + tab[i,$b1])*(n_b))/n - y[i,4] <- round(y[i,4],3) - y[i,5] <- ((tab[i,$a2] + tab[i,$b2])*(n_b))/n - y[i,5] <- round(y[i,5],3) - y[i,6] <- ((tab[i,$a3] + tab[i,$b3])*(n_b))/n - y[i,6] <- round(y[i,6],3) - }|; - #results <- data.frame(tab[1:size,1:width], x[1:size]) - #write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") - #q()|; - -#my $cmd2 = qq|suppressPackageStartupMessages(library(lib.loc='/afs/bx.psu.edu/home/giardine/lib/R', qvalue)) -my $cmd2 = qq|suppressPackageStartupMessages(library(qvalue)) - qobj <- qvalue(x[1:size], lambda=seq(0,0.90,$thresh), pi0.method="bootstrap", fdr.level=0.1, robust=FALSE, smooth.log.pi0 = FALSE) - q <- qobj\$qvalues - results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size], q[1:size]) - write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") - q()|; - -#for TESTING -my $pr = qq|results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size]) - write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") - q()|; - -open(FT, "| R --slave --vanilla") - or die "Couldn't call fisher.text, $!\n"; -print FT $cmd, "\n"; #fisher test -print FT $cmd2, "\n"; #qvalues and results -#print FT $pr, "\n"; -close FT or die "Couldn't finish fisher.test, $!\n"; - -exit; diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/phenotype_association/vcf2pgSnp.pl --- a/tools/phenotype_association/vcf2pgSnp.pl +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -#convert from a vcf file to a pgSnp file. -#frequency count = chromosome count -#either a single column/individual -#or all columns as a population - -my $in; -my $stCol = 9; -my $endCol; -if (@ARGV && scalar @ARGV == 2) { - $stCol = shift @ARGV; - $in = shift @ARGV; - if ($stCol eq 'all') { $stCol = 10; } - else { $endCol = $stCol; } - $stCol--; #go from 1 based to zero based column number - if ($stCol < 9) { - print "ERROR genotype fields don't start until column 10\n"; - exit; - } -}elsif (@ARGV && scalar @ARGV == 1) { - $in = shift @ARGV; -}elsif (@ARGV) { - print "usage: vcf2pgSnp.pl [indColNum default=all] file.vcf > file.pgSnp\n"; - exit; -} - -open(FH, $in) or die "Couldn't open $in, $!\n"; -while (<FH>) { - chomp; - if (/^\s*#/) { next; } #skip comments/headers - if (/^\s*$/) { next; } #skip blank lines - my @f = split(/\t/); - #chr pos1base ID refNt altNt[,|D#|Int] quality filter info format geno1 ... - my $a; - my %nt; - my %all; - my $cnt = 0; - my $var; - if ($f[3] eq 'N') { next; } #ignore ref=N - if ($f[4] =~ /[DI]/ or $f[3] =~ /[DI]/) { next; } #don't do microsatellite - #if ($f[4] =~ /[ACTG],[ACTG]/) { next; } #only do positions with single alternate - if ($f[6] && !($f[6] eq '.' or $f[6] eq 'PASS')) { next; } #filtered for some reason - my $ind = 0; - if ($f[8] ne 'GT') { #more than just genotype - my @t = split(/:/, $f[8]); - foreach (@t) { if ($_ eq 'GT') { last; } $ind++; } - if ($ind == 0 && $f[8] !~ /^GT/) { die "ERROR couldn't find genotype in format $f[8]\n"; } - } - #count 0's, 1's, 2's - if (!$endCol) { $endCol = $#f; } - foreach my $col ($stCol .. $endCol) { - if ($ind > 0) { - my @t = split(/:/, $f[$col]); - $f[$col] = $t[$ind] . ":"; #only keep genotype part - } - if ($f[$col] =~ /^(0|1|2).(0|1|2)/) { - $nt{$1}++; - $nt{$2}++; - }elsif ($f[$col] =~ /^(0|1|2):/) { #chrY or male chrX, single - $nt{$1}++; - } #else ignore - } - if (%nt) { - if ($f[0] !~ /chr/) { $f[0] = "chr$f[0]"; } - print "$f[0]\t", ($f[1]-1), "\t$f[1]\t"; #position info - my $cnt = scalar(keys %nt); - my $fr; - my $sc; - my $all; - if (exists $nt{0}) { - $all = uc($f[3]); - $fr = $nt{0}; - $sc = 0; - } - if (!exists $nt{0} && exists $nt{1}) { - if ($f[4] =~ /([ACTG]),?/) { - $all = $1; - $fr = $nt{1}; - $sc = 0; - }else { die "bad variant nt $f[4] for nt 1"; } - }elsif (exists $nt{1}) { - if ($f[4] =~ /([ACTG]),?/) { - $all .= '/' . $1; - $fr .= ",$nt{1}"; - $sc .= ",0"; - }else { die "bad variant nt $f[4] for nt 1"; } - } - if (exists $nt{2}) { - if ($f[4] =~ /^[ACTG],([ACTG]),?/) { - $all .= '/' . $1; - $fr .= ",$nt{2}"; - $sc .= ",0"; - }else { die "bad variant nt $f[4] for nt 2"; } - } - if (exists $nt{3}) { - if ($f[4] =~ /^[ACTG],[ACTG],([ACTG])/) { - $all .= '/' . $1; - $fr .= ",$nt{3}"; - $sc .= ",0"; - }else { die "bad variant nt $f[4] for nt 3"; } - } - if (exists $nt{4}) { - if ($f[4] =~ /^[ACTG],[ACTG],[ACTG],([ACTG])/) { - $all .= '/' . $1; - $fr .= ",$nt{4}"; - $sc .= ",0"; - }else { die "bad variant nt $f[4] for nt 4"; } - } - print "$all\t$cnt\t$fr\t$sc\n"; - } -} -close FH; - -exit; diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/phenotype_association/vcf2pgSnp.xml --- a/tools/phenotype_association/vcf2pgSnp.xml +++ /dev/null @@ -1,79 +0,0 @@ -<tool id="vcf2pgSnp" name="VCF to pgSnp" hidden="false"> - <description>Convert from VCF to pgSnp format</description> - <command interpreter="perl"> - #if $inType.how == "all" #vcf2pgSnp.pl all $input1 > $out_file1 - #else #vcf2pgSnp.pl $inType.ind_column $input1 > $out_file1 - #end if - </command> - <inputs> - <param format="vcf" name="input1" type="data" label="VCF dataset" /> - <conditional name="inType"> - <param name="how" type="select" label="How to treat individuals"> - <option value="all">Group all as a population</option> - <option value="one">Do just one individual</option> - </param> - <when value="one"> - <param name="ind_column" type="data_column" data_ref="input1" label="Column to convert" value="10" /> - </when> - <when value="all"> - <!-- do nothing --> - </when> - </conditional> - </inputs> - <outputs> - <data format="interval" name="out_file1" /> - </outputs> - <tests> - <test> - <param name="input1" value="vcf2pgSnp_input.vcf" ftype="vcf" /> - <param name="how" value="all" /> - <output name="output" file="vcf2pgSnp_output.pgSnp" /> - </test> - </tests> - - <help> -**Dataset formats** - -The input dataset is VCF_ format. -The output dataset is pgSnp_. (`Dataset missing?`_) - -.. _Dataset missing?: ./static/formatHelp.html -.. _VCF: ./static/formatHelp.html#vcf -.. _pgSnp: ./static/formatHelp.html#pgSnp - ------ - -**What it does** - -This converts a VCF dataset to pgSnp with the frequency counts being -chromosome counts. If there is more than one column of SNP data it will either -accumulate all columns as a population or convert the column indicated -to pgSnp. - ------ - -**Examples** - -- input:: - - 1 13327 rs144762171 G C 100 PASS VT=SNP;SNPSOURCE=LOWCOV GT:DS:GL 0|0:0.000:-0.03,-1.11,-5.00 0|1:1.000:-1.97,-0.01,-2.51 0|0:0.050:-0.01,-1.69,-5.00 0|0:0.100:-0.48,-0.48,-0.48 - 1 13980 rs151276478 T C 100 PASS VT=SNP;SNPSOURCE=LOWCOV GT:DS:GL 0|0:0.100:-0.48,-0.48,-0.48 0|1:0.950:-0.48,-0.48,-0.48 0|0:0.050:-0.48,-0.48,-0.48 0|0:0.050:-0.48,-0.48,-0.48 - 1 30923 rs140337953 G T 100 PASS VT=SNP;SNPSOURCE=LOWCOV GT:DS:GL 1|1:1.950:-5.00,-0.61,-0.12 0|0:0.450:-0.10,-0.69,-2.81 0|0:0.450:-0.11,-0.64,-3.49 1|1:1.500:-0.48,-0.48,-0.48 - etc. - -- output as a population:: - - chr1 13326 13327 G/C 2 7,1 0,0 - chr1 13979 13980 T/C 2 7,1 0,0 - chr1 30922 30923 G/T 2 4,4 0,0 - etc. - -- output for each column separately:: - - chr1 13326 13327 G 1 2 0 G/C 2 1,1 0,0 G 1 2 0 G 1 2 0 - chr1 13979 13980 T 1 2 0 T/C 2 1,1 0,0 T 1 2 0 T 1 2 0 - chr1 30922 30923 T 1 2 0 G 1 2 0 G 1 2 0 T 1 2 0 - etc. - -</help> -</tool> diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/plotting/histogram.py --- a/tools/plotting/histogram.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python -#Greg Von Kuster - -import sys -from rpy import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - - # Handle input params - in_fname = sys.argv[1] - out_fname = sys.argv[2] - try: - column = int( sys.argv[3] ) - 1 - except: - stop_err( "Column not specified, your query does not contain a column of numerical data." ) - title = sys.argv[4] - xlab = sys.argv[5] - breaks = int( sys.argv[6] ) - if breaks == 0: - breaks = "Sturges" - if sys.argv[7] == "true": - density = True - else: density = False - if len( sys.argv ) >= 9 and sys.argv[8] == "true": - frequency = True - else: frequency = False - - matrix = [] - skipped_lines = 0 - first_invalid_line = 0 - invalid_value = '' - i = 0 - for i, line in enumerate( file( in_fname ) ): - valid = True - line = line.rstrip('\r\n') - # Skip comments - if line and not line.startswith( '#' ): - # Extract values and convert to floats - row = [] - try: - fields = line.split( "\t" ) - val = fields[column] - if val.lower() == "na": - row.append( float( "nan" ) ) - except: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i+1 - else: - try: - row.append( float( val ) ) - except ValueError: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i+1 - invalid_value = fields[column] - else: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i+1 - - if valid: - matrix += row - - if skipped_lines < i: - try: - a = r.array( matrix ) - r.pdf( out_fname, 8, 8 ) - histogram = r.hist( a, probability=not frequency, main=title, xlab=xlab, breaks=breaks ) - if density: - density = r.density( a ) - if frequency: - scale_factor = len( matrix ) * ( histogram['mids'][1] - histogram['mids'][0] ) #uniform bandwidth taken from first 2 midpoints - density[ 'y' ] = map( lambda x: x * scale_factor, density[ 'y' ] ) - r.lines( density ) - r.dev_off() - except Exception, exc: - stop_err( "%s" %str( exc ) ) - else: - if i == 0: - stop_err("Input dataset is empty.") - else: - stop_err( "All values in column %s are non-numeric." %sys.argv[3] ) - - print "Histogram of column %s. " %sys.argv[3] - if skipped_lines > 0: - print "Skipped %d invalid lines starting with line #%d, '%s'." % ( skipped_lines, first_invalid_line, invalid_value ) - - r.quit( save="no" ) - -if __name__ == "__main__": - main() diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/plotting/histogram2.xml --- a/tools/plotting/histogram2.xml +++ /dev/null @@ -1,77 +0,0 @@ -<tool id="histogram_rpy" name="Histogram" version="1.0.3"> - <description>of a numeric column</description> - <command interpreter="python">histogram.py $input $out_file1 $numerical_column "$title" "$xlab" $breaks $density $frequency</command> - <inputs> - <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? See TIP below"/> - <param name="numerical_column" type="data_column" data_ref="input" numerical="True" label="Numerical column for x axis" /> - <param name="breaks" type="integer" size="4" value="0" label="Number of breaks (bars)"/> - <param name="title" type="text" size="30" value="Histogram" label="Plot title"/> - <param name="xlab" type="text" size="30" value="V1" label="Label for x axis"/> - <param name="density" type="boolean" checked="yes" label="Include smoothed density"/> - <param name="frequency" type="boolean" checked="no" label="Plot as frequency (counts)"/> - </inputs> - <outputs> - <data format="pdf" name="out_file1" /> - </outputs> - <tests> - <test> - <param name="input" value="histogram_in1.tabular" ftype="tabular"/> - <param name="numerical_column" value="2"/> - <param name="breaks" value="0"/> - <param name="title" value="Histogram"/> - <param name="xlab" value="V1"/> - <param name="density" value="true"/> - <param name="frequency" value="false"/> - <output name="out_file1" file="histogram_out1.pdf"/> - </test> - </tests> - <requirements> - <requirement type="python-module">rpy</requirement> - <requirement type="package">R</requirement> - </requirements> - <help> - -.. class:: infomark - -**TIP:** To remove comment lines that do not begin with a *#* character, use *Text Manipulation->Remove beginning* - - .. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool computes a histogram of the numerical values in a column of a dataset. - -- All invalid, blank and comment lines in the dataset are skipped. The number of skipped lines is displayed in the resulting history item. -- **Column for x axis** - only numerical columns are possible. -- **Number of breaks(bars)** - breakpoints between histogram cells. Value of '0' will determine breaks automatically. -- **Plot title** - the histogram title. -- **Label for x axis** - the label of the x axis for the histogram. -- **Include smoothed density** - if checked, the resulting graph will join the given corresponding points with line segments. - ------ - -**Example** - -- Input file:: - - 1 68 4.1 - 2 71 4.6 - 3 62 3.8 - 4 75 4.4 - 5 58 3.2 - 6 60 3.1 - 7 67 3.8 - 8 68 4.1 - 9 71 4.3 - 10 69 3.7 - -- Create a histogram on column 2 of the above dataset. - -.. image:: ${static_path}/images/histogram2.png - -</help> -</tool> diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/plotting/scatterplot.py --- a/tools/plotting/scatterplot.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python -#Greg Von Kuster - -import sys -from rpy import * - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - - in_fname = sys.argv[1] - out_fname = sys.argv[2] - try: - columns = int( sys.argv[3] ) - 1, int( sys.argv[4] ) - 1 - except: - stop_err( "Columns not specified, your query does not contain a column of numerical data." ) - title = sys.argv[5] - xlab = sys.argv[6] - ylab = sys.argv[7] - - matrix = [] - skipped_lines = 0 - first_invalid_line = 0 - invalid_value = '' - invalid_column = 0 - i = 0 - for i, line in enumerate( file( in_fname ) ): - valid = True - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - row = [] - fields = line.split( "\t" ) - for column in columns: - try: - val = fields[column] - if val.lower() == "na": - row.append( float( "nan" ) ) - else: - row.append( float( fields[column] ) ) - except: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i + 1 - try: - invalid_value = fields[column] - except: - invalid_value = '' - invalid_column = column + 1 - break - else: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i+1 - - if valid: - matrix.append( row ) - - if skipped_lines < i: - try: - r.pdf( out_fname, 8, 8 ) - r.plot( array( matrix ), type="p", main=title, xlab=xlab, ylab=ylab, col="blue", pch=19 ) - r.dev_off() - except Exception, exc: - stop_err( "%s" %str( exc ) ) - else: - stop_err( "All values in both columns %s and %s are non-numeric or empty." % ( sys.argv[3], sys.argv[4] ) ) - - print "Scatter plot on columns %s, %s. " % ( sys.argv[3], sys.argv[4] ) - if skipped_lines > 0: - print "Skipped %d lines starting with line #%d, value '%s' in column %d is not numeric." % ( skipped_lines, first_invalid_line, invalid_value, invalid_column ) - - r.quit( save="no" ) - -if __name__ == "__main__": - main() diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/plotting/scatterplot.xml --- a/tools/plotting/scatterplot.xml +++ /dev/null @@ -1,71 +0,0 @@ -<tool id="scatterplot_rpy" name="Scatterplot"> - <description>of two numeric columns</description> - <command interpreter="python">scatterplot.py $input $out_file1 $col1 $col2 "$title" "$xlab" "$ylab"</command> - <inputs> - <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? See TIP below"/> - <param name="col1" type="data_column" data_ref="input" numerical="True" label="Numerical column for x axis" /> - <param name="col2" type="data_column" data_ref="input" numerical="True" label="Numerical column for y axis" /> - <param name="title" size="30" type="text" value="Scatterplot" label="Plot title"/> - <param name="xlab" size="30" type="text" value="V1" label="Label for x axis"/> - <param name="ylab" size="30" type="text" value="V2" label="Label for y axis"/> - </inputs> - <outputs> - <data format="pdf" name="out_file1" /> - </outputs> - <requirements> - <requirement type="python-module">rpy</requirement> - </requirements> - <!-- TODO: uncomment the following test when we have tools.update_state() working for - multiple dependents with the same dependency. - <tests> - <test> - <param name="input" value="scatterplot_in1.tabular" ftype="tabular"/> - <param name="col1" value="2"/> - <param name="col2" value="3"/> - <param name="title" value="Scatterplot"/> - <param name="xlab" value="V1"/> - <param name="ylab" value="V2"/> - <output name="out_file1" file="scatterplot_out1.pdf" /> - </test> - </tests> - --> - <help> - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool creates a simple scatter plot between two variables containing numeric values of a selected dataset. - -- All invalid, blank and comment lines in the dataset are skipped. The number of skipped lines is displayed in the resulting history item. - -- **Plot title** The scatterplot title -- **Label for x axis** and **Label for y axis** The labels for x and y axis of the scatterplot. - ------ - -**Example** - -- Input file:: - - 1 68 4.1 - 2 71 4.6 - 3 62 3.8 - 4 75 4.4 - 5 58 3.2 - 6 60 3.1 - 7 67 3.8 - 8 68 4.1 - 9 71 4.3 - 10 69 3.7 - -- Create a simple scatterplot between the variables in column 2 and column 3 of the above dataset. - -.. image:: ${static_path}/images/scatterplot.png - -</help> -</tool> diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/MINE.xml --- a/tools/stats/MINE.xml +++ /dev/null @@ -1,82 +0,0 @@ -<tool id="maximal_information_based_nonparametric_exploration" name="MINE" version="0.0.1"> - <description>- Maximal Information-based Nonparametric Exploration</description> - <requirements> - <requirement type="package" version="1.0">MINE</requirement> - </requirements> - <command interpreter="python">mine_wrapper.py - --jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/mine/MINE.jar" - - --infile "${input_file}" - - #if str( $master_variable_type.master_variable_type_selector ) in [ 'allPairs', 'adjacentPairs' ]: - --master_variable "${master_variable_type.master_variable_type_selector}" - #else: - --master_variable "${master_variable_type.master_variable}" - #end if - - --cv "${cv}" - - --exp "${exp}" - - --c "${c}" - - ##--gc ##skip - - - #if str( $master_variable_type.master_variable_type_selector ) != 'allPairs' and $master_variable_type.permute: - --permute - #end if - - --output_results "${output_results}" - - --output_log "${output_log}" - </command> - <inputs> - <param name="input_file" type="data" format="csv" label="CSV file" /> - - <conditional name="master_variable_type"> - <param name="master_variable_type_selector" type="select" label="Choose the master variable type"> - <option value="allPairs">allPairs</option> - <option value="adjacentPairs">adjacentPairs</option> - <option value="compare_against_ith" selected="True">compare against i-th</option> - </param> - <when value="compare_against_ith"> - <param type="integer" value="0" name="master_variable" /> - <param type="boolean" truevalue="--permute" false_value="" name="permute" checked="False" /> - </when> - <when value="adjacentPairs"> - <param type="boolean" truevalue="--permute" false_value="" name="permute" checked="False" /> - </when> - </conditional> - - <param type="float" value="0" name="cv" /> - - <param type="float" value="0.6" name="exp" /> - - <param type="float" value="15" name="c" /> - - </inputs> - <outputs> - <data format="csv" name="output_results" label="${tool.name} on ${on_string} (Results)" /> - <data format="txt" name="output_log" label="${tool.name} on ${on_string} (log)" /> - </outputs> - <tests> - <!-- TODO --> - </tests> - <help> -**What it does** - -Applies the Maximal Information-based Nonparametric Exploration strategy to an input dataset. - -See http://www.exploredata.net/ for more information. - ------- - -**Citation** - -For the underlying tool, please cite `David N. Reshef, Yakir A. Reshef, Hilary K. Finucane5, Sharon R. Grossman, Gilean McVean, Peter J. Turnbaugh, Eric S. Lander, Michael Mitzenmacher, Pardis C. Sabeti Detecting Novel Associations in Large Data Sets. Science. 2011 Dec. <http://www.sciencemag.org/content/334/6062/1518>`_ - -If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.* - - </help> -</tool> diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/column_maker.py --- a/tools/stats/column_maker.py +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/env python -# This tool takes a tab-delimited textfile as input and creates another column in the file which is the result of -# a computation performed on every row in the original file. The tool will skip over invalid lines within the file, -# informing the user about the number of lines skipped. - -import sys, re -# These functions may be used in compute expression: -from math import log,exp,sqrt,ceil,floor - - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -inp_file = sys.argv[1] -out_file = sys.argv[2] -expr = sys.argv[3] -round_result = sys.argv[4] -try: - in_columns = int( sys.argv[5] ) -except: - stop_err( "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." ) -if in_columns < 2: - # To be considered tabular, data must fulfill requirements of the sniff.is_column_based() method. - stop_err( "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." ) -try: - in_column_types = sys.argv[6].split( ',' ) -except: - stop_err( "Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." ) -if len( in_column_types ) != in_columns: - stop_err( "The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." ) - -# Unescape if input has been escaped -mapped_str = { - '__lt__': '<', - '__le__': '<=', - '__eq__': '==', - '__ne__': '!=', - '__gt__': '>', - '__ge__': '>=', - '__sq__': '\'', - '__dq__': '"', -} -for key, value in mapped_str.items(): - expr = expr.replace( key, value ) - -operators = 'is|not|or|and' -builtin_and_math_functions = 'abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor' -string_and_list_methods = [ name for name in dir('') + dir([]) if not name.startswith('_') ] -whitelist = "^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s)*$" % (operators, builtin_and_math_functions, '|'.join(string_and_list_methods)) -if not re.compile(whitelist).match(expr): - stop_err("Invalid expression") - -# Prepare the column variable names and wrappers for column data types -cols, type_casts = [], [] -for col in range( 1, in_columns + 1 ): - col_name = "c%d" % col - cols.append( col_name ) - col_type = in_column_types[ col - 1 ].strip() - if round_result == 'no' and col_type == 'int': - col_type = 'float' - type_cast = "%s(%s)" % ( col_type, col_name ) - type_casts.append( type_cast ) - -col_str = ', '.join( cols ) # 'c1, c2, c3, c4' -type_cast_str = ', '.join( type_casts ) # 'str(c1), int(c2), int(c3), str(c4)' -assign = "%s = line.split( '\\t' )" % col_str -wrap = "%s = %s" % ( col_str, type_cast_str ) -skipped_lines = 0 -first_invalid_line = 0 -invalid_line = None -lines_kept = 0 -total_lines = 0 -out = open( out_file, 'wt' ) - -# Read input file, skipping invalid lines, and perform computation that will result in a new column -code = ''' -for i, line in enumerate( file( inp_file ) ): - total_lines += 1 - line = line.rstrip( '\\r\\n' ) - if not line or line.startswith( '#' ): - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - continue - try: - %s - %s - new_val = %s - if round_result == "yes": - new_val = int( round( new_val ) ) - new_line = line + '\\t' + str( new_val ) - print >> out, new_line - lines_kept += 1 - except: - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line -''' % ( assign, wrap, expr ) - -valid_expr = True -try: - exec code -except Exception, e: - out.close() - if str( e ).startswith( 'invalid syntax' ): - valid_expr = False - stop_err( 'Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr ) - else: - stop_err( str( e ) ) - -if valid_expr: - out.close() - valid_lines = total_lines - skipped_lines - print 'Creating column %d with expression %s' % ( in_columns + 1, expr ) - if valid_lines > 0: - print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines ) - else: - print 'Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr - if skipped_lines > 0: - print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/column_maker.xml --- a/tools/stats/column_maker.xml +++ /dev/null @@ -1,83 +0,0 @@ -<tool id="Add_a_column1" name="Compute" version="1.1.0"> - <description>an expression on every row</description> - <command interpreter="python"> - column_maker.py $input $out_file1 "$cond" $round ${input.metadata.columns} "${input.metadata.column_types}" - </command> - <inputs> - <param name="cond" size="40" type="text" value="c3-c2" label="Add expression"/> - <param format="tabular" name="input" type="data" label="as a new column to" help="Dataset missing? See TIP below"/> - <param name="round" type="select" label="Round result?"> - <option value="no">NO</option> - <option value="yes">YES</option> - </param> - </inputs> - <outputs> - <data format="input" name="out_file1" metadata_source="input"/> - </outputs> - <tests> - <test> - <param name="cond" value="c3-c2"/> - <param name="input" value="1.bed"/> - <param name="round" value="no"/> - <output name="out_file1" file="column_maker_out1.interval"/> - </test> - <test> - <param name="cond" value="c4*1"/> - <param name="input" value="1.interval"/> - <param name="round" value="no"/> - <output name="out_file1" file="column_maker_out2.interval"/> - </test> - <test> - <param name="cond" value="c4*1"/> - <param name="input" value="1.interval"/> - <param name="round" value="yes"/> - <output name="out_file1" file="column_maker_out3.interval"/> - </test> - </tests> - <help> - - .. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**What it does** - -This tool computes an expression for every row of a dataset and appends the result as a new column (field). - -- Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file - -- **c3-c2** will add a length column to the dataset if **c2** and **c3** are start and end position - ------ - -**Example** - -If this is your input:: - - chr1 151077881 151077918 2 200 - - chr1 151081985 151082078 3 500 + - -computing "c4*c5" will produce:: - - chr1 151077881 151077918 2 200 - 400.0 - chr1 151081985 151082078 3 500 + 1500.0 - -if, at the same time, "Round result?" is set to **YES** results will look like this:: - - chr1 151077881 151077918 2 200 - 400 - chr1 151081985 151082078 3 500 + 1500 - -You can also use this tool to evaluate expressions. For example, computing "c3>=c2" for Input will result in the following:: - - chr1 151077881 151077918 2 200 - True - chr1 151081985 151082078 3 500 + True - -or computing "type(c2)==type('') for Input will return:: - - chr1 151077881 151077918 2 200 - False - chr1 151081985 151082078 3 500 + False - -</help> -</tool> diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/cor.py --- a/tools/stats/cor.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python -#Greg Von Kuster -""" -Calculate correlations between numeric columns in a tab delim file. -usage: %prog infile output.txt columns method -""" - -import sys -from rpy import * - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - method = sys.argv[4] - assert method in ( "pearson", "kendall", "spearman" ) - - try: - columns = map( int, sys.argv[3].split( ',' ) ) - except: - stop_err( "Problem determining columns, perhaps your query does not contain a column of numerical data." ) - - matrix = [] - skipped_lines = 0 - first_invalid_line = 0 - invalid_value = '' - invalid_column = 0 - - for i, line in enumerate( file( sys.argv[1] ) ): - valid = True - line = line.rstrip('\n\r') - - if line and not line.startswith( '#' ): - # Extract values and convert to floats - row = [] - for column in columns: - column -= 1 - fields = line.split( "\t" ) - if len( fields ) <= column: - valid = False - else: - val = fields[column] - if val.lower() == "na": - row.append( float( "nan" ) ) - else: - try: - row.append( float( fields[column] ) ) - except: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i+1 - invalid_value = fields[column] - invalid_column = column+1 - else: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i+1 - - if valid: - matrix.append( row ) - - if skipped_lines < i: - try: - out = open( sys.argv[2], "w" ) - except: - stop_err( "Unable to open output file" ) - - # Run correlation - try: - value = r.cor( array( matrix ), use="pairwise.complete.obs", method=method ) - except Exception, exc: - out.close() - stop_err("%s" %str( exc )) - for row in value: - print >> out, "\t".join( map( str, row ) ) - out.close() - - if skipped_lines > 0: - msg = "..Skipped %d lines starting with line #%d. " %( skipped_lines, first_invalid_line ) - if invalid_value and invalid_column > 0: - msg += "Value '%s' in column %d is not numeric." % ( invalid_value, invalid_column ) - print msg - -if __name__ == "__main__": - main() diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/cor.xml --- a/tools/stats/cor.xml +++ /dev/null @@ -1,101 +0,0 @@ -<tool id="cor2" name="Correlation"> - <description>for numeric columns</description> - <command interpreter="python">cor.py $input1 $out_file1 $numeric_columns $method</command> - <inputs> - <param format="tabular" name="input1" type="data" label="Dataset" help="Dataset missing? See TIP below"/> - <param name="numeric_columns" label="Numerical columns" type="data_column" numerical="True" multiple="True" data_ref="input1" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" /> - <param name="method" type="select" label="Method"> - <option value="pearson">Pearson</option> - <option value="kendall">Kendall rank</option> - <option value="spearman">Spearman rank</option> - </param> - </inputs> - <outputs> - <data format="txt" name="out_file1" /> - </outputs> - <requirements> - <requirement type="python-module">rpy</requirement> - </requirements> - <tests> - <!-- - Test a tabular input with the first line being a comment without a # character to start - --> - <test> - <param name="input1" value="cor.tabular" /> - <param name="numeric_columns" value="2,3" /> - <param name="method" value="pearson" /> - <output name="out_file1" file="cor_out.txt" /> - </test> - </tests> - <help> - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - -.. class:: warningmark - -Missing data ("nan") removed from each pairwise comparison - ------ - -**Syntax** - -This tool computes the matrix of correlation coefficients between numeric columns. - -- All invalid, blank and comment lines are skipped when performing computations. The number of skipped lines is displayed in the resulting history item. - -- **Pearson's Correlation** reflects the degree of linear relationship between two variables. It ranges from +1 to -1. A correlation of +1 means that there is a perfect positive linear relationship between variables. The formula for Pearson's correlation is: - - .. image:: ${static_path}/images/pearson.png - - where n is the number of items - -- **Kendall's rank correlation** is used to measure the degree of correspondence between two rankings and assessing the significance of this correspondence. The formula for Kendall's rank correlation is: - - .. image:: ${static_path}/images/kendall.png - - where n is the number of items, and P is the sum. - -- **Spearman's rank correlation** assesses how well an arbitrary monotonic function could describe the relationship between two variables, without making any assumptions about the frequency distribution of the variables. The formula for Spearman's rank correlation is - - .. image:: ${static_path}/images/spearman.png - - where D is the difference between the ranks of corresponding values of X and Y, and N is the number of pairs of values. - ------ - -**Example** - -- Input file:: - - #Person Height Self Esteem - 1 68 4.1 - 2 71 4.6 - 3 62 3.8 - 4 75 4.4 - 5 58 3.2 - 6 60 3.1 - 7 67 3.8 - 8 68 4.1 - 9 71 4.3 - 10 69 3.7 - 11 68 3.5 - 12 67 3.2 - 13 63 3.7 - 14 62 3.3 - 15 60 3.4 - 16 63 4.0 - 17 65 4.1 - 18 67 3.8 - 19 63 3.4 - 20 61 3.6 - -- Computing the correlation coefficients between columns 2 and 3 of the above file (using Pearson's Correlation), the output is:: - - 1.0 0.730635686279 - 0.730635686279 1.0 - - So the correlation for our twenty cases is .73, which is a fairly strong positive relationship. - </help> -</tool> diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/correlation.pl --- a/tools/stats/correlation.pl +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/perl - -########################################################################### -# Purpose: To calculate the correlation of two sets of scores in one file. -# Usage: correlation.pl infile.bed output.txt column1 column2 -# (column start from 1) -# Written by: Yi Zhang (June, 2005) -########################################################################### -if (!$ARGV[0] || !$ARGV[1] || !defined($ARGV[2]) || !defined($ARGV[3]) ) { - print STDERR "Usage: correlation.pl infile.bed output.txt column1 column2\n"; - print STDERR " (column start from 1)\n"; - exit; -} -my $file = $ARGV[0]; -my $out = $ARGV[1]; - -die "<font color=\"yellow\">The input columns contain numerical values: $ARGV[2], $ARGV[3]</font>.\n" if ($ARGV[2] =~ /[a-zA-Z]+/ || $ARGV[3] =~ /[a-zA-Z]+/); - -my $col1 = $ARGV[2] - 1; -my $col2 = $ARGV[3] - 1; - -my ($f, $o); -my (@a, @b); - -my $n_t = 0; -open($f, $file) or die "Could't open $file, $!\n"; -while(<$f>) { - chomp; - my @t = split(/\t/); - if ($n_t == 0) { - $n_t = scalar(@t) - 1; - die "<font color=\"yellow\">The input column number exceeds the size of the file: $col1, $col2, $n_t</font>\n" if ( $col1 > $n_t || $col2 > $n_t ); - } - die "<font color=\"yellow\">The columns you have selected contain non numeric characters:$t[$col1] and $t[$col2] \n</font>" if ($t[$col1] =~ /[a-zA-Z]+/ || $t[$col2] =~ /[a-zA-Z]+/); - push(@a, $t[$col1]); - push(@b, $t[$col2]); -} -close($f); - -my $result = correlation(\@a, \@b); - -open($o, ">$out") or die "Couldn't open $out, $!\n"; -$col1 = $col1 + 1; -$col2 = $col2 + 1; -print $o "The correlation of column $col1 and $col2 is $result\n"; -close($o); -print "The correlation of column $col1 and $col2 is $result\n"; - -sub correlation { - my ($array1ref, $array2ref) = @_; - my ($sum1, $sum2); - my ($sum1_squared, $sum2_squared); - foreach (@$array1ref) { $sum1 += $_; $sum1_squared += $_**2; } - foreach (@$array2ref) { $sum2 += $_; $sum2_squared += $_**2; } - my $numerator = (@$array1ref**2) * covariance($array1ref, $array2ref); - my $denominator = sqrt(((@$array1ref * $sum1_squared) - ($sum1**2)) * - ((@$array1ref * $sum2_squared) - ($sum2**2))); - my $r; - if ($denominator == 0) { - print STDERR "The denominator is 0.\n"; - exit 0; - } else { - $r = $numerator / $denominator; - } - return $r; -} - -sub covariance { - my ($array1ref, $array2ref) = @_; - my ($i, $result); - for ($i = 0; $i < @$array1ref; $i++) { - $result += $array1ref->[$i] * $array2ref->[$i]; - } - $result /= @$array1ref; - $result -= mean($array1ref) * mean($array2ref); -} - -sub mean { - my ($arrayref) = @_; - my $result; - foreach (@$arrayref) { $result += $_; } - return $result/@$arrayref; -} - diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/correlation.xml --- a/tools/stats/correlation.xml +++ /dev/null @@ -1,15 +0,0 @@ -<tool id="Pearson_and_apos_Correlation1" name="Pearson and apos Correlation"> - <description>between any two numeric columns</description> - <command interpreter="perl">correlation.pl $input $out_file1 $col1 $col2</command> - <inputs> -<!-- <display>on column $col1 and column $col2 of $input</display> --> - <param name="col1" size="3" type="text" value="5" label="Correlate data in column"/> - <param name="col2" size="3" type="text" value="6" label="with data in column"/> - <param format="txt" name="input" type="data" label="in Query"/> - </inputs> - <outputs> - <data format="txt" name="out_file1" /> - </outputs> - <help>Computes Pearsons correlation coefficient between any two numerical columns. Column numbers start at 1. -</help> -</tool> diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/count_gff_features.py --- a/tools/stats/count_gff_features.py +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env python -# This tool takes a gff file as input and counts the number of features in it. - -import sys, fileinput -from galaxy import eggs -from galaxy.datatypes.util.gff_util import GFFReaderWrapper -from bx.intervals.io import GenomicInterval - -# Get args. -input_file = sys.argv[1:] - -# Count features. -count = 0 -for feature in GFFReaderWrapper( fileinput.FileInput( input_file ), fix_strand=True ): - if isinstance( feature, GenomicInterval ): - count += 1 - -print count \ No newline at end of file This diff is so big that we needed to truncate the remainder. https://bitbucket.org/galaxy/galaxy-central/commits/799ceff557bb/ Changeset: 799ceff557bb User: davebgx Date: 2014-07-28 18:07:07 Summary: Remove references to migrated tools from tool_conf.xml.main Affected #: 1 file diff -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 -r 799ceff557bbbfea80cbfd164a297a0364b0c699 tool_conf.xml.main --- a/tool_conf.xml.main +++ b/tool_conf.xml.main @@ -27,7 +27,6 @@ </section><section id="textutil" name="Text Manipulation"><tool file="filters/fixedValueColumn.xml" /> - <tool file="stats/column_maker.xml" /><tool file="filters/catWrapper.xml" /><tool file="filters/condense_characters.xml" /><tool file="filters/convert_characters.xml" /> @@ -58,7 +57,6 @@ <tool file="stats/filtering.xml" /><tool file="filters/sorter.xml" /><tool file="filters/grep.xml" /> - <tool file="stats/dna_filtering.xml" /><label id="gff" text="GFF" /><tool file="filters/gff/extract_GFF_Features.xml" /><tool file="filters/gff/gff_filter_by_attribute.xml" /> @@ -96,16 +94,9 @@ <section id="stats" name="Statistics"><tool file="stats/gsummary.xml" /><tool file="filters/uniq.xml" /> - <tool file="stats/cor.xml" /> - <tool file="stats/generate_matrix_for_pca_lda.xml" /> - <tool file="stats/lda_analy.xml" /> - <tool file="stats/plot_from_lda.xml" /></section><section id="plots" name="Graph/Display Data"> - <tool file="plotting/histogram2.xml" /> - <tool file="plotting/scatterplot.xml" /><tool file="plotting/boxplot.xml" /> - <tool file="visualization/build_ucsc_custom_track.xml" /><tool file="maf/vcf_to_maf_customtrack.xml" /><tool file="mutation/visualize.xml" /></section> @@ -115,14 +106,12 @@ <tool file="phenotype_association/sift.xml" /><tool file="phenotype_association/linkToGProfile.xml" /><tool file="phenotype_association/linkToDavid.xml" /> - <tool file="phenotype_association/snpFreq.xml" /><tool file="phenotype_association/ldtools.xml" /><tool file="phenotype_association/pass.xml" /><tool file="phenotype_association/gpass.xml" /><tool file="phenotype_association/beam.xml" /><tool file="phenotype_association/lps.xml" /><tool file="phenotype_association/master2pg.xml" /> - <tool file="phenotype_association/vcf2pgSnp.xml" /></section><label id="ngs" text="NGS Toolbox Beta" /><section id="cshl_library_information" name="NGS: QC and manipulation"> Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.