1 new changeset in galaxy-central:
http://bitbucket.org/galaxy/galaxy-central/changeset/33fb9df64172/
changeset: 33fb9df64172
user: greg
date: 2011-07-28 17:41:01
summary: Rewrite Pac Bio assembly_stats tool - it now takes 1 input fasta dataset and
produces 1 output tabular dataset. The "wiki" output format has been
eliminated, and the dependency on the pbpy module has been eliiminated.
affected #: 3 files (289 bytes)
--- a/tools/ilmn_pacbio/assembly_stats.py Thu Jul 28 10:00:02 2011 -0400
+++ b/tools/ilmn_pacbio/assembly_stats.py Thu Jul 28 11:41:01 2011 -0400
@@ -1,107 +1,83 @@
#!/usr/bin/env python
-import sys
-import os
-import random
+#
+#Copyright (c) 2011, Pacific Biosciences of California, Inc.
+#
+#All rights reserved.
+#
+#Redistribution and use in source and binary forms, with or without modification, are
permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or other
materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its contributors may be
used to endorse or promote products derived from this software without specific prior
written permission.
+#
+#THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY
+#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+import sys, os
+from optparse import OptionParser
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( 'bx-python' )
+from bx.seq.fasta import FastaReader
-from optparse import OptionParser
-from pbpy.io.FastaIO import FastaEntry, SimpleFastaReader
-
-class FastaStats:
- def __init__(self, argv):
- self.__parseOptions( argv )
-
- def __parseOptions(self, argv):
- usage = 'Usage: %prog [--help] [options] [fastaFileList]'
- parser = OptionParser( usage=usage )
- parser.add_option("--minContigLength", help="Minimum length of
contigs to analyze")
- parser.add_option("--genomeLength", help="Length of genome to
calculate N50s for.")
- parser.add_option("--outputFormat", help="Format of output
[wiki]")
- parser.add_option("--noHeader", action="store_true",
- help="Don't output a header line" )
- parser.set_defaults( noHeader=False,
- minContigLength=0, genomeLength=0, outputFormat="wiki")
-
- self.options, args = parser.parse_args(argv)
-
- if len(args) < 2:
- parser.error( 'Expected 1 arguments' )
-
- self.fastaFiles = args[1:]
- self.outputFormat = self.options.outputFormat
- self.genomeLength = int(self.options.genomeLength)
- self.minContigLength = int(self.options.minContigLength)
- self.statKeys = "File Num Sum Max Avg N50 99%".split(" ")
-
- def getStats(self, fastaFile):
- lengths = []
- for entry in SimpleFastaReader(fastaFile):
- if len(entry.sequence) < self.minContigLength: continue
- lengths.append( len(entry.sequence) )
-
- stats = {"File":fastaFile,
- "Num":len(lengths),
- "Sum":sum(lengths),
- "Max":max(lengths),
- # "MinLenSum": sum( filter(lambda x: x > self.minContigLength,
lengths)),
- "Avg":int(sum(lengths)/float(len(lengths))),
- "N50":0,
- "99%":0}
-
- if self.genomeLength == 0: self.genomeLength = sum(lengths)
-
+def getStats( fastaFile, genomeLength, minContigLength ):
+ lengths = []
+ stats = { "Num" : 0,
+ "Sum" : 0,
+ "Max" : 0,
+ "Avg" : 0,
+ "N50" : 0,
+ "99%" : 0 }
+ fasta_reader = FastaReader( open( fastaFile, 'rb' ) )
+ while True:
+ seq = fasta_reader.next()
+ if not seq:
+ break
+ if seq.length < minContigLength:
+ continue
+ lengths.append( seq.length )
+ if lengths:
+ stats[ 'Num' ] = len( lengths )
+ stats[ 'Sum' ] = sum( lengths )
+ stats[ 'Max' ] = max( lengths )
+ stats[ 'Avg' ] = int( sum( lengths ) / float( len( lengths ) ) )
+ stats[ 'N50' ] = 0
+ stats[ '99%' ] = 0
+ if genomeLength == 0:
+ genomeLength = sum( lengths )
lengths.sort()
lengths.reverse()
lenSum = 0
- stats["99%"] = len(lengths)
- for idx, length in enumerate(lengths):
+ stats[ "99%" ] = len( lengths )
+ for idx, length in enumerate( lengths ):
lenSum += length
- if (lenSum > self.genomeLength/2):
- stats["N50"] = length
+ if ( lenSum > genomeLength / 2 ):
+ stats[ "N50" ] = length
break
lenSum = 0
- for idx, length in enumerate(lengths):
+ for idx, length in enumerate( lengths ):
lenSum += length
- if (lenSum > self.genomeLength*0.99):
- stats["99%"] = idx + 1
+ if lenSum > genomeLength * 0.99:
+ stats[ "99%" ] = idx + 1
break
+ return stats
- return stats
+def __main__():
+ #Parse Command Line
+ usage = 'Usage: %prog input output --minContigLength'
+ parser = OptionParser( usage=usage )
+ parser.add_option( "--minContigLength", dest="minContigLength",
help="Minimum length of contigs to analyze" )
+ parser.add_option( "--genomeLength", dest="genomeLength",
help="Length of genome for which to calculate N50s" )
+ parser.set_defaults( minContigLength=0, genomeLength=0 )
+ options, args = parser.parse_args()
+ input_fasta_file = args[ 0 ]
+ output_tabular_file = args[ 1 ]
+ statKeys = "Num Sum Max Avg N50 99%".split( " " )
+ stats = getStats( input_fasta_file, int( options.genomeLength ), int(
options.minContigLength ) )
+ fout = open( output_tabular_file, "w" )
+ fout.write( "%s\n" % "\t".join( map( lambda key: str( stats[ key
] ), statKeys ) ) )
+ fout.close()
- def header(self):
- if self.outputFormat == "wiki":
- buffer = '{| width="200" cellspacing="1"
cellpadding="1" border="1"\n'
- buffer += '|-\n'
- for key in self.statKeys:
- buffer += '| %s\n' % key
- return buffer
- elif self.outputFormat == "tsv":
- return "%s\n" % "\t".join(self.statKeys)
- else:
- sys.exit("Unsupported format %s" % self.outputFormat)
-
- def footer(self):
- if self.outputFormat == "wiki":
- return "|}\n"
- else:
- return ""
-
- def format(self, stats):
- if self.outputFormat == "wiki":
- buffer = "|-\n"
- for key in self.statKeys:
- buffer += "| %s\n" % stats[key]
- return buffer
- elif self.outputFormat == "tsv":
- return "%s\n" % "\t".join(map(lambda key:
str(stats[key]), self.statKeys))
- else:
- sys.exit("Unsupported format %s" % self.outputFormat)
-
- def run(self):
- if not self.options.noHeader:
- print self.header(),
- for file in self.fastaFiles: print self.format(self.getStats(file)),
- print self.footer()
-
-if __name__=='__main__':
- app = FastaStats(sys.argv)
- app.run()
+if __name__=="__main__": __main__()
--- a/tools/ilmn_pacbio/assembly_stats.xml Thu Jul 28 10:00:02 2011 -0400
+++ b/tools/ilmn_pacbio/assembly_stats.xml Thu Jul 28 11:41:01 2011 -0400
@@ -1,35 +1,33 @@
<tool id="assembly_stats" name="Assembly Statistics"
version="1.0.0">
- <description>Calculate common measures of assembly quality</description>
- <command interpreter="python">
- assembly_stats.py ${wiki} --minContigLength=${minLength} $input1 > $output1
- </command>
- <inputs>
- <param name="input1" format="fasta" type="data"
label="Select FASTA file containing contigs"/>
- <param name="minLength" type="integer" value="0"
label="Minimum length of contigs to consider"/>
- <param name="wiki" type="boolean"
- checked="true" value="True"
- truevalue="--outputFormat=wiki"
- falsevalue="--noHeader --outputFormat=tsv"
- label="Human-readable?" />
- </inputs>
- <outputs>
- <data format="tabular" name="output1" label="Assembly
statistics for ${on_string}"/>
- </outputs>
- <help>
+ <description>Calculate common measures of assembly quality</description>
+ <command interpreter="python">
+ assembly_stats.py $input1 $output1 --minContigLength=${minLength}
+ </command>
+ <inputs>
+ <param name="input1" format="fasta" type="data"
label="Select FASTA file containing contigs"/>
+ <param name="minLength" type="integer" value="0"
label="Minimum length of contigs to consider"/>
+ </inputs>
+ <outputs>
+ <data name="output1" format="tabular" label="Assembly
statistics for ${on_string}"/>
+ </outputs>
+ <tests>
+ <test>
+ <param name="input1" value="3.fasta"
ftype="fasta"/>
+ <param name="minLength" value="100"/>
+ <output name="output1" ftype="tabular"
file="assembly_stats.tabular" />
+ </test>
+ </tests>
+ <help>
**What it does**
-Reports standard measures of *de novo* assembly quality such as
-number of contigs, sum of contigs, mean contig length, and N50.
+Reports standard measures of *de novo* assembly quality such as number of contigs, sum of
contigs, mean contig length, and N50.
**Parameter list**
Minimum length
Only include contigs of this size or greater for calculating statistics.
-Human-readable?
- If true, output the statistics in a wiki format which can be read by humans. If
false, output the metrics in a tab-delimited row.
-
**Output**
Num contigs
@@ -50,7 +48,7 @@
99%
Number of contigs accounting for 99% of the observed assembly.
- </help>
+ </help></tool>
Repository URL:
https://bitbucket.org/galaxy/galaxy-central/
--
This is a commit notification from
bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.