commit/galaxy-central: greg: Rewrite Pac Bio assembly_stats tool - it now takes 1 input fasta dataset and produces 1 output tabular dataset. The "wiki" output format has been eliminated, and the dependency on the pbpy module has been eliiminated.
1 new changeset in galaxy-central: http://bitbucket.org/galaxy/galaxy-central/changeset/33fb9df64172/ changeset: 33fb9df64172 user: greg date: 2011-07-28 17:41:01 summary: Rewrite Pac Bio assembly_stats tool - it now takes 1 input fasta dataset and produces 1 output tabular dataset. The "wiki" output format has been eliminated, and the dependency on the pbpy module has been eliiminated. affected #: 3 files (289 bytes) --- a/tools/ilmn_pacbio/assembly_stats.py Thu Jul 28 10:00:02 2011 -0400 +++ b/tools/ilmn_pacbio/assembly_stats.py Thu Jul 28 11:41:01 2011 -0400 @@ -1,107 +1,83 @@ #!/usr/bin/env python -import sys -import os -import random +# +#Copyright (c) 2011, Pacific Biosciences of California, Inc. +# +#All rights reserved. +# +#Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +# * Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. +# +#THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY +#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +import sys, os +from optparse import OptionParser +from galaxy import eggs +import pkg_resources +pkg_resources.require( 'bx-python' ) +from bx.seq.fasta import FastaReader -from optparse import OptionParser -from pbpy.io.FastaIO import FastaEntry, SimpleFastaReader - -class FastaStats: - def __init__(self, argv): - self.__parseOptions( argv ) - - def __parseOptions(self, argv): - usage = 'Usage: %prog [--help] [options] [fastaFileList]' - parser = OptionParser( usage=usage ) - parser.add_option("--minContigLength", help="Minimum length of contigs to analyze") - parser.add_option("--genomeLength", help="Length of genome to calculate N50s for.") - parser.add_option("--outputFormat", help="Format of output [wiki]") - parser.add_option("--noHeader", action="store_true", - help="Don't output a header line" ) - parser.set_defaults( noHeader=False, - minContigLength=0, genomeLength=0, outputFormat="wiki") - - self.options, args = parser.parse_args(argv) - - if len(args) < 2: - parser.error( 'Expected 1 arguments' ) - - self.fastaFiles = args[1:] - self.outputFormat = self.options.outputFormat - self.genomeLength = int(self.options.genomeLength) - self.minContigLength = int(self.options.minContigLength) - self.statKeys = "File Num Sum Max Avg N50 99%".split(" ") - - def getStats(self, fastaFile): - lengths = [] - for entry in SimpleFastaReader(fastaFile): - if len(entry.sequence) < self.minContigLength: continue - lengths.append( len(entry.sequence) ) - - stats = {"File":fastaFile, - "Num":len(lengths), - "Sum":sum(lengths), - "Max":max(lengths), - # "MinLenSum": sum( filter(lambda x: x > self.minContigLength, lengths)), - "Avg":int(sum(lengths)/float(len(lengths))), - "N50":0, - "99%":0} - - if self.genomeLength == 0: self.genomeLength = sum(lengths) - +def getStats( fastaFile, genomeLength, minContigLength ): + lengths = [] + stats = { "Num" : 0, + "Sum" : 0, + "Max" : 0, + "Avg" : 0, + "N50" : 0, + "99%" : 0 } + fasta_reader = FastaReader( open( fastaFile, 'rb' ) ) + while True: + seq = fasta_reader.next() + if not seq: + break + if seq.length < minContigLength: + continue + lengths.append( seq.length ) + if lengths: + stats[ 'Num' ] = len( lengths ) + stats[ 'Sum' ] = sum( lengths ) + stats[ 'Max' ] = max( lengths ) + stats[ 'Avg' ] = int( sum( lengths ) / float( len( lengths ) ) ) + stats[ 'N50' ] = 0 + stats[ '99%' ] = 0 + if genomeLength == 0: + genomeLength = sum( lengths ) lengths.sort() lengths.reverse() lenSum = 0 - stats["99%"] = len(lengths) - for idx, length in enumerate(lengths): + stats[ "99%" ] = len( lengths ) + for idx, length in enumerate( lengths ): lenSum += length - if (lenSum > self.genomeLength/2): - stats["N50"] = length + if ( lenSum > genomeLength / 2 ): + stats[ "N50" ] = length break lenSum = 0 - for idx, length in enumerate(lengths): + for idx, length in enumerate( lengths ): lenSum += length - if (lenSum > self.genomeLength*0.99): - stats["99%"] = idx + 1 + if lenSum > genomeLength * 0.99: + stats[ "99%" ] = idx + 1 break + return stats - return stats +def __main__(): + #Parse Command Line + usage = 'Usage: %prog input output --minContigLength' + parser = OptionParser( usage=usage ) + parser.add_option( "--minContigLength", dest="minContigLength", help="Minimum length of contigs to analyze" ) + parser.add_option( "--genomeLength", dest="genomeLength", help="Length of genome for which to calculate N50s" ) + parser.set_defaults( minContigLength=0, genomeLength=0 ) + options, args = parser.parse_args() + input_fasta_file = args[ 0 ] + output_tabular_file = args[ 1 ] + statKeys = "Num Sum Max Avg N50 99%".split( " " ) + stats = getStats( input_fasta_file, int( options.genomeLength ), int( options.minContigLength ) ) + fout = open( output_tabular_file, "w" ) + fout.write( "%s\n" % "\t".join( map( lambda key: str( stats[ key ] ), statKeys ) ) ) + fout.close() - def header(self): - if self.outputFormat == "wiki": - buffer = '{| width="200" cellspacing="1" cellpadding="1" border="1"\n' - buffer += '|-\n' - for key in self.statKeys: - buffer += '| %s\n' % key - return buffer - elif self.outputFormat == "tsv": - return "%s\n" % "\t".join(self.statKeys) - else: - sys.exit("Unsupported format %s" % self.outputFormat) - - def footer(self): - if self.outputFormat == "wiki": - return "|}\n" - else: - return "" - - def format(self, stats): - if self.outputFormat == "wiki": - buffer = "|-\n" - for key in self.statKeys: - buffer += "| %s\n" % stats[key] - return buffer - elif self.outputFormat == "tsv": - return "%s\n" % "\t".join(map(lambda key: str(stats[key]), self.statKeys)) - else: - sys.exit("Unsupported format %s" % self.outputFormat) - - def run(self): - if not self.options.noHeader: - print self.header(), - for file in self.fastaFiles: print self.format(self.getStats(file)), - print self.footer() - -if __name__=='__main__': - app = FastaStats(sys.argv) - app.run() +if __name__=="__main__": __main__() --- a/tools/ilmn_pacbio/assembly_stats.xml Thu Jul 28 10:00:02 2011 -0400 +++ b/tools/ilmn_pacbio/assembly_stats.xml Thu Jul 28 11:41:01 2011 -0400 @@ -1,35 +1,33 @@ <tool id="assembly_stats" name="Assembly Statistics" version="1.0.0"> - <description>Calculate common measures of assembly quality</description> - <command interpreter="python"> - assembly_stats.py ${wiki} --minContigLength=${minLength} $input1 > $output1 - </command> - <inputs> - <param name="input1" format="fasta" type="data" label="Select FASTA file containing contigs"/> - <param name="minLength" type="integer" value="0" label="Minimum length of contigs to consider"/> - <param name="wiki" type="boolean" - checked="true" value="True" - truevalue="--outputFormat=wiki" - falsevalue="--noHeader --outputFormat=tsv" - label="Human-readable?" /> - </inputs> - <outputs> - <data format="tabular" name="output1" label="Assembly statistics for ${on_string}"/> - </outputs> - <help> + <description>Calculate common measures of assembly quality</description> + <command interpreter="python"> + assembly_stats.py $input1 $output1 --minContigLength=${minLength} + </command> + <inputs> + <param name="input1" format="fasta" type="data" label="Select FASTA file containing contigs"/> + <param name="minLength" type="integer" value="0" label="Minimum length of contigs to consider"/> + </inputs> + <outputs> + <data name="output1" format="tabular" label="Assembly statistics for ${on_string}"/> + </outputs> + <tests> + <test> + <param name="input1" value="3.fasta" ftype="fasta"/> + <param name="minLength" value="100"/> + <output name="output1" ftype="tabular" file="assembly_stats.tabular" /> + </test> + </tests> + <help> **What it does** -Reports standard measures of *de novo* assembly quality such as -number of contigs, sum of contigs, mean contig length, and N50. +Reports standard measures of *de novo* assembly quality such as number of contigs, sum of contigs, mean contig length, and N50. **Parameter list** Minimum length Only include contigs of this size or greater for calculating statistics. -Human-readable? - If true, output the statistics in a wiki format which can be read by humans. If false, output the metrics in a tab-delimited row. - **Output** Num contigs @@ -50,7 +48,7 @@ 99% Number of contigs accounting for 99% of the observed assembly. - </help> + </help></tool> Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
Bitbucket