commit/galaxy-central: greg: Rewrite Pac Bio assembly_stats tool - it now takes 1 input fasta dataset and produces 1 output tabular dataset. The "wiki" output format has been eliminated, and the dependency on the pbpy module has been eliiminated. - galaxy-commits

28 Jul 2011

1 new changeset in galaxy-central:

http://bitbucket.org/galaxy/galaxy-central/changeset/33fb9df64172/
changeset:   33fb9df64172
user:        greg
date:        2011-07-28 17:41:01
summary:     Rewrite Pac Bio assembly_stats tool - it now takes 1 input fasta dataset and produces 1 output tabular dataset.  The "wiki" output format has been eliminated, and the dependency on the pbpy module has been eliiminated.
affected #:  3 files (289 bytes)

--- a/tools/ilmn_pacbio/assembly_stats.py	Thu Jul 28 10:00:02 2011 -0400
+++ b/tools/ilmn_pacbio/assembly_stats.py	Thu Jul 28 11:41:01 2011 -0400
@@ -1,107 +1,83 @@
 #!/usr/bin/env python
-import sys
-import os
-import random
+#
+#Copyright (c) 2011, Pacific Biosciences of California, Inc.
+#
+#All rights reserved.
+#
+#Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+#    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+#    * Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+#
+#THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY
+#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+import sys, os
+from optparse import OptionParser
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( 'bx-python' )
+from bx.seq.fasta import FastaReader
 
-from optparse import OptionParser
-from pbpy.io.FastaIO import FastaEntry, SimpleFastaReader
-
-class FastaStats:
-    def __init__(self, argv):
-        self.__parseOptions( argv )
-        
-    def __parseOptions(self, argv):
-        usage = 'Usage: %prog [--help] [options] [fastaFileList]'
-        parser = OptionParser( usage=usage )
-        parser.add_option("--minContigLength", help="Minimum length of contigs to analyze")
-        parser.add_option("--genomeLength", help="Length of genome to calculate N50s for.")
-        parser.add_option("--outputFormat", help="Format of output [wiki]")
-        parser.add_option("--noHeader", action="store_true",
-            help="Don't output a header line" )
-        parser.set_defaults( noHeader=False, 
-            minContigLength=0, genomeLength=0, outputFormat="wiki")
-
-        self.options, args = parser.parse_args(argv)
-        
-        if len(args) < 2:
-            parser.error( 'Expected 1 arguments' )
-
-        self.fastaFiles = args[1:]
-        self.outputFormat = self.options.outputFormat
-        self.genomeLength = int(self.options.genomeLength)
-        self.minContigLength = int(self.options.minContigLength)
-        self.statKeys = "File Num Sum Max Avg N50 99%".split(" ")
-
-    def getStats(self, fastaFile):
-        lengths = []
-        for entry in SimpleFastaReader(fastaFile):
-            if len(entry.sequence) < self.minContigLength: continue
-            lengths.append( len(entry.sequence) )
-
-        stats = {"File":fastaFile,   
-            "Num":len(lengths),
-            "Sum":sum(lengths), 
-            "Max":max(lengths), 
-            # "MinLenSum": sum( filter(lambda x: x > self.minContigLength, lengths)), 
-            "Avg":int(sum(lengths)/float(len(lengths))),
-            "N50":0,
-            "99%":0}
-
-        if self.genomeLength == 0: self.genomeLength = sum(lengths)
-         
+def getStats( fastaFile, genomeLength, minContigLength ):
+    lengths = []
+    stats = { "Num" : 0,
+              "Sum" : 0, 
+              "Max" : 0, 
+              "Avg" : 0,
+              "N50" : 0,
+              "99%" : 0 }
+    fasta_reader = FastaReader( open( fastaFile, 'rb' ) )
+    while True:
+        seq = fasta_reader.next()
+        if not seq:
+            break
+        if seq.length < minContigLength:
+            continue
+        lengths.append( seq.length )
+    if lengths:
+        stats[ 'Num' ] = len( lengths )
+        stats[ 'Sum' ] = sum( lengths )
+        stats[ 'Max' ] = max( lengths )
+        stats[ 'Avg' ] = int( sum( lengths ) / float( len( lengths ) ) )
+        stats[ 'N50' ] = 0
+        stats[ '99%' ] = 0
+        if genomeLength == 0:
+            genomeLength = sum( lengths )
         lengths.sort()
         lengths.reverse()
         lenSum = 0
-        stats["99%"] = len(lengths)
-        for idx, length in enumerate(lengths):
+        stats[ "99%" ] = len( lengths )
+        for idx, length in enumerate( lengths ):
             lenSum += length
-            if (lenSum > self.genomeLength/2):
-                stats["N50"] = length
+            if ( lenSum > genomeLength / 2 ):
+                stats[ "N50" ] = length
                 break
         lenSum = 0
-        for idx, length in enumerate(lengths):
+        for idx, length in enumerate( lengths ):
             lenSum += length
-            if (lenSum > self.genomeLength*0.99):
-                stats["99%"] = idx + 1
+            if lenSum > genomeLength * 0.99:
+                stats[ "99%" ] = idx + 1
                 break
+    return stats
 
-        return stats
+def __main__():
+    #Parse Command Line
+    usage = 'Usage: %prog input output --minContigLength'
+    parser = OptionParser( usage=usage )
+    parser.add_option( "--minContigLength", dest="minContigLength", help="Minimum length of contigs to analyze" )
+    parser.add_option( "--genomeLength", dest="genomeLength", help="Length of genome for which to calculate N50s" )
+    parser.set_defaults( minContigLength=0, genomeLength=0 )
+    options, args = parser.parse_args()
+    input_fasta_file = args[ 0 ]
+    output_tabular_file = args[ 1 ]
+    statKeys = "Num Sum Max Avg N50 99%".split( " " )
+    stats = getStats( input_fasta_file, int( options.genomeLength ), int( options.minContigLength ) )
+    fout = open( output_tabular_file, "w" )
+    fout.write( "%s\n" % "\t".join( map( lambda key: str( stats[ key ] ), statKeys ) ) )
+    fout.close()
 
-    def header(self):
-        if self.outputFormat == "wiki":
-            buffer = '{| width="200" cellspacing="1" cellpadding="1" border="1"\n'
-            buffer += '|-\n'
-            for key in self.statKeys:
-                buffer += '| %s\n' % key
-            return buffer
-        elif self.outputFormat == "tsv":
-            return "%s\n" % "\t".join(self.statKeys)
-        else:
-            sys.exit("Unsupported format %s" % self.outputFormat)
-    
-    def footer(self):
-        if self.outputFormat == "wiki":
-            return "|}\n"
-        else:
-            return ""
-
-    def format(self, stats):
-        if self.outputFormat == "wiki":
-            buffer = "|-\n"
-            for key in self.statKeys:
-                buffer += "| %s\n" % stats[key]
-            return buffer
-        elif self.outputFormat == "tsv":
-            return "%s\n" % "\t".join(map(lambda key: str(stats[key]), self.statKeys))
-        else:
-            sys.exit("Unsupported format %s" % self.outputFormat)
-        
-    def run(self):  
-        if not self.options.noHeader:
-            print self.header(),
-        for file in self.fastaFiles: print self.format(self.getStats(file)),
-        print self.footer()
-
-if __name__=='__main__':
-    app = FastaStats(sys.argv)
-    app.run()
+if __name__=="__main__": __main__()


--- a/tools/ilmn_pacbio/assembly_stats.xml	Thu Jul 28 10:00:02 2011 -0400
+++ b/tools/ilmn_pacbio/assembly_stats.xml	Thu Jul 28 11:41:01 2011 -0400
@@ -1,35 +1,33 @@
 <tool id="assembly_stats" name="Assembly Statistics" version="1.0.0">
-  <description>Calculate common measures of assembly quality</description>
-  <command interpreter="python">
-    assembly_stats.py ${wiki} --minContigLength=${minLength} $input1 > $output1
-  </command>
-  <inputs>
-    <param name="input1" format="fasta" type="data" label="Select FASTA file containing contigs"/>
-    <param name="minLength" type="integer" value="0" label="Minimum length of contigs to consider"/>
-    <param name="wiki" type="boolean" 
-      checked="true" value="True"
-      truevalue="--outputFormat=wiki"
-      falsevalue="--noHeader --outputFormat=tsv"
-      label="Human-readable?" />
-  </inputs>
-  <outputs>
-    <data format="tabular" name="output1" label="Assembly statistics for ${on_string}"/>
-  </outputs>
-  <help>
+    <description>Calculate common measures of assembly quality</description>
+    <command interpreter="python">
+        assembly_stats.py $input1 $output1 --minContigLength=${minLength}
+    </command>
+    <inputs>
+        <param name="input1" format="fasta" type="data" label="Select FASTA file containing contigs"/>
+        <param name="minLength" type="integer" value="0" label="Minimum length of contigs to consider"/>
+    </inputs>
+    <outputs>
+        <data name="output1" format="tabular" label="Assembly statistics for ${on_string}"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input1" value="3.fasta" ftype="fasta"/>
+            <param name="minLength" value="100"/>
+            <output name="output1" ftype="tabular" file="assembly_stats.tabular" />
+        </test>
+    </tests>
+    <help>
 
 **What it does**
 
-Reports standard measures of *de novo* assembly quality such as 
-number of contigs, sum of contigs, mean contig length, and N50.
+Reports standard measures of *de novo* assembly quality such as number of contigs, sum of contigs, mean contig length, and N50.
 
 **Parameter list**
 
 Minimum length
     Only include contigs of this size or greater for calculating statistics.
 
-Human-readable?
-    If true, output the statistics in a wiki format which can be read by humans.  If false, output the metrics in a tab-delimited row.
-
 **Output**
 
 Num contigs
@@ -50,7 +48,7 @@
 99%
     Number of contigs accounting for 99% of the observed assembly.
 
-  </help>
+    </help></tool>

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.

    

commit/galaxy-central: greg: Rewrite Pac Bio assembly_stats tool - it now takes 1 input fasta dataset and produces 1 output tabular dataset. The "wiki" output format has been eliminated, and the dependency on the pbpy module has been eliiminated.

Bitbucket

tags

participants (1)