[galaxy-dev] [hg] galaxy 3581: Updated several NGS tools to better handle std...

16 Apr 2010

details:   http://www.bx.psu.edu/hg/galaxy/rev/c07a6bdc794c
changeset: 3581:c07a6bdc794c
user:      Kelly Vincent <kpvincent@bx.psu.edu>
date:      Tue Mar 30 16:33:19 2010 -0400
description:
Updated several NGS tools to better handle stderr (allow for extremely large stderr output)

diffstat:

 tools/metag_tools/megablast_wrapper.py  |  43 +++++++++----
 tools/metag_tools/megablast_wrapper.xml |  98 ++++++++++++++++----------------
 tools/samtools/sam_merge.py             |  11 +++-
 tools/samtools/sam_pileup.py            |  22 ++++++-
 tools/samtools/sam_to_bam.py            |  33 ++++++++++-
 tools/sr_mapping/bowtie_wrapper.py      |  21 ++++++-
 tools/sr_mapping/bowtie_wrapper_code.py |   4 +-
 tools/sr_mapping/bwa_wrapper.py         |  42 ++++++++++++-
 8 files changed, 197 insertions(+), 77 deletions(-)

diffs (453 lines):

diff -r 8268439390c6 -r c07a6bdc794c tools/metag_tools/megablast_wrapper.py

--- a/tools/metag_tools/megablast_wrapper.py	Tue Mar 30 15:39:14 2010 -0400
+++ b/tools/metag_tools/megablast_wrapper.py	Tue Mar 30 16:33:19 2010 -0400
@@ -15,7 +15,7 @@
 usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file
 """
 
-import sys, os, tempfile
+import os, subprocess, sys, tempfile
 from galaxy import eggs
 import pkg_resources; pkg_resources.require( "bx-python" )
 from bx.cookbook import doc_optparse
@@ -29,7 +29,7 @@
 def __main__():
     #Parse Command Line
     options, args = doc_optparse.parse( __doc__ )
-    
+
     db_build = options.db_build
     query_filename = options.input.strip()
     output_filename = options.output.strip()
@@ -66,18 +66,33 @@
 
     if not db.has_key( db_build ):
         stop_err( 'Cannot locate the target database. Please check your location file.' )
-    
+
     # arguments for megablast    
     chunk = db[ ( db_build ) ]
-    megablast_command = "megablast -d %s -i %s -o %s -m 8 -a 8 -W %s -p %s -e %s -F %s > /dev/null 2>&1 " \
+    megablast_command = "megablast -d %s -i %s -o %s -m 8 -a 8 -W %s -p %s -e %s -F %s > /dev/null " \
         % ( chunk, query_filename, mega_temp_output, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, mega_filter ) 
-    
+
     print megablast_command
-       
+
     try:
-        os.system( megablast_command )
+        proc = subprocess.Popen( args=megablast_command, shell=True, stderr=subprocess.PIPE )
+        returncode = proc.wait()
+        # get stderr, allowing for case where it's very large
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += proc.stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        if returncode != 0:
+            raise Exception, stderr
     except Exception, e:
-        stop_err( str( e ) )
+        if os.path.exists( mega_temp_output ):
+            os.unlink( mega_temp_output )
+        stop_err( 'Error indexing reference sequence. ' + str( e ) )
 
     output = open( output_filename, 'w' )
     invalid_lines = 0
@@ -89,24 +104,24 @@
             gi, gi_len = fields[1].split( '_' )
             # convert the last column (causing problem in filter tool) to float
             fields[-1] = float( fields[-1] )
-            
             new_line = "%s\t%s\t%s\t%s\t%0.1f" % ( fields[0], gi, gi_len, '\t'.join( fields[2:-1] ), fields[-1] )
         except:
             new_line = line
             invalid_lines += 1
         output.write( "%s\n" % new_line )
     output.close()
-    
-    os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of
-    
+
+    if os.path.exists( mega_temp_output ):
+        os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of
+
     if invalid_lines:
         print "Unable to parse %d lines. Keep the default format." % invalid_lines
-        
+
     # megablast generates a file called error.log, if empty, delete it, if not, show the contents
     if os.path.exists( './error.log' ):
         for i, line in enumerate( file( './error.log' ) ):
             line = line.rstrip( '\r\n' )
             print line
         os.remove( './error.log' )
-    
+
 if __name__ == "__main__" : __main__()
diff -r 8268439390c6 -r c07a6bdc794c tools/metag_tools/megablast_wrapper.xml
--- a/tools/metag_tools/megablast_wrapper.xml	Tue Mar 30 15:39:14 2010 -0400
+++ b/tools/metag_tools/megablast_wrapper.xml	Tue Mar 30 16:33:19 2010 -0400
@@ -1,55 +1,55 @@
 <tool id="megablast_wrapper" name="Megablast" version="1.0.5">
-	<description> compare short reads against htgs, nt, and wgs databases</description>
-	<command interpreter="python">
-	  megablast_wrapper.py 
-	   --db_build="$source_select"
-	   --input=$input_query
-	   --word_size=$word_size
-	   --identity_cutoff=$iden_cutoff
-	   --eval_cutoff=$evalue_cutoff 
-	   --filter_query=$filter_query
-	   --index_dir=${GALAXY_DATA_INDEX_DIR}
-	   --output=$output1
-	</command>
-	<inputs>
-		<param name="input_query" type="data" format="fasta" label="Compare these sequences"/> 
-		<param name="source_select" type="select" display="radio" label="against target database">
-			<options from_file="blastdb.loc">
-			  <column name="name" index="0"/>
-			  <column name="value" index="0"/>
-			</options>
-		</param>
-		<param name="word_size" type="select" label="using word size" help="Size of best perfect match">
-			<option value="28">28</option>
-			<option value="16">16</option>
-		</param>
-		<param name="iden_cutoff" type="float" size="15" value="90.0" label="report hits above this identity" help="no cutoff if 0" />
+    <description> compare short reads against htgs, nt, and wgs databases</description>
+    <command interpreter="python">
+      megablast_wrapper.py 
+       --db_build="$source_select"
+       --input=$input_query
+       --word_size=$word_size
+       --identity_cutoff=$iden_cutoff
+       --eval_cutoff=$evalue_cutoff 
+       --filter_query=$filter_query
+       --index_dir=${GALAXY_DATA_INDEX_DIR}
+       --output=$output1
+    </command>
+    <inputs>
+        <param name="input_query" type="data" format="fasta" label="Compare these sequences"/> 
+        <param name="source_select" type="select" display="radio" label="against target database">
+            <options from_file="blastdb.loc">
+              <column name="name" index="0"/>
+              <column name="value" index="0"/>
+            </options>
+        </param>
+        <param name="word_size" type="select" label="using word size" help="Size of best perfect match">
+            <option value="28">28</option>
+            <option value="16">16</option>
+        </param>
+        <param name="iden_cutoff" type="float" size="15" value="90.0" label="report hits above this identity" help="no cutoff if 0" />
         <param name="evalue_cutoff" type="float" size="15" value="0.001" label="set expectation value cutoff" />
-		<param name="filter_query" type="select" label="Filter out low complexity regions?">
+        <param name="filter_query" type="select" label="Filter out low complexity regions?">
             <option value="T">Yes</option>
             <option value="F">No</option>
-		</param>
-	</inputs>
-	<outputs>
-		<data name="output1" format="tabular"/>
-	</outputs>
-	<requirements>
-		<requirement type="binary">megablast</requirement>
-	</requirements>
-	<tests>
-		<test>
-			<param name="input_query" value="megablast_wrapper_test1.fa" ftype="fasta"/>
-			<!-- source_select needs to match the entry in the blastdb.loc file, which includes the last update date if appropriate --> 
-			<param name="source_select" value="phiX" />
-			<param name="word_size" value="28" />
-			<param name="iden_cutoff" value="99.0" />
-			<param name="evalue_cutoff" value="10.0" />
-			<param name="filter_query" value="T" />
-			<output name="output1" file="megablast_wrapper_test1.out"/> 
-		</test>
-	</tests>
-	<help>
-	
+        </param>
+    </inputs>
+    <outputs>
+        <data name="output1" format="tabular"/>
+    </outputs>
+    <requirements>
+        <requirement type="binary">megablast</requirement>
+    </requirements>
+    <tests>
+        <test>
+            <param name="input_query" value="megablast_wrapper_test1.fa" ftype="fasta"/>
+            <!-- source_select needs to match the entry in the blastdb.loc file, which includes the last update date if appropriate --> 
+            <param name="source_select" value="phiX" />
+            <param name="word_size" value="28" />
+            <param name="iden_cutoff" value="99.0" />
+            <param name="evalue_cutoff" value="10.0" />
+            <param name="filter_query" value="T" />
+            <output name="output1" file="megablast_wrapper_test1.out"/> 
+        </test>
+    </tests>
+    <help>
+    
 .. class:: warningmark
 
 **Note**. Database searches may take substantial amount of time. For large input datasets it is advisable to allow overnight processing.  
@@ -86,5 +86,5 @@
 
 Zhang et al. A Greedy Algorithm for Aligning DNA Sequences. 2000. JCB: 203-214.
 
-	</help>
+    </help>
 </tool>
diff -r 8268439390c6 -r c07a6bdc794c tools/samtools/sam_merge.py
--- a/tools/samtools/sam_merge.py	Tue Mar 30 15:39:14 2010 -0400
+++ b/tools/samtools/sam_merge.py	Tue Mar 30 16:33:19 2010 -0400
@@ -25,7 +25,16 @@
     try:
         proc = subprocess.Popen( args=cmd, shell=True, stderr=subprocess.PIPE )
         returncode = proc.wait()
-        stderr = proc.stderr.read()
+        # get stderr, allowing for case where it's very large
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += proc.stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
         if returncode != 0:
             raise Exception, stderr
     except Exception, e:
diff -r 8268439390c6 -r c07a6bdc794c tools/samtools/sam_pileup.py
--- a/tools/samtools/sam_pileup.py	Tue Mar 30 15:39:14 2010 -0400
+++ b/tools/samtools/sam_pileup.py	Tue Mar 30 16:33:19 2010 -0400
@@ -90,7 +90,16 @@
                 cmdIndex = 'samtools faidx %s' % ( tmpf1_name )
                 proc = subprocess.Popen( args=cmdIndex, shell=True, cwd=tmpDir, stderr=subprocess.PIPE )
                 returncode = proc.wait()
-                stderr = proc.stderr.read()
+                # get stderr, allowing for case where it's very large
+                stderr = ''
+                buffsize = 1048576
+                try:
+                    while True:
+                        stderr += proc.stderr.read( buffsize )
+                        if not stderr or len( stderr ) % buffsize != 0:
+                            break
+                except OverflowError:
+                    pass
                 #did index succeed?
                 if returncode != 0:
                     raise Exception, 'Error creating index file\n' + stderr
@@ -99,7 +108,16 @@
             proc = subprocess.Popen( args=cmd, shell=True, cwd=tmpDir, stderr=subprocess.PIPE )
             returncode = proc.wait()
             #did it succeed?
-            stderr = proc.stderr.read()
+            # get stderr, allowing for case where it's very large
+            stderr = ''
+            buffsize = 1048576
+            try:
+                while True:
+                    stderr += proc.stderr.read( buffsize )
+                    if not stderr or len( stderr ) % buffsize != 0:
+                        break
+            except OverflowError:
+                pass
             if returncode != 0:
                 raise Exception, stderr
         except Exception, e:
diff -r 8268439390c6 -r c07a6bdc794c tools/samtools/sam_to_bam.py
--- a/tools/samtools/sam_to_bam.py	Tue Mar 30 15:39:14 2010 -0400
+++ b/tools/samtools/sam_to_bam.py	Tue Mar 30 16:33:19 2010 -0400
@@ -77,7 +77,16 @@
             command = 'samtools faidx %s' % fai_index_file_base
             proc = subprocess.Popen( args=command, shell=True, cwd=tmp_dir, stderr=subprocess.PIPE )
             returncode = proc.wait()
-            stderr = proc.stderr.read()
+            # get stderr, allowing for case where it's very large
+            stderr = ''
+            buffsize = 1048576
+            try:
+                while True:
+                    stderr += proc.stderr.read( buffsize )
+                    if not stderr or len( stderr ) % buffsize != 0:
+                        break
+            except OverflowError:
+                pass
             if returncode != 0:
                 raise Exception, stderr 
             if len( open( fai_index_file_path ).read().strip() ) == 0:
@@ -97,7 +106,16 @@
         command = 'samtools view -bt %s -o %s %s' % ( fai_index_file_path, tmp_aligns_file_name, options.input1 )
         proc = subprocess.Popen( args=command, shell=True, cwd=tmp_dir, stderr=subprocess.PIPE )
         returncode = proc.wait()
-        stderr = proc.stderr.read()
+        # get stderr, allowing for case where it's very large
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += proc.stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
         if returncode != 0:
             raise Exception, stderr
         if len( open( tmp_aligns_file_name ).read() ) == 0:
@@ -117,7 +135,16 @@
         command = 'samtools sort %s %s' % ( tmp_aligns_file_name, tmp_sorted_aligns_file_name )
         proc = subprocess.Popen( args=command, shell=True, cwd=tmp_dir, stderr=subprocess.PIPE )
         returncode = proc.wait()
-        stderr = proc.stderr.read()
+        # get stderr, allowing for case where it's very large
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += proc.stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
         if returncode != 0:
             raise Exception, stderr
     except Exception, e:
diff -r 8268439390c6 -r c07a6bdc794c tools/sr_mapping/bowtie_wrapper.py
--- a/tools/sr_mapping/bowtie_wrapper.py	Tue Mar 30 15:39:14 2010 -0400
+++ b/tools/sr_mapping/bowtie_wrapper.py	Tue Mar 30 16:33:19 2010 -0400
@@ -196,7 +196,15 @@
         try:
             proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
             returncode = proc.wait()
-            stderr = proc.stderr.read()
+            stderr = ''
+            buffsize = 1048576
+            try:
+                while True:
+                    stderr += proc.stderr.read( buffsize )
+                    if not stderr or len( stderr ) % buffsize != 0:
+                        break
+            except OverflowError:
+                pass
             if returncode != 0:
                 raise Exception, stderr
         except Exception, e:
@@ -352,7 +360,16 @@
             # align
             proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_index_dir, stderr=subprocess.PIPE )
             returncode = proc.wait()
-            stderr = proc.stderr.read()
+            # get stderr, allowing for case where it's very large
+            stderr = ''
+            buffsize = 1048576
+            try:
+                while True:
+                    stderr += proc.stderr.read( buffsize )
+                    if not stderr or len( stderr ) % buffsize != 0:
+                        break
+            except OverflowError:
+                pass
             if returncode != 0:
                 raise Exception, stderr
             # check that there are results in the output file
diff -r 8268439390c6 -r c07a6bdc794c tools/sr_mapping/bowtie_wrapper_code.py
--- a/tools/sr_mapping/bowtie_wrapper_code.py	Tue Mar 30 15:39:14 2010 -0400
+++ b/tools/sr_mapping/bowtie_wrapper_code.py	Tue Mar 30 16:33:19 2010 -0400
@@ -10,6 +10,6 @@
             refFile = '?'
     dbkey = os.path.split( refFile )[1].split( '.' )[0]
     # deal with the one odd case
-    if dbkey.find( 'chrM' ) >= 0 or dbkey.find( 'chr_m' ) >= 0:
+    if dbkey == 'equCab2chrM':
         dbkey = 'equCab2'
-    out_data[ 'output' ].set_dbkey(dbkey)
+    out_data[ 'output' ].set_dbkey( dbkey )
diff -r 8268439390c6 -r c07a6bdc794c tools/sr_mapping/bwa_wrapper.py
--- a/tools/sr_mapping/bwa_wrapper.py	Tue Mar 30 15:39:14 2010 -0400
+++ b/tools/sr_mapping/bwa_wrapper.py	Tue Mar 30 16:33:19 2010 -0400
@@ -93,7 +93,16 @@
         try:
             proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=subprocess.PIPE )
             returncode = proc.wait()
-            stderr = proc.stderr.read()
+            # get stderr, allowing for case where it's very large
+            stderr = ''
+            buffsize = 1048576
+            try:
+                while True:
+                    stderr += proc.stderr.read( buffsize )
+                    if not stderr or len( stderr ) % buffsize != 0:
+                        break
+            except OverflowError:
+                pass
             if returncode != 0:
                 raise Exception, stderr
         except Exception, e:
@@ -151,6 +160,7 @@
     else:
         cmd3 = 'bwa samse %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, options.fastq, options.output )
     # perform alignments
+    buffsize = 1048576
     try:
         # need to nest try-except in try-finally to handle 2.4
         try:
@@ -158,7 +168,15 @@
             try:
                 proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_dir, stderr=subprocess.PIPE )
                 returncode = proc.wait()
-                stderr = proc.stderr.read()
+                # get stderr, allowing for case where it's very large
+                stderr = ''
+                try:
+                    while True:
+                        stderr += proc.stderr.read( buffsize )
+                        if not stderr or len( stderr ) % buffsize != 0:
+                            break
+                except OverflowError:
+                    pass
                 if returncode != 0:
                     raise Exception, stderr
             except Exception, e:
@@ -168,7 +186,15 @@
                 if cmd2b: 
                     proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=subprocess.PIPE )
                     returncode = proc.wait()
-                    stderr = proc.stderr.read()
+                    # get stderr, allowing for case where it's very large
+                    stderr = ''
+                    try:
+                        while True:
+                            stderr += proc.stderr.read( buffsize )
+                            if not stderr or len( stderr ) % buffsize != 0:
+                                break
+                    except OverflowError:
+                        pass
                     if returncode != 0:
                         raise Exception, stderr
             except Exception, e:
@@ -177,7 +203,15 @@
             try:
                 proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=subprocess.PIPE )
                 returncode = proc.wait()
-                stderr = proc.stderr.read()
+                # get stderr, allowing for case where it's very large
+                stderr = ''
+                try:
+                    while True:
+                        stderr += proc.stderr.read( buffsize )
+                        if not stderr or len( stderr ) % buffsize != 0:
+                            break
+                except OverflowError:
+                    pass
                 if returncode != 0:
                     raise Exception, stderr
             except Exception, e:

    

[galaxy-dev] [hg] galaxy 3581: Updated several NGS tools to better handle std...

Greg Von Kuster