details: http://www.bx.psu.edu/hg/galaxy/rev/c07a6bdc794c changeset: 3581:c07a6bdc794c user: Kelly Vincent <kpvincent@bx.psu.edu> date: Tue Mar 30 16:33:19 2010 -0400 description: Updated several NGS tools to better handle stderr (allow for extremely large stderr output) diffstat: tools/metag_tools/megablast_wrapper.py | 43 +++++++++---- tools/metag_tools/megablast_wrapper.xml | 98 ++++++++++++++++---------------- tools/samtools/sam_merge.py | 11 +++- tools/samtools/sam_pileup.py | 22 ++++++- tools/samtools/sam_to_bam.py | 33 ++++++++++- tools/sr_mapping/bowtie_wrapper.py | 21 ++++++- tools/sr_mapping/bowtie_wrapper_code.py | 4 +- tools/sr_mapping/bwa_wrapper.py | 42 ++++++++++++- 8 files changed, 197 insertions(+), 77 deletions(-) diffs (453 lines): diff -r 8268439390c6 -r c07a6bdc794c tools/metag_tools/megablast_wrapper.py --- a/tools/metag_tools/megablast_wrapper.py Tue Mar 30 15:39:14 2010 -0400 +++ b/tools/metag_tools/megablast_wrapper.py Tue Mar 30 16:33:19 2010 -0400 @@ -15,7 +15,7 @@ usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file """ -import sys, os, tempfile +import os, subprocess, sys, tempfile from galaxy import eggs import pkg_resources; pkg_resources.require( "bx-python" ) from bx.cookbook import doc_optparse @@ -29,7 +29,7 @@ def __main__(): #Parse Command Line options, args = doc_optparse.parse( __doc__ ) - + db_build = options.db_build query_filename = options.input.strip() output_filename = options.output.strip() @@ -66,18 +66,33 @@ if not db.has_key( db_build ): stop_err( 'Cannot locate the target database. Please check your location file.' ) - + # arguments for megablast chunk = db[ ( db_build ) ] - megablast_command = "megablast -d %s -i %s -o %s -m 8 -a 8 -W %s -p %s -e %s -F %s > /dev/null 2>&1 " \ + megablast_command = "megablast -d %s -i %s -o %s -m 8 -a 8 -W %s -p %s -e %s -F %s > /dev/null " \ % ( chunk, query_filename, mega_temp_output, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, mega_filter ) - + print megablast_command - + try: - os.system( megablast_command ) + proc = subprocess.Popen( args=megablast_command, shell=True, stderr=subprocess.PIPE ) + returncode = proc.wait() + # get stderr, allowing for case where it's very large + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + if returncode != 0: + raise Exception, stderr except Exception, e: - stop_err( str( e ) ) + if os.path.exists( mega_temp_output ): + os.unlink( mega_temp_output ) + stop_err( 'Error indexing reference sequence. ' + str( e ) ) output = open( output_filename, 'w' ) invalid_lines = 0 @@ -89,24 +104,24 @@ gi, gi_len = fields[1].split( '_' ) # convert the last column (causing problem in filter tool) to float fields[-1] = float( fields[-1] ) - new_line = "%s\t%s\t%s\t%s\t%0.1f" % ( fields[0], gi, gi_len, '\t'.join( fields[2:-1] ), fields[-1] ) except: new_line = line invalid_lines += 1 output.write( "%s\n" % new_line ) output.close() - - os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of - + + if os.path.exists( mega_temp_output ): + os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of + if invalid_lines: print "Unable to parse %d lines. Keep the default format." % invalid_lines - + # megablast generates a file called error.log, if empty, delete it, if not, show the contents if os.path.exists( './error.log' ): for i, line in enumerate( file( './error.log' ) ): line = line.rstrip( '\r\n' ) print line os.remove( './error.log' ) - + if __name__ == "__main__" : __main__() diff -r 8268439390c6 -r c07a6bdc794c tools/metag_tools/megablast_wrapper.xml --- a/tools/metag_tools/megablast_wrapper.xml Tue Mar 30 15:39:14 2010 -0400 +++ b/tools/metag_tools/megablast_wrapper.xml Tue Mar 30 16:33:19 2010 -0400 @@ -1,55 +1,55 @@ <tool id="megablast_wrapper" name="Megablast" version="1.0.5"> - <description> compare short reads against htgs, nt, and wgs databases</description> - <command interpreter="python"> - megablast_wrapper.py - --db_build="$source_select" - --input=$input_query - --word_size=$word_size - --identity_cutoff=$iden_cutoff - --eval_cutoff=$evalue_cutoff - --filter_query=$filter_query - --index_dir=${GALAXY_DATA_INDEX_DIR} - --output=$output1 - </command> - <inputs> - <param name="input_query" type="data" format="fasta" label="Compare these sequences"/> - <param name="source_select" type="select" display="radio" label="against target database"> - <options from_file="blastdb.loc"> - <column name="name" index="0"/> - <column name="value" index="0"/> - </options> - </param> - <param name="word_size" type="select" label="using word size" help="Size of best perfect match"> - <option value="28">28</option> - <option value="16">16</option> - </param> - <param name="iden_cutoff" type="float" size="15" value="90.0" label="report hits above this identity" help="no cutoff if 0" /> + <description> compare short reads against htgs, nt, and wgs databases</description> + <command interpreter="python"> + megablast_wrapper.py + --db_build="$source_select" + --input=$input_query + --word_size=$word_size + --identity_cutoff=$iden_cutoff + --eval_cutoff=$evalue_cutoff + --filter_query=$filter_query + --index_dir=${GALAXY_DATA_INDEX_DIR} + --output=$output1 + </command> + <inputs> + <param name="input_query" type="data" format="fasta" label="Compare these sequences"/> + <param name="source_select" type="select" display="radio" label="against target database"> + <options from_file="blastdb.loc"> + <column name="name" index="0"/> + <column name="value" index="0"/> + </options> + </param> + <param name="word_size" type="select" label="using word size" help="Size of best perfect match"> + <option value="28">28</option> + <option value="16">16</option> + </param> + <param name="iden_cutoff" type="float" size="15" value="90.0" label="report hits above this identity" help="no cutoff if 0" /> <param name="evalue_cutoff" type="float" size="15" value="0.001" label="set expectation value cutoff" /> - <param name="filter_query" type="select" label="Filter out low complexity regions?"> + <param name="filter_query" type="select" label="Filter out low complexity regions?"> <option value="T">Yes</option> <option value="F">No</option> - </param> - </inputs> - <outputs> - <data name="output1" format="tabular"/> - </outputs> - <requirements> - <requirement type="binary">megablast</requirement> - </requirements> - <tests> - <test> - <param name="input_query" value="megablast_wrapper_test1.fa" ftype="fasta"/> - <!-- source_select needs to match the entry in the blastdb.loc file, which includes the last update date if appropriate --> - <param name="source_select" value="phiX" /> - <param name="word_size" value="28" /> - <param name="iden_cutoff" value="99.0" /> - <param name="evalue_cutoff" value="10.0" /> - <param name="filter_query" value="T" /> - <output name="output1" file="megablast_wrapper_test1.out"/> - </test> - </tests> - <help> - + </param> + </inputs> + <outputs> + <data name="output1" format="tabular"/> + </outputs> + <requirements> + <requirement type="binary">megablast</requirement> + </requirements> + <tests> + <test> + <param name="input_query" value="megablast_wrapper_test1.fa" ftype="fasta"/> + <!-- source_select needs to match the entry in the blastdb.loc file, which includes the last update date if appropriate --> + <param name="source_select" value="phiX" /> + <param name="word_size" value="28" /> + <param name="iden_cutoff" value="99.0" /> + <param name="evalue_cutoff" value="10.0" /> + <param name="filter_query" value="T" /> + <output name="output1" file="megablast_wrapper_test1.out"/> + </test> + </tests> + <help> + .. class:: warningmark **Note**. Database searches may take substantial amount of time. For large input datasets it is advisable to allow overnight processing. @@ -86,5 +86,5 @@ Zhang et al. A Greedy Algorithm for Aligning DNA Sequences. 2000. JCB: 203-214. - </help> + </help> </tool> diff -r 8268439390c6 -r c07a6bdc794c tools/samtools/sam_merge.py --- a/tools/samtools/sam_merge.py Tue Mar 30 15:39:14 2010 -0400 +++ b/tools/samtools/sam_merge.py Tue Mar 30 16:33:19 2010 -0400 @@ -25,7 +25,16 @@ try: proc = subprocess.Popen( args=cmd, shell=True, stderr=subprocess.PIPE ) returncode = proc.wait() - stderr = proc.stderr.read() + # get stderr, allowing for case where it's very large + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass if returncode != 0: raise Exception, stderr except Exception, e: diff -r 8268439390c6 -r c07a6bdc794c tools/samtools/sam_pileup.py --- a/tools/samtools/sam_pileup.py Tue Mar 30 15:39:14 2010 -0400 +++ b/tools/samtools/sam_pileup.py Tue Mar 30 16:33:19 2010 -0400 @@ -90,7 +90,16 @@ cmdIndex = 'samtools faidx %s' % ( tmpf1_name ) proc = subprocess.Popen( args=cmdIndex, shell=True, cwd=tmpDir, stderr=subprocess.PIPE ) returncode = proc.wait() - stderr = proc.stderr.read() + # get stderr, allowing for case where it's very large + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass #did index succeed? if returncode != 0: raise Exception, 'Error creating index file\n' + stderr @@ -99,7 +108,16 @@ proc = subprocess.Popen( args=cmd, shell=True, cwd=tmpDir, stderr=subprocess.PIPE ) returncode = proc.wait() #did it succeed? - stderr = proc.stderr.read() + # get stderr, allowing for case where it's very large + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass if returncode != 0: raise Exception, stderr except Exception, e: diff -r 8268439390c6 -r c07a6bdc794c tools/samtools/sam_to_bam.py --- a/tools/samtools/sam_to_bam.py Tue Mar 30 15:39:14 2010 -0400 +++ b/tools/samtools/sam_to_bam.py Tue Mar 30 16:33:19 2010 -0400 @@ -77,7 +77,16 @@ command = 'samtools faidx %s' % fai_index_file_base proc = subprocess.Popen( args=command, shell=True, cwd=tmp_dir, stderr=subprocess.PIPE ) returncode = proc.wait() - stderr = proc.stderr.read() + # get stderr, allowing for case where it's very large + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass if returncode != 0: raise Exception, stderr if len( open( fai_index_file_path ).read().strip() ) == 0: @@ -97,7 +106,16 @@ command = 'samtools view -bt %s -o %s %s' % ( fai_index_file_path, tmp_aligns_file_name, options.input1 ) proc = subprocess.Popen( args=command, shell=True, cwd=tmp_dir, stderr=subprocess.PIPE ) returncode = proc.wait() - stderr = proc.stderr.read() + # get stderr, allowing for case where it's very large + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass if returncode != 0: raise Exception, stderr if len( open( tmp_aligns_file_name ).read() ) == 0: @@ -117,7 +135,16 @@ command = 'samtools sort %s %s' % ( tmp_aligns_file_name, tmp_sorted_aligns_file_name ) proc = subprocess.Popen( args=command, shell=True, cwd=tmp_dir, stderr=subprocess.PIPE ) returncode = proc.wait() - stderr = proc.stderr.read() + # get stderr, allowing for case where it's very large + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass if returncode != 0: raise Exception, stderr except Exception, e: diff -r 8268439390c6 -r c07a6bdc794c tools/sr_mapping/bowtie_wrapper.py --- a/tools/sr_mapping/bowtie_wrapper.py Tue Mar 30 15:39:14 2010 -0400 +++ b/tools/sr_mapping/bowtie_wrapper.py Tue Mar 30 16:33:19 2010 -0400 @@ -196,7 +196,15 @@ try: proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=subprocess.PIPE, stdout=subprocess.PIPE ) returncode = proc.wait() - stderr = proc.stderr.read() + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass if returncode != 0: raise Exception, stderr except Exception, e: @@ -352,7 +360,16 @@ # align proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_index_dir, stderr=subprocess.PIPE ) returncode = proc.wait() - stderr = proc.stderr.read() + # get stderr, allowing for case where it's very large + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass if returncode != 0: raise Exception, stderr # check that there are results in the output file diff -r 8268439390c6 -r c07a6bdc794c tools/sr_mapping/bowtie_wrapper_code.py --- a/tools/sr_mapping/bowtie_wrapper_code.py Tue Mar 30 15:39:14 2010 -0400 +++ b/tools/sr_mapping/bowtie_wrapper_code.py Tue Mar 30 16:33:19 2010 -0400 @@ -10,6 +10,6 @@ refFile = '?' dbkey = os.path.split( refFile )[1].split( '.' )[0] # deal with the one odd case - if dbkey.find( 'chrM' ) >= 0 or dbkey.find( 'chr_m' ) >= 0: + if dbkey == 'equCab2chrM': dbkey = 'equCab2' - out_data[ 'output' ].set_dbkey(dbkey) + out_data[ 'output' ].set_dbkey( dbkey ) diff -r 8268439390c6 -r c07a6bdc794c tools/sr_mapping/bwa_wrapper.py --- a/tools/sr_mapping/bwa_wrapper.py Tue Mar 30 15:39:14 2010 -0400 +++ b/tools/sr_mapping/bwa_wrapper.py Tue Mar 30 16:33:19 2010 -0400 @@ -93,7 +93,16 @@ try: proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=subprocess.PIPE ) returncode = proc.wait() - stderr = proc.stderr.read() + # get stderr, allowing for case where it's very large + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass if returncode != 0: raise Exception, stderr except Exception, e: @@ -151,6 +160,7 @@ else: cmd3 = 'bwa samse %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, options.fastq, options.output ) # perform alignments + buffsize = 1048576 try: # need to nest try-except in try-finally to handle 2.4 try: @@ -158,7 +168,15 @@ try: proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_dir, stderr=subprocess.PIPE ) returncode = proc.wait() - stderr = proc.stderr.read() + # get stderr, allowing for case where it's very large + stderr = '' + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass if returncode != 0: raise Exception, stderr except Exception, e: @@ -168,7 +186,15 @@ if cmd2b: proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=subprocess.PIPE ) returncode = proc.wait() - stderr = proc.stderr.read() + # get stderr, allowing for case where it's very large + stderr = '' + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass if returncode != 0: raise Exception, stderr except Exception, e: @@ -177,7 +203,15 @@ try: proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=subprocess.PIPE ) returncode = proc.wait() - stderr = proc.stderr.read() + # get stderr, allowing for case where it's very large + stderr = '' + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass if returncode != 0: raise Exception, stderr except Exception, e: