1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/01cfd0c5af0f/ Changeset: 01cfd0c5af0f User: jgoecks Date: 2013-05-06 20:40:44 Summary: Tophat2 wrapper update: (a) remove unneeded wrapper script; and (b) use stdio tags to capture and show output; and (c) update functional tests for most recent version of Tophat. Affected #: 3 files diff -r 10a0ad2f0280cb044bde1e6424b7a7a93365d5ce -r 01cfd0c5af0fa5555697d14283404af2f5ef496c test-data/tophat2_out2j.bed --- a/test-data/tophat2_out2j.bed +++ b/test-data/tophat2_out2j.bed @@ -1,3 +1,3 @@ track name=junctions description="TopHat junctions" -test_chromosome 179 400 JUNC00000001 45 + 179 400 255,0,0 2 71,50 0,171 -test_chromosome 350 550 JUNC00000002 38 + 350 550 255,0,0 2 50,50 0,150 +test_chromosome 179 400 JUNC00000001 37 + 179 400 255,0,0 2 71,50 0,171 +test_chromosome 350 550 JUNC00000002 37 + 350 550 255,0,0 2 50,50 0,150 diff -r 10a0ad2f0280cb044bde1e6424b7a7a93365d5ce -r 01cfd0c5af0fa5555697d14283404af2f5ef496c tools/ngs_rna/tophat2_wrapper.py --- a/tools/ngs_rna/tophat2_wrapper.py +++ /dev/null @@ -1,270 +0,0 @@ -#!/usr/bin/env python - -import optparse, os, shutil, subprocess, sys, tempfile, fileinput - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' ) - parser.add_option( '-C', '--color-space', dest='color_space', action='store_true', help='This indicates color-space data' ) - parser.add_option( '-J', '--junctions-output', dest='junctions_output_file', help='Junctions output file; formate is BED.' ) - parser.add_option( '-H', '--hits-output', dest='accepted_hits_output_file', help='Accepted hits output file; formate is BAM.' ) - parser.add_option( '', '--own-file', dest='own_file', help='' ) - parser.add_option( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) - parser.add_option( '-r', '--mate-inner-dist', dest='mate_inner_dist', help='This is the expected (mean) inner distance between mate pairs. \ - For, example, for paired end runs with fragments selected at 300bp, \ - where each end is 50bp, you should set -r to be 200. There is no default, \ - and this parameter is required for paired end runs.') - parser.add_option( '', '--mate-std-dev', dest='mate_std_dev', help='Standard deviation of distribution on inner distances between male pairs.' ) - parser.add_option( '', '--read-mismatches', dest='read_mismatches' ) - parser.add_option( '', '--bowtie-n', action="store_true", dest='bowtie_n' ) - parser.add_option( '', '--no-discordant', action="store_true", dest='report_concordant_pairs_only' ) - parser.add_option( '-a', '--min-anchor-length', dest='min_anchor_length', - help='The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction.' ) - parser.add_option( '-m', '--splice-mismatches', dest='splice_mismatches', help='The maximum number of mismatches that can appear in the anchor region of a spliced alignment.' ) - parser.add_option( '-i', '--min-intron-length', dest='min_intron_length', - help='The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart.' ) - parser.add_option( '-I', '--max-intron-length', dest='max_intron_length', - help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.' ) - parser.add_option( '-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed' ) - parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' ) - parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' ) - parser.add_option( '', '--library-type', dest='library_type', help='TopHat will treat the reads as strand specific. Every read alignment will have an XS attribute tag. Consider supplying library type options below to select the correct RNA-seq protocol.' ) - parser.add_option( '', '--allow-indels', action="store_true", help='Allow indel search. Indel search is disabled by default.(Not used since version 1.3.0)' ) - parser.add_option( '', '--max-insertion-length', dest='max_insertion_length', help='The maximum insertion length. The default is 3.' ) - parser.add_option( '', '--max-deletion-length', dest='max_deletion_length', help='The maximum deletion length. The default is 3.' ) - - # Options for supplying own junctions - parser.add_option( '-G', '--GTF', dest='gene_model_annotations', help='Supply TopHat with a list of gene model annotations. \ - TopHat will use the exon records in this file to build \ - a set of known splice junctions for each gene, and will \ - attempt to align reads to these junctions even if they \ - would not normally be covered by the initial mapping.') - parser.add_option( '-j', '--raw-juncs', dest='raw_juncs', help='Supply TopHat with a list of raw junctions. Junctions are \ - specified one per line, in a tab-delimited format. Records \ - look like: <chrom><left><right><+/-> left and right are \ - zero-based coordinates, and specify the last character of the \ - left sequenced to be spliced to the first character of the right \ - sequence, inclusive.') - parser.add_option( '', '--no-novel-juncs', action="store_true", dest='no_novel_juncs', help="Only look for junctions indicated in the \ - supplied GFF file. (ignored without -G)") - parser.add_option( '', '--no-novel-indels', action="store_true", dest='no_novel_indels', help="Skip indel search. Indel search is enabled by default.") - # Types of search. - parser.add_option( '', '--microexon-search', action="store_true", dest='microexon_search', help='With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.') - parser.add_option( '', '--coverage-search', action="store_true", dest='coverage_search', help='Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.') - parser.add_option( '', '--no-coverage-search', action="store_false", dest='coverage_search' ) - parser.add_option( '', '--min-segment-intron', dest='min_segment_intron', help='Minimum intron length that may be found during split-segment search' ) - parser.add_option( '', '--max-segment-intron', dest='max_segment_intron', help='Maximum intron length that may be found during split-segment search' ) - parser.add_option( '', '--min-coverage-intron', dest='min_coverage_intron', help='Minimum intron length that may be found during coverage search' ) - parser.add_option( '', '--max-coverage-intron', dest='max_coverage_intron', help='Maximum intron length that may be found during coverage search' ) - - # Bowtie2 options. - parser.add_option( '', '--b2-very-fast', action='store_true', dest='b2_very_fast') - parser.add_option( '', '--b2-fast', action='store_true', dest='b2_fast') - parser.add_option( '', '--b2-very-sensitive', action='store_true', dest='b2_very_sensitive') - parser.add_option( '', '--b2-sensitive', action='store_true', dest='b2_sensitive') - - # Fusion search options. - parser.add_option( '', '--fusion-search', action='store_true', dest='fusion_search' ) - parser.add_option( '', '--fusion-anchor-length', dest='fusion_anchor_length' ) - parser.add_option( '', '--fusion-min-dist', dest='fusion_min_dist' ) - parser.add_option( '', '--fusion-read-mismatches', dest='fusion_read_mismatches' ) - parser.add_option( '', '--fusion-multireads', dest='fusion_multireads' ) - parser.add_option( '', '--fusion-multipairs', dest='fusion_multipairs' ) - parser.add_option( '', '--fusion-ignore-chromosomes', dest='fusion_ignore_chromosomes' ) - - # Wrapper options. - parser.add_option( '-1', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' ) - parser.add_option( '-2', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' ) - parser.add_option( '', '--single-paired', dest='single_paired', help='' ) - parser.add_option( '', '--settings', dest='settings', help='' ) - - # Read group options. - parser.add_option( '', '--rgid', dest='rgid', help='Read group identifier' ) - parser.add_option( '', '--rglb', dest='rglb', help='Library name' ) - parser.add_option( '', '--rgpl', dest='rgpl', help='Platform/technology used to produce the reads' ) - parser.add_option( '', '--rgsm', dest='rgsm', help='Sample' ) - - (options, args) = parser.parse_args() - - # Color or base space - space = '' - if options.color_space: - space = '-C' - - # Creat bowtie index if necessary. - tmp_index_dir = tempfile.mkdtemp() - if options.own_file: - index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( options.own_file )[1].split( '.' )[:-1] ) ) - try: - os.link( options.own_file, index_path + '.fa' ) - except: - # Tophat prefers (but doesn't require) fasta file to be in same directory, with .fa extension - pass - cmd_index = 'bowtie2-build %s -f %s %s' % ( space, options.own_file, index_path ) - try: - tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name - tmp_stderr = open( tmp, 'wb' ) - proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() ) - returncode = proc.wait() - tmp_stderr.close() - # get stderr, allowing for case where it's very large - tmp_stderr = open( tmp, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stderr.close() - if returncode != 0: - raise Exception, stderr - except Exception, e: - if os.path.exists( tmp_index_dir ): - shutil.rmtree( tmp_index_dir ) - stop_err( 'Error indexing reference sequence\n' + str( e ) ) - else: - index_path = options.index_path - - # Build tophat command. - cmd = 'tophat2 --keep-fasta-order %s %s %s' - reads = options.input1 - if options.input2: - reads += ' ' + options.input2 - opts = '-p %s %s' % ( options.num_threads, space ) - if options.single_paired == 'paired': - opts += ' -r %s' % options.mate_inner_dist - if options.report_concordant_pairs_only: - opts += ' --no-discordant' - # Read group options. - if options.rgid: - if not options.rglb or not options.rgpl or not options.rgsm: - stop_err( 'If you want to specify read groups, you must include the ID, LB, PL, and SM tags.' ) - opts += ' --rg-id %s' % options.rgid - opts += ' --rg-library %s' % options.rglb - opts += ' --rg-platform %s' % options.rgpl - opts += ' --rg-sample %s' % options.rgsm - - if options.settings == 'preSet': - cmd = cmd % ( opts, index_path, reads ) - else: - try: - if int( options.min_anchor_length ) >= 3: - opts += ' -a %s' % options.min_anchor_length - else: - raise Exception, 'Minimum anchor length must be 3 or greater' - opts += ' -m %s' % options.splice_mismatches - opts += ' -i %s' % options.min_intron_length - opts += ' -I %s' % options.max_intron_length - opts += ' -g %s' % options.max_multihits - # Custom junctions options. - if options.gene_model_annotations: - opts += ' -G %s' % options.gene_model_annotations - if options.raw_juncs: - opts += ' -j %s' % options.raw_juncs - if options.no_novel_juncs: - opts += ' --no-novel-juncs' - if options.library_type: - opts += ' --library-type %s' % options.library_type - if options.no_novel_indels: - opts += ' --no-novel-indels' - else: - if options.max_insertion_length: - opts += ' --max-insertion-length %i' % int( options.max_insertion_length ) - if options.max_deletion_length: - opts += ' --max-deletion-length %i' % int( options.max_deletion_length ) - # Max options do not work for Tophat v1.2.0, despite documentation to the contrary. (Fixed in version 1.3.1) - # need to warn user of this fact - #sys.stdout.write( "Max insertion length and max deletion length options don't work in Tophat v1.2.0\n" ) - - if options.read_mismatches: - opts += ' --read-mismatches %i' % int( options.read_mismatches ) - if options.bowtie_n: - opts += ' --bowtie-n' - - # Search type options. - if options.coverage_search: - opts += ' --coverage-search --min-coverage-intron %s --max-coverage-intron %s' % ( options.min_coverage_intron, options.max_coverage_intron ) - else: - opts += ' --no-coverage-search' - if options.microexon_search: - opts += ' --microexon-search' - if options.single_paired == 'paired' and options.mate_std_dev: - opts += ' --mate-std-dev %s' % options.mate_std_dev - if options.seg_mismatches: - opts += ' --segment-mismatches %d' % int( options.seg_mismatches ) - if options.seg_length: - opts += ' --segment-length %d' % int( options.seg_length ) - if options.min_segment_intron: - opts += ' --min-segment-intron %d' % int( options.min_segment_intron ) - if options.max_segment_intron: - opts += ' --max-segment-intron %d' % int( options.max_segment_intron ) - - # Fusion search options. - if options.fusion_search: - opts += ' --fusion-search --fusion-anchor-length %i --fusion-min-dist %i --fusion-read-mismatches %i --fusion-multireads %i --fusion-multipairs %i' % \ - ( int( options.fusion_anchor_length ), int( options.fusion_min_dist ), - int( options.fusion_read_mismatches ), int( options.fusion_multireads ), - int( options.fusion_multipairs ) ) - if options.fusion_ignore_chromosomes: - opts += ' --fusion-ignore-chromosomes %s' % options.fusion_ignore_chromosomes - - # Bowtie2 options. - if options.b2_very_fast: - opts += ' --b2-very-fast' - if options.b2_fast: - opts += ' --b2-fast' - if options.b2_sensitive: - opts += ' --b2-sensitive' - if options.b2_very_sensitive: - opts += ' --b2-very-sensitive' - - cmd = cmd % ( opts, index_path, reads ) - - except Exception, e: - # Clean up temp dirs - if os.path.exists( tmp_index_dir ): - shutil.rmtree( tmp_index_dir ) - stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) ) - print cmd - - # Run - try: - tmp_out = tempfile.NamedTemporaryFile().name - tmp_stdout = open( tmp_out, 'wb' ) - tmp_err = tempfile.NamedTemporaryFile().name - tmp_stderr = open( tmp_err, 'wb' ) - proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) - returncode = proc.wait() - tmp_stderr.close() - # get stderr, allowing for case where it's very large - tmp_stderr = open( tmp_err, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stdout.close() - tmp_stderr.close() - if returncode != 0: - raise Exception, stderr - - except Exception, e: - stop_err( 'Error in tophat:\n' + str( e ) ) - - # Clean up temp dirs - if os.path.exists( tmp_index_dir ): - shutil.rmtree( tmp_index_dir ) - -if __name__=="__main__": __main__() diff -r 10a0ad2f0280cb044bde1e6424b7a7a93365d5ce -r 01cfd0c5af0fa5555697d14283404af2f5ef496c tools/ngs_rna/tophat2_wrapper.xml --- a/tools/ngs_rna/tophat2_wrapper.xml +++ b/tools/ngs_rna/tophat2_wrapper.xml @@ -7,49 +7,38 @@ <requirement type="package">bowtie2</requirement><requirement type="package">tophat2</requirement></requirements> - <command interpreter="python"> - tophat2_wrapper.py + + <command> + ## + ## Set path to index, building the reference if necessary. + ## + + #set index_path = '' + #if $refGenomeSource.genomeSource == "history": + bowtie2-build "$refGenomeSource.ownFile" genome ; ln -s "$refGenomeSource.ownFile" genome.fa ; + #set index_path = 'genome' + #else: + #set index_path = $refGenomeSource.index.fields.path + #end if + + ## + ## Run tophat. + ## + + tophat2 ## Change this to accommodate the number of threads you have available. - --num-threads="4" - - ## Provide outputs. - --junctions-output=$junctions - --hits-output=$accepted_hits - - ## Handle reference file. - #if $refGenomeSource.genomeSource == "history": - --own-file=$refGenomeSource.ownFile - #else: - --indexes-path="${refGenomeSource.index.fields.path}" - #end if - - ## Are reads single-end or paired? - --single-paired=$singlePaired.sPaired - - ## First input file always required. - --input1=$input1 - - ## Second input only if input is paired-end. - ## Also set parameters specific to paired data. - #if $singlePaired.sPaired == "paired" - --input2=$singlePaired.input2 - -r $singlePaired.mate_inner_distance - --mate-std-dev=$singlePaired.mate_std_dev - - #if str($singlePaired.report_discordant_pairs) == "No": - --no-discordant - #end if - #end if + --num-threads 4 ## Set params. - --settings=$params.settingsType #if $params.settingsType == "full": --read-mismatches $params.read_mismatches #if str($params.bowtie_n) == "Yes": --bowtie-n #end if + --read-edit-dist $params.read_edit_dist + --read-realign-edit-dist $params.read_realign_edit_dist -a $params.anchor_length -m $params.splice_mismatches -i $params.min_intron_length @@ -57,9 +46,9 @@ -g $params.max_multihits --min-segment-intron $params.min_segment_intron --max-segment-intron $params.max_segment_intron - --seg-mismatches=$params.seg_mismatches - --seg-length=$params.seg_length - --library-type=$params.library_type + --segment-mismatches $params.seg_mismatches + --segment-length $params.seg_length + --library-type $params.library_type ## Indel search. #if $params.indel_search.allow_indel_search == "Yes": @@ -78,7 +67,6 @@ #if $params.own_junctions.raw_juncs.use_juncs == "Yes": -j $params.own_junctions.raw_juncs.raw_juncs #end if - ## TODO: No idea why a string cast is necessary, but it is: #if str($params.own_junctions.no_novel_juncs) == "Yes": --no-novel-juncs #end if @@ -91,7 +79,7 @@ #else: --no-coverage-search #end if - ## TODO: No idea why the type conversion is necessary, but it seems to be. + #if str($params.microexon_search) == "Yes": --microexon-search #end if @@ -116,12 +104,27 @@ ## Read group information. #if $readGroup.specReadGroup == "yes" - --rgid="$readGroup.rgid" - --rglb="$readGroup.rglb" - --rgpl="$readGroup.rgpl" - --rgsm="$readGroup.rgsm" + --rg-id "$readGroup.rgid" + --rg-library "$readGroup.rglb" + --rg-platform "$readGroup.rgpl" + --rg-sample "$readGroup.rgsm" + #end if + + ## Set index path, inputs and parameters specific to paired data. + #if $singlePaired.sPaired == "paired" + -r $singlePaired.mate_inner_distance + --mate-std-dev=$singlePaired.mate_std_dev + + #if str($singlePaired.report_discordant_pairs) == "No": + --no-discordant + #end if + + ${index_path} $singlePaired.input1 $singlePaired.input2 + #else + ${index_path} $singlePaired.input1 #end if </command> + <inputs><conditional name="singlePaired"><param name="sPaired" type="select" label="Is this library mate-paired?"> @@ -157,6 +160,10 @@ <when value="preSet" /><!-- Full/advanced params. --><when value="full"> + <param name="read_realign_edit_dist" type="integer" value="1000" label="Max realign edit distance" help="Some of the reads spanning multiple exons may be mapped incorrectly as a contiguous alignment to the genome even though the correct alignment should be a spliced one - this can happen in the presence of processed pseudogenes that are rarely (if at all) transcribed or expressed. This option can direct TopHat to re-align reads for which the edit distance of an alignment obtained in a previous mapping step is above or equal to this option value. If you set this option to 0, TopHat will map every read in all the mapping steps (transcriptome if you provided gene annotations, genome, and finally splice variants detected by TopHat), reporting the best possible alignment found in any of these mapping steps. This may greatly increase the mapping accuracy at the expense of an increase in running time. The default value for this option is set such that TopHat will not try to realign reads already mapped in earlier steps." /> + + <param name="read_edit_dist" type="integer" value="2" label="Max edit distance" help="Final read alignments having more than these many edit distance are discarded." /> + <param name="library_type" type="select" label="Library Type" help="TopHat will treat the reads as strand specific. Every read alignment will have an XS attribute tag. Consider supplying library type options below to select the correct RNA-seq protocol."><option value="fr-unstranded">FR Unstranded</option><option value="fr-firststrand">FR First Strand</option> @@ -260,6 +267,11 @@ </conditional><!-- readGroup --></inputs> + <stdio> + <regex match="Exception" source="both" level="error" description="Tool exception"/> + <regex match=".*" source="both" level="log" description="tool progress"/> + </stdio> + <outputs><data format="tabular" name="fusions" label="${tool.name} on ${on_string}: fusions" from_work_dir="tophat_out/fusions.out"><filter>(params['settingsType'] == 'full' and params['fusion_search']['do_search'] == 'Yes')</filter> @@ -277,6 +289,7 @@ <expand macro="dbKeyActions" /></data></outputs> + <macros><import>tophat_macros.xml</import><macro name="dbKeyActions"> @@ -312,7 +325,7 @@ <param name="index" value="tophat_test" /><param name="settingsType" value="preSet" /><param name="specReadGroup" value="No" /> - <output name="junctions" file="tophat_out1j.bed" /> + <output name="junctions" file="tophat2_out1j.bed" /><output name="accepted_hits" file="tophat_out1h.bam" compare="sim_size" /></test><!-- Test using base-space test data: paired-end reads, index from history. --> @@ -369,6 +382,7 @@ <param name="min_coverage_intron" value="50" /><param name="max_coverage_intron" value="20000" /><param name="microexon_search" value="Yes" /> + <param name="b2_settings" value="No" /><!-- Fusion search params --><param name="do_search" value="Yes" /><param name="anchor_len" value="21" /> @@ -418,6 +432,7 @@ <param name="report_discordant_pairs" value="Yes" /><param name="use_search" value="No" /><param name="microexon_search" value="Yes" /> + <param name="b2_settings" value="No" /><!-- Fusion search params --><param name="do_search" value="Yes" /><param name="anchor_len" value="21" /> Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.