galaxy-dist commit 9d68027b0109: Add options to Tophat wrapper for specifying own splice junctions.

19 Nov 2010

# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User jeremy goecks <jeremy.goecks@emory.edu>
# Date 1288805582 14400
# Node ID 9d68027b01096e2b32101234878d11946c03d08c
# Parent  49f0e8441a4da6b1ec03250448ab84854f07aa77
Add options to Tophat wrapper for specifying own splice junctions.

--- a/tools/ngs_rna/tophat_wrapper.py
+++ b/tools/ngs_rna/tophat_wrapper.py
@@ -29,6 +29,24 @@ def __main__():
                         help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.' )
     parser.add_option( '-F', '--junction_filter', dest='junction_filter', help='Filter out junctions supported by too few alignments (number of reads divided by average depth of coverage)' )
     parser.add_option( '-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed' )
+    parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' )
+    parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' )
+    
+    # Options for supplying own junctions
+    parser.add_option( '-G', '--GTF', dest='gene_model_annotations', help='Supply TopHat with a list of gene model annotations. \
+                                                                           TopHat will use the exon records in this file to build \
+                                                                           a set of known splice junctions for each gene, and will \
+                                                                           attempt to align reads to these junctions even if they \
+                                                                           would not normally be covered by the initial mapping.')
+    parser.add_option( '-j', '--raw-juncs', dest='raw_juncs', help='Supply TopHat with a list of raw junctions. Junctions are \
+                                                                    specified one per line, in a tab-delimited format. Records \
+                                                                    look like: <chrom><left><right><+/-> left and right are \
+                                                                    zero-based coordinates, and specify the last character of the \
+                                                                    left sequenced to be spliced to the first character of the right \
+                                                                    sequence, inclusive.')
+    parser.add_option( '', '--no-novel-juncs', action="store_true", dest='no_novel_juncs', help="Only look for junctions indicated in the \
+                                                                                            supplied GFF file. (ignored without -G)")
+    # Types of search.
     parser.add_option( '', '--microexon-search', action="store_true", dest='microexon_search', help='With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.')
     parser.add_option( '', '--closure-search', action="store_true", dest='closure_search', help='Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (<= 50bp)')
     parser.add_option( '', '--no-closure-search', action="store_false", dest='closure_search' )
@@ -41,8 +59,6 @@ def __main__():
     parser.add_option( '', '--max-closure-intron', dest='max_closure_intron', help='Maximum intron length that may be found during closure search' )
     parser.add_option( '', '--min-coverage-intron', dest='min_coverage_intron', help='Minimum intron length that may be found during coverage search' )
     parser.add_option( '', '--max-coverage-intron', dest='max_coverage_intron', help='Maximum intron length that may be found during coverage search' )
-    parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' )
-    parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' )
     
     # Wrapper options.
     parser.add_option( '-1', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' )
@@ -107,6 +123,15 @@ def __main__():
             if float( options.junction_filter ) != 0.0:
                 opts += ' -F %s' % options.junction_filter
             opts += ' -g %s' % options.max_multihits
+            # Custom junctions options.
+            if options.gene_model_annotations:
+                opts += ' -G %s' % options.gene_model_annotations
+            if options.raw_juncs:
+                opts += ' -j %s' % options.raw_juncs
+            if options.no_novel_juncs:
+                opts += ' --no-novel-juncs'
+                
+            # Search type options.
             if options.coverage_search:
                 opts += ' --coverage-search --min-coverage-intron %s --max-coverage-intron %s' % ( options.min_coverage_intron, options.max_coverage_intron )
             else:

--- a/tools/ngs_rna/tophat_wrapper.xml
+++ b/tools/ngs_rna/tophat_wrapper.xml
@@ -45,6 +45,21 @@
                    --max-segment-intron $singlePaired.sParams.max_segment_intron
                    --seg-mismatches=$singlePaired.sParams.seg_mismatches
                    --seg-length=$singlePaired.sParams.seg_length
+                   
+                   ## Supplying junctions parameters.
+                   #if $singlePaired.sParams.own_junctions.use_junctions == "Yes":
+                        #if $singlePaired.sParams.own_junctions.gene_model_ann.use_annotations == "Yes":
+                            -G $singlePaired.sParams.own_junctions.gene_model_ann.gene_annotation_model
+                        #end if
+                        #if $singlePaired.sParams.own_junctions.raw_juncs.use_juncs == "Yes":
+                            -j $singlePaired.sParams.own_junctions.raw_juncs.raw_juncs
+                        #end if
+                        ## TODO: No idea why a string cast is necessary, but it is:
+                        #if str($singlePaired.sParams.own_junctions.no_novel_juncs) == "Yes":
+                            --no-novel-juncs
+                        #end if
+                   #end if
+                   
                    #if $singlePaired.sParams.closure_search.use_search == "Yes":
                         --closure-search
                         --min-closure-exon $singlePaired.sParams.closure_search.min_closure_exon
@@ -60,8 +75,8 @@
                    #else:
                         --no-coverage-search
                    #end if
-                   ## No idea why the type conversion is necessary, but it seems to be.
-                   #if str ($singlePaired.sParams.microexon_search) == "Yes":
+                   ## TODO: No idea why the type conversion is necessary, but it seems to be.
+                   #if str($singlePaired.sParams.microexon_search) == "Yes":
                         --microexon-search
                    #end if
                  #end if
@@ -81,6 +96,21 @@
                    --max-segment-intron $singlePaired.pParams.max_segment_intron
                    --seg-mismatches=$singlePaired.pParams.seg_mismatches
                    --seg-length=$singlePaired.pParams.seg_length
+                   
+                   ## Supplying junctions parameters.
+                   #if $singlePaired.pParams.own_junctions.use_junctions == "Yes":
+                        #if $singlePaired.pParams.own_junctions.gene_model_ann.use_annotations == "Yes":
+                            -G $singlePaired.pParams.own_junctions.gene_model_ann.gene_annotation_model
+                        #end if
+                        #if $singlePaired.pParams.own_junctions.raw_juncs.use_juncs == "Yes":
+                            -j $singlePaired.pParams.own_junctions.raw_juncs.raw_juncs
+                        #end if
+                        ## TODO: No idea why type cast is necessary, but it is:
+                        #if str($singlePaired.pParams.own_junctions.no_novel_juncs) == "Yes":
+                            --no-novel-juncs
+                        #end if
+                   #end if
+                   
                    #if $singlePaired.pParams.closure_search.use_search == "Yes":
                         --closure-search
                         --min-closure-exon $singlePaired.pParams.closure_search.min_closure_exon
@@ -96,7 +126,7 @@
                    #else:
                         --no-coverage-search
                    #end if
-                   ## No idea why the type conversion is necessary, but it seems to be.
+                   ## TODO: No idea why the type conversion is necessary, but it seems to be.
                    #if str ($singlePaired.pParams.microexon_search) == "Yes":
                         --microexon-search
                    #end if
@@ -146,6 +176,42 @@
                   <param name="max_segment_intron" type="integer" value="500000" label="Maximum intron length that may be found during split-segment (default) search" /><param name="seg_mismatches" type="integer" value="2" label="Number of mismatches allowed in each segment alignment for reads mapped independently" /><param name="seg_length" type="integer" value="25" label="Minimum length of read segments" />
+                  
+                  <!-- Options for supplying own junctions. -->
+                  <conditional name="own_junctions">
+                      <param name="use_junctions" type="select" label="Use Own Junctions">
+                        <option value="No">No</option>
+                        <option value="Yes">Yes</option>
+                      </param>
+                      <when value="Yes">
+                          <conditional name="gene_model_ann">
+                             <param name="use_annotations" type="select" label="Use Gene Annotation Model">
+                                <option value="No">No</option>
+                                <option value="Yes">Yes</option>                               
+                             </param>
+                             <when value="No" />
+                             <when value="Yes">
+                               <param format="gtf" name="gene_annotation_model" type="data" label="Gene Model Annotations" help="TopHat will use the exon records in this file to build a set of known splice junctions for each gene, and will attempt to align reads to these junctions even if they would not normally be covered by the initial mapping."/>
+                             </when>
+                          </conditional>
+                          <conditional name="raw_juncs">
+                             <param name="use_juncs" type="select" label="Use Raw Junctions">
+                                <option value="No">No</option>
+                                <option value="Yes">Yes</option>                               
+                             </param>
+                             <when value="No" />
+                             <when value="Yes">
+                               <param format="interval" name="raw_juncs" type="data" label="Raw Junctions" help="Supply TopHat with a list of raw junctions. Junctions are specified one per line, in a tab-delimited format. Records look like: [chrom] [left] [right] [+/-] left and right are zero-based coordinates, and specify the last character of the left sequenced to be spliced to the first character of the right sequence, inclusive."/>
+                             </when>
+                          </conditional>
+                          <param name="no_novel_juncs" type="select" label="Only look for supplied junctions">
+                            <option value="No">No</option>
+                            <option value="Yes">Yes</option>
+                          </param>
+                      </when>
+                      <when value="No" />
+                  </conditional><!-- /own_junctions -->
+                  
                   <!-- Closure search. --><conditional name="closure_search"><param name="use_search" type="select" label="Use Closure Search">
@@ -201,6 +267,41 @@
                   <param name="max_segment_intron" type="integer" value="500000" label="Maximum intron length that may be found during split-segment (default) search" /><param name="seg_mismatches" type="integer" value="2" label="Number of mismatches allowed in each segment alignment for reads mapped independently" /><param name="seg_length" type="integer" value="25" label="Minimum length of read segments" />
+                  <!-- Options for supplying own junctions. -->
+                  <conditional name="own_junctions">
+                      <param name="use_junctions" type="select" label="Use Own Junctions">
+                        <option value="No">No</option>
+                        <option value="Yes">Yes</option>
+                      </param>
+                      <when value="Yes">
+                          <conditional name="gene_model_ann">
+                             <param name="use_annotations" type="select" label="Use Gene Annotation Model">
+                                <option value="No">No</option>
+                                <option value="Yes">Yes</option>                               
+                             </param>
+                             <when value="No" />
+                             <when value="Yes">
+                               <param format="gtf" name="gene_annotation_model" type="data" label="Gene Model Annotations" help="TopHat will use the exon records in this file to build a set of known splice junctions for each gene, and will attempt to align reads to these junctions even if they would not normally be covered by the initial mapping."/>
+                             </when>
+                          </conditional>
+                          <conditional name="raw_juncs">
+                             <param name="use_juncs" type="select" label="Use Raw Junctions">
+                                <option value="No">No</option>
+                                <option value="Yes">Yes</option>                               
+                             </param>
+                             <when value="No" />
+                             <when value="Yes">
+                               <param format="interval" name="raw_juncs" type="data" label="Raw Junctions" help="Supply TopHat with a list of raw junctions. Junctions are specified one per line, in a tab-delimited format. Records look like: [chrom] [left] [right] [+/-] left and right are zero-based coordinates, and specify the last character of the left sequenced to be spliced to the first character of the right sequence, inclusive."/>
+                             </when>
+                          </conditional>
+                          <param name="no_novel_juncs" type="select" label="Only look for supplied junctions">
+                            <option value="No">No</option>
+                            <option value="Yes">Yes</option>
+                          </param>
+                      </when>
+                      <when value="No" />
+                  </conditional><!-- /own_junctions -->
+                  
                   <!-- Closure search. --><conditional name="closure_search"><param name="use_search" type="select" label="Use Closure Search">
@@ -385,8 +486,11 @@ This is a list of implemented Tophat opt
   -F/--min-isoform-fraction 0.0-1.0 TopHat filters out junctions supported by too few alignments. Suppose a junction spanning two exons, is supported by S reads. Let the average depth of coverage of 
                                     exon A be D, and assume that it is higher than B. If S / D is less than the minimum isoform fraction, the junction is not reported. A value of zero disables the 
                                     filter. The default is 0.15.
-  -g/--max-multihits INT        Instructs TopHat to allow up to this many alignments to the reference for a given read, and suppresses all alignments for reads with more than this many 
+  -g/--max-multihits INT            Instructs TopHat to allow up to this many alignments to the reference for a given read, and suppresses all alignments for reads with more than this many 
                                     alignments. The default is 40.
+  -G/--GTF [GTF 2.2 file]           Supply TopHat with a list of gene model annotations. TopHat will use the exon records in this file to build a set of known splice junctions for each gene, and will attempt to align reads to these junctions even if they would not normally be covered by the initial mapping.
+  -j/--raw-juncs [juncs file]       Supply TopHat with a list of raw junctions. Junctions are specified one per line, in a tab-delimited format. Records look like: [chrom] [left] [right] [+/-], where left and right are zero-based coordinates that specify the last character of the left sequence to be spliced to the first character of the right sequence, inclusive.
+  --no-novel-juncs                  Only look for junctions indicated in the supplied GTF file. (ignored without -G)
   --no-closure-search	            Disables the mate pair closure-based search for junctions. Currently, has no effect - closure search is off by default.
   --closure-search	            Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (about or less than 50bp)
   --no-coverage-search	            Disables the coverage based search for junctions.

    

commits-noreply＠bitbucket.org

tags

participants (1)