[galaxy-commits] commit/galaxy-central: jgoecks: New version of Cuffdiff wrapper: (a) remove obsolete wrapper script; (b) rework steps to specify conditions and replicates; and (c) remove deprecated options.

31 Jul 2013

1 new commit in galaxy-central:

https://bitbucket.org/galaxy/galaxy-central/commits/6a32d26aefc7/
Changeset:   6a32d26aefc7
User:        jgoecks
Date:        2013-07-31 15:50:00
Summary:     New version of Cuffdiff wrapper: (a) remove obsolete wrapper script; (b) rework steps to specify conditions and replicates; and (c) remove deprecated options.
Affected #:  2 files

diff -r 951e853b0bcd2c62cedee0b95d46c9e36ab6c605 -r 6a32d26aefc758cf576e0b753c2626ac56005c18 tools/ngs_rna/cuffdiff_wrapper.py

--- a/tools/ngs_rna/cuffdiff_wrapper.py
+++ /dev/null
@@ -1,241 +0,0 @@
-#!/usr/bin/env python
-
-# Wrapper supports Cuffdiff versions v1.3.0-v2.0
-
-import optparse, os, shutil, subprocess, sys, tempfile
-
-def group_callback( option, op_str, value, parser ):
-    groups = []
-    flist = []
-    for arg in parser.rargs:
-        arg = arg.strip()
-        if arg[0] is "-":
-            break
-        elif arg[0] is ",":
-            groups.append(flist)
-            flist = []
-        else:
-            flist.append(arg)
-    groups.append(flist)
-
-    setattr(parser.values, option.dest, groups)
-    
-def label_callback( option, op_str, value, parser ):
-    labels = []
-    for arg in parser.rargs:
-        arg = arg.strip()
-        if arg[0] is "-":
-            break
-        else:
-            labels.append(arg)
-
-    setattr(parser.values, option.dest, labels)
-
-def stop_err( msg ):
-    sys.stderr.write( "%s\n" % msg )
-    sys.exit()
-    
-# Copied from sam_to_bam.py:
-def check_seq_file( dbkey, cached_seqs_pointer_file ):
-    seq_path = ''
-    for line in open( cached_seqs_pointer_file ):
-        line = line.rstrip( '\r\n' )
-        if line and not line.startswith( '#' ) and line.startswith( 'index' ):
-            fields = line.split( '\t' )
-            if len( fields ) < 3:
-                continue
-            if fields[1] == dbkey:
-                seq_path = fields[2].strip()
-                break
-    return seq_path
-
-def __main__():
-    #Parse Command Line
-    parser = optparse.OptionParser()
-    
-    # Cuffdiff options.
-    parser.add_option( '-s', '--inner-dist-std-dev', dest='inner_dist_std_dev', help='The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.' )
-    parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
-    parser.add_option( '-m', '--inner-mean-dist', dest='inner_mean_dist', help='This is the expected (mean) inner distance between mate pairs. \
-                                                                                For, example, for paired end runs with fragments selected at 300bp, \
-                                                                                where each end is 50bp, you should set -r to be 200. The default is 45bp.')
-    parser.add_option( '-c', '--min-alignment-count', dest='min_alignment_count', help='The minimum number of alignments in a locus for needed to conduct significance testing on changes in that locus observed between samples. If no testing is performed, changes in the locus are deemed not signficant, and the locus\' observed changes don\'t contribute to correction for multiple testing. The default is 1,000 fragment alignments (up to 2,000 paired reads).' )
-    parser.add_option( '--FDR', dest='FDR', help='The allowed false discovery rate. The default is 0.05.' )
-    parser.add_option( '-u', '--multi-read-correct', dest='multi_read_correct', action="store_true", help='Tells Cufflinks to do an initial estimation procedure to more accurately weight reads mapping to multiple locations in the genome')
-    parser.add_option( '--library-norm-method', dest='library_norm_method' )
-    parser.add_option( '--dispersion-method', dest='dispersion_method' )
-
-    # Advanced Options:	
-    parser.add_option( '--num-importance-samples', dest='num_importance_samples', help='Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000' )
-    parser.add_option( '--max-mle-iterations', dest='max_mle_iterations', help='Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000' )
-    
-    # Wrapper / Galaxy options.
-    parser.add_option( '-f', '--files', dest='groups', action="callback", callback=group_callback, help="Groups to be processed, groups are separated by spaces, replicates in a group comma separated. group1_rep1,group1_rep2 group2_rep1,group2_rep2, ..., groupN_rep1, groupN_rep2" )
-    parser.add_option( '-A', '--inputA', dest='inputA', help='A transcript GTF file produced by cufflinks, cuffcompare, or other source.')
-    parser.add_option( '-1', '--input1', dest='input1', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' )
-    parser.add_option( '-2', '--input2', dest='input2', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' )
-
-    # Label options
-    parser.add_option('-L', '--labels', dest='labels', action="callback", callback=label_callback, help="Labels for the groups the replicates are in.")
-    
-	# Normalization options.
-    parser.add_option( "-N", "--quartile-normalization", dest="do_normalization", action="store_true" )
-
-    # Bias correction options.
-    parser.add_option( '-b', dest='do_bias_correction', action="store_true", help='Providing Cufflinks with a multifasta file via this option instructs it to run our new bias detection and correction algorithm which can significantly improve accuracy of transcript abundance estimates.')
-    parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' )
-    parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' )
-    parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' )
-
-    # Outputs.
-    parser.add_option( "--isoforms_fpkm_tracking_output", dest="isoforms_fpkm_tracking_output" )
-    parser.add_option( "--genes_fpkm_tracking_output", dest="genes_fpkm_tracking_output" )
-    parser.add_option( "--cds_fpkm_tracking_output", dest="cds_fpkm_tracking_output" )
-    parser.add_option( "--tss_groups_fpkm_tracking_output", dest="tss_groups_fpkm_tracking_output" )
-    parser.add_option( "--isoforms_exp_output", dest="isoforms_exp_output" )
-    parser.add_option( "--genes_exp_output", dest="genes_exp_output" )
-    parser.add_option( "--tss_groups_exp_output", dest="tss_groups_exp_output" )
-    parser.add_option( "--cds_exp_fpkm_tracking_output", dest="cds_exp_fpkm_tracking_output" )
-    parser.add_option( "--splicing_diff_output", dest="splicing_diff_output" )
-    parser.add_option( "--cds_diff_output", dest="cds_diff_output" )
-    parser.add_option( "--promoters_diff_output", dest="promoters_diff_output" )
-    
-    (options, args) = parser.parse_args()
-    
-    # output version # of tool
-    try:
-        tmp = tempfile.NamedTemporaryFile().name
-        tmp_stdout = open( tmp, 'wb' )
-        proc = subprocess.Popen( args='cuffdiff --no-update-check 2>&1', shell=True, stdout=tmp_stdout )
-        tmp_stdout.close()
-        returncode = proc.wait()
-        stdout = None
-        for line in open( tmp_stdout.name, 'rb' ):
-            if line.lower().find( 'cuffdiff v' ) >= 0:
-                stdout = line.strip()
-                break
-        if stdout:
-            sys.stdout.write( '%s\n' % stdout )
-        else:
-            raise Exception
-    except:
-        sys.stdout.write( 'Could not determine Cuffdiff version\n' )
-    
-    # If doing bias correction, set/link to sequence file.
-    if options.do_bias_correction:
-        if options.ref_file != 'None':
-            # Sequence data from history.
-            # Create symbolic link to ref_file so that index will be created in working directory.
-            seq_path = "ref.fa"
-            os.symlink( options.ref_file, seq_path  )
-        else:
-            # Sequence data from loc file.
-            cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' )
-            if not os.path.exists( cached_seqs_pointer_file ):
-                stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file )
-            # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa,
-            # and the equCab2.fa file will contain fasta sequences.
-            seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file )
-            if seq_path == '':
-                stop_err( 'No sequence data found for dbkey %s, so bias correction cannot be used.' % options.dbkey  )            
-    
-    # Build command.
-    
-    # Base; always use quiet mode to avoid problems with storing log output.
-    cmd = "cuffdiff --no-update-check -q"
-    
-    # Add options.
-    if options.library_norm_method:
-        cmd += ( " --library-norm-method %s" % options.library_norm_method )
-    if options.dispersion_method:
-        cmd += ( " --dispersion-method %s" % options.dispersion_method )
-    if options.inner_dist_std_dev:
-        cmd += ( " -s %i" % int ( options.inner_dist_std_dev ) )
-    if options.num_threads:
-        cmd += ( " -p %i" % int ( options.num_threads ) )
-    if options.inner_mean_dist:
-        cmd += ( " -m %i" % int ( options.inner_mean_dist ) )
-    if options.min_alignment_count:
-        cmd += ( " -c %i" % int ( options.min_alignment_count ) )
-    if options.FDR:
-        cmd += ( " --FDR %f" % float( options.FDR ) )
-    if options.multi_read_correct:
-        cmd += ( " -u" )
-    if options.num_importance_samples:
-        cmd += ( " --num-importance-samples %i" % int ( options.num_importance_samples ) )
-    if options.max_mle_iterations:
-        cmd += ( " --max-mle-iterations %i" % int ( options.max_mle_iterations ) )
-    if options.do_normalization:
-        cmd += ( " -N" )
-    if options.do_bias_correction:
-        cmd += ( " -b %s" % seq_path )
-            
-    # Add inputs.
-    # For replicate analysis: group1_rep1,group1_rep2 groupN_rep1,groupN_rep2
-    if options.groups:
-        cmd += " --labels "
-        for label in options.labels:
-            cmd += '"%s",' % label
-        cmd = cmd[:-1]
-
-        cmd += " " + options.inputA + " "
-
-        for group in options.groups:
-            for filename in group:
-                cmd += filename + ","
-            cmd = cmd[:-1] + " "
-    else: 
-        cmd += " " + options.inputA + " " + options.input1 + " " + options.input2
-        
-    # Debugging.
-    print cmd
-
-    # Run command.
-    try:
-        tmp_name = tempfile.NamedTemporaryFile().name
-        tmp_stderr = open( tmp_name, 'wb' )
-        proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
-        returncode = proc.wait()
-        tmp_stderr.close()
-        
-        # Get stderr, allowing for case where it's very large.
-        tmp_stderr = open( tmp_name, 'rb' )
-        stderr = ''
-        buffsize = 1048576
-        try:
-            while True:
-                stderr += tmp_stderr.read( buffsize )
-                if not stderr or len( stderr ) % buffsize != 0:
-                    break
-        except OverflowError:
-            pass
-        tmp_stderr.close()
-        
-        # Error checking.
-        if returncode != 0:
-            raise Exception, stderr
-            
-        # check that there are results in the output file
-        if len( open( "isoforms.fpkm_tracking", 'rb' ).read().strip() ) == 0:
-            raise Exception, 'The main output file is empty, there may be an error with your input file or settings.'
-    except Exception, e:
-        stop_err( 'Error running cuffdiff. ' + str( e ) )
-
-        
-    # Copy output files to specified files.
-    try:
-        shutil.copyfile( "isoforms.fpkm_tracking", options.isoforms_fpkm_tracking_output )
-        shutil.copyfile( "genes.fpkm_tracking", options.genes_fpkm_tracking_output )
-        shutil.copyfile( "cds.fpkm_tracking", options.cds_fpkm_tracking_output )
-        shutil.copyfile( "tss_groups.fpkm_tracking", options.tss_groups_fpkm_tracking_output )
-        shutil.copyfile( "isoform_exp.diff", options.isoforms_exp_output )
-        shutil.copyfile( "gene_exp.diff", options.genes_exp_output )
-        shutil.copyfile( "tss_group_exp.diff", options.tss_groups_exp_output )
-        shutil.copyfile( "splicing.diff", options.splicing_diff_output )
-        shutil.copyfile( "cds.diff", options.cds_diff_output )
-        shutil.copyfile( "cds_exp.diff", options.cds_exp_fpkm_tracking_output )
-        shutil.copyfile( "promoters.diff", options.promoters_diff_output )    
-    except Exception, e:
-        stop_err( 'Error in cuffdiff:\n' + str( e ) )
-
-if __name__=="__main__": __main__()

diff -r 951e853b0bcd2c62cedee0b95d46c9e36ab6c605 -r 6a32d26aefc758cf576e0b753c2626ac56005c18 tools/ngs_rna/cuffdiff_wrapper.xml
--- a/tools/ngs_rna/cuffdiff_wrapper.xml
+++ b/tools/ngs_rna/cuffdiff_wrapper.xml
@@ -1,40 +1,23 @@
-<tool id="cuffdiff" name="Cuffdiff" version="0.0.5">
+<tool id="cuffdiff" name="Cuffdiff" version="0.0.6"><!-- Wrapper supports Cuffdiff versions 2.1.0-2.1.1 --><description>find significant changes in transcript expression, splicing, and promoter use</description><requirements><requirement type="package">cufflinks</requirement></requirements>
-    <command interpreter="python">
-        cuffdiff_wrapper.py
+    <command>
+        cuffdiff
             --FDR=$fdr
             --num-threads="4"
             --min-alignment-count=$min_alignment_count
             --library-norm-method=$library_norm_method
             --dispersion-method=$dispersion_method
 
-            --isoforms_fpkm_tracking_output=$isoforms_fpkm_tracking
-            --genes_fpkm_tracking_output=$genes_fpkm_tracking
-            --cds_fpkm_tracking_output=$cds_fpkm_tracking
-            --tss_groups_fpkm_tracking_output=$tss_groups_fpkm_tracking
-            --isoforms_exp_output=$isoforms_exp
-            --genes_exp_output=$genes_exp
-            --tss_groups_exp_output=$tss_groups_exp
-            --cds_exp_fpkm_tracking_output=$cds_exp_fpkm_tracking
-            --splicing_diff_output=$splicing_diff
-            --cds_diff_output=$cds_diff
-            --promoters_diff_output=$promoters_diff
-            
             ## Set advanced data parameters?
             #if $additional.sAdditional == "Yes":
                 -m $additional.frag_mean_len
                 -s $additional.frag_len_std_dev
             #end if
 
-            ## Normalization?
-            #if str($do_normalization) == "Yes":
-            -N
-            #end if
-
             ## Multi-read correct?
             #if str($multiread_correct) == "Yes":
             -u
@@ -51,50 +34,26 @@
                 --dbkey=${gtf_input.metadata.dbkey} 
                 --index_dir=${GALAXY_DATA_INDEX_DIR}
             #end if
-                
+
+            #set labels = ','.join( [ str( $condition.name ) for $condition in $conditions ] )
+            --labels $labels
+
             ## Inputs.
-            --inputA=$gtf_input
-            #if $group_analysis.do_groups == "No":
-                --input1=$aligned_reads1
-                --input2=$aligned_reads2
-            #else:
-                ## Replicates.
-                --labels
-                #for $group in $group_analysis.groups
-                    ## Cuffdiff uses commas as delimiters, so replace them with underscores to avoid
-                    ## parsing problems.
-                    "${group.group.replace(',', '_')}"
-                #end for
-                --files
-                #for $group in $group_analysis.groups
-                    #for $file in $group.files:
-                        ${file.file}
-                    #end for
-                    ,
-                #end for
-            #end if
-
+            $gtf_input
+            #for $condition in $conditions:
+                #set samples = ','.join( [ str( $sample.sample ) for $sample in $condition.samples ] )
+                $samples
+            #end for
     </command><inputs><param format="gtf,gff3" name="gtf_input" type="data" label="Transcripts" help="A transcript GFF3 or GTF file produced by cufflinks, cuffcompare, or other source."/>
-        <conditional name="group_analysis"> 
-            <param name="do_groups" type="select" label="Perform replicate analysis" help="Perform cuffdiff with replicates in each group.">
-                <option value="No">No</option>
-                <option value="Yes">Yes</option>
-            </param>
-            <when value="Yes">
-                <repeat name="groups" title="Group">
-                    <param name="group" title="Group name" type="text" label="Group name"/>
-                    <repeat name="files" title="Replicate">
-                        <param name="file" label="Add file" type="data" format="sam,bam"/>
-                    </repeat>
-                </repeat>
-            </when>
-            <when value="No">
-                <param format="sam,bam" name="aligned_reads1" type="data" label="SAM or BAM file of aligned RNA-Seq reads" help=""/>
-                <param format="sam,bam" name="aligned_reads2" type="data" label="SAM or BAM file of aligned RNA-Seq reads" help=""/>
-            </when>
-        </conditional>
+
+        <repeat name="conditions" title="Condition" min="2">
+            <param name="name" title="Condition name" type="text" label="Name"/>
+            <repeat name="samples" title="Replicate" min="1">
+                <param name="sample" label="Add replicate" type="data" format="sam,bam"/>
+            </repeat>
+        </repeat><param name="library_norm_method" type="select" label="Library normalization method"><option value="geometric" selected="True">geometric</option>
@@ -112,11 +71,6 @@
 
         <param name="min_alignment_count" type="integer" value="10" label="Min Alignment Count" help="The minimum number of alignments in a locus for needed to conduct significance testing on changes in that locus observed between samples."/>
 
-        <param name="do_normalization" type="select" label="Perform quartile normalization" help="Removes top 25% of genes from FPKM denominator to improve accuracy of differential expression calls for low abundance transcripts.">
-            <option value="No">No</option>
-            <option value="Yes">Yes</option>
-        </param>
-
         <param name="multiread_correct" type="select" label="Use multi-read correct" help="Tells Cufflinks to do an initial estimation procedure to more accurately weight reads mapping to multiple locations in the genome."><option value="No" selected="true">No</option><option value="Yes">Yes</option>
@@ -155,18 +109,22 @@
         </conditional></inputs>
 
+    <stdio>
+        <regex match=".*" source="both" level="log" description="tool progress"/>
+    </stdio>
+
     <outputs>
-        <data format="tabular" name="splicing_diff" label="${tool.name} on ${on_string}: splicing differential expression testing"/>
-        <data format="tabular" name="promoters_diff" label="${tool.name} on ${on_string}: promoters differential expression testing"/>
-        <data format="tabular" name="cds_diff" label="${tool.name} on ${on_string}: CDS overloading diffential expression testing"/>
-        <data format="tabular" name="cds_exp_fpkm_tracking" label="${tool.name} on ${on_string}: CDS FPKM differential expression testing"/>
-        <data format="tabular" name="cds_fpkm_tracking" label="${tool.name} on ${on_string}: CDS FPKM tracking"/>
-        <data format="tabular" name="tss_groups_exp" label="${tool.name} on ${on_string}: TSS groups differential expression testing"/>
-        <data format="tabular" name="tss_groups_fpkm_tracking" label="${tool.name} on ${on_string}: TSS groups FPKM tracking" />
-        <data format="tabular" name="genes_exp" label="${tool.name} on ${on_string}: gene differential expression testing"/>
-        <data format="tabular" name="genes_fpkm_tracking" label="${tool.name} on ${on_string}: gene FPKM tracking"/>
-        <data format="tabular" name="isoforms_exp" label="${tool.name} on ${on_string}: transcript differential expression testing"/>
-        <data format="tabular" name="isoforms_fpkm_tracking" label="${tool.name} on ${on_string}: transcript FPKM tracking"/>
+        <data format="tabular" name="splicing_diff" label="${tool.name} on ${on_string}: splicing differential expression testing" from_work_dir="splicing.diff" />
+        <data format="tabular" name="promoters_diff" label="${tool.name} on ${on_string}: promoters differential expression testing" from_work_dir="promoters.diff" />
+        <data format="tabular" name="cds_diff" label="${tool.name} on ${on_string}: CDS overloading diffential expression testing" from_work_dir="cds.diff" />
+        <data format="tabular" name="cds_exp_fpkm_tracking" label="${tool.name} on ${on_string}: CDS FPKM differential expression testing" from_work_dir="cds_exp.diff" />
+        <data format="tabular" name="cds_fpkm_tracking" label="${tool.name} on ${on_string}: CDS FPKM tracking" from_work_dir="cds.fpkm_tracking" />
+        <data format="tabular" name="tss_groups_exp" label="${tool.name} on ${on_string}: TSS groups differential expression testing" from_work_dir="tss_group_exp.diff" />
+        <data format="tabular" name="tss_groups_fpkm_tracking" label="${tool.name} on ${on_string}: TSS groups FPKM tracking" from_work_dir="tss_groups.fpkm_tracking" />
+        <data format="tabular" name="genes_exp" label="${tool.name} on ${on_string}: gene differential expression testing" from_work_dir="gene_exp.diff" />
+        <data format="tabular" name="genes_fpkm_tracking" label="${tool.name} on ${on_string}: gene FPKM tracking" from_work_dir="genes.fpkm_tracking" />
+        <data format="tabular" name="isoforms_exp" label="${tool.name} on ${on_string}: transcript differential expression testing" from_work_dir="isoform_exp.diff" />
+        <data format="tabular" name="isoforms_fpkm_tracking" label="${tool.name} on ${on_string}: transcript FPKM tracking" from_work_dir="isoforms.fpkm_tracking" /></outputs><tests>
@@ -174,21 +132,19 @@
                 <!--
                     cuffdiff cuffcompare_out5.gtf cuffdiff_in1.sam cuffdiff_in2.sam 
                 -->
+                <!-- 
+                    NOTE: as of version 0.0.6 of the wrapper, tests cannot be run because multiple inputs to a repeat
+                    element are not supported.
                 <param name="gtf_input" value="cuffcompare_out5.gtf" ftype="gtf" /><param name="do_groups" value="No" /><param name="aligned_reads1" value="cuffdiff_in1.sam" ftype="sam" /><param name="aligned_reads2" value="cuffdiff_in2.sam" ftype="sam" />
-                <!-- Defaults. --><param name="fdr" value="0.05" /><param name="min_alignment_count" value="0" /><param name="do_bias_correction" value="No" /><param name="do_normalization" value="No" /><param name="multiread_correct" value="No"/><param name="sAdditional" value="No"/>
-                <!-- 
-                    Line diffs are needed because cuffdiff does not produce deterministic output.
-                    TODO: can we find datasets that lead to deterministic behavior?
-                --><output name="splicing_diff" file="cuffdiff_out9.txt"/><output name="promoters_diff" file="cuffdiff_out10.txt"/><output name="cds_diff" file="cuffdiff_out11.txt"/>
@@ -200,6 +156,7 @@
                 <output name="genes_fpkm_tracking" file="cuffdiff_out6.txt" lines_diff="200"/><output name="isoforms_exp" file="cuffdiff_out1.txt" lines_diff="200"/><output name="isoforms_fpkm_tracking" file="cuffdiff_out5.txt" lines_diff="200"/>
+                --></test></tests>

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving
this because the commit notification service is enabled for the
recipient of this email.

    

[galaxy-commits] commit/galaxy-central: jgoecks: New version of Cuffdiff wrapper: (a) remove obsolete wrapper script; (b) rework steps to specify conditions and replicates; and (c) remove deprecated options.

commits-noreply＠bitbucket.org