galaxy-dist commit b1ec8342053f: Adding NGS simulation tool

20 Nov 2010

# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User Kelly Vincent <kpvincent@bx.psu.edu>
# Date 1288806772 14400
# Node ID b1ec8342053f3cdcd0a081ff28f077a76bd188cc
# Parent  9d68027b01096e2b32101234878d11946c03d08c
Adding NGS simulation tool

--- /dev/null
+++ b/tool-data/ngs_sim_fasta.loc.sample
@@ -0,0 +1,20 @@
+#This is a sample file distributed with Galaxy that enables the NGS simulation
+#tool to use some FASTA files. You will need to make sure that these FASTA files
+#are in place and then create the ngs_sim.loc file similar to this one (store it
+#in this directory) that points to the locations of those files. The ngs_sim.loc
+#file has this format (white space characters are TAB characters):
+#
+#<unique_build_id><dbkey><display_name><file_base_path>
+#
+#So, for example, if you had hg18chrM.fa in 
+#/data/path/hg18/seq/, 
+#then the ngs_sim.loc entry would look like this:
+#
+#hg18chrM   hg18   hg18chrM   /data/path/hg18/seq/hg18chrM.fa
+#
+#Your ngs_sim.loc file should include an entry per line for each FASTA file you
+#have stored.
+#
+#hg18chrM   hg18    hg18chrM   /data/path/hg18/seq/hg18chrM.fa
+#phiX174    phiX    phiX174    /data/path/genome/phiX/seq/phiX.fa
+#pUC18      pUC18   pUC18      /data/path/genome/pUC18/seq/pUC18.fa

Binary file test-data/ngs_simulation_out3.png has changed

--- /dev/null
+++ b/tools/ngs_simulation/ngs_simulation.xml
@@ -0,0 +1,217 @@
+<tool id="ngs_simulation" name="Simulate" version="1.0.0">
+<!--<tool id="ngs_simulation" name="Simulate" force_history_refresh="True" version="1.0.0">-->
+  <description>Illumina runs</description>
+  <command interpreter="python">
+    ngs_simulation.py
+      #if $in_type.input_type == "built-in"
+        --input="${ filter( lambda x: str( x[0] ) == str( $in_type.genome ), $__app__.tool_data_tables[ 'ngs_sim_fasta' ].get_fields() )[0][-1] }"
+        --genome=$genome
+      #else
+        --input=$in_type.input1
+      #end if
+      --read_len=$read_len
+      --avg_coverage=$avg_coverage
+      --error_rate=$error_rate
+      --num_sims=$num_sims
+      --polymorphism=$polymorphism
+      --detection_thresh=$detection_thresh
+      --output_png=$output_png
+      --summary_out=$summary_out
+      --output_summary=$output_summary
+      --new_file_path=$__new_file_path__
+  </command>
+<!-- If want to include all simulation results file
+        sim_results=$sim_results
+        output=$output.id
+-->
+  <inputs>
+    <conditional name="in_type">
+      <param name="input_type" type="select" label="Use a built-in FASTA file or one from the history?">
+        <option value="built-in">Built-in</option>
+        <option value="history">History file</option>
+      </param>
+      <when value="built-in">
+        <param name="genome" type="select" label="Select a built-in genome" help="if your genome of interest is not listed - contact Galaxy team">
+          <options from_data_table="ngs_sim_fasta" />
+        </param>
+      </when>
+      <when value="history">
+        <param name="input1" type="data" format="fasta" label="Input genome (FASTA format)" />
+      </when>
+    </conditional>
+    <param name="read_len" type="integer" value="76" label="Read length" />
+    <param name="avg_coverage" type="integer" value="200" label="Average coverage" />
+    <param name="error_rate" type="float" value="0.001" label="Error rate or quality score" help="Quality score if integer 1 or greater; error rate if between 0 and 1" />
+    <param name="num_sims" type="integer" value="100" label="The number of simulations to run" />
+    <param name="polymorphism" type="select" multiple="true" label="Frequency/ies for minor allele">
+      <option value="0.001">0.001</option>
+      <option value="0.002">0.002</option>
+      <option value="0.003">0.003</option>
+      <option value="0.004">0.004</option>
+      <option value="0.005">0.005</option>
+      <option value="0.006">0.006</option>
+      <option value="0.007">0.007</option>
+      <option value="0.008">0.008</option>
+      <option value="0.009">0.009</option>
+      <option value="0.01">0.01</option>
+      <option value="0.02">0.02</option>
+      <option value="0.03">0.03</option>
+      <option value="0.04">0.04</option>
+      <option value="0.05">0.05</option>
+      <option value="0.06">0.06</option>
+      <option value="0.07">0.07</option>
+      <option value="0.08">0.08</option>
+      <option value="0.09">0.09</option>
+      <option value="0.1">0.1</option>
+      <option value="0.2">0.2</option>
+      <option value="0.3">0.3</option>
+      <option value="0.4">0.4</option>
+      <option value="0.5">0.5</option>
+      <option value="0.6">0.6</option>
+      <option value="0.7">0.7</option>
+      <option value="0.8">0.8</option>
+      <option value="0.9">0.9</option>
+      <option value="1.0">1.0</option>
+    </param>
+    <param name="detection_thresh" type="select" multiple="true" label="Detection thresholds">
+      <option value="0.001">0.001</option>
+      <option value="0.002">0.002</option>
+      <option value="0.003">0.003</option>
+      <option value="0.004">0.004</option>
+      <option value="0.005">0.005</option>
+      <option value="0.006">0.006</option>
+      <option value="0.007">0.007</option>
+      <option value="0.008">0.008</option>
+      <option value="0.009">0.009</option>
+      <option value="0.01">0.01</option>
+      <option value="0.02">0.02</option>
+      <option value="0.03">0.03</option>
+      <option value="0.04">0.04</option>
+      <option value="0.05">0.05</option>
+      <option value="0.06">0.06</option>
+      <option value="0.07">0.07</option>
+      <option value="0.08">0.08</option>
+      <option value="0.09">0.09</option>
+      <option value="0.1">0.1</option>
+      <option value="0.2">0.2</option>
+      <option value="0.3">0.3</option>
+      <option value="0.4">0.4</option>
+      <option value="0.5">0.5</option>
+      <option value="0.6">0.6</option>
+      <option value="0.7">0.7</option>
+      <option value="0.8">0.8</option>
+      <option value="0.9">0.9</option>
+      <option value="1.0">1.0</option>
+    </param>
+    <param name="summary_out" type="boolean" truevalue="true" falsevalue="false" checked="true" label="Include a (text) summary file for all the simulations" />
+<!--    <param name="sim_results" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Output all tabular simulation results" help="Number of polymorphisms times number of detection thresholds"/>
+-->
+  </inputs>
+  <outputs>
+    <data format="png" name="output_png" />
+    <data format="tabular" name="output_summary">
+      <filter>summary_out == True</filter>
+    </data>
+<!--
+    <data format="tabular" name="output">
+      <filter>sim_files_out</filter>
+    </data>
+-->
+  </outputs>
+  <tests>
+    <!--
+      Tests cannot be run because of the non-deterministic element of the simulation.
+      But if you run the following "tests" manually in the browser and check against
+      the output files, they should be very similar to the listed output files.
+    -->
+    <!--
+    <test>
+      <param name="input_type" value="history" />
+      <param name="input1" value="ngs_simulation_in1.fasta" ftype="fasta" />
+      <param name="read_len" value="76" />
+      <param name="avg_coverage" value="200" />
+      <param name="error_rate" value="0.001" />
+      <param name="num_sims" value="25" />
+      <param name="polymorphism" value="0.02,0.04,0.1" />
+      <param name="detection_thresh" value="0.01,0.02" />
+      <param name="summary_out" value="true" />
+      <output name="output_png" file="ngs_simulation_out1.png" />
+      <output name="output_summary" file="ngs_simulation_out2.tabular" />
+    </test>
+    <test>
+      <param name="input_type" value="built-in" />
+      <param name="genome" value="pUC18" />
+      <param name="read_len" value="50" />
+      <param name="avg_coverage" value="150" />
+      <param name="error_rate" value="0.005" />
+      <param name="num_sims" value="25" />
+      <param name="polymorphism" value="0.001,0.005" />
+      <param name="detection_thresh" value="0.001,0.002" />
+      <param name="summary_out" value="false" />
+      <output name="output_png" file="ngs_simulation_out3.png" />
+    </test>
+    -->
+  </tests>
+  <help>
+
+**What it does**
+
+This tool simulates an Illumina run and provides plots of false positives and false negatives. It allows for a range of simulation parameters to be set. Note that this simulation sets only one (randomly chosen) position in the genome as polymorphic, according to the value specified. Superimposed on this are "sequencing errors", which are uniformly (and randomly) distributed. Polymorphisms are assigned using the detection threshold, so if the detection threshold is set to the same as the minor allele frequency, the expected false negative rate is 50%.
+
+**Parameter list**
+
+These are the parameters that should be set for the simulation::
+
+  Read length (which is the same for all reads)
+  Average Coverage
+  Frequency for Minor Allele
+  Sequencing Error Rate
+  Detection Threshold
+  Number of Simulations
+
+You also should choose to use either a built-in genome or supply your own FASTA file.
+
+**Output**
+
+There are one or two. The first is a png that contains two different plots and is always generated. The second is optional and is a text file with some summary information about the simulations that were run. Below are some example outputs for a 10-simulation run on phiX with the default settings::
+
+  Read length                    76
+  Average coverage               200
+  Error rate/quality score       0.001
+  Number of simulations          100
+  Frequencies for minor allele   0.002
+                                 0.004
+  Detection thresholds           0.003
+                                 0.005
+                                 0.007
+  Include summary file           Yes
+
+Plot output (png):
+
+.. image:: ../static/images/ngs_simulation.png
+
+Summary output (txt)::
+
+        FP              FN       GENOMESIZE.5386      fprate          hetcol          errcol
+  Min.   : 71.0   Min.   :0.0    Mode:logical     Min.   :0.01318         Min.   :0.004   Min.   :0.007
+  1st Qu.:86.0    1st Qu.:1.0    NA's:10          1st Qu.:0.01597         1st Qu.:0.004   1st Qu.:0.007
+  Median :92.5    Median :1.0    NA       Median :0.01717         Median :0.004   Median :0.007
+  Mean   :93.6    Mean   :0.9    NA       Mean   :0.01738         Mean   :0.004   Mean   :0.007
+  3rd Qu.:100.8   3rd Qu.:1.0    NA       3rd Qu.:0.01871         3rd Qu.:0.004   3rd Qu.:0.007
+  Max.   :123.0   Max.   :1.0    NA       Max.   :0.02284         Max.   :0.004   Max.   :0.007
+  
+  False Positive Rate Summary
+          0.003   0.005   0.007
+  0.001   0.17711 0.10854 0.01673
+  0.009   0.18049 0.10791 0.01738
+
+  False Negative Rate Summary
+          0.003   0.005     0.007
+  0.001   1.0     0.8       1.0
+  0.009   0.4     0.7       0.9
+
+
+  </help>
+</tool>
+
+

Binary file test-data/ngs_simulation_out1.png has changed

--- a/tool_conf.xml.sample
+++ b/tool_conf.xml.sample
@@ -312,6 +312,9 @@
    <tool file="genetrack/genetrack_indexer.xml" /><tool file="genetrack/genetrack_peak_prediction.xml" /></section>
+  <section name="NGS: Simulation" id="ngs-simulation">
+    <tool file="ngs_simulation/ngs_simulation.xml" />
+  </section><section name="SNP/WGA: Data; Filters" id="rgdat"><label text="Data: Import and upload" id="rgimport" /><tool file="data_source/upload.xml"/>

--- /dev/null
+++ b/test-data/ngs_simulation_in1.fasta
@@ -0,0 +1,41 @@
+>gi|209210|gb|L09136.1|SYNPUC18CV pUC18c cloning vector (beta-galactosidase mRNA on complementary strand)
+TCGCGCGTTTCGGTGATGACGGTGAAAACCTCTGACACATGCAGCTCCCGGAGACGGTCACAGCTTGTCT
+GTAAGCGGATGCCGGGAGCAGACAAGCCCGTCAGGGCGCGTCAGCGGGTGTTGGCGGGTGTCGGGGCTGG
+CTTAACTATGCGGCATCAGAGCAGATTGTACTGAGAGTGCACCATATGCGGTGTGAAATACCGCACAGAT
+GCGTAAGGAGAAAATACCGCATCAGGCGCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATC
+GGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTA
+ACGCCAGGGTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGCCAAGCTTGCATGCCTGCAGGTCG
+ACTCTAGAGGATCCCCGGGTACCGAGCTCGAATTCGTAATCATGGTCATAGCTGTTTCCTGTGTGAAATT
+GTTATCCGCTCACAATTCCACACAACATACGAGCCGGAAGCATAAAGTGTAAAGCCTGGGGTGCCTAATG
+AGTGAGCTAACTCACATTAATTGCGTTGCGCTCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCCAG
+CTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCTCTTCCGCTTCCTCGC
+TCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACG
+GTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAAC
+CGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGAC
+GCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCT
+CGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTG
+GCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTG
+TGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGT
+AAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGT
+GCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTC
+TGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAG
+CGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATC
+TTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAA
+AAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTA
+AACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCA
+TCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTG
+CTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAG
+GGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCT
+AGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCAC
+GCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCAT
+GTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTA
+TCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGA
+CTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTC
+AATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGG
+CGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGAT
+CTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAA
+GGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTAT
+CAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGC
+GCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAA
+TAGGCGTATCACGAGGCCCTTTCGTC
+

--- a/tool_data_table_conf.xml.sample
+++ b/tool_data_table_conf.xml.sample
@@ -1,22 +1,27 @@
 <tables>
-    <!-- Locations of MAF files that have been indexed with bx-python -->    
-    <table name="indexed_maf_files">
-        <columns>name, value, dbkey, species</columns>
-        <file path="tool-data/maf_index.loc" />
+    <!-- Locations of indexes in the BFAST mapper format -->
+    <table name="bfast_indexes" comment_char="#">
+        <columns>value, dbkey, formats, name, path</columns>
+        <file path="tool-data/bfast_indexes.loc" />
+    </table>
+    <!-- Locations of indexes in the Bowtie mapper format -->
+    <table name="bowtie_indexes">
+        <columns>name, value</columns>
+        <file path="tool-data/bowtie_indices.loc" /></table><!-- Locations of indexes in the BWA mapper format --><table name="bwa_indexes"><columns>name, value</columns><file path="tool-data/bwa_index.loc" /></table>
-    <!-- Locations of indexes in the Bowtie mapper format -->
-    <table name="bowtie_indexes">
-        <columns>name, value</columns>
-        <file path="tool-data/bowtie_indices.loc" />
+    <!-- Locations of MAF files that have been indexed with bx-python -->    
+    <table name="indexed_maf_files">
+        <columns>name, value, dbkey, species</columns>
+        <file path="tool-data/maf_index.loc" /></table>
-    <!-- Locations of indexes in the BFAST mapper format -->
-    <table name="bfast_indexes" comment_char="#">
-        <columns>value, dbkey, formats, name, path</columns>
-        <file path="tool-data/bfast_indexes.loc" />
+    <!-- Locations of fasta files appropriate for NGS simulation -->
+    <table name="ngs_sim_fasta" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/ngs_sim_fasta.loc" /></table></tables>

--- /dev/null
+++ b/tools/ngs_simulation/ngs_simulation.py
@@ -0,0 +1,280 @@
+#!/usr/bin/env python
+
+"""
+Runs Ben's simulation.
+
+usage: %prog [options]
+   -i, --input=i: Input genome (FASTA format)
+   -g, --genome=g: If built-in, the genome being used
+   -l, --read_len=l: Read length
+   -c, --avg_coverage=c: Average coverage
+   -e, --error_rate=e: Error rate (0-1)
+   -n, --num_sims=n: Number of simulations to run
+   -p, --polymorphism=p: Frequency/ies for minor allele (comma-separate list of 0-1)
+   -d, --detection_thresh=d: Detection thresholds (comma-separate list of 0-1)
+   -p, --output_png=p: Plot output
+   -s, --summary_out=s: Whether or not to output a file with summary of all simulations
+   -m, --output_summary=m: File name for output summary of all simulations
+   -f, --new_file_path=f: Directory for summary output files
+
+"""
+# removed output of all simulation results on request (not working)
+#   -r, --sim_results=r: Output all tabular simulation results (number of polymorphisms times number of detection thresholds)
+#   -o, --output=o: Base name for summary output for each run
+
+from rpy import *
+import os
+import random, sys, tempfile
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def __main__():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    # validate parameters
+    error = ''
+    try:
+        read_len = int( options.read_len )
+        if read_len <= 0:
+            raise Exception, ' greater than 0'
+    except TypeError, e:
+        error = ': %s' % str( e )
+    if error:
+        stop_err( 'Make sure your number of reads is an integer value%s' % error )
+    error = ''
+    try:
+        avg_coverage = int( options.avg_coverage )
+        if avg_coverage <= 0:
+            raise Exception, ' greater than 0'
+    except Exception, e:
+        error = ': %s' % str( e )
+    if error:
+        stop_err( 'Make sure your average coverage is an integer value%s' % error )
+    error = ''
+    try:
+        error_rate = float( options.error_rate )
+        if error_rate >= 1.0:
+            error_rate = 10 ** ( -error_rate / 10.0 )
+        elif error_rate < 0:
+            raise Exception, ' between 0 and 1'
+    except Exception, e:
+        error = ': %s' % str( e )
+    if error:
+        stop_err( 'Make sure the error rate is a decimal value%s or the quality score is at least 1' % error )
+    try:
+        num_sims = int( options.num_sims )
+    except TypeError, e:
+        stop_err( 'Make sure the number of simulations is an integer value: %s' % str( e ) )
+    if len( options.polymorphism ) > 0:
+        polymorphisms = [ float( p ) for p in options.polymorphism.split( ',' ) ]
+    else:
+        stop_err( 'Select at least one polymorphism value to use' )
+    if len( options.detection_thresh ) > 0:
+        detection_threshes = [ float( dt ) for dt in options.detection_thresh.split( ',' ) ]
+    else:
+        stop_err( 'Select at least one detection threshold to use' )
+
+    # mutation dictionaries
+    hp_dict = { 'A':'G', 'G':'A', 'C':'T', 'T':'C', 'N':'N' } # heteroplasmy dictionary
+    mt_dict = { 'A':'C', 'C':'A', 'G':'T', 'T':'G', 'N':'N'} # misread dictionary
+
+    # read fasta file to seq string
+    all_lines = open( options.input, 'rb' ).readlines()
+    seq = ''
+    for line in all_lines:
+        line = line.rstrip() 
+        if line.startswith('>'):
+            pass
+        else:
+            seq += line.upper()
+    seq_len = len( seq )
+
+    # output file name template
+# removed output of all simulation results on request (not working)
+#    if options.sim_results == "true":
+#        out_name_template = os.path.join( options.new_file_path, 'primary_output%s_' + options.output + '_visible_tabular' )
+#    else:
+#        out_name_template = tempfile.NamedTemporaryFile().name + '_%s'
+    out_name_template = tempfile.NamedTemporaryFile().name + '_%s'
+    print 'out_name_template:', out_name_template
+
+    # set up output files
+    outputs = {}
+    i = 1
+    for p in polymorphisms:
+        outputs[ p ] = {}
+        for d in detection_threshes:
+            outputs[ p ][ d ] = out_name_template % i
+            i += 1
+
+    # run sims
+    for polymorphism in polymorphisms:
+        for detection_thresh in detection_threshes:
+            output = open( outputs[ polymorphism ][ detection_thresh ], 'wb' )
+            output.write( 'FP\tFN\tGENOMESIZE=%s\n' % seq_len )
+            sim_count = 0
+            while sim_count < num_sims:
+                # randomly pick heteroplasmic base index
+                hbase = random.choice( range( 0, seq_len ) )
+                #hbase = seq_len/2#random.randrange( 0, seq_len )
+                # create 2D quasispecies list
+                qspec = map( lambda x: [], [0] * seq_len )
+                # simulate read indices and assign to quasispecies
+                i = 0
+                while i < ( avg_coverage * ( seq_len / read_len ) ): # number of reads (approximates coverage)
+                    start = random.choice( range( 0, seq_len ) )
+                    #start = seq_len/2#random.randrange( 0, seq_len ) # assign read start
+                    if random.random() < 0.5: # positive sense read
+                        end = start + read_len # assign read end
+                        if end > seq_len: # overshooting origin
+                            read = range( start, seq_len ) + range( 0, ( end - seq_len ) )
+                        else: # regular read
+                            read = range( start, end )
+                    else: # negative sense read
+                        end = start - read_len # assign read end
+                        if end < -1: # overshooting origin
+                            read = range( start, -1, -1) + range( ( seq_len - 1 ), ( seq_len + end ), -1 )
+                        else: # regular read
+                            read = range( start, end, -1 )
+                    # assign read to quasispecies list by index
+                    for j in read:
+                        if j == hbase and random.random() < polymorphism: # heteroplasmic base is variant with p = het
+                            ref = hp_dict[ seq[ j ] ]
+                        else: # ref is the verbatim reference nucleotide (all positions)
+                            ref = seq[ j ]
+                        if random.random() < error_rate: # base in read is misread with p = err
+                            qspec[ j ].append( mt_dict[ ref ] )
+                        else: # otherwise we carry ref through to the end
+                            qspec[ j ].append(ref)
+                    # last but not least
+                    i += 1
+                bases, fpos, fneg = {}, 0, 0 # last two will be outputted to summary file later
+                for i, nuc in enumerate( seq ):
+                    cov = len( qspec[ i ] )
+                    bases[ 'A' ] = qspec[ i ].count( 'A' )
+                    bases[ 'C' ] = qspec[ i ].count( 'C' )
+                    bases[ 'G' ] = qspec[ i ].count( 'G' )
+                    bases[ 'T' ] = qspec[ i ].count( 'T' )
+                    # calculate max NON-REF deviation
+                    del bases[ nuc ]
+                    maxdev = float( max( bases.values() ) ) / cov
+                    # deal with non-het sites
+                    if i != hbase:
+                        if maxdev >= detection_thresh: # greater than detection threshold = false positive
+                            fpos += 1
+                    # deal with het sites
+                    if i == hbase:
+                        hnuc = hp_dict[ nuc ] # let's recover het variant
+                        if ( float( bases[ hnuc ] ) / cov ) < detection_thresh: # less than detection threshold = false negative
+                            fneg += 1
+                        del bases[ hnuc ] # ignore het variant
+                        maxdev = float( max( bases.values() ) ) / cov # check other non-ref bases at het site
+                        if maxdev >= detection_thresh: # greater than detection threshold = false positive (possible)
+                            fpos += 1
+                # output error sums and genome size to summary file
+                output.write( '%d\t%d\n' % ( fpos, fneg ) )
+                sim_count += 1
+            # close output up
+            output.close()
+
+    # Parameters (heteroplasmy, error threshold, colours)
+    r( '''
+    het=c(%s)
+    err=c(%s)
+    grade = (0:32)/32
+    hues = rev(gray(grade))
+    ''' % ( ','.join( [ str( p ) for p in polymorphisms ] ), ','.join( [ str( d ) for d in detection_threshes ] ) ) )
+
+    # Suppress warnings
+    r( 'options(warn=-1)' )
+
+    # Create allsum (for FP) and allneg (for FN) objects
+    r( 'allsum <- data.frame()' )
+    for polymorphism in polymorphisms:
+        for detection_thresh in detection_threshes:
+            output = outputs[ polymorphism ][ detection_thresh ]
+            cmd = '''
+                  ngsum = read.delim('%s', header=T)
+                  ngsum$fprate <- ngsum$FP/%s
+                  ngsum$hetcol <- %s
+                  ngsum$errcol <- %s
+                  allsum <- rbind(allsum, ngsum)
+                  ''' % ( output, seq_len, polymorphism, detection_thresh )
+            r( cmd )
+
+    if os.path.getsize( output ) == 0:
+        for p in outputs.keys():
+            for d in outputs[ p ].keys():
+                sys.stderr.write(outputs[ p ][ d ] + ' '+str( os.path.getsize( outputs[ p ][ d ] ) )+'\n')
+
+    if options.summary_out == "true":
+        r( 'write.table(summary(ngsum), file="%s", quote=FALSE, sep="\t", row.names=FALSE)' % options.output_summary )
+
+    # Summary objects (these could be printed)
+    r( '''
+    tr_pos <- tapply(allsum$fprate,list(allsum$hetcol,allsum$errcol), mean)
+    tr_neg <- tapply(allsum$FN,list(allsum$hetcol,allsum$errcol), mean)
+    cat('\nFalse Positive Rate Summary\n\t', file='%s', append=T, sep='\t')
+    write.table(format(tr_pos, digits=4), file='%s', append=T, quote=F, sep='\t')
+    cat('\nFalse Negative Rate Summary\n\t', file='%s', append=T, sep='\t')
+    write.table(format(tr_neg, digits=4), file='%s', append=T, quote=F, sep='\t')
+    ''' % tuple( [ options.output_summary ] * 4 ) )
+
+    # Setup graphs
+    #pdf(paste(prefix,'_jointgraph.pdf',sep=''), 15, 10)
+    r( '''
+    png('%s', width=800, height=500, units='px', res=250)
+    layout(matrix(data=c(1,2,1,3,1,4), nrow=2, ncol=3), widths=c(4,6,2), heights=c(1,10,10))
+    ''' % options.output_png )
+
+    # Main title
+    genome = ''
+    if options.genome:
+        genome = '%s: ' % options.genome
+    r( '''
+    par(mar=c(0,0,0,0))
+    plot(1, type='n', axes=F, xlab='', ylab='')
+    text(1,1,paste('%sVariation in False Positives and Negatives (', %s, ' simulations, coverage ', %s,')', sep=''), font=2, family='sans', cex=0.7)
+    ''' % ( genome, options.num_sims, options.avg_coverage ) )
+
+    # False positive boxplot
+    r( '''
+    par(mar=c(5,4,2,2), las=1, cex=0.35)
+    boxplot(allsum$fprate ~ allsum$errcol, horizontal=T, ylim=rev(range(allsum$fprate)), cex.axis=0.85)
+    title(main='False Positives', xlab='false positive rate', ylab='')
+    ''' )
+
+    # False negative heatmap (note zlim command!)
+    num_polys = len( polymorphisms )
+    num_dets = len( detection_threshes )
+    r( '''
+    par(mar=c(5,4,2,1), las=1, cex=0.35)
+    image(1:%s, 1:%s, tr_neg, zlim=c(0,1), col=hues, xlab='', ylab='', axes=F, border=1)
+    axis(1, at=1:%s, labels=rownames(tr_neg), lwd=1, cex.axis=0.85, axs='i')
+    axis(2, at=1:%s, labels=colnames(tr_neg), lwd=1, cex.axis=0.85)
+    title(main='False Negatives', xlab='minor allele frequency', ylab='detection threshold')
+    ''' % ( num_polys, num_dets, num_polys, num_dets ) )
+
+    # Scale alongside
+    r( '''
+    par(mar=c(2,2,2,3), las=1)
+    image(1, grade, matrix(grade, ncol=length(grade), nrow=1), col=hues, xlab='', ylab='', xaxt='n', las=1, cex.axis=0.85)
+    title(main='Key', cex=0.35)
+    mtext('false negative rate', side=1, cex=0.35)
+    ''' )
+
+    # Close graphics
+    r( '''
+    layout(1)
+    dev.off()
+    ''' )
+
+    # Tidy up
+#    r( 'rm(folder,prefix,sim,cov,het,err,grade,hues,i,j,ngsum)' )
+
+if __name__ == "__main__" : __main__()

--- /dev/null
+++ b/test-data/ngs_simulation_out2.tabular
@@ -0,0 +1,19 @@
+      FP	      FN	GENOMESIZE.2686	    fprate	    hetcol	    errcol
+Min.   :0.00  	Min.   :0  	Mode:logical  	Min.   :0.000e+00  	Min.   :0.1  	Min.   :0.02  
+1st Qu.:0.00  	1st Qu.:0  	NA's:25       	1st Qu.:0.000e+00  	1st Qu.:0.1  	1st Qu.:0.02  
+Median :0.00  	Median :0  	NA	Median :0.000e+00  	Median :0.1  	Median :0.02  
+Mean   :0.04  	Mean   :0  	NA	Mean   :1.489e-05  	Mean   :0.1  	Mean   :0.02  
+3rd Qu.:0.00  	3rd Qu.:0  	NA	3rd Qu.:0.000e+00  	3rd Qu.:0.1  	3rd Qu.:0.02  
+Max.   :1.00  	Max.   :0  	NA	Max.   :3.723e-04  	Max.   :0.1  	Max.   :0.02  
+
+False Positive Rate Summary
+	0.01	0.02
+0.02	9.710e-03	4.468e-05
+0.04	9.680e-03	1.489e-05
+0.1	9.695e-03	1.489e-05
+
+False Negative Rate Summary
+	0.01	0.02
+0.02	0.16	0.52
+0.04	0.00	0.04
+0.1	0.00	0.00

    

commits-noreply＠bitbucket.org

tags

participants (1)