[hg] galaxy 3470: Error conditions will skip lines instead of st...

3 Mar 2010

details:   http://www.bx.psu.edu/hg/galaxy/rev/0291f870f2c9
changeset: 3470:0291f870f2c9
user:      Greg Von Kuster <greg@bx.psu.edu>
date:      Wed Mar 03 13:40:26 2010 -0500
description:
Error conditions will skip lines instead of stopping the tool for the lastz paired reads, and clean up the tool pages for both lastz tools.

diffstat:

 tools/sr_mapping/lastz_paired_reads_wrapper.py  |  134 ++++++++++++++++++-----
 tools/sr_mapping/lastz_paired_reads_wrapper.xml |  102 +-----------------
 tools/sr_mapping/lastz_wrapper.xml              |   18 +-
 3 files changed, 118 insertions(+), 136 deletions(-)

diffs (464 lines):

diff -r 8e9aa1709c6c -r 0291f870f2c9 tools/sr_mapping/lastz_paired_reads_wrapper.py

--- a/tools/sr_mapping/lastz_paired_reads_wrapper.py	Wed Mar 03 12:07:39 2010 -0500
+++ b/tools/sr_mapping/lastz_paired_reads_wrapper.py	Wed Mar 03 13:40:26 2010 -0500
@@ -78,11 +78,57 @@
 # Keep track of all created temporary files so they can be deleted
 global tmp_file_names
 tmp_file_names = []
+# The values in the skipped_lines dict are tuples consisting of:
+# - the number of skipped lines for that error
+# If not a sequence error:
+# - the 1st line number on which the error was found
+# - the text of the 1st line on which the error was found
+# If a sequence error:
+# - The number of the sequence in the file
+# - the sequence name on which the error occurred
+# We may need to improve dealing with file position and text as
+# much of it comes from temporary files that are created from the
+# inputs, and not the inputs themselves, so this could be confusing
+# to the user.
+global skipped_lines
+skipped_lines = dict( bad_interval=( 0, 0, '' ),
+                      inconsistent_read_lengths=( 0, 0, '' ),
+                      inconsistent_reads=( 0, 0, '' ),
+                      inconsistent_sizes=( 0, 0, '' ),
+                      missing_mate=( 0, 0, '' ),
+                      missing_quals=( 0, 0, '' ),
+                      missing_seq=( 0, 0, '' ),
+                      multiple_seqs=( 0, 0, '' ),
+                      no_header=( 0, 0, '' ),
+                      num_fields=( 0, 0, '' ),
+                      reads_paired=( 0, 0, '' ),
+                      sam_flag=( 0, 0, '' ),
+                      sam_headers=( 0, 0, '' ),
+                      sam_min_columns=( 0, 0, '' ),
+                      two_mate_names=( 0, 0, '' ),
+                      wrong_seq_len=( 0, 0, '' ) )
+global total_skipped_lines
+total_skipped_lines = 0
 
 def stop_err( msg ):
     sys.stderr.write( "%s" % msg )
     sys.exit()
 
+def skip_line( error_key, position, text ):
+    """
+    Record one skipped line for error_key.  The position and text are
+    only stored while no text has been stored for that error, so the
+    report refers to the 1st line on which the error was found.
+    """
+    # Declared global because it is rebound below
+    global total_skipped_lines
+    count, first_position, first_text = skipped_lines[ error_key ]
+    if not first_text:
+        first_position, first_text = position, text
+    # The dict values are tuples ( immutable ), so store a new tuple
+    skipped_lines[ error_key ] = ( count + 1, first_position, first_text )
+    total_skipped_lines += 1
+
 def get_tmp_file_name( dir=None, suffix=None ):
     """
     Return a unique temporary file name that can be managed.  The
@@ -150,16 +188,16 @@
             line = line.split( "#", 1 )[0].rstrip()
         fields = line.split()
         if len( fields ) != 4:
-            # TODO: Do we want to err out here or just skip the line?
-            stop_err( "Wrong number of fields ( must be 4 ) in line %d: %s" % ( i+1, line ) )
+            skip_line( 'num_fields', i+1, line )
+            continue
         name, start, length, size = fields
         start = int( start )
         length = int( length )
         size = int( size )
         end = start + length
         if end > size:
-            # TODO: Do we want to err out here or just skip the line?
-            stop_err( "Bad interval in line %d: %s" % ( i+1, line ) )
+            skip_line( 'bad_interval', i+1, line )
+            continue
         if name not in read_to_linker_dict:
             read_to_linker_dict[ name ] = ( start, end, size )
             continue
@@ -168,9 +206,8 @@
             continue
         ( s, e, sz ) = read_to_linker_dict[ name ]
         if sz != size:
-            # This should never occur
-            # TODO: Do we want to err out here or just skip the line?
-            stop_err( "Inconsistent sizes for %s" % name )
+            skip_line( 'inconsistent_sizes', i+1, name )
+            continue
         if s > end or e < start:
             # Non-overlapping intervals, so skip this sequence
             read_to_linker_dict[ name ] = None
@@ -194,18 +231,15 @@
             read_to_linker_dict[ seq.name ] = ""
             continue
         if read_to_linker_dict[ seq.name ] == "":
-            # TODO: Do we want to err out here or just skip the line?
-            stop_err( "Multiple sequences named %s" % seq.name )
+            skip_line( 'multiple_seqs', seqs, seq.name )
+            continue
         if read_to_linker_dict[ seq.name ] == None:
             # Read previously marked as non-overlapping intervals, so skip this sequence - see above
             continue
         ( start, end, size ) = read_to_linker_dict[ seq.name ]
         if seq.length != size:
-            # TODO: Do we want to err out here or just skip the line?
-            combined_linker_file.close()
-            mates_file.close()
-            mates_mapping_file.close()
-            stop_err( "Sequence disagrees with size for sequence %s, size: %s seq.length: %s" % ( seq.name, str( size ), str( seq.length ) ) )
+            skip_line( 'wrong_seq_len', seqs, seq.name )
+            continue
         left = seq.text[ :start ]
         right = seq.text[ end: ]
         left_is_small = len( left ) <= seq_len_lower_threshold
@@ -272,7 +306,9 @@
             if not seq:
                 break
             seqs += 1
-            # Create a temporary file to contain the current sequence as input to lastz
+            # Create a temporary file to contain the current sequence as input to lastz.
+            # We're doing this a bit differently here since we could be generating a huge
+            # number of temporary files.
             tmp_in_fd, tmp_in_file_name = tempfile.mkstemp( suffix='seq_%d_in' % seqs )
             tmp_in_file = os.fdopen( tmp_in_fd, 'w+b' )
             tmp_in_file.write( '>%s\n%s\n' % ( seq.name, seq.text ) )
@@ -441,10 +477,12 @@
         if not line.startswith( "#" ):
             fields = line.split()
             if len( fields ) != 4:
-                stop_err( "Incorrect number of fields (must be 4) in line %s of file %s" % ( i+1, tmp_mates_mapping_file_name ) )
+                skip_line( "num_fields", i+1, line )
+                continue
             mate_name, read_name, s_offset, e_offset = fields
             if mate_name in mate_to_read_dict:
-                stop_err( "%s is in the mate_to_read_dict when it should not be." % mate_name )
+                skip_line( 'two_mate_names', i+1, mate_name )
+                continue
             mate_to_read_dict[ mate_name ] = ( read_name, int( s_offset ), int( e_offset ) )
     # Read sequence data
     read_to_nucs_dict = {}
@@ -458,9 +496,8 @@
         seq_text_upper = seq.text.upper()
         if seq.name in read_to_nucs_dict:
             if seq_text_upper != read_to_nucs_dict[ seq.name ]:
-                # TODO: Should we err out here or just skip the line?
-                stop_err( "Inconsistent reads named %s (second occurs at line %d in file %s)" % ( seq.name, seqs, input2 ) )
-                #continue
+                skip_line( 'inconsistent_reads', seqs, seq.name )
+                continue
         read_to_nucs_dict[ seq.name ] = seq_text_upper
     # Read quality data
     def quality_sequences( f ):
@@ -477,7 +514,8 @@
                 seq_line  = line_number
                 seq_quals = []
             elif seq_name is None:
-                stop_err( "First quality sequence has no header" )
+                skip_line( 'no_header', line_number, line )
+                continue
             else:
                 seq_quals += [ int( q ) for q in line.split() ]
         if seq_name is not None:
@@ -494,11 +532,11 @@
         quals = samify_phred_scores( quals )
         if seq_name in read_to_quals_dict:
             if quals != read_to_quals_dict[ seq_name ]:
-                stop_err( "Inconsistent quality sequences named %s (second occurs at line %d in %s)" % ( seq_name, line_number, input4 ) )
+                skip_line( 'inconsistent_reads', line_number, seq_name )
             continue
         if len( quals ) != len( read_to_nucs_dict[ seq_name ] ):
-            stop_err( "Inconsistent read/quality lengths for %s, quals: %s, read_to_nucs_dict[ seq_name ]: %s" % \
-                      ( seq_name, quals, read_to_nucs_dict[ seq_name ] ) )
+            skip_line( 'inconsistent_read_lengths', line_number, seq_name )
+            continue
         read_to_quals_dict[ seq_name ] = quals
     # process the SAM file
     tmp_align_file_names = ' '.join( tmp_align_file_name_list )
@@ -512,21 +550,25 @@
         line = line.strip()
         if line.startswith( "@" ):
             if has_non_header:
-                stop_err( "Input SAM contains headers in several places (e.g., line %d) in file %s" % ( i+1, combined_chrom_file_name ) )
+                skip_line( 'sam_headers', i+1, line )
+                continue
             fout.write( "%s\n" % line )
             continue
         has_non_header = True
         fields = line.split()
         num_fields = len( fields )
         if num_fields < SAM_MIN_COLUMNS:
-            stop_err( "Not enough columns at line %d (%d, expected %d)" % ( i+1, num_fields, SAM_MIN_COLUMNS ) )
+            skip_line( 'sam_min_columns', i+1, line )
+            continue
         # Set flags for mates
         try:
             flag = int( fields[ SAM_FLAG_COLUMN ] )
         except ValueError:
-            stop_err( "Bad SAM flag at line %d: %s" % ( i+1, line ) )
+            skip_line( 'sam_flag', i+1, line )
+            continue
         if not( flag & ( BAM_FPAIRED + BAM_FREAD1 + BAM_FREAD2 ) == 0 ):
-            stop_err( "SAM flag indicates reads already paired, at line %d\n%s" % ( i+1, line ) )
+            skip_line( 'reads_paired', i+1, line )
+            continue
         mate_name = fields[ SAM_QNAME_COLUMN ]
         unmap_it = False
         half = None
@@ -548,7 +590,8 @@
             try:
                 read_name, s_offset, e_offset = mate_to_read_dict[ mate_name ]
             except KeyError:
-                stop_err( "'%s' doesn't appear in the mapping file." % mate_name )
+                skip_line( 'missing_mate', i+1, mate_name )
+                continue
             cigar = fields[ SAM_CIGAR_COLUMN ]
             cigar_prefix = None
             cigar_suffix = None
@@ -598,14 +641,16 @@
             fields[ SAM_CIGAR_COLUMN ] = cigar
         # Fetch sequence and quality values, and flip/clip them
         if read_name not in read_to_nucs_dict:
-            stop_err( "Missing sequence for '%s'" % read_name )
+            skip_line( 'missing_seq', i+1, read_name )
+            continue
         nucs = read_to_nucs_dict[ read_name ]
         if not on_plus_strand:
             nucs = reverse_complement( nucs )
         quals = None
         if read_to_quals_dict != None:
             if read_name not in read_to_quals_dict:
-                stop_err( "Missing quality values for '%s'" % read_name )
+                skip_line( 'missing_quals', i+1, read_name )
+                continue
             quals = read_to_quals_dict[ read_name ]
             if not on_plus_strand:
                 quals = reverse_string( quals )
@@ -752,5 +797,32 @@
     # Delete all temporary files
     for file_name in tmp_file_names:
         os.remove( file_name )
+    # Handle any invalid lines in the input data
+    if total_skipped_lines:
+        msgs = dict( bad_interval="Bad interval in line",
+                     inconsistent_read_lengths="Inconsistent read/quality lengths for seq #",
+                     inconsistent_reads="Inconsistent reads for seq #",
+                     inconsistent_sizes="Inconsistent sizes for seq #",
+                     missing_mate="Mapping file does not include mate on line",
+                     missing_quals="Missing quality values for name on line",
+                     missing_seq="Missing sequence for name on line",
+                     multiple_seqs="Multiple names for seq #",
+                     no_header="First quality sequence has no header",
+                     num_fields="Must have 4 fields in line",
+                     reads_paired="SAM flag indicates reads already paired on line",
+                     sam_flag="Bad SAM flag on line",
+                     sam_headers="SAM headers on line",
+                     sam_min_columns="Need 11 columns on line",
+                     two_mate_names="Mate name already seen, line",
+                     wrong_seq_len="Size differs from length of seq #" )
+        print "Skipped %d invalid lines: " % total_skipped_lines
+        msg = ""
+        for k, v in skipped_lines.items():
+            if v[0]:
+                # v[0] is the number of times the error occurred
+                # v[1] is the position of the line or sequence in the file
+                # v[2] is the name of the sequence or the text of the line
+                msg += "(%d)%s %d:%s. " % ( v[0], msgs[k], v[1], v[2] )
+        print msg
 
 if __name__=="__main__": __main__()
diff -r 8e9aa1709c6c -r 0291f870f2c9 tools/sr_mapping/lastz_paired_reads_wrapper.xml
--- a/tools/sr_mapping/lastz_paired_reads_wrapper.xml	Wed Mar 03 12:07:39 2010 -0500
+++ b/tools/sr_mapping/lastz_paired_reads_wrapper.xml	Wed Mar 03 13:40:26 2010 -0500
@@ -38,7 +38,7 @@
         <param name="input3" format="fasta" type="data" label="Linker file" />
         <param name="input4" format="qual454" type="data" label="Select a base quality score 454 dataset" />
         <conditional name="seq_name">
-            <param name="how_to_name" type="select" label="Do you want to modify reference name?">
+            <param name="how_to_name" type="select" label="Do you want to modify the reference name?">
                 <option value="no">No</option>
                 <option value="yes">Yes</option>
             </param>
@@ -75,9 +75,9 @@
         
 **What it does**    
         
-**LASTZ** is a high performance pairwise sequence aligner derived from BLASTZ. It is written by Bob Harris in Webb Miller's laboratory at Penn State University. Special scoring sets were derived to improve runtime performance and quality. The Galaxy version of LASTZ is geared towards aligning of short (Illumina/Solexa, AB/SOLiD) and medium (Roche/454) reads against a reference sequence. There is excellent, extensive `documentation`__ on LASTZ available, although it hasn't been updated for the version of LASTZ that Galaxy is running (the key changes have to do with output formats, so it is still extremely helpful).
+**LASTZ** is a high performance pairwise sequence aligner derived from BLASTZ. It is written by Bob Harris in Webb Miller's laboratory at Penn State University. Special scoring sets were derived to improve runtime performance and quality. This Galaxy version of LASTZ is geared towards aligning short (Illumina/Solexa, AB/SOLiD) and medium (Roche/454) paired reads against a reference sequence. There is excellent, extensive documentation on LASTZ available here_. 
 
- .. __: http://www.bx.psu.edu/miller_lab/dist/README.lastz-1.01.50/README.lastz-1.01...
+ .. _here: http://www.bx.psu.edu/miller_lab/dist/README.lastz-1.02.00/README.lastz-1.02...
  
 ------
 
@@ -89,7 +89,7 @@
 
 **Outputs**
 
-LASTZ generates one output. Depending on the choice you make in *Select output format* drop-down LASTZ will produce a SAM file showing sequence alignments, a list of differences between the reads and reference (Polymorphisms), or a general table with one line per alignment block (Tabular). Examples of these outputs are shown below.
+This LASTZ tool produces a SAM file showing sequence alignments.
 
 **SAM output**
 
@@ -132,102 +132,11 @@
   0x0080  the read is the second read in a pair
   0x0100  the alignment is not primary
 
-**Polymorphism (SNP or differences) output**
-
-Polymorphism output contains 14 columns::
-
-     1     2     3  4     5                                   6   7   8  9  10  11 12                                   13                                    14
-  --------------------------------------------------------------------------------------------------------------------------------------------------------------
-  chrM  2490  2491  +  5386  HWI-EAS91_1_306UPAAXX:6:1:486:822   10  11  -  36  C  A  ACCTGTTTTACAGACACCTAAAGCTACATCGTCAAC  ACCTGTTTTAAAGACACCTAAAGCTACATCGTCAAC
-  chrM  2173  2174  +  5386  HWI-EAS91_1_306UPAAXX:6:1:259:1389  26  27  +  36  G  T  GCGTACTTATTCGCCACCATGATTATGACCAGTGTT  GCGTACTTATTCGCCACCATGATTATTACCAGTGTT
-
-where::
-
-  1. (chrM)   - Reference sequence id
-  2. (2490)   - Start position of the difference in the reference
-  3. (2491)   - End position of the difference in the reference
-  4. (+)      - Strand of the reference (always plus)
-  5. (5386)   - Length of the reference sequence
-  6. (HWI...) - read id
-  7. (10)     - Start position of the difference in the read
-  8. (11)     - End position of the difference in the read
-  9. (+)      - Strand of the read
- 10. (36)     - Length of the read
- 11. (C)      - Nucleotide in the reference
- 12. (A)      - Nucleotide in the read
- 13. (ACC...) - Reference side os the alignment
- 14. (ACC...) - Read side of the alignment
- 
-**Tabular output**
-
-Tabular output is a tab-separated format with 30 columns::
-
-   1        2  3     4     5     6     7   8                 9              10  11   12   13   14   15   16   17   18  19                20                21   22     23      24      25    26    27    28    29  30
-  -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-  14  PHIX174  +  5386  4648  4647  4661  14  ATTTTCGTGATATT    EYKX4VC01BV8HS  +   204  154  153  167  154  153  167  14  ATTTTCGTGATATT    ..............    14M  14/14  100.0%  14/204  6.9%  0/14  0.0%  4494  NA
-  16  PHIX174  +  5386  3363  3362  3378  16  GACGCCGGATTTGAGA  EYKX4VC01AWJ88  -   259   36   35   51  209  208  224  16  GACGCCGGATTTGAGA  ................  16M  16/16  100.0%  16/259  6.2%  0/16  0.0%  3327  NA
-
-The following columns are present::
-
-             Field  Meaning
-  ----------------  -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-   1.        score  Score of the alignment block. The scale and meaning of this number will vary, depending on the final stage performed and other command-line options.
-   2.        name1  Name of the target sequence.
-   3.      strand1  Target sequence strand, either "+" or "−".
-   4.        size1  Size of the entire target sequence.
-   5.       start1  Starting position of the alignment block in the target, origin-one.
-   6.      zstart1  Starting position of the alignment block in the target, origin-zero.
-   7.         end1  Ending position of the alignment block in the target, expressed either as origin-one closed or origin-zero half-open (the ending value is the same in both systems).
-   8.      length1  Length of the alignment block in the target (excluding gaps).
-   9.        text1  Aligned characters in the target, including gap characters.
-  10.        name2  Name of the query sequence.
-  11.      strand2  Query sequence strand, either "+" or "−".
-  12.        size2  Size of the entire query sequence.
-  13.       start2  Starting position of the alignment block in the query, origin-one.
-  14.      zstart2  Starting position of the alignment block in the query, origin-zero.
-  15.         end2  Ending position of the alignment block in the query, expressed either as origin-one closed or origin-zero half-open (the ending value is the same in both systems).
-  16.      start2+  Starting position of the alignment block in the query, counting along the query sequence's positive strand (regardless of which query strand was aligned), origin-one. Note that if strand2 is "−", then this is the other end of the block from start2.
-  17.     zstart2+  Starting position of the alignment block in the query, counting along the query sequence's positive strand (regardless of which query strand was aligned), origin-zero. Note that if strand2 is "−", then this is the other end of the block from zstart2.
-  18.        end2+  Ending position of the alignment block in the query, counting along the query sequence's positive strand (regardless of which query strand was aligned), expressed either as origin-one closed or origin-zero half-open (the ending value is the same in both systems). Note that if strand2 is "−", then this is the other end of the block from end2.
-  19.      length2  Length of the alignment block in the query (excluding gaps).
-  20.        text2  Aligned characters in the query, including gap characters.
-  21.         diff  Differences between what would be written for text1 and text2. Matches are written as . (period), transitions as : (colon), transversions as X, and gaps as - (hyphen).
-  22.        cigar  A CIGAR-like representation of the alignment's path through the Dynamic Programming matrix. This is the short representation, without spaces, described in the Ensembl CIGAR specification.
-  23./24. identity  Fraction of aligned bases in the block that are matches (see Identity). This is written as two fields. The first field is a fraction, written as <n>/<d>. The second field contains the same value, computed as a percentage.
-  25./26. coverage  Fraction of the entire input sequence (target or query, whichever is shorter) that is covered by the alignment block (see Coverage). This is written as two fields. The first field is a fraction, written as <n>/<d>. The second field contains the same value, computed as a percentage.
-  27./28.  gaprate  Rate of gaps (also called indels) in the alignment block. This is written as two fields. The first field is a fraction, written as <n>/<d>, with the numerator being the number of alignment columns containing gaps and the denominator being the number without gaps. The second field contains the same value, computed as a percentage.
-  29.     diagonal  The diagonal of the start of the alignment block in the dynamic programming matrix, expressed as an identifying number start1-start2.
-  30.      shingle  A measurement of the shingle overlap between the target and the query. This is intended for the case where both the target and query are relatively short, and their ends are expected to overlap.  
-
--------
-
-**LASTZ Settings**
-
-There are two setting modes: (1) **Commonly used settings** and (2) **Full Parameter List**.
-
-**Commonly used settings**
-
-There are seven modes::
-
-  Illumina-Solexa/AB-SOLiD 95% identity
-  Illumina-Solexa/AB-SOLiD 85% identity
-  Roche-454 98% identity
-  Roche-454 95% identity
-  Roche-454 90% identity
-  Roche-454 85% identity
-  Roche-454 75% identity
-
-when deciding which one to use consider the following: a 36 bp read with two difference will be 34/36 = 94% identical to the reference.  
-
-**Full Parameter List**
-
-This modes gives you a fuller control over lastz. The description of these and other parameters is found at the end of this page. Note, that not all parameters are included in this interface. If you would like to make additional options available through Galaxy, e-mail us at galaxy-bugs@bx.psu.edu.
-
 ------
 
-**Do you want to modify reference name?**
+**Do you want to modify the reference name?**
 
-This option allows you set the name of the reference sequence manually. This is helpful when, for example, you would like to make reference name compatible with the UCSC naming conventions to be able to display your lastz results as a custom track at UCSC Genome Browser.
+This option allows you to set the name of the reference sequence manually. This is helpful when, for example, you would like to make the reference name compatible with the UCSC naming conventions to be able to display your lastz results as a custom track at the UCSC Genome Browser.
 
 ------
 
diff -r 8e9aa1709c6c -r 0291f870f2c9 tools/sr_mapping/lastz_wrapper.xml
--- a/tools/sr_mapping/lastz_wrapper.xml	Wed Mar 03 12:07:39 2010 -0500
+++ b/tools/sr_mapping/lastz_wrapper.xml	Wed Mar 03 13:40:26 2010 -0500
@@ -97,7 +97,7 @@
             </when>   
         </conditional>
         <conditional name="seq_name">
-            <param name="how_to_name" type="select" label="Do you want to modify reference name?">
+            <param name="how_to_name" type="select" label="Do you want to modify the reference name?">
                 <option value="no">No</option>
                 <option value="yes">Yes</option>
             </param>
@@ -213,9 +213,9 @@
         
 **What it does**    
         
-**LASTZ** is a high performance pairwise sequence aligner derived from BLASTZ. It is written by Bob Harris in Webb Miller's laboratory at Penn State University. Special scoring sets were derived to improve runtime performance and quality. The Galaxy version of LASTZ is geared towards aligning of short (Illumina/Solexa, AB/SOLiD) and medium (Roche/454) reads against a reference sequence. There is excellent, extensive `documentation`__ on LASTZ available, although it hasn't been updated for the version of LASTZ that Galaxy is running (the key changes have to do with output formats, so it is still extremely helpful).
+**LASTZ** is a high performance pairwise sequence aligner derived from BLASTZ. It is written by Bob Harris in Webb Miller's laboratory at Penn State University. Special scoring sets were derived to improve runtime performance and quality. This Galaxy version of LASTZ is geared towards aligning short (Illumina/Solexa, AB/SOLiD) and medium (Roche/454) reads against a reference sequence. There is excellent, extensive documentation on LASTZ available here_.
 
- .. __: http://www.bx.psu.edu/miller_lab/dist/README.lastz-1.01.50/README.lastz-1.01...
+ .. _here: http://www.bx.psu.edu/miller_lab/dist/README.lastz-1.02.00/README.lastz-1.02...
  
 ------
 
@@ -227,7 +227,7 @@
 
 **Outputs**
 
-LASTZ generates one output. Depending on the choice you make in *Select output format* drop-down LASTZ will produce a SAM file showing sequence alignments, a list of differences between the reads and reference (Polymorphisms), or a general table with one line per alignment block (Tabular). Examples of these outputs are shown below.
+LASTZ generates one output. Depending on the choice you make in the *Select output format* drop-down, LASTZ will produce a SAM file showing sequence alignments, a list of differences between the reads and reference (Polymorphisms), or a general table with one line per alignment block (Tabular). Examples of these outputs are shown below.
 
 **SAM output**
 
@@ -355,23 +355,23 @@
   Roche-454 85% identity
   Roche-454 75% identity
 
-when deciding which one to use consider the following: a 36 bp read with two difference will be 34/36 = 94% identical to the reference.  
+When deciding which one to use, consider the following: a 36 bp read with two differences will be 34/36 = 94% identical to the reference.  
 
 **Full Parameter List**
 
-This modes gives you a fuller control over lastz. The description of these and other parameters is found at the end of this page. Note, that not all parameters are included in this interface. If you would like to make additional options available through Galaxy, e-mail us at galaxy-bugs@bx.psu.edu.
+This mode gives you fuller control over lastz. The description of these and other parameters is found at the end of this page. Note that not all parameters are included in this interface. If you would like to make additional options available through Galaxy, e-mail us at galaxy-bugs@bx.psu.edu.
 
 ------
 
-**Do you want to modify reference name?**
+**Do you want to modify the reference name?**
 
-This option allows you set the name of the reference sequence manually. This is helpful when, for example, you would like to make reference name compatible with the UCSC naming conventions to be able to display your lastz results as a custom track at UCSC Genome Browser.
+This option allows you to set the name of the reference sequence manually. This is helpful when, for example, you would like to make the reference name compatible with the UCSC naming conventions to be able to display your lastz results as a custom track at the UCSC Genome Browser.
 
 ------
 
 **LASTZ parameter list**
 
-This is an exhaustive list of LASTZ options. Once again, please note that not all parameters are included in this interface. If you would like to make additional options available through Galaxy, e-mail us at galaxy-bugs@bx.psu.edu::
+This is an exhaustive list of LASTZ options. Once again, please note that not all options are included in this interface. If you would like to make additional options available through Galaxy, e-mail us at galaxy-bugs@bx.psu.edu::
 
   target[[s..e]][-]       spec/file containing target sequence (fasta or nib)
                           [s..e] defines a subrange of the file

    

Greg Von Kuster

tags

participants (1)