[hg] galaxy 2748: More friendly error reporting for mutate SNP codon tool.

25 Sep 2009

details:   http://www.bx.psu.edu/hg/galaxy/rev/9e8901230940
changeset: 2748:9e8901230940
user:      Dan Blankenberg <dan@bx.psu.edu>
date:      Tue Sep 22 11:37:41 2009 -0400
description:
More friendly error reporting for mutate SNP codon tool.

1 file(s) affected in this change:

tools/evolution/mutate_snp_codon.py

diffs (89 lines):

diff -r 4d32e2d934d0 -r 9e8901230940 tools/evolution/mutate_snp_codon.py

--- a/tools/evolution/mutate_snp_codon.py	Tue Sep 22 10:55:50 2009 -0400
+++ b/tools/evolution/mutate_snp_codon.py	Tue Sep 22 11:37:41 2009 -0400
@@ -34,32 +34,66 @@
     
     DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
     skipped_lines = 0
-    for line in open( input_file ):
+    errors = {}
+    for name, message in [ ('max_field_index','not enough fields'), ( 'codon_len', 'codon length must be 3' ), ( 'codon_seq', 'codon sequence must have length 3' ), ( 'snp_len', 'SNP length must be 3' ), ( 'snp_observed', 'SNP observed values must have length 3' ), ( 'empty_comment', 'empty or comment'), ( 'no_overlap', 'codon and SNP do not overlap' ) ]:
+        errors[ name ] = { 'count':0, 'message':message }
+    line_count = 0
+    for line_count, line in enumerate( open( input_file ) ):
         line = line.rstrip( '\n\r' )
         if line and not line.startswith( '#' ):
             fields = line.split( '\t' )
             if max_field_index >= len( fields ):
                 skipped_lines += 1
+                errors[ 'max_field_index' ]['count'] += 1
                 continue
+            
+            #read codon info
             codon_chrom = fields[codon_chrom_col]
             codon_start = int( fields[codon_start_col] )
             codon_end = int( fields[codon_end_col] )
+            if codon_end - codon_start != 3:
+                #codons must be length 3
+                skipped_lines += 1
+                errors[ 'codon_len' ]['count'] += 1
+                continue
             codon_strand = strandify( fields, codon_strand_col )
             codon_seq = fields[codon_seq_col].upper()
+            if len( codon_seq ) != 3:
+                #codon sequence must have length 3
+                skipped_lines += 1
+                errors[ 'codon_seq' ]['count'] += 1
+                continue
             
+            #read snp info
             snp_chrom = fields[snp_chrom_col]
             snp_start = int( fields[snp_start_col] )
             snp_end = int( fields[snp_end_col] )
+            if snp_end - snp_start != 1:
+                #snps must be length 1
+                skipped_lines += 1
+                errors[ 'snp_len' ]['count'] += 1
+                continue
             snp_strand = strandify( fields, snp_strand_col )
             snp_observed = fields[snp_observed_col].split( '/' )
+            snp_observed = [ observed for observed in snp_observed if len( observed ) == 1 ]
+            if not snp_observed:
+                #sequence replacements must be length 1
+                skipped_lines += 1
+                errors[ 'snp_observed' ]['count'] += 1
+                continue
+            
+            #Determine index of replacement for observed values into codon
+            offset = snp_start - codon_start
+            #Extract DNA on neg strand codons will have positions reversed relative to interval positions; i.e. position 0 == position 2
+            if codon_strand == '-':
+                offset = 2 - offset
+            if offset < 0 and offset > 2: #assert offset >= 0 and offset <= 2, ValueError( 'Impossible offset determined: %s' % offset )
+                #codon and snp do not overlap
+                skipped_lines += 1
+                errors[ 'no_overlap' ]['count'] += 1
+                continue
             
             for observed in snp_observed:
-                #Extract DNA on neg strand codons will have positions reversed relative to interval positions; i.e. position 0 == position 2
-                offset = snp_start - codon_start
-                if codon_strand == '-':
-                    offset = 2 - offset
-                assert offset >= 0 and offset <= 2, ValueError( 'Impossible offset determined: %s' % offset )
-                
                 if codon_strand != snp_strand:
                     #if our SNP is on a different strand than our codon, take complement of provided observed SNP base
                     observed = observed.translate( DNA_COMP )
@@ -69,5 +103,10 @@
                 
                 if codon_seq != snp_codon: #only output when we actually have a different codon
                     out.write( "%s\t%s\n" % ( line, snp_codon )  )
-
+        else:
+            skipped_lines += 1
+            errors[ 'empty_comment' ]['count'] += 1
+    if skipped_lines:
+        print "Skipped %i (%4.2f%%) of %i lines; reasons: %s" % ( skipped_lines, ( float( skipped_lines )/float( line_count ) ) * 100, line_count, ', '.join( [ "%s (%i)" % ( error['message'], error['count'] ) for error in errors.itervalues() if error['count'] ] ) )
+    
 if __name__ == "__main__": main()

    

Nate Coraor

tags

participants (1)