details: http://www.bx.psu.edu/hg/galaxy/rev/9e8901230940 changeset: 2748:9e8901230940 user: Dan Blankenberg <dan@bx.psu.edu> date: Tue Sep 22 11:37:41 2009 -0400 description: More friendly error reporting for mutate SNP codon tool. 1 file(s) affected in this change: tools/evolution/mutate_snp_codon.py diffs (89 lines): diff -r 4d32e2d934d0 -r 9e8901230940 tools/evolution/mutate_snp_codon.py --- a/tools/evolution/mutate_snp_codon.py Tue Sep 22 10:55:50 2009 -0400 +++ b/tools/evolution/mutate_snp_codon.py Tue Sep 22 11:37:41 2009 -0400 @@ -34,32 +34,66 @@ DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) skipped_lines = 0 - for line in open( input_file ): + errors = {} + for name, message in [ ('max_field_index','not enough fields'), ( 'codon_len', 'codon length must be 3' ), ( 'codon_seq', 'codon sequence must have length 3' ), ( 'snp_len', 'SNP length must be 3' ), ( 'snp_observed', 'SNP observed values must have length 3' ), ( 'empty_comment', 'empty or comment'), ( 'no_overlap', 'codon and SNP do not overlap' ) ]: + errors[ name ] = { 'count':0, 'message':message } + line_count = 0 + for line_count, line in enumerate( open( input_file ) ): line = line.rstrip( '\n\r' ) if line and not line.startswith( '#' ): fields = line.split( '\t' ) if max_field_index >= len( fields ): skipped_lines += 1 + errors[ 'max_field_index' ]['count'] += 1 continue + + #read codon info codon_chrom = fields[codon_chrom_col] codon_start = int( fields[codon_start_col] ) codon_end = int( fields[codon_end_col] ) + if codon_end - codon_start != 3: + #codons must be length 3 + skipped_lines += 1 + errors[ 'codon_len' ]['count'] += 1 + continue codon_strand = strandify( fields, codon_strand_col ) codon_seq = fields[codon_seq_col].upper() + if len( codon_seq ) != 3: + #codon sequence must have length 3 + skipped_lines += 1 + errors[ 'codon_seq' ]['count'] += 1 + continue + #read snp info snp_chrom = fields[snp_chrom_col] snp_start = int( fields[snp_start_col] ) snp_end = int( fields[snp_end_col] ) + if snp_end - snp_start != 1: + #snps must be length 1 + skipped_lines += 1 + errors[ 'snp_len' ]['count'] += 1 + continue snp_strand = strandify( fields, snp_strand_col ) snp_observed = fields[snp_observed_col].split( '/' ) + snp_observed = [ observed for observed in snp_observed if len( observed ) == 1 ] + if not snp_observed: + #sequence replacements must be length 1 + skipped_lines += 1 + errors[ 'snp_observed' ]['count'] += 1 + continue + + #Determine index of replacement for observed values into codon + offset = snp_start - codon_start + #Extract DNA on neg strand codons will have positions reversed relative to interval positions; i.e. position 0 == position 2 + if codon_strand == '-': + offset = 2 - offset + if offset < 0 and offset > 2: #assert offset >= 0 and offset <= 2, ValueError( 'Impossible offset determined: %s' % offset ) + #codon and snp do not overlap + skipped_lines += 1 + errors[ 'no_overlap' ]['count'] += 1 + continue for observed in snp_observed: - #Extract DNA on neg strand codons will have positions reversed relative to interval positions; i.e. position 0 == position 2 - offset = snp_start - codon_start - if codon_strand == '-': - offset = 2 - offset - assert offset >= 0 and offset <= 2, ValueError( 'Impossible offset determined: %s' % offset ) - if codon_strand != snp_strand: #if our SNP is on a different strand than our codon, take complement of provided observed SNP base observed = observed.translate( DNA_COMP ) @@ -69,5 +103,10 @@ if codon_seq != snp_codon: #only output when we actually have a different codon out.write( "%s\t%s\n" % ( line, snp_codon ) ) - + else: + skipped_lines += 1 + errors[ 'empty_comment' ]['count'] += 1 + if skipped_lines: + print "Skipped %i (%4.2f%%) of %i lines; reasons: %s" % ( skipped_lines, ( float( skipped_lines )/float( line_count ) ) * 100, line_count, ', '.join( [ "%s (%i)" % ( error['message'], error['count'] ) for error in errors.itervalues() if error['count'] ] ) ) + if __name__ == "__main__": main()