details: http://www.bx.psu.edu/hg/galaxy/rev/4e44d29377e3
changeset: 1586:4e44d29377e3
user: Dan Blankenberg <dan(a)bx.psu.edu>
date: Thu Oct 30 14:46:57 2008 -0400
description:
Annotation profiler: Better handling of the case where regions extend beyond known chromosome boundaries.
1 file(s) affected in this change:
tools/annotation_profiler/annotation_profiler_for_interval.py
diffs (55 lines):
diff -r 6259ebdd0e99 -r 4e44d29377e3 tools/annotation_profiler/annotation_profiler_for_interval.py
--- a/tools/annotation_profiler/annotation_profiler_for_interval.py Thu Oct 30 13:35:19 2008 -0400
+++ b/tools/annotation_profiler/annotation_profiler_for_interval.py Thu Oct 30 14:46:57 2008 -0400
@@ -121,14 +121,27 @@
self.table_chromosome_size = {} #dict of dict of table:chrom containing total coverage of table for a chrom
self.table_chromosome_count = {} #dict of dict of table:chrom containing total number of coverage ranges of table for a chrom
self.table_regions_overlaped_count = {} #total number of table regions overlaping user's input intervals (non unique)
- self.interval_table_overlap_count = {} #total number of user input intervals which overlap table
+ self.interval_table_overlap_count = {} #total number of user input intervals which overlap table
+ self.region_size_errors = {} #dictionary of lists of invalid ranges by chromosome
def add_region( self, chrom, start, end ):
- self.total_interval_size += ( end - start )
+ chrom_length = self.chrom_lengths.get( chrom )
+ region_start = min( start, chrom_length )
+ region_end = min( end, chrom_length )
+ region_length = region_end - region_start
+
+ if region_length < 1 or region_start != start or region_end != end:
+ if chrom not in self.region_size_errors:
+ self.region_size_errors[chrom] = []
+ self.region_size_errors[chrom].append( ( start, end ) )
+ if region_length < 1: return
+
+ self.total_interval_size += region_length
self.total_interval_count += 1
if chrom not in self.chromosome_coverage:
- self.chromosome_coverage[chrom] = bx.bitset.BitSet( self.chrom_lengths.get( chrom ) )
- self.chromosome_coverage[chrom].set_range( start, end - start )
- for table_name, coverage, regions in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, start, end ):
+ self.chromosome_coverage[chrom] = bx.bitset.BitSet( chrom_length )
+
+ self.chromosome_coverage[chrom].set_range( region_start, region_length )
+ for table_name, coverage, regions in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, region_start, region_end ):
if table_name not in self.table_coverage:
self.table_coverage[table_name] = 0
self.table_chromosome_size[table_name] = {}
@@ -213,7 +226,17 @@
if keep_empty or total_coverage:
#only output tables that have atleast 1 base covered unless empty are requested
out.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( table_name, table_chromosome_size, table_chromosome_count, table_region_coverage, table_region_count, total_interval_count, total_interval_size, total_coverage, table_regions_overlaped_count, interval_region_overlap_count, nr_interval_count, nr_interval_size, nr_coverage, nr_table_regions_overlaped_count, nr_interval_table_overlap_count ) )
- out.close()
+ out.close()
+
+ #report chrom size errors as needed:
+ if table_coverage_summary.region_size_errors:
+ print "Regions provided extended beyond known chromosome lengths, and have been truncated as necessary, for the following intervals:"
+ for chrom, regions in table_coverage_summary.region_size_errors.items():
+ if len( regions ) > 3:
+ extra_region_info = ", ... "
+ else:
+ extra_region_info = ""
+ print "%s has max length of %s, exceeded by %s%s." % ( chrom, chrom_lengths.get( chrom ), ", ".join( map( str, regions[:3] ) ), extra_region_info )
class ChromosomeLengths:
def __init__( self, filename ):