[hg] galaxy 1552: Maf stats will now use bitset instead of numpy...
details: http://www.bx.psu.edu/hg/galaxy/rev/4b9feffc3ce5 changeset: 1552:4b9feffc3ce5 user: Dan Blankenberg <dan@bx.psu.edu> date: Thu Oct 09 11:23:33 2008 -0400 description: Maf stats will now use bitset instead of numpy.zerros. 3 file(s) affected in this change: test-data/maf_stats_summary_out.dat tools/maf/maf_stats.py tools/maf/maf_stats.xml diffs (85 lines): diff -r fdbf15ea1f8a -r 4b9feffc3ce5 test-data/maf_stats_summary_out.dat --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/maf_stats_summary_out.dat Thu Oct 09 11:23:33 2008 -0400 @@ -0,0 +1,9 @@ +#species nucleotides coverage +mm5 17186 0.9555 +panTro1 17732 0.9858 +danRer1 12263 0.6818 +canFam1 17034 0.9470 +fr1 11318 0.6292 +galGal2 13522 0.7518 +hg17 17987 1.0000 +rn3 16410 0.9123 diff -r fdbf15ea1f8a -r 4b9feffc3ce5 tools/maf/maf_stats.py --- a/tools/maf/maf_stats.py Wed Oct 08 12:00:16 2008 -0400 +++ b/tools/maf/maf_stats.py Thu Oct 09 11:23:33 2008 -0400 @@ -8,7 +8,7 @@ from galaxy import eggs import pkg_resources; pkg_resources.require( "bx-python" ) import bx.intervals.io -from numpy import zeros +from bx.bitset import BitSet from galaxy.tools.util import maf_utilities assert sys.version_info[:2] >= ( 2, 4 ) @@ -56,32 +56,35 @@ #loop through interval file for num_region, region in enumerate( bx.intervals.io.NiceReaderWrapper( open( input_interval_filename, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ) ): src = "%s.%s" % ( dbkey, region.chrom ) - total_length += ( region.end - region.start ) - coverage = { dbkey: zeros( region.end - region.start, dtype = bool ) } + region_length = region.end - region.start + total_length += region_length + coverage = { dbkey: BitSet( region_length ) } for block in maf_utilities.get_chopped_blocks_for_region( index, src, region, force_strand='+' ): #make sure all species are known for c in block.components: spec = c.src.split( '.' )[0] - if spec not in coverage: coverage[spec] = zeros( region.end - region.start, dtype = bool ) + if spec not in coverage: coverage[spec] = BitSet( region_length ) start_offset, alignment = maf_utilities.reduce_block_by_primary_genome( block, dbkey, region.chrom, region.start ) for i in range( len( alignment[dbkey] ) ): for spec, text in alignment.items(): if text[i] != '-': - coverage[spec][start_offset + i] = True + coverage[spec].set( start_offset + i ) if summary: #record summary for key in coverage.keys(): if key not in species_summary: species_summary[key] = 0 - species_summary[key] = species_summary[key] + sum( coverage[key] ) + species_summary[key] = species_summary[key] + coverage[key].count_range() else: #print coverage for interval - out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), dbkey, sum( coverage[dbkey] ), len(coverage[dbkey]) - sum( coverage[dbkey] ) ) ) + coverage_sum = coverage[dbkey].count_range() + out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), dbkey, coverage_sum, region_length - coverage_sum ) ) keys = coverage.keys() keys.remove( dbkey ) keys.sort() for key in keys: - out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), key, sum( coverage[key] ), len(coverage[key]) - sum( coverage[key] ) ) ) + coverage_sum = coverage[key].count_range() + out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), key, coverage_sum, region_length - coverage_sum ) ) if summary: out.write( "#species\tnucleotides\tcoverage\n" ) for spec in species_summary: diff -r fdbf15ea1f8a -r 4b9feffc3ce5 tools/maf/maf_stats.xml --- a/tools/maf/maf_stats.xml Wed Oct 08 12:00:16 2008 -0400 +++ b/tools/maf/maf_stats.xml Thu Oct 09 11:23:33 2008 -0400 @@ -60,6 +60,13 @@ <param name="mafType" value="8_WAY_MULTIZ_hg17"/> <output name="out_file1" file="maf_stats_interval_out.dat"/> <param name="summary" value="false"/> + </test> + <test> + <param name="input1" value="1.bed" dbkey="hg17" format="bed"/> + <param name="maf_source" value="cached"/> + <param name="mafType" value="8_WAY_MULTIZ_hg17"/> + <output name="out_file1" file="maf_stats_summary_out.dat"/> + <param name="summary" value="true"/> </test> </tests> <help>
participants (1)
-
Nate Coraor