details: http://www.bx.psu.edu/hg/galaxy/rev/e929a2d803e4 changeset: 2662:e929a2d803e4 user: Dan Blankenberg <dan@bx.psu.edu> date: Fri Sep 04 10:40:16 2009 -0400 description: First pass at allowing MAF tools to deal with multiple occurrences of a species within a block. Tool versions have been incremented as necessary. These changes should only affect output when an input block has a species appearing more than once, with the exception being the MAF to multiple FASTA blocks converters: the FASTA headers have been revised to include the sequence index for a species in a block as well as the block index. A new tool "Split MAF Blocks by Species" has been added that will split MAF blocks into the complete combination of multiple blocks when a species appears more than once. 29 file(s) affected in this change: lib/galaxy/datatypes/converters/maf_to_fasta_converter.py lib/galaxy/datatypes/converters/maf_to_fasta_converter.xml lib/galaxy/datatypes/converters/maf_to_interval_converter.py lib/galaxy/datatypes/converters/maf_to_interval_converter.xml lib/galaxy/datatypes/sequence.py lib/galaxy/tools/util/maf_utilities.py test-data/cf_maf2fasta_new.dat test-data/maf_split_by_species_collapsed_out.maf test-data/maf_split_by_species_in.maf test-data/maf_split_by_species_not_collapsed_out.maf tool_conf.xml.sample tools/annotation_profiler/annotation_profiler_for_interval.py tools/maf/genebed_maf_to_fasta.xml tools/maf/interval2maf.py tools/maf/interval2maf.xml tools/maf/interval2maf_pairwise.xml tools/maf/interval_maf_to_merged_fasta.py tools/maf/interval_maf_to_merged_fasta.xml tools/maf/maf_filter.py tools/maf/maf_limit_size.py tools/maf/maf_limit_size.xml tools/maf/maf_limit_to_species.py tools/maf/maf_split_by_species.py tools/maf/maf_split_by_species.xml tools/maf/maf_stats.py tools/maf/maf_stats.xml tools/maf/maf_to_fasta.xml tools/maf/maf_to_fasta_concat.py tools/maf/maf_to_fasta_multiple_sets.py diffs (1927 lines): diff -r 99dcba7af5b6 -r e929a2d803e4 
lib/galaxy/datatypes/converters/maf_to_fasta_converter.py --- a/lib/galaxy/datatypes/converters/maf_to_fasta_converter.py Fri Sep 04 10:31:23 2009 -0400 +++ b/lib/galaxy/datatypes/converters/maf_to_fasta_converter.py Fri Sep 04 10:40:16 2009 -0400 @@ -14,12 +14,15 @@ input_name = sys.argv.pop(1) out = open( output_name, 'w' ) count = 0 - for count, maf in enumerate( bx.align.maf.Reader( open( input_name, 'r' ) ) ): - for c in maf.components: - spec, chrom = bx.align.maf.src_split( c.src ) - if not spec or not chrom: - spec = chrom = c.src - out.write( "%s\n" % maf_utilities.get_fasta_header( c, suffix = "%s_%i" % ( spec, count ) ) ) + for count, block in enumerate( bx.align.maf.Reader( open( input_name, 'r' ) ) ): + spec_counts = {} + for c in block.components: + spec, chrom = maf_utilities.src_split( c.src ) + if spec not in spec_counts: + spec_counts[ spec ] = 0 + else: + spec_counts[ spec ] += 1 + out.write( "%s\n" % maf_utilities.get_fasta_header( c, { 'block_index' : count, 'species' : spec, 'sequence_index' : spec_counts[ spec ] }, suffix = "%s_%i_%i" % ( spec, count, spec_counts[ spec ] ) ) ) out.write( "%s\n" % c.text ) out.write( "\n" ) out.close() @@ -27,3 +30,12 @@ if __name__ == "__main__": __main__() + + for component in block.components: + spec, chrom = maf_utilities.src_split( component.src ) + if spec not in spec_counts: + spec_counts[ spec ] = 0 + else: + spec_counts[ spec ] += 1 + file_out.write( "%s\n" % maf_utilities.get_fasta_header( component, { 'block_index' : block_num, 'species' : spec, 'sequence_index' : spec_counts[ spec ] }, suffix = "%s_%i_%i" % ( spec, block_num, spec_counts[ spec ] ) ) ) + file_out.write( "%s\n" % component.text ) diff -r 99dcba7af5b6 -r e929a2d803e4 lib/galaxy/datatypes/converters/maf_to_fasta_converter.xml --- a/lib/galaxy/datatypes/converters/maf_to_fasta_converter.xml Fri Sep 04 10:31:23 2009 -0400 +++ b/lib/galaxy/datatypes/converters/maf_to_fasta_converter.xml Fri Sep 04 10:40:16 2009 -0400 @@ -1,4 +1,4 @@ 
-<tool id="CONVERTER_maf_to_fasta_0" name="Convert MAF to Fasta"> +<tool id="CONVERTER_maf_to_fasta_0" name="Convert MAF to Fasta" version="1.0.1"> <!-- <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description> --> <command interpreter="python">maf_to_fasta_converter.py $output1 $input1</command> <inputs> diff -r 99dcba7af5b6 -r e929a2d803e4 lib/galaxy/datatypes/converters/maf_to_interval_converter.py --- a/lib/galaxy/datatypes/converters/maf_to_interval_converter.py Fri Sep 04 10:31:23 2009 -0400 +++ b/lib/galaxy/datatypes/converters/maf_to_interval_converter.py Fri Sep 04 10:40:16 2009 -0400 @@ -4,7 +4,8 @@ import sys from galaxy import eggs import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.align.maf +import bx.align.maf +from galaxy.tools.util import maf_utilities assert sys.version_info[:2] >= ( 2, 4 ) @@ -17,15 +18,15 @@ #write interval header line out.write( "#chrom\tstart\tend\tstrand\n" ) try: - for maf in bx.align.maf.Reader( open(input_name, 'r') ): - c = maf.get_component_by_src_start(species) - if c is not None: - out.write( "%s\t%i\t%i\t%s\n" % (bx.align.src_split(c.src)[-1], c.get_forward_strand_start(), c.get_forward_strand_end(), c.strand) ) - count += 1 + for block in bx.align.maf.Reader( open( input_name, 'r' ) ): + for c in maf_utilities.iter_components_by_src_start( block, species ): + if c is not None: + out.write( "%s\t%i\t%i\t%s\n" % ( bx.align.src_split( c.src )[-1], c.get_forward_strand_start(), c.get_forward_strand_end(), c.strand ) ) + count += 1 except Exception, e: print >> sys.stderr, "There was a problem processing your input: %s" % e out.close() - print "%i MAF blocks converted to Genomic Intervals for species %s." % (count, species) + print "%i MAF blocks converted to Genomic Intervals for species %s." 
% ( count, species ) if __name__ == "__main__": __main__() diff -r 99dcba7af5b6 -r e929a2d803e4 lib/galaxy/datatypes/converters/maf_to_interval_converter.xml --- a/lib/galaxy/datatypes/converters/maf_to_interval_converter.xml Fri Sep 04 10:31:23 2009 -0400 +++ b/lib/galaxy/datatypes/converters/maf_to_interval_converter.xml Fri Sep 04 10:40:16 2009 -0400 @@ -1,4 +1,4 @@ -<tool id="CONVERTER_maf_to_interval_0" name="Convert MAF to Genomic Intervals"> +<tool id="CONVERTER_maf_to_interval_0" name="Convert MAF to Genomic Intervals" version="1.0.1"> <!-- <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description> --> <command interpreter="python">maf_to_interval_converter.py $output1 $input1 ${input1.metadata.dbkey}</command> <inputs> diff -r 99dcba7af5b6 -r e929a2d803e4 lib/galaxy/datatypes/sequence.py --- a/lib/galaxy/datatypes/sequence.py Fri Sep 04 10:31:23 2009 -0400 +++ b/lib/galaxy/datatypes/sequence.py Fri Sep 04 10:40:16 2009 -0400 @@ -22,7 +22,7 @@ pass class Alignment( Sequence ): - """Class describing an alignmnet""" + """Class describing an alignment""" """Add metadata elements""" MetadataElement( name="species", desc="Species", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None ) @@ -316,6 +316,78 @@ import bx.align.maf except: pass +#trying to import maf_utilities here throws an ImportError due to a circular import between jobs and tools: +#from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes +#Traceback (most recent call last): +# File "./scripts/paster.py", line 27, in <module> +# command.run() +# File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 78, in run +# File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 117, in invoke +# File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 212, in run +# File "build/bdist.solaris-2.11-i86pc/egg/paste/script/serve.py", line 227, in command +# File 
"build/bdist.solaris-2.11-i86pc/egg/paste/script/serve.py", line 250, in loadapp +# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 193, in loadapp +# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 213, in loadobj +# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 237, in loadcontext +# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 267, in _loadconfig +# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 397, in get_context +# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 439, in _context_from_explicit +# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 18, in import_string +# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/pkg_resources.py", line 1912, in load +# entry = __import__(self.module_name, globals(),globals(), ['__name__']) +# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/web/buildapp.py", line 18, in <module> +# from galaxy import config, jobs, util, tools +# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/jobs/__init__.py", line 3, in <module> +# from galaxy import util, model +# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/model/__init__.py", line 13, in <module> +# import galaxy.datatypes.registry +# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/datatypes/registry.py", line 6, in <module> +# import data, tabular, interval, images, sequence, qualityscore, genetics, xml, coverage, tracks, chrominfo +# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/datatypes/sequence.py", line 344, in <module> +# from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes +# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/tools/__init__.py", line 15, in <module> +# from galaxy import util, jobs, model +#ImportError: cannot import name jobs +#so we'll copy and paste for now...terribly icky +#*** ANYCHANGE TO THIS 
METHOD HERE OR IN maf_utilities MUST BE PROPAGATED *** +def COPIED_build_maf_index_species_chromosomes( filename, index_species = None ): + species = [] + species_chromosomes = {} + indexes = bx.interval_index_file.Indexes() + try: + maf_reader = bx.align.maf.Reader( open( filename ) ) + while True: + pos = maf_reader.file.tell() + block = maf_reader.next() + if block is None: break + for c in block.components: + spec = c.src + chrom = None + if "." in spec: + spec, chrom = spec.split( ".", 1 ) + if spec not in species: + species.append( spec ) + species_chromosomes[spec] = [] + if chrom and chrom not in species_chromosomes[spec]: + species_chromosomes[spec].append( chrom ) + if index_species is None or spec in index_species: + forward_strand_start = c.forward_strand_start + forward_strand_end = c.forward_strand_end + try: + forward_strand_start = int( forward_strand_start ) + forward_strand_end = int( forward_strand_end ) + except ValueError: + continue #start and end are not integers, can't add component to index, goto next component + #this likely only occurs when parse_e_rows is True? + #could a species exist as only e rows? should the + if forward_strand_end > forward_strand_start: + #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed + indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size ) + except Exception, e: + #most likely a bad MAF + log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) ) + return ( None, [], {} ) + return ( indexes, species, species_chromosomes ) class Maf( Alignment ): """Class describing a Maf alignment""" @@ -333,38 +405,8 @@ Parses and sets species, chromosomes, index from MAF file. 
""" #these metadata values are not accessable by users, always overwrite + indexes, species, species_chromosomes = COPIED_build_maf_index_species_chromosomes( dataset.file_name ) - try: - maf_reader = bx.align.maf.Reader( open( dataset.file_name ) ) - except: - return #not a maf file - species = [] - species_chromosomes = {} - indexes = bx.interval_index_file.Indexes() - while True: - pos = maf_reader.file.tell() - block = maf_reader.next() - if block is None: break - for c in block.components: - spec = c.src - chrom = None - if "." in spec: - spec, chrom = spec.split( ".", 1 ) - if spec not in species: - species.append(spec) - species_chromosomes[spec] = [] - if chrom and chrom not in species_chromosomes[spec]: - species_chromosomes[spec].append( chrom ) - forward_strand_start = c.forward_strand_start - forward_strand_end = c.forward_strand_end - try: - forward_strand_start = int( forward_strand_start ) - forward_strand_end = int( forward_strand_end ) - except ValueError: - continue #start and end are not integers, can't add component to index, goto next component - if forward_strand_end > forward_strand_start: - #require positive length; i.e. 
certain lines have start = end = 0 and cannot be indexed - indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size ) dataset.metadata.species = species #only overwrite the contents if our newly determined chromosomes don't match stored chrom_file = dataset.metadata.species_chromosomes diff -r 99dcba7af5b6 -r e929a2d803e4 lib/galaxy/tools/util/maf_utilities.py --- a/lib/galaxy/tools/util/maf_utilities.py Fri Sep 04 10:31:23 2009 -0400 +++ b/lib/galaxy/tools/util/maf_utilities.py Fri Sep 04 10:40:16 2009 -0400 @@ -7,10 +7,41 @@ import bx.align.maf import bx.intervals import bx.interval_index_file -import sys, os, string, tempfile +import sys, os, string, tempfile +import logging +from copy import deepcopy assert sys.version_info[:2] >= ( 2, 4 ) - + +log = logging.getLogger(__name__) + + +GAP_CHARS = [ '-' ] +SRC_SPLIT_CHAR = '.' + +def src_split( src ): + spec, chrom = bx.align.maf.src_split( src ) + if None in [ spec, chrom ]: + spec = chrom = src + return spec, chrom + +def src_merge( spec, chrom, contig = None ): + if None in [ spec, chrom ]: + spec = chrom = spec or chrom + return bx.align.maf.src_merge( spec, chrom, contig ) + +def get_species_in_block( block ): + species = [] + for c in block.components: + spec, chrom = src_split( c.src ) + if spec not in species: + species.append( spec ) + return species + +def tool_fail( msg = "Unknown Error" ): + print >> sys.stderr, "Fatal Error: %s" % msg + sys.exit() + #an object corresponding to a reference layered alignment class RegionAlignment( object ): @@ -153,69 +184,187 @@ except: return build_maf_index( maf_file, species = species ) +#*** ANYCHANGE TO THIS METHOD HERE OR IN galaxy.datatypes.sequences MUST BE PROPAGATED *** +def build_maf_index_species_chromosomes( filename, index_species = None ): + species = [] + species_chromosomes = {} + indexes = bx.interval_index_file.Indexes() + try: + maf_reader = bx.align.maf.Reader( open( filename ) ) + while True: + pos = maf_reader.file.tell() 
+ block = maf_reader.next() + if block is None: break + for c in block.components: + spec = c.src + chrom = None + if "." in spec: + spec, chrom = spec.split( ".", 1 ) + if spec not in species: + species.append( spec ) + species_chromosomes[spec] = [] + if chrom and chrom not in species_chromosomes[spec]: + species_chromosomes[spec].append( chrom ) + if index_species is None or spec in index_species: + forward_strand_start = c.forward_strand_start + forward_strand_end = c.forward_strand_end + try: + forward_strand_start = int( forward_strand_start ) + forward_strand_end = int( forward_strand_end ) + except ValueError: + continue #start and end are not integers, can't add component to index, goto next component + #this likely only occurs when parse_e_rows is True? + #could a species exist as only e rows? should the + if forward_strand_end > forward_strand_start: + #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed + indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size ) + except Exception, e: + #most likely a bad MAF + log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) ) + return ( None, [], {} ) + return ( indexes, species, species_chromosomes ) #builds and returns ( index, index_filename ) for specified maf_file def build_maf_index( maf_file, species = None ): - indexes = bx.interval_index_file.Indexes() - try: - maf_reader = bx.align.maf.Reader( open( maf_file ) ) - # Need to be a bit tricky in our iteration here to get the 'tells' right - while True: - pos = maf_reader.file.tell() - block = maf_reader.next() - if block is None: break - for c in block.components: - if species is not None and c.src.split( "." 
)[0] not in species: - continue - indexes.add( c.src, c.forward_strand_start, c.forward_strand_end, pos ) + indexes, found_species, species_chromosomes = build_maf_index_species_chromosomes( maf_file, species ) + if indexes is not None: fd, index_filename = tempfile.mkstemp() out = os.fdopen( fd, 'w' ) indexes.write( out ) out.close() - return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), index_filename ) - except: - return ( None, None ) - -def chop_block_by_region( block, src, region, species = None, mincols = 0, force_strand = None ): - ref = block.get_component_by_src( src ) - #We want our block coordinates to be from positive strand - if ref.strand == "-": - block = block.reverse_complement() - ref = block.get_component_by_src( src ) + return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), index_filename ) + return ( None, None ) + +def component_overlaps_region( c, region ): + if c is None: return False + start, end = c.get_forward_strand_start(), c.get_forward_strand_end() + if region.start >= end or region.end <= start: + return False + return True + +def chop_block_by_region( block, src, region, species = None, mincols = 0 ): + # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far: + # behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end ) + # whether-or-not this is the 'correct' behavior is questionable, but this will at least maintain consistency + # comments welcome + slice_start = block.text_size #max for the min() + slice_end = 0 #min for the max() + old_score = block.score #save old score for later use + # We no longer assume only one occurance of src per block, so we need to check them all + for c in iter_components_by_src( block, src ): + if 
component_overlaps_region( c, region ): + if c.text is not None: + rev_strand = False + if c.strand == "-": + #We want our coord_to_col coordinates to be returned from positive stranded component + rev_strand = True + c = c.reverse_complement() + start = max( region.start, c.start ) + end = min( region.end, c.end ) + start = c.coord_to_col( start ) + end = c.coord_to_col( end ) + if rev_strand: + #need to orient slice coordinates to the original block direction + slice_len = end - start + end = len( c.text ) - start + start = end - slice_len + slice_start = min( start, slice_start ) + slice_end = max( end, slice_end ) + + if slice_start < slice_end: + block = block.slice( slice_start, slice_end ) + if block.text_size > mincols: + # restore old score, may not be accurate, but it is better than 0 for everything? + block.score = old_score + if species is not None: + block = block.limit_to_species( species ) + block.remove_all_gap_columns() + return block + return None - #save old score here for later use - old_score = block.score - slice_start = max( region.start, ref.start ) - slice_end = min( region.end, ref.end ) - - #slice block by reference species at determined limits - block = block.slice_by_component( ref, slice_start, slice_end ) - - if block.text_size > mincols: - if ( force_strand is None and region.strand != ref.strand ) or ( force_strand is not None and force_strand != ref.strand ): - block = block.reverse_complement() - # restore old score, may not be accurate, but it is better than 0 for everything - block.score = old_score - if species is not None: - block = block.limit_to_species( species ) - block.remove_all_gap_columns() - return block - return None +def orient_block_by_region( block, src, region, force_strand = None ): + #loop through components matching src, + #make sure each of these components overlap region + #cache strand for each of overlaping regions + #if force_strand / region.strand not in strand cache, reverse complement + ### we could 
have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing + strands = [ c.strand for c in iter_components_by_src( block, src ) if component_overlaps_region( c, region ) ] + if strands and ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ): + block = block.reverse_complement() + return block + +def get_oriented_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ): + for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ): + yield block +def get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ): + for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ): + yield orient_block_by_region( block, src, region, force_strand ), idx, offset + +#split a block with multiple occurances of src into one block per src +def iter_blocks_split_by_src( block, src ): + for src_c in iter_components_by_src( block, src ): + new_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) + new_block.text_size = block.text_size + for c in block.components: + if c == src_c or c.src != src: + new_block.add_component( deepcopy( c ) ) #components have reference to alignment, dont want to loose reference to original alignment block in original components + yield new_block + +#split a block into multiple blocks with all combinations of a species appearing only once per block +def iter_blocks_split_by_species( block, species = None ): + def __split_components_by_species( components_by_species, new_block ): + if components_by_species: + #more species with components to add to this block + components_by_species = deepcopy( components_by_species ) + spec_comps = components_by_species.pop( 0 ) 
+ for c in spec_comps: + newer_block = deepcopy( new_block ) + newer_block.add_component( deepcopy( c ) ) + for value in __split_components_by_species( components_by_species, newer_block ): + yield value + else: + #no more components to add, yield this block + yield new_block + + #divide components by species + spec_dict = {} + if not species: + species = [] + for c in block.components: + spec, chrom = src_split( c.src ) + if spec not in spec_dict: + spec_dict[ spec ] = [] + species.append( spec ) + spec_dict[ spec ].append( c ) + else: + for spec in species: + spec_dict[ spec ] = [] + for c in iter_components_by_src_start( block, spec ): + spec_dict[ spec ].append( c ) + + empty_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) #should we copy attributes? + empty_block.text_size = block.text_size + #call recursive function to split into each combo of spec/blocks + for value in __split_components_by_species( spec_dict.values(), empty_block ): + sort_block_components_by_block( value, block ) #restore original component order + yield value + + #generator yielding only chopped and valid blocks for a specified region -def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ): - for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ): +def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ): + for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ): yield block -def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ): +def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ): for block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ): - block = chop_block_by_region( block, 
src, region, species, mincols, force_strand ) + block = chop_block_by_region( block, src, region, species, mincols ) if block is not None: yield block, idx, offset #returns a filled region alignment for specified regions -def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0 ): +def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): if species is not None: alignment = RegionAlignment( end - start, species ) else: alignment = RegionAlignment( end - start, primary_species ) - return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols ) + return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps ) #reduces a block to only positions exisiting in the src provided def reduce_block_by_primary_genome( block, species, chromosome, region_start ): @@ -237,13 +386,11 @@ return ( start_offset, species_texts ) #fills a region alignment -def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0 ): +def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): region = bx.intervals.Interval( start, end ) region.chrom = chrom region.strand = strand primary_src = "%s.%s" % ( primary_species, chrom ) - - #Order blocks overlaping this position by score, lowest first blocks = [] @@ -255,28 +402,40 @@ break else: blocks.append( ( score, idx, offset ) ) - + + gap_chars_tuple = tuple( GAP_CHARS ) + gap_chars_str = ''.join( GAP_CHARS ) #Loop through ordered blocks and layer by increasing score - for block_dict in blocks: - block = chop_block_by_region( block_dict[1].get_at_offset( block_dict[2] ), primary_src, region, species, mincols, strand ) - if block is None: continue - 
start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start ) - for spec, text in species_texts.items(): - try: - alignment.set_range( start_offset, spec, text ) - except: - #species/sequence for species does not exist - pass - + for block_dict in blocks: for block in iter_blocks_split_by_species( block_dict[1].get_at_offset( block_dict[2] ) ): #need to handle each occurance of sequence in block seperately + if component_overlaps_region( block.get_component_by_src( primary_src ), region ): + block = chop_block_by_region( block, primary_src, region, species, mincols ) #chop block + block = orient_block_by_region( block, primary_src, region ) #orient block + start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start ) + for spec, text in species_texts.items(): + #we should trim gaps from both sides, since these are not positions in this species genome (sequence) + text = text.rstrip( gap_chars_str ) + gap_offset = 0 + while text.startswith( gap_chars_tuple ): + gap_offset += 1 + text = text[1:] + if not text: + break + if text: + if overwrite_with_gaps: + alignment.set_range( start_offset + gap_offset, spec, text ) + else: + for i, char in enumerate( text ): + if char not in GAP_CHARS: + alignment.set_position( start_offset + gap_offset + i, spec, char ) return alignment #returns a filled spliced region alignment for specified region with start and end lists -def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0 ): +def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): #create spliced alignment object if species is not None: alignment = SplicedAlignment( starts, ends, species ) else: alignment = SplicedAlignment( starts, ends, [primary_species] ) for exon in alignment.exons: - fill_region_alignment( exon, index, 
primary_species, chrom, exon.start, exon.end, strand, species, mincols) + fill_region_alignment( exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps ) return alignment #loop through string array, only return non-commented lines @@ -319,29 +478,36 @@ starts.append( start ) ends.append( end ) return ( starts, ends, fields ) - -def get_species_in_maf( maf_filename ): - try: - species={} - - file_in = open( maf_filename, 'r' ) - maf_reader = maf.Reader( file_in ) - - for i, m in enumerate( maf_reader ): - l = m.components - for c in l: - spec, chrom = maf.src_split( c.src ) - if not spec or not chrom: - spec = chrom = c.src - species[spec] = spec - - file_in.close() - - species = species.keys() - species.sort() - return species - except: - return [] + +def iter_components_by_src( block, src ): + for c in block.components: + if c.src == src: + yield c + +def get_components_by_src( block, src ): + return [ value for value in iter_components_by_src( block, src ) ] + +def iter_components_by_src_start( block, src ): + for c in block.components: + if c.src.startswith( src ): + yield c + +def get_components_by_src_start( block, src ): + return [ value for value in iter_components_by_src_start( block, src ) ] + +def sort_block_components_by_block( block1, block2 ): + #orders the components in block1 by the index of the component in block2 + #block1 must be a subset of block2 + #occurs in-place + return block1.components.sort( cmp = lambda x, y: block2.components.index( x ) - block2.components.index( y ) ) + +def get_species_in_maf( maf_filename ): + species = [] + for block in maf.Reader( open( maf_filename ) ): + for spec in get_species_in_block( block ): + if spec not in species: + species.append( spec ) + return species def parse_species_option( species ): if species: diff -r 99dcba7af5b6 -r e929a2d803e4 test-data/cf_maf2fasta_new.dat --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cf_maf2fasta_new.dat Fri 
Sep 04 10:40:16 2009 -0400 @@ -0,0 +1,134 @@ +>hg17.chr7(+):127471195-127471526|sequence_index=0|block_index=0|species=hg17|hg17_0_0 +gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAA +>panTro1.chr6(+):129885076-129885407|sequence_index=0|block_index=0|species=panTro1|panTro1_0_0 +gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAA +>rheMac2.chr3(+):165787989-165788319|sequence_index=0|block_index=0|species=rheMac2|rheMac2_0_0 +gcttgccatcttttgatgctcttgggaatccagcagctgtcaccat-taaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCATAGGCAtgagtcaggccatagtgctggacccacagaattatgagctaaataagtagtgttgggttaagtcactaagttttaggcatagtgtgttatgtagcTCACAAACATATAAGACTGTGTGTTTTTTGACTGGAGGAAGAGATGCCATAAAGACCACCTTTTGAAACTTCTCAAATACTGCCATTGATGTGCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAA +>rn3.chr4(+):56178191-56178473|sequence_index=0|block_index=0|species=rn3|rn3_0_0 
+CTTCACTCTCATTTGCTGTT----------------CTGTCACTATGGAGACAAACACAGGCTAGCCCAGTTACTATCTTGATCACAGCAGCTGT----CAGCTAGCTGCCACTCACAGGAATAAGGCCATACCATT-GATCCACTGAACCTTGATCTAGGAATTTGGC----------------------TGGGGCCAGTTTGCGGTGTCACTCATGA--CTCTAAGATTGTGTGTTTG----CTCCAGGAAGAGACGGCAAGAGGATTACCTTTAAAAGGTTCGG-AGTCTAGCTGTAGACAGCCCAATGGG---------------------------------------------------------TATAAC---------AATACTCACTAA +>mm7.chr6(+):28984529-28984886|sequence_index=0|block_index=0|species=mm7|mm7_0_0 +CTCCACTCTCGTTTGCTGTT----------------CTGTCACCATGGAAACAAACG-AGGGTGGTCCAGTTACTATCTTG---ACTGCAGCTGG----CAGTCAGTTGCCACT--CAGGAATAAGGCTATGCCATT-GATCCACTGAACCGTGATCTGGAAACCTGGCTGTTGTTT-------CAAGCCTTGGGGCCAGTTTGCGGTGTTACTCATGA--CTCTAAGATCGTGTGCTTG----CTGCAGGAAGAGACAGCAAGGGGGTTACATTTAAAAAGCCCCC-AGTTTAGCTATAGGCAGGCCAACAGGTGTAAAAATACTCACTAGTAATGGGCTGAACTCATGGAGGTAGCATTAGTGAGACACTGTAACTGTTTTTTTAAAAATCACTAA + +>hg17.chr7(+):127471526-127471584|sequence_index=0|block_index=1|species=hg17|hg17_1_0 +AATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG +>mm7.chr6(+):28984886-28984940|sequence_index=0|block_index=1|species=mm7|mm7_1_0 +----AACGTTTCATTGATTGCTCATCATTTAAAAAAAGAAATTCCTCAGTGGAAGAGG +>rheMac2.chr3(+):165788319-165788377|sequence_index=0|block_index=1|species=rheMac2|rheMac2_1_0 +AATTTGTGGTTTATTTATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG +>panTro1.chr6(+):129885407-129885465|sequence_index=0|block_index=1|species=panTro1|panTro1_1_0 +AATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG + +>hg17.chr7(+):127471584-127471688|sequence_index=0|block_index=2|species=hg17|hg17_2_0 +GAGATATTT-GGggaaatttt-gtatagactagctt--tcacgatgttagggaattattattgtgtgataatggtcttgcagttac-acagaaattcttcctta-ttttt +>panTro1.chr6(+):129885465-129885569|sequence_index=0|block_index=2|species=panTro1|panTro1_2_0 +GAGACATTT-GGggaaatttt-gtatagactagctt--tcacgatgttagggagttattattgtgtgataatggtcttgcagttac-acagaaattcttcctta-ttttt +>rheMac2.chr3(+):165788377-165788482|sequence_index=0|block_index=2|species=rheMac2|rheMac2_2_0 
+GAGATATTT-GGggaaatttg-gtatagactagctt--tcatgatgtaagggagttatttttgtgtgataatggccctacagttac-acagaaattcttccttatttttt +>canFam2.chr14(-):11090703-11090811|sequence_index=0|block_index=2|species=canFam2|canFam2_2_0 +gagatattt-gggggaatttgaatgtagtgttgctcttttgtgatgctaagaaattataattgtctgatgatagtctcgtggttatgggggaaatgcttcctta-ttttt +>bosTau2.chr4(-):50243931-50244034|sequence_index=0|block_index=2|species=bosTau2|bosTau2_2_0 +-agacattg-ggtaaaattcaaatgcagactagctc----atgatgttaaagaattactcttgtgtggtaatggtcttgtgatagagatagaaatgcttcctta-ttttt +>rn3.chr4(+):56182200-56182295|sequence_index=0|block_index=2|species=rn3|rn3_2_0 +----TATTTGGGGGAAATATG-ATGTGCA----CTT--CCATGATCTTAAAGAATTGCTACTGTTTGATAGTGATCTTATGGTTAA-ATAAAAAAAAT--CTTA-GTTGT +>dasNov1.scaffold_256527(+):298-392|sequence_index=0|block_index=2|species=dasNov1|dasNov1_2_0 +GAGACATTT-GGAGAAATTTG-----------Aatt--tcatgatgttaaggaattacttttgtatgatgatggtcttgtggctat-gtagaatttcttccgtg-tttta + +>hg17.chr7(+):127471688-127471871|sequence_index=0|block_index=3|species=hg17|hg17_3_0 +tgggaagcaccaaagta-------gggataaaatgtcatgatgtgtgcaatacactttaaaatgtttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataattattaaatctaggt-----gatgggtatattgtagttcactatagtattgcacacttttctgtatgtttaaatttttcattta--------------------------aaaa- +>panTro1.chr6(+):129885569-129885752|sequence_index=0|block_index=3|species=panTro1|panTro1_3_0 +tgggaaacaccaaagta-------gggataaaatgtcatgatgtgtgcaatacgctttaaaatatttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataattattaaatctaggt-----gatgggtatattgtagttcactatagtattgcacacttttctgtatgtttaaaattttcattta--------------------------aaaa- +>rheMac2.chr3(+):165788482-165788684|sequence_index=0|block_index=3|species=rheMac2|rheMac2_3_0 
+tgggaagcacaaaagta-------gggataaaatgtcatgatgtgtacaatatgctttaaaatatttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataactgttaaatctaggt-----gttgggtatattgcagttcattatgttattgcacacttttctgtgtgtttaaaattttcatttaaaaatatgttttaaaaatg-------aaaa- +>rn3.chr4(+):56182295-56182489|sequence_index=0|block_index=3|species=rn3|rn3_3_0 +TAGAAAATACTCAAATATTTAGGGGCGTGACAATGTCACAGTGTCTGCAATTTGCTTTAAAGATTTTT-----AAA----------TATTTAAAAAAGTTTTAATAATTTTGAAAAACTGAAGCTACACTATG---GGAAGTGGTAATTGTTACATATGGGT-----AATAAGTAT-----AATTCGTTATATTAT-------TTTTC------TTAGAATTTTTCATTTG--------------------------AAAA- +>bosTau2.chr4(-):50243792-50243930|sequence_index=0|block_index=3|species=bosTau2|bosTau2_3_0 +agataaacacttaagtattta---aggatgaaacgccctgatgtttgtaatttgctttagaatattttagccaaaa----------gaattaa-------------------------tgatgc--aaatatg--caaaaagagta--cgttaaacctaa-----------------------------------------------------atttgCGATTttcattta--------------------------aaaa- +>canFam2.chr14(-):11090345-11090505|sequence_index=0|block_index=3|species=canFam2|canFam2_3_0 +agacacaaactgaagtattta---aggatgaaatgtcatgatgtttgcaattggctttaaaatattttagccaaaa-----------agtaaa-------------------------tgaagc--AAATATG--GGAAGACAATAATCATTAAATCTAGGT-----GATGCATAC---------------------------TTTTCCATATGTTTGAAATTTTCATTTA--------------------------AAAA- +>dasNov1.scaffold_256527(+):393-625|sequence_index=0|block_index=3|species=dasNov1|dasNov1_3_0 +agacgcatgctgaagcatgta---aggataaaatgtcgtggtgtttgtaatttattctaaaacattttagccaaaaacaaataaataaataaa-------------------------tgaagc--aaatatgggggaaatgtttaattgttaaatctagatttaacacggtatataccgtgcttcattatactagtctctacttttccatgtgtttgaaattttCATTAAAATGTTTGTTTGTTGTCTGTTTTAATGAAAT + +>hg17.chr7(+):127471871-127471910|sequence_index=0|block_index=4|species=hg17|hg17_4_0 +actttgagctagacaccaggctatgagcta-ggagcatag +>rheMac2.chr3(+):165788684-165788723|sequence_index=0|block_index=4|species=rheMac2|rheMac2_4_0 +actttgagctagataccaggttatgagcta-ggagcatag 
+>panTro1.chr6(+):129885752-129885791|sequence_index=0|block_index=4|species=panTro1|panTro1_4_0 +actttgagctagacaccaggctatgagcta-ggagcatag +>bosTau2.chr4(-):50243734-50243773|sequence_index=0|block_index=4|species=bosTau2|bosTau2_4_0 +tcttcgtgcaacgcacggggctatcaatgt-gggatacag +>canFam2.chr14(-):11090081-11090120|sequence_index=0|block_index=4|species=canFam2|canFam2_4_0 +ACATCAtgctagatcctggactatgagctg-ggtatatag +>dasNov1.scaffold_256527(+):625-665|sequence_index=0|block_index=4|species=dasNov1|dasNov1_4_0 +CCTTTGTGCTAGCCACTGGGATGAAAGCTAGGGAACACAG + +>hg17.chr7(+):127471910-127472074|sequence_index=0|block_index=5|species=hg17|hg17_5_0 +caatgaccaa----------------------------------------------------------------------------------------------atagactcctaccaa-ctc-aaagaatgcacattctCTG-GGAAACATGTTTCCATTAGGAAGCCTCGAATGCAATGTGACTGTGGTCTCCAGGACCTG-TGTGATCCTGGCTTTTCCTGTTCCCTCCG---CATCATCACTGCAGGTGTGTTTTCCCAAG +>panTro1.chr6(+):129885791-129885955|sequence_index=0|block_index=5|species=panTro1|panTro1_5_0 +caatgaccaa----------------------------------------------------------------------------------------------atagactcctaccaa-ctc-aaagaatgcacattctCTG-GGAAACATGTTTCCATTAGGAAGCCTCGAATGCAATGTGACTGTGGTCTCCAGGACATG-TGTGATCCTGGCTTTTCCTGTTCCCTCTG---CATCATCACTGCAGGTGTATTTTCCCAAG +>rheMac2.chr3(+):165788723-165788885|sequence_index=0|block_index=5|species=rheMac2|rheMac2_5_0 +caatgaccaa----------------------------------------------------------------------------------------------atagacccctaccga-ctc-aaagaatgtacattctTTG-GGAAACATGTTTCCATCAGAAAATCTCAAATGCAATGTGACTGGGGTCTCCAGGACCTG-TGTGAGCCTGGCTTTTCCTGTTCCCTCCA---CATCATCACTGCAGGTGTATTTTCCC--G +>mm7.chr6(+):28990714-28990875|sequence_index=0|block_index=5|species=mm7|mm7_5_0 +caaaaaccaa------------------------------------------------------------------------------------------------aaaaACCTATAGC-CTC-ACAGGGTGGGTTGTCTTTG-AGGAACATGCATCCGCTAGAAAGTCCCAAGTACACTATGACAGTTG--CCCAGGCCCCGCCTTAAACCTGGTTTTCCTGGTTTCTTTCA---CATCATTACCACGAATATATTTCCTCAAG 
+>rn3.chr4(+):56183448-56183705|sequence_index=0|block_index=5|species=rn3|rn3_5_0 +--ATGACCAATATACACTGTTTACATGTATAGCATTGTGAATGGAGACATAAAAAGATAATCTAGCTTTGTGCTAGGTAGGTGCTGAGCTCTTAACAGTGCTGGGCAGAAACCTATAAC-CTC-ACAGGGTGGGTTGTCTTTG-AGGAGCGTGCTAACCCTAGGAAGTCTCAAATACAATGTGATGGTTGCCCCCAGGCACCACCTTGAACCTGGTCTTCCTGGTTTCTTTCA---CACCATTACCACAAATACATTTTCTCAGG +>bosTau2.chr4(-):50243566-50243734|sequence_index=0|block_index=5|species=bosTau2|bosTau2_5_0 +atgtgaacaa---------------------------------------------------------------------------------------------aacggacccgtgtgggactcggcggagcacacagattttgcgggagCACGTTCCCGTTAGGAAGTCTCTGATGCAATACGACCGGTGCCTTCAGGACCTG-TG--AGGCTGACTTTCCTTA-CCCCTCCACACCATCATCAAGGCAGGTGTGATTTTCCAGG +>canFam2.chr14(-):11089913-11090081|sequence_index=0|block_index=5|species=canFam2|canFam2_5_0 +cagtgaacaa---------------------------------------------------------------------------------------------aacagagccctgcagt-cttgatggagcacacaacctttg-gggaaCATGTTTCCATAAGAAAGTCTCCAATGTGATCTGA-TGGTGCCGCCAGGACCTA-TGTCAGCCTACCGTTCCATGTCCCCTCCACACCATCATCACTGCAGGTGTGTTTTCCCACA +>dasNov1.scaffold_256527(+):665-786|sequence_index=0|block_index=5|species=dasNov1|dasNov1_5_0 +CAGTGAGCAA-----------------------------------------------------------------------------------------------CAGCCTGGCTCCGT-CC--GGGGGCCGCTCAGCAGCTC-GGGAGCGTGGAGACG---GGAAGTCTGTCACGCGATGCG-----------CTGGGCCCG------------CTGTTCCCGCCCCCCTCC---CCCC----------------TTTCCCAAG + +>hg17.chr7(+):127472074-127472258|sequence_index=0|block_index=6|species=hg17|hg17_6_0 +TTTTAAA------CATTTACCTTCCCAGTGGCCTTGCGTCTAGAGGAATCCCTGTATAGTGGT-ACATGAATATAACACATAACAAA-AATCATCTCTATGGTGTGTGTTGTTCCTGGGGTTCAattcagcaaatttt-ccc-tgggcacccatgtgttcttggcactggaaaagtaccgggactgaaacagtt +>panTro1.chr6(+):129885955-129886139|sequence_index=0|block_index=6|species=panTro1|panTro1_6_0 
+TTTTAAA------CATTTACCTTCCCAGTGGCCTTGCGTCTAGAGGAATCCCTGTATAGTGGT-ACATGAATATAACACATAACAAA-AATCATCTCTATGGTGTGTGTTGTTCCTGGGGTTCAattcagcaaatttt-tcc-tgggcacccatgtgttcttggcactggaaaagtaccgggactgaaacagtt +>rheMac2.chr3(+):165788885-165789069|sequence_index=0|block_index=6|species=rheMac2|rheMac2_6_0 +TTTTAAA------CATTTACTCTCCCAGTAGCCTTGCATCTCGAGGAATCCCTGTATAGTGGT-ACATGAATATAACACATAACAAA-AATCATCTGTACGGTGTGTGTTGTTCCTGGGGTTCAattcagcaaatttt-tcc-tgggcacccctgtgttcttggcactggaaaagtaccaggacttaaatagta +>mm7.chr6(+):28990875-28991025|sequence_index=0|block_index=6|species=mm7|mm7_6_0 +TTTAAAGAAAGTACCCCCTCCTTTCCAGT-GCCTCAAATCTAGAAGAATATTCATAGTGAAGT-GC------------------------ACAGCCGGGTGGTGCATGGTA-ATCTGGAAGTCACCTCTGCAAATCTT-TCC----------------TGTTGGTGCTGTGAAGGCACCAGGACTTCAAGAGTA +>rn3.chr4(+):56183705-56183879|sequence_index=0|block_index=6|species=rn3|rn3_6_0 +TTTAAAAGAAGT-CCCACTCCTTTCCAGT-GCCCTAGATCTAGAAGCACATTCATAATGATGT-ACAC-----TAACCC----------GACAGCTGTGTGGTATATGGTA-TCCCGGAAGTCACCTCAGCAAACCTT-TCCCGGGGAACCTACATGGTGTTGGTGCTGTGAAGGTACCAGGTTGTCAAGGGTA +>canFam2.chr14(-):11089743-11089913|sequence_index=0|block_index=6|species=canFam2|canFam2_6_0 +TTTTAAA------TATCTGC-TTCCCGGTGGCCTTGAGTCTAGAGGAGTCCCCCCACTATGGTGGCACTAATACTGAAGGTCAGAAATAATCAGTTCTGTGGTGCATGTTGCCCCTGAGGTTCTGTTCGGGAAACTTC-TTC-TGAGCAC----ATGCACCTGGCACTGCAAACGTACCAGGA----------- +>dasNov1.scaffold_256527(+):786-964|sequence_index=0|block_index=6|species=dasNov1|dasNov1_6_0 +TTTTAAA------AATTTACCTTCCCAGTGGCGGTGAATCCGGAGGAATACGGAAACTGGGGC-GCACTACCATGACACGTGTCAAA-AATCAGTTCCGTGGTCCGTGGAGGGCCTGGGGTTC------GAAAATCTTGTCC-CGAGCACCCCCGTGCGCCTGGCACCGCGACAGTGACAGGACTGAAGCGTG- + +>hg17.chr7(+):127472258-127472280|sequence_index=0|block_index=7|species=hg17|hg17_7_0 +gatggccca-atccctgtcctct- +>panTro1.chr6(+):129886139-129886161|sequence_index=0|block_index=7|species=panTro1|panTro1_7_0 +gatggccca-atccctgtcctct- +>rheMac2.chr3(+):165789069-165789091|sequence_index=0|block_index=7|species=rheMac2|rheMac2_7_0 +gatggccca-atccctgtcctct- 
+>mm7.chr6(+):28991025-28991048|sequence_index=0|block_index=7|species=mm7|mm7_7_0 +AATGGCAGAGGGCTCTGTTCTCT- +>rn3.chr4(+):56183879-56183902|sequence_index=0|block_index=7|species=rn3|rn3_7_0 +AATGGCAGAGGCCCCTGTTCTCT- +>canFam2.chr14(-):11089526-11089548|sequence_index=0|block_index=7|species=canFam2|canFam2_7_0 +GGAGACTTG-ATGCCTGCCTTCC- +>dasNov1.scaffold_256527(+):964-987|sequence_index=0|block_index=7|species=dasNov1|dasNov1_7_0 +GACGGCCAG-ACCTCTGCCCTCGG + +>hg17.chr7(+):127472280-127472681|sequence_index=0|block_index=8|species=hg17|hg17_8_0 +taaaacctaagggaggagaTGGAAAG-GGGCACCCAACCCAGACTGAGAGACAGGAATTAGCTGCAAGGGGAACTAGGAAAAGCTTCTTTA---AGGATG--GAGAGGCCCTA-GTGGAATGGGGAGATTCTTCCGGGAGAAGCGATGGATGCACAGTTGGGCATCCCCACAGACGGACTGGAAAGAAAAAAGGCCTGGAGGAATCA------ATGTGC-AATGTATGTGTGTTCCCTGGTTcaagggctgg-gaactttctcta--aagggccaggtagaaaacattttaggctttctaagccaagg---caaaattgaggat-attacatgggtacttatacaacaagaataaacaatt---tacacaa-ttttttgttgacagaattcaaaa---ctttat----agacac---agaaatgcaaatttcctgt +>panTro1.chr6(+):129886161-129886562|sequence_index=0|block_index=8|species=panTro1|panTro1_8_0 +taaaacctaagggaggagaTGGAAAG-GGGCACCCAACCCAGACTGAGAGACAGGAATTAGCTGCAAGGGGAACTAGGAAAAGCTTCTTTA---AGGATG--GAGAGACCCTA-GTGGAATGGGGAGATTCTTCCGGGAGAAGCGATGGATGCGCAGTTGGGCATCCCCACAGACGGACTGGAAAGAAAAAAGGCCTGGAGGAATCA------ATGTGC-AATGTATGTGTGTTCCCTGGTTcaagggctgg-gaactttctcta--aagggccaggtagaaaacattttaggctttctaagccaagg---caaaattgaggat-attacatgggtacttatacaacaagaataaacaatt---tacacaa-ttttttgttgacagaattcaaaa---ctttat----agacac---agaaatgtaaatttcctgt +>rheMac2.chr3(+):165789091-165789492|sequence_index=0|block_index=8|species=rheMac2|rheMac2_8_0 
+taaaacctaatggaggagatggaATG-GGTCACCCAACCCGGACTGAGAGACAGGAATTAGCTGCAAGGGTAACCAGGACAAGCTTCTCTA---ATGATG--GAGAGACCCTA-GTGGAATGGGGAGATTCTTCTGGGAGAAGCGATGGATTCGTAGTTGGGCATCCCCACAGAGGGACTGGAAAGAAAAAAGACCTGGAGGAACCA------ATGTGC-AATGTATGTGTGTTTCCTGGTTcaagggctggcaaactttctcta--aagggccagatagaaaacattttaggctttgtaagccaagg---caaaatcgaggag-attacatgggtacttatacaacaagaataaacaatt---tccacaa--tttttattcacagaattcaaaa---ctttat----agacac---agaaatgtaaatttcctgt +>rn3.chr4(+):56183902-56184219|sequence_index=0|block_index=8|species=rn3|rn3_8_0 +------------------------------------GTCCATAGTCAAAG------------------------------AAGCCTCTCAG---ATGGAG--AGCAGGGCCTATGCAAAAGAGGGGGCTTCTGTAGGCAGAAGGGATGGACTAGCCTCCGGACATAGCCATAGAGAGGCTGGCAGGACTGAGACCCAGGAGAAGCCAGCGCAGGTGTGCGGGCGTGTGTATATTTCATAGTTTGCAGGTTGG----------------------------CAAACAATTCCTGCTTTGCAGGCCAAGA---GGAAACTGAAGGTGACCCCGTGAGTGCTTAC---ACAAGAGAAAACAAG-------ACAA-TTTTTGGTTGACCAAATTCAGAA---CTTTATTTGAGGATGC---TAAAGTTTAAATTTCTTTT +>canFam2.chr14(-):11089143-11089523|sequence_index=0|block_index=8|species=canFam2|canFam2_8_0 +TACAGCCTGTGGGCAGAGGTGGGAAGAGGTCACGCAAGCCAGTTGGAATGAGGGGAGTTGGCTGGAAAGGTGACCAGGACAAGCTACTTCAACCAGGAAG--AAGAGACCCCG-GTG----------------CTTGGAGAAGGCCTGATTGAGCAGTCCTGCATGCCCGCCCAC-GACTGGCAGGAATAAAGACCCAGAAGAGCTA------ACGTGC-AATGTA------TTTTCTAGTTCCAgggttggcaaactttctctct-aagggtgggatgataaacattttaggcttttcagaccaaga---ggcgacatcagag-ggtatgtaggt---------acaagagggaaaagttgcccccggaa-ttttttg--gataaaattcaaaa---ctttacttagggatgc---caaaatgtaaacttcatat +>dasNov1.scaffold_256527(+):987-1401|sequence_index=0|block_index=8|species=dasNov1|dasNov1_8_0 
+CTAAATCTCGCGGAGAAGGTGGAACA-GGTTACCCAAACCCGACCGAG-GAGGCGAGTTG---GAAACGGCGACTGGGACAAGCTCCCTCA---GAGACGGAGAGAGACCCCA-GTGGAAGGGGGGAGAGGCTCTTAGGGAAACGATGGGGGGACCCGCCCGCACCCGCACAGAGGCGCTGGCAGGCACAGCGGCCCCGAGGAGCCC------AGGAGC-AGGGC-TGTGT-TCCCCTGCATcaggggttggcaaactttttctgcaaagggccagatagtaaatattttaggctttgcaaaccaagaagtagaaagggaggcc-attatgtacgtatttatatagcaagagagaacattt---cccacaatttttttattgacagaatttaaaacttctttattgatgaacaccaaagaaacttgaatttcatat + +>hg17.chr7(+):127472681-127472715|sequence_index=0|block_index=9|species=hg17|hg17_9_0 +aattttcccat---gagaactattcttcttttgtttt +>rheMac2.chr3(+):165789492-165789526|sequence_index=0|block_index=9|species=rheMac2|rheMac2_9_0 +aattttcacat---aagaactattcttcttttgtttt +>panTro1.chr6(+):129886562-129886596|sequence_index=0|block_index=9|species=panTro1|panTro1_9_0 +aattttcccgt---gagaactattcttcttttgtttt +>canFam2.chr14(-):11089108-11089143|sequence_index=0|block_index=9|species=canFam2|canFam2_9_0 +aatggtcatgt--ccataactattcttcttttatttt +>dasNov1.scaffold_256527(+):1401-1433|sequence_index=0|block_index=9|species=dasNov1|dasNov1_9_0 +aattttcacatatcacgaagtatttttttttt----- + diff -r 99dcba7af5b6 -r e929a2d803e4 test-data/maf_split_by_species_collapsed_out.maf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/maf_split_by_species_collapsed_out.maf Fri Sep 04 10:40:16 2009 -0400 @@ -0,0 +1,61 @@ +##maf version=1 +a score=2047408.0 +s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +s species2.chr1 129723125 85 + 229575298 
ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG +s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG +s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT-GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT-GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC--GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG +s species2.chr1 
129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC-GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG +s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC-GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTAG +s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGCAG + +a score=2047408.0 +s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCAG +s species2.chr1 
129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC---AG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC---AG + diff -r 99dcba7af5b6 -r e929a2d803e4 test-data/maf_split_by_species_in.maf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/maf_split_by_species_in.maf Fri Sep 04 10:40:16 2009 -0400 @@ -0,0 +1,11 @@ +##maf version=1 +a score=2047408.0 +s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG +s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG +s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + diff -r 99dcba7af5b6 -r e929a2d803e4 test-data/maf_split_by_species_not_collapsed_out.maf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/maf_split_by_species_not_collapsed_out.maf Fri Sep 04 10:40:16 2009 -0400 @@ -0,0 +1,61 @@ +##maf version=1 +a score=2047408.0 +s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +s species2.chr1 129723125 85 + 229575298 
ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG +s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG +s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +s species2.chr1 
129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG +s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG +s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984645 79 + 245522847 
ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG +s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +a score=2047408.0 +s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG +s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG +s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + diff -r 99dcba7af5b6 -r e929a2d803e4 tool_conf.xml.sample --- a/tool_conf.xml.sample Fri Sep 04 10:31:23 2009 -0400 +++ b/tool_conf.xml.sample Fri Sep 04 10:40:16 2009 -0400 @@ -90,6 +90,7 @@ <section name="Fetch Alignments" id="fetchAlign"> <tool file="maf/interval2maf_pairwise.xml" /> <tool file="maf/interval2maf.xml" /> + <tool file="maf/maf_split_by_species.xml"/> <tool file="maf/interval_maf_to_merged_fasta.xml" /> <tool file="maf/genebed_maf_to_fasta.xml"/> <tool file="maf/maf_stats.xml"/> diff -r 99dcba7af5b6 -r e929a2d803e4 tools/annotation_profiler/annotation_profiler_for_interval.py --- a/tools/annotation_profiler/annotation_profiler_for_interval.py Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/annotation_profiler/annotation_profiler_for_interval.py Fri Sep 04 10:40:16 2009 -0400 @@ -22,10 +22,12 @@ fmt_size = struct.calcsize( fmt ) def __init__( self, filename ): self.file_size = os.stat( filename ).st_size - self.file = open( filename, 'rb' ) + self.file = open( filename, 'rb' ) + self.filename = filename self.length = int( self.file_size / self.fmt_size / 2 ) self._cached_ranges = [ None for i in xrange( self.length ) ] def __getitem__( self, i ): + old_i = i if self._cached_ranges[i] is not None: return 
self._cached_ranges[i] if i < 0: i = self.length + i @@ -35,7 +37,13 @@ start = struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0] end = struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0] except Exception, e: - raise IndexError, e + print 'filename', self.filename + print 'len', len( self ) + print 'fmtsize', self.fmt_size + print 'index', i + print 'old i', old_i + print 'offset', offset + raise IndexError( str( e ) ) self._cached_ranges[i] = ( start, end ) return start, end def __len__( self ): @@ -141,20 +149,24 @@ self.chromosome_coverage[chrom] = bx.bitset.BitSet( chrom_length ) self.chromosome_coverage[chrom].set_range( region_start, region_length ) - for table_name, coverage, regions in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, region_start, region_end ): - if table_name not in self.table_coverage: - self.table_coverage[table_name] = 0 - self.table_chromosome_size[table_name] = {} - self.table_regions_overlaped_count[table_name] = 0 - self.interval_table_overlap_count[table_name] = 0 - self.table_chromosome_count[table_name] = {} - if chrom not in self.table_chromosome_size[table_name]: - self.table_chromosome_size[table_name][chrom] = self.coverage_reader._coverage[table_name][chrom]._total_coverage - self.table_chromosome_count[table_name][chrom] = len( self.coverage_reader._coverage[table_name][chrom]._coverage ) - self.table_coverage[table_name] += coverage - if coverage: - self.interval_table_overlap_count[table_name] += 1 - self.table_regions_overlaped_count[table_name] += regions + try: + for table_name, coverage, regions in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, region_start, region_end ): + if table_name not in self.table_coverage: + self.table_coverage[table_name] = 0 + self.table_chromosome_size[table_name] = {} + self.table_regions_overlaped_count[table_name] = 0 + self.interval_table_overlap_count[table_name] = 0 + self.table_chromosome_count[table_name] = {} + if chrom 
not in self.table_chromosome_size[table_name]: + self.table_chromosome_size[table_name][chrom] = self.coverage_reader._coverage[table_name][chrom]._total_coverage + self.table_chromosome_count[table_name][chrom] = len( self.coverage_reader._coverage[table_name][chrom]._coverage ) + self.table_coverage[table_name] += coverage + if coverage: + self.interval_table_overlap_count[table_name] += 1 + self.table_regions_overlaped_count[table_name] += regions + except Exception, e: + print "chrom:%s, start:%s, end%s:." % ( chrom, start, end ) + raise e def iter_table_coverage( self ): def get_nr_coverage(): #returns non-redundant coverage, where user's input intervals have been collapse to resolve overlaps diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/genebed_maf_to_fasta.xml --- a/tools/maf/genebed_maf_to_fasta.xml Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/genebed_maf_to_fasta.xml Fri Sep 04 10:40:16 2009 -0400 @@ -1,8 +1,8 @@ -<tool id="GeneBed_Maf_Fasta2" name="Stitch Gene blocks"> +<tool id="GeneBed_Maf_Fasta2" name="Stitch Gene blocks" version="1.0.1"> <description>given a set of coding exon intervals</description> <command interpreter="python">#if $maf_source_type.maf_source == "user":#interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_file --mafIndex=$maf_source_type.maf_file.metadata.maf_index --interval_file=$input1 --output_file=$out_file1 --mafSourceType=$maf_source_type.maf_source --geneBED --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR} #else:#interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_identifier --interval_file=$input1 --output_file=$out_file1 --mafSourceType=$maf_source_type.maf_source --geneBED --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR} -#end if +#end if# --overwrite_with_gaps=$overwrite_with_gaps </command> <inputs> <param name="input1" type="data" format="bed" label="Gene BED File"> @@ -29,42 +29,48 @@ <when 
value="cached"> <param name="maf_identifier" type="select" label="MAF Type" > <options from_file="maf_index.loc"> - <column name="name" index="0"/> - <column name="value" index="1"/> - <column name="dbkey" index="2"/> - <column name="species" index="3"/> - <filter type="data_meta" ref="input1" key="dbkey" column="2" multiple="True" separator=","/> - <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/> + <column name="name" index="0"/> + <column name="value" index="1"/> + <column name="dbkey" index="2"/> + <column name="species" index="3"/> + <filter type="data_meta" ref="input1" key="dbkey" column="2" multiple="True" separator=","/> + <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/> </options> </param> <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment"> <options from_file="maf_index.loc"> - <column name="uid" index="1"/> - <column name="value" index="3"/> - <column name="name" index="3"/> - <filter type="param_value" ref="maf_identifier" name="uid" column="1"/> - <filter type="multiple_splitter" column="3" separator=","/> + <column name="uid" index="1"/> + <column name="value" index="3"/> + <column name="name" index="3"/> + <filter type="param_value" ref="maf_identifier" name="uid" column="1"/> + <filter type="multiple_splitter" column="3" separator=","/> </options> </param> </when> </conditional> - </inputs> + <param name="overwrite_with_gaps" type="select" label="Split into Gapless MAF blocks" help="When set to Yes, blocks are divided around gaps appearing in any species. 
This will prevent gaps occurring in the interior of the sequence for an aligning species from overwriting a nucleotide found for the same position in a lower-scoring block."> + <option value="True" selected="true">No</option> + <option value="False">Yes</option> + </param> + </inputs> <outputs> <data format="fasta" name="out_file1" /> </outputs> <tests> <test> <param name="input1" value="8.bed"/> - <param name="maf_source" value="cached"/> + <param name="maf_source" value="cached"/> <param name="maf_identifier" value="8_WAY_MULTIZ_hg17"/> - <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/> + <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/> + <param name="overwrite_with_gaps" value="True"/> <output name="out_file1" file="gene_bed_maf_to_fasta_out.fasta" /> </test> <test> <param name="input1" value="8.bed"/> <param name="maf_source" value="user"/> <param name="maf_file" value="4.maf"/> - <param name="species" value="hg17,panTro1"/> + <param name="species" value="hg17,panTro1"/> + <param name="overwrite_with_gaps" value="True"/> <output name="out_file1" file="gene_bed_maf_to_fasta_user_out.fasta" /> </test> </tests> diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/interval2maf.py --- a/tools/maf/interval2maf.py Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/interval2maf.py Fri Sep 04 10:40:16 2009 -0400 @@ -20,6 +20,8 @@ -i, --interval_file=i: Input interval file -o, --output_file=o: Output MAF file -p, --species=p: Species to include in output + -P, --split_blocks_by_species=P: Split blocks by species + -r, --remove_all_gap_columns=r: Remove all Gap columns -l, --indexLocation=l: Override default maf_index.loc file -z, --mafIndexFile=z: Directory of local maf index file ( maf_index.loc or maf_pairwise.loc ) """ @@ -45,25 +47,21 @@ if options.dbkey: dbkey = options.dbkey else: dbkey = None if dbkey in [None, "?"]: - print >>sys.stderr, "You must specify a proper build in order to extract alignments. 
You can specify your genome build by clicking on the pencil icon associated with your interval file." - sys.exit() + maf_utilities.tool_fail( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." ) species = maf_utilities.parse_species_option( options.species ) if options.chromCol: chromCol = int( options.chromCol ) - 1 else: - print >>sys.stderr, "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." - sys.exit() + maf_utilities.tool_fail( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." ) if options.startCol: startCol = int( options.startCol ) - 1 else: - print >>sys.stderr, "Start column not set, click the pencil icon in the history item to set the metadata attributes." - sys.exit() + maf_utilities.tool_fail( "Start column not set, click the pencil icon in the history item to set the metadata attributes." ) if options.endCol: endCol = int( options.endCol ) - 1 else: - print >>sys.stderr, "End column not set, click the pencil icon in the history item to set the metadata attributes." - sys.exit() + maf_utilities.tool_fail( "End column not set, click the pencil icon in the history item to set the metadata attributes." ) if options.strandCol: strandCol = int( options.strandCol ) - 1 else: @@ -71,13 +69,17 @@ if options.interval_file: interval_file = options.interval_file else: - print >>sys.stderr, "Input interval file has not been specified." - sys.exit() + maf_utilities.tool_fail( "Input interval file has not been specified." ) if options.output_file: output_file = options.output_file else: - print >>sys.stderr, "Output file has not been specified." - sys.exit() + maf_utilities.tool_fail( "Output file has not been specified." 
) + + split_blocks_by_species = remove_all_gap_columns = False + if options.split_blocks_by_species and options.split_blocks_by_species == 'split_blocks_by_species': + split_blocks_by_species = True + if options.remove_all_gap_columns and options.remove_all_gap_columns == 'remove_all_gap_columns': + remove_all_gap_columns = True #Finish parsing command line #Open indexed access to MAFs @@ -87,16 +89,13 @@ else: index = maf_utilities.maf_index_by_uid( options.mafType, options.mafIndexFile ) if index is None: - print >> sys.stderr, "The MAF source specified (%s) appears to be invalid." % ( options.mafType ) - sys.exit() + maf_utilities.tool_fail( "The MAF source specified (%s) appears to be invalid." % ( options.mafType ) ) elif options.mafFile: index, index_filename = maf_utilities.open_or_build_maf_index( options.mafFile, options.mafIndex, species = [dbkey] ) if index is None: - print >> sys.stderr, "Your MAF file appears to be malformed." - sys.exit() + maf_utilities.tool_fail( "Your MAF file appears to be malformed." ) else: - print >>sys.stderr, "Desired source MAF type has not been specified." - sys.exit() + maf_utilities.tool_fail( "Desired source MAF type has not been specified." 
) #Create MAF writter out = bx.align.maf.Writer( open(output_file, "w") ) @@ -105,10 +104,20 @@ num_blocks = 0 num_regions = None for num_regions, region in enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chromCol, start_col = startCol, end_col = endCol, strand_col = strandCol, fix_strand = True, return_header = False, return_comments = False ) ): - src = "%s.%s" % ( dbkey, region.chrom ) - for block in maf_utilities.get_chopped_blocks_for_region( index, src, region, species, mincols ): - out.write( block ) - num_blocks += 1 + src = maf_utilities.src_merge( dbkey, region.chrom ) + for block in index.get_as_iterator( src, region.start, region.end ): + if split_blocks_by_species: + blocks = [ new_block for new_block in maf_utilities.iter_blocks_split_by_species( block ) if maf_utilities.component_overlaps_region( new_block.get_component_by_src_start( dbkey ), region ) ] + else: + blocks = [ block ] + for block in blocks: + block = maf_utilities.chop_block_by_region( block, src, region ) + if block is not None: + block = maf_utilities.orient_block_by_region( block, src, region ) + if remove_all_gap_columns: + block.remove_all_gap_columns() + out.write( block ) + num_blocks += 1 #Close output MAF out.close() diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/interval2maf.xml --- a/tools/maf/interval2maf.xml Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/interval2maf.xml Fri Sep 04 10:40:16 2009 -0400 @@ -3,6 +3,9 @@ <command interpreter="python"> #if $maf_source_type.maf_source == "user":#interval2maf.py --dbkey=${input1.dbkey} --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafFile=$maf_source_type.mafFile --mafIndex=$maf_source_type.mafFile.metadata.maf_index --interval_file=$input1 --output_file=$out_file1 --mafIndexFile=${GALAXY_DATA_INDEX_DIR}/maf_index.loc --species=$maf_source_type.species #else:#interval2maf.py 
--dbkey=${input1.dbkey} --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafType=$maf_source_type.mafType --interval_file=$input1 --output_file=$out_file1 --mafIndexFile=${GALAXY_DATA_INDEX_DIR}/maf_index.loc --species=$maf_source_type.species + #end if + --split_blocks_by_species=$split_blocks_by_species_selector.split_blocks_by_species + #if $split_blocks_by_species_selector.split_blocks_by_species == "split_blocks_by_species":# --remove_all_gap_columns=$split_blocks_by_species_selector.remove_all_gap_columns #end if </command> <inputs> @@ -48,7 +51,22 @@ </options> </param> </when> - </conditional> + </conditional> + <conditional name="split_blocks_by_species_selector"> + <param name="split_blocks_by_species" type="select" label="Split blocks by species" help="See the Split MAF blocks by Species tool for more information."> + <option value="split_blocks_by_species">Split by species</option> + <option value="dont_split_blocks_by_species" selected="true">Do not split</option> + </param> + <when value="dont_split_blocks_by_species"> + <!-- do nothing here --> + </when> + <when value="split_blocks_by_species"> + <param name="remove_all_gap_columns" type="select" label="Collapse empty alignment columns"> + <option value="remove_all_gap_columns" selected="true">Collapse empty columns</option> + <option value="do_not_remove_all_gap_columns">Do not collapse</option> + </param> + </when> + </conditional> </inputs> <outputs> <data format="maf" name="out_file1"/> @@ -59,6 +77,7 @@ <param name="maf_source" value="cached"/> <param name="mafType" value="ENCODE_TBA_hg17"/> <param name="species" value="hg17,panTro1,baboon,marmoset,galago,rn3,mm6,rabbit,cow,canFam1,rfbat,shrew,armadillo,tenrec,monDom1,tetNig1,fr1,rheMac1,galGal2,xenTro1,danRer2,elephant,platypus,hedgehog,colobus_monkey,dusky_titi,owl_monkey,mouse_lemur"/> + <param name="split_blocks_by_species" 
value="dont_split_blocks_by_species"/> <output name="out_file1" file="fsa_interval2maf.dat" /> </test> <test> @@ -66,6 +85,7 @@ <param name="maf_source" value="user"/> <param name="mafFile" value="fsa_interval2maf.dat"/> <param name="species" value="hg17,panTro1,baboon,marmoset,galago,rn3,mm6,rabbit,cow,canFam1,rfbat,shrew,armadillo,tenrec,monDom1,tetNig1,fr1,rheMac1,galGal2,xenTro1,danRer2,elephant,platypus,hedgehog,colobus_monkey,dusky_titi,owl_monkey,mouse_lemur"/> + <param name="split_blocks_by_species" value="dont_split_blocks_by_species"/> <output name="out_file1" file="fsa_interval2maf.dat" /> </test> </tests> diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/interval2maf_pairwise.xml --- a/tools/maf/interval2maf_pairwise.xml Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/interval2maf_pairwise.xml Fri Sep 04 10:40:16 2009 -0400 @@ -1,4 +1,4 @@ -<tool id="Interval2Maf_pairwise1" name="Extract Pairwise MAF blocks"> +<tool id="Interval2Maf_pairwise1" name="Extract Pairwise MAF blocks" version="1.0.1"> <description>given a set of genomic intervals</description> <command interpreter="python">interval2maf.py --dbkey=${input1.dbkey} --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafType=$mafType --interval_file=$input1 --output_file=$out_file1 --indexLocation=${GALAXY_DATA_INDEX_DIR}/maf_pairwise.loc</command> <inputs> diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/interval_maf_to_merged_fasta.py --- a/tools/maf/interval_maf_to_merged_fasta.py Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/interval_maf_to_merged_fasta.py Fri Sep 04 10:40:16 2009 -0400 @@ -19,6 +19,7 @@ -i, --interval_file=i: Input interval file -o, --output_file=o: Output MAF file -p, --species=p: Species to include in output + -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species. 
-z, --mafIndexFileDir=z: Directory of local maf_index.loc file usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR @@ -93,6 +94,11 @@ strand_col = int( options.strandCol ) - 1 mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir + + overwrite_with_gaps = True + if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false': + overwrite_with_gaps = False + #Finish parsing command line #get index for mafs based on type @@ -127,7 +133,7 @@ try: starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line ) #create spliced alignment object - alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols ) + alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps ) primary_name = secondary_name = fields[3] alignment_strand = fields[5] except Exception, e: @@ -136,7 +142,7 @@ else: #Process as standard intervals try: #create spliced alignment object - alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols ) + alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps ) primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end ) secondary_name = "" alignment_strand = line.strand diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/interval_maf_to_merged_fasta.xml --- a/tools/maf/interval_maf_to_merged_fasta.xml Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/interval_maf_to_merged_fasta.xml Fri Sep 04 
10:40:16 2009 -0400 @@ -1,8 +1,8 @@ -<tool id="Interval_Maf_Merged_Fasta2" name="Stitch MAF blocks"> +<tool id="Interval_Maf_Merged_Fasta2" name="Stitch MAF blocks" version="1.0.1"> <description>given a set of genomic intervals</description> <command interpreter="python">#if $maf_source_type.maf_source == "user":#interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_file --mafIndex=$maf_source_type.maf_file.metadata.maf_index --interval_file=$input1 --output_file=$out_file1 --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafSourceType=$maf_source_type.maf_source --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR} #else:#interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_identifier --interval_file=$input1 --output_file=$out_file1 --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafSourceType=$maf_source_type.maf_source --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR} -#end if +#end if# --overwrite_with_gaps=$overwrite_with_gaps </command> <inputs> <page> @@ -30,25 +30,29 @@ <when value="cached"> <param name="maf_identifier" type="select" label="MAF Type" > <options from_file="maf_index.loc"> - <column name="name" index="0"/> - <column name="value" index="1"/> - <column name="dbkey" index="2"/> - <column name="species" index="3"/> - <filter type="data_meta" ref="input1" key="dbkey" column="2" multiple="True" separator=","/> - <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/> + <column name="name" index="0"/> + <column name="value" index="1"/> + <column name="dbkey" index="2"/> + <column name="species" index="3"/> + <filter type="data_meta" ref="input1" key="dbkey" 
column="2" multiple="True" separator=","/> + <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/> </options> </param> <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment"> <options from_file="maf_index.loc"> - <column name="uid" index="1"/> - <column name="value" index="3"/> - <column name="name" index="3"/> - <filter type="param_value" ref="maf_identifier" name="uid" column="1"/> - <filter type="multiple_splitter" column="3" separator=","/> + <column name="uid" index="1"/> + <column name="value" index="3"/> + <column name="name" index="3"/> + <filter type="param_value" ref="maf_identifier" name="uid" column="1"/> + <filter type="multiple_splitter" column="3" separator=","/> </options> </param> </when> </conditional> + <param name="overwrite_with_gaps" type="select" label="Split into Gapless MAF blocks" help="When set to Yes, blocks are divided around gaps appearing in any species. 
This will prevent gaps occurring in the interior of the sequence for an aligning species from overwriting a nucleotide found for the same position in a lower-scoring block."> + <option value="True" selected="true">No</option> + <option value="False">Yes</option> + </param> </page> </inputs> <outputs> @@ -59,21 +63,24 @@ <param name="input1" value="13.bed" dbkey="hg18" ftype="bed"/> <param name="maf_source" value="cached"/> <param name="maf_identifier" value="17_WAY_MULTIZ_hg18"/> - <param name="species" value="hg18,mm8"/> + <param name="species" value="hg18,mm8"/> + <param name="overwrite_with_gaps" value="True"/> <output name="out_file1" file="interval_maf_to_merged_fasta_out3.fasta" /> </test> <test> <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/> <param name="maf_source" value="cached"/> <param name="maf_identifier" value="8_WAY_MULTIZ_hg17"/> - <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/> + <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/> + <param name="overwrite_with_gaps" value="True"/> <output name="out_file1" file="interval_maf_to_merged_fasta_out.dat" /> </test> <test> <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/> <param name="maf_source" value="user"/> <param name="maf_file" value="5.maf"/> - <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/> + <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/> + <param name="overwrite_with_gaps" value="True"/> <output name="out_file1" file="interval_maf_to_merged_fasta_user_out.dat" /> </test> </tests> diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/maf_filter.py --- a/tools/maf/maf_filter.py Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/maf_filter.py Fri Sep 04 10:40:16 2009 -0400 @@ -46,7 +46,7 @@ i = 0 blocks_kept = 0 for i, maf_block in enumerate( maf_reader ): - if min_size <= maf_block.components[0].size <= max_size: + if min_size <= maf_block.text_size <= max_size: local = {'maf_block':maf_block, 'ret_val':False} execfile( script_file, 
{}, local ) if local['ret_val']: diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/maf_limit_size.py --- a/tools/maf/maf_limit_size.py Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/maf_limit_size.py Fri Sep 04 10:40:16 2009 -0400 @@ -28,7 +28,7 @@ blocks_kept = 0 i = 0 for i, m in enumerate( maf_reader ): - if min_size <= m.components[0].size <= max_size: + if min_size <= m.text_size <= max_size: maf_writer.write( m ) blocks_kept += 1 print 'Kept %s of %s blocks (%.2f%%).' % ( blocks_kept, i + 1, float( blocks_kept ) / float( i + 1 ) * 100.0 ) diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/maf_limit_size.xml --- a/tools/maf/maf_limit_size.xml Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/maf_limit_size.xml Fri Sep 04 10:40:16 2009 -0400 @@ -1,4 +1,4 @@ -<tool id="maf_limit_size1" name="Filter MAF blocks"> +<tool id="maf_limit_size1" name="Filter MAF blocks" version="1.0.1"> <description>by Size</description> <command interpreter="python">maf_limit_size.py $input1 $out_file1 $min_size $max_size</command> <inputs> diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/maf_limit_to_species.py --- a/tools/maf/maf_limit_to_species.py Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/maf_limit_to_species.py Fri Sep 04 10:40:16 2009 -0400 @@ -11,13 +11,18 @@ from galaxy import eggs import pkg_resources; pkg_resources.require( "bx-python" ) import bx.align.maf +from galaxy.tools.util import maf_utilities import sys assert sys.version_info[:2] >= ( 2, 4 ) def main(): - species = sys.argv[1].split( ',' ) + species = maf_utilities.parse_species_option( sys.argv[1] ) + if species: + spec_len = len( species ) + else: + spec_len = 0 try: maf_reader = bx.align.maf.Reader( open( sys.argv[2],'r' ) ) maf_writer = bx.align.maf.Writer( open( sys.argv[3],'w' ) ) @@ -30,10 +35,11 @@ maf_blocks_kept = 0 for m in maf_reader: - if species != ['None']: + if species: m = m.limit_to_species( species ) m.remove_all_gap_columns() - if ( species == ['None'] or allow_partial or len( m.components ) == len( 
species ) ) and len( m.components ) > min_species_per_block: + spec_in_block_len = len( maf_utilities.get_species_in_block( m ) ) + if ( not species or allow_partial or spec_in_block_len == spec_len ) and spec_in_block_len > min_species_per_block: maf_writer.write( m ) maf_blocks_kept += 1 diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/maf_split_by_species.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/maf/maf_split_by_species.py Fri Sep 04 10:40:16 2009 -0400 @@ -0,0 +1,44 @@ +#!/usr/bin/env python + +""" +Read a maf and split blocks by unique species combinations +""" +import sys +from galaxy import eggs +import pkg_resources; pkg_resources.require( "bx-python" ) +from bx.align import maf +from galaxy.tools.util import maf_utilities +from galaxy.util import string_as_bool + +assert sys.version_info[:2] >= ( 2, 4 ) + +def __main__(): + try: + maf_reader = maf.Reader( open( sys.argv[1] ) ) + except Exception, e: + maf_utilities.tool_fail( "Error opening MAF: %s" % e ) + try: + out = maf.Writer( open( sys.argv[2], "w") ) + except Exception, e: + maf_utilities.tool_fail( "Error opening file for output: %s" % e ) + try: + collapse_columns = string_as_bool( sys.argv[3] ) + except Exception, e: + maf_utilities.tool_fail( "Error determining collapse columns value: %s" % e ) + + start_count = 0 + end_count = 0 + for start_count, start_block in enumerate( maf_reader ): + for block in maf_utilities.iter_blocks_split_by_species( start_block ): + if collapse_columns: + block.remove_all_gap_columns() + out.write( block ) + end_count += 1 + out.close() + + if end_count: + print "%i alignment blocks created from %i original blocks." % ( end_count, start_count + 1 ) + else: + print "No alignment blocks were created." 
+ +if __name__ == "__main__": __main__() diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/maf_split_by_species.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/maf/maf_split_by_species.xml Fri Sep 04 10:40:16 2009 -0400 @@ -0,0 +1,216 @@ +<tool id="MAF_split_blocks_by_species1" name="Split MAF blocks" version="1.0.0"> + <description>by Species</description> + <command interpreter="python">maf_split_by_species.py $input1 $out_file1 $collapse_columns</command> + <inputs> + <param format="maf" name="input1" type="data" label="MAF file to split"/> + <param name="collapse_columns" type="select" label="Collapse empty alignment columns" help="Removes columns that are gaps in all sequences"> + <option value="True" selected="true">Yes</option> + <option value="False">No</option> + </param> + </inputs> + <outputs> + <data format="maf" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input1" value="maf_split_by_species_in.maf"/> + <param name="collapse_columns" value="True"/> + <output name="out_file1" file="maf_split_by_species_collapsed_out.maf"/> + </test> + <test> + <param name="input1" value="maf_split_by_species_in.maf"/> + <param name="collapse_columns" value="False"/> + <output name="out_file1" file="maf_split_by_species_not_collapsed_out.maf"/> + </test> + </tests> + <help> + +**What it does** + +This tool examines each MAF block for multiple occurrences of a species in a single block. When this occurs, a block is split into multiple blocks where every combination of one sequence per species per block is represented. + +The interface for this tool has two inputs: + + * **MAF file to split**. Choose multiple alignments from history to be split by species. + * **Collapse empty alignment columns**. Should alignment columns containing only gaps in the new blocks be removed. 
+ +----- + +**Example 1**: **Collapse empty alignment columns is Yes**: + +For the following alignment:: + + ##maf version=1 + a score=2047408.0 + s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG + s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG + s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +the tool will create **a single** history item containing 12 alignment blocks (notice that no columns contain only gaps):: + + ##maf version=1 + a score=2047408.0 + s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species2.chr1 129723125 85 + 229575298 
ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG + s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG + s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT-GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT-GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC--GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG + s 
species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC-GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG + s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC-GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTAG + s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGCAG + + a score=2047408.0 + s species1.chr1 147984645 79 - 245522847 
ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCAG + s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC---AG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC---AG + +----- + +**Example 2**: **Collapse empty alignment columns is No**: + +For the following alignment:: + + ##maf version=1 + a score=2047408.0 + s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG + s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG + s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +the tool will create **a single** history item containing 12 alignment blocks (notice that some columns contain only gaps):: + + ##maf version=1 + a score=2047408.0 + s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 
ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG + s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG + s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + 
s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG + s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG + s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG + s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG + s species2.chr1 129723925 79 + 229575298 
ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + + a score=2047408.0 + s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG + s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG + s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG + +------- + +.. class:: infomark + +**About formats** + +**MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes. + + - The .maf format is line-oriented. Each multiple alignment ends with a blank line. + - Each sequence in an alignment is on a single line. + - Lines starting with # are considered to be comments. + - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment. + - Some MAF files may contain two optional line types: + + - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line; + - An "e" line containing information about the size of the gap between the alignments that span the current block. + + </help> +</tool> + diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/maf_stats.py --- a/tools/maf/maf_stats.py Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/maf_stats.py Fri Sep 04 10:40:16 2009 -0400 @@ -64,16 +64,19 @@ total_length += region_length coverage = { dbkey: BitSet( region_length ) } - for block in maf_utilities.get_chopped_blocks_for_region( index, src, region, force_strand='+' ): - #make sure all species are known - for c in block.components: - spec = c.src.split( '.' 
)[0] + + for block in index.get_as_iterator( src, region.start, region.end ): + for spec in maf_utilities.get_species_in_block( block ): if spec not in coverage: coverage[spec] = BitSet( region_length ) - start_offset, alignment = maf_utilities.reduce_block_by_primary_genome( block, dbkey, region.chrom, region.start ) - for i in range( len( alignment[dbkey] ) ): - for spec, text in alignment.items(): - if text[i] != '-': - coverage[spec].set( start_offset + i ) + for block in maf_utilities.iter_blocks_split_by_species( block ): + if maf_utilities.component_overlaps_region( block.get_component_by_src( src ), region ): + #need to chop and orient the block + block = maf_utilities.orient_block_by_region( maf_utilities.chop_block_by_region( block, src, region ), src, region, force_strand = '+' ) + start_offset, alignment = maf_utilities.reduce_block_by_primary_genome( block, dbkey, region.chrom, region.start ) + for i in range( len( alignment[dbkey] ) ): + for spec, text in alignment.items(): + if text[i] != '-': + coverage[spec].set( start_offset + i ) if summary: #record summary for key in coverage.keys(): diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/maf_stats.xml --- a/tools/maf/maf_stats.xml Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/maf_stats.xml Fri Sep 04 10:40:16 2009 -0400 @@ -1,4 +1,4 @@ -<tool id="maf_stats1" name="MAF Coverage Stats"> +<tool id="maf_stats1" name="MAF Coverage Stats" version="1.0.1"> <description>Alignment coverage information</description> <command interpreter="python"> maf_stats.py diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/maf_to_fasta.xml --- a/tools/maf/maf_to_fasta.xml Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/maf_to_fasta.xml Fri Sep 04 10:40:16 2009 -0400 @@ -1,4 +1,4 @@ -<tool id="MAF_To_Fasta1" name="Maf to FASTA"> +<tool id="MAF_To_Fasta1" name="MAF to FASTA" version="1.0.1"> <description>Converts a MAF formated file to FASTA format</description> <command interpreter="python"> #if $fasta_target_type.fasta_type == 
"multiple":#maf_to_fasta_multiple_sets.py $input1 $out_file1 $fasta_target_type.species $fasta_target_type.complete_blocks @@ -47,7 +47,7 @@ <param name="species" value="hg17,panTro1,rheMac2,rn3,mm7,canFam2,bosTau2,dasNov1"/> <param name="complete_blocks" value="partial_allowed"/> <param name="fasta_type" value="multiple"/> - <output name="out_file1" file="cf_maf2fasta.dat" ftype="fasta"/> + <output name="out_file1" file="cf_maf2fasta_new.dat" ftype="fasta"/> </test> </tests> <help> diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/maf_to_fasta_concat.py --- a/tools/maf/maf_to_fasta_concat.py Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/maf_to_fasta_concat.py Fri Sep 04 10:40:16 2009 -0400 @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -Read a maf and print the text as a fasta file, concatenating blocks +Read a maf and output a single block fasta file, concatenating blocks usage %prog species1,species2 maf_file out_file """ @@ -15,28 +15,43 @@ assert sys.version_info[:2] >= ( 2, 4 ) def __main__(): - print "Restricted to species:", sys.argv[1] + try: + species = maf_utilities.parse_species_option( sys.argv[1] ) + except Exception, e: + maf_utilities.tool_fail( "Error determining species value: %s" % e ) + try: + input_filename = sys.argv[2] + except Exception, e: + maf_utilities.tool_fail( "Error reading MAF filename: %s" % e ) + try: + file_out = open( sys.argv[3], 'w' ) + except Exception, e: + maf_utilities.tool_fail( "Error opening file for output: %s" % e ) - texts = {} + if species: + print "Restricted to species: %s" % ', '.join( species ) + else: + print "Not restricted to species." 
- input_filename = sys.argv[2] - output_filename = sys.argv[3] - species = sys.argv[1].split( ',' ) + if not species: + try: + species = maf_utilities.get_species_in_maf( input_filename ) + except Exception, e: + maf_utilities.tool_fail( "Error determining species in input MAF: %s" % e ) - if "None" in species: - species = maf_utilities.get_species_in_maf( input_filename ) - - file_out = open( output_filename, 'w' ) for spec in species: file_out.write( ">" + spec + "\n" ) try: - for block in maf.Reader( open( input_filename, 'r' ) ): - component = block.get_component_by_src_start( spec ) - if component: file_out.write( component.text ) - else: file_out.write( "-" * block.text_size ) - except: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() + for start_block in maf.Reader( open( input_filename, 'r' ) ): + for block in maf_utilities.iter_blocks_split_by_species( start_block ): + block.remove_all_gap_columns() #remove extra gaps + component = block.get_component_by_src_start( spec ) #blocks only have one occurrence of a particular species, so this is safe + if component: + file_out.write( component.text ) + else: + file_out.write( "-" * block.text_size ) + except Exception, e: + maf_utilities.tool_fail( "Your MAF file appears to be malformed: %s" % e ) file_out.write( "\n" ) file_out.close() diff -r 99dcba7af5b6 -r e929a2d803e4 tools/maf/maf_to_fasta_multiple_sets.py --- a/tools/maf/maf_to_fasta_multiple_sets.py Fri Sep 04 10:31:23 2009 -0400 +++ b/tools/maf/maf_to_fasta_multiple_sets.py Fri Sep 04 10:40:16 2009 -0400 @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -Read a maf and print the text as a fasta file. +Read a maf and output a multiple block fasta file. 
""" #Dan Blankenberg import sys @@ -13,35 +13,46 @@ assert sys.version_info[:2] >= ( 2, 4 ) def __main__(): - print "Restricted to species:", sys.argv[3] + try: + maf_reader = maf.Reader( open( sys.argv[1] ) ) + except Exception, e: + maf_utilities.tool_fail( "Error opening input MAF: %s" % e ) + try: + file_out = open( sys.argv[2], 'w' ) + except Exception, e: + maf_utilities.tool_fail( "Error opening file for output: %s" % e ) + try: + species = maf_utilities.parse_species_option( sys.argv[3] ) + if species: + num_species = len( species ) + else: + num_species = 0 + except Exception, e: + maf_utilities.tool_fail( "Error determining species value: %s" % e ) + try: + partial = sys.argv[4] + except Exception, e: + maf_utilities.tool_fail( "Error determining keep partial value: %s" % e ) - input_filename = sys.argv[1] - output_filename = sys.argv[2] - species = sys.argv[3].split( ',' ) - partial = sys.argv[4] - num_species = len( species ) + if species: + print "Restricted to species: %s" % ', '.join( species ) + else: + print "Not restricted to species." 
- file_in = open( input_filename, 'r' ) - try: - maf_reader = maf.Reader( file_in ) - - file_out = open( output_filename, 'w' ) - - for block_num, block in enumerate( maf_reader ): - if "None" not in species: - block = block.limit_to_species( species ) - if len( block.components ) < num_species and partial == "partial_disallowed": continue - for component in block.components: - spec, chrom = maf.src_split( component.src ) - if not spec or not chrom: - spec = chrom = component.src - file_out.write( "%s\n" % maf_utilities.get_fasta_header( component, suffix = "%s_%i" % ( spec, block_num ) ) ) - file_out.write( "%s\n" % component.text ) - file_out.write( "\n" ) - file_in.close() - except Exception, e: - print >>sys.stderr, "Your MAF file appears to be malformed:", e - sys.exit() + for block_num, block in enumerate( maf_reader ): + if species: + block = block.limit_to_species( species ) + if len( maf_utilities.get_species_in_block( block ) ) < num_species and partial == "partial_disallowed": continue + spec_counts = {} + for component in block.components: + spec, chrom = maf_utilities.src_split( component.src ) + if spec not in spec_counts: + spec_counts[ spec ] = 0 + else: + spec_counts[ spec ] += 1 + file_out.write( "%s\n" % maf_utilities.get_fasta_header( component, { 'block_index' : block_num, 'species' : spec, 'sequence_index' : spec_counts[ spec ] }, suffix = "%s_%i_%i" % ( spec, block_num, spec_counts[ spec ] ) ) ) + file_out.write( "%s\n" % component.text ) + file_out.write( "\n" ) file_out.close() if __name__ == "__main__": __main__()