commit/galaxy-central: 2 new changesets
2 new commits in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/ce558696ab74/ Changeset: ce558696ab74 User: dannon Date: 2013-08-30 04:28:04 Summary: Strip COPIED_FROM method in datatypes/sequence.py from maf_utilities; avoid circular import. Affected #: 1 file diff -r 638e011bd72d029f08d4f51ea32f6f47ec80b87f -r ce558696ab749b5500e027d3b601a5b715fdac3c lib/galaxy/datatypes/sequence.py --- a/lib/galaxy/datatypes/sequence.py +++ b/lib/galaxy/datatypes/sequence.py @@ -12,13 +12,19 @@ from galaxy.datatypes.metadata import MetadataElement from galaxy.datatypes import metadata import galaxy.model -from galaxy import util +from galaxy import eggs, util from sniff import * -import pkg_resources -pkg_resources.require("simplejson") +eggs.require("simplejson") import simplejson +try: + eggs.require( "bx-python" ) + import bx.align.maf +except: + pass + + log = logging.getLogger(__name__) class SequenceSplitLocations( data.Text ): @@ -579,90 +585,6 @@ """Class representing a Color Space FASTQ sequence ( e.g a SOLiD variant )""" file_ext = "fastqcssanger" -try: - from galaxy import eggs - import pkg_resources; pkg_resources.require( "bx-python" ) - import bx.align.maf -except: - pass - -#trying to import maf_utilities here throws an ImportError due to a circular import between jobs and tools: -#from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes -#Traceback (most recent call last): -# File "./scripts/paster.py", line 27, in <module> -# command.run() -# File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 78, in run -# File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 117, in invoke -# File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 212, in run -# File "build/bdist.solaris-2.11-i86pc/egg/paste/script/serve.py", line 227, in command -# File "build/bdist.solaris-2.11-i86pc/egg/paste/script/serve.py", line 250, in loadapp -# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 193, in loadapp -# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 213, in loadobj -# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 237, in loadcontext -# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 267, in _loadconfig -# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 397, in get_context -# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 439, in _context_from_explicit -# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 18, in import_string -# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/pkg_resources.py", line 1912, in load -# entry = __import__(self.module_name, globals(),globals(), ['__name__']) -# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/web/buildapp.py", line 18, in <module> -# from galaxy import config, jobs, util, tools -# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/jobs/__init__.py", line 3, in <module> -# from galaxy import util, model -# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/model/__init__.py", line 13, in <module> -# import galaxy.datatypes.registry -# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/datatypes/registry.py", line 6, in <module> -# import data, tabular, interval, images, sequence, qualityscore, genetics, xml, coverage, tracks, chrominfo -# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/datatypes/sequence.py", line 344, in <module> -# from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes -# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/tools/__init__.py", line 15, in <module> -# from galaxy import util, jobs, model -#ImportError: cannot import name jobs -#so we'll copy and paste for now...terribly icky -#*** ANYCHANGE TO THIS METHOD HERE OR IN maf_utilities MUST BE PROPAGATED *** -def COPIED_build_maf_index_species_chromosomes( filename, index_species = None ): - species = [] - species_chromosomes = {} - indexes = bx.interval_index_file.Indexes() - blocks = 0 - try: - maf_reader = bx.align.maf.Reader( open( filename ) ) - while True: - pos = maf_reader.file.tell() - block = maf_reader.next() - if block is None: - break - blocks += 1 - for c in block.components: - spec = c.src - chrom = None - if "." in spec: - spec, chrom = spec.split( ".", 1 ) - if spec not in species: - species.append( spec ) - species_chromosomes[spec] = [] - if chrom and chrom not in species_chromosomes[spec]: - species_chromosomes[spec].append( chrom ) - if index_species is None or spec in index_species: - forward_strand_start = c.forward_strand_start - forward_strand_end = c.forward_strand_end - try: - forward_strand_start = int( forward_strand_start ) - forward_strand_end = int( forward_strand_end ) - except ValueError: - continue #start and end are not integers, can't add component to index, goto next component - #this likely only occurs when parse_e_rows is True? - #could a species exist as only e rows? should the - if forward_strand_end > forward_strand_start: - #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed - indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size ) - except Exception, e: - #most likely a bad MAF - log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) ) - return ( None, [], {}, 0 ) - return ( indexes, species, species_chromosomes, blocks ) - - class Maf( Alignment ): """Class describing a Maf alignment""" file_ext = "maf" @@ -679,7 +601,9 @@ Parses and sets species, chromosomes, index from MAF file. """ #these metadata values are not accessable by users, always overwrite - indexes, species, species_chromosomes, blocks = COPIED_build_maf_index_species_chromosomes( dataset.file_name ) + #Imported here to avoid circular dependency + from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes + indexes, species, species_chromosomes, blocks = build_maf_index_species_chromosomes( dataset.file_name ) if indexes is None: return #this is not a MAF file dataset.metadata.species = species https://bitbucket.org/galaxy/galaxy-central/commits/d265a8a71309/ Changeset: d265a8a71309 User: dannon Date: 2013-08-30 04:28:23 Summary: Whitespace cleanup in datatypes/sequence.py Affected #: 1 file diff -r ce558696ab749b5500e027d3b601a5b715fdac3c -r d265a8a713092d3ba3fbca4d8db2cb93006d502f lib/galaxy/datatypes/sequence.py --- a/lib/galaxy/datatypes/sequence.py +++ b/lib/galaxy/datatypes/sequence.py @@ -30,7 +30,7 @@ class SequenceSplitLocations( data.Text ): """ Class storing information about a sequence file composed of multiple gzip files concatenated as - one OR an uncompressed file. In the GZIP case, each sub-file's location is stored in start and end. + one OR an uncompressed file. In the GZIP case, each sub-file's location is stored in start and end. The format of the file is JSON:: @@ -174,7 +174,7 @@ directories.append(dir) return dir - # we know how many splits and how many sequences in each. What remains is to write out instructions for the + # we know how many splits and how many sequences in each. What remains is to write out instructions for the # splitting of all the input files. To decouple the format of those instructions from this code, the exact format of # those instructions is delegated to scripts start_sequence=0 @@ -197,7 +197,7 @@ start_sequence += sequences_per_file[part_no] return directories write_split_files = classmethod(write_split_files) - + def split( cls, input_datasets, subdir_generator_function, split_params): """Split a generic sequence file (not sensible or possible, see subclasses).""" if split_params is None: @@ -217,7 +217,7 @@ return None raise NotImplementedError("Can't split generic alignment files") - + class Fasta( Sequence ): """Class representing a FASTA sequence""" file_ext = "fasta" @@ -225,13 +225,13 @@ def sniff( self, filename ): """ Determines whether the file is in fasta format - - A sequence in FASTA format consists of a single-line description, followed by lines of sequence data. - The first character of the description line is a greater-than (">") symbol in the first column. + + A sequence in FASTA format consists of a single-line description, followed by lines of sequence data. + The first character of the description line is a greater-than (">") symbol in the first column. All lines should be shorter than 80 characters - + For complete details see http://www.ncbi.nlm.nih.gov/blast/fasta.shtml - + Rules for sniffing as True: We don't care about line length (other than empty lines). @@ -247,7 +247,7 @@ This should be done through sniff order, where csfasta (currently has a null sniff function) is detected for first (stricter definition) followed sometime after by fasta We will only check that the first purported sequence is correctly formatted. - + >>> fname = get_test_fname( 'sequence.maf' ) >>> Fasta().sniff( fname ) False @@ -255,7 +255,7 @@ >>> Fasta().sniff( fname ) True """ - + try: fh = open( filename ) while True: @@ -410,7 +410,7 @@ def sniff( self, filename ): """ - Color-space sequence: + Color-space sequence: >2_15_85_F3 T213021013012303002332212012112221222112212222 @@ -444,7 +444,7 @@ except: pass return False - + def set_meta( self, dataset, **kwd ): if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize: dataset.metadata.data_lines = None @@ -474,7 +474,7 @@ if line and line.startswith( '#' ) and not sequences: # We don't count comment lines for sequence data types continue - if line and line.startswith( '@' ): + if line and line.startswith( '@' ): if seq_counter >= 4: # count previous block # blocks should be 4 lines long @@ -515,7 +515,7 @@ # Check the sequence line, make sure it contains only G/C/A/T/N if not bases_regexp.match( headers[1][0] ): return False - return True + return True return False except: return False @@ -556,7 +556,7 @@ output_name = data['output_name'] start_sequence = long(args['start_sequence']) sequence_count = long(args['num_sequences']) - + if 'toc_file' in args: toc_file = simplejson.load(open(args['toc_file'], 'r')) commands = Sequence.get_split_commands_with_toc(input_name, output_name, toc_file, start_sequence, sequence_count) @@ -588,7 +588,7 @@ class Maf( Alignment ): """Class describing a Maf alignment""" file_ext = "maf" - + #Readonly and optional, users can't unset it, but if it is not set, we are generally ok; if required use a metadata validator in the tool definition MetadataElement( name="blocks", default=0, desc="Number of blocks", readonly=True, optional=True, visible=False, no_value=0 ) MetadataElement( name="species_chromosomes", desc="Species Chromosomes", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True ) @@ -608,7 +608,7 @@ return #this is not a MAF file dataset.metadata.species = species dataset.metadata.blocks = blocks - + #write species chromosomes to a file chrom_file = dataset.metadata.species_chromosomes if not chrom_file: @@ -618,7 +618,7 @@ chrom_out.write( "%s\t%s\n" % ( spec, "\t".join( chroms ) ) ) chrom_out.close() dataset.metadata.species_chromosomes = chrom_file - + index_file = dataset.metadata.maf_index if not index_file: index_file = dataset.metadata.spec['maf_index'].param.new_file( dataset = dataset ) @@ -665,18 +665,18 @@ def sniff( self, filename ): """ Determines wether the file is in maf format - - The .maf format is line-oriented. Each multiple alignment ends with a blank line. - Each sequence in an alignment is on a single line, which can get quite long, but - there is no length limit. Words in a line are delimited by any white space. - Lines starting with # are considered to be comments. Lines starting with ## can + + The .maf format is line-oriented. Each multiple alignment ends with a blank line. + Each sequence in an alignment is on a single line, which can get quite long, but + there is no length limit. Words in a line are delimited by any white space. + Lines starting with # are considered to be comments. Lines starting with ## can be ignored by most programs, but contain meta-data of one form or another. - - The first line of a .maf file begins with ##maf. This word is followed by white-space-separated + + The first line of a .maf file begins with ##maf. This word is followed by white-space-separated variable=value pairs. There should be no white space surrounding the "=". - + For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format5 - + >>> fname = get_test_fname( 'sequence.maf' ) >>> Maf().sniff( fname ) True @@ -696,11 +696,11 @@ class MafCustomTrack( data.Text ): file_ext = "mafcustomtrack" - + MetadataElement( name="vp_chromosome", default='chr1', desc="Viewport Chromosome", readonly=True, optional=True, visible=False, no_value='' ) MetadataElement( name="vp_start", default='1', desc="Viewport Start", readonly=True, optional=True, visible=False, no_value='' ) MetadataElement( name="vp_end", default='100', desc="Viewport End", readonly=True, optional=True, visible=False, no_value='' ) - + def set_meta( self, dataset, overwrite = True, **kwd ): """ Parses and sets viewport metadata from MAF file. @@ -723,7 +723,7 @@ forward_strand_end = max( forward_strand_end, ref_comp.forward_strand_end ) if i > max_block_check: break - + if forward_strand_end > forward_strand_start: dataset.metadata.vp_chromosome = chrom dataset.metadata.vp_start = forward_strand_start @@ -734,7 +734,7 @@ class Axt( data.Text ): """Class describing an axt alignment""" - + # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is # here simply for backward compatibility ( although it is still in the datatypes registry ). Subclassing # from data.Text eliminates managing metadata elements inherited from the Alignemnt class. @@ -744,21 +744,21 @@ def sniff( self, filename ): """ Determines whether the file is in axt format - - axt alignment files are produced from Blastz, an alignment tool available from Webb Miller's lab + + axt alignment files are produced from Blastz, an alignment tool available from Webb Miller's lab at Penn State University. - + Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. - + The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields. - + The sequence lines contain the sequence of the primary assembly (line 2) and aligning assembly (line 3) with inserts. Repeats are indicated by lower-case letters. - + For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html - + >>> fname = get_test_fname( 'alignment.axt' ) >>> Axt().sniff( fname ) True @@ -797,12 +797,12 @@ def sniff( self, filename ): """ Determines whether the file is in lav format - + LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ. The first line of a .lav file begins with #:lav. - + For complete details see http://www.bioperl.org/wiki/LAV_alignment_format - + >>> fname = get_test_fname( 'alignment.lav' ) >>> Lav().sniff( fname ) True Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
commits-noreply@bitbucket.org