1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/facc879fe054/
Changeset: facc879fe054
User: dannon
Date: 2013-08-30 05:00:04
Summary: Add missing os import to cli_shell/rsh
Affected #: 1 file
diff -r 5e3567cdc08984012b6b26162a4ebe75e77e942b -r facc879fe0543f25e6b4d65e3e5d5efe716ff455 lib/galaxy/jobs/runners/cli_shell/rsh.py
--- a/lib/galaxy/jobs/runners/cli_shell/rsh.py
+++ b/lib/galaxy/jobs/runners/cli_shell/rsh.py
@@ -2,10 +2,11 @@
Interface for remote shell commands (rsh, rcp) and derivatives that use the same syntax (ssh, scp)
"""
+import logging
+import os
+import subprocess
+import tempfile
import time
-import logging
-import tempfile
-import subprocess
from galaxy.util.bunch import Bunch
from galaxy.jobs.runners.cli_shell import BaseShellExec
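The fix above matters even though the module imported cleanly: Python resolves module-level names at call time, so a missing import only surfaces when the affected code path actually runs. A minimal sketch (hypothetical function, not the Galaxy source):

    import tempfile

    def write_job_script(contents):
        fd, path = tempfile.mkstemp()
        os.write(fd, contents)  # raises NameError here until "import os" is added
        os.close(fd)
        return path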
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/5e3567cdc089/
Changeset: 5e3567cdc089
User: dannon
Date: 2013-08-30 04:58:30
Summary: data_transfer workflow execute referenced replacement_dict (which didn't exist in scope) instead of explicitly passing it as None
Affected #: 1 file
diff -r 1cd27d43ab14111326ab9601c7d818e6b15a3e74 -r 5e3567cdc08984012b6b26162a4ebe75e77e942b lib/galaxy/jobs/deferred/data_transfer.py
--- a/lib/galaxy/jobs/deferred/data_transfer.py
+++ b/lib/galaxy/jobs/deferred/data_transfer.py
@@ -367,7 +367,7 @@
outputs[ step.id ] = out_data
for pja in step.post_job_actions:
if pja.action_type in ActionBox.immediate_actions:
- ActionBox.execute(self.app, self.sa_session, pja, job, replacement_dict)
+ ActionBox.execute(self.app, self.sa_session, pja, job, replacement_dict=None)
else:
job.add_post_job_action(pja)
else:
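As with the rsh fix above, the broken call parsed fine and failed only at runtime: replacement_dict was never bound in this scope, so the first immediate post-job action raised NameError. A minimal sketch with hypothetical names:

    def execute(pja, replacement_dict=None):
        # stand-in for ActionBox.execute; None means "no replacements to apply"
        replacements = replacement_dict or {}
        return pja, replacements

    def run_immediate_actions(actions):
        for pja in actions:
            # execute(pja, replacement_dict)     # NameError: name was never bound here
            execute(pja, replacement_dict=None)  # explicit: no replacements in this scope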
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/1cd27d43ab14/
Changeset: 1cd27d43ab14
User: dannon
Date: 2013-08-30 04:55:56
Summary: Broken exception in transfer manager raised the undefined name error instead of response['error']
Affected #: 1 file
diff -r a1abf7b75d3e28ee0dc800a2ba13498e45d70ca2 -r 1cd27d43ab14111326ab9601c7d818e6b15a3e74 lib/galaxy/jobs/transfer_manager.py
--- a/lib/galaxy/jobs/transfer_manager.py
+++ b/lib/galaxy/jobs/transfer_manager.py
@@ -90,7 +90,7 @@
raise Exception( dict( code=128, message='Did not receive valid response from transfer daemon for state' ) )
if 'error' in response:
# Response was valid but Request resulted in an error
- raise Exception( error )
+ raise Exception( response['error'])
else:
# Request was valid
response['result']['transfer_job_id'] = tj.id
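Raising an unbound name turns the intended error report into a NameError, which masks what the transfer daemon actually said. A minimal sketch with a hypothetical response payload:

    response = {'error': 'transfer daemon refused the request'}  # hypothetical payload

    if 'error' in response:
        # raise Exception( error )            # NameError: 'error' was never bound
        raise Exception(response['error'])    # surfaces the daemon's actual message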
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/ce2528c40e4d/
Changeset: ce2528c40e4d
User: dannon
Date: 2013-08-30 04:32:20
Summary: Remove unused 'chunks' variable calculation from get_sequences_per_file
Affected #: 1 file
diff -r 31b4f32c38443343612b54cc879739bf7bc019f4 -r ce2528c40e4d89642bc98c2aff7bdc6f524e4f03 lib/galaxy/datatypes/sequence.py
--- a/lib/galaxy/datatypes/sequence.py
+++ b/lib/galaxy/datatypes/sequence.py
@@ -116,8 +116,6 @@
elif split_params['split_mode'] == 'to_size':
# loop through the sections and calculate the number of sequences
chunk_size = long(split_params['split_size'])
-
- chunks = total_sequences / chunk_size
rem = total_sequences % chunk_size
sequences_per_file = [chunk_size for i in range(total_sequences / chunk_size)]
# TODO: Should we invest the time in a better way to handle small remainders?
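The deleted line computed a count the function never read; the list comprehension that follows already derives the number of full chunks from the same integer division. A worked sketch of the 'to_size' arithmetic (illustrative values):

    total_sequences, chunk_size = 10, 4
    rem = total_sequences % chunk_size                                   # 2
    sequences_per_file = [chunk_size] * (total_sequences // chunk_size)  # [4, 4]
    # the 2 leftover sequences are handled by the remainder logic that follows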
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/31b4f32c3844/
Changeset: 31b4f32c3844
User: dannon
Date: 2013-08-30 04:31:42
Summary: Import cleanup in sequence.py, remove import *.
Affected #: 1 file
diff -r d265a8a713092d3ba3fbca4d8db2cb93006d502f -r 31b4f32c38443343612b54cc879739bf7bc019f4 lib/galaxy/datatypes/sequence.py
--- a/lib/galaxy/datatypes/sequence.py
+++ b/lib/galaxy/datatypes/sequence.py
@@ -2,18 +2,19 @@
Sequence classes
"""
+import data
import gzip
-import data
import logging
+import os
import re
import string
-import os
from cgi import escape
+
+from galaxy import eggs, util
+from galaxy.datatypes import metadata
+from galaxy.datatypes.checkers import is_gzip
+from galaxy.datatypes.sniff import get_test_fname, get_headers
from galaxy.datatypes.metadata import MetadataElement
-from galaxy.datatypes import metadata
-import galaxy.model
-from galaxy import eggs, util
-from sniff import *
eggs.require("simplejson")
import simplejson
@@ -30,7 +31,7 @@
class SequenceSplitLocations( data.Text ):
"""
Class storing information about a sequence file composed of multiple gzip files concatenated as
- one OR an uncompressed file. In the GZIP case, each sub-file's location is stored in start and end.
+ one OR an uncompressed file. In the GZIP case, each sub-file's location is stored in start and end.
The format of the file is JSON::
@@ -174,7 +175,7 @@
directories.append(dir)
return dir
- # we know how many splits and how many sequences in each. What remains is to write out instructions for the
+ # we know how many splits and how many sequences in each. What remains is to write out instructions for the
# splitting of all the input files. To decouple the format of those instructions from this code, the exact format of
# those instructions is delegated to scripts
start_sequence=0
@@ -197,7 +198,7 @@
start_sequence += sequences_per_file[part_no]
return directories
write_split_files = classmethod(write_split_files)
-
+
def split( cls, input_datasets, subdir_generator_function, split_params):
"""Split a generic sequence file (not sensible or possible, see subclasses)."""
if split_params is None:
@@ -217,7 +218,7 @@
return None
raise NotImplementedError("Can't split generic alignment files")
-
+
class Fasta( Sequence ):
"""Class representing a FASTA sequence"""
file_ext = "fasta"
@@ -225,13 +226,13 @@
def sniff( self, filename ):
"""
Determines whether the file is in fasta format
-
- A sequence in FASTA format consists of a single-line description, followed by lines of sequence data.
- The first character of the description line is a greater-than (">") symbol in the first column.
+
+ A sequence in FASTA format consists of a single-line description, followed by lines of sequence data.
+ The first character of the description line is a greater-than (">") symbol in the first column.
All lines should be shorter than 80 characters
-
+
For complete details see http://www.ncbi.nlm.nih.gov/blast/fasta.shtml
-
+
Rules for sniffing as True:
We don't care about line length (other than empty lines).
@@ -247,7 +248,7 @@
This should be done through sniff order, where csfasta (currently has a null sniff function) is detected for first (stricter definition) followed sometime after by fasta
We will only check that the first purported sequence is correctly formatted.
-
+
>>> fname = get_test_fname( 'sequence.maf' )
>>> Fasta().sniff( fname )
False
@@ -255,7 +256,7 @@
>>> Fasta().sniff( fname )
True
"""
-
+
try:
fh = open( filename )
while True:
@@ -410,7 +411,7 @@
def sniff( self, filename ):
"""
- Color-space sequence:
+ Color-space sequence:
>2_15_85_F3
T213021013012303002332212012112221222112212222
@@ -444,7 +445,7 @@
except:
pass
return False
-
+
def set_meta( self, dataset, **kwd ):
if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
dataset.metadata.data_lines = None
@@ -474,7 +475,7 @@
if line and line.startswith( '#' ) and not sequences:
# We don't count comment lines for sequence data types
continue
- if line and line.startswith( '@' ):
+ if line and line.startswith( '@' ):
if seq_counter >= 4:
# count previous block
# blocks should be 4 lines long
@@ -515,7 +516,7 @@
# Check the sequence line, make sure it contains only G/C/A/T/N
if not bases_regexp.match( headers[1][0] ):
return False
- return True
+ return True
return False
except:
return False
@@ -556,7 +557,7 @@
output_name = data['output_name']
start_sequence = long(args['start_sequence'])
sequence_count = long(args['num_sequences'])
-
+
if 'toc_file' in args:
toc_file = simplejson.load(open(args['toc_file'], 'r'))
commands = Sequence.get_split_commands_with_toc(input_name, output_name, toc_file, start_sequence, sequence_count)
@@ -588,7 +589,7 @@
class Maf( Alignment ):
"""Class describing a Maf alignment"""
file_ext = "maf"
-
+
#Readonly and optional, users can't unset it, but if it is not set, we are generally ok; if required use a metadata validator in the tool definition
MetadataElement( name="blocks", default=0, desc="Number of blocks", readonly=True, optional=True, visible=False, no_value=0 )
MetadataElement( name="species_chromosomes", desc="Species Chromosomes", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )
@@ -608,7 +609,7 @@
return #this is not a MAF file
dataset.metadata.species = species
dataset.metadata.blocks = blocks
-
+
#write species chromosomes to a file
chrom_file = dataset.metadata.species_chromosomes
if not chrom_file:
@@ -618,7 +619,7 @@
chrom_out.write( "%s\t%s\n" % ( spec, "\t".join( chroms ) ) )
chrom_out.close()
dataset.metadata.species_chromosomes = chrom_file
-
+
index_file = dataset.metadata.maf_index
if not index_file:
index_file = dataset.metadata.spec['maf_index'].param.new_file( dataset = dataset )
@@ -665,18 +666,18 @@
def sniff( self, filename ):
"""
Determines wether the file is in maf format
-
- The .maf format is line-oriented. Each multiple alignment ends with a blank line.
- Each sequence in an alignment is on a single line, which can get quite long, but
- there is no length limit. Words in a line are delimited by any white space.
- Lines starting with # are considered to be comments. Lines starting with ## can
+
+ The .maf format is line-oriented. Each multiple alignment ends with a blank line.
+ Each sequence in an alignment is on a single line, which can get quite long, but
+ there is no length limit. Words in a line are delimited by any white space.
+ Lines starting with # are considered to be comments. Lines starting with ## can
be ignored by most programs, but contain meta-data of one form or another.
-
- The first line of a .maf file begins with ##maf. This word is followed by white-space-separated
+
+ The first line of a .maf file begins with ##maf. This word is followed by white-space-separated
variable=value pairs. There should be no white space surrounding the "=".
-
+
For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format5
-
+
>>> fname = get_test_fname( 'sequence.maf' )
>>> Maf().sniff( fname )
True
@@ -696,11 +697,11 @@
class MafCustomTrack( data.Text ):
file_ext = "mafcustomtrack"
-
+
MetadataElement( name="vp_chromosome", default='chr1', desc="Viewport Chromosome", readonly=True, optional=True, visible=False, no_value='' )
MetadataElement( name="vp_start", default='1', desc="Viewport Start", readonly=True, optional=True, visible=False, no_value='' )
MetadataElement( name="vp_end", default='100', desc="Viewport End", readonly=True, optional=True, visible=False, no_value='' )
-
+
def set_meta( self, dataset, overwrite = True, **kwd ):
"""
Parses and sets viewport metadata from MAF file.
@@ -723,7 +724,7 @@
forward_strand_end = max( forward_strand_end, ref_comp.forward_strand_end )
if i > max_block_check:
break
-
+
if forward_strand_end > forward_strand_start:
dataset.metadata.vp_chromosome = chrom
dataset.metadata.vp_start = forward_strand_start
@@ -734,7 +735,7 @@
class Axt( data.Text ):
"""Class describing an axt alignment"""
-
+
# gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
# here simply for backward compatibility ( although it is still in the datatypes registry ). Subclassing
# from data.Text eliminates managing metadata elements inherited from the Alignemnt class.
@@ -744,21 +745,21 @@
def sniff( self, filename ):
"""
Determines whether the file is in axt format
-
- axt alignment files are produced from Blastz, an alignment tool available from Webb Miller's lab
+
+ axt alignment files are produced from Blastz, an alignment tool available from Webb Miller's lab
at Penn State University.
-
+
Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.
Blocks are separated from one another by blank lines.
-
+
The summary line contains chromosomal position and size information about the alignment. It
consists of 9 required fields.
-
+
The sequence lines contain the sequence of the primary assembly (line 2) and aligning assembly
(line 3) with inserts. Repeats are indicated by lower-case letters.
-
+
For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html
-
+
>>> fname = get_test_fname( 'alignment.axt' )
>>> Axt().sniff( fname )
True
@@ -797,12 +798,12 @@
def sniff( self, filename ):
"""
Determines whether the file is in lav format
-
+
LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ.
The first line of a .lav file begins with #:lav.
-
+
For complete details see http://www.bioperl.org/wiki/LAV_alignment_format
-
+
>>> fname = get_test_fname( 'alignment.lav' )
>>> Lav().sniff( fname )
True
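Replacing from sniff import * with named imports, as this commit does, makes every name's origin greppable and removes a shadowing hazard. A minimal sketch (hypothetical modules util_a and util_b) of that hazard:

    # util_a.py defines parse(s) -> s.split(',')
    # util_b.py defines parse(s) -> s.split('\t')

    from util_a import *
    from util_b import *    # silently rebinds parse

    parse('a,b')            # tab-splits now: returns ['a,b'], not ['a', 'b']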
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
2 new commits in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/ce558696ab74/
Changeset: ce558696ab74
User: dannon
Date: 2013-08-30 04:28:04
Summary: Strip the COPIED_build_maf_index_species_chromosomes method (copied from maf_utilities) out of datatypes/sequence.py; avoid the circular import by deferring the import.
Affected #: 1 file
diff -r 638e011bd72d029f08d4f51ea32f6f47ec80b87f -r ce558696ab749b5500e027d3b601a5b715fdac3c lib/galaxy/datatypes/sequence.py
--- a/lib/galaxy/datatypes/sequence.py
+++ b/lib/galaxy/datatypes/sequence.py
@@ -12,13 +12,19 @@
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes import metadata
import galaxy.model
-from galaxy import util
+from galaxy import eggs, util
from sniff import *
-import pkg_resources
-pkg_resources.require("simplejson")
+eggs.require("simplejson")
import simplejson
+try:
+ eggs.require( "bx-python" )
+ import bx.align.maf
+except:
+ pass
+
+
log = logging.getLogger(__name__)
class SequenceSplitLocations( data.Text ):
@@ -579,90 +585,6 @@
"""Class representing a Color Space FASTQ sequence ( e.g a SOLiD variant )"""
file_ext = "fastqcssanger"
-try:
- from galaxy import eggs
- import pkg_resources; pkg_resources.require( "bx-python" )
- import bx.align.maf
-except:
- pass
-
-#trying to import maf_utilities here throws an ImportError due to a circular import between jobs and tools:
-#from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes
-#Traceback (most recent call last):
-# File "./scripts/paster.py", line 27, in <module>
-# command.run()
-# File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 78, in run
-# File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 117, in invoke
-# File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 212, in run
-# File "build/bdist.solaris-2.11-i86pc/egg/paste/script/serve.py", line 227, in command
-# File "build/bdist.solaris-2.11-i86pc/egg/paste/script/serve.py", line 250, in loadapp
-# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 193, in loadapp
-# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 213, in loadobj
-# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 237, in loadcontext
-# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 267, in _loadconfig
-# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 397, in get_context
-# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 439, in _context_from_explicit
-# File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 18, in import_string
-# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/pkg_resources.py", line 1912, in load
-# entry = __import__(self.module_name, globals(),globals(), ['__name__'])
-# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/web/buildapp.py", line 18, in <module>
-# from galaxy import config, jobs, util, tools
-# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/jobs/__init__.py", line 3, in <module>
-# from galaxy import util, model
-# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/model/__init__.py", line 13, in <module>
-# import galaxy.datatypes.registry
-# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/datatypes/registry.py", line 6, in <module>
-# import data, tabular, interval, images, sequence, qualityscore, genetics, xml, coverage, tracks, chrominfo
-# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/datatypes/sequence.py", line 344, in <module>
-# from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes
-# File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/tools/__init__.py", line 15, in <module>
-# from galaxy import util, jobs, model
-#ImportError: cannot import name jobs
-#so we'll copy and paste for now...terribly icky
-#*** ANYCHANGE TO THIS METHOD HERE OR IN maf_utilities MUST BE PROPAGATED ***
-def COPIED_build_maf_index_species_chromosomes( filename, index_species = None ):
- species = []
- species_chromosomes = {}
- indexes = bx.interval_index_file.Indexes()
- blocks = 0
- try:
- maf_reader = bx.align.maf.Reader( open( filename ) )
- while True:
- pos = maf_reader.file.tell()
- block = maf_reader.next()
- if block is None:
- break
- blocks += 1
- for c in block.components:
- spec = c.src
- chrom = None
- if "." in spec:
- spec, chrom = spec.split( ".", 1 )
- if spec not in species:
- species.append( spec )
- species_chromosomes[spec] = []
- if chrom and chrom not in species_chromosomes[spec]:
- species_chromosomes[spec].append( chrom )
- if index_species is None or spec in index_species:
- forward_strand_start = c.forward_strand_start
- forward_strand_end = c.forward_strand_end
- try:
- forward_strand_start = int( forward_strand_start )
- forward_strand_end = int( forward_strand_end )
- except ValueError:
- continue #start and end are not integers, can't add component to index, goto next component
- #this likely only occurs when parse_e_rows is True?
- #could a species exist as only e rows? should the
- if forward_strand_end > forward_strand_start:
- #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed
- indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size )
- except Exception, e:
- #most likely a bad MAF
- log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) )
- return ( None, [], {}, 0 )
- return ( indexes, species, species_chromosomes, blocks )
-
-
class Maf( Alignment ):
"""Class describing a Maf alignment"""
file_ext = "maf"
@@ -679,7 +601,9 @@
Parses and sets species, chromosomes, index from MAF file.
"""
#these metadata values are not accessable by users, always overwrite
- indexes, species, species_chromosomes, blocks = COPIED_build_maf_index_species_chromosomes( dataset.file_name )
+ #Imported here to avoid circular dependency
+ from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes
+ indexes, species, species_chromosomes, blocks = build_maf_index_species_chromosomes( dataset.file_name )
if indexes is None:
return #this is not a MAF file
dataset.metadata.species = species
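Moving the import into set_meta is the usual way to break a chain like the one in the stripped traceback (jobs -> model -> datatypes -> tools -> jobs): a function-level import resolves at call time, after every module in the cycle has finished initializing. A minimal sketch with a hypothetical two-module cycle:

    # a.py
    from b import helper      # runs at import time; re-enters the cycle at startup

    # b.py
    def helper():
        from a import thing   # deferred: resolved on first call, cycle already closed
        return thing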
https://bitbucket.org/galaxy/galaxy-central/commits/d265a8a71309/
Changeset: d265a8a71309
User: dannon
Date: 2013-08-30 04:28:23
Summary: Whitespace cleanup in datatypes/sequence.py
Affected #: 1 file
diff -r ce558696ab749b5500e027d3b601a5b715fdac3c -r d265a8a713092d3ba3fbca4d8db2cb93006d502f lib/galaxy/datatypes/sequence.py
--- a/lib/galaxy/datatypes/sequence.py
+++ b/lib/galaxy/datatypes/sequence.py
@@ -30,7 +30,7 @@
class SequenceSplitLocations( data.Text ):
"""
Class storing information about a sequence file composed of multiple gzip files concatenated as
- one OR an uncompressed file. In the GZIP case, each sub-file's location is stored in start and end.
+ one OR an uncompressed file. In the GZIP case, each sub-file's location is stored in start and end.
The format of the file is JSON::
@@ -174,7 +174,7 @@
directories.append(dir)
return dir
- # we know how many splits and how many sequences in each. What remains is to write out instructions for the
+ # we know how many splits and how many sequences in each. What remains is to write out instructions for the
# splitting of all the input files. To decouple the format of those instructions from this code, the exact format of
# those instructions is delegated to scripts
start_sequence=0
@@ -197,7 +197,7 @@
start_sequence += sequences_per_file[part_no]
return directories
write_split_files = classmethod(write_split_files)
-
+
def split( cls, input_datasets, subdir_generator_function, split_params):
"""Split a generic sequence file (not sensible or possible, see subclasses)."""
if split_params is None:
@@ -217,7 +217,7 @@
return None
raise NotImplementedError("Can't split generic alignment files")
-
+
class Fasta( Sequence ):
"""Class representing a FASTA sequence"""
file_ext = "fasta"
@@ -225,13 +225,13 @@
def sniff( self, filename ):
"""
Determines whether the file is in fasta format
-
- A sequence in FASTA format consists of a single-line description, followed by lines of sequence data.
- The first character of the description line is a greater-than (">") symbol in the first column.
+
+ A sequence in FASTA format consists of a single-line description, followed by lines of sequence data.
+ The first character of the description line is a greater-than (">") symbol in the first column.
All lines should be shorter than 80 characters
-
+
For complete details see http://www.ncbi.nlm.nih.gov/blast/fasta.shtml
-
+
Rules for sniffing as True:
We don't care about line length (other than empty lines).
@@ -247,7 +247,7 @@
This should be done through sniff order, where csfasta (currently has a null sniff function) is detected for first (stricter definition) followed sometime after by fasta
We will only check that the first purported sequence is correctly formatted.
-
+
>>> fname = get_test_fname( 'sequence.maf' )
>>> Fasta().sniff( fname )
False
@@ -255,7 +255,7 @@
>>> Fasta().sniff( fname )
True
"""
-
+
try:
fh = open( filename )
while True:
@@ -410,7 +410,7 @@
def sniff( self, filename ):
"""
- Color-space sequence:
+ Color-space sequence:
>2_15_85_F3
T213021013012303002332212012112221222112212222
@@ -444,7 +444,7 @@
except:
pass
return False
-
+
def set_meta( self, dataset, **kwd ):
if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
dataset.metadata.data_lines = None
@@ -474,7 +474,7 @@
if line and line.startswith( '#' ) and not sequences:
# We don't count comment lines for sequence data types
continue
- if line and line.startswith( '@' ):
+ if line and line.startswith( '@' ):
if seq_counter >= 4:
# count previous block
# blocks should be 4 lines long
@@ -515,7 +515,7 @@
# Check the sequence line, make sure it contains only G/C/A/T/N
if not bases_regexp.match( headers[1][0] ):
return False
- return True
+ return True
return False
except:
return False
@@ -556,7 +556,7 @@
output_name = data['output_name']
start_sequence = long(args['start_sequence'])
sequence_count = long(args['num_sequences'])
-
+
if 'toc_file' in args:
toc_file = simplejson.load(open(args['toc_file'], 'r'))
commands = Sequence.get_split_commands_with_toc(input_name, output_name, toc_file, start_sequence, sequence_count)
@@ -588,7 +588,7 @@
class Maf( Alignment ):
"""Class describing a Maf alignment"""
file_ext = "maf"
-
+
#Readonly and optional, users can't unset it, but if it is not set, we are generally ok; if required use a metadata validator in the tool definition
MetadataElement( name="blocks", default=0, desc="Number of blocks", readonly=True, optional=True, visible=False, no_value=0 )
MetadataElement( name="species_chromosomes", desc="Species Chromosomes", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )
@@ -608,7 +608,7 @@
return #this is not a MAF file
dataset.metadata.species = species
dataset.metadata.blocks = blocks
-
+
#write species chromosomes to a file
chrom_file = dataset.metadata.species_chromosomes
if not chrom_file:
@@ -618,7 +618,7 @@
chrom_out.write( "%s\t%s\n" % ( spec, "\t".join( chroms ) ) )
chrom_out.close()
dataset.metadata.species_chromosomes = chrom_file
-
+
index_file = dataset.metadata.maf_index
if not index_file:
index_file = dataset.metadata.spec['maf_index'].param.new_file( dataset = dataset )
@@ -665,18 +665,18 @@
def sniff( self, filename ):
"""
Determines wether the file is in maf format
-
- The .maf format is line-oriented. Each multiple alignment ends with a blank line.
- Each sequence in an alignment is on a single line, which can get quite long, but
- there is no length limit. Words in a line are delimited by any white space.
- Lines starting with # are considered to be comments. Lines starting with ## can
+
+ The .maf format is line-oriented. Each multiple alignment ends with a blank line.
+ Each sequence in an alignment is on a single line, which can get quite long, but
+ there is no length limit. Words in a line are delimited by any white space.
+ Lines starting with # are considered to be comments. Lines starting with ## can
be ignored by most programs, but contain meta-data of one form or another.
-
- The first line of a .maf file begins with ##maf. This word is followed by white-space-separated
+
+ The first line of a .maf file begins with ##maf. This word is followed by white-space-separated
variable=value pairs. There should be no white space surrounding the "=".
-
+
For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format5
-
+
>>> fname = get_test_fname( 'sequence.maf' )
>>> Maf().sniff( fname )
True
@@ -696,11 +696,11 @@
class MafCustomTrack( data.Text ):
file_ext = "mafcustomtrack"
-
+
MetadataElement( name="vp_chromosome", default='chr1', desc="Viewport Chromosome", readonly=True, optional=True, visible=False, no_value='' )
MetadataElement( name="vp_start", default='1', desc="Viewport Start", readonly=True, optional=True, visible=False, no_value='' )
MetadataElement( name="vp_end", default='100', desc="Viewport End", readonly=True, optional=True, visible=False, no_value='' )
-
+
def set_meta( self, dataset, overwrite = True, **kwd ):
"""
Parses and sets viewport metadata from MAF file.
@@ -723,7 +723,7 @@
forward_strand_end = max( forward_strand_end, ref_comp.forward_strand_end )
if i > max_block_check:
break
-
+
if forward_strand_end > forward_strand_start:
dataset.metadata.vp_chromosome = chrom
dataset.metadata.vp_start = forward_strand_start
@@ -734,7 +734,7 @@
class Axt( data.Text ):
"""Class describing an axt alignment"""
-
+
# gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
# here simply for backward compatibility ( although it is still in the datatypes registry ). Subclassing
# from data.Text eliminates managing metadata elements inherited from the Alignemnt class.
@@ -744,21 +744,21 @@
def sniff( self, filename ):
"""
Determines whether the file is in axt format
-
- axt alignment files are produced from Blastz, an alignment tool available from Webb Miller's lab
+
+ axt alignment files are produced from Blastz, an alignment tool available from Webb Miller's lab
at Penn State University.
-
+
Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.
Blocks are separated from one another by blank lines.
-
+
The summary line contains chromosomal position and size information about the alignment. It
consists of 9 required fields.
-
+
The sequence lines contain the sequence of the primary assembly (line 2) and aligning assembly
(line 3) with inserts. Repeats are indicated by lower-case letters.
-
+
For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html
-
+
>>> fname = get_test_fname( 'alignment.axt' )
>>> Axt().sniff( fname )
True
@@ -797,12 +797,12 @@
def sniff( self, filename ):
"""
Determines whether the file is in lav format
-
+
LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ.
The first line of a .lav file begins with #:lav.
-
+
For complete details see http://www.bioperl.org/wiki/LAV_alignment_format
-
+
>>> fname = get_test_fname( 'alignment.lav' )
>>> Lav().sniff( fname )
True
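The whitespace-only hunks above are hard to audit by eye; a short sketch of a checker that flags the trailing blanks this commit removes (hypothetical script, not part of the commit):

    import sys

    for lineno, line in enumerate(open(sys.argv[1]), 1):
        body = line.rstrip('\r\n')
        if body != body.rstrip():
            print('line %d has trailing whitespace' % lineno)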
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/638e011bd72d/
Changeset: 638e011bd72d
User: dannon
Date: 2013-08-30 04:08:18
Summary: Fix maf_utilities formatting, removing Windows line terminators.
Affected #: 1 file
diff -r bec1baa9d2db009abd16778093fb15b96bde0950 -r 638e011bd72d029f08d4f51ea32f6f47ec80b87f lib/galaxy/tools/util/maf_utilities.py
--- a/lib/galaxy/tools/util/maf_utilities.py
+++ b/lib/galaxy/tools/util/maf_utilities.py
@@ -1,17 +1,17 @@
-#!/usr/bin/env python
-"""
-Provides wrappers and utilities for working with MAF files and alignments.
-"""
-#Dan Blankenberg
-import pkg_resources; pkg_resources.require( "bx-python" )
-import bx.align.maf
-import bx.intervals
-import bx.interval_index_file
+#!/usr/bin/env python
+"""
+Provides wrappers and utilities for working with MAF files and alignments.
+"""
+#Dan Blankenberg
+import pkg_resources; pkg_resources.require( "bx-python" )
+import bx.align.maf
+import bx.intervals
+import bx.interval_index_file
import sys, os, string, tempfile
import logging
-from copy import deepcopy
-
-assert sys.version_info[:2] >= ( 2, 4 )
+from copy import deepcopy
+
+assert sys.version_info[:2] >= ( 2, 4 )
log = logging.getLogger(__name__)
@@ -26,7 +26,7 @@
chrom = fields.pop( 0 )
else:
chrom = spec
- return spec, chrom
+ return spec, chrom
def src_merge( spec, chrom, contig = None ):
if None in [ spec, chrom ]:
@@ -45,140 +45,140 @@
print >> sys.stderr, "Fatal Error: %s" % msg
sys.exit()
-#an object corresponding to a reference layered alignment
-class RegionAlignment( object ):
-
- DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
- MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed
-
- def __init__( self, size, species = [] ):
- assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." % ( size, self.MAX_SEQUENCE_SIZE )
- self.size = size
- self.sequences = {}
- if not isinstance( species, list ):
- species = [species]
- for spec in species:
- self.add_species( spec )
-
- #add a species to the alignment
- def add_species( self, species ):
- #make temporary sequence files
- self.sequences[species] = tempfile.TemporaryFile()
- self.sequences[species].write( "-" * self.size )
-
- #returns the names for species found in alignment, skipping names as requested
- def get_species_names( self, skip = [] ):
- if not isinstance( skip, list ): skip = [skip]
- names = self.sequences.keys()
- for name in skip:
- try: names.remove( name )
- except: pass
- return names
-
- #returns the sequence for a species
- def get_sequence( self, species ):
- self.sequences[species].seek( 0 )
- return self.sequences[species].read()
-
- #returns the reverse complement of the sequence for a species
- def get_sequence_reverse_complement( self, species ):
- complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
- complement.reverse()
- return "".join( complement )
-
- #sets a position for a species
- def set_position( self, index, species, base ):
- if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." )
- return self.set_range( index, species, base )
- #sets a range for a species
- def set_range( self, index, species, bases ):
- if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) )
- if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." )
- if species not in self.sequences.keys(): self.add_species( species )
- self.sequences[species].seek( index )
- self.sequences[species].write( bases )
-
- #Flush temp file of specified species, or all species
- def flush( self, species = None ):
- if species is None:
- species = self.sequences.keys()
- elif not isinstance( species, list ):
- species = [species]
- for spec in species:
- self.sequences[spec].flush()
-
-class GenomicRegionAlignment( RegionAlignment ):
-
- def __init__( self, start, end, species = [] ):
- RegionAlignment.__init__( self, end - start, species )
- self.start = start
- self.end = end
-
-class SplicedAlignment( object ):
-
- DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
-
- def __init__( self, exon_starts, exon_ends, species = [] ):
- if not isinstance( exon_starts, list ):
- exon_starts = [exon_starts]
- if not isinstance( exon_ends, list ):
- exon_ends = [exon_ends]
- assert len( exon_starts ) == len( exon_ends ), "The number of starts does not match the number of sizes."
- self.exons = []
- for i in range( len( exon_starts ) ):
- self.exons.append( GenomicRegionAlignment( exon_starts[i], exon_ends[i], species ) )
-
- #returns the names for species found in alignment, skipping names as requested
- def get_species_names( self, skip = [] ):
- if not isinstance( skip, list ): skip = [skip]
- names = []
- for exon in self.exons:
- for name in exon.get_species_names( skip = skip ):
- if name not in names:
- names.append( name )
- return names
-
- #returns the sequence for a species
- def get_sequence( self, species ):
- sequence = tempfile.TemporaryFile()
- for exon in self.exons:
- if species in exon.get_species_names():
- sequence.write( exon.get_sequence( species ) )
- else:
- sequence.write( "-" * exon.size )
- sequence.seek( 0 )
- return sequence.read()
-
- #returns the reverse complement of the sequence for a species
- def get_sequence_reverse_complement( self, species ):
- complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
- complement.reverse()
- return "".join( complement )
-
- #Start and end of coding region
- @property
- def start( self ):
- return self.exons[0].start
- @property
- def end( self ):
- return self.exons[-1].end
-
-#Open a MAF index using a UID
-def maf_index_by_uid( maf_uid, index_location_file ):
- for line in open( index_location_file ):
- try:
- #read each line, if not enough fields, go to next line
- if line[0:1] == "#" : continue
- fields = line.split('\t')
- if maf_uid == fields[1]:
- try:
- maf_files = fields[4].replace( "\n", "" ).replace( "\r", "" ).split( "," )
- return bx.align.maf.MultiIndexed( maf_files, keep_open = True, parse_e_rows = False )
- except Exception, e:
- raise Exception( 'MAF UID (%s) found, but configuration appears to be malformed: %s' % ( maf_uid, e ) )
- except:
- pass
- return None
+#an object corresponding to a reference layered alignment
+class RegionAlignment( object ):
+
+ DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
+ MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed
+
+ def __init__( self, size, species = [] ):
+ assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." % ( size, self.MAX_SEQUENCE_SIZE )
+ self.size = size
+ self.sequences = {}
+ if not isinstance( species, list ):
+ species = [species]
+ for spec in species:
+ self.add_species( spec )
+
+ #add a species to the alignment
+ def add_species( self, species ):
+ #make temporary sequence files
+ self.sequences[species] = tempfile.TemporaryFile()
+ self.sequences[species].write( "-" * self.size )
+
+ #returns the names for species found in alignment, skipping names as requested
+ def get_species_names( self, skip = [] ):
+ if not isinstance( skip, list ): skip = [skip]
+ names = self.sequences.keys()
+ for name in skip:
+ try: names.remove( name )
+ except: pass
+ return names
+
+ #returns the sequence for a species
+ def get_sequence( self, species ):
+ self.sequences[species].seek( 0 )
+ return self.sequences[species].read()
+
+ #returns the reverse complement of the sequence for a species
+ def get_sequence_reverse_complement( self, species ):
+ complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
+ complement.reverse()
+ return "".join( complement )
+
+ #sets a position for a species
+ def set_position( self, index, species, base ):
+ if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." )
+ return self.set_range( index, species, base )
+ #sets a range for a species
+ def set_range( self, index, species, bases ):
+ if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) )
+ if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." )
+ if species not in self.sequences.keys(): self.add_species( species )
+ self.sequences[species].seek( index )
+ self.sequences[species].write( bases )
+
+ #Flush temp file of specified species, or all species
+ def flush( self, species = None ):
+ if species is None:
+ species = self.sequences.keys()
+ elif not isinstance( species, list ):
+ species = [species]
+ for spec in species:
+ self.sequences[spec].flush()
+
+class GenomicRegionAlignment( RegionAlignment ):
+
+ def __init__( self, start, end, species = [] ):
+ RegionAlignment.__init__( self, end - start, species )
+ self.start = start
+ self.end = end
+
+class SplicedAlignment( object ):
+
+ DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
+
+ def __init__( self, exon_starts, exon_ends, species = [] ):
+ if not isinstance( exon_starts, list ):
+ exon_starts = [exon_starts]
+ if not isinstance( exon_ends, list ):
+ exon_ends = [exon_ends]
+ assert len( exon_starts ) == len( exon_ends ), "The number of starts does not match the number of sizes."
+ self.exons = []
+ for i in range( len( exon_starts ) ):
+ self.exons.append( GenomicRegionAlignment( exon_starts[i], exon_ends[i], species ) )
+
+ #returns the names for species found in alignment, skipping names as requested
+ def get_species_names( self, skip = [] ):
+ if not isinstance( skip, list ): skip = [skip]
+ names = []
+ for exon in self.exons:
+ for name in exon.get_species_names( skip = skip ):
+ if name not in names:
+ names.append( name )
+ return names
+
+ #returns the sequence for a species
+ def get_sequence( self, species ):
+ sequence = tempfile.TemporaryFile()
+ for exon in self.exons:
+ if species in exon.get_species_names():
+ sequence.write( exon.get_sequence( species ) )
+ else:
+ sequence.write( "-" * exon.size )
+ sequence.seek( 0 )
+ return sequence.read()
+
+ #returns the reverse complement of the sequence for a species
+ def get_sequence_reverse_complement( self, species ):
+ complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
+ complement.reverse()
+ return "".join( complement )
+
+ #Start and end of coding region
+ @property
+ def start( self ):
+ return self.exons[0].start
+ @property
+ def end( self ):
+ return self.exons[-1].end
+
+#Open a MAF index using a UID
+def maf_index_by_uid( maf_uid, index_location_file ):
+ for line in open( index_location_file ):
+ try:
+ #read each line, if not enough fields, go to next line
+ if line[0:1] == "#" : continue
+ fields = line.split('\t')
+ if maf_uid == fields[1]:
+ try:
+ maf_files = fields[4].replace( "\n", "" ).replace( "\r", "" ).split( "," )
+ return bx.align.maf.MultiIndexed( maf_files, keep_open = True, parse_e_rows = False )
+ except Exception, e:
+ raise Exception( 'MAF UID (%s) found, but configuration appears to be malformed: %s' % ( maf_uid, e ) )
+ except:
+ pass
+ return None
#return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created
def open_or_build_maf_index( maf_file, index_filename, species = None ):
@@ -186,27 +186,27 @@
return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), None )
except:
return build_maf_index( maf_file, species = species )
-
+
#*** ANYCHANGE TO THIS METHOD HERE OR IN galaxy.datatypes.sequences MUST BE PROPAGATED ***
def build_maf_index_species_chromosomes( filename, index_species = None ):
species = []
species_chromosomes = {}
- indexes = bx.interval_index_file.Indexes()
+ indexes = bx.interval_index_file.Indexes()
blocks = 0
try:
maf_reader = bx.align.maf.Reader( open( filename ) )
while True:
pos = maf_reader.file.tell()
block = maf_reader.next()
- if block is None:
- break
+ if block is None:
+ break
blocks += 1
for c in block.components:
spec = c.src
chrom = None
if "." in spec:
spec, chrom = spec.split( ".", 1 )
- if spec not in species:
+ if spec not in species:
species.append( spec )
species_chromosomes[spec] = []
if chrom and chrom not in species_chromosomes[spec]:
@@ -229,17 +229,17 @@
log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) )
return ( None, [], {}, 0 )
return ( indexes, species, species_chromosomes, blocks )
-
-#builds and returns ( index, index_filename ) for specified maf_file
-def build_maf_index( maf_file, species = None ):
+
+#builds and returns ( index, index_filename ) for specified maf_file
+def build_maf_index( maf_file, species = None ):
indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes( maf_file, species )
if indexes is not None:
- fd, index_filename = tempfile.mkstemp()
- out = os.fdopen( fd, 'w' )
- indexes.write( out )
- out.close()
+ fd, index_filename = tempfile.mkstemp()
+ out = os.fdopen( fd, 'w' )
+ indexes.write( out )
+ out.close()
return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), index_filename )
- return ( None, None )
+ return ( None, None )
def component_overlaps_region( c, region ):
if c is None: return False
@@ -276,7 +276,7 @@
start = end - slice_len
slice_start = min( start, slice_start )
slice_end = max( end, slice_end )
-
+
if slice_start < slice_end:
block = block.slice( slice_start, slice_end )
if block.text_size > mincols:
@@ -287,7 +287,7 @@
block.remove_all_gap_columns()
return block
return None
-
+
def orient_block_by_region( block, src, region, force_strand = None ):
#loop through components matching src,
#make sure each of these components overlap region
@@ -295,12 +295,12 @@
#if force_strand / region.strand not in strand cache, reverse complement
### we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing
strands = [ c.strand for c in iter_components_by_src( block, src ) if component_overlaps_region( c, region ) ]
- if strands and ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ):
+ if strands and ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ):
block = block.reverse_complement()
return block
def get_oriented_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
- for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ):
+ for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ):
yield block
def get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
@@ -331,8 +331,8 @@
else:
#no more components to add, yield this block
yield new_block
-
- #divide components by species
+
+ #divide components by species
spec_dict = {}
if not species:
species = []
@@ -347,7 +347,7 @@
spec_dict[ spec ] = []
for c in iter_components_by_src_start( block, spec ):
spec_dict[ spec ].append( c )
-
+
empty_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) #should we copy attributes?
empty_block.text_size = block.text_size
#call recursive function to split into each combo of spec/blocks
@@ -356,68 +356,69 @@
yield value
-#generator yielding only chopped and valid blocks for a specified region
-def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ):
- for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
- yield block
-def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ):
- for block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ):
- block = chop_block_by_region( block, src, region, species, mincols )
- if block is not None:
- yield block, idx, offset
-
-#returns a filled region alignment for specified regions
-def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
- if species is not None: alignment = RegionAlignment( end - start, species )
- else: alignment = RegionAlignment( end - start, primary_species )
- return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps )
-
-#reduces a block to only positions exisiting in the src provided
-def reduce_block_by_primary_genome( block, species, chromosome, region_start ):
- #returns ( startIndex, {species:texts}
- #where texts' contents are reduced to only positions existing in the primary genome
- src = "%s.%s" % ( species, chromosome )
- ref = block.get_component_by_src( src )
- start_offset = ref.start - region_start
- species_texts = {}
- for c in block.components:
- species_texts[ c.src.split( '.' )[0] ] = list( c.text )
- #remove locations which are gaps in the primary species, starting from the downstream end
- for i in range( len( species_texts[ species ] ) - 1, -1, -1 ):
- if species_texts[ species ][i] == '-':
- for text in species_texts.values():
- text.pop( i )
- for spec, text in species_texts.items():
- species_texts[spec] = ''.join( text )
- return ( start_offset, species_texts )
-
-#fills a region alignment
-def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
- region = bx.intervals.Interval( start, end )
- region.chrom = chrom
- region.strand = strand
- primary_src = "%s.%s" % ( primary_species, chrom )
-
- #Order blocks overlaping this position by score, lowest first
- blocks = []
- for block, idx, offset in index.get_as_iterator_with_index_and_offset( primary_src, start, end ):
- score = float( block.score )
- for i in range( 0, len( blocks ) ):
- if score < blocks[i][0]:
- blocks.insert( i, ( score, idx, offset ) )
- break
- else:
- blocks.append( ( score, idx, offset ) )
-
+#generator yielding only chopped and valid blocks for a specified region
+def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ):
+ for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
+ yield block
+def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ):
+ for block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ):
+ block = chop_block_by_region( block, src, region, species, mincols )
+ if block is not None:
+ yield block, idx, offset
+
+#returns a filled region alignment for specified regions
+def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
+ if species is not None: alignment = RegionAlignment( end - start, species )
+ else: alignment = RegionAlignment( end - start, primary_species )
+ return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps )
+
+#reduces a block to only positions exisiting in the src provided
+def reduce_block_by_primary_genome( block, species, chromosome, region_start ):
+ #returns ( startIndex, {species:texts}
+ #where texts' contents are reduced to only positions existing in the primary genome
+ src = "%s.%s" % ( species, chromosome )
+ ref = block.get_component_by_src( src )
+ start_offset = ref.start - region_start
+ species_texts = {}
+ for c in block.components:
+ species_texts[ c.src.split( '.' )[0] ] = list( c.text )
+ #remove locations which are gaps in the primary species, starting from the downstream end
+ for i in range( len( species_texts[ species ] ) - 1, -1, -1 ):
+ if species_texts[ species ][i] == '-':
+ for text in species_texts.values():
+ text.pop( i )
+ for spec, text in species_texts.items():
+ species_texts[spec] = ''.join( text )
+ return ( start_offset, species_texts )
+
+#fills a region alignment
+def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
+ region = bx.intervals.Interval( start, end )
+ region.chrom = chrom
+ region.strand = strand
+ primary_src = "%s.%s" % ( primary_species, chrom )
+
+ #Order blocks overlaping this position by score, lowest first
+ blocks = []
+ for block, idx, offset in index.get_as_iterator_with_index_and_offset( primary_src, start, end ):
+ score = float( block.score )
+ for i in range( 0, len( blocks ) ):
+ if score < blocks[i][0]:
+ blocks.insert( i, ( score, idx, offset ) )
+ break
+ else:
+ blocks.append( ( score, idx, offset ) )
+
#gap_chars_tuple = tuple( GAP_CHARS )
- gap_chars_str = ''.join( GAP_CHARS )
- #Loop through ordered blocks and layer by increasing score
- for block_dict in blocks:
for block in iter_blocks_split_by_species( block_dict[1].get_at_offset( block_dict[2] ) ): #need to handle each occurance of sequence in block seperately
+ gap_chars_str = ''.join( GAP_CHARS )
+ #Loop through ordered blocks and layer by increasing score
+ for block_dict in blocks:
+ for block in iter_blocks_split_by_species( block_dict[1].get_at_offset( block_dict[2] ) ): #need to handle each occurance of sequence in block seperately
if component_overlaps_region( block.get_component_by_src( primary_src ), region ):
block = chop_block_by_region( block, primary_src, region, species, mincols ) #chop block
block = orient_block_by_region( block, primary_src, region ) #orient block
- start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start )
- for spec, text in species_texts.items():
+ start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start )
+ for spec, text in species_texts.items():
#we should trim gaps from both sides, since these are not positions in this species genome (sequence)
text = text.rstrip( gap_chars_str )
gap_offset = 0
@@ -433,169 +434,169 @@
else:
for i, char in enumerate( text ):
if char not in GAP_CHARS:
- alignment.set_position( start_offset + gap_offset + i, spec, char )
- return alignment
-
-#returns a filled spliced region alignment for specified region with start and end lists
-def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
- #create spliced alignment object
- if species is not None: alignment = SplicedAlignment( starts, ends, species )
- else: alignment = SplicedAlignment( starts, ends, [primary_species] )
- for exon in alignment.exons:
- fill_region_alignment( exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps )
- return alignment
-
-#loop through string array, only return non-commented lines
-def line_enumerator( lines, comment_start = '#' ):
- i = 0
- for line in lines:
- if not line.startswith( comment_start ):
- i += 1
- yield ( i, line )
-
-#read a GeneBed file, return list of starts, ends, raw fields
-def get_starts_ends_fields_from_gene_bed( line ):
- #Starts and ends for exons
- starts = []
- ends = []
-
- fields = line.split()
- #Requires atleast 12 BED columns
- if len(fields) < 12:
- raise Exception( "Not a proper 12 column BED line (%s)." % line )
- chrom = fields[0]
- tx_start = int( fields[1] )
- tx_end = int( fields[2] )
- name = fields[3]
- strand = fields[5]
- if strand != '-': strand='+' #Default strand is +
- cds_start = int( fields[6] )
- cds_end = int( fields[7] )
-
- #Calculate and store starts and ends of coding exons
- region_start, region_end = cds_start, cds_end
- exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) )
- exon_starts = map( ( lambda x: x + tx_start ), exon_starts )
- exon_ends = map( int, fields[10].rstrip( ',' ).split( ',' ) )
- exon_ends = map( ( lambda x, y: x + y ), exon_starts, exon_ends );
- for start, end in zip( exon_starts, exon_ends ):
- start = max( start, region_start )
- end = min( end, region_end )
- if start < end:
- starts.append( start )
- ends.append( end )
- return ( starts, ends, fields )
-
-def iter_components_by_src( block, src ):
- for c in block.components:
- if c.src == src:
- yield c
-
-def get_components_by_src( block, src ):
- return [ value for value in iter_components_by_src( block, src ) ]
-
-def iter_components_by_src_start( block, src ):
- for c in block.components:
- if c.src.startswith( src ):
- yield c
-
-def get_components_by_src_start( block, src ):
- return [ value for value in iter_components_by_src_start( block, src ) ]
+ alignment.set_position( start_offset + gap_offset + i, spec, char )
+ return alignment
+
+#returns a filled spliced region alignment for specified region with start and end lists
+def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
+ #create spliced alignment object
+ if species is not None: alignment = SplicedAlignment( starts, ends, species )
+ else: alignment = SplicedAlignment( starts, ends, [primary_species] )
+ for exon in alignment.exons:
+ fill_region_alignment( exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps )
+ return alignment
+
+#loop through string array, only return non-commented lines
+def line_enumerator( lines, comment_start = '#' ):
+ i = 0
+ for line in lines:
+ if not line.startswith( comment_start ):
+ i += 1
+ yield ( i, line )
+
+#read a GeneBed file, return list of starts, ends, raw fields
+def get_starts_ends_fields_from_gene_bed( line ):
+ #Starts and ends for exons
+ starts = []
+ ends = []
+
+ fields = line.split()
+ #Requires atleast 12 BED columns
+ if len(fields) < 12:
+ raise Exception( "Not a proper 12 column BED line (%s)." % line )
+ chrom = fields[0]
+ tx_start = int( fields[1] )
+ tx_end = int( fields[2] )
+ name = fields[3]
+ strand = fields[5]
+ if strand != '-': strand='+' #Default strand is +
+ cds_start = int( fields[6] )
+ cds_end = int( fields[7] )
+
+ #Calculate and store starts and ends of coding exons
+ region_start, region_end = cds_start, cds_end
+ exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) )
+ exon_starts = map( ( lambda x: x + tx_start ), exon_starts )
+ exon_ends = map( int, fields[10].rstrip( ',' ).split( ',' ) )
+ exon_ends = map( ( lambda x, y: x + y ), exon_starts, exon_ends );
+ for start, end in zip( exon_starts, exon_ends ):
+ start = max( start, region_start )
+ end = min( end, region_end )
+ if start < end:
+ starts.append( start )
+ ends.append( end )
+ return ( starts, ends, fields )
+
+def iter_components_by_src( block, src ):
+ for c in block.components:
+ if c.src == src:
+ yield c
+
+def get_components_by_src( block, src ):
+ return [ value for value in iter_components_by_src( block, src ) ]
+
+def iter_components_by_src_start( block, src ):
+ for c in block.components:
+ if c.src.startswith( src ):
+ yield c
+
+def get_components_by_src_start( block, src ):
+ return [ value for value in iter_components_by_src_start( block, src ) ]
def sort_block_components_by_block( block1, block2 ):
#orders the components in block1 by the index of the component in block2
#block1 must be a subset of block2
#occurs in-place
return block1.components.sort( cmp = lambda x, y: block2.components.index( x ) - block2.components.index( y ) )
-
+
def get_species_in_maf( maf_filename ):
species = []
for block in bx.align.maf.Reader( open( maf_filename ) ):
for spec in get_species_in_block( block ):
if spec not in species:
species.append( spec )
- return species
-
-def parse_species_option( species ):
- if species:
- species = species.split( ',' )
- if 'None' not in species:
- return species
- return None #provided species was '', None, or had 'None' in it
-
-def remove_temp_index_file( index_filename ):
- try: os.unlink( index_filename )
- except: pass
-
-#Below are methods to deal with FASTA files
-
-def get_fasta_header( component, attributes = {}, suffix = None ):
- header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end() )
- for key, value in attributes.iteritems():
- header = "%s%s=%s|" % ( header, key, value )
- if suffix:
- header = "%s%s" % ( header, suffix )
- else:
- header = "%s%s" % ( header, src_split( component.src )[ 0 ] )
- return header
-
-def get_attributes_from_fasta_header( header ):
- if not header: return {}
- attributes = {}
- header = header.lstrip( '>' )
- header = header.strip()
- fields = header.split( '|' )
- try:
- region = fields[0]
- region = region.split( '(', 1 )
- temp = region[0].split( '.', 1 )
- attributes['species'] = temp[0]
- if len( temp ) == 2:
- attributes['chrom'] = temp[1]
- else:
- attributes['chrom'] = temp[0]
- region = region[1].split( ')', 1 )
- attributes['strand'] = region[0]
- region = region[1].lstrip( ':' ).split( '-' )
- attributes['start'] = int( region[0] )
- attributes['end'] = int( region[1] )
- except:
- #field 0 is not a region coordinate
- pass
- if len( fields ) > 2:
- for i in xrange( 1, len( fields ) - 1 ):
- prop = fields[i].split( '=', 1 )
- if len( prop ) == 2:
- attributes[ prop[0] ] = prop[1]
- if len( fields ) > 1:
- attributes['__suffix__'] = fields[-1]
- return attributes
-
-def iter_fasta_alignment( filename ):
- class fastaComponent:
- def __init__( self, species, text = "" ):
- self.species = species
- self.text = text
- def extend( self, text ):
- self.text = self.text + text.replace( '\n', '' ).replace( '\r', '' ).strip()
- #yields a list of fastaComponents for a FASTA file
- f = open( filename, 'rb' )
- components = []
- #cur_component = None
- while True:
- line = f.readline()
- if not line:
- if components:
- yield components
- return
- line = line.strip()
- if not line:
- if components:
- yield components
- components = []
- elif line.startswith( '>' ):
- attributes = get_attributes_from_fasta_header( line )
- components.append( fastaComponent( attributes['species'] ) )
- elif components:
- components[-1].extend( line )
-
+ return species
+
+def parse_species_option( species ):
+ if species:
+ species = species.split( ',' )
+ if 'None' not in species:
+ return species
+ return None #provided species was '', None, or had 'None' in it
+
+def remove_temp_index_file( index_filename ):
+ try: os.unlink( index_filename )
+ except: pass
+
+#Below are methods to deal with FASTA files
+
+def get_fasta_header( component, attributes = {}, suffix = None ):
+ header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end() )
+ for key, value in attributes.iteritems():
+ header = "%s%s=%s|" % ( header, key, value )
+ if suffix:
+ header = "%s%s" % ( header, suffix )
+ else:
+ header = "%s%s" % ( header, src_split( component.src )[ 0 ] )
+ return header
+
+def get_attributes_from_fasta_header( header ):
+ if not header: return {}
+ attributes = {}
+ header = header.lstrip( '>' )
+ header = header.strip()
+ fields = header.split( '|' )
+ try:
+ region = fields[0]
+ region = region.split( '(', 1 )
+ temp = region[0].split( '.', 1 )
+ attributes['species'] = temp[0]
+ if len( temp ) == 2:
+ attributes['chrom'] = temp[1]
+ else:
+ attributes['chrom'] = temp[0]
+ region = region[1].split( ')', 1 )
+ attributes['strand'] = region[0]
+ region = region[1].lstrip( ':' ).split( '-' )
+ attributes['start'] = int( region[0] )
+ attributes['end'] = int( region[1] )
+ except:
+ #field 0 is not a region coordinate
+ pass
+ if len( fields ) > 2:
+ for i in xrange( 1, len( fields ) - 1 ):
+ prop = fields[i].split( '=', 1 )
+ if len( prop ) == 2:
+ attributes[ prop[0] ] = prop[1]
+ if len( fields ) > 1:
+ attributes['__suffix__'] = fields[-1]
+ return attributes
+
+def iter_fasta_alignment( filename ):
+ class fastaComponent:
+ def __init__( self, species, text = "" ):
+ self.species = species
+ self.text = text
+ def extend( self, text ):
+ self.text = self.text + text.replace( '\n', '' ).replace( '\r', '' ).strip()
+ #yields a list of fastaComponents for a FASTA file
+ f = open( filename, 'rb' )
+ components = []
+ #cur_component = None
+ while True:
+ line = f.readline()
+ if not line:
+ if components:
+ yield components
+ return
+ line = line.strip()
+ if not line:
+ if components:
+ yield components
+ components = []
+ elif line.startswith( '>' ):
+ attributes = get_attributes_from_fasta_header( line )
+ components.append( fastaComponent( attributes['species'] ) )
+ elif components:
+ components[-1].extend( line )
+
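For readers unfamiliar with BED12, the exon bookkeeping in get_starts_ends_fields_from_gene_bed above works like this: field 11 (blockStarts) holds exon start offsets relative to txStart, field 10 (blockSizes) holds exon lengths, and each exon is clipped to the CDS interval so that only coding bases survive. Below is a minimal sketch of the same arithmetic with a hypothetical sample line; it runs under Python 2 or 3 and is not Galaxy code.

def coding_exons( bed_line ):
    #Return ( starts, ends ) for the coding portions of each exon in a BED12 line.
    fields = bed_line.split()
    if len( fields ) < 12:
        raise ValueError( "Not a proper 12 column BED line (%s)." % bed_line )
    tx_start = int( fields[1] )
    cds_start, cds_end = int( fields[6] ), int( fields[7] )
    sizes = [ int( x ) for x in fields[10].rstrip( ',' ).split( ',' ) ]
    offsets = [ int( x ) for x in fields[11].rstrip( ',\n' ).split( ',' ) ]
    starts, ends = [], []
    for offset, size in zip( offsets, sizes ):
        start = max( tx_start + offset, cds_start )
        end = min( tx_start + offset + size, cds_end )
        if start < end:  #keep only the part of the exon inside the CDS
            starts.append( start )
            ends.append( end )
    return starts, ends

#Hypothetical transcript at 100-900 with CDS 150-850 and three exons:
line = "chr1\t100\t900\tgene1\t0\t+\t150\t850\t0\t3\t100,200,100,\t0,300,700,"
print( coding_exons( line ) )  #([150, 400, 800], [200, 600, 850])

Relatedly, get_fasta_header and get_attributes_from_fasta_header above are intended to round-trip the same '|'-delimited header convention; a hypothetical example of a header they agree on:

header = '>hg18.chr1(+):100-200|score=0.9|hg18'
#get_attributes_from_fasta_header( header ) returns:
#{'species': 'hg18', 'chrom': 'chr1', 'strand': '+',
# 'start': 100, 'end': 200, 'score': '0.9', '__suffix__': 'hg18'}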
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
--
This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/bec1baa9d2db/
Changeset: bec1baa9d2db
User: dannon
Date: 2013-08-29 18:21:50
Summary: Irods get_data was using the wrong varname (f instead of h) when reading with a count
Affected #: 1 file
diff -r 444c5b5b451b50eacba3ba1d08d83e16c1027fa5 -r bec1baa9d2db009abd16778093fb15b96bde0950 lib/galaxy/objectstore/rods.py
--- a/lib/galaxy/objectstore/rods.py
+++ b/lib/galaxy/objectstore/rods.py
@@ -210,7 +210,7 @@
if count == -1:
return h.read()
else:
- return f.read( count )
+ return h.read( count )
# TODO: make sure implicit close is okay, DiskObjectStore actually
# reads data into a var, closes, and returns the var
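The broken branch only executed for partial reads, so the stale name f surfaced as a NameError only when a caller passed an explicit count; full reads (count == -1) were unaffected. A minimal, hypothetical reduction of the pattern, with illustrative names rather than the real rods.py API:

def get_data( path, count=-1 ):
    h = open( path, 'rb' )
    try:
        if count == -1:
            return h.read()     #full read: always used the right handle
        return h.read( count )  #partial read: formerly referenced an undefined 'f'
    finally:
        h.close()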
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
--
This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/444c5b5b451b/
Changeset: 444c5b5b451b
User: dan
Date: 2013-08-29 17:52:41
Summary: Log, then raise the IOError exceptions that occur in DiskObjectStore.update_from_file(), as per discussion 08/05/2013-08/12/2013.
Affected #: 1 file
diff -r 533fb8fcc330c52646529659a246e1c50dc18e6a -r 444c5b5b451b50eacba3ba1d08d83e16c1027fa5 lib/galaxy/objectstore/__init__.py
--- a/lib/galaxy/objectstore/__init__.py
+++ b/lib/galaxy/objectstore/__init__.py
@@ -344,6 +344,7 @@
except IOError, ex:
log.critical('Error copying %s to %s: %s' % (file_name,
self._get_filename(obj, **kwargs), ex))
+ raise ex
def get_object_url(self, obj, **kwargs):
return None
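A minimal sketch of the log-then-reraise pattern this commit adopts, with hypothetical paths and shutil.copy standing in for the object store's copy step:

import logging
import shutil

log = logging.getLogger( __name__ )

def update_from_file( src, dst ):
    #Record the failure for operators, then let it propagate to the caller.
    try:
        shutil.copy( src, dst )
    except IOError as ex:
        log.critical( 'Error copying %s to %s: %s' % ( src, dst, ex ) )
        raise

Note that in Python 2, 'raise ex' (as in the diff) re-raises with a new traceback, while a bare 'raise' preserves the original one.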
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
--
This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.