commit/galaxy-central: inithello: Added option to download pre-built indexes from galaxy servers.
1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/changeset/41f3a789037e/ changeset: 41f3a789037e user: inithello date: 2012-05-18 15:50:35 summary: Added option to download pre-built indexes from galaxy servers. affected #: 6 files diff -r 56c13a76faf9ad520cd1fcfe0c0cbe132a26d153 -r 41f3a789037ed5a8785a8531567871d904cf25f2 lib/galaxy/config.py --- a/lib/galaxy/config.py +++ b/lib/galaxy/config.py @@ -43,6 +43,7 @@ self.openid_consumer_cache_path = resolve_path( kwargs.get( "openid_consumer_cache_path", "database/openid_consumer_cache" ), self.root ) self.cookie_path = kwargs.get( "cookie_path", "/" ) self.genome_data_path = kwargs.get( "genome_data_path", "tool-data/genome" ) + self.rsync_url = kwargs.get( "rsync_url", "rsync://scofield.bx.psu.edu/indexes" ) # Galaxy OpenID settings self.enable_openid = string_as_bool( kwargs.get( 'enable_openid', False ) ) self.openid_config = kwargs.get( 'openid_config_file', 'openid_conf.xml' ) diff -r 56c13a76faf9ad520cd1fcfe0c0cbe132a26d153 -r 41f3a789037ed5a8785a8531567871d904cf25f2 lib/galaxy/tools/genome_index/__init__.py --- a/lib/galaxy/tools/genome_index/__init__.py +++ b/lib/galaxy/tools/genome_index/__init__.py @@ -16,7 +16,7 @@ <tool id="__GENOME_INDEX__" name="Index Genome" version="0.1" tool_type="genome_index"><type class="GenomeIndexTool" module="galaxy.tools"/><action module="galaxy.tools.actions.index_genome" class="GenomeIndexToolAction"/> - <command>$__GENOME_INDEX_COMMAND__ $output_file $output_file.files_path</command> + <command>$__GENOME_INDEX_COMMAND__ $output_file $output_file.files_path $__app__.config.rsync_url</command><inputs><param name="__GENOME_INDEX_COMMAND__" type="hidden"/></inputs> @@ -85,7 +85,7 @@ if indexer == '2bit': indexdata = os.path.join( workingdir, '%s.2bit' % dbkey ) destination = os.path.join( basepath, 'seq', '%s.2bit' % dbkey ) - location.append( dict( line='\t'.join( [ 'seq', dbkey, os.path.join( destination, '%s.2bit' % dbkey ) ] ), file= os.path.join( locbase, 'alignseq.loc' ) ) ) + location.append( dict( line='\t'.join( [ 'seq', dbkey, destination ] ), file= os.path.join( locbase, 'alignseq.loc' ) ) ) elif indexer == 'bowtie': self._ex_tar( workingdir, 'cs.tar' ) destination = os.path.join( basepath, 'bowtie_index' ) @@ -153,17 +153,52 @@ log.debug( 'Moving %s to %s' % ( indexdata, destination ) ) shutil.move( indexdata, destination ) if indexer not in [ '2bit' ]: - genome = '%s.fa' + genome = '%s.fa' % dbkey target = os.path.join( destination, genome ) - farel = os.path.relpath( os.path.join( basepath, 'seq', genome ), destination ) - os.symlink( farel, target ) + fasta = os.path.abspath( os.path.join( basepath, 'seq', genome ) ) + self._check_link( fasta, target ) if os.path.exists( os.path.join( destination, 'cs' ) ): target = os.path.join( destination, 'cs', genome ) - farel = os.path.relpath( os.path.join( basepath, 'seq', genome ), os.path.join( destination, 'cs' ) ) - os.symlink( os.path.join( farel, target ) ) + fasta = os.path.abspath( os.path.join( basepath, 'seq', genome ) ) + self._check_link( fasta, target ) for line in location: self._add_line( line[ 'file' ], line[ 'line' ] ) + def _check_link( self, targetfile, symlink ): + target = os.path.relpath( targetfile, os.path.dirname( symlink ) ) + filename = os.path.basename( targetfile ) + if not os.path.exists( targetfile ): # this should never happen. + raise Exception, "%s not found. Unable to proceed without a FASTA file. Aborting." % targetfile + if os.path.exists( symlink ) and os.path.islink( symlink ): + if os.path.realpath( symlink ) == os.path.abspath( targetfile ): # symlink exists, points to the correct FASTA file. + return + else: # no it doesn't. Make a new one, and this time do it right. + os.remove( symlink ) + os.symlink( target, symlink ) + return + elif not os.path.exists( symlink ): # no symlink to the FASTA file. Create one. + os.symlink( target, symlink ) + return + elif os.path.exists( symlink ) and not os.path.islink( symlink ): + if self._hash_file( targetfile ) == self._hash_file( symlink ): # files are identical. No need to panic. + return + else: + if os.path.getsize( symlink ) == 0: # somehow an empty file got copied instead of the symlink. Delete with extreme prejudice. + os.remove( symlink ) + os.symlink( target, symlink ) + return + else: + raise Exception, "Regular file %s exists, is not empty, contents do not match %s." % ( symlink, targetfile ) + + def _hash_file( self, filename ): + import hashlib + md5 = hashlib.md5() + with open( filename, 'rb' ) as f: + for chunk in iter( lambda: f.read( 8192 ), b'' ): + md5.update( chunk ) + return md5.digest() + + def _ex_tar( self, directory, filename ): fh = tarfile.open( os.path.join( directory, filename ) ) fh.extractall( path=directory ) @@ -181,4 +216,5 @@ if newline not in origlines: origlines.append( newline ) with open( filepath, 'w+' ) as destfile: + origlines.append( '' ) destfile.write( '\n'.join( origlines ) ) diff -r 56c13a76faf9ad520cd1fcfe0c0cbe132a26d153 -r 41f3a789037ed5a8785a8531567871d904cf25f2 lib/galaxy/tools/genome_index/index_genome.py --- a/lib/galaxy/tools/genome_index/index_genome.py +++ b/lib/galaxy/tools/genome_index/index_genome.py @@ -8,13 +8,15 @@ import optparse, sys, os, tempfile, time, subprocess, shlex, json, tarfile, shutil class ManagedIndexer(): - def __init__( self, output_file, infile, workingdir ): + def __init__( self, output_file, infile, workingdir, rsync_url ): self.workingdir = os.path.abspath( workingdir ) self.outfile = open( os.path.abspath( output_file ), 'w' ) self.basedir = os.path.split( self.workingdir )[0] self.fasta = os.path.abspath( infile ) self.locations = dict( nt=[], cs=[] ) self.log = [] + self.rsync_opts = '-aclSzq' + self.rsync_url = rsync_url self.indexers = { 'bwa': '_bwa', 'bowtie': '_bowtie', @@ -32,6 +34,7 @@ def run_indexer( self, indexer ): self.fapath = self.fasta self.fafile = os.path.basename( self.fapath ) + self.genome = os.path.splitext( self.fafile )[0] with WithChDir( self.basedir ): if indexer not in self.indexers: raise KeyError, 'The requested indexing function does not exist' @@ -42,11 +45,26 @@ if result is None: self._log( 'Error running indexer %s.' % indexer ) self._flush_files() - raise Exception + return True else: self._log( 'Indexer %s completed successfully.' % indexer ) self._flush_files() + def _check_link( self ): + self._log( 'Checking symlink to %s' % self.fafile ) + if not os.path.exists( self.fafile ): + self._log( 'Symlink not found, creating' ) + os.symlink( os.path.relpath( self.fapath ), self.fafile ) + + def _do_rsync( self, idxpath ): + self._log( 'Trying rsync at %s/%s%s' % ( self.rsync_url, self.genome, idxpath ) ) + result = subprocess.call( shlex.split( 'rsync %s %s/%s%s .' % ( self.rsync_opts, self.rsync_url, self.genome, idxpath ) ) ) + if result != 0: + self._log( 'Rsync failed or index not found. Generating.' ) + else: + self._log( 'Rsync succeeded.' ) + return result + def _flush_files( self ): json.dump( self.locations, self.outfile ) self.outfile.close() @@ -57,9 +75,12 @@ self.logfile.write( "[%s] %s\n" % (timestamp, stuff) ) def _bwa( self ): - with WithChDir( self.workingdir ): - if not os.path.exists( self.fafile ): - os.symlink( os.path.relpath( self.fapath ), self.fafile ) + result = self._do_rsync( '/bwa_index/' ) + if result == 0: + self.locations[ 'nt' ].append( self.fafile ) + return self._bwa_cs() + else: + self._check_link() command = shlex.split( 'bwa index -a bwtsw %s' % self.fafile ) result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) if result != 0: @@ -68,65 +89,82 @@ if result == 0: self.locations[ 'nt' ].append( self.fafile ) os.remove( self.fafile ) - os.makedirs( 'cs' ) - with WithChDir( 'cs' ): - if not os.path.exists( self.fafile ): - os.symlink( os.path.relpath( self.fapath ), self.fafile ) - command = shlex.split( 'bwa index -a bwtsw -c %s' % self.fafile ) - result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) - if result != 0: - newcommand = shlex.split( 'bwa index -c %s' % self.fafile ) - result = call( newcommand, stderr=self.logfile, stdout=self.logfile ) - if result == 0: - self.locations[ 'cs' ].append( self.fafile ) - os.remove( self.fafile ) - else: - return False - else: - self.locations[ 'cs' ].append( self.fafile ) - os.remove( self.fafile ) - temptar = tarfile.open( 'cs.tar', 'w' ) - temptar.add( 'cs' ) - temptar.close() - shutil.rmtree( 'cs' ) - return True + return self._bwa_cs() else: return False + def _bwa_cs( self ): + if not os.path.exists( os.path.join( self.workingdir, 'cs' ) ): + os.makedirs( 'cs' ) + with WithChDir( 'cs' ): + self._check_link() + command = shlex.split( 'bwa index -a bwtsw -c %s' % self.fafile ) + result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) + if result != 0: + newcommand = shlex.split( 'bwa index -c %s' % self.fafile ) + result = call( newcommand, stderr=self.logfile, stdout=self.logfile ) + if result == 0: + self.locations[ 'cs' ].append( self.fafile ) + os.remove( self.fafile ) + else: + return False + else: + self.locations[ 'cs' ].append( self.fafile ) + os.remove( self.fafile ) + else: + self.locations[ 'cs' ].append( self.fafile ) + temptar = tarfile.open( 'cs.tar', 'w' ) + temptar.add( 'cs' ) + temptar.close() + shutil.rmtree( 'cs' ) + return True + + def _bowtie( self ): - ref_base = os.path.splitext(self.fafile)[0] - if not os.path.exists( self.fafile ): - os.symlink( os.path.relpath( self.fapath ), self.fafile ) - command = shlex.split( 'bowtie-build -f %s %s' % ( self.fafile, ref_base ) ) - result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) + result = self._do_rsync( '/bowtie_index/' ) if result == 0: - self.locations[ 'nt' ].append( ref_base ) - os.remove( self.fafile ) - indexdir = os.path.join( os.getcwd(), 'cs' ) + self.locations[ 'nt' ].append( self.genome ) + return self._bowtie_cs() + else: + self._check_link() + command = shlex.split( 'bowtie-build -f %s %s' % ( self.fafile, self.genome ) ) + result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) + if result == 0: + self.locations[ 'nt' ].append( self.genome ) + os.remove( self.fafile ) + return self._bowtie_cs() + else: + return False + + def _bowtie_cs( self ): + indexdir = os.path.join( os.getcwd(), 'cs' ) + if not ( os.path.exists( indexdir ) ): os.makedirs( indexdir ) with WithChDir( indexdir ): - ref_base = os.path.splitext(self.fafile)[0] - if not os.path.exists( self.fafile ): - os.symlink( os.path.relpath( self.fapath ), self.fafile ) - command = shlex.split( 'bowtie-build -C -f %s %s' % ( self.fafile, ref_base ) ) + self._check_link() + command = shlex.split( 'bowtie-build -C -f %s %s' % ( self.fafile, self.genome ) ) result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) if result == 0: - self.locations[ 'cs' ].append( ref_base ) + self.locations[ 'cs' ].append( self.genome ) else: return False os.remove( os.path.join( indexdir, self.fafile ) ) - temptar = tarfile.open( 'cs.tar', 'w' ) - temptar.add( 'cs' ) - temptar.close() - shutil.rmtree( 'cs' ) - return True else: - return False + self.locations[ 'cs' ].append( self.genome ) + temptar = tarfile.open( 'cs.tar', 'w' ) + temptar.add( 'cs' ) + temptar.close() + shutil.rmtree( 'cs' ) + return True + def _bowtie2( self ): + result = self._do_rsync( '/bowtie2_index/' ) + if result == 0: + self.locations[ 'nt' ].append( self.fafile ) + return True ref_base = os.path.splitext(self.fafile)[0] - if not os.path.exists( self.fafile ): - os.symlink( os.path.relpath( self.fapath ), self.fafile ) + self._check_link() command = shlex.split( 'bowtie2-build %s %s' % ( self.fafile, ref_base ) ) result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) if result == 0: @@ -139,151 +177,92 @@ def _twobit( self ): """Index reference files using 2bit for random access. """ - ref_base = os.path.splitext(self.fafile)[0] - out_file = "%s.2bit" % ref_base - if not os.path.exists( self.fafile ): - os.symlink( os.path.relpath( self.fapath ), self.fafile ) - command = shlex.split( 'faToTwoBit %s %s' % ( self.fafile, out_file ) ) - result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) + result = self._do_rsync( '/seq/%s.2bit' % self.genome ) if result == 0: - self.locations['nt'].append( out_file ) - os.remove( self.fafile ) + self.locations['nt'].append( "%s.2bit" % self.genome ) return True else: - return False + out_file = "%s.2bit" % self.genome + self._check_link() + command = shlex.split( 'faToTwoBit %s %s' % ( self.fafile, out_file ) ) + result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) + if result == 0: + self.locations['nt'].append( out_file ) + os.remove( self.fafile ) + return True + else: + return False def _perm( self ): - local_ref = self.fafile - if not os.path.exists( local_ref ): - os.symlink( os.path.relpath( self.fapath ), self.fafile ) - genome = os.path.splitext( local_ref )[0] + result = self._do_rsync( '/perm_index/' ) + self._check_link() + genome = self.genome read_length = 50 for seed in [ 'F3', 'F4' ]: - key = '%s_%s_%s' % (genome, seed, read_length) - desc = '%s: seed=%s, read length=%s' % (genome, seed, read_length) - index = "%s_base_%s_%s.index" % (genome, seed, read_length) - command = shlex.split("PerM %s %s --readFormat fastq --seed %s -m -s %s" % (local_ref, read_length, seed, index)) - result = subprocess.call( command ) - if result == 0: - self.locations[ 'nt' ].append( [ key, desc, index ] ) - else: - return False - os.remove( local_ref ) - os.makedirs( 'cs' ) + key = '%s_%s_%s' % (self.genome, seed, read_length) + desc = '%s: seed=%s, read length=%s' % (self.genome, seed, read_length) + index = "%s_base_%s_%s.index" % (self.genome, seed, read_length) + if not os.path.exists( index ): + command = shlex.split("PerM %s %s --readFormat fastq --seed %s -m -s %s" % (self.fafile, read_length, seed, index)) + result = subprocess.call( command ) + if result != 0: + return False + self.locations[ 'nt' ].append( [ key, desc, index ] ) + os.remove( self.fafile ) + return self._perm_cs() + + def _perm_cs( self ): + if not os.path.exists( 'cs' ): + os.makedirs( 'cs' ) with WithChDir( 'cs' ): - if not os.path.exists( local_ref ): - os.symlink( os.path.relpath( self.fapath ), self.fafile ) + self._check_link() for seed in [ 'F3', 'F4' ]: key = '%s_%s_%s' % (genome, seed, read_length) desc = '%s: seed=%s, read length=%s' % (genome, seed, read_length) index = "%s_color_%s_%s.index" % (genome, seed, read_length) - command = shlex.split("PerM %s %s --readFormat csfastq --seed %s -m -s %s" % (local_ref, read_length, seed, index)) - result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) - if result == 0: - self.locations[ 'cs' ].append( [ key, desc, index ] ) - else: + if not os.path.exists( index ): + command = shlex.split("PerM %s %s --readFormat csfastq --seed %s -m -s %s" % (local_ref, read_length, seed, index)) + result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) + if result != 0: return False + self.locations[ 'cs' ].append( [ key, desc, index ] ) os.remove( local_ref ) temptar = tarfile.open( 'cs.tar', 'w' ) temptar.add( 'cs' ) temptar.close() shutil.rmtree( 'cs' ) return True - - def _bfast( self ): - """Indexes bfast in color and nucleotide space for longer reads. - - This preps for 40+bp sized reads, which is bfast's strength. - """ - dir_name_nt = 'nt' - dir_name_cs = 'cs' - window_size = 14 - bfast_nt_masks = [ - "1111111111111111111111", - "1111101110111010100101011011111", - "1011110101101001011000011010001111111", - "10111001101001100100111101010001011111", - "11111011011101111011111111", - "111111100101001000101111101110111", - "11110101110010100010101101010111111", - "111101101011011001100000101101001011101", - "1111011010001000110101100101100110100111", - "1111010010110110101110010110111011", - ] - bfast_color_masks = [ - "1111111111111111111111", - "111110100111110011111111111", - "10111111011001100011111000111111", - "1111111100101111000001100011111011", - "111111110001111110011111111", - "11111011010011000011000110011111111", - "1111111111110011101111111", - "111011000011111111001111011111", - "1110110001011010011100101111101111", - "111111001000110001011100110001100011111", - ] + + def _picard( self ): + result = self._do_rsync( '/srma_index/' ) + if result == 0 and os.path.exists( '%s.dict' % self.genome): + self.locations[ 'nt' ].append( self.fafile ) + return True local_ref = self.fafile - os.makedirs( dir_name_nt ) - os.makedirs( dir_name_cs ) - if not os.path.exists( self.fafile ): - os.symlink( os.path.relpath( self.fapath ), self.fafile ) - with WithChDir( dir_name_nt ): - if not os.path.exists( self.fafile ): - os.symlink( os.path.relpath( self.fapath ), self.fafile ) - # nucleotide space - command = shlex.split( "bfast fasta2brg -f %s -A 0" % local_ref ) - result = subprocess.call( command, stderr=self.logfile ) - for i, mask in enumerate( bfast_nt_masks ): - command = shlex.split("bfast index -d 1 -n 4 -f %s -A 0 -m %s -w %s -i %s" % - ( local_ref, mask, window_size, i + 1 ) ) - result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) - os.remove( self.fafile ) + srma = 'tool-data/shared/jars/srma.jar' + genome = os.path.splitext( self.fafile )[0] + self._check_link() + if not os.path.exists( '%s.fai' % self.fafile ) and not os.path.exists( '%s.fai' % self.genome ): + command = shlex.split( 'samtools faidx %s' % self.fafile ) + subprocess.call( command, stderr=self.logfile ) + command = shlex.split( "java -cp %s net.sf.picard.sam.CreateSequenceDictionary R=%s O=%s/%s.dict URI=%s" \ + % ( srma, local_ref, os.curdir, genome, local_ref ) ) + if not os.path.exists( '%s.dict' % self.genome ): + result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) if result != 0: return False - else: - os.remove( self.fafile ) - with WithChDir( dir_name_cs ): - if not os.path.exists( self.fafile ): - os.symlink( os.path.relpath( self.fapath ), self.fafile ) - # colorspace - command = shlex.split( "bfast fasta2brg -f %s -A 1" % local_ref ) - result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) - for i, mask in enumerate( bfast_color_masks ): - command = shlex.split( "bfast index -d 1 -n 4 -f %s -A 1 -m %s -w %s -i %s" % - ( local_ref, mask, window_size, i + 1 ) ) - result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) - if result != 0: - return False - else: - os.remove( self.fafile ) - self.locations = None - return True - - def _picard( self ): - local_ref = self.fafile - srma = '/Users/dave/srma.jar' - genome = os.path.splitext( self.fafile )[0] - if not os.path.exists( self.fafile ): - os.symlink( os.path.relpath( self.fapath ), self.fafile ) - command = shlex.split( 'samtools faidx %s' % self.fafile ) - subprocess.call( command, stderr=self.logfile ) - os.rename( '%s.fai' % self.fafile, '%s.fai' % genome ) - command = shlex.split( "java -cp %s net.sf.picard.sam.CreateSequenceDictionary R=%s O=%s/%s.dict URI=%s" \ - % ( srma, local_ref, os.curdir, genome, local_ref ) ) - result = subprocess.call( command, stderr=self.logfile, stdout=self.logfile ) - if result != 0: - return False - else: - self.locations[ 'nt' ].append( self.fafile ) - #os.remove( '%s.fai' % genome ) - os.remove( self.fafile ) - return True + self.locations[ 'nt' ].append( self.fafile ) + os.remove( self.fafile ) + return True def _sam( self ): local_ref = self.fafile local_file = os.path.splitext( self.fafile )[ 0 ] - if not os.path.exists( local_ref ): - os.symlink( os.path.relpath( self.fapath ), self.fafile ) + result = self._do_rsync( '/sam_index/' ) + if result == 0 and ( os.path.exists( '%s.fai' % self.fafile ) or os.path.exists( '%s.fai' % self.genome ) ): + self.locations[ 'nt' ].append( local_ref ) + return True + self._check_link() command = shlex.split("samtools faidx %s" % local_ref) result = subprocess.call( command, stderr=self.logfile ) if result != 0: @@ -307,9 +286,9 @@ # Parse command line. parser = optparse.OptionParser() (options, args) = parser.parse_args() - indexer, infile, outfile, working_dir = args + indexer, infile, outfile, working_dir, rsync_url = args # Create archive. - idxobj = ManagedIndexer( outfile, infile, working_dir ) + idxobj = ManagedIndexer( outfile, infile, working_dir, rsync_url ) idxobj.run_indexer( indexer ) \ No newline at end of file diff -r 56c13a76faf9ad520cd1fcfe0c0cbe132a26d153 -r 41f3a789037ed5a8785a8531567871d904cf25f2 lib/galaxy/web/controllers/data_admin.py --- a/lib/galaxy/web/controllers/data_admin.py +++ b/lib/galaxy/web/controllers/data_admin.py @@ -23,7 +23,8 @@ downloaded='state-color-running', new='state-color-new', ok='panel-done-message', - error='panel-error-message' + error='panel-error-message', + queued='state-color-waiting' ) @web.expose diff -r 56c13a76faf9ad520cd1fcfe0c0cbe132a26d153 -r 41f3a789037ed5a8785a8531567871d904cf25f2 universe_wsgi.ini.sample --- a/universe_wsgi.ini.sample +++ b/universe_wsgi.ini.sample @@ -120,6 +120,9 @@ # Path where genome builds are stored. This defaults to tool-data/genome #genome_data_path = tool-data/genome +# URL for rsync server to download pre-built indexes. +#rsync_url = rsync://scofield.bx.psu.edu/indexes + # Dataset files are stored in this directory. #file_path = database/files Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
Bitbucket