[hg] galaxy 2399: Directly stream .tar.gz and .tar.bz2 library downloads.
details: http://www.bx.psu.edu/hg/galaxy/rev/3abf689d6f6f changeset: 2399:3abf689d6f6f user: Nate Coraor <nate@bx.psu.edu> date: Tue May 05 12:03:40 2009 -0400 description: Directly stream .tar.gz and .tar.bz2 library downloads. .zip is still created on-disk first since Python's zipfile can't write to file-like objects like tarfile can. For response body generators (e.g. this feature) to work properly, you must set use_interactive and debug to False in universe_wsgi.ini. 3 file(s) affected in this change: lib/galaxy/util/streamball.py lib/galaxy/web/controllers/library.py templates/library/browse_library.mako diffs (277 lines): diff -r 6c76023580e3 -r 3abf689d6f6f lib/galaxy/util/streamball.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/galaxy/util/streamball.py Tue May 05 12:03:40 2009 -0400 @@ -0,0 +1,50 @@ +""" +A simple wrapper for writing tarballs as a stream. The work is performed in a +thread and data is written to a Queue instead of a file. +""" + +import logging, tarfile + +from Queue import Queue, Empty, Full +from threading import Thread + +log = logging.getLogger( __name__ ) + +class QueueArchive( object ): + queue_size = 32 + def __init__( self ): + self.queue = Queue( QueueArchive.queue_size ) + self.get = self.queue.get + self.empty = self.queue.empty + def write( self, data ): + self.queue.put( data, block=True, timeout=300 ) + def tell( self ): + return 0 + +class StreamBall( object ): + def __init__( self, mode, members={} ): + self.mode = mode + self.members = members + self.tarfileobj = QueueArchive() + def add( self, file, relpath ): + self.members[file] = relpath + def stream( self ): + t = Thread( target=self.thread_write ) + t.start() + while t.isAlive(): + try: + yield self.tarfileobj.get( block=False ) + except Empty: + pass + t.join() + # exhaust the queue + while not self.tarfileobj.empty(): + yield self.tarfileobj.get() + def thread_write( self ): + tf = tarfile.open( mode=self.mode, fileobj=self.tarfileobj ) + try: + for 
file, rel in self.members.items(): + tf.add( file, arcname=rel ) + tf.close() + except Full: + log.warning( 'Queue full for longer than 300 seconds, timing out' ) diff -r 6c76023580e3 -r 3abf689d6f6f lib/galaxy/web/controllers/library.py --- a/lib/galaxy/web/controllers/library.py Tue May 05 08:29:23 2009 -0400 +++ b/lib/galaxy/web/controllers/library.py Tue May 05 12:03:40 2009 -0400 @@ -2,6 +2,7 @@ from galaxy.model.orm import * from galaxy.datatypes import sniff from galaxy import util +from galaxy.util.streamball import StreamBall import logging, tempfile, zipfile, tarfile, os, sys if sys.version_info[:2] < ( 2, 6 ): @@ -10,6 +11,40 @@ zipfile.LargeZipFile = zipfile.error log = logging.getLogger( __name__ ) + +# Test for available compression types +tmpd = tempfile.mkdtemp() +comptypes = [] +for comptype in ( 'gz', 'bz2' ): + tmpf = os.path.join( tmpd, 'compression_test.tar.' + comptype ) + try: + archive = tarfile.open( tmpf, 'w:' + comptype ) + archive.close() + comptypes.append( comptype ) + except tarfile.CompressionError: + log.exception( "Compression error when testing %s compression. This option will be disabled for library downloads." % comptype ) + try: + os.unlink( tmpf ) + except OSError: + pass +ziptype = '32' +tmpf = os.path.join( tmpd, 'compression_test.zip' ) +try: + archive = zipfile.ZipFile( tmpf, 'w', zipfile.ZIP_DEFLATED, True ) + archive.close() + comptypes.append( 'zip' ) + ziptype = '64' +except RuntimeError: + log.exception( "Compression error when testing zip compression. This option will be disabled for library downloads." ) +except (TypeError, zipfile.LargeZipFile): + # ZIP64 is only in Python2.5+. 
Remove TypeError when 2.4 support is dropped + log.warning( 'Max zip file size is 2GB, ZIP64 not supported' ) + comptypes.append( 'zip' ) +try: + os.unlink( tmpf ) +except OSError: + pass +os.rmdir( tmpd ) class Library( BaseController ): @web.expose @@ -66,6 +101,7 @@ library=trans.app.model.Library.get( id ), created_ldda_ids=created_ldda_ids, default_action=params.get( 'default_action', None ), + comptypes=comptypes, msg=msg, messagetype=messagetype ) @web.expose @@ -195,49 +231,21 @@ msg=util.sanitize_text( msg ), messagetype=messagetype ) ) else: - # Can't use mkstemp - the file must not exist first try: - tmpd = tempfile.mkdtemp() - tmpf = os.path.join( tmpd, 'library_download.' + params.do_action ) if params.do_action == 'zip': - try: + # Can't use mkstemp - the file must not exist first + tmpd = tempfile.mkdtemp() + tmpf = os.path.join( tmpd, 'library_download.' + params.do_action ) + if ziptype == '64': archive = zipfile.ZipFile( tmpf, 'w', zipfile.ZIP_DEFLATED, True ) - except RuntimeError: - log.exception( "Compression error when opening zipfile for library download" ) - msg = "ZIP compression is not available in this Python, please notify an administrator" - return trans.response.send_redirect( web.url_for( controller='library', - action='browse_library', - id=library_id, - msg=util.sanitize_text( msg ), - messagetype='error' ) ) - except (TypeError, zipfile.LargeZipFile): - # ZIP64 is only in Python2.5+. 
Remove TypeError when 2.4 support is dropped - log.warning( 'Max zip file size is 2GB, ZIP64 not supported' ) + else: archive = zipfile.ZipFile( tmpf, 'w', zipfile.ZIP_DEFLATED ) archive.add = lambda x, y: archive.write( x, y.encode('CP437') ) elif params.do_action == 'tgz': - try: - archive = tarfile.open( tmpf, 'w:gz' ) - except tarfile.CompressionError: - log.exception( "Compression error when opening tarfile for library download" ) - msg = "gzip compression is not available in this Python, please notify an administrator" - return trans.response.send_redirect( web.url_for( controller='library', - action='browse_library', - id=library_id, - msg=util.sanitize_text( msg ), - messagetype='error' ) ) + archive = util.streamball.StreamBall( 'w|gz' ) elif params.do_action == 'tbz': - try: - archive = tarfile.open( tmpf, 'w:bz2' ) - except tarfile.CompressionError: - log.exception( "Compression error when opening tarfile for library download" ) - msg = "bzip2 compression is not available in this Python, please notify an administrator" - return trans.response.send_redirect( web.url_for( controller='library', - action='browse_library', - id=library_id, - msg=util.sanitize_text( msg ), - messagetype='error' ) ) - except (OSError, zipfile.BadZipFile, tarfile.ReadError): + archive = util.streamball.StreamBall( 'w|bz2' ) + except (OSError, zipfile.BadZipFile): log.exception( "Unable to create archive for download" ) msg = "Unable to create archive for download, please report this error" return trans.response.send_redirect( web.url_for( controller='library', @@ -255,9 +263,11 @@ path = "" parent_folder = ldda.library_dataset.folder while parent_folder is not None: - path = os.path.join( parent_folder.name, path ) + # Exclude the now-hidden "root folder" if parent_folder.parent is None: path = os.path.join( parent_folder.library_root[0].name, path ) + break + path = os.path.join( parent_folder.name, path ) parent_folder = parent_folder.parent path += ldda.name while path in 
seen: @@ -273,22 +283,28 @@ id=library_id, msg=util.sanitize_text( msg ), messagetype='error' ) ) - archive.close() - tmpfh = open( tmpf ) - # clean up now - try: - os.unlink( tmpf ) - os.rmdir( tmpd ) - except OSError: - log.exception( "Unable to remove temporary library download archive and directory" ) - msg = "Unable to create archive for download, please report this error" - return trans.response.send_redirect( web.url_for( controller='library', - action='browse_library', - id=library_id, - msg=util.sanitize_text( msg ), - messagetype='error' ) ) - trans.response.headers[ "Content-Disposition" ] = "attachment; filename=GalaxyLibraryFiles.%s" % params.do_action - return tmpfh + if params.do_action == 'zip': + archive.close() + tmpfh = open( tmpf ) + # clean up now + try: + os.unlink( tmpf ) + os.rmdir( tmpd ) + except OSError: + log.exception( "Unable to remove temporary library download archive and directory" ) + msg = "Unable to create archive for download, please report this error" + return trans.response.send_redirect( web.url_for( controller='library', + action='browse_library', + id=library_id, + msg=util.sanitize_text( msg ), + messagetype='error' ) ) + trans.response.set_content_type( "application/x-zip-compressed" ) + trans.response.headers[ "Content-Disposition" ] = "attachment; filename=GalaxyLibraryFiles.%s" % params.do_action + return tmpfh + else: + trans.response.set_content_type( "application/x-tar" ) + trans.response.headers[ "Content-Disposition" ] = "attachment; filename=GalaxyLibraryFiles.%s" % params.do_action + return archive.stream() @web.expose def download_dataset_from_folder(self, trans, id, library_id=None, **kwd): """Catches the dataset id and displays file contents as directed""" diff -r 6c76023580e3 -r 3abf689d6f6f templates/library/browse_library.mako --- a/templates/library/browse_library.mako Tue May 05 08:29:23 2009 -0400 +++ b/templates/library/browse_library.mako Tue May 05 12:03:40 2009 -0400 @@ -268,13 +268,38 @@ # This 
condition should not contain an else clause because the user is not authorized # to manage dataset permissions unless the default action is 'manage_permissions' %endif - %if default_action == 'download': - <option value="zip" selected>Download selected datasets as a .zip file</option> - %else: + %if 'bz2' in comptypes: + <option value="tbz" + %if default_action == 'download': + selected + %endif> + >Download selected datasets as a .tar.bz2 file</option> + %endif + %if 'gz' in comptypes: + <option value="tgz">Download selected datasets as a .tar.gz file</option> + %endif + %if 'zip' in comptypes: <option value="zip">Download selected datasets as a .zip file</option> %endif - <option value="tgz">Download selected datasets as a .tar.gz file</option> - <option value="tbz">Download selected datasets as a .tar.bz2 file</option> </select> <input type="submit" class="primary-button" name="action_on_datasets_button" id="action_on_datasets_button" value="Go"/> </form> + +%if len( comptypes ) > 1: + <div> + <p class="infomark"> + TIP: Multiple compression options are available for downloading library datasets: + </p> + <ul style="padding-left: 1em; list-style-type: disc;"> + %if 'bz2' in comptypes: + <li>bzip2: Compression takes the most time but is better for slower network connections (that transfer slower than the rate of compression) since the resulting file size is smallest.</li> + %endif + %if 'gz' in comptypes: + <li>gzip: Compression is faster and yields a larger file, making it more suitable for fast network connections.</li> + %endif + %if 'zip' in comptypes: + <li>ZIP: Not recommended but is provided as an option for those on Windows without WinZip (since WinZip can read .bz2 and .gz files).</li> + %endif + </ul> + </div> +%endif
participants (1)
-
Nate Coraor