commit/galaxy-central: jmchilton: Merged in BjoernGruening/galaxy-central-1/compressed_upload (pull request #630)
1 new commit in galaxy-central:

https://bitbucket.org/galaxy/galaxy-central/commits/9ef517d8e767/
Changeset:   9ef517d8e767
User:        jmchilton
Date:        2015-01-23 19:13:25+00:00
Summary:     Merged in BjoernGruening/galaxy-central-1/compressed_upload (pull request #630)

Add CompressedArchive as a datatype and do not uncompress it during upload.

Affected #:  3 files

diff -r a6855d0b02d70cd35ae312f48bd704590805ca15 -r 9ef517d8e767037e1633f6063c6edb3023f86499 config/datatypes_conf.xml.sample
--- a/config/datatypes_conf.xml.sample
+++ b/config/datatypes_conf.xml.sample
@@ -141,6 +141,8 @@
         <datatype extension="pbm" type="galaxy.datatypes.images:Pbm" mimetype="image/pbm"/>
         <datatype extension="pgm" type="galaxy.datatypes.images:Pgm" mimetype="image/pgm"/>
         <datatype extension="rna_eps" type="galaxy.datatypes.sequence:RNADotPlotMatrix" mimetype="image/eps" display_in_upload="True"/>
+        <datatype extension="searchgui_archive" type="galaxy.datatypes.binary:CompressedArchive" subclass="True" display_in_upload="True"/>
+        <datatype extension="peptideshaker_archive" type="galaxy.datatypes.binary:CompressedArchive" subclass="True" display_in_upload="True"/>
         <datatype extension="eps" type="galaxy.datatypes.images:Eps" mimetype="image/eps"/>
         <datatype extension="rast" type="galaxy.datatypes.images:Rast" mimetype="image/rast"/>
         <datatype extension="laj" type="galaxy.datatypes.images:Laj"/>

diff -r a6855d0b02d70cd35ae312f48bd704590805ca15 -r 9ef517d8e767037e1633f6063c6edb3023f86499 lib/galaxy/datatypes/binary.py
--- a/lib/galaxy/datatypes/binary.py
+++ b/lib/galaxy/datatypes/binary.py
@@ -106,6 +106,30 @@
 Binary.register_unsniffable_binary_ext("ab1")
 
 
+class CompressedArchive( Binary ):
+    """
+    Class describing a compressed binary file
+    This class can be subclassed to implement archive filetypes that will not be unpacked by upload.py.
+    """
+    file_ext = "compressed_archive"
+    compressed = True
+
+    def set_peek( self, dataset, is_multi_byte=False ):
+        if not dataset.dataset.purged:
+            dataset.peek = "Compressed binary file"
+            dataset.blurb = data.nice_size( dataset.get_size() )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+    def display_peek( self, dataset ):
+        try:
+            return dataset.peek
+        except:
+            return "Compressed binary file (%s)" % ( data.nice_size( dataset.get_size() ) )
+
+Binary.register_unsniffable_binary_ext("compressed_archive")
+
 class GenericAsn1Binary( Binary ):
     """Class for generic ASN.1 binary format"""
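A note on the mechanics before the upload tool diff below: the class attribute compressed = True is the flag that tools/data_source/upload.py now inspects via getattr( root_datatype, 'compressed', False ). The two datatypes_conf.xml.sample entries above get the same effect declaratively with subclass="True". A hypothetical Python-side equivalent would look like the sketch below; the class name merely mirrors the XML entry and is not code from this changeset:

    # Sketch: a Python subclass equivalent to the searchgui_archive XML entry.
    # Hypothetical illustration, not part of this commit.
    from galaxy.datatypes.binary import Binary, CompressedArchive

    class SearchGuiArchive( CompressedArchive ):
        """SearchGUI results kept as one intact archive on upload."""
        file_ext = "searchgui_archive"
        # 'compressed = True' is inherited from CompressedArchive, so
        # upload.py will store the file as-is instead of unpacking it.

    Binary.register_unsniffable_binary_ext( "searchgui_archive" )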
diff -r a6855d0b02d70cd35ae312f48bd704590805ca15 -r 9ef517d8e767037e1633f6063c6edb3023f86499 tools/data_source/upload.py
--- a/tools/data_source/upload.py
+++ b/tools/data_source/upload.py
@@ -120,171 +120,176 @@
         data_type = type_info[0]
         ext = type_info[1]
     if not data_type:
-        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
-        is_gzipped, is_valid = check_gzip( dataset.path )
-        if is_gzipped and not is_valid:
-            file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
-            return
-        elif is_gzipped and is_valid:
-            if link_data_only == 'copy_files':
-                # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
-                CHUNK_SIZE = 2**20 # 1Mb
-                fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
-                while 1:
-                    try:
-                        chunk = gzipped_file.read( CHUNK_SIZE )
-                    except IOError:
-                        os.close( fd )
-                        os.remove( uncompressed )
-                        file_err( 'Problem decompressing gzipped data', dataset, json_file )
-                        return
-                    if not chunk:
-                        break
-                    os.write( fd, chunk )
-                os.close( fd )
-                gzipped_file.close()
-                # Replace the gzipped file with the decompressed file if it's safe to do so
-                if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
-                    dataset.path = uncompressed
-                else:
-                    shutil.move( uncompressed, dataset.path )
-                    os.chmod(dataset.path, 0644)
-                dataset.name = dataset.name.rstrip( '.gz' )
-            data_type = 'gzip'
-        if not data_type and bz2 is not None:
-            # See if we have a bz2 file, much like gzip
-            is_bzipped, is_valid = check_bz2( dataset.path )
-            if is_bzipped and not is_valid:
+        root_datatype = registry.get_datatype_by_extension( dataset.file_type )
+        if getattr( root_datatype, 'compressed', False ):
+            data_type = 'compressed archive'
+            ext = dataset.file_type
+        else:
+            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
+            is_gzipped, is_valid = check_gzip( dataset.path )
+            if is_gzipped and not is_valid:
                 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
                 return
-            elif is_bzipped and is_valid:
+            elif is_gzipped and is_valid:
                 if link_data_only == 'copy_files':
-                    # We need to uncompress the temp_name file
+                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                     CHUNK_SIZE = 2**20 # 1Mb
-                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                    bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                    gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
                     while 1:
                         try:
-                            chunk = bzipped_file.read( CHUNK_SIZE )
+                            chunk = gzipped_file.read( CHUNK_SIZE )
                         except IOError:
                             os.close( fd )
                             os.remove( uncompressed )
-                            file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+                            file_err( 'Problem decompressing gzipped data', dataset, json_file )
                             return
                         if not chunk:
                             break
                         os.write( fd, chunk )
                     os.close( fd )
-                    bzipped_file.close()
-                    # Replace the bzipped file with the decompressed file if it's safe to do so
+                    gzipped_file.close()
+                    # Replace the gzipped file with the decompressed file if it's safe to do so
                     if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
                         dataset.path = uncompressed
                     else:
                         shutil.move( uncompressed, dataset.path )
                         os.chmod(dataset.path, 0644)
-                    dataset.name = dataset.name.rstrip( '.bz2' )
-                data_type = 'bz2'
-        if not data_type:
-            # See if we have a zip archive
-            is_zipped = check_zip( dataset.path )
-            if is_zipped:
-                if link_data_only == 'copy_files':
-                    CHUNK_SIZE = 2**20 # 1Mb
-                    uncompressed = None
-                    uncompressed_name = None
-                    unzipped = False
-                    z = zipfile.ZipFile( dataset.path )
-                    for name in z.namelist():
-                        if name.endswith('/'):
-                            continue
-                        if unzipped:
-                            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
-                            break
-                        fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                        if sys.version_info[:2] >= ( 2, 6 ):
-                            zipped_file = z.open( name )
-                            while 1:
+                    dataset.name = dataset.name.rstrip( '.gz' )
+                data_type = 'gzip'
+            if not data_type and bz2 is not None:
+                # See if we have a bz2 file, much like gzip
+                is_bzipped, is_valid = check_bz2( dataset.path )
+                if is_bzipped and not is_valid:
+                    file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
+                    return
+                elif is_bzipped and is_valid:
+                    if link_data_only == 'copy_files':
+                        # We need to uncompress the temp_name file
+                        CHUNK_SIZE = 2**20 # 1Mb
+                        fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                        bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+                        while 1:
+                            try:
+                                chunk = bzipped_file.read( CHUNK_SIZE )
+                            except IOError:
+                                os.close( fd )
+                                os.remove( uncompressed )
+                                file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+                                return
+                            if not chunk:
+                                break
+                            os.write( fd, chunk )
+                        os.close( fd )
+                        bzipped_file.close()
+                        # Replace the bzipped file with the decompressed file if it's safe to do so
+                        if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                            dataset.path = uncompressed
+                        else:
+                            shutil.move( uncompressed, dataset.path )
+                            os.chmod(dataset.path, 0644)
+                        dataset.name = dataset.name.rstrip( '.bz2' )
+                    data_type = 'bz2'
+            if not data_type:
+                # See if we have a zip archive
+                is_zipped = check_zip( dataset.path )
+                if is_zipped:
+                    if link_data_only == 'copy_files':
+                        CHUNK_SIZE = 2**20 # 1Mb
+                        uncompressed = None
+                        uncompressed_name = None
+                        unzipped = False
+                        z = zipfile.ZipFile( dataset.path )
+                        for name in z.namelist():
+                            if name.endswith('/'):
+                                continue
+                            if unzipped:
+                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
+                                break
+                            fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                            if sys.version_info[:2] >= ( 2, 6 ):
+                                zipped_file = z.open( name )
+                                while 1:
+                                    try:
+                                        chunk = zipped_file.read( CHUNK_SIZE )
+                                    except IOError:
+                                        os.close( fd )
+                                        os.remove( uncompressed )
+                                        file_err( 'Problem decompressing zipped data', dataset, json_file )
+                                        return
+                                    if not chunk:
+                                        break
+                                    os.write( fd, chunk )
+                                os.close( fd )
+                                zipped_file.close()
+                                uncompressed_name = name
+                                unzipped = True
+                            else:
+                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                 try:
-                                    chunk = zipped_file.read( CHUNK_SIZE )
+                                    outfile = open( uncompressed, 'wb' )
+                                    outfile.write( z.read( name ) )
+                                    outfile.close()
+                                    uncompressed_name = name
+                                    unzipped = True
                                 except IOError:
                                     os.close( fd )
                                     os.remove( uncompressed )
                                     file_err( 'Problem decompressing zipped data', dataset, json_file )
                                     return
-                                if not chunk:
-                                    break
-                                os.write( fd, chunk )
-                            os.close( fd )
-                            zipped_file.close()
-                            uncompressed_name = name
-                            unzipped = True
+                        z.close()
+                        # Replace the zipped file with the decompressed file if it's safe to do so
+                        if uncompressed is not None:
+                            if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                                dataset.path = uncompressed
+                            else:
+                                shutil.move( uncompressed, dataset.path )
+                                os.chmod(dataset.path, 0644)
+                            dataset.name = uncompressed_name
+                    data_type = 'zip'
+            if not data_type:
+                # TODO refactor this logic. check_binary isn't guaranteed to be
+                # correct since it only looks at whether the first 100 chars are
+                # printable or not. If someone specifies a known unsniffable
+                # binary datatype and check_binary fails, the file gets mangled.
+                if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type):
+                    # We have a binary dataset, but it is not Bam, Sff or Pdf
+                    data_type = 'binary'
+                    #binary_ok = False
+                    parts = dataset.name.split( "." )
+                    if len( parts ) > 1:
+                        ext = parts[-1].strip().lower()
+                        if not Binary.is_ext_unsniffable(ext):
+                            file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
+                            return
+                        elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
+                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
+                            file_err( err_msg, dataset, json_file )
+                            return
+            if not data_type:
+                # We must have a text file
+                if check_html( dataset.path ):
+                    file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
+                    return
+            if data_type != 'binary':
+                if link_data_only == 'copy_files':
+                    if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
+                        in_place = False
+                    # Convert universal line endings to Posix line endings, but allow the user to turn it off,
+                    # so that is becomes possible to upload gzip, bz2 or zip files with binary data without
+                    # corrupting the content of those files.
+                    if dataset.to_posix_lines:
+                        tmpdir = output_adjacent_tmpdir( output_path )
+                        tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
+                        if dataset.space_to_tab:
+                            line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
                         else:
-                            # python < 2.5 doesn't have a way to read members in chunks(!)
-                            try:
-                                outfile = open( uncompressed, 'wb' )
-                                outfile.write( z.read( name ) )
-                                outfile.close()
-                                uncompressed_name = name
-                                unzipped = True
-                            except IOError:
-                                os.close( fd )
-                                os.remove( uncompressed )
-                                file_err( 'Problem decompressing zipped data', dataset, json_file )
-                                return
-                    z.close()
-                    # Replace the zipped file with the decompressed file if it's safe to do so
-                    if uncompressed is not None:
-                        if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
-                            dataset.path = uncompressed
-                        else:
-                            shutil.move( uncompressed, dataset.path )
-                            os.chmod(dataset.path, 0644)
-                        dataset.name = uncompressed_name
-                data_type = 'zip'
-        if not data_type:
-            # TODO refactor this logic. check_binary isn't guaranteed to be
-            # correct since it only looks at whether the first 100 chars are
-            # printable or not. If someone specifies a known unsniffable
-            # binary datatype and check_binary fails, the file gets mangled.
-            if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type):
-                # We have a binary dataset, but it is not Bam, Sff or Pdf
-                data_type = 'binary'
-                #binary_ok = False
-                parts = dataset.name.split( "." )
-                if len( parts ) > 1:
-                    ext = parts[-1].strip().lower()
-                    if not Binary.is_ext_unsniffable(ext):
-                        file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
-                        return
-                    elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
-                        err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
-                        file_err( err_msg, dataset, json_file )
-                        return
-        if not data_type:
-            # We must have a text file
-            if check_html( dataset.path ):
-                file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
-                return
-    if data_type != 'binary':
-        if link_data_only == 'copy_files':
-            if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
-                in_place = False
-            # Convert universal line endings to Posix line endings, but allow the user to turn it off,
-            # so that is becomes possible to upload gzip, bz2 or zip files with binary data without
-            # corrupting the content of those files.
-            if dataset.to_posix_lines:
-                tmpdir = output_adjacent_tmpdir( output_path )
-                tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
-                if dataset.space_to_tab:
-                    line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
-                else:
-                    line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
-        if dataset.file_type == 'auto':
-            ext = sniff.guess_ext( dataset.path, registry.sniff_order )
-        else:
-            ext = dataset.file_type
-        data_type = ext
+                            line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
+                if dataset.file_type == 'auto':
+                    ext = sniff.guess_ext( dataset.path, registry.sniff_order )
+                else:
+                    ext = dataset.file_type
+                data_type = ext
     # Save job info for the framework
     if ext == 'auto' and dataset.ext:
         ext = dataset.ext
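For readers skimming the upload.py hunk: the behavioral change is confined to the new guard at the top; the gzip, bz2 and zip branches are unchanged apart from being re-indented one level under the new else. A condensed sketch of the guard, assuming the registry and dataset objects that add_file() already receives (illustrative helper name, not the verbatim tool code):

    # Condensed sketch of the new upload gate; the helper name is hypothetical.
    def is_declared_compressed( dataset, registry ):
        # Datatype the user selected in the upload form, e.g. 'searchgui_archive'.
        root_datatype = registry.get_datatype_by_extension( dataset.file_type )
        # CompressedArchive subclasses carry compressed = True; anything else
        # falls through to the existing gzip/bz2/zip sniffing.
        return getattr( root_datatype, 'compressed', False )

    # Used at the top of the type-detection block in add_file():
    #     if is_declared_compressed( dataset, registry ):
    #         data_type = 'compressed archive'
    #         ext = dataset.file_type

In practice a file uploaded as searchgui_archive or peptideshaker_archive keeps its original compressed bytes on disk, while the same bytes uploaded with file type 'auto' are still sniffed and unpacked as before.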
Repository URL: https://bitbucket.org/galaxy/galaxy-central/