details:   http://www.bx.psu.edu/hg/galaxy/rev/15756ebb2b11
changeset: 2889:15756ebb2b11
user:      Greg Von Kuster <greg@bx.psu.edu>
date:      Thu Oct 15 20:10:32 2009 -0400
description:
Fixes and new functional tests for uploading multi-byte character files and fixes for displaying multi-byte character dataset peek.

8 file(s) affected in this change:

lib/galaxy/datatypes/data.py
lib/galaxy/jobs/__init__.py
lib/galaxy/model/__init__.py
lib/galaxy/tools/actions/upload_common.py
lib/galaxy/tools/parameters/grouping.py
test-data/asian_chars_1.txt
test/functional/test_get_data.py
tools/data_source/upload.py

diffs (269 lines):

diff -r e945dcdfd578 -r 15756ebb2b11 lib/galaxy/datatypes/data.py
--- a/lib/galaxy/datatypes/data.py	Thu Oct 15 19:01:07 2009 -0400
+++ b/lib/galaxy/datatypes/data.py	Thu Oct 15 20:10:32 2009 -0400
@@ -136,7 +136,7 @@
             line = line.strip()
             if not line:
                 continue
-            out.append( '<tr><td>%s</td></tr>' % escape( unicode( line ) ) )
+            out.append( '<tr><td>%s</td></tr>' % escape( unicode( line, 'utf-8' ) ) )
         out.append( '</table>' )
         out = "".join( out )
     except Exception, exc:
@@ -437,8 +437,8 @@
     temp.close()
     if file_type in [ 'gzipped', 'binary' ]:
         text = "%s file" % file_type
-    else:
-        text = unicode( '\n'.join( lines ), 'utf-8' )
+    else:
+        text = unicode( '\n'.join( lines ), 'utf-8' )
     return text
 
 def get_line_count(file_name):
diff -r e945dcdfd578 -r 15756ebb2b11 lib/galaxy/jobs/__init__.py
--- a/lib/galaxy/jobs/__init__.py	Thu Oct 15 19:01:07 2009 -0400
+++ b/lib/galaxy/jobs/__init__.py	Thu Oct 15 20:10:32 2009 -0400
@@ -535,12 +535,12 @@
             dataset.metadata.from_JSON_dict( self.external_output_metadata.get_output_filenames_by_dataset( dataset ).filename_out )
         try:
             assert context.get( 'line_count', None ) is not None
-            if self.tool.is_multi_byte:
+            if ( not dataset.datatype.composite_type and dataset.dataset.is_multi_byte() ) or self.tool.is_multi_byte:
                 dataset.set_multi_byte_peek( line_count=context['line_count'] )
             else:
                 dataset.set_peek( line_count=context['line_count'] )
         except:
-            if self.tool.is_multi_byte:
+            if ( not dataset.datatype.composite_type and dataset.dataset.is_multi_byte() ) or self.tool.is_multi_byte:
                 dataset.set_multi_byte_peek()
             else:
                 dataset.set_peek()
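The data.py hunk above replaces unicode( line ) with unicode( line, 'utf-8' ). A minimal sketch of why the explicit encoding matters on Python 2 (the sample bytes are illustrative, not from the changeset):

    # Python 2: unicode() without an encoding argument decodes as ASCII,
    # so any multi-byte UTF-8 input raises UnicodeDecodeError.
    line = '\xe8\x9b\x8b'  # UTF-8 bytes for the CJK character U+86CB
    try:
        unicode( line )  # implicit ASCII decode
    except UnicodeDecodeError:
        print 'implicit ASCII decode fails on multi-byte input'
    print unicode( line, 'utf-8' )  # explicit UTF-8 decode succeeds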
diff -r e945dcdfd578 -r 15756ebb2b11 lib/galaxy/model/__init__.py
--- a/lib/galaxy/model/__init__.py	Thu Oct 15 19:01:07 2009 -0400
+++ b/lib/galaxy/model/__init__.py	Thu Oct 15 20:10:32 2009 -0400
@@ -5,7 +5,7 @@
 the relationship cardinalities are obvious (e.g. prefer Dataset to Data)
 """
-import os.path, os, errno, sys
+import os.path, os, errno, sys, codecs
 import galaxy.datatypes
 from galaxy.util.bunch import Bunch
 from galaxy import util
@@ -418,7 +418,13 @@
         return self.get_size() > 0
     def mark_deleted( self, include_children=True ):
         self.deleted = True
-
+    def is_multi_byte( self ):
+        if not self.has_data():
+            return False
+        try:
+            return util.is_multi_byte( codecs.open( self.file_name, 'r', 'utf-8' ).read( 100 ) )
+        except UnicodeDecodeError, e:
+            return False
     # FIXME: sqlalchemy will replace this
     def _delete(self):
         """Remove the file that corresponds to this data"""
@@ -433,7 +439,7 @@
     permitted_actions = Dataset.permitted_actions
     def __init__( self, id=None, hid=None, name=None, info=None, blurb=None, peek=None, extension=None,
                   dbkey=None, metadata=None, history=None, dataset=None, deleted=False, designation=None,
-                  parent_id=None, validation_errors=None, visible=True, create_dataset = False ):
+                  parent_id=None, validation_errors=None, visible=True, create_dataset=False ):
         self.name = name or "Unnamed dataset"
         self.id = id
         self.info = info
@@ -519,6 +525,9 @@
     def get_mime( self ):
         """Returns the mime type of the data"""
         return datatypes_registry.get_mimetype_by_extension( self.extension.lower() )
+    def is_multi_byte( self ):
+        """Data consists of multi-byte characters"""
+        return self.dataset.is_multi_byte()
     def set_peek( self ):
         return self.datatype.set_peek( self )
     def set_multi_byte_peek( self ):
@@ -556,7 +565,7 @@
     def get_converter_types(self):
         return self.datatype.get_converter_types( self, datatypes_registry)
     def find_conversion_destination( self, accepted_formats, **kwd ):
-        """Returns ( target_ext, exisiting converted dataset )"""
+        """Returns ( target_ext, existing converted dataset )"""
         return self.datatype.find_conversion_destination( self, accepted_formats, datatypes_registry, **kwd )
     def add_validation_error( self, validation_error ):
         self.validation_errors.append( validation_error )
diff -r e945dcdfd578 -r 15756ebb2b11 lib/galaxy/tools/actions/upload_common.py
--- a/lib/galaxy/tools/actions/upload_common.py	Thu Oct 15 19:01:07 2009 -0400
+++ b/lib/galaxy/tools/actions/upload_common.py	Thu Oct 15 20:10:32 2009 -0400
@@ -27,7 +27,7 @@
         elif type( f ) == dict and 'filename' and 'local_filename' not in f:
             raise Exception( 'Uploaded file was encoded in a way not understood by Galaxy.' )
         if upload_dataset['url_paste'].strip() != '':
-            upload_dataset['url_paste'] = datatypes.sniff.stream_to_file( StringIO.StringIO( upload_dataset['url_paste'] ), prefix="strio_url_paste_" )[0]
+            upload_dataset['url_paste'], is_multi_byte = datatypes.sniff.stream_to_file( StringIO.StringIO( upload_dataset['url_paste'] ), prefix="strio_url_paste_" )
         else:
             upload_dataset['url_paste'] = None
         new_files.append( upload_dataset )
@@ -135,7 +135,6 @@
         folder.refresh()
         matches = filter( lambda x: x.name == name, active_folders( trans, folder ) )
         if matches:
-            log.debug( 'DEBUGDEBUG: In %s, found a folder name match: %s:%s' % ( folder.name, matches[0].id, matches[0].name ) )
             folder = matches[0]
         else:
             new_folder = trans.app.model.LibraryFolder( name=name, description='Automatically created by upload tool' )
@@ -143,7 +142,6 @@
             folder.add_folder( new_folder )
             new_folder.flush()
             trans.app.security_agent.copy_library_permissions( folder, new_folder )
-            log.debug( 'DEBUGDEBUG: In %s, created a new folder: %s:%s' % ( folder.name, new_folder.id, new_folder.name ) )
             folder = new_folder
     if library_bunch.replace_dataset:
         ld = library_bunch.replace_dataset
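The new Dataset.is_multi_byte() above reads its sample via codecs.open( ..., 'utf-8' ) rather than plain open(). A small sketch of the difference on Python 2 (the file name and bytes are hypothetical):

    import codecs

    # Plain open() yields raw bytes; codecs.open() with an encoding yields
    # unicode and raises UnicodeDecodeError early on invalid UTF-8 input,
    # which is exactly what the new is_multi_byte() catches.
    open( 'sample.txt', 'w' ).write( '\xe8\x9b\x8b' )                # raw UTF-8 bytes
    raw = open( 'sample.txt', 'r' ).read( 100 )                      # str (bytes)
    decoded = codecs.open( 'sample.txt', 'r', 'utf-8' ).read( 100 )  # unicode
    print type( raw ), type( decoded )  # <type 'str'> <type 'unicode'>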
diff -r e945dcdfd578 -r 15756ebb2b11 lib/galaxy/tools/parameters/grouping.py
--- a/lib/galaxy/tools/parameters/grouping.py	Thu Oct 15 19:01:07 2009 -0400
+++ b/lib/galaxy/tools/parameters/grouping.py	Thu Oct 15 20:10:32 2009 -0400
@@ -240,7 +240,7 @@
             url_paste = context['url_paste']
             name = context.get( 'NAME', None )
             info = context.get( 'INFO', None )
-            space_to_tab = False
+            space_to_tab = False
             if context.get( 'space_to_tab', None ) not in ["None", None]:
                 space_to_tab = True
             warnings = []
@@ -248,7 +248,6 @@
                 if file_bunch.path:
                     file_bunch.space_to_tab = space_to_tab
                     rval.append( file_bunch )
-            #rval.append( ( type, temp_name, precreated_name, space_to_tab, dataset_name, dataset_info ) )
             for file_bunch in get_url_paste_urls_or_filename( context, override_name = name, override_info = info ):
                 if file_bunch.path:
                     file_bunch.space_to_tab = space_to_tab
@@ -266,11 +265,6 @@
         if d_type.composite_type is not None:
             #handle uploading of composite datatypes
             #Only one Dataset can be created
-
-            '''
-            dataset = UploadedDataset()
-            dataset.datatype = d_type
-            '''
             dataset = Bunch()
             dataset.type = 'composite'
             dataset.file_type = file_type
@@ -279,14 +273,12 @@
             dataset.warnings = []
             dataset.metadata = {}
             dataset.composite_files = {}
-
             #load metadata
             files_metadata = context.get( self.metadata_ref, {} )
             for meta_name, meta_spec in d_type.metadata_spec.iteritems():
                 if meta_spec.set_in_upload:
                     if meta_name in files_metadata:
                         dataset.metadata[ meta_name ] = files_metadata[ meta_name ]
-
             dataset_name = None
             dataset_info = None
             if dataset.datatype.composite_type == 'auto_primary_file':
diff -r e945dcdfd578 -r 15756ebb2b11 test-data/asian_chars_1.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/asian_chars_1.txt	Thu Oct 15 20:10:32 2009 -0400
@@ -0,0 +1,1 @@
+蛋白質核酸酵素:制癌性物質の化学修飾による効果の向上.
\ No newline at end of file
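The space_to_tab handling kept in the grouping.py hunk above has a quirk worth noting: form values arrive as strings, so the literal string "None" must be treated like a missing value. A small illustration of that pattern (the context dict is hypothetical):

    # Hypothetical form context as grouping.py would receive it.
    context = { 'space_to_tab': 'None' }
    space_to_tab = False
    if context.get( 'space_to_tab', None ) not in [ "None", None ]:
        space_to_tab = True
    print space_to_tab  # False: the string "None" counts as unset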
diff -r e945dcdfd578 -r 15756ebb2b11 test/functional/test_get_data.py
--- a/test/functional/test_get_data.py	Thu Oct 15 19:01:07 2009 -0400
+++ b/test/functional/test_get_data.py	Thu Oct 15 20:10:32 2009 -0400
@@ -93,3 +93,17 @@
         self.verify_composite_datatype_file_content( 'rgenetics.bed', str( hda1.id ) )
         self.verify_composite_datatype_file_content( 'rgenetics.fam', str( hda1.id ) )
         self.delete_history( id=self.security.encode_id( history4.id ) )
+    def test_020_upload_multibyte_character_file( self ):
+        """Test uploading multi-byte character file"""
+        # Logged in as admin_user
+        self.check_history_for_string( 'Your history is empty' )
+        history5 = galaxy.model.History.filter( and_( galaxy.model.History.table.c.deleted==False,
+                                                      galaxy.model.History.table.c.user_id==admin_user.id ) ) \
+            .order_by( desc( galaxy.model.History.table.c.create_time ) ).first()
+        self.upload_file( 'asian_chars_1.txt' )
+        hda1 = galaxy.model.HistoryDatasetAssociation.query() \
+            .order_by( desc( galaxy.model.HistoryDatasetAssociation.table.c.create_time ) ).first()
+        assert hda1 is not None, "Problem retrieving hda1 from database"
+        self.verify_dataset_correctness( 'asian_chars_1.txt', hid=str( hda1.hid ) )
+        self.check_history_for_string( 'uploaded multi-byte char file' )
+        self.delete_history( id=self.security.encode_id( history5.id ) )
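The new functional test uploads test-data/asian_chars_1.txt and verifies the resulting dataset. A sketch of how such a UTF-8 fixture could be produced (the changeset adds the file directly; this generation script is illustrative only):

    # -*- coding: utf-8 -*-
    import codecs

    text = u'蛋白質核酸酵素:制癌性物質の化学修飾による効果の向上.'
    out = codecs.open( 'asian_chars_1.txt', 'w', 'utf-8' )
    out.write( text )  # no trailing newline, matching the diff's '\ No newline' marker
    out.close()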
diff -r e945dcdfd578 -r 15756ebb2b11 tools/data_source/upload.py
--- a/tools/data_source/upload.py	Thu Oct 15 19:01:07 2009 -0400
+++ b/tools/data_source/upload.py	Thu Oct 15 20:10:32 2009 -0400
@@ -4,7 +4,7 @@
 # WARNING: Changes in this tool (particularly as related to parsing) may need
 # to be reflected in galaxy.web.controllers.tool_runner and galaxy.tools
 
-import urllib, sys, os, gzip, tempfile, shutil, re, gzip, zipfile
+import urllib, sys, os, gzip, tempfile, shutil, re, gzip, zipfile, codecs
 from galaxy import eggs
 # need to import model before sniff to resolve a circular import dependency
 import galaxy.model
@@ -129,13 +129,11 @@
     if dataset.type == 'url':
         try:
-            temp_name, is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' )
+            temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' )
         except Exception, e:
             file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
             return
         dataset.path = temp_name
-        dataset.is_multi_byte = is_multi_byte
-
     # See if we have an empty file
     if not os.path.exists( dataset.path ):
         file_err( 'Uploaded temporary file (%s) does not exist.'
                   % dataset.path, dataset, json_file )
@@ -143,11 +141,15 @@
     if not os.path.getsize( dataset.path ) > 0:
         file_err( 'The uploaded file is empty', dataset, json_file )
         return
-    if 'is_multi_byte' not in dir( dataset ):
-        dataset.is_multi_byte = util.is_multi_byte( open( dataset.path, 'r' ).read( 1024 ) )
+    if not dataset.type == 'url':
+        # Already set is_multi_byte above if type == 'url'
+        try:
+            dataset.is_multi_byte = util.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) )
+        except UnicodeDecodeError, e:
+            dataset.is_multi_byte = False
     if dataset.is_multi_byte:
+        data_type = 'multi-byte char'
         ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
-        data_type = ext
     else:
         # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
         is_gzipped, is_valid = check_gzip( dataset.path )
@@ -283,24 +285,19 @@
         sys.exit( 1 )
     output_paths = parse_outputs( sys.argv[2:] )
-
     json_file = open( 'galaxy.json', 'w' )
-
     for line in open( sys.argv[1], 'r' ):
         dataset = from_json_string( line )
         dataset = util.bunch.Bunch( **safe_dict( dataset ) )
-
         try:
             output_path = output_paths[int( dataset.dataset_id )]
         except:
             print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id
             sys.exit( 1 )
-
        if dataset.type == 'composite':
             add_composite_file( dataset, json_file, output_path )
         else:
             add_file( dataset, json_file, output_path )
-
     # clean up paramfile
     try:
         os.remove( sys.argv[1] )
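Taken together, the upload.py changes make add_file() report a decodable non-ASCII file as data_type 'multi-byte char' before falling through to the gzip/binary checks. A condensed, self-contained sketch of that branch (classify is a hypothetical helper; file_err, sniff, and the compression handling are omitted):

    import codecs

    def classify( path ):
        # Mirror the detection added to add_file(): decode a 100-character
        # sample as UTF-8 and treat undecodable input as not multi-byte.
        try:
            sample = codecs.open( path, 'r', 'utf-8' ).read( 100 )
            is_multi_byte = any( ord( c ) > 127 for c in sample )
        except UnicodeDecodeError:
            is_multi_byte = False
        if is_multi_byte:
            return 'multi-byte char'  # reported data_type, per the diff
        return 'text'                 # simplified; the real code sniffs gzip, zip, binary, etc.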