5 new commits in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/changeset/1aee30671c45/ changeset: 1aee30671c45 user: dan date: 2012-08-29 01:23:39 summary: Handle non-ascii unicode in data source tools. Add util.is_binary() method that returns true when provided string contains a null byte. affected #: 3 files diff -r 738b703f725c1ade02018489e1c8e197236e30cd -r 1aee30671c45ecf045e33fe472525b814b97838d lib/galaxy/datatypes/sniff.py --- a/lib/galaxy/datatypes/sniff.py +++ b/lib/galaxy/datatypes/sniff.py @@ -6,6 +6,7 @@ from galaxy import util from galaxy.datatypes.checkers import * from galaxy.datatypes.binary import unsniffable_binary_formats +from encodings import search_function as encodings_search_function log = logging.getLogger(__name__) @@ -15,7 +16,7 @@ full_path = os.path.join(path, 'test', fname) return full_path -def stream_to_open_named_file( stream, fd, filename ): +def stream_to_open_named_file( stream, fd, filename, source_encoding=None, source_error='strict', target_encoding=None, target_error='strict' ): """Writes a stream to the provided file descriptor, returns the file's name and bool( is_multi_byte ). Closes file descriptor""" #signature and behavor is somewhat odd, due to backwards compatibility, but this can/should be done better CHUNK_SIZE = 1048576 @@ -23,6 +24,10 @@ is_compressed = False is_binary = False is_multi_byte = False + if not target_encoding or not encodings_search_function( target_encoding ): + target_encoding = util.DEFAULT_ENCODING #utf-8 + if not source_encoding: + source_encoding = util.DEFAULT_ENCODING #sys.getdefaultencoding() would mimic old behavior (defaults to ascii) while 1: chunk = stream.read( CHUNK_SIZE ) if not chunk: @@ -42,13 +47,12 @@ chars = chunk[:100] is_multi_byte = util.is_multi_byte( chars ) if not is_multi_byte: - for char in chars: - if ord( char ) > 128: - is_binary = True - break + is_binary = util.is_binary( chunk ) data_checked = True if not is_compressed and not is_binary: - os.write( fd, chunk.encode( "utf-8" ) ) + if not isinstance( chunk, unicode ): + chunk = chunk.decode( source_encoding, source_error ) + os.write( fd, chunk.encode( target_encoding, target_error ) ) else: # Compressed files must be encoded after they are uncompressed in the upload utility, # while binary files should not be encoded at all. @@ -56,10 +60,10 @@ os.close( fd ) return filename, is_multi_byte -def stream_to_file( stream, suffix='', prefix='', dir=None, text=False ): +def stream_to_file( stream, suffix='', prefix='', dir=None, text=False, **kwd ): """Writes a stream to a temporary file, returns the temporary file's name""" fd, temp_name = tempfile.mkstemp( suffix=suffix, prefix=prefix, dir=dir, text=text ) - return stream_to_open_named_file( stream, fd, temp_name ) + return stream_to_open_named_file( stream, fd, temp_name, **kwd ) def check_newlines( fname, bytes_to_read=52428800 ): """ @@ -305,14 +309,9 @@ else: for hdr in headers: for char in hdr: - if len( char ) > 1: - for c in char: - if ord( c ) > 128: - is_binary = True - break - elif ord( char ) > 128: - is_binary = True - break + #old behavior had 'char' possibly having length > 1, + #need to determine when/if this occurs + is_binary = util.is_binary( char ) if is_binary: break if is_binary: diff -r 738b703f725c1ade02018489e1c8e197236e30cd -r 1aee30671c45ecf045e33fe472525b814b97838d lib/galaxy/util/__init__.py --- a/lib/galaxy/util/__init__.py +++ b/lib/galaxy/util/__init__.py @@ -34,6 +34,9 @@ gzip_magic = '\037\213' bz2_magic = 'BZh' +DEFAULT_ENCODING = 'utf-8' +NULL_CHAR = '\000' +BINARY_CHARS = [ NULL_CHAR ] from inflection import Inflector, English inflector = Inflector(English) @@ -57,6 +60,32 @@ return True return False +def is_binary( value, binary_chars=None ): + """ + File is binary if it contains a null-byte by default (e.g. behavior of grep, etc.). + This may fail for utf-16 files, but so would ASCII encoding. + >>> is_binary( string.printable ) + False + >>> is_binary( '\\xce\\x94' ) + False + >>> is_binary( '\\000' ) + True + """ + if binary_chars is None: + binary_chars = BINARY_CHARS + for binary_char in binary_chars: + if binary_char in value: + return True + return False + +def get_charset_from_http_headers( headers, default=None ): + rval = headers.get('content-type', None ) + if rval and 'charset=' in rval: + rval = rval.split('charset=')[-1].split(';')[0].strip() + if rval: + return rval + return default + def synchronized(func): """This wrapper will serialize access to 'func' to a single thread. Use it as a decorator.""" def caller(*params, **kparams): @@ -333,6 +362,17 @@ else: return amount[0:sfs] + '0'*(len(amount) - sfs) +def unicodify( value, encoding=DEFAULT_ENCODING, error='replace', default=None ): + """ + Returns a unicode string or None + """ + if isinstance( value, unicode ): + return value + try: + return unicode( value, encoding, error ) + except: + return default + def object_to_string( obj ): return binascii.hexlify( pickle.dumps( obj, 2 ) ) @@ -502,7 +542,7 @@ def recursively_stringify_dictionary_keys( d ): if isinstance(d, dict): - return dict([(k.encode('utf-8'), recursively_stringify_dictionary_keys(v)) for k,v in d.iteritems()]) + return dict([(k.encode( DEFAULT_ENCODING ), recursively_stringify_dictionary_keys(v)) for k,v in d.iteritems()]) elif isinstance(d, list): return [recursively_stringify_dictionary_keys(x) for x in d] else: @@ -622,7 +662,7 @@ Sends an email. """ to = listify( to ) - msg = MIMEText( body ) + msg = MIMEText( body.encode( 'ascii', 'replace' ) ) msg[ 'To' ] = ', '.join( to ) msg[ 'From' ] = frm msg[ 'Subject' ] = subject diff -r 738b703f725c1ade02018489e1c8e197236e30cd -r 1aee30671c45ecf045e33fe472525b814b97838d tools/data_source/data_source.py --- a/tools/data_source/data_source.py +++ b/tools/data_source/data_source.py @@ -4,6 +4,7 @@ import socket, urllib, sys, os from galaxy import eggs #eggs needs to be imported so that galaxy.util can find docutils egg... from galaxy.util.json import from_json_string, to_json_string +from galaxy.util import get_charset_from_http_headers import galaxy.model # need to import model before sniff to resolve a circular import dependency from galaxy.datatypes import sniff from galaxy.datatypes.registry import Registry @@ -92,7 +93,7 @@ stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, max_file_size ) ) #do sniff stream for multi_byte try: - cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( cur_filename, os.O_WRONLY | os.O_CREAT ), cur_filename ) + cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( cur_filename, os.O_WRONLY | os.O_CREAT ), cur_filename, source_encoding=get_charset_from_http_headers( page.headers ) ) except Exception, e: stop_err( 'Unable to fetch %s:\n%s' % ( cur_URL, e ) ) https://bitbucket.org/galaxy/galaxy-central/changeset/577498958c37/ changeset: 577498958c37 user: dan date: 2012-08-29 01:23:39 summary: Handle non-ascii unicode in upload tool. Significant pre-existing refactoring still needed. affected #: 1 file diff -r 1aee30671c45ecf045e33fe472525b814b97838d -r 577498958c37cb3c0702b06a048bb51b2d82257a tools/data_source/upload.py --- a/tools/data_source/upload.py +++ b/tools/data_source/upload.py @@ -90,7 +90,8 @@ if dataset.type == 'url': try: - temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' ) + page = urllib.urlopen( dataset.path ) #page will be .close()ed by sniff methods + temp_name, dataset.is_multi_byte = sniff.stream_to_file( page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers( page.headers ) ) except Exception, e: file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file ) return https://bitbucket.org/galaxy/galaxy-central/changeset/fca4e12478d6/ changeset: fca4e12478d6 user: dan date: 2012-08-29 01:23:39 summary: Handle non-ascii unicode in dataset edit attributes. affected #: 1 file diff -r 577498958c37cb3c0702b06a048bb51b2d82257a -r fca4e12478d62162cb486422c79687587209c639 templates/dataset/edit_attributes.mako --- a/templates/dataset/edit_attributes.mako +++ b/templates/dataset/edit_attributes.mako @@ -58,7 +58,7 @@ Info: </label><div style="float: left; width: 250px; margin-right: 10px;"> - <textarea name="info" cols="40" rows="2">${data.info | h}</textarea> + <textarea name="info" cols="40" rows="2">${ util.unicodify( data.info ) | h}</textarea></div><div style="clear: both"></div></div> https://bitbucket.org/galaxy/galaxy-central/changeset/efefe08d6dd0/ changeset: efefe08d6dd0 user: dan date: 2012-08-29 01:23:39 summary: Handle non-ascii unicode in dataset error page. affected #: 1 file diff -r fca4e12478d62162cb486422c79687587209c639 -r efefe08d6dd0c70055a80953007e312e1d85d271 templates/dataset/errors.mako --- a/templates/dataset/errors.mako +++ b/templates/dataset/errors.mako @@ -24,21 +24,21 @@ <% job = hda.creating_job_associations[0].job %> %if job.traceback: The Galaxy framework encountered the following error while attempting to run the tool: - <pre>${job.traceback | h}</pre> + <pre>${ util.unicodify( job.traceback ) | h}</pre> %endif %if job.stderr or job.info: Tool execution generated the following error message: %if job.stderr: - <pre>${job.stderr | h}</pre> + <pre>${ util.unicodify( job.stderr ) | h}</pre> %elif job.info: - <pre>${job.info | h}</pre> + <pre>${ util.unicodify( job.info ) | h}</pre> %endif %else: Tool execution did not generate any error messages. %endif %if job.stdout: The tool produced the following additional output: - <pre>${job.stdout | h}</pre> + <pre>${ util.unicodify( job.stdout ) | h}</pre> %endif %else: The tool did not create any additional job / error info. https://bitbucket.org/galaxy/galaxy-central/changeset/80dd03582ea9/ changeset: 80dd03582ea9 user: dan date: 2012-08-29 01:23:40 summary: Handle non-ascii unicode in dataset error report emails. affected #: 1 file diff -r efefe08d6dd0c70055a80953007e312e1d85d271 -r 80dd03582ea9041d0c897b3a2c58e12a5bb494bb lib/galaxy/web/controllers/dataset.py --- a/lib/galaxy/web/controllers/dataset.py +++ b/lib/galaxy/web/controllers/dataset.py @@ -203,12 +203,12 @@ job_id=job.id, job_tool_id=job.tool_id, job_command_line=job.command_line, - job_stderr=job.stderr, - job_stdout=job.stdout, - job_info=job.info, - job_traceback=job.traceback, + job_stderr=util.unicodify( job.stderr ), + job_stdout=util.unicodify( job.stdout ), + job_info=util.unicodify( job.info ), + job_traceback=util.unicodify( job.traceback ), email=email, - message=message ) + message=util.unicodify( message ) ) frm = to_address # Check email a bit email = email.strip() Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.