[hg] galaxy 1742: i18n fixes: uploading binary files now works a...
details:   http://www.bx.psu.edu/hg/galaxy/rev/02843f56b812
changeset: 1742:02843f56b812
user:      Greg Von Kuster <greg@bx.psu.edu>
date:      Wed Mar 25 11:51:30 2009 -0400
description:
i18n fixes: uploading binary files now works again, multi-byte character files are no longer set as binary, file peek now supports multi-byte characters.

7 file(s) affected in this change:

eggs.ini
lib/galaxy/datatypes/data.py
lib/galaxy/datatypes/sniff.py
lib/galaxy/model/__init__.py
lib/galaxy/tools/actions/upload.py
lib/galaxy/util/__init__.py
lib/galaxy/web/framework/__init__.py

diffs (686 lines):

diff -r affd3085eee3 -r 02843f56b812 eggs.ini
--- a/eggs.ini	Fri Mar 20 02:49:30 2009 -0400
+++ b/eggs.ini	Wed Mar 25 11:51:30 2009 -0400
@@ -48,6 +48,7 @@
 WebOb = 0.8.5
 wsgiref = 0.1.2
 Babel = 0.9.4
+wchartype = 0.1
 
 ; extra version information
 [tags]
@@ -94,3 +95,4 @@
 WebOb = http://pypi.python.org/packages/source/W/WebOb/WebOb-0.8.5.tar.gz
 wsgiref = http://pypi.python.org/packages/source/w/wsgiref/wsgiref-0.1.2.zip
 Babel = http://ftp.edgewall.com/pub/babel/Babel-0.9.4.zip
+wchartype = http://ginstrom.com/code/wchartype-0.1.zip
diff -r affd3085eee3 -r 02843f56b812 lib/galaxy/datatypes/data.py
--- a/lib/galaxy/datatypes/data.py	Fri Mar 20 02:49:30 2009 -0400
+++ b/lib/galaxy/datatypes/data.py	Wed Mar 25 11:51:30 2009 -0400
@@ -109,7 +109,7 @@
         else:
             dataset.peek = 'file does not exist'
             dataset.blurb = 'file purged from disk'
-    def display_peek(self, dataset):
+    def display_peek(self, dataset ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
@@ -121,7 +121,7 @@
                 line = line.strip()
                 if not line:
                     continue
-                out.append( '<tr><td>%s</td></tr>' % escape( line ) )
+                out.append( '<tr><td>%s</td></tr>' % escape( unicode( line, 'utf-8' ) ) )
             out.append( '</table>' )
             out = "".join( out )
         except Exception, exc:
@@ -190,7 +190,6 @@
         except:
             log.exception('Function %s is referred to in datatype %s for displaying as type %s, but is not accessible' % (self.supported_display_apps[type]['file_function'], self.__class__.__name__, type) )
         return "This display type (%s) is not implemented for this datatype (%s)." % ( type, dataset.ext)
-
     def get_display_links(self, dataset, type, app, base_url, **kwd):
         """Returns a list of tuples of (name, link) for a particular display type """
         try:
@@ -199,21 +198,17 @@
         except:
             log.exception('Function %s is referred to in datatype %s for generating links for type %s, but is not accessible' % (self.supported_display_apps[type]['links_function'], self.__class__.__name__, type) )
         return []
-
     def get_converter_types(self, original_dataset, datatypes_registry):
         """Returns available converters by type for this dataset"""
         return datatypes_registry.get_converters_by_datatype(original_dataset.ext)
-
     def find_conversion_destination( self, dataset, accepted_formats, datatypes_registry, **kwd ):
         """Returns ( target_ext, exisiting converted dataset )"""
         return datatypes_registry.find_conversion_destination_for_dataset_by_extensions( dataset, accepted_formats, **kwd )
-
     def convert_dataset(self, trans, original_dataset, target_type, return_output = False, visible = True ):
         """This function adds a job to the queue to convert a dataset to another type. Returns a message about success/failure."""
         converter = trans.app.datatypes_registry.get_converter_by_target_type( original_dataset.ext, target_type )
         if converter is None:
             raise "A converter does not exist for %s to %s." % ( original_dataset.ext, target_type )
-
         #Generate parameter dictionary
         params = {}
         #determine input parameter name and add to params
@@ -223,31 +218,24 @@
                 input_name = key
                 break
         params[input_name] = original_dataset
-
         #Run converter, job is dispatched through Queue
         converted_dataset = converter.execute( trans, incoming = params, set_output_hid = visible )
-
         if len(params) > 0:
             trans.log_event( "Converter params: %s" % (str(params)), tool_id=converter.id )
-
         if not visible:
             for name, value in converted_dataset.iteritems():
                 value.visible = False
-
         if return_output:
             return converted_dataset
         return "The file conversion of %s on data %s has been added to the Queue." % (converter.name, original_dataset.hid)
-
     def before_edit( self, dataset ):
         """This function is called on the dataset before metadata is edited."""
         pass
-
     def after_edit( self, dataset ):
         """This function is called on the dataset after metadata is edited."""
         dataset.clear_associated_files( metadata_safe = True )
 
 class Text( Data ):
-
     def write_from_stream(self, dataset, stream):
         """Writes data from a stream"""
         # write it twice for now
@@ -265,30 +253,36 @@
             line = line.strip() + '\n'
             fp.write(line)
         fp.close()
-
     def set_raw_data(self, dataset, data):
         """Saves the data on the disc"""
         fd, temp_name = tempfile.mkstemp()
         os.write(fd, data)
         os.close(fd)
-
         # rewrite the file with unix newlines
         fp = open(dataset.file_name, 'wt')
         for line in file(temp_name, "U"):
             line = line.strip() + '\n'
             fp.write(line)
         fp.close()
-
         os.remove( temp_name )
-
     def get_mime(self):
         """Returns the mime type of the datatype"""
         return 'text/plain'
-
     def set_peek( self, dataset, line_count=None ):
         if not dataset.dataset.purged:
             # The file must exist on disk for the get_file_peek() method
             dataset.peek = get_file_peek( dataset.file_name )
+            if line_count is None:
+                dataset.blurb = "%s lines" % util.commaify( str( get_line_count( dataset.file_name ) ) )
+            else:
+                dataset.blurb = "%s lines" % util.commaify( str( line_count ) )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+    def set_multi_byte_peek( self, dataset, line_count=None ):
+        if not dataset.dataset.purged:
+            # The file must exist on disk for the get_file_peek() method
+            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=True )
             if line_count is None:
                 dataset.blurb = "%s lines" % util.commaify( str( get_line_count( dataset.file_name ) ) )
             else:
@@ -340,7 +334,7 @@
         return out
     return '??? bytes'
 
-def get_file_peek( file_name, WIDTH=256, LINE_COUNT=5 ):
+def get_file_peek( file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5 ):
     """
     Returns the first LINE_COUNT lines wrapped to WIDTH
 
@@ -350,12 +344,12 @@
     """
     lines = []
     count = 0
-    file_type = ''
+    file_type = None
     data_checked = False
     for line in file( file_name ):
-        line = line[ :WIDTH ]
-        if not data_checked and line:
-            data_checked = True
+        line = line[:WIDTH]
+        if line and not is_multi_byte and not data_checked:
+            # See if we have a compressed or binary file
             if line[0:2] == util.gzip_magic:
                 file_type = 'gzipped'
                 break
@@ -364,14 +358,17 @@
                 if ord( char ) > 128:
                     file_type = 'binary'
                     break
+            data_checked = True
+            if file_type in [ 'gzipped', 'binary' ]:
+                break
         lines.append( line )
         if count == LINE_COUNT:
             break
         count += 1
-    if file_type:
-        text = "%s file" %file_type
+    if file_type in [ 'gzipped', 'binary' ]:
+        text = "%s file" % file_type
     else:
-        text = '\n'.join( lines )
+        text = unicode( '\n'.join( lines ), 'utf-8' )
     return text
 
 def get_line_count(file_name):
diff -r affd3085eee3 -r 02843f56b812 lib/galaxy/datatypes/sniff.py
--- a/lib/galaxy/datatypes/sniff.py	Fri Mar 20 02:49:30 2009 -0400
+++ b/lib/galaxy/datatypes/sniff.py	Wed Mar 25 11:51:30 2009 -0400
@@ -1,8 +1,9 @@
 """
 File format detector
 """
-import logging, sys, os, csv, tempfile, shutil, re
+import logging, sys, os, csv, tempfile, shutil, re, zipfile
 import registry
+from galaxy import util
 
 log = logging.getLogger(__name__)
 
@@ -13,18 +14,43 @@
     return full_path
 
 def stream_to_file( stream, suffix='', prefix='', dir=None, text=False ):
-    """
-    Writes a stream to a temporary file, returns the temporary file's name
-    """
+    """Writes a stream to a temporary file, returns the temporary file's name"""
     fd, temp_name = tempfile.mkstemp( suffix=suffix, prefix=prefix, dir=dir, text=text )
+    CHUNK_SIZE = 1048576
+    data_checked = False
+    is_compressed = False
+    is_binary = False
+    is_multi_byte = False
     while 1:
-        chunk = stream.read(1048576)
+        chunk = stream.read( CHUNK_SIZE )
         if not chunk:
             break
-        # TODO: does this work on binary files?
-        os.write( fd, chunk.encode( "utf-8" ) )
-    os.close(fd)
-    return temp_name
+        if not data_checked:
+            # See if we're uploading a compressed file
+            if zipfile.is_zipfile( temp_name ):
+                is_compressed = True
+            else:
+                magic_check = chunk[:2]
+                if magic_check == util.gzip_magic:
+                    is_compressed = True
+            if not is_compressed:
+                # See if we have a multi-byte character file
+                chars = chunk[:100]
+                is_multi_byte = util.is_multi_byte( chars )
+                if not is_multi_byte:
+                    for char in chars:
+                        if ord( char ) > 128:
+                            is_binary = True
+                            break
+            data_checked = True
+        if not is_compressed and not is_binary:
+            os.write( fd, chunk.encode( "utf-8" ) )
+        else:
+            # Compressed files must be encoded after they are uncompressed in the upload utility,
+            # while binary files should not be encoded at all.
+            os.write( fd, chunk )
+    os.close( fd )
+    return temp_name, is_multi_byte
 
 def convert_newlines( fname ):
     """
@@ -94,7 +120,7 @@
     # Return number of lines in file.
     return i + 1
 
-def get_headers(fname, sep, count=60):
+def get_headers( fname, sep, count=60, is_multi_byte=False ):
     """
     Returns a list with the first 'count' lines split by 'sep'
 
@@ -105,12 +131,16 @@
     headers = []
     for idx, line in enumerate(file(fname)):
         line = line.rstrip('\n\r')
+        if is_multi_byte:
+            # TODO: fix this - sep is never found in line
+            line = unicode( line, 'utf-8' )
+            sep = sep.encode( 'utf-8' )
         headers.append( line.split(sep) )
         if idx == count:
             break
     return headers
 
-def is_column_based(fname, sep='\t', skip=0):
+def is_column_based( fname, sep='\t', skip=0, is_multi_byte=False ):
     """
     Checks whether the file is column based with respect to a separator
     (defaults to tab separator).
@@ -138,9 +168,8 @@
     >>> is_column_based(fname)
     True
     """
-    headers = get_headers(fname, sep)
+    headers = get_headers( fname, sep, is_multi_byte=is_multi_byte )
     count = 0
-
     if not headers:
         return False
     for hdr in headers[skip:]:
@@ -156,7 +185,7 @@
             return False
     return True
 
-def guess_ext( fname, sniff_order=None ):
+def guess_ext( fname, sniff_order=None, is_multi_byte=False ):
     """
     Returns an extension that can be used in the datatype factory to
     generate a data for the 'fname' file
@@ -220,20 +249,28 @@
             return datatype.file_ext
         except:
             pass
-
     headers = get_headers( fname, None )
-    is_binary = True
-    for hdr in headers:
-        for char in hdr:
-            try:
-                if not ord(char) > 128:
-                    is_binary = False
-            except:
-                is_binary = False
+    is_binary = False
+    if is_multi_byte:
+        is_binary = False
+    else:
+        for hdr in headers:
+            for char in hdr:
+                if len( char ) > 1:
+                    for c in char:
+                        if ord( c ) > 128:
+                            is_binary = True
+                            break
+                elif ord( char ) > 128:
+                    is_binary = True
+                    break
+                if is_binary:
+                    break
+            if is_binary:
+                break
     if is_binary:
         return 'data'        #default binary data type file extension
-    if is_column_based( fname, '\t', 1):
+    if is_column_based( fname, '\t', 1, is_multi_byte=is_multi_byte ):
         return 'tabular'    #default tabular data type file extension
     return 'txt'            #default text data type file extension
diff -r affd3085eee3 -r 02843f56b812 lib/galaxy/model/__init__.py
--- a/lib/galaxy/model/__init__.py	Fri Mar 20 02:49:30 2009 -0400
+++ b/lib/galaxy/model/__init__.py	Wed Mar 25 11:51:30 2009 -0400
@@ -245,6 +245,8 @@
         return datatypes_registry.get_mimetype_by_extension( self.extension.lower() )
     def set_peek( self ):
         return self.datatype.set_peek( self )
+    def set_multi_byte_peek( self ):
+        return self.datatype.set_multi_byte_peek( self )
     def init_meta( self, copy_from=None ):
         return self.datatype.init_meta( self, copy_from=copy_from )
     def set_meta( self, **kwd ):
diff -r affd3085eee3 -r 02843f56b812 lib/galaxy/tools/actions/upload.py
--- a/lib/galaxy/tools/actions/upload.py	Fri Mar 20 02:49:30 2009 -0400
+++ b/lib/galaxy/tools/actions/upload.py	Wed Mar 25 11:51:30 2009 -0400
@@ -23,6 +23,7 @@
         file_type = incoming['file_type']
         dbkey = incoming['dbkey']
         url_paste = incoming['url_paste']
+        is_multi_byte = False
         space_to_tab = False
         if 'space_to_tab' in incoming:
             if incoming['space_to_tab'] not in ["None", None]:
@@ -49,7 +50,7 @@
                     file_name = file_name.split( '\\' )[-1]
                     file_name = file_name.split( '/' )[-1]
                     try:
-                        data_list.append( self.add_file( trans, data_file.local_filename, file_name, file_type, dbkey, space_to_tab=space_to_tab ) )
+                        data_list.append( self.add_file( trans, data_file.local_filename, file_name, file_type, is_multi_byte, dbkey, space_to_tab=space_to_tab ) )
                     except Exception, e:
                         log.exception( 'exception in add_file using datafile.local_filename %s: %s' % ( data_file.local_filename, str( e ) ) )
                         self.remove_tempfile( data_file.local_filename )
@@ -59,13 +60,13 @@
                 file_name = file_name.split( '\\' )[-1]
                 file_name = file_name.split( '/' )[-1]
                 try:
-                    temp_name = sniff.stream_to_file( data_file.file, prefix='upload' )
+                    temp_name, is_multi_byte = sniff.stream_to_file( data_file.file, prefix='upload' )
                 except Exception, e:
                     log.exception( 'exception in sniff.stream_to_file using file %s: %s' % ( data_file.filename, str( e ) ) )
                     self.remove_tempfile( temp_name )
                     return self.upload_empty( trans, job, "Error:", str( e ) )
                 try:
-                    data_list.append( self.add_file( trans, temp_name, file_name, file_type, dbkey, space_to_tab=space_to_tab ) )
+                    data_list.append( self.add_file( trans, temp_name, file_name, file_type, is_multi_byte, dbkey, space_to_tab=space_to_tab ) )
                 except Exception, e:
                     log.exception( 'exception in add_file using file temp_name %s: %s' % ( str( temp_name ), str( e ) ) )
                     self.remove_tempfile( temp_name )
@@ -77,13 +78,13 @@
                     line = line.rstrip( '\r\n' )
                     if line:
                         try:
-                            temp_name = sniff.stream_to_file( urllib.urlopen( line ), prefix='url_paste' )
+                            temp_name, is_multi_byte = sniff.stream_to_file( urllib.urlopen( line ), prefix='url_paste' )
                         except Exception, e:
                             log.exception( 'exception in sniff.stream_to_file using url_paste %s: %s' % ( url_paste, str( e ) ) )
                             self.remove_tempfile( temp_name )
                             return self.upload_empty( trans, job, "Error:", str( e ) )
                         try:
-                            data_list.append( self.add_file( trans, temp_name, line, file_type, dbkey, info="uploaded url", space_to_tab=space_to_tab ) )
+                            data_list.append( self.add_file( trans, temp_name, line, file_type, is_multi_byte, dbkey, info="uploaded url", space_to_tab=space_to_tab ) )
                         except Exception, e:
                             log.exception( 'exception in add_file using url_paste temp_name %s: %s' % ( str( temp_name ), str( e ) ) )
                             self.remove_tempfile( temp_name )
@@ -97,13 +98,13 @@
                         break
                 if is_valid:
                     try:
-                        temp_name = sniff.stream_to_file( StringIO.StringIO( url_paste ), prefix='strio_url_paste' )
+                        temp_name, is_multi_byte = sniff.stream_to_file( StringIO.StringIO( url_paste ), prefix='strio_url_paste' )
                     except Exception, e:
                         log.exception( 'exception in sniff.stream_to_file using StringIO.StringIO( url_paste ) %s: %s' % ( url_paste, str( e ) ) )
                         self.remove_tempfile( temp_name )
                         return self.upload_empty( trans, job, "Error:", str( e ) )
                     try:
-                        data_list.append( self.add_file( trans, temp_name, 'Pasted Entry', file_type, dbkey, info="pasted entry", space_to_tab=space_to_tab ) )
+                        data_list.append( self.add_file( trans, temp_name, 'Pasted Entry', file_type, is_multi_byte, dbkey, info="pasted entry", space_to_tab=space_to_tab ) )
                     except Exception, e:
                         log.exception( 'exception in add_file using StringIO.StringIO( url_paste ) temp_name %s: %s' % ( str( temp_name ), str( e ) ) )
                         self.remove_tempfile( temp_name )
@@ -144,85 +145,87 @@
             trans.log_event( 'job id %d ended with errors, err_msg: %s' % ( job.id, err_msg ), tool_id=job.tool_id )
         return dict( output=data )
-    def add_file( self, trans, temp_name, file_name, file_type, dbkey, info=None, space_to_tab=False ):
+    def add_file( self, trans, temp_name, file_name, file_type, is_multi_byte, dbkey, info=None, space_to_tab=False ):
         data_type = None
+        ext = ''
         # See if we have an empty file
         if not os.path.getsize( temp_name ) > 0:
             raise BadFileException( "you attempted to upload an empty file." )
-        # See if we have a gzipped file, which, if it passes our restrictions,
-        # we'll decompress on the fly.
-        is_gzipped, is_valid = self.check_gzip( temp_name )
-        if is_gzipped and not is_valid:
-            raise BadFileException( "you attempted to upload an inappropriate file." )
-        elif is_gzipped and is_valid:
-            #We need to decompress the temp_name file
-            CHUNK_SIZE = 2**20 # 1Mb
-            fd, uncompressed = tempfile.mkstemp()
-            gzipped_file = gzip.GzipFile( temp_name )
-            while 1:
-                try:
-                    chunk = gzipped_file.read( CHUNK_SIZE )
-                except IOError:
+        if is_multi_byte:
+            ext = sniff.guess_ext( temp_name, is_multi_byte=True )
+        else:
+            if not data_type:
+                # See if we have a gzipped file, which, if it passes our restrictions,
+                # we'll decompress on the fly.
+                is_gzipped, is_valid = self.check_gzip( temp_name )
+                if is_gzipped and not is_valid:
+                    raise BadFileException( "you attempted to upload an inappropriate file." )
+                elif is_gzipped and is_valid:
+                    #We need to decompress the temp_name file
+                    CHUNK_SIZE = 2**20 # 1Mb
+                    fd, uncompressed = tempfile.mkstemp()
+                    gzipped_file = gzip.GzipFile( temp_name )
+                    while 1:
+                        try:
+                            chunk = gzipped_file.read( CHUNK_SIZE )
+                        except IOError:
+                            os.close( fd )
+                            os.remove( uncompressed )
+                            raise BadFileException( 'problem decompressing gzipped data.' )
+                        if not chunk:
+                            break
+                        os.write( fd, chunk.encode( "utf-8" ) )
                     os.close( fd )
-                    os.remove( uncompressed )
-                    raise BadFileException( 'problem decompressing gzipped data.' )
-                if not chunk:
-                    break
-                os.write( fd, chunk )
-            os.close( fd )
-            gzipped_file.close()
-            # Replace the gzipped file with the decompressed file
-            shutil.move( uncompressed, temp_name )
-            file_name = file_name.rstrip( '.gz' )
-            data_type = 'gzip'
-        ext = ''
-        if not data_type:
-            # See if we have a zip archive
-            is_zipped, is_valid, test_ext = self.check_zip( temp_name )
-            if is_zipped and not is_valid:
-                raise BadFileException( "you attempted to upload an inappropriate file." )
-            elif is_zipped and is_valid:
-                # Currently, we force specific tools to handle this case. We also require the user
-                # to manually set the incoming file_type
-                if ( test_ext == 'ab1' or test_ext == 'scf' ) and file_type != 'binseq.zip':
-                    raise BadFileException( "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'." )
-                elif test_ext == 'txt' and file_type != 'txtseq.zip':
-                    raise BadFileException( "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'." )
-                if not ( file_type == 'binseq.zip' or file_type == 'txtseq.zip' ):
-                    raise BadFileException( "you must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files." )
-                data_type = 'zip'
-                ext = file_type
-        if not data_type:
-            if self.check_binary( temp_name ):
-                parts = file_name.split( "." )
-                if len( parts ) > 1:
-                    ext = parts[1].strip().lower()
-                    if not( ext == 'ab1' or ext == 'scf' ):
-                        raise BadFileException( "you attempted to upload an inappropriate file." )
-                    if ext == 'ab1' and file_type != 'ab1':
-                        raise BadFileException( "you must manually set the 'File Format' to 'Ab1' when uploading ab1 files." )
-                    elif ext == 'scf' and file_type != 'scf':
-                        raise BadFileException( "you must manually set the 'File Format' to 'Scf' when uploading scf files." )
-                data_type = 'binary'
-        if not data_type:
-            # We must have a text file
-            if self.check_html( temp_name ):
-                raise BadFileException( "you attempted to upload an inappropriate file." )
-        if data_type != 'binary' and data_type != 'zip':
-            if space_to_tab:
-                self.line_count = sniff.convert_newlines_sep2tabs( temp_name )
-            else:
-                self.line_count = sniff.convert_newlines( temp_name )
-            if file_type == 'auto':
-                ext = sniff.guess_ext( temp_name, sniff_order=trans.app.datatypes_registry.sniff_order )
-            else:
-                ext = file_type
-            data_type = ext
-
+                    gzipped_file.close()
+                    # Replace the gzipped file with the decompressed file
+                    shutil.move( uncompressed, temp_name )
+                    file_name = file_name.rstrip( '.gz' )
+                    data_type = 'gzip'
+            if not data_type:
+                # See if we have a zip archive
+                is_zipped, is_valid, test_ext = self.check_zip( temp_name )
+                if is_zipped and not is_valid:
+                    raise BadFileException( "you attempted to upload an inappropriate file." )
+                elif is_zipped and is_valid:
+                    # Currently, we force specific tools to handle this case. We also require the user
+                    # to manually set the incoming file_type
+                    if ( test_ext == 'ab1' or test_ext == 'scf' ) and file_type != 'binseq.zip':
+                        raise BadFileException( "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'." )
+                    elif test_ext == 'txt' and file_type != 'txtseq.zip':
+                        raise BadFileException( "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'." )
+                    if not ( file_type == 'binseq.zip' or file_type == 'txtseq.zip' ):
+                        raise BadFileException( "you must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files." )
+                    data_type = 'zip'
+                    ext = file_type
+            if not data_type:
+                if self.check_binary( temp_name ):
+                    parts = file_name.split( "." )
+                    if len( parts ) > 1:
+                        ext = parts[1].strip().lower()
+                        if not( ext == 'ab1' or ext == 'scf' ):
+                            raise BadFileException( "you attempted to upload an inappropriate file." )
+                        if ext == 'ab1' and file_type != 'ab1':
+                            raise BadFileException( "you must manually set the 'File Format' to 'Ab1' when uploading ab1 files." )
+                        elif ext == 'scf' and file_type != 'scf':
+                            raise BadFileException( "you must manually set the 'File Format' to 'Scf' when uploading scf files." )
+                    data_type = 'binary'
+            if not data_type:
+                # We must have a text file
+                if self.check_html( temp_name ):
+                    raise BadFileException( "you attempted to upload an inappropriate file." )
+            if data_type != 'binary' and data_type != 'zip':
+                if space_to_tab:
+                    self.line_count = sniff.convert_newlines_sep2tabs( temp_name )
+                else:
+                    self.line_count = sniff.convert_newlines( temp_name )
+                if file_type == 'auto':
+                    ext = sniff.guess_ext( temp_name, sniff_order=trans.app.datatypes_registry.sniff_order )
+                else:
+                    ext = file_type
+                data_type = ext
         if info is None:
             info = 'uploaded %s file' %data_type
-
-        data = trans.app.model.HistoryDatasetAssociation( history = trans.history, extension = ext, create_dataset = True )
+        data = trans.app.model.HistoryDatasetAssociation( history=trans.history, extension=ext, create_dataset=True )
         data.name = file_name
         data.dbkey = dbkey
         data.info = info
@@ -233,19 +236,25 @@
         data.init_meta()
         if self.line_count is not None:
             try:
-                data.set_peek( line_count=self.line_count )
+                if is_multi_byte:
+                    data.set_multi_byte_peek( line_count=self.line_count )
+                else:
+                    data.set_peek( line_count=self.line_count )
             except:
+                if is_multi_byte:
+                    data.set_multi_byte_peek()
+                else:
+                    data.set_peek()
+        else:
+            if is_multi_byte:
+                data.set_multi_byte_peek()
+            else:
                 data.set_peek()
-        else:
-            data.set_peek()
-
-        # validate incomming data
-        """
-        Commented by greg on 3/14/07
-        for error in data.datatype.validate( data ):
-            data.add_validation_error( 
-                model.ValidationError( message=str( error ), err_type=error.__class__.__name__, attributes=util.object_to_string( error.__dict__ ) ) )
-        """
+        # validate incoming data
+        # Commented by greg on 3/14/07
+        # for error in data.datatype.validate( data ):
+        #     data.add_validation_error( 
+        #         model.ValidationError( message=str( error ), err_type=error.__class__.__name__, attributes=util.object_to_string( error.__dict__ ) ) )
         if data.missing_meta():
             data.datatype.set_meta( data )
         dbkey_to_store = dbkey
@@ -319,6 +328,8 @@
             lineno += 1
             line = line.strip()
             if line:
+                if util.is_multi_byte( line ):
+                    return False
                 for char in line:
                     if ord( char ) > 128:
                         if chunk is None:
diff -r affd3085eee3 -r 02843f56b812 lib/galaxy/util/__init__.py
--- a/lib/galaxy/util/__init__.py	Fri Mar 20 02:49:30 2009 -0400
+++ b/lib/galaxy/util/__init__.py	Wed Mar 25 11:51:30 2009 -0400
@@ -14,10 +14,33 @@
 pkg_resources.require( 'elementtree' )
 from elementtree import ElementTree
 
+pkg_resources.require( "wchartype" )
+import wchartype
+
 log = logging.getLogger(__name__)
 _lock = threading.RLock()
 
 gzip_magic = '\037\213'
+
+def is_multi_byte( chars ):
+    for char in chars:
+        try:
+            char = unicode( char )
+        except Exception, e:
+            # Probably binary
+            log.exception( e )
+            return False
+        if wchartype.is_asian( char ) or \
+            wchartype.is_full_width( char ) or \
+            wchartype.is_kanji( char ) or \
+            wchartype.is_hiragana( char ) or \
+            wchartype.is_katakana( char ) or \
+            wchartype.is_half_katakana( char ) or \
+            wchartype.is_hangul( char ) or \
+            wchartype.is_full_digit( char ) or \
+            wchartype.is_full_letter( char ):
+            return True
+    return False
 
 def synchronized(func):
     """This wrapper will serialize access to 'func' to a single thread. Use it as a decorator."""
diff -r affd3085eee3 -r 02843f56b812 lib/galaxy/web/framework/__init__.py
--- a/lib/galaxy/web/framework/__init__.py	Fri Mar 20 02:49:30 2009 -0400
+++ b/lib/galaxy/web/framework/__init__.py	Wed Mar 25 11:51:30 2009 -0400
@@ -466,8 +466,7 @@
             return self.fill_template_mako( filename, **kwargs )
         else:
             template = Template( file=os.path.join(self.app.config.template_path, filename),
-                                 searchList=[kwargs, self.template_context, dict(caller=self, t=self, h=webhelpers, util=util, request=self.request, response=self.response, app=self.app)],
-                                 output_encoding='utf-8' )
+                                 searchList=[kwargs, self.template_context, dict(caller=self, t=self, h=webhelpers, util=util, request=self.request, response=self.response, app=self.app)] )
             return str( template )
     def fill_template_mako( self, filename, **kwargs ):
         template = self.webapp.mako_template_lookup.get_template( filename )
@@ -481,8 +480,7 @@
         Fill in a template, putting any keyword arguments on the context.
         """
         template = Template( source=template_string,
-                             searchList=[context or kwargs, dict(caller=self)],
-                             output_encoding='utf-8' )
+                             searchList=[context or kwargs, dict(caller=self)] )
         return str(template)
 
 class FormBuilder( object ):