details:   http://www.bx.psu.edu/hg/galaxy/rev/15756ebb2b11
changeset: 2889:15756ebb2b11
user:      Greg Von Kuster <greg@bx.psu.edu>
date:      Thu Oct 15 20:10:32 2009 -0400
description:
Fixes and new functional tests for uploading multi-byte character files and fixes for displaying multi-byte character dataset peek.

8 file(s) affected in this change:

lib/galaxy/datatypes/data.py
lib/galaxy/jobs/__init__.py
lib/galaxy/model/__init__.py
lib/galaxy/tools/actions/upload_common.py
lib/galaxy/tools/parameters/grouping.py
test-data/asian_chars_1.txt
test/functional/test_get_data.py
tools/data_source/upload.py

diffs (269 lines):

diff -r e945dcdfd578 -r 15756ebb2b11 lib/galaxy/datatypes/data.py
--- a/lib/galaxy/datatypes/data.py	Thu Oct 15 19:01:07 2009 -0400
+++ b/lib/galaxy/datatypes/data.py	Thu Oct 15 20:10:32 2009 -0400
@@ -136,7 +136,7 @@
             line = line.strip()
             if not line:
                 continue
-            out.append( '<tr><td>%s</td></tr>' % escape( unicode( line ) ) )
+            out.append( '<tr><td>%s</td></tr>' % escape( unicode( line, 'utf-8' ) ) )
         out.append( '</table>' )
         out = "".join( out )
     except Exception, exc:
@@ -437,8 +437,8 @@
     temp.close()
     if file_type in [ 'gzipped', 'binary' ]:
         text = "%s file" % file_type
-    else:
-        text = unicode( '\n'.join( lines ), 'utf-8' )
+    else:
+        text = unicode( '\n'.join( lines ), 'utf-8' )
     return text
 
 def get_line_count(file_name):
diff -r e945dcdfd578 -r 15756ebb2b11 lib/galaxy/jobs/__init__.py
--- a/lib/galaxy/jobs/__init__.py	Thu Oct 15 19:01:07 2009 -0400
+++ b/lib/galaxy/jobs/__init__.py	Thu Oct 15 20:10:32 2009 -0400
@@ -535,12 +535,12 @@
             dataset.metadata.from_JSON_dict( self.external_output_metadata.get_output_filenames_by_dataset( dataset ).filename_out )
         try:
             assert context.get( 'line_count', None ) is not None
-            if self.tool.is_multi_byte:
+            if ( not dataset.datatype.composite_type and dataset.dataset.is_multi_byte() ) or self.tool.is_multi_byte:
                 dataset.set_multi_byte_peek( line_count=context['line_count'] )
             else:
                 dataset.set_peek( line_count=context['line_count'] )
         except:
-            if self.tool.is_multi_byte:
+            if ( not dataset.datatype.composite_type and dataset.dataset.is_multi_byte() ) or self.tool.is_multi_byte:
                 dataset.set_multi_byte_peek()
             else:
                 dataset.set_peek()
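The data.py hunk above replaces unicode( line ) with unicode( line, 'utf-8' ). A minimal sketch of why the explicit encoding matters on Python 2 (the sample bytes are illustrative, not from the changeset):

    # Python 2: unicode() without an encoding argument decodes as ASCII,
    # so any multi-byte UTF-8 input raises UnicodeDecodeError.
    line = '\xe8\x9b\x8b'  # UTF-8 bytes for the CJK character U+86CB
    try:
        unicode( line )  # implicit ASCII decode
    except UnicodeDecodeError:
        print 'implicit ASCII decode fails on multi-byte input'
    print unicode( line, 'utf-8' )  # explicit UTF-8 decode succeeds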
diff -r e945dcdfd578 -r 15756ebb2b11 lib/galaxy/model/__init__.py
--- a/lib/galaxy/model/__init__.py	Thu Oct 15 19:01:07 2009 -0400
+++ b/lib/galaxy/model/__init__.py	Thu Oct 15 20:10:32 2009 -0400
@@ -5,7 +5,7 @@
 the relationship cardinalities are obvious (e.g. prefer Dataset to Data)
 """
-import os.path, os, errno, sys
+import os.path, os, errno, sys, codecs
 import galaxy.datatypes
 from galaxy.util.bunch import Bunch
 from galaxy import util
@@ -418,7 +418,13 @@
         return self.get_size() > 0
     def mark_deleted( self, include_children=True ):
         self.deleted = True
-
+    def is_multi_byte( self ):
+        if not self.has_data():
+            return False
+        try:
+            return util.is_multi_byte( codecs.open( self.file_name, 'r', 'utf-8' ).read( 100 ) )
+        except UnicodeDecodeError, e:
+            return False
     # FIXME: sqlalchemy will replace this
     def _delete(self):
         """Remove the file that corresponds to this data"""
@@ -433,7 +439,7 @@
     permitted_actions = Dataset.permitted_actions
     def __init__( self, id=None, hid=None, name=None, info=None, blurb=None, peek=None, extension=None,
                   dbkey=None, metadata=None, history=None, dataset=None, deleted=False, designation=None,
-                  parent_id=None, validation_errors=None, visible=True, create_dataset = False ):
+                  parent_id=None, validation_errors=None, visible=True, create_dataset=False ):
         self.name = name or "Unnamed dataset"
         self.id = id
         self.info = info
@@ -519,6 +525,9 @@
     def get_mime( self ):
         """Returns the mime type of the data"""
         return datatypes_registry.get_mimetype_by_extension( self.extension.lower() )
+    def is_multi_byte( self ):
+        """Data consists of multi-byte characters"""
+        return self.dataset.is_multi_byte()
     def set_peek( self ):
         return self.datatype.set_peek( self )
     def set_multi_byte_peek( self ):
@@ -556,7 +565,7 @@
     def get_converter_types(self):
         return self.datatype.get_converter_types( self, datatypes_registry)
     def find_conversion_destination( self, accepted_formats, **kwd ):
-        """Returns ( target_ext, exisiting converted dataset )"""
+        """Returns ( target_ext, existing converted dataset )"""
         return self.datatype.find_conversion_destination( self, accepted_formats, datatypes_registry, **kwd )
     def add_validation_error( self, validation_error ):
         self.validation_errors.append( validation_error )
diff -r e945dcdfd578 -r 15756ebb2b11 lib/galaxy/tools/actions/upload_common.py
--- a/lib/galaxy/tools/actions/upload_common.py	Thu Oct 15 19:01:07 2009 -0400
+++ b/lib/galaxy/tools/actions/upload_common.py	Thu Oct 15 20:10:32 2009 -0400
@@ -27,7 +27,7 @@
         elif type( f ) == dict and 'filename' and 'local_filename' not in f:
             raise Exception( 'Uploaded file was encoded in a way not understood by Galaxy.' )
         if upload_dataset['url_paste'].strip() != '':
-            upload_dataset['url_paste'] = datatypes.sniff.stream_to_file( StringIO.StringIO( upload_dataset['url_paste'] ), prefix="strio_url_paste_" )[0]
+            upload_dataset['url_paste'], is_multi_byte = datatypes.sniff.stream_to_file( StringIO.StringIO( upload_dataset['url_paste'] ), prefix="strio_url_paste_" )
         else:
             upload_dataset['url_paste'] = None
         new_files.append( upload_dataset )
@@ -135,7 +135,6 @@
         folder.refresh()
         matches = filter( lambda x: x.name == name, active_folders( trans, folder ) )
         if matches:
-            log.debug( 'DEBUGDEBUG: In %s, found a folder name match: %s:%s' % ( folder.name, matches[0].id, matches[0].name ) )
             folder = matches[0]
         else:
             new_folder = trans.app.model.LibraryFolder( name=name, description='Automatically created by upload tool' )
@@ -143,7 +142,6 @@
             folder.add_folder( new_folder )
             new_folder.flush()
             trans.app.security_agent.copy_library_permissions( folder, new_folder )
-            log.debug( 'DEBUGDEBUG: In %s, created a new folder: %s:%s' % ( folder.name, new_folder.id, new_folder.name ) )
             folder = new_folder
     if library_bunch.replace_dataset:
         ld = library_bunch.replace_dataset
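The new Dataset.is_multi_byte() above reads its sample via codecs.open( ..., 'utf-8' ) rather than plain open(). A small sketch of the difference on Python 2 (the file name and bytes are hypothetical):

    import codecs

    # Plain open() yields raw bytes; codecs.open() with an encoding yields
    # unicode and raises UnicodeDecodeError early on invalid UTF-8 input,
    # which is exactly what the new is_multi_byte() catches.
    open( 'sample.txt', 'w' ).write( '\xe8\x9b\x8b' )                # raw UTF-8 bytes
    raw = open( 'sample.txt', 'r' ).read( 100 )                      # str (bytes)
    decoded = codecs.open( 'sample.txt', 'r', 'utf-8' ).read( 100 )  # unicode
    print type( raw ), type( decoded )  # <type 'str'> <type 'unicode'>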
diff -r e945dcdfd578 -r 15756ebb2b11 lib/galaxy/tools/parameters/grouping.py
--- a/lib/galaxy/tools/parameters/grouping.py	Thu Oct 15 19:01:07 2009 -0400
+++ b/lib/galaxy/tools/parameters/grouping.py	Thu Oct 15 20:10:32 2009 -0400
@@ -240,7 +240,7 @@
             url_paste = context['url_paste']
             name = context.get( 'NAME', None )
             info = context.get( 'INFO', None )
-            space_to_tab = False
+            space_to_tab = False
             if context.get( 'space_to_tab', None ) not in ["None", None]:
                 space_to_tab = True
             warnings = []
@@ -248,7 +248,6 @@
                 if file_bunch.path:
                     file_bunch.space_to_tab = space_to_tab
                     rval.append( file_bunch )
-            #rval.append( ( type, temp_name, precreated_name, space_to_tab, dataset_name, dataset_info ) )
             for file_bunch in get_url_paste_urls_or_filename( context, override_name = name, override_info = info ):
                 if file_bunch.path:
                     file_bunch.space_to_tab = space_to_tab
@@ -266,11 +265,6 @@
         if d_type.composite_type is not None:
             #handle uploading of composite datatypes
             #Only one Dataset can be created
-
-            '''
-            dataset = UploadedDataset()
-            dataset.datatype = d_type
-            '''
             dataset = Bunch()
             dataset.type = 'composite'
             dataset.file_type = file_type
@@ -279,14 +273,12 @@
             dataset.warnings = []
             dataset.metadata = {}
             dataset.composite_files = {}
-
             #load metadata
             files_metadata = context.get( self.metadata_ref, {} )
             for meta_name, meta_spec in d_type.metadata_spec.iteritems():
                 if meta_spec.set_in_upload:
                     if meta_name in files_metadata:
                         dataset.metadata[ meta_name ] = files_metadata[ meta_name ]
-
             dataset_name = None
             dataset_info = None
             if dataset.datatype.composite_type == 'auto_primary_file':
diff -r e945dcdfd578 -r 15756ebb2b11 test-data/asian_chars_1.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/asian_chars_1.txt	Thu Oct 15 20:10:32 2009 -0400
@@ -0,0 +1,1 @@
+蛋白質核酸酵素:制癌性物質の化学修飾による効果の向上.
\ No newline at end of file
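The space_to_tab handling kept in the grouping.py hunk above has a quirk worth noting: form values arrive as strings, so the literal string "None" must be treated like a missing value. A small illustration of that pattern (the context dict is hypothetical):

    # Hypothetical form context as grouping.py would receive it.
    context = { 'space_to_tab': 'None' }
    space_to_tab = False
    if context.get( 'space_to_tab', None ) not in [ "None", None ]:
        space_to_tab = True
    print space_to_tab  # False: the string "None" counts as unset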
diff -r e945dcdfd578 -r 15756ebb2b11 test/functional/test_get_data.py
--- a/test/functional/test_get_data.py	Thu Oct 15 19:01:07 2009 -0400
+++ b/test/functional/test_get_data.py	Thu Oct 15 20:10:32 2009 -0400
@@ -93,3 +93,17 @@
         self.verify_composite_datatype_file_content( 'rgenetics.bed', str( hda1.id ) )
         self.verify_composite_datatype_file_content( 'rgenetics.fam', str( hda1.id ) )
         self.delete_history( id=self.security.encode_id( history4.id ) )
+    def test_020_upload_multibyte_character_file( self ):
+        """Test uploading multi-byte character file"""
+        # Logged in as admin_user
+        self.check_history_for_string( 'Your history is empty' )
+        history5 = galaxy.model.History.filter( and_( galaxy.model.History.table.c.deleted==False,
+                                                      galaxy.model.History.table.c.user_id==admin_user.id ) ) \
+            .order_by( desc( galaxy.model.History.table.c.create_time ) ).first()
+        self.upload_file( 'asian_chars_1.txt' )
+        hda1 = galaxy.model.HistoryDatasetAssociation.query() \
+            .order_by( desc( galaxy.model.HistoryDatasetAssociation.table.c.create_time ) ).first()
+        assert hda1 is not None, "Problem retrieving hda1 from database"
+        self.verify_dataset_correctness( 'asian_chars_1.txt', hid=str( hda1.hid ) )
+        self.check_history_for_string( 'uploaded multi-byte char file' )
+        self.delete_history( id=self.security.encode_id( history5.id ) )
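The new functional test uploads test-data/asian_chars_1.txt and verifies the resulting dataset. A sketch of how such a UTF-8 fixture could be produced (the changeset adds the file directly; this generation script is illustrative only):

    # -*- coding: utf-8 -*-
    import codecs

    text = u'蛋白質核酸酵素:制癌性物質の化学修飾による効果の向上.'
    out = codecs.open( 'asian_chars_1.txt', 'w', 'utf-8' )
    out.write( text )  # no trailing newline, matching the diff's '\ No newline' marker
    out.close()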
diff -r e945dcdfd578 -r 15756ebb2b11 tools/data_source/upload.py
--- a/tools/data_source/upload.py	Thu Oct 15 19:01:07 2009 -0400
+++ b/tools/data_source/upload.py	Thu Oct 15 20:10:32 2009 -0400
@@ -4,7 +4,7 @@
 # WARNING: Changes in this tool (particularly as related to parsing) may need
 # to be reflected in galaxy.web.controllers.tool_runner and galaxy.tools
 
-import urllib, sys, os, gzip, tempfile, shutil, re, gzip, zipfile
+import urllib, sys, os, gzip, tempfile, shutil, re, gzip, zipfile, codecs
 from galaxy import eggs
 # need to import model before sniff to resolve a circular import dependency
 import galaxy.model
@@ -129,13 +129,11 @@
     if dataset.type == 'url':
         try:
-            temp_name, is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' )
+            temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' )
         except Exception, e:
             file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
             return
         dataset.path = temp_name
-        dataset.is_multi_byte = is_multi_byte
-
     # See if we have an empty file
     if not os.path.exists( dataset.path ):
         file_err( 'Uploaded temporary file (%s) does not exist.'
                   % dataset.path, dataset, json_file )
@@ -143,11 +141,15 @@
     if not os.path.getsize( dataset.path ) > 0:
         file_err( 'The uploaded file is empty', dataset, json_file )
         return
-    if 'is_multi_byte' not in dir( dataset ):
-        dataset.is_multi_byte = util.is_multi_byte( open( dataset.path, 'r' ).read( 1024 ) )
+    if not dataset.type == 'url':
+        # Already set is_multi_byte above if type == 'url'
+        try:
+            dataset.is_multi_byte = util.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) )
+        except UnicodeDecodeError, e:
+            dataset.is_multi_byte = False
     if dataset.is_multi_byte:
+        data_type = 'multi-byte char'
         ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
-        data_type = ext
     else:
         # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
         is_gzipped, is_valid = check_gzip( dataset.path )
@@ -283,24 +285,19 @@
         sys.exit( 1 )
     output_paths = parse_outputs( sys.argv[2:] )
-
     json_file = open( 'galaxy.json', 'w' )
-
     for line in open( sys.argv[1], 'r' ):
         dataset = from_json_string( line )
         dataset = util.bunch.Bunch( **safe_dict( dataset ) )
-
         try:
             output_path = output_paths[int( dataset.dataset_id )]
         except:
             print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id
             sys.exit( 1 )
-
        if dataset.type == 'composite':
             add_composite_file( dataset, json_file, output_path )
         else:
             add_file( dataset, json_file, output_path )
-
     # clean up paramfile
     try:
         os.remove( sys.argv[1] )
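Taken together, the upload.py changes make add_file() report a decodable non-ASCII file as data_type 'multi-byte char' before falling through to the gzip/binary checks. A condensed, self-contained sketch of that branch (classify is a hypothetical helper; file_err, sniff, and the compression handling are omitted):

    import codecs

    def classify( path ):
        # Mirror the detection added to add_file(): decode a 100-character
        # sample as UTF-8 and treat undecodable input as not multi-byte.
        try:
            sample = codecs.open( path, 'r', 'utf-8' ).read( 100 )
            is_multi_byte = any( ord( c ) > 127 for c in sample )
        except UnicodeDecodeError:
            is_multi_byte = False
        if is_multi_byte:
            return 'multi-byte char'  # reported data_type, per the diff
        return 'text'                 # simplified; the real code sniffs gzip, zip, binary, etc.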