[hg] galaxy 2589: Merge with f6e0863862efe02bb416596b08010b513f9ffdf7

24 Aug 2009

details:   http://www.bx.psu.edu/hg/galaxy/rev/a7f9325bb319
changeset: 2589:a7f9325bb319
user:      rc
date:      Thu Aug 20 11:43:28 2009 -0400
description:
Merge with f6e0863862efe02bb416596b08010b513f9ffdf7

2 file(s) affected in this change:

templates/base_panels.mako
test/base/twilltestcase.py

diffs (1505 lines):

diff -r 070cf5f6f928 -r a7f9325bb319 lib/galaxy/jobs/__init__.py

--- a/lib/galaxy/jobs/__init__.py	Thu Aug 20 11:39:32 2009 -0400
+++ b/lib/galaxy/jobs/__init__.py	Thu Aug 20 11:43:28 2009 -0400
@@ -6,6 +6,8 @@
 from galaxy.datatypes.tabular import *
 from galaxy.datatypes.interval import *
 from galaxy.datatypes import metadata
+from galaxy.util.json import from_json_string
+from galaxy.util.expressions import ExpressionContext
 
 import pkg_resources
 pkg_resources.require( "PasteDeploy" )
@@ -18,6 +20,12 @@
 
 # States for running a job. These are NOT the same as data states
 JOB_WAIT, JOB_ERROR, JOB_INPUT_ERROR, JOB_INPUT_DELETED, JOB_OK, JOB_READY, JOB_DELETED, JOB_ADMIN_DELETED = 'wait', 'error', 'input_error', 'input_deleted', 'ok', 'ready', 'deleted', 'admin_deleted'
+
+# This file, if created in the job's working directory, will be used for
+# setting advanced metadata properties on the job and its associated outputs.
+# This interface is currently experimental, is only used by the upload tool,
+# and should eventually be exposed through a proper API.
+TOOL_PROVIDED_JOB_METADATA_FILE = 'galaxy.json'
 
 class JobManager( object ):
     """
@@ -320,6 +328,7 @@
         self.working_directory = \
             os.path.join( self.app.config.job_working_directory, str( self.job_id ) )
         self.output_paths = None
+        self.tool_provided_job_metadata = None
         self.external_output_metadata = metadata.JobExternalOutputMetadataWrapper( job ) #wrapper holding the info required to restore and clean up from files used for setting metadata externally
         
     def get_param_dict( self ):
@@ -422,6 +431,8 @@
                 dataset.blurb = 'tool error'
                 dataset.info = message
                 dataset.set_size()
+                if dataset.ext == 'auto':
+                    dataset.extension = 'data'
                 dataset.flush()
             job.state = model.Job.states.ERROR
             job.command_line = self.command_line
@@ -486,16 +497,28 @@
                 except ( IOError, OSError ):
                     self.fail( "Job %s's output dataset(s) could not be read" % job.id )
                     return
+        job_context = ExpressionContext( dict( stdout = stdout, stderr = stderr ) )
         for dataset_assoc in job.output_datasets:
+            context = self.get_dataset_finish_context( job_context, dataset_assoc.dataset.dataset )
             #should this also be checking library associations? - can a library item be added from a history before the job has ended? - lets not allow this to occur
             for dataset in dataset_assoc.dataset.dataset.history_associations: #need to update all associated output hdas, i.e. history was shared with job running
+                if context.get( 'path', None ):
+                    # The tool can set an alternate output path for the dataset.
+                    try:
+                        shutil.move( context['path'], dataset.file_name )
+                    except ( IOError, OSError ):
+                        if not context['stderr']:
+                            context['stderr'] = 'This dataset could not be processed'
                 dataset.blurb = 'done'
                 dataset.peek  = 'no peek'
-                dataset.info  = stdout + stderr
+                dataset.info  = context['stdout'] + context['stderr']
                 dataset.set_size()
-                if stderr:
+                if context['stderr']:
                     dataset.blurb = "error"
                 elif dataset.has_data():
+                    # If the tool was expected to set the extension, attempt to retrieve it
+                    if dataset.ext == 'auto':
+                        dataset.extension = context.get( 'ext', 'data' )
                     #if a dataset was copied, it won't appear in our dictionary:
                     #either use the metadata from originating output dataset, or call set_meta on the copies
                     #it would be quicker to just copy the metadata from the originating output dataset, 
@@ -510,18 +533,39 @@
                         #the metadata that was stored to disk for use via the external process, 
                         #and the changes made by the user will be lost, without warning or notice
                         dataset.metadata.from_JSON_dict( self.external_output_metadata.get_output_filenames_by_dataset( dataset ).filename_out )
-                    if self.tool.is_multi_byte:
-                        dataset.set_multi_byte_peek()
-                    else:
-                        dataset.set_peek()
+                    try:
+                        assert context.get( 'line_count', None ) is not None
+                        if self.tool.is_multi_byte:
+                            dataset.set_multi_byte_peek( line_count=context['line_count'] )
+                        else:
+                            dataset.set_peek( line_count=context['line_count'] )
+                    except:
+                        if self.tool.is_multi_byte:
+                            dataset.set_multi_byte_peek()
+                        else:
+                            dataset.set_peek()
+                    try:
+                        # set the name if provided by the tool
+                        dataset.name = context['name']
+                    except:
+                        pass
                 else:
                     dataset.blurb = "empty"
+                    if dataset.ext == 'auto':
+                        dataset.extension = 'txt'
                 dataset.flush()
-            if stderr:
+            if context['stderr']:
                 dataset_assoc.dataset.dataset.state = model.Dataset.states.ERROR
             else:
                 dataset_assoc.dataset.dataset.state = model.Dataset.states.OK
-            dataset_assoc.dataset.dataset.flush()
+            # If any of the rest of the finish method below raises an
+            # exception, the fail method will run and set the datasets to
+            # ERROR.  The user will never see that the datasets are in error if
+            # they were flushed as OK here, since upon doing so, the history
+            # panel stops checking for updates.  So let the
+            # mapping.context.current.flush() at the bottom of this method set
+            # the state instead.
+            #dataset_assoc.dataset.dataset.flush()
         
         # Save stdout and stderr    
         if len( stdout ) > 32768:
@@ -591,7 +635,8 @@
             return self.output_paths
 
         class DatasetPath( object ):
-            def __init__( self, real_path, false_path = None ):
+            def __init__( self, dataset_id, real_path, false_path = None ):
+                self.dataset_id = dataset_id
                 self.real_path = real_path
                 self.false_path = false_path
             def __str__( self ):
@@ -605,10 +650,55 @@
             self.output_paths = []
             for name, data in [ ( da.name, da.dataset.dataset ) for da in job.output_datasets ]:
                 false_path = os.path.abspath( os.path.join( self.working_directory, "galaxy_dataset_%d.dat" % data.id ) )
-                self.output_paths.append( DatasetPath( data.file_name, false_path ) )
+                self.output_paths.append( DatasetPath( data.id, data.file_name, false_path ) )
         else:
-            self.output_paths = [ DatasetPath( da.dataset.file_name ) for da in job.output_datasets ]
+            self.output_paths = [ DatasetPath( da.dataset.dataset.id, da.dataset.file_name ) for da in job.output_datasets ]
         return self.output_paths
+
+    def get_output_file_id( self, file ):
+        if self.output_paths is None:
+            self.get_output_fnames()
+        for dp in self.output_paths:
+            if self.app.config.outputs_to_working_directory and os.path.basename( dp.false_path ) == file:
+                return dp.dataset_id
+            elif os.path.basename( dp.real_path ) == file:
+                return dp.dataset_id
+        return None
+
+    def get_tool_provided_job_metadata( self ):
+        if self.tool_provided_job_metadata is not None:
+            return self.tool_provided_job_metadata
+
+        # Look for JSONified job metadata
+        self.tool_provided_job_metadata = []
+        meta_file = os.path.join( self.working_directory, TOOL_PROVIDED_JOB_METADATA_FILE )
+        if os.path.exists( meta_file ):
+            for line in open( meta_file, 'r' ):
+                try:
+                    line = from_json_string( line )
+                    assert 'type' in line
+                except:
+                    log.exception( '(%s) Got JSON data from tool, but data is improperly formatted or no "type" key in data' % self.job_id )
+                    log.debug( 'Offending data was: %s' % line )
+                    continue
+                # Set the dataset id if it's a dataset entry and isn't set.
+                # This isn't insecure.  We loop over the job's output datasets in
+                # the finish method, so if a tool writes out metadata for a
+                # dataset id that it doesn't own, it'll just be ignored.
+                if line['type'] == 'dataset' and 'dataset_id' not in line:
+                    try:
+                        line['dataset_id'] = self.get_output_file_id( line['dataset'] )
+                    except KeyError:
+                        log.warning( '(%s) Tool provided job dataset-specific metadata without specifying a dataset' % self.job_id )
+                        continue
+                self.tool_provided_job_metadata.append( line )
+        return self.tool_provided_job_metadata
+
+    def get_dataset_finish_context( self, job_context, dataset ):
+        for meta in self.get_tool_provided_job_metadata():
+            if meta['type'] == 'dataset' and meta['dataset_id'] == dataset.id:
+                return ExpressionContext( meta, job_context )
+        return job_context
 
     def check_output_sizes( self ):
         sizes = []
diff -r 070cf5f6f928 -r a7f9325bb319 lib/galaxy/tools/__init__.py
--- a/lib/galaxy/tools/__init__.py	Thu Aug 20 11:39:32 2009 -0400
+++ b/lib/galaxy/tools/__init__.py	Thu Aug 20 11:43:28 2009 -0400
@@ -5,7 +5,7 @@
 
 pkg_resources.require( "simplejson" )
 
-import logging, os, string, sys, tempfile, glob, shutil
+import logging, os, string, sys, tempfile, glob, shutil, types
 import simplejson
 import binascii
 from UserDict import DictMixin
@@ -415,6 +415,7 @@
                 output.metadata_source = data_elem.get("metadata_source", "")
                 output.parent = data_elem.get("parent", None)
                 output.label = util.xml_text( data_elem, "label" )
+                output.count = int( data_elem.get("count", 1) )
                 output.filters = data_elem.findall( 'filter' )
                 self.outputs[ output.name ] = output
         # Any extra generated config files for the tool
@@ -816,7 +817,11 @@
             # If we've completed the last page we can execute the tool
             elif state.page == self.last_page:
                 out_data = self.execute( trans, incoming=params )
-                return 'tool_executed.mako', dict( out_data=out_data )
+                try:
+                    assert type( out_data ) is types.DictType
+                    return 'tool_executed.mako', dict( out_data=out_data )
+                except:
+                    return 'message.mako', dict( message_type='error', message=out_data, refresh_frames=[] )
             # Otherwise move on to the next page
             else:
                 state.page += 1
@@ -824,15 +829,26 @@
                 self.fill_in_new_state( trans, self.inputs_by_page[ state.page ], state.inputs )
                 return 'tool_form.mako', dict( errors=errors, tool_state=state )
         else:
-            if filter( lambda x: isinstance( x, FieldStorage ) and x.file, state.inputs.values() ):
+            try:
+                self.find_fieldstorage( state.inputs )
+            except InterruptedUpload:
                 # If inputs contain a file it won't persist.  Most likely this
                 # is an interrupted upload.  We should probably find a more
                 # standard method of determining an incomplete POST.
                 return self.handle_interrupted( trans, state.inputs )
-            else:
-                # Just a refresh, render the form with updated state and errors.
-                return 'tool_form.mako', dict( errors=errors, tool_state=state )
+            except:
+                pass
+            # Just a refresh, render the form with updated state and errors.
+            return 'tool_form.mako', dict( errors=errors, tool_state=state )
       
+    def find_fieldstorage( self, x ):
+        if isinstance( x, FieldStorage ):
+            raise InterruptedUpload( None )
+        elif type( x ) is types.DictType:
+            [ self.find_fieldstorage( y ) for y in x.values() ]
+        elif type( x ) is types.ListType:
+            [ self.find_fieldstorage( y ) for y in x ]
+
     def handle_interrupted( self, trans, inputs ):
         """
         Upon handling inputs, if it appears that we have received an incomplete
@@ -1704,3 +1720,6 @@
         return value
     else:
         return incoming.get( key, default )
+
+class InterruptedUpload( Exception ):
+    pass
diff -r 070cf5f6f928 -r a7f9325bb319 lib/galaxy/tools/actions/upload.py
--- a/lib/galaxy/tools/actions/upload.py	Thu Aug 20 11:39:32 2009 -0400
+++ b/lib/galaxy/tools/actions/upload.py	Thu Aug 20 11:43:28 2009 -0400
@@ -1,8 +1,10 @@
 import os, shutil, urllib, StringIO, re, gzip, tempfile, shutil, zipfile
+from cgi import FieldStorage
 from __init__ import ToolAction
 from galaxy import datatypes, jobs
 from galaxy.datatypes import sniff
 from galaxy import model, util
+from galaxy.util.json import to_json_string
 
 import sys, traceback
 
@@ -11,14 +13,28 @@
 
 class UploadToolAction( ToolAction ):
     # Action for uploading files
-    def __init__( self ):
-        self.empty = False
-        self.line_count = None
-    def remove_tempfile( self, filename ):
-        try:
-            os.unlink( filename )
-        except:
-            log.exception( 'failure removing temporary file: %s' % filename )
+    def persist_uploads( self, incoming ):
+        if 'files' in incoming:
+            new_files = []
+            temp_files = []
+            for upload_dataset in incoming['files']:
+                f = upload_dataset['file_data']
+                if isinstance( f, FieldStorage ): 
+                    # very small files can be StringIOs
+                    if 'name' in dir( f.file ) and f.file.name != '<fdopen>':
+                        local_filename = util.mkstemp_ln( f.file.name, 'upload_file_data_' )
+                        f.file.close()
+                    else:
+                        local_filename = datatypes.sniff.stream_to_file( f.file, prefix="strio_upload_file_" )[0]
+                    upload_dataset['file_data'] = dict( filename = f.filename,
+                                                        local_filename = local_filename )
+                if upload_dataset['url_paste'].strip() != '':
+                    upload_dataset['url_paste'] = datatypes.sniff.stream_to_file( StringIO.StringIO( upload_dataset['url_paste'] ), prefix="strio_url_paste_" )[0]
+                else:
+                    upload_dataset['url_paste'] = None
+                new_files.append( upload_dataset )
+            incoming['files'] = new_files
+        return incoming
     def execute( self, tool, trans, incoming={}, set_output_hid = True ):
         dataset_upload_inputs = []
         for input_name, input in tool.inputs.iteritems():
@@ -42,330 +58,100 @@
                log.error( 'Got a precreated dataset (%s) but it does not belong to current user (%s)' % ( data.id, trans.user.id ) )
             else:
                 self.precreated_datasets.append( data )
+
         data_list = []
+
+        incoming = self.persist_uploads( incoming )
+
+        json_file = tempfile.mkstemp()
+        json_file_path = json_file[1]
+        json_file = os.fdopen( json_file[0], 'w' )
         for dataset_upload_input in dataset_upload_inputs:
             uploaded_datasets = dataset_upload_input.get_uploaded_datasets( trans, incoming )
             for uploaded_dataset in uploaded_datasets:
-                precreated_dataset = self.get_precreated_dataset( uploaded_dataset.precreated_name )
-                dataset = self.add_file( trans, uploaded_dataset.primary_file, uploaded_dataset.name, uploaded_dataset.file_type, uploaded_dataset.is_multi_byte, uploaded_dataset.dbkey, space_to_tab = uploaded_dataset.space_to_tab, info = uploaded_dataset.info, precreated_dataset = precreated_dataset, metadata = uploaded_dataset.metadata, uploaded_dataset = uploaded_dataset )
-                #dataset state is now set, we should not do anything else to this dataset
-                data_list.append( dataset )
-                #clean up extra temp names
-                uploaded_dataset.clean_up_temp_files()
-        
+                data = self.get_precreated_dataset( uploaded_dataset.name )
+                if not data:
+                    data = trans.app.model.HistoryDatasetAssociation( history = trans.history, create_dataset = True )
+                    data.name = uploaded_dataset.name
+                    data.state = data.states.QUEUED
+                    data.extension = uploaded_dataset.file_type
+                    data.dbkey = uploaded_dataset.dbkey
+                    data.flush()
+                    trans.history.add_dataset( data, genome_build = uploaded_dataset.dbkey )
+                    permissions = trans.app.security_agent.history_get_default_permissions( trans.history )
+                    trans.app.security_agent.set_all_dataset_permissions( data.dataset, permissions )
+                else:
+                    data.extension = uploaded_dataset.file_type
+                    data.dbkey = uploaded_dataset.dbkey
+                    data.flush()
+                    trans.history.genome_build = uploaded_dataset.dbkey
+                if uploaded_dataset.type == 'composite':
+                    # we need to init metadata before the job is dispatched
+                    data.init_meta()
+                    for meta_name, meta_value in uploaded_dataset.metadata.iteritems():
+                        setattr( data.metadata, meta_name, meta_value )
+                    data.flush()
+                    json = dict( file_type = uploaded_dataset.file_type,
+                                 dataset_id = data.dataset.id,
+                                 dbkey = uploaded_dataset.dbkey,
+                                 type = uploaded_dataset.type,
+                                 metadata = uploaded_dataset.metadata,
+                                 primary_file = uploaded_dataset.primary_file,
+                                 extra_files_path = data.extra_files_path,
+                                 composite_file_paths = uploaded_dataset.composite_files,
+                                 composite_files = dict( [ ( k, v.__dict__ ) for k, v in data.datatype.get_composite_files( data ).items() ] ) )
+                else:
+                    try:
+                        is_binary = uploaded_dataset.datatype.is_binary
+                    except:
+                        is_binary = None
+                    json = dict( file_type = uploaded_dataset.file_type,
+                                 name = uploaded_dataset.name,
+                                 dataset_id = data.dataset.id,
+                                 dbkey = uploaded_dataset.dbkey,
+                                 type = uploaded_dataset.type,
+                                 is_binary = is_binary,
+                                 space_to_tab = uploaded_dataset.space_to_tab,
+                                 path = uploaded_dataset.path )
+                json_file.write( to_json_string( json ) + '\n' )
+                data_list.append( data )
+        json_file.close()
+
         #cleanup unclaimed precreated datasets:
         for data in self.precreated_datasets:
             log.info( 'Cleaned up unclaimed precreated dataset (%s).' % ( data.id ) )
             data.state = data.states.ERROR
             data.info = 'No file contents were available.'
         
-        if data_list:
-            trans.app.model.flush()
+        if not data_list:
+            try:
+                os.remove( json_file_path )
+            except:
+                pass
+            return 'No data was entered in the upload form, please go back and choose data to upload.'
         
         # Create the job object
         job = trans.app.model.Job()
         job.session_id = trans.get_galaxy_session().id
         job.history_id = trans.history.id
         job.tool_id = tool.id
-        try:
-            # For backward compatibility, some tools may not have versions yet.
-            job.tool_version = tool.version
-        except:
-            job.tool_version = "1.0.1"
+        job.tool_version = tool.version
         job.state = trans.app.model.Job.states.UPLOAD
         job.flush()
         log.info( 'tool %s created job id %d' % ( tool.id, job.id ) )
         trans.log_event( 'created job id %d' % job.id, tool_id=tool.id )
+
+        for name, value in tool.params_to_strings( incoming, trans.app ).iteritems():
+            job.add_parameter( name, value )
+        job.add_parameter( 'paramfile', to_json_string( json_file_path ) )
+        for i, dataset in enumerate( data_list ):
+            job.add_output_dataset( i, dataset )
+        trans.app.model.flush()
         
-        #if we could make a 'real' job here, then metadata could be set before job.finish() is called
-        hda = data_list[0] #only our first hda is being added as output for the job, why?
-        job.state = trans.app.model.Job.states.OK
-        file_size_str = datatypes.data.nice_size( hda.dataset.file_size )
-        job.info = "%s, size: %s" % ( hda.info, file_size_str )
-        job.add_output_dataset( hda.name, hda )
-        job.flush()
-        log.info( 'job id %d ended ok, file size: %s' % ( job.id, file_size_str ) )
-        trans.log_event( 'job id %d ended ok, file size: %s' % ( job.id, file_size_str ), tool_id=tool.id )
-        return dict( output=hda )
-        
-    def upload_empty(self, trans, job, err_code, err_msg, precreated_dataset = None):
-        if precreated_dataset is not None:
-            data = precreated_dataset
-        else:
-            data = trans.app.model.HistoryDatasetAssociation( create_dataset=True )
-        trans.app.security_agent.set_all_dataset_permissions( data.dataset, trans.app.security_agent.history_get_default_permissions( trans.history ) )
-        data.name = err_code
-        data.extension = "txt"
-        data.dbkey = "?"
-        data.info = err_msg
-        data.file_size = 0
-        data.state = data.states.EMPTY
-        data.flush()
-        if precreated_dataset is None:
-            trans.history.add_dataset( data )
-        trans.app.model.flush()
-        # Indicate job failure by setting state and info
-        job.state = trans.app.model.Job.states.ERROR
-        job.info = err_msg
-        job.add_output_dataset( data.name, data )
-        job.flush()
-        log.info( 'job id %d ended with errors, err_msg: %s' % ( job.id, err_msg ) )
-        trans.log_event( 'job id %d ended with errors, err_msg: %s' % ( job.id, err_msg ), tool_id=job.tool_id )
-        return dict( output=data )
-
-    def add_file( self, trans, temp_name, file_name, file_type, is_multi_byte, dbkey, info=None, space_to_tab=False, precreated_dataset=None, metadata = {}, uploaded_dataset = None ):
-        def dataset_no_data_error( data, message = 'there was an error uploading your file' ):
-            data.info = "No data: %s." % message
-            data.state = data.states.ERROR
-            if data.extension is None:
-                data.extension = 'data'
-            return data
-        data_type = None
-        
-        if precreated_dataset is not None:
-            data = precreated_dataset
-        else:
-            data = trans.app.model.HistoryDatasetAssociation( history = trans.history, create_dataset = True )
-        trans.app.security_agent.set_all_dataset_permissions( data.dataset, trans.app.security_agent.history_get_default_permissions( trans.history ) )
-        
-        # See if we have an empty file
-        if not os.path.getsize( temp_name ) > 0:
-            return dataset_no_data_error( data, message = 'you attempted to upload an empty file' )
-            #raise BadFileException( "you attempted to upload an empty file." )
-        if is_multi_byte:
-            ext = sniff.guess_ext( temp_name, is_multi_byte=True )
-        else:
-            if not data_type: #at this point data_type is always None (just initialized above), so this is always True...lots of cleanup needed here
-                # See if we have a gzipped file, which, if it passes our restrictions,
-                # we'll decompress on the fly.
-                is_gzipped, is_valid = self.check_gzip( temp_name )
-                if is_gzipped and not is_valid:
-                    return dataset_no_data_error( data, message = 'you attempted to upload an inappropriate file' )
-                    #raise BadFileException( "you attempted to upload an inappropriate file." )
-                elif is_gzipped and is_valid:
-                    # We need to uncompress the temp_name file
-                    CHUNK_SIZE = 2**20 # 1Mb   
-                    fd, uncompressed = tempfile.mkstemp()   
-                    gzipped_file = gzip.GzipFile( temp_name )
-                    while 1:
-                        try:
-                            chunk = gzipped_file.read( CHUNK_SIZE )
-                        except IOError:
-                            os.close( fd )
-                            os.remove( uncompressed )
-                            return dataset_no_data_error( data, message = 'problem decompressing gzipped data' )
-                            #raise BadFileException( 'problem decompressing gzipped data.' )
-                        if not chunk:
-                            break
-                        os.write( fd, chunk )
-                    os.close( fd )
-                    gzipped_file.close()
-                    # Replace the gzipped file with the decompressed file
-                    shutil.move( uncompressed, temp_name )
-                    file_name = file_name.rstrip( '.gz' )
-                    data_type = 'gzip'
-                ext = ''
-                if not data_type:
-                    # See if we have a zip archive
-                    is_zipped, is_valid, test_ext = self.check_zip( temp_name )
-                    if is_zipped and not is_valid:
-                        return dataset_no_data_error( data, message = 'you attempted to upload an inappropriate file' )
-                        #raise BadFileException( "you attempted to upload an inappropriate file." )
-                    elif is_zipped and is_valid:
-                        # Currently, we force specific tools to handle this case.  We also require the user
-                        # to manually set the incoming file_type
-                        if ( test_ext == 'ab1' or test_ext == 'scf' ) and file_type != 'binseq.zip':
-                            return dataset_no_data_error( data, message = "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'" )
-                            #raise BadFileException( "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'." )
-                        elif test_ext == 'txt' and file_type != 'txtseq.zip':
-                            return dataset_no_data_error( data, message = "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'" )
-                            #raise BadFileException( "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'." )
-                        if not ( file_type == 'binseq.zip' or file_type == 'txtseq.zip' ):
-                            return dataset_no_data_error( data, message = "you must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files" )
-                            #raise BadFileException( "you must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files." )
-                        data_type = 'zip'
-                        ext = file_type
-                if not data_type:
-                    if self.check_binary( temp_name ):
-                        if uploaded_dataset and uploaded_dataset.datatype and uploaded_dataset.datatype.is_binary:
-                            #we need a more generalized way of checking if a binary upload is of the right format for a datatype...magic number, etc
-                            data_type = 'binary'
-                            ext = uploaded_dataset.file_type
-                        else:
-                            parts = file_name.split( "." )
-                            if len( parts ) > 1:
-                                ext = parts[1].strip().lower()
-                                if not( ext == 'ab1' or ext == 'scf' ):
-                                    return dataset_no_data_error( data, message = "you attempted to upload an inappropriate file" )
-                                    #raise BadFileException( "you attempted to upload an inappropriate file." )
-                                if ext == 'ab1' and file_type != 'ab1':
-                                    return dataset_no_data_error( data, message = "you must manually set the 'File Format' to 'Ab1' when uploading ab1 files" )
-                                    #raise BadFileException( "you must manually set the 'File Format' to 'Ab1' when uploading ab1 files." )
-                                elif ext == 'scf' and file_type != 'scf':
-                                    return dataset_no_data_error( data, message = "you must manually set the 'File Format' to 'Scf' when uploading scf files" )
-                                    #raise BadFileException( "you must manually set the 'File Format' to 'Scf' when uploading scf files." )
-                            data_type = 'binary'
-                if not data_type:
-                    # We must have a text file
-                    if trans.app.datatypes_registry.get_datatype_by_extension( file_type ).composite_type != 'auto_primary_file' and self.check_html( temp_name ):
-                        return dataset_no_data_error( data, message = "you attempted to upload an inappropriate file" )
-                        #raise BadFileException( "you attempted to upload an inappropriate file." )
-                #if data_type != 'binary' and data_type != 'zip' and not trans.app.datatypes_registry.get_datatype_by_extension( ext ).is_binary:
-                if data_type != 'binary' and data_type != 'zip':
-                    if space_to_tab:
-                        self.line_count = sniff.convert_newlines_sep2tabs( temp_name )
-                    else:
-                        self.line_count = sniff.convert_newlines( temp_name )
-                    if file_type == 'auto':
-                        ext = sniff.guess_ext( temp_name, sniff_order=trans.app.datatypes_registry.sniff_order )    
-                    else:
-                        ext = file_type
-                    data_type = ext
-        if info is None:
-            info = 'uploaded %s file' %data_type
-        data.extension = ext
-        data.name = file_name
-        data.dbkey = dbkey
-        data.info = info
-        data.flush()
-        shutil.move( temp_name, data.file_name )
-        dataset_state = data.states.OK #don't set actual state here, only set to OK when finished setting attributes of the dataset
-        data.set_size()
-        data.init_meta()
-        #need to set metadata, has to be done after extention is set
-        for meta_name, meta_value in metadata.iteritems():
-            setattr( data.metadata, meta_name, meta_value )
-        if self.line_count is not None:
-            try:
-                if is_multi_byte:
-                    data.set_multi_byte_peek( line_count=self.line_count )
-                else:
-                    data.set_peek( line_count=self.line_count )
-            except:
-                if is_multi_byte:
-                    data.set_multi_byte_peek()
-                else:
-                    data.set_peek()
-        else:
-            if is_multi_byte:
-                data.set_multi_byte_peek()
-            else:
-                data.set_peek()
-
-        # validate incomming data
-        # Commented by greg on 3/14/07
-        # for error in data.datatype.validate( data ):
-        #     data.add_validation_error( 
-        #         model.ValidationError( message=str( error ), err_type=error.__class__.__name__, attributes=util.object_to_string( error.__dict__ ) ) )
-        if data.missing_meta():
-            data.datatype.set_meta( data )
-        dbkey_to_store = dbkey
-        if type( dbkey_to_store ) == type( [] ):
-            dbkey_to_store = dbkey[0]
-        if precreated_dataset is not None:
-            trans.history.genome_build = dbkey_to_store
-        else:
-            trans.history.add_dataset( data, genome_build=dbkey_to_store )
-        #set up composite files
-        if uploaded_dataset is not None:
-            composite_files = data.datatype.get_composite_files( data )
-            if composite_files:
-                os.mkdir( data.extra_files_path ) #make extra files path
-                for name, value in composite_files.iteritems():
-                    if uploaded_dataset.composite_files[ value.name ] is None and not value.optional:
-                        data.info = "A required composite data file was not provided (%s)" % name
-                        dataset_state = data.states.ERROR
-                        break
-                    elif uploaded_dataset.composite_files[ value.name] is not None:
-                        if not value.is_binary:
-                            if uploaded_dataset.composite_files[ value.name ].space_to_tab:
-                                sniff.convert_newlines_sep2tabs( uploaded_dataset.composite_files[ value.name ].filename )
-                            else:
-                                sniff.convert_newlines( uploaded_dataset.composite_files[ value.name ].filename )
-                        shutil.move( uploaded_dataset.composite_files[ value.name ].filename, os.path.join( data.extra_files_path, name ) )
-            if data.datatype.composite_type == 'auto_primary_file':
-               #now that metadata was set above, we should create the primary file as required
-               open( data.file_name, 'wb+' ).write( data.datatype.generate_primary_file( dataset = data ) )
-        data.state = dataset_state #Always set dataset state LAST
-        trans.app.model.flush()
-        trans.log_event( "Added dataset %d to history %d" %( data.id, trans.history.id ), tool_id="upload" )
-        return data
-
-    def check_gzip( self, temp_name ):
-        temp = open( temp_name, "U" )
-        magic_check = temp.read( 2 )
-        temp.close()
-        if magic_check != util.gzip_magic:
-            return ( False, False )
-        CHUNK_SIZE = 2**15 # 32Kb
-        gzipped_file = gzip.GzipFile( temp_name )
-        chunk = gzipped_file.read( CHUNK_SIZE )
-        gzipped_file.close()
-        if self.check_html( temp_name, chunk=chunk ) or self.check_binary( temp_name, chunk=chunk ):
-            return( True, False )
-        return ( True, True )
-
-    def check_zip( self, temp_name ):
-        if not zipfile.is_zipfile( temp_name ):
-            return ( False, False, None )
-        zip_file = zipfile.ZipFile( temp_name, "r" )
-        # Make sure the archive consists of valid files.  The current rules are:
-        # 1. Archives can only include .ab1, .scf or .txt files
-        # 2. All file extensions within an archive must be the same
-        name = zip_file.namelist()[0]
-        test_ext = name.split( "." )[1].strip().lower()
-        if not ( test_ext == 'scf' or test_ext == 'ab1' or test_ext == 'txt' ):
-            return ( True, False, test_ext )
-        for name in zip_file.namelist():
-            ext = name.split( "." )[1].strip().lower()
-            if ext != test_ext:
-                return ( True, False, test_ext )
-        return ( True, True, test_ext )
-
-    def check_html( self, temp_name, chunk=None ):
-        if chunk is None:
-            temp = open(temp_name, "U")
-        else:
-            temp = chunk
-        regexp1 = re.compile( "<A\s+[^>]*HREF[^>]+>", re.I )
-        regexp2 = re.compile( "<IFRAME[^>]*>", re.I )
-        regexp3 = re.compile( "<FRAMESET[^>]*>", re.I )
-        regexp4 = re.compile( "<META[^>]*>", re.I )
-        lineno = 0
-        for line in temp:
-            lineno += 1
-            matches = regexp1.search( line ) or regexp2.search( line ) or regexp3.search( line ) or regexp4.search( line )
-            if matches:
-                if chunk is None:
-                    temp.close()
-                return True
-            if lineno > 100:
-                break
-        if chunk is None:
-            temp.close()
-        return False
-    def check_binary( self, temp_name, chunk=None ):
-        if chunk is None:
-            temp = open( temp_name, "U" )
-        else:
-            temp = chunk
-        lineno = 0
-        for line in temp:
-            lineno += 1
-            line = line.strip()
-            if line:
-                if util.is_multi_byte( line ):
-                    return False
-                for char in line:
-                    if ord( char ) > 128:
-                        if chunk is None:
-                            temp.close()
-                        return True
-            if lineno > 10:
-                break
-        if chunk is None:
-            temp.close()
-        return False
+        # Queue the job for execution
+        trans.app.job_queue.put( job.id, tool )
+        trans.log_event( "Added job to the job queue, id: %s" % str(job.id), tool_id=job.tool_id )
+        return dict( [ ( i, v ) for i, v in enumerate( data_list ) ] )
 
     def get_precreated_dataset( self, name ):
         """
@@ -378,7 +164,3 @@
             return self.precreated_datasets.pop( names.index( name ) )
         else:
             return None
-
-class BadFileException( Exception ):
-    pass
-
diff -r 070cf5f6f928 -r a7f9325bb319 lib/galaxy/tools/parameters/basic.py
--- a/lib/galaxy/tools/parameters/basic.py	Thu Aug 20 11:39:32 2009 -0400
+++ b/lib/galaxy/tools/parameters/basic.py	Thu Aug 20 11:43:28 2009 -0400
@@ -304,21 +304,22 @@
     def get_html_field( self, trans=None, value=None, other_values={}  ):
         return form_builder.FileField( self.name, ajax = self.ajax, value = value )
     def from_html( self, value, trans=None, other_values={} ):
+        # TODO: Fix nginx upload module support
         # Middleware or proxies may encode files in special ways (TODO: this
         # should be pluggable)
-        if type( value ) == dict:
-            upload_location = self.tool.app.config.nginx_upload_location
-            assert upload_location, \
-                "Request appears to have been processed by nginx_upload_module \
-                but Galaxy is not configured to recgonize it"
-            # Check that the file is in the right location
-            local_filename = os.path.abspath( value['path'] )
-            assert local_filename.startswith( upload_location ), \
-                "Filename provided by nginx is not in correct directory"
-            value = Bunch(
-                filename = value["name"],
-                local_filename = local_filename
-            )
+        #if type( value ) == dict:
+        #    upload_location = self.tool.app.config.nginx_upload_location
+        #    assert upload_location, \
+        #        "Request appears to have been processed by nginx_upload_module \
+        #        but Galaxy is not configured to recognize it"
+        #    # Check that the file is in the right location
+        #    local_filename = os.path.abspath( value['path'] )
+        #    assert local_filename.startswith( upload_location ), \
+        #        "Filename provided by nginx is not in correct directory"
+        #    value = Bunch(
+        #        filename = value["name"],
+        #        local_filename = local_filename
+        #    )
         return value
     def get_required_enctype( self ):
         """
@@ -330,10 +331,18 @@
             return None
         elif isinstance( value, unicode ) or isinstance( value, str ):
             return value
+        elif isinstance( value, dict ):
+            # or should we jsonify?
+            try:
+                return value['local_filename']
+            except:
+                return None
         raise Exception( "FileToolParameter cannot be persisted" )
     def to_python( self, value, app ):
         if value is None:
             return None
+        elif isinstance( value, unicode ) or isinstance( value, str ):
+            return value
         else:
             raise Exception( "FileToolParameter cannot be persisted" )
     def get_initial_value( self, trans, context ):
diff -r 070cf5f6f928 -r a7f9325bb319 lib/galaxy/tools/parameters/grouping.py
--- a/lib/galaxy/tools/parameters/grouping.py	Thu Aug 20 11:39:32 2009 -0400
+++ b/lib/galaxy/tools/parameters/grouping.py	Thu Aug 20 11:43:28 2009 -0400
@@ -12,6 +12,7 @@
 from galaxy.datatypes import sniff
 from galaxy.util.bunch import Bunch
 from galaxy.util.odict import odict
+from galaxy.util import json
 
 class Group( object ):
     def __init__( self ):
@@ -167,33 +168,30 @@
             rval.append( rval_dict )
         return rval
     def get_uploaded_datasets( self, trans, context, override_name = None, override_info = None ):
-        def get_data_file_filename( data_file, is_multi_byte = False, override_name = None, override_info = None ):
+        def get_data_file_filename( data_file, override_name = None, override_info = None ):
             dataset_name = override_name
             dataset_info = override_info
             def get_file_name( file_name ):
                 file_name = file_name.split( '\\' )[-1]
                 file_name = file_name.split( '/' )[-1]
                 return file_name
-            if 'local_filename' in dir( data_file ):
+            try:
                 # Use the existing file
-                return data_file.local_filename, get_file_name( data_file.filename ), is_multi_byte
-            elif 'filename' in dir( data_file ):
-                #create a new tempfile
-                try:
-                    temp_name, is_multi_byte = sniff.stream_to_file( data_file.file, prefix='upload' )
-                    precreated_name = get_file_name( data_file.filename )
-                    if not dataset_name:
-                        dataset_name = precreated_name
-                    if not dataset_info:
-                        dataset_info = 'uploaded file'
-                    return temp_name, get_file_name( data_file.filename ), is_multi_byte, dataset_name, dataset_info
-                except Exception, e:
-                    log.exception( 'exception in sniff.stream_to_file using file %s: %s' % ( data_file.filename, str( e ) ) )
-                    self.remove_temp_file( temp_name )
-            return None, None, is_multi_byte, None, None
-        def filenames_from_url_paste( url_paste, group_incoming, override_name = None, override_info = None ):
+                if not dataset_name and 'filename' in data_file:
+                    dataset_name = get_file_name( data_file['filename'] )
+                if not dataset_info:
+                    dataset_info = 'uploaded file'
+                return Bunch( type='file', path=data_file['local_filename'], name=get_file_name( data_file['filename'] ) )
+                #return 'file', data_file['local_filename'], get_file_name( data_file.filename ), dataset_name, dataset_info
+            except:
+                # The uploaded file should've been persisted by the upload tool action
+                return Bunch( type=None, path=None, name=None )
+                #return None, None, None, None, None
+        def get_url_paste_urls_or_filename( group_incoming, override_name = None, override_info = None ):
             filenames = []
-            if url_paste not in [ None, "" ]:
+            url_paste_file = group_incoming.get( 'url_paste', None )
+            if url_paste_file is not None:
+                url_paste = open( url_paste_file, 'r' ).read( 1024 )
                 if url_paste.lstrip().lower().startswith( 'http://' ) or url_paste.lstrip().lower().startswith( 'ftp://' ):
                     url_paste = url_paste.replace( '\r', '' ).split( '\n' )
                     for line in url_paste:
@@ -208,114 +206,54 @@
                             dataset_info = override_info
                             if not dataset_info:
                                 dataset_info = 'uploaded url'
-                            try:
-                                temp_name, is_multi_byte = sniff.stream_to_file( urllib.urlopen( line ), prefix='url_paste' )
-                            except Exception, e:
-                                temp_name = None
-                                precreated_name = str( e )
-                                log.exception( 'exception in sniff.stream_to_file using url_paste %s: %s' % ( url_paste, str( e ) ) )
-                                try:
-                                    self.remove_temp_file( temp_name )
-                                except:
-                                    pass
-                            yield ( temp_name, precreated_name, is_multi_byte, dataset_name, dataset_info )
-                            #yield ( None, str( e ), False, dataset_name, dataset_info )
+                            yield Bunch( type='url', path=line, name=precreated_name )
+                            #yield ( 'url', line, precreated_name, dataset_name, dataset_info )
                 else:
                     dataset_name = dataset_info = precreated_name = 'Pasted Entry' #we need to differentiate between various url pastes here
                     if override_name:
                         dataset_name = override_name
                     if override_info:
                         dataset_info = override_info
-                    is_valid = False
-                    for line in url_paste: #Trim off empty lines from begining
-                        line = line.rstrip( '\r\n' )
-                        if line:
-                            is_valid = True
-                            break
-                    if is_valid:
-                        try:
-                            temp_name, is_multi_byte = sniff.stream_to_file( StringIO.StringIO( url_paste ), prefix='strio_url_paste' )
-                        except Exception, e:
-                            log.exception( 'exception in sniff.stream_to_file using StringIO.StringIO( url_paste ) %s: %s' % ( url_paste, str( e ) ) )
-                            temp_name = None
-                            precreated_name = str( e )
-                            try:
-                                self.remove_temp_file( temp_name )
-                            except:
-                                pass
-                        yield ( temp_name, precreated_name, is_multi_byte, dataset_name, dataset_info )
-                        #yield ( None, str( e ), False, dataset_name, dataset_info )
-        
+                    yield Bunch( type='file', path=url_paste_file, name=precreated_name )
+                    #yield ( 'file', url_paste_file, precreated_name, dataset_name, dataset_info )
         def get_one_filename( context ):
             data_file = context['file_data']
             url_paste = context['url_paste']
             name = context.get( 'NAME', None )
             info = context.get( 'INFO', None )
             warnings = []
-            is_multi_byte = False
             space_to_tab = False 
             if context.get( 'space_to_tab', None ) not in ["None", None]:
                 space_to_tab = True
-            temp_name, precreated_name, is_multi_byte, dataset_name, dataset_info = get_data_file_filename( data_file, is_multi_byte = is_multi_byte, override_name = name, override_info = info )
-            if temp_name:
+            file_bunch = get_data_file_filename( data_file, override_name = name, override_info = info )
+            if file_bunch.path:
                 if url_paste.strip():
                     warnings.append( "All file contents specified in the paste box were ignored." )
             else: #we need to use url_paste
-                #file_names = filenames_from_url_paste( url_paste, context, override_name = name, override_info = info )
-                for temp_name, precreated_name, is_multi_byte, dataset_name, dataset_info in filenames_from_url_paste( url_paste, context, override_name = name, override_info = info ):#file_names:
-                    if temp_name:
+                for file_bunch in get_url_paste_urls_or_filename( context, override_name = name, override_info = info ):
+                    if file_bunch.path:
                         break
-                ###this check will cause an additional file to be retrieved and created...so lets not do that
-                #try: #check to see if additional paste contents were available
-                #    file_names.next()
-                #    warnings.append( "Additional file contents were specified in the paste box, but ignored." )
-                #except StopIteration:
-                #    pass
-            return temp_name, precreated_name, is_multi_byte, space_to_tab, dataset_name, dataset_info, warnings
-        
+            return file_bunch, warnings
         def get_filenames( context ):
             rval = []
             data_file = context['file_data']
             url_paste = context['url_paste']
             name = context.get( 'NAME', None )
             info = context.get( 'INFO', None )
-            warnings = []
-            is_multi_byte = False
             space_to_tab = False 
             if context.get( 'space_to_tab', None ) not in ["None", None]:
                 space_to_tab = True
-            temp_name, precreated_name, is_multi_byte, dataset_name, dataset_info = get_data_file_filename( data_file, is_multi_byte = is_multi_byte, override_name = name, override_info = info )
-            if temp_name:
-                rval.append( ( temp_name, precreated_name, is_multi_byte, space_to_tab, dataset_name, dataset_info ) )
-            for temp_name, precreated_name, is_multi_byte, dataset_name, dataset_info in filenames_from_url_paste( url_paste, context, override_name = name, override_info = info ):
-                if temp_name:
-                    rval.append( ( temp_name, precreated_name, is_multi_byte, space_to_tab, dataset_name, dataset_info ) )
+            warnings = []
+            file_bunch = get_data_file_filename( data_file, override_name = name, override_info = info )
+            if file_bunch.path:
+                file_bunch.space_to_tab = space_to_tab
+                rval.append( file_bunch )
+                #rval.append( ( type, temp_name, precreated_name, space_to_tab, dataset_name, dataset_info ) )
+            for file_bunch in get_url_paste_urls_or_filename( context, override_name = name, override_info = info ):
+                if file_bunch.path:
+                    file_bunch.space_to_tab = space_to_tab
+                    rval.append( file_bunch )
             return rval
-        class UploadedDataset( Bunch ):
-            def __init__( self, **kwd ):
-                Bunch.__init__( self, **kwd )
-                self.primary_file = None
-                self.composite_files = odict()
-                self.dbkey = None
-                self.warnings = []
-                self.metadata = {}
-                
-                self._temp_filenames = [] #store all created filenames here, delete on cleanup
-            def register_temp_file( self, filename ):
-                if isinstance( filename, list ):
-                    self._temp_filenames.extend( filename )
-                else:
-                    self._temp_filenames.append( filename )
-            def remove_temp_file( self, filename ):
-                try:
-                    os.unlink( filename )
-                except Exception, e:
-                    pass
-                    #log.warning( str( e ) )
-            def clean_up_temp_files( self ):
-                for filename in self._temp_filenames:
-                    self.remove_temp_file( filename )
-        
         file_type = self.get_file_type( context )
         d_type = self.get_datatype( trans, context )
         dbkey = context.get( 'dbkey', None )
@@ -325,51 +263,50 @@
         for group_incoming in context.get( self.name, [] ):
             i = int( group_incoming['__index__'] )
             groups_incoming[ i ] = group_incoming
-        
         if d_type.composite_type is not None:
             #handle uploading of composite datatypes
             #Only one Dataset can be created
             
+            '''
             dataset = UploadedDataset()
+            dataset.datatype = d_type
+            '''
+            dataset = Bunch()
+            dataset.type = 'composite'
             dataset.file_type = file_type
+            dataset.dbkey = dbkey
             dataset.datatype = d_type
-            dataset.dbkey = dbkey
+            dataset.warnings = []
+            dataset.metadata = {}
+            dataset.composite_files = {}
             
             #load metadata
             files_metadata = context.get( self.metadata_ref, {} )
-            for meta_name, meta_spec in d_type.metadata_spec.iteritems():
+            for meta_name, meta_spec in d_type.metadata_spec.iteritems():
                 if meta_spec.set_in_upload:
                     if meta_name in files_metadata:
                         dataset.metadata[ meta_name ] = files_metadata[ meta_name ]
-            
-            temp_name = None
-            precreated_name = None
-            is_multi_byte = False
-            space_to_tab = False
-            warnings = []
             
             dataset_name = None
             dataset_info = None
             if dataset.datatype.composite_type == 'auto_primary_file':
                 #replace sniff here with just creating an empty file
                 temp_name, is_multi_byte = sniff.stream_to_file( StringIO.StringIO( d_type.generate_primary_file() ), prefix='upload_auto_primary_file' )
-                precreated_name = dataset_name = 'Uploaded Composite Dataset (%s)' % ( file_type )
+                dataset.primary_file = temp_name
+                dataset.space_to_tab = False
+                dataset.precreated_name = dataset.name = 'Uploaded Composite Dataset (%s)' % ( file_type )
             else:
-                temp_name, precreated_name, is_multi_byte, space_to_tab, dataset_name, dataset_info, warnings = get_one_filename( groups_incoming[ 0 ] )
+                file_bunch, warnings = get_one_filename( groups_incoming[ 0 ] )
                 if dataset.datatype.composite_type:
                     precreated_name = 'Uploaded Composite Dataset (%s)' % ( file_type )
                 writable_files_offset = 1
-            if temp_name is None:#remove this before finish, this should create an empty dataset
+                dataset.primary_file = file_bunch.path
+                dataset.space_to_tab = file_bunch.space_to_tab
+                dataset.precreated_name = file_bunch.precreated_name
+                dataset.name = file_bunch.precreated_name
+                dataset.warnings.extend( file_bunch.warnings )
+            if dataset.primary_file is None:#remove this before finish, this should create an empty dataset
                 raise Exception( 'No primary dataset file was available for composite upload' )
-            dataset.primary_file = temp_name
-            dataset.is_multi_byte = is_multi_byte
-            dataset.space_to_tab = space_to_tab
-            dataset.precreated_name = precreated_name
-            dataset.name = dataset_name
-            dataset.info = dataset_info
-            dataset.warnings.extend( warnings )
-            dataset.register_temp_file( temp_name )
-            
             keys = [ value.name for value in writable_files.values() ]
             for i, group_incoming in enumerate( groups_incoming[ writable_files_offset : ] ):
                 key = keys[ i + writable_files_offset ]
@@ -377,37 +314,22 @@
                     dataset.warnings.append( "A required composite file (%s) was not specified." % ( key ) )
                     dataset.composite_files[ key ] = None
                 else:
-                    temp_name, precreated_name, is_multi_byte, space_to_tab, dataset_name, dataset_info, warnings = get_one_filename( group_incoming )
-                    if temp_name:
-                        dataset.composite_files[ key ] = Bunch( filename = temp_name, precreated_name = precreated_name, is_multi_byte = is_multi_byte, space_to_tab = space_to_tab, warnings = warnings, info = dataset_info, name = dataset_name )
-                        dataset.register_temp_file( temp_name )
+                    file_bunch, warnings = get_one_filename( group_incoming )
+                    if file_bunch.path:
+                        dataset.composite_files[ key ] = file_bunch.__dict__
                     else:
                         dataset.composite_files[ key ] = None
                         if not writable_files[ writable_files.keys()[ keys.index( key ) ] ].optional:
                             dataset.warnings.append( "A required composite file (%s) was not specified." % ( key ) )
             return [ dataset ]
         else:
+            datasets = get_filenames( context[ self.name ][0] )
             rval = []
-            for temp_name, precreated_name, is_multi_byte, space_to_tab, dataset_name, dataset_info, in get_filenames( context[ self.name ][0] ):
-                dataset = UploadedDataset()
+            for dataset in datasets:
                 dataset.file_type = file_type
-                dataset.datatype = d_type
                 dataset.dbkey = dbkey
-                dataset.primary_file = temp_name
-                dataset.is_multi_byte = is_multi_byte
-                dataset.space_to_tab = space_to_tab
-                dataset.name = dataset_name
-                dataset.info = dataset_info
-                dataset.precreated_name = precreated_name
-                dataset.register_temp_file( temp_name )
                 rval.append( dataset )
-        return rval
-    def remove_temp_file( self, filename ):
-        try:
-            os.unlink( filename )
-        except Exception, e:
-            log.warning( str( e ) )
-
+            return rval
 
 class Conditional( Group ):
     type = "conditional"
diff -r 070cf5f6f928 -r a7f9325bb319 lib/galaxy/util/__init__.py
--- a/lib/galaxy/util/__init__.py	Thu Aug 20 11:39:32 2009 -0400
+++ b/lib/galaxy/util/__init__.py	Thu Aug 20 11:43:28 2009 -0400
@@ -3,7 +3,7 @@
 
 """
 import logging
-import threading, random, string, re, binascii, pickle, time, datetime, math, re, os, sys
+import threading, random, string, re, binascii, pickle, time, datetime, math, re, os, sys, tempfile
 
 # Older py compatibility
 try:
@@ -454,6 +454,26 @@
         out_dict[ str( key ) ] = value
     return out_dict
 
+def mkstemp_ln( src, prefix='mkstemp_ln_' ):
+    """
+    Modeled on tempfile._mkstemp_inner: create a hard link to src, under a
+    random name, in the same directory as src.  Exists so that the file
+    underlying a NamedTemporaryFile can be persisted after it is closed.
+    """
+    dir = os.path.dirname(src)
+    names = tempfile._get_candidate_names()
+    for seq in xrange(tempfile.TMP_MAX):
+        name = names.next()
+        file = os.path.join(dir, prefix + name)
+        try:
+            linked_path = os.link( src, file )
+            return (os.path.abspath(file))
+        except OSError, e:
+            if e.errno == errno.EEXIST:
+                continue # try again
+            raise
+    raise IOError, (errno.EEXIST, "No usable temporary file name found")
+
 galaxy_root_path = os.path.join(__path__[0], "..","..","..")
 dbnames = read_dbnames( os.path.join( galaxy_root_path, "tool-data", "shared", "ucsc", "builds.txt" ) ) #this list is used in edit attributes and the upload tool
 ucsc_build_sites = read_build_sites( os.path.join( galaxy_root_path, "tool-data", "shared", "ucsc", "ucsc_build_sites.txt" ) ) #this list is used in history.tmpl
diff -r 070cf5f6f928 -r a7f9325bb319 lib/galaxy/web/controllers/tool_runner.py
--- a/lib/galaxy/web/controllers/tool_runner.py	Thu Aug 20 11:39:32 2009 -0400
+++ b/lib/galaxy/web/controllers/tool_runner.py	Thu Aug 20 11:43:28 2009 -0400
@@ -136,6 +136,7 @@
         """
         Precreate datasets for asynchronous uploading.
         """
+        permissions = trans.app.security_agent.history_get_default_permissions( trans.history )
         def create_dataset( name, history ):
             data = trans.app.model.HistoryDatasetAssociation( create_dataset = True )
             data.name = name
@@ -143,6 +144,7 @@
             data.history = history
             data.flush()
             history.add_dataset( data )
+            trans.app.security_agent.set_all_dataset_permissions( data.dataset, permissions )
             return data
         tool = self.get_toolbox().tools_by_id.get( tool_id, None )
         if not tool:
diff -r 070cf5f6f928 -r a7f9325bb319 lib/galaxy/web/framework/base.py
--- a/lib/galaxy/web/framework/base.py	Thu Aug 20 11:39:32 2009 -0400
+++ b/lib/galaxy/web/framework/base.py	Thu Aug 20 11:43:28 2009 -0400
@@ -212,6 +212,17 @@
         else:
             return None
     
+# For request.params, override cgi.FieldStorage.make_file to create persistent
+# tempfiles.  Necessary for externalizing the upload tool.  It's a little hacky
+# but for performance reasons it's way better to use Paste's tempfile than to
+# create a new one and copy.
+import cgi
+class FieldStorage( cgi.FieldStorage ):
+    def make_file(self, binary=None):
+        import tempfile
+        return tempfile.NamedTemporaryFile()
+cgi.FieldStorage = FieldStorage
+
 class Request( webob.Request ):
     """
     Encapsulates an HTTP request. 
diff -r 070cf5f6f928 -r a7f9325bb319 templates/base_panels.mako
--- a/templates/base_panels.mako	Thu Aug 20 11:39:32 2009 -0400
+++ b/templates/base_panels.mako	Thu Aug 20 11:43:28 2009 -0400
@@ -72,9 +72,6 @@
     <script type="text/javascript">
         jQuery( function() {
             $("iframe#galaxy_main").load( function() {
-                ##$(this.contentDocument).find("input[galaxy-ajax-upload]").each( function() {
-                ##$("iframe")[0].contentDocument.body.innerHTML = "HELLO"
-                ##$(this.contentWindow.document).find("input[galaxy-ajax-upload]").each( function() {
                 $(this).contents().find("form").each( function() { 
                     if ( $(this).find("input[galaxy-ajax-upload]").length > 0 ){
                         $(this).submit( function() {
diff -r 070cf5f6f928 -r a7f9325bb319 test/base/twilltestcase.py
--- a/test/base/twilltestcase.py	Thu Aug 20 11:39:32 2009 -0400
+++ b/test/base/twilltestcase.py	Thu Aug 20 11:43:28 2009 -0400
@@ -93,6 +93,8 @@
                 valid_hid = int( hid )
             except:
                 raise AssertionError, "Invalid hid (%s) created when uploading file %s" % ( hid, filename )
+        # Wait for upload processing to finish (TODO: this should be done in each test case instead)
+        self.wait()
     def upload_url_paste( self, url_paste, ftype='auto', dbkey='unspecified (?)' ):
         """Pasted data in the upload utility"""
         self.visit_page( "tool_runner/index?tool_id=upload1" )
@@ -112,6 +114,8 @@
                 valid_hid = int( hid )
             except:
                 raise AssertionError, "Invalid hid (%s) created when pasting %s" % ( hid, url_paste )
+        # Wait for upload processing to finish (TODO: this should be done in each test case instead)
+        self.wait()
 
     # Functions associated with histories
     def check_history_for_errors( self ):
diff -r 070cf5f6f928 -r a7f9325bb319 tools/data_source/upload.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/upload.py	Thu Aug 20 11:43:28 2009 -0400
@@ -0,0 +1,280 @@
+#!/usr/bin/env python
+#Processes uploads from the user.
+
+# WARNING: Changes in this tool (particularly as related to parsing) may need
+# to be reflected in galaxy.web.controllers.tool_runner and galaxy.tools
+
+import urllib, sys, os, gzip, tempfile, shutil, re, gzip, zipfile
+from galaxy import eggs
+# need to import model before sniff to resolve a circular import dependency
+import galaxy.model
+from galaxy.datatypes import sniff
+from galaxy import util
+from galaxy.util.json import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg, ret=1 ):
+    """
+    Write msg to standard error exactly as given (no newline is appended)
+    and terminate the interpreter with exit status ret.
+    """
+    err_stream = sys.stderr
+    err_stream.write( msg )
+    raise SystemExit( ret )
+
+def file_err( msg, dataset, json_file ):
+    """
+    Record a failed upload for dataset and discard its file.
+
+    One JSON line is written to json_file with type 'dataset', ext 'data',
+    the dataset's dataset_id and msg as stderr.  Afterwards dataset.path is
+    deleted; any failure while doing so is ignored.
+    """
+    error_record = dict( type = 'dataset',
+                         ext = 'data',
+                         dataset_id = dataset.dataset_id,
+                         stderr = msg )
+    json_line = to_json_string( error_record ) + "\n"
+    json_file.write( json_line )
+    # Best-effort cleanup: the path may already be gone or may not be a
+    # local file at all, so nothing raised here is allowed to escape.
+    try:
+        os.remove( dataset.path )
+    except:
+        pass
+
+def safe_dict(d):
+    """
+    Return a deep copy of a decoded json structure in which every dictionary
+    key has been encoded to a UTF-8 byte string.  Lists are copied element by
+    element; any other value is returned unchanged.
+    http://mellowmachines.com/blog/2009/06/exploding-dictionary-with-unicode-key...
+    """
+    if isinstance(d, dict):
+        cloned = {}
+        for key, value in d.iteritems():
+            cloned[ key.encode('utf-8') ] = safe_dict(value)
+        return cloned
+    if isinstance(d, list):
+        return [safe_dict(item) for item in d]
+    return d
+
+def check_html( temp_name, chunk=None ):
+    """
+    Return True if HTML-like markup is found near the start of the data.
+
+    A line counts as a hit when it contains (case-insensitively) an <A ...>
+    tag carrying an HREF attribute, or an <IFRAME>, <FRAMESET> or <META> tag.
+    At most the first 101 lines are examined.
+
+    temp_name -- path of the file to read; only opened when chunk is None
+    chunk     -- optional data to scan instead of the file: either a string
+                 (it is split into lines first) or any iterable of lines
+    """
+    html_tag = re.compile( r"<A\s+[^>]*HREF[^>]+>|<IFRAME[^>]*>|<FRAMESET[^>]*>|<META[^>]*>", re.I )
+    opened = None
+    if chunk is None:
+        opened = open( temp_name, "U" )
+        lines = opened
+    elif hasattr( chunk, 'splitlines' ):
+        # Iterating a string directly yields single characters, against which
+        # none of the tag patterns can ever match, so split it into lines.
+        lines = chunk.splitlines()
+    else:
+        lines = chunk
+    try:
+        lineno = 0
+        for line in lines:
+            lineno += 1
+            if html_tag.search( line ):
+                return True
+            if lineno > 100:
+                break
+        return False
+    finally:
+        # Only close what was opened here; a caller-supplied chunk is left alone
+        if opened is not None:
+            opened.close()
+
+def check_binary( temp_name, chunk=None ):
+    """
+    Return True if a character with an ordinal above 128 occurs near the
+    start of the data.  At most the first 11 lines are examined, and leading
+    and trailing whitespace of each line is ignored.
+
+    temp_name -- path of the file to read; only opened when chunk is None
+    chunk     -- optional data to scan instead of the file: either a string
+                 (it is split into lines first) or any iterable of lines
+    """
+    opened = None
+    if chunk is None:
+        opened = open( temp_name, "U" )
+        lines = opened
+    elif hasattr( chunk, 'splitlines' ):
+        # Iterating a string directly yields single characters, which would
+        # limit the scan to the first few characters instead of lines.
+        lines = chunk.splitlines()
+    else:
+        lines = chunk
+    try:
+        lineno = 0
+        for line in lines:
+            lineno += 1
+            for char in line.strip():
+                # NOTE(review): the threshold lets byte 128 through although
+                # ASCII ends at 127; left unchanged here - confirm intent.
+                if ord( char ) > 128:
+                    return True
+            if lineno > 10:
+                break
+        return False
+    finally:
+        # Only close what was opened here; a caller-supplied chunk is left alone
+        if opened is not None:
+            opened.close()
+
+def check_gzip( temp_name ):
+    """
+    Decide whether the file at temp_name is gzipped and, if so, whether its
+    content is acceptable.
+
+    Returns a tuple ( is_gzipped, is_valid ):
+      ( False, False ) -- the first two bytes differ from util.gzip_magic
+      ( True, False )  -- gzipped, but check_html() or check_binary() flagged
+                          the first 32Kb of decompressed data
+      ( True, True )   -- gzipped and neither check flagged that data
+
+    Errors raised while reading or decompressing are not caught here.
+    """
+    # The magic number is raw bytes, so read it in binary mode
+    temp = open( temp_name, "rb" )
+    try:
+        magic_check = temp.read( 2 )
+    finally:
+        temp.close()
+    if magic_check != util.gzip_magic:
+        return ( False, False )
+    CHUNK_SIZE = 2**15 # 32Kb
+    gzipped_file = gzip.GzipFile( temp_name )
+    try:
+        chunk = gzipped_file.read( CHUNK_SIZE )
+    finally:
+        # Release the handle even when decompression fails
+        gzipped_file.close()
+    if check_html( temp_name, chunk=chunk ) or check_binary( temp_name, chunk=chunk ):
+        return ( True, False )
+    return ( True, True )
+
+def check_zip( temp_name ):
+    """
+    Decide whether the file at temp_name is a zip archive and, if so, whether
+    its members satisfy the upload rules.
+
+    Returns a tuple ( is_zipped, is_valid, test_ext ) where test_ext is the
+    lower-cased extension of the first member (None when the file is not a
+    zip archive or the archive has no members, '' when the first member has
+    no extension).
+    """
+    if not zipfile.is_zipfile( temp_name ):
+        return ( False, False, None )
+    zip_file = zipfile.ZipFile( temp_name, "r" )
+    try:
+        names = zip_file.namelist()
+    finally:
+        zip_file.close()
+    if not names:
+        # Nothing to validate against the rules below
+        return ( True, False, None )
+    # Make sure the archive consists of valid files.  The current rules are:
+    # 1. Archives can only include .ab1, .scf or .txt files
+    # 2. All file extensions within an archive must be the same
+    test_ext = _zip_member_ext( names[0] )
+    if test_ext not in ( 'scf', 'ab1', 'txt' ):
+        return ( True, False, test_ext )
+    for name in names:
+        if _zip_member_ext( name ) != test_ext:
+            return ( True, False, test_ext )
+    return ( True, True, test_ext )
+
+def _zip_member_ext( name ):
+    """
+    Return the lower-cased extension of an archive member name without the
+    leading dot, or '' when the name has no extension.
+    """
+    # splitext looks at the last dot only, so 'reads.v2.txt' yields 'txt' and
+    # a name without any dot yields '' instead of raising IndexError.
+    return os.path.splitext( name )[1][1:].strip().lower()
+
+def add_file( dataset, json_file ):
+    """
+    Validate and normalize a single non-composite upload, then append one
+    JSON line describing the result to json_file.
+
+    Steps, in order:
+      1. For dataset.type 'url', fetch dataset.path into a temporary file and
+         point dataset.path at it.
+      2. Reject a missing or empty file.
+      3. Multi-byte data is only sniffed for its extension.
+      4. Otherwise: gunzip in place when the gzip content passes check_gzip(),
+         validate zip archives, validate binary data, reject HTML-like text,
+         and for everything that is neither binary nor zip convert newlines
+         (optionally separators to tabs) and determine the extension.
+
+    Every rejection is reported through file_err() and ends processing of
+    this dataset.  On success the record written has the keys type,
+    dataset_id, path, ext, stdout, name and line_count.
+    """
+    data_type = None
+    line_count = None
+
+    if dataset.type == 'url':
+        try:
+            temp_name, is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' )
+        except Exception, e:
+            file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
+            return
+        dataset.path = temp_name
+        dataset.is_multi_byte = is_multi_byte
+
+    # Make sure there is actually something on disk to process
+    if not os.path.exists( dataset.path ):
+        file_err( 'Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file )
+        return
+    if not os.path.getsize( dataset.path ) > 0:
+        file_err( 'The uploaded file is empty', dataset, json_file )
+        return
+    if 'is_multi_byte' not in dir( dataset ):
+        # Close the handle explicitly instead of leaving it to the garbage collector
+        head = open( dataset.path, 'r' )
+        try:
+            dataset.is_multi_byte = util.is_multi_byte( head.read( 1024 )[:100] )
+        finally:
+            head.close()
+    if dataset.is_multi_byte:
+        ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
+        data_type = ext
+    else:
+        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
+        is_gzipped, is_valid = check_gzip( dataset.path )
+        if is_gzipped and not is_valid:
+            file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
+            return
+        elif is_gzipped and is_valid:
+            if not _gunzip_in_place( dataset, json_file ):
+                return
+            data_type = 'gzip'
+        if not data_type:
+            # See if we have a zip archive
+            is_zipped, is_valid, test_ext = check_zip( dataset.path )
+            if is_zipped and not is_valid:
+                file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
+                return
+            elif is_zipped and is_valid:
+                # Currently, we force specific tools to handle this case.  We also require the user
+                # to manually set the incoming file_type
+                if ( test_ext == 'ab1' or test_ext == 'scf' ) and dataset.file_type != 'binseq.zip':
+                    file_err( "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'", dataset, json_file )
+                    return
+                elif test_ext == 'txt' and dataset.file_type != 'txtseq.zip':
+                    file_err( "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'", dataset, json_file )
+                    return
+                if not ( dataset.file_type == 'binseq.zip' or dataset.file_type == 'txtseq.zip' ):
+                    file_err( "You must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files", dataset, json_file )
+                    return
+                data_type = 'zip'
+                ext = dataset.file_type
+        if not data_type:
+            if check_binary( dataset.path ):
+                if dataset.is_binary is not None:
+                    data_type = 'binary'
+                    ext = dataset.file_type
+                else:
+                    parts = dataset.name.split( "." )
+                    if len( parts ) > 1:
+                        ext = parts[1].strip().lower()
+                        if not( ext == 'ab1' or ext == 'scf' ):
+                            file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
+                            return
+                        if ext == 'ab1' and dataset.file_type != 'ab1':
+                            file_err( "You must manually set the 'File Format' to 'Ab1' when uploading ab1 files.", dataset, json_file )
+                            return
+                        elif ext == 'scf' and dataset.file_type != 'scf':
+                            file_err( "You must manually set the 'File Format' to 'Scf' when uploading scf files.", dataset, json_file )
+                            return
+                    else:
+                        # The name has no extension to validate.  Fall back to
+                        # the declared format so that ext is bound when the
+                        # job info is written below.
+                        ext = dataset.file_type
+                    data_type = 'binary'
+        if not data_type:
+            # We must have a text file
+            if check_html( dataset.path ):
+                file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
+                return
+        if data_type != 'binary' and data_type != 'zip':
+            if dataset.space_to_tab:
+                line_count = sniff.convert_newlines_sep2tabs( dataset.path )
+            else:
+                line_count = sniff.convert_newlines( dataset.path )
+            if dataset.file_type == 'auto':
+                ext = sniff.guess_ext( dataset.path )
+            else:
+                ext = dataset.file_type
+            data_type = ext
+    # Save job info for the framework
+    info = dict( type = 'dataset',
+                 dataset_id = dataset.dataset_id,
+                 path = dataset.path,
+                 ext = ext,
+                 stdout = 'uploaded %s file' % data_type,
+                 name = dataset.name,
+                 line_count = line_count )
+    json_file.write( to_json_string( info ) + "\n" )
+
+def _gunzip_in_place( dataset, json_file ):
+    """
+    Replace the gzipped file at dataset.path with its decompressed content
+    and drop a trailing '.gz' from dataset.name.
+
+    Returns True on success.  When decompression raises IOError the partial
+    output is removed, the problem is reported through file_err() and False
+    is returned.
+    """
+    CHUNK_SIZE = 2**20 # 1Mb
+    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ) )
+    gzipped_file = gzip.GzipFile( dataset.path )
+    try:
+        try:
+            while 1:
+                chunk = gzipped_file.read( CHUNK_SIZE )
+                if not chunk:
+                    break
+                os.write( fd, chunk )
+        finally:
+            # Both handles are released on success and on every failure
+            os.close( fd )
+            gzipped_file.close()
+    except IOError:
+        os.remove( uncompressed )
+        file_err( 'Problem decompressing gzipped data', dataset, json_file )
+        return False
+    # Replace the gzipped file with the decompressed file
+    shutil.move( uncompressed, dataset.path )
+    # str.rstrip( '.gz' ) would strip any run of the characters '.', 'g' and
+    # 'z' (turning 'egg.gz' into 'e'), so remove the exact suffix instead.
+    if dataset.name.endswith( '.gz' ):
+        dataset.name = dataset.name[ :-len( '.gz' ) ]
+    return True
+
+def add_composite_file( dataset, json_file ):
+    """
+    Move the component files of a composite upload into
+    dataset.extra_files_path and append one JSON line describing the dataset
+    to json_file.
+
+    For each entry of dataset.composite_files the uploaded path is looked up
+    in dataset.composite_file_paths.  A missing non-optional component is
+    reported through file_err() and stops the loop.  Components that are
+    present have their newlines converted (separators to tabs when the
+    component asks for it) unless they are binary, and are then moved into
+    the extra files directory under the component's name.
+    """
+    if dataset.composite_files:
+        os.mkdir( dataset.extra_files_path )
+        for name, value in dataset.composite_files.iteritems():
+            value = util.bunch.Bunch( **value )
+            component = dataset.composite_file_paths[ value.name ]
+            if component is None and not value.optional:
+                file_err( 'A required composite data file was not provided (%s)' % name, dataset, json_file )
+                # NOTE(review): after this break the record below is still
+                # written for the same dataset - confirm that is intended.
+                break
+            elif component is not None:
+                if not value.is_binary:
+                    # The component description itself carries the flag; the
+                    # name 'uploaded_dataset' used here before is not defined
+                    # anywhere in this script.
+                    if value.space_to_tab:
+                        sniff.convert_newlines_sep2tabs( component[ 'path' ] )
+                    else:
+                        sniff.convert_newlines( component[ 'path' ] )
+                shutil.move( component[ 'path' ], os.path.join( dataset.extra_files_path, name ) )
+    info = dict( type = 'dataset',
+                 dataset_id = dataset.dataset_id,
+                 path = dataset.primary_file,
+                 stdout = 'uploaded %s file' % dataset.file_type )
+    json_file.write( to_json_string( info ) + "\n" )
+
+def __main__():
+    """
+    Script entry point.  Expects exactly one argument: the path of a
+    parameter file holding one JSON-encoded dataset description per line.
+
+    Each description is handed to add_composite_file() when its type is
+    'composite' and to add_file() otherwise; both append their results to
+    'galaxy.json' in the current working directory.  The parameter file is
+    removed once every line has been processed.
+    """
+    if len( sys.argv ) != 2:
+        print >>sys.stderr, 'usage: upload.py <json paramfile>'
+        sys.exit( 1 )
+
+    json_file = open( 'galaxy.json', 'w' )
+    try:
+        param_file = open( sys.argv[1], 'r' )
+        try:
+            for line in param_file:
+                if not line.strip():
+                    # A blank line is not a dataset description
+                    continue
+                dataset = from_json_string( line )
+                dataset = util.bunch.Bunch( **safe_dict( dataset ) )
+
+                if dataset.type == 'composite':
+                    add_composite_file( dataset, json_file )
+                else:
+                    add_file( dataset, json_file )
+        finally:
+            # Release the handle before the file is removed below
+            param_file.close()
+    finally:
+        # Flush and close the results even when processing raised
+        json_file.close()
+
+    # clean up paramfile
+    try:
+        os.remove( sys.argv[1] )
+    except OSError:
+        pass
+
+if __name__ == '__main__':
+    __main__()
diff -r 070cf5f6f928 -r a7f9325bb319 tools/data_source/upload.xml
--- a/tools/data_source/upload.xml	Thu Aug 20 11:39:32 2009 -0400
+++ b/tools/data_source/upload.xml	Thu Aug 20 11:43:28 2009 -0400
@@ -1,10 +1,13 @@
 <?xml version="1.0"?>
 
-<tool name="Upload File" id="upload1" version="1.0.2">
+<tool name="Upload File" id="upload1" version="1.0.3">
   <description>
     from your computer  
   </description>
   <action module="galaxy.tools.actions.upload" class="UploadToolAction"/>
+  <command interpreter="python">
+    upload.py $paramfile
+  </command>
   <inputs>
     <param name="file_type" type="select" label="File Format" help="Which format? See help below">
       <options from_parameter="tool.app.datatypes_registry.upload_file_formats" transform_lines="[ "%s%s%s" % ( line, self.separator, line ) for line in obj ]">

    

Greg Von Kuster

tags

participants (1)