details: http://www.bx.psu.edu/hg/galaxy/rev/f6e0863862ef changeset: 2583:f6e0863862ef user: Nate Coraor <nate@bx.psu.edu> date: Thu Aug 20 10:49:54 2009 -0400 description: Real Job(tm) upload support 12 file(s) affected in this change: lib/galaxy/jobs/__init__.py lib/galaxy/tools/__init__.py lib/galaxy/tools/actions/upload.py lib/galaxy/tools/parameters/basic.py lib/galaxy/tools/parameters/grouping.py lib/galaxy/util/__init__.py lib/galaxy/web/controllers/tool_runner.py lib/galaxy/web/framework/base.py templates/base_panels.mako test/base/twilltestcase.py tools/data_source/upload.py tools/data_source/upload.xml diffs (1505 lines): diff -r 5fa8803716fd -r f6e0863862ef lib/galaxy/jobs/__init__.py --- a/lib/galaxy/jobs/__init__.py Wed Aug 19 18:07:55 2009 -0400 +++ b/lib/galaxy/jobs/__init__.py Thu Aug 20 10:49:54 2009 -0400 @@ -6,6 +6,8 @@ from galaxy.datatypes.tabular import * from galaxy.datatypes.interval import * from galaxy.datatypes import metadata +from galaxy.util.json import from_json_string +from galaxy.util.expressions import ExpressionContext import pkg_resources pkg_resources.require( "PasteDeploy" ) @@ -18,6 +20,12 @@ # States for running a job. These are NOT the same as data states JOB_WAIT, JOB_ERROR, JOB_INPUT_ERROR, JOB_INPUT_DELETED, JOB_OK, JOB_READY, JOB_DELETED, JOB_ADMIN_DELETED = 'wait', 'error', 'input_error', 'input_deleted', 'ok', 'ready', 'deleted', 'admin_deleted' + +# This file, if created in the job's working directory, will be used for +# setting advanced metadata properties on the job and its associated outputs. +# This interface is currently experimental, is only used by the upload tool, +# and should eventually become API'd +TOOL_PROVIDED_JOB_METADATA_FILE = 'galaxy.json' class JobManager( object ): """ @@ -320,6 +328,7 @@ self.working_directory = \ os.path.join( self.app.config.job_working_directory, str( self.job_id ) ) self.output_paths = None + self.tool_provided_job_metadata = None self.external_output_metadata = metadata.JobExternalOutputMetadataWrapper( job ) #wrapper holding the info required to restore and clean up from files used for setting metadata externally def get_param_dict( self ): @@ -422,6 +431,8 @@ dataset.blurb = 'tool error' dataset.info = message dataset.set_size() + if dataset.ext == 'auto': + dataset.extension = 'data' dataset.flush() job.state = model.Job.states.ERROR job.command_line = self.command_line @@ -486,16 +497,28 @@ except ( IOError, OSError ): self.fail( "Job %s's output dataset(s) could not be read" % job.id ) return + job_context = ExpressionContext( dict( stdout = stdout, stderr = stderr ) ) for dataset_assoc in job.output_datasets: + context = self.get_dataset_finish_context( job_context, dataset_assoc.dataset.dataset ) #should this also be checking library associations? - can a library item be added from a history before the job has ended? - lets not allow this to occur for dataset in dataset_assoc.dataset.dataset.history_associations: #need to update all associated output hdas, i.e. history was shared with job running + if context.get( 'path', None ): + # The tool can set an alternate output path for the dataset. + try: + shutil.move( context['path'], dataset.file_name ) + except ( IOError, OSError ): + if not context['stderr']: + context['stderr'] = 'This dataset could not be processed' dataset.blurb = 'done' dataset.peek = 'no peek' - dataset.info = stdout + stderr + dataset.info = context['stdout'] + context['stderr'] dataset.set_size() - if stderr: + if context['stderr']: dataset.blurb = "error" elif dataset.has_data(): + # If the tool was expected to set the extension, attempt to retrieve it + if dataset.ext == 'auto': + dataset.extension = context.get( 'ext', 'data' ) #if a dataset was copied, it won't appear in our dictionary: #either use the metadata from originating output dataset, or call set_meta on the copies #it would be quicker to just copy the metadata from the originating output dataset, @@ -510,18 +533,39 @@ #the metadata that was stored to disk for use via the external process, #and the changes made by the user will be lost, without warning or notice dataset.metadata.from_JSON_dict( self.external_output_metadata.get_output_filenames_by_dataset( dataset ).filename_out ) - if self.tool.is_multi_byte: - dataset.set_multi_byte_peek() - else: - dataset.set_peek() + try: + assert context.get( 'line_count', None ) is not None + if self.tool.is_multi_byte: + dataset.set_multi_byte_peek( line_count=context['line_count'] ) + else: + dataset.set_peek( line_count=context['line_count'] ) + except: + if self.tool.is_multi_byte: + dataset.set_multi_byte_peek() + else: + dataset.set_peek() + try: + # set the name if provided by the tool + dataset.name = context['name'] + except: + pass else: dataset.blurb = "empty" + if dataset.ext == 'auto': + dataset.extension = 'txt' dataset.flush() - if stderr: + if context['stderr']: dataset_assoc.dataset.dataset.state = model.Dataset.states.ERROR else: dataset_assoc.dataset.dataset.state = model.Dataset.states.OK - dataset_assoc.dataset.dataset.flush() + # If any of the rest of the finish method below raises an + # exception, the fail method will run and set the datasets to + # ERROR. The user will never see that the datasets are in error if + # they were flushed as OK here, since upon doing so, the history + # panel stops checking for updates. So allow the + # mapping.context.current.flush() at the bottom of this method set + # the state instead. + #dataset_assoc.dataset.dataset.flush() # Save stdout and stderr if len( stdout ) > 32768: @@ -591,7 +635,8 @@ return self.output_paths class DatasetPath( object ): - def __init__( self, real_path, false_path = None ): + def __init__( self, dataset_id, real_path, false_path = None ): + self.dataset_id = dataset_id self.real_path = real_path self.false_path = false_path def __str__( self ): @@ -605,10 +650,55 @@ self.output_paths = [] for name, data in [ ( da.name, da.dataset.dataset ) for da in job.output_datasets ]: false_path = os.path.abspath( os.path.join( self.working_directory, "galaxy_dataset_%d.dat" % data.id ) ) - self.output_paths.append( DatasetPath( data.file_name, false_path ) ) + self.output_paths.append( DatasetPath( data.id, data.file_name, false_path ) ) else: - self.output_paths = [ DatasetPath( da.dataset.file_name ) for da in job.output_datasets ] + self.output_paths = [ DatasetPath( da.dataset.dataset.id, da.dataset.file_name ) for da in job.output_datasets ] return self.output_paths + + def get_output_file_id( self, file ): + if self.output_paths is None: + self.get_output_fnames() + for dp in self.output_paths: + if self.app.config.outputs_to_working_directory and os.path.basename( dp.false_path ) == file: + return dp.dataset_id + elif os.path.basename( dp.real_path ) == file: + return dp.dataset_id + return None + + def get_tool_provided_job_metadata( self ): + if self.tool_provided_job_metadata is not None: + return self.tool_provided_job_metadata + + # Look for JSONified job metadata + self.tool_provided_job_metadata = [] + meta_file = os.path.join( self.working_directory, TOOL_PROVIDED_JOB_METADATA_FILE ) + if os.path.exists( meta_file ): + for line in open( meta_file, 'r' ): + try: + line = from_json_string( line ) + assert 'type' in line + except: + log.exception( '(%s) Got JSON data from tool, but data is improperly formatted or no "type" key in data' % self.job_id ) + log.debug( 'Offending data was: %s' % line ) + continue + # Set the dataset id if it's a dataset entry and isn't set. + # This isn't insecure. We loop the job's output datasets in + # the finish method, so if a tool writes out metadata for a + # dataset id that it doesn't own, it'll just be ignored. + if line['type'] == 'dataset' and 'dataset_id' not in line: + try: + line['dataset_id'] = self.get_output_file_id( line['dataset'] ) + except KeyError: + log.warning( '(%s) Tool provided job dataset-specific metadata without specifying a dataset' % self.job_id ) + continue + self.tool_provided_job_metadata.append( line ) + return self.tool_provided_job_metadata + + def get_dataset_finish_context( self, job_context, dataset ): + for meta in self.get_tool_provided_job_metadata(): + if meta['type'] == 'dataset' and meta['dataset_id'] == dataset.id: + return ExpressionContext( meta, job_context ) + return job_context def check_output_sizes( self ): sizes = [] diff -r 5fa8803716fd -r f6e0863862ef lib/galaxy/tools/__init__.py --- a/lib/galaxy/tools/__init__.py Wed Aug 19 18:07:55 2009 -0400 +++ b/lib/galaxy/tools/__init__.py Thu Aug 20 10:49:54 2009 -0400 @@ -5,7 +5,7 @@ pkg_resources.require( "simplejson" ) -import logging, os, string, sys, tempfile, glob, shutil +import logging, os, string, sys, tempfile, glob, shutil, types import simplejson import binascii from UserDict import DictMixin @@ -415,6 +415,7 @@ output.metadata_source = data_elem.get("metadata_source", "") output.parent = data_elem.get("parent", None) output.label = util.xml_text( data_elem, "label" ) + output.count = int( data_elem.get("count", 1) ) output.filters = data_elem.findall( 'filter' ) self.outputs[ output.name ] = output # Any extra generated config files for the tool @@ -816,7 +817,11 @@ # If we've completed the last page we can execute the tool elif state.page == self.last_page: out_data = self.execute( trans, incoming=params ) - return 'tool_executed.mako', dict( out_data=out_data ) + try: + assert type( out_data ) is types.DictType + return 'tool_executed.mako', dict( out_data=out_data ) + except: + return 'message.mako', dict( message_type='error', message=out_data, refresh_frames=[] ) # Otherwise move on to the next page else: state.page += 1 @@ -824,15 +829,26 @@ self.fill_in_new_state( trans, self.inputs_by_page[ state.page ], state.inputs ) return 'tool_form.mako', dict( errors=errors, tool_state=state ) else: - if filter( lambda x: isinstance( x, FieldStorage ) and x.file, state.inputs.values() ): + try: + self.find_fieldstorage( state.inputs ) + except InterruptedUpload: # If inputs contain a file it won't persist. Most likely this # is an interrupted upload. We should probably find a more # standard method of determining an incomplete POST. return self.handle_interrupted( trans, state.inputs ) - else: - # Just a refresh, render the form with updated state and errors. - return 'tool_form.mako', dict( errors=errors, tool_state=state ) + except: + pass + # Just a refresh, render the form with updated state and errors. + return 'tool_form.mako', dict( errors=errors, tool_state=state ) + def find_fieldstorage( self, x ): + if isinstance( x, FieldStorage ): + raise InterruptedUpload( None ) + elif type( x ) is types.DictType: + [ self.find_fieldstorage( y ) for y in x.values() ] + elif type( x ) is types.ListType: + [ self.find_fieldstorage( y ) for y in x ] + def handle_interrupted( self, trans, inputs ): """ Upon handling inputs, if it appears that we have received an incomplete @@ -1704,3 +1720,6 @@ return value else: return incoming.get( key, default ) + +class InterruptedUpload( Exception ): + pass diff -r 5fa8803716fd -r f6e0863862ef lib/galaxy/tools/actions/upload.py --- a/lib/galaxy/tools/actions/upload.py Wed Aug 19 18:07:55 2009 -0400 +++ b/lib/galaxy/tools/actions/upload.py Thu Aug 20 10:49:54 2009 -0400 @@ -1,8 +1,10 @@ import os, shutil, urllib, StringIO, re, gzip, tempfile, shutil, zipfile +from cgi import FieldStorage from __init__ import ToolAction from galaxy import datatypes, jobs from galaxy.datatypes import sniff from galaxy import model, util +from galaxy.util.json import to_json_string import sys, traceback @@ -11,14 +13,28 @@ class UploadToolAction( ToolAction ): # Action for uploading files - def __init__( self ): - self.empty = False - self.line_count = None - def remove_tempfile( self, filename ): - try: - os.unlink( filename ) - except: - log.exception( 'failure removing temporary file: %s' % filename ) + def persist_uploads( self, incoming ): + if 'files' in incoming: + new_files = [] + temp_files = [] + for upload_dataset in incoming['files']: + f = upload_dataset['file_data'] + if isinstance( f, FieldStorage ): + # very small files can be StringIOs + if 'name' in dir( f.file ) and f.file.name != '<fdopen>': + local_filename = util.mkstemp_ln( f.file.name, 'upload_file_data_' ) + f.file.close() + else: + local_filename = datatypes.sniff.stream_to_file( f.file, prefix="strio_upload_file_" )[0] + upload_dataset['file_data'] = dict( filename = f.filename, + local_filename = local_filename ) + if upload_dataset['url_paste'].strip() != '': + upload_dataset['url_paste'] = datatypes.sniff.stream_to_file( StringIO.StringIO( upload_dataset['url_paste'] ), prefix="strio_url_paste_" )[0] + else: + upload_dataset['url_paste'] = None + new_files.append( upload_dataset ) + incoming['files'] = new_files + return incoming def execute( self, tool, trans, incoming={}, set_output_hid = True ): dataset_upload_inputs = [] for input_name, input in tool.inputs.iteritems(): @@ -42,330 +58,100 @@ log.error( 'Got a precreated dataset (%s) but it does not belong to current user (%s)' % ( data.id, trans.user.id ) ) else: self.precreated_datasets.append( data ) + data_list = [] + + incoming = self.persist_uploads( incoming ) + + json_file = tempfile.mkstemp() + json_file_path = json_file[1] + json_file = os.fdopen( json_file[0], 'w' ) for dataset_upload_input in dataset_upload_inputs: uploaded_datasets = dataset_upload_input.get_uploaded_datasets( trans, incoming ) for uploaded_dataset in uploaded_datasets: - precreated_dataset = self.get_precreated_dataset( uploaded_dataset.precreated_name ) - dataset = self.add_file( trans, uploaded_dataset.primary_file, uploaded_dataset.name, uploaded_dataset.file_type, uploaded_dataset.is_multi_byte, uploaded_dataset.dbkey, space_to_tab = uploaded_dataset.space_to_tab, info = uploaded_dataset.info, precreated_dataset = precreated_dataset, metadata = uploaded_dataset.metadata, uploaded_dataset = uploaded_dataset ) - #dataset state is now set, we should not do anything else to this dataset - data_list.append( dataset ) - #clean up extra temp names - uploaded_dataset.clean_up_temp_files() - + data = self.get_precreated_dataset( uploaded_dataset.name ) + if not data: + data = trans.app.model.HistoryDatasetAssociation( history = trans.history, create_dataset = True ) + data.name = uploaded_dataset.name + data.state = data.states.QUEUED + data.extension = uploaded_dataset.file_type + data.dbkey = uploaded_dataset.dbkey + data.flush() + trans.history.add_dataset( data, genome_build = uploaded_dataset.dbkey ) + permissions = trans.app.security_agent.history_get_default_permissions( trans.history ) + trans.app.security_agent.set_all_dataset_permissions( data.dataset, permissions ) + else: + data.extension = uploaded_dataset.file_type + data.dbkey = uploaded_dataset.dbkey + data.flush() + trans.history.genome_build = uploaded_dataset.dbkey + if uploaded_dataset.type == 'composite': + # we need to init metadata before the job is dispatched + data.init_meta() + for meta_name, meta_value in uploaded_dataset.metadata.iteritems(): + setattr( data.metadata, meta_name, meta_value ) + data.flush() + json = dict( file_type = uploaded_dataset.file_type, + dataset_id = data.dataset.id, + dbkey = uploaded_dataset.dbkey, + type = uploaded_dataset.type, + metadata = uploaded_dataset.metadata, + primary_file = uploaded_dataset.primary_file, + extra_files_path = data.extra_files_path, + composite_file_paths = uploaded_dataset.composite_files, + composite_files = dict( [ ( k, v.__dict__ ) for k, v in data.datatype.get_composite_files( data ).items() ] ) ) + else: + try: + is_binary = uploaded_dataset.datatype.is_binary + except: + is_binary = None + json = dict( file_type = uploaded_dataset.file_type, + name = uploaded_dataset.name, + dataset_id = data.dataset.id, + dbkey = uploaded_dataset.dbkey, + type = uploaded_dataset.type, + is_binary = is_binary, + space_to_tab = uploaded_dataset.space_to_tab, + path = uploaded_dataset.path ) + json_file.write( to_json_string( json ) + '\n' ) + data_list.append( data ) + json_file.close() + #cleanup unclaimed precreated datasets: for data in self.precreated_datasets: log.info( 'Cleaned up unclaimed precreated dataset (%s).' % ( data.id ) ) data.state = data.states.ERROR data.info = 'No file contents were available.' - if data_list: - trans.app.model.flush() + if not data_list: + try: + os.remove( json_file_path ) + except: + pass + return 'No data was entered in the upload form, please go back and choose data to upload.' # Create the job object job = trans.app.model.Job() job.session_id = trans.get_galaxy_session().id job.history_id = trans.history.id job.tool_id = tool.id - try: - # For backward compatibility, some tools may not have versions yet. - job.tool_version = tool.version - except: - job.tool_version = "1.0.1" + job.tool_version = tool.version job.state = trans.app.model.Job.states.UPLOAD job.flush() log.info( 'tool %s created job id %d' % ( tool.id, job.id ) ) trans.log_event( 'created job id %d' % job.id, tool_id=tool.id ) + + for name, value in tool.params_to_strings( incoming, trans.app ).iteritems(): + job.add_parameter( name, value ) + job.add_parameter( 'paramfile', to_json_string( json_file_path ) ) + for i, dataset in enumerate( data_list ): + job.add_output_dataset( i, dataset ) + trans.app.model.flush() - #if we could make a 'real' job here, then metadata could be set before job.finish() is called - hda = data_list[0] #only our first hda is being added as output for the job, why? - job.state = trans.app.model.Job.states.OK - file_size_str = datatypes.data.nice_size( hda.dataset.file_size ) - job.info = "%s, size: %s" % ( hda.info, file_size_str ) - job.add_output_dataset( hda.name, hda ) - job.flush() - log.info( 'job id %d ended ok, file size: %s' % ( job.id, file_size_str ) ) - trans.log_event( 'job id %d ended ok, file size: %s' % ( job.id, file_size_str ), tool_id=tool.id ) - return dict( output=hda ) - - def upload_empty(self, trans, job, err_code, err_msg, precreated_dataset = None): - if precreated_dataset is not None: - data = precreated_dataset - else: - data = trans.app.model.HistoryDatasetAssociation( create_dataset=True ) - trans.app.security_agent.set_all_dataset_permissions( data.dataset, trans.app.security_agent.history_get_default_permissions( trans.history ) ) - data.name = err_code - data.extension = "txt" - data.dbkey = "?" - data.info = err_msg - data.file_size = 0 - data.state = data.states.EMPTY - data.flush() - if precreated_dataset is None: - trans.history.add_dataset( data ) - trans.app.model.flush() - # Indicate job failure by setting state and info - job.state = trans.app.model.Job.states.ERROR - job.info = err_msg - job.add_output_dataset( data.name, data ) - job.flush() - log.info( 'job id %d ended with errors, err_msg: %s' % ( job.id, err_msg ) ) - trans.log_event( 'job id %d ended with errors, err_msg: %s' % ( job.id, err_msg ), tool_id=job.tool_id ) - return dict( output=data ) - - def add_file( self, trans, temp_name, file_name, file_type, is_multi_byte, dbkey, info=None, space_to_tab=False, precreated_dataset=None, metadata = {}, uploaded_dataset = None ): - def dataset_no_data_error( data, message = 'there was an error uploading your file' ): - data.info = "No data: %s." % message - data.state = data.states.ERROR - if data.extension is None: - data.extension = 'data' - return data - data_type = None - - if precreated_dataset is not None: - data = precreated_dataset - else: - data = trans.app.model.HistoryDatasetAssociation( history = trans.history, create_dataset = True ) - trans.app.security_agent.set_all_dataset_permissions( data.dataset, trans.app.security_agent.history_get_default_permissions( trans.history ) ) - - # See if we have an empty file - if not os.path.getsize( temp_name ) > 0: - return dataset_no_data_error( data, message = 'you attempted to upload an empty file' ) - #raise BadFileException( "you attempted to upload an empty file." ) - if is_multi_byte: - ext = sniff.guess_ext( temp_name, is_multi_byte=True ) - else: - if not data_type: #at this point data_type is always None (just initialized above), so this is always True...lots of cleanup needed here - # See if we have a gzipped file, which, if it passes our restrictions, - # we'll decompress on the fly. - is_gzipped, is_valid = self.check_gzip( temp_name ) - if is_gzipped and not is_valid: - return dataset_no_data_error( data, message = 'you attempted to upload an inappropriate file' ) - #raise BadFileException( "you attempted to upload an inappropriate file." ) - elif is_gzipped and is_valid: - # We need to uncompress the temp_name file - CHUNK_SIZE = 2**20 # 1Mb - fd, uncompressed = tempfile.mkstemp() - gzipped_file = gzip.GzipFile( temp_name ) - while 1: - try: - chunk = gzipped_file.read( CHUNK_SIZE ) - except IOError: - os.close( fd ) - os.remove( uncompressed ) - return dataset_no_data_error( data, message = 'problem decompressing gzipped data' ) - #raise BadFileException( 'problem decompressing gzipped data.' ) - if not chunk: - break - os.write( fd, chunk ) - os.close( fd ) - gzipped_file.close() - # Replace the gzipped file with the decompressed file - shutil.move( uncompressed, temp_name ) - file_name = file_name.rstrip( '.gz' ) - data_type = 'gzip' - ext = '' - if not data_type: - # See if we have a zip archive - is_zipped, is_valid, test_ext = self.check_zip( temp_name ) - if is_zipped and not is_valid: - return dataset_no_data_error( data, message = 'you attempted to upload an inappropriate file' ) - #raise BadFileException( "you attempted to upload an inappropriate file." ) - elif is_zipped and is_valid: - # Currently, we force specific tools to handle this case. We also require the user - # to manually set the incoming file_type - if ( test_ext == 'ab1' or test_ext == 'scf' ) and file_type != 'binseq.zip': - return dataset_no_data_error( data, message = "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'" ) - #raise BadFileException( "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'." ) - elif test_ext == 'txt' and file_type != 'txtseq.zip': - return dataset_no_data_error( data, message = "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'" ) - #raise BadFileException( "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'." ) - if not ( file_type == 'binseq.zip' or file_type == 'txtseq.zip' ): - return dataset_no_data_error( data, message = "you must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files" ) - #raise BadFileException( "you must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files." ) - data_type = 'zip' - ext = file_type - if not data_type: - if self.check_binary( temp_name ): - if uploaded_dataset and uploaded_dataset.datatype and uploaded_dataset.datatype.is_binary: - #we need a more generalized way of checking if a binary upload is of the right format for a datatype...magic number, etc - data_type = 'binary' - ext = uploaded_dataset.file_type - else: - parts = file_name.split( "." ) - if len( parts ) > 1: - ext = parts[1].strip().lower() - if not( ext == 'ab1' or ext == 'scf' ): - return dataset_no_data_error( data, message = "you attempted to upload an inappropriate file" ) - #raise BadFileException( "you attempted to upload an inappropriate file." ) - if ext == 'ab1' and file_type != 'ab1': - return dataset_no_data_error( data, message = "you must manually set the 'File Format' to 'Ab1' when uploading ab1 files" ) - #raise BadFileException( "you must manually set the 'File Format' to 'Ab1' when uploading ab1 files." ) - elif ext == 'scf' and file_type != 'scf': - return dataset_no_data_error( data, message = "you must manually set the 'File Format' to 'Scf' when uploading scf files" ) - #raise BadFileException( "you must manually set the 'File Format' to 'Scf' when uploading scf files." ) - data_type = 'binary' - if not data_type: - # We must have a text file - if trans.app.datatypes_registry.get_datatype_by_extension( file_type ).composite_type != 'auto_primary_file' and self.check_html( temp_name ): - return dataset_no_data_error( data, message = "you attempted to upload an inappropriate file" ) - #raise BadFileException( "you attempted to upload an inappropriate file." ) - #if data_type != 'binary' and data_type != 'zip' and not trans.app.datatypes_registry.get_datatype_by_extension( ext ).is_binary: - if data_type != 'binary' and data_type != 'zip': - if space_to_tab: - self.line_count = sniff.convert_newlines_sep2tabs( temp_name ) - else: - self.line_count = sniff.convert_newlines( temp_name ) - if file_type == 'auto': - ext = sniff.guess_ext( temp_name, sniff_order=trans.app.datatypes_registry.sniff_order ) - else: - ext = file_type - data_type = ext - if info is None: - info = 'uploaded %s file' %data_type - data.extension = ext - data.name = file_name - data.dbkey = dbkey - data.info = info - data.flush() - shutil.move( temp_name, data.file_name ) - dataset_state = data.states.OK #don't set actual state here, only set to OK when finished setting attributes of the dataset - data.set_size() - data.init_meta() - #need to set metadata, has to be done after extention is set - for meta_name, meta_value in metadata.iteritems(): - setattr( data.metadata, meta_name, meta_value ) - if self.line_count is not None: - try: - if is_multi_byte: - data.set_multi_byte_peek( line_count=self.line_count ) - else: - data.set_peek( line_count=self.line_count ) - except: - if is_multi_byte: - data.set_multi_byte_peek() - else: - data.set_peek() - else: - if is_multi_byte: - data.set_multi_byte_peek() - else: - data.set_peek() - - # validate incomming data - # Commented by greg on 3/14/07 - # for error in data.datatype.validate( data ): - # data.add_validation_error( - # model.ValidationError( message=str( error ), err_type=error.__class__.__name__, attributes=util.object_to_string( error.__dict__ ) ) ) - if data.missing_meta(): - data.datatype.set_meta( data ) - dbkey_to_store = dbkey - if type( dbkey_to_store ) == type( [] ): - dbkey_to_store = dbkey[0] - if precreated_dataset is not None: - trans.history.genome_build = dbkey_to_store - else: - trans.history.add_dataset( data, genome_build=dbkey_to_store ) - #set up composite files - if uploaded_dataset is not None: - composite_files = data.datatype.get_composite_files( data ) - if composite_files: - os.mkdir( data.extra_files_path ) #make extra files path - for name, value in composite_files.iteritems(): - if uploaded_dataset.composite_files[ value.name ] is None and not value.optional: - data.info = "A required composite data file was not provided (%s)" % name - dataset_state = data.states.ERROR - break - elif uploaded_dataset.composite_files[ value.name] is not None: - if not value.is_binary: - if uploaded_dataset.composite_files[ value.name ].space_to_tab: - sniff.convert_newlines_sep2tabs( uploaded_dataset.composite_files[ value.name ].filename ) - else: - sniff.convert_newlines( uploaded_dataset.composite_files[ value.name ].filename ) - shutil.move( uploaded_dataset.composite_files[ value.name ].filename, os.path.join( data.extra_files_path, name ) ) - if data.datatype.composite_type == 'auto_primary_file': - #now that metadata was set above, we should create the primary file as required - open( data.file_name, 'wb+' ).write( data.datatype.generate_primary_file( dataset = data ) ) - data.state = dataset_state #Always set dataset state LAST - trans.app.model.flush() - trans.log_event( "Added dataset %d to history %d" %( data.id, trans.history.id ), tool_id="upload" ) - return data - - def check_gzip( self, temp_name ): - temp = open( temp_name, "U" ) - magic_check = temp.read( 2 ) - temp.close() - if magic_check != util.gzip_magic: - return ( False, False ) - CHUNK_SIZE = 2**15 # 32Kb - gzipped_file = gzip.GzipFile( temp_name ) - chunk = gzipped_file.read( CHUNK_SIZE ) - gzipped_file.close() - if self.check_html( temp_name, chunk=chunk ) or self.check_binary( temp_name, chunk=chunk ): - return( True, False ) - return ( True, True ) - - def check_zip( self, temp_name ): - if not zipfile.is_zipfile( temp_name ): - return ( False, False, None ) - zip_file = zipfile.ZipFile( temp_name, "r" ) - # Make sure the archive consists of valid files. The current rules are: - # 1. Archives can only include .ab1, .scf or .txt files - # 2. All file extensions within an archive must be the same - name = zip_file.namelist()[0] - test_ext = name.split( "." )[1].strip().lower() - if not ( test_ext == 'scf' or test_ext == 'ab1' or test_ext == 'txt' ): - return ( True, False, test_ext ) - for name in zip_file.namelist(): - ext = name.split( "." )[1].strip().lower() - if ext != test_ext: - return ( True, False, test_ext ) - return ( True, True, test_ext ) - - def check_html( self, temp_name, chunk=None ): - if chunk is None: - temp = open(temp_name, "U") - else: - temp = chunk - regexp1 = re.compile( "<A\s+[^>]*HREF[^>]+>", re.I ) - regexp2 = re.compile( "<IFRAME[^>]*>", re.I ) - regexp3 = re.compile( "<FRAMESET[^>]*>", re.I ) - regexp4 = re.compile( "<META[^>]*>", re.I ) - lineno = 0 - for line in temp: - lineno += 1 - matches = regexp1.search( line ) or regexp2.search( line ) or regexp3.search( line ) or regexp4.search( line ) - if matches: - if chunk is None: - temp.close() - return True - if lineno > 100: - break - if chunk is None: - temp.close() - return False - def check_binary( self, temp_name, chunk=None ): - if chunk is None: - temp = open( temp_name, "U" ) - else: - temp = chunk - lineno = 0 - for line in temp: - lineno += 1 - line = line.strip() - if line: - if util.is_multi_byte( line ): - return False - for char in line: - if ord( char ) > 128: - if chunk is None: - temp.close() - return True - if lineno > 10: - break - if chunk is None: - temp.close() - return False + # Queue the job for execution + trans.app.job_queue.put( job.id, tool ) + trans.log_event( "Added job to the job queue, id: %s" % str(job.id), tool_id=job.tool_id ) + return dict( [ ( i, v ) for i, v in enumerate( data_list ) ] ) def get_precreated_dataset( self, name ): """ @@ -378,7 +164,3 @@ return self.precreated_datasets.pop( names.index( name ) ) else: return None - -class BadFileException( Exception ): - pass - diff -r 5fa8803716fd -r f6e0863862ef lib/galaxy/tools/parameters/basic.py --- a/lib/galaxy/tools/parameters/basic.py Wed Aug 19 18:07:55 2009 -0400 +++ b/lib/galaxy/tools/parameters/basic.py Thu Aug 20 10:49:54 2009 -0400 @@ -304,21 +304,22 @@ def get_html_field( self, trans=None, value=None, other_values={} ): return form_builder.FileField( self.name, ajax = self.ajax, value = value ) def from_html( self, value, trans=None, other_values={} ): + # TODO: Fix nginx upload module support # Middleware or proxies may encode files in special ways (TODO: this # should be pluggable) - if type( value ) == dict: - upload_location = self.tool.app.config.nginx_upload_location - assert upload_location, \ - "Request appears to have been processed by nginx_upload_module \ - but Galaxy is not configured to recgonize it" - # Check that the file is in the right location - local_filename = os.path.abspath( value['path'] ) - assert local_filename.startswith( upload_location ), \ - "Filename provided by nginx is not in correct directory" - value = Bunch( - filename = value["name"], - local_filename = local_filename - ) + #if type( value ) == dict: + # upload_location = self.tool.app.config.nginx_upload_location + # assert upload_location, \ + # "Request appears to have been processed by nginx_upload_module \ + # but Galaxy is not configured to recgonize it" + # # Check that the file is in the right location + # local_filename = os.path.abspath( value['path'] ) + # assert local_filename.startswith( upload_location ), \ + # "Filename provided by nginx is not in correct directory" + # value = Bunch( + # filename = value["name"], + # local_filename = local_filename + # ) return value def get_required_enctype( self ): """ @@ -330,10 +331,18 @@ return None elif isinstance( value, unicode ) or isinstance( value, str ): return value + elif isinstance( value, dict ): + # or should we jsonify? + try: + return value['local_filename'] + except: + return None raise Exception( "FileToolParameter cannot be persisted" ) def to_python( self, value, app ): if value is None: return None + elif isinstance( value, unicode ) or isinstance( value, str ): + return value else: raise Exception( "FileToolParameter cannot be persisted" ) def get_initial_value( self, trans, context ): diff -r 5fa8803716fd -r f6e0863862ef lib/galaxy/tools/parameters/grouping.py --- a/lib/galaxy/tools/parameters/grouping.py Wed Aug 19 18:07:55 2009 -0400 +++ b/lib/galaxy/tools/parameters/grouping.py Thu Aug 20 10:49:54 2009 -0400 @@ -12,6 +12,7 @@ from galaxy.datatypes import sniff from galaxy.util.bunch import Bunch from galaxy.util.odict import odict +from galaxy.util import json class Group( object ): def __init__( self ): @@ -167,33 +168,30 @@ rval.append( rval_dict ) return rval def get_uploaded_datasets( self, trans, context, override_name = None, override_info = None ): - def get_data_file_filename( data_file, is_multi_byte = False, override_name = None, override_info = None ): + def get_data_file_filename( data_file, override_name = None, override_info = None ): dataset_name = override_name dataset_info = override_info def get_file_name( file_name ): file_name = file_name.split( '\\' )[-1] file_name = file_name.split( '/' )[-1] return file_name - if 'local_filename' in dir( data_file ): + try: # Use the existing file - return data_file.local_filename, get_file_name( data_file.filename ), is_multi_byte - elif 'filename' in dir( data_file ): - #create a new tempfile - try: - temp_name, is_multi_byte = sniff.stream_to_file( data_file.file, prefix='upload' ) - precreated_name = get_file_name( data_file.filename ) - if not dataset_name: - dataset_name = precreated_name - if not dataset_info: - dataset_info = 'uploaded file' - return temp_name, get_file_name( data_file.filename ), is_multi_byte, dataset_name, dataset_info - except Exception, e: - log.exception( 'exception in sniff.stream_to_file using file %s: %s' % ( data_file.filename, str( e ) ) ) - self.remove_temp_file( temp_name ) - return None, None, is_multi_byte, None, None - def filenames_from_url_paste( url_paste, group_incoming, override_name = None, override_info = None ): + if not dataset_name and 'filename' in data_file: + dataset_name = get_file_name( data_file['filename'] ) + if not dataset_info: + dataset_info = 'uploaded file' + return Bunch( type='file', path=data_file['local_filename'], name=get_file_name( data_file['filename'] ) ) + #return 'file', data_file['local_filename'], get_file_name( data_file.filename ), dataset_name, dataset_info + except: + # The uploaded file should've been persisted by the upload tool action + return Bunch( type=None, path=None, name=None ) + #return None, None, None, None, None + def get_url_paste_urls_or_filename( group_incoming, override_name = None, override_info = None ): filenames = [] - if url_paste not in [ None, "" ]: + url_paste_file = group_incoming.get( 'url_paste', None ) + if url_paste_file is not None: + url_paste = open( url_paste_file, 'r' ).read( 1024 ) if url_paste.lstrip().lower().startswith( 'http://' ) or url_paste.lstrip().lower().startswith( 'ftp://' ): url_paste = url_paste.replace( '\r', '' ).split( '\n' ) for line in url_paste: @@ -208,114 +206,54 @@ dataset_info = override_info if not dataset_info: dataset_info = 'uploaded url' - try: - temp_name, is_multi_byte = sniff.stream_to_file( urllib.urlopen( line ), prefix='url_paste' ) - except Exception, e: - temp_name = None - precreated_name = str( e ) - log.exception( 'exception in sniff.stream_to_file using url_paste %s: %s' % ( url_paste, str( e ) ) ) - try: - self.remove_temp_file( temp_name ) - except: - pass - yield ( temp_name, precreated_name, is_multi_byte, dataset_name, dataset_info ) - #yield ( None, str( e ), False, dataset_name, dataset_info ) + yield Bunch( type='url', path=line, name=precreated_name ) + #yield ( 'url', line, precreated_name, dataset_name, dataset_info ) else: dataset_name = dataset_info = precreated_name = 'Pasted Entry' #we need to differentiate between various url pastes here if override_name: dataset_name = override_name if override_info: dataset_info = override_info - is_valid = False - for line in url_paste: #Trim off empty lines from begining - line = line.rstrip( '\r\n' ) - if line: - is_valid = True - break - if is_valid: - try: - temp_name, is_multi_byte = sniff.stream_to_file( StringIO.StringIO( url_paste ), prefix='strio_url_paste' ) - except Exception, e: - log.exception( 'exception in sniff.stream_to_file using StringIO.StringIO( url_paste ) %s: %s' % ( url_paste, str( e ) ) ) - temp_name = None - precreated_name = str( e ) - try: - self.remove_temp_file( temp_name ) - except: - pass - yield ( temp_name, precreated_name, is_multi_byte, dataset_name, dataset_info ) - #yield ( None, str( e ), False, dataset_name, dataset_info ) - + yield Bunch( type='file', path=url_paste_file, name=precreated_name ) + #yield ( 'file', url_paste_file, precreated_name, dataset_name, dataset_info ) def get_one_filename( context ): data_file = context['file_data'] url_paste = context['url_paste'] name = context.get( 'NAME', None ) info = context.get( 'INFO', None ) warnings = [] - is_multi_byte = False space_to_tab = False if context.get( 'space_to_tab', None ) not in ["None", None]: space_to_tab = True - temp_name, precreated_name, is_multi_byte, dataset_name, dataset_info = get_data_file_filename( data_file, is_multi_byte = is_multi_byte, override_name = name, override_info = info ) - if temp_name: + file_bunch = get_data_file_filename( data_file, override_name = name, override_info = info ) + if file_bunch.path: if url_paste.strip(): warnings.append( "All file contents specified in the paste box were ignored." ) else: #we need to use url_paste - #file_names = filenames_from_url_paste( url_paste, context, override_name = name, override_info = info ) - for temp_name, precreated_name, is_multi_byte, dataset_name, dataset_info in filenames_from_url_paste( url_paste, context, override_name = name, override_info = info ):#file_names: - if temp_name: + for file_bunch in get_url_paste_urls_or_filename( context, override_name = name, override_info = info ): + if file_bunch.path: break - ###this check will cause an additional file to be retrieved and created...so lets not do that - #try: #check to see if additional paste contents were available - # file_names.next() - # warnings.append( "Additional file contents were specified in the paste box, but ignored." ) - #except StopIteration: - # pass - return temp_name, precreated_name, is_multi_byte, space_to_tab, dataset_name, dataset_info, warnings - + return file_bunch, warnings def get_filenames( context ): rval = [] data_file = context['file_data'] url_paste = context['url_paste'] name = context.get( 'NAME', None ) info = context.get( 'INFO', None ) - warnings = [] - is_multi_byte = False space_to_tab = False if context.get( 'space_to_tab', None ) not in ["None", None]: space_to_tab = True - temp_name, precreated_name, is_multi_byte, dataset_name, dataset_info = get_data_file_filename( data_file, is_multi_byte = is_multi_byte, override_name = name, override_info = info ) - if temp_name: - rval.append( ( temp_name, precreated_name, is_multi_byte, space_to_tab, dataset_name, dataset_info ) ) - for temp_name, precreated_name, is_multi_byte, dataset_name, dataset_info in filenames_from_url_paste( url_paste, context, override_name = name, override_info = info ): - if temp_name: - rval.append( ( temp_name, precreated_name, is_multi_byte, space_to_tab, dataset_name, dataset_info ) ) + warnings = [] + file_bunch = get_data_file_filename( data_file, override_name = name, override_info = info ) + if file_bunch.path: + file_bunch.space_to_tab = space_to_tab + rval.append( file_bunch ) + #rval.append( ( type, temp_name, precreated_name, space_to_tab, dataset_name, dataset_info ) ) + for file_bunch in get_url_paste_urls_or_filename( context, override_name = name, override_info = info ): + if file_bunch.path: + file_bunch.space_to_tab = space_to_tab + rval.append( file_bunch ) return rval - class UploadedDataset( Bunch ): - def __init__( self, **kwd ): - Bunch.__init__( self, **kwd ) - self.primary_file = None - self.composite_files = odict() - self.dbkey = None - self.warnings = [] - self.metadata = {} - - self._temp_filenames = [] #store all created filenames here, delete on cleanup - def register_temp_file( self, filename ): - if isinstance( filename, list ): - self._temp_filenames.extend( filename ) - else: - self._temp_filenames.append( filename ) - def remove_temp_file( self, filename ): - try: - os.unlink( filename ) - except Exception, e: - pass - #log.warning( str( e ) ) - def clean_up_temp_files( self ): - for filename in self._temp_filenames: - self.remove_temp_file( filename ) - file_type = self.get_file_type( context ) d_type = self.get_datatype( trans, context ) dbkey = context.get( 'dbkey', None ) @@ -325,51 +263,50 @@ for group_incoming in context.get( self.name, [] ): i = int( group_incoming['__index__'] ) groups_incoming[ i ] = group_incoming - if d_type.composite_type is not None: #handle uploading of composite datatypes #Only one Dataset can be created + ''' dataset = UploadedDataset() + dataset.datatype = d_type + ''' + dataset = Bunch() + dataset.type = 'composite' dataset.file_type = file_type + dataset.dbkey = dbkey dataset.datatype = d_type - dataset.dbkey = dbkey + dataset.warnings = [] + dataset.metadata = {} + dataset.composite_files = {} #load metadata files_metadata = context.get( self.metadata_ref, {} ) - for meta_name, meta_spec in d_type.metadata_spec.iteritems(): + for meta_name, meta_spec in d_type.metadata_spec.iteritems(): if meta_spec.set_in_upload: if meta_name in files_metadata: dataset.metadata[ meta_name ] = files_metadata[ meta_name ] - - temp_name = None - precreated_name = None - is_multi_byte = False - space_to_tab = False - warnings = [] dataset_name = None dataset_info = None if dataset.datatype.composite_type == 'auto_primary_file': #replace sniff here with just creating an empty file temp_name, is_multi_byte = sniff.stream_to_file( StringIO.StringIO( d_type.generate_primary_file() ), prefix='upload_auto_primary_file' ) - precreated_name = dataset_name = 'Uploaded Composite Dataset (%s)' % ( file_type ) + dataset.primary_file = temp_name + dataset.space_to_tab = False + dataset.precreated_name = dataset.name = 'Uploaded Composite Dataset (%s)' % ( file_type ) else: - temp_name, precreated_name, is_multi_byte, space_to_tab, dataset_name, dataset_info, warnings = get_one_filename( groups_incoming[ 0 ] ) + file_bunch, warnings = get_one_filename( groups_incoming[ 0 ] ) if dataset.datatype.composite_type: precreated_name = 'Uploaded Composite Dataset (%s)' % ( file_type ) writable_files_offset = 1 - if temp_name is None:#remove this before finish, this should create an empty dataset + dataset.primary_file = file_bunch.path + dataset.space_to_tab = file_bunch.space_to_tab + dataset.precreated_name = file_bunch.precreated_name + dataset.name = file_bunch.precreated_name + dataset.warnings.extend( file_bunch.warnings ) + if dataset.primary_file is None:#remove this before finish, this should create an empty dataset raise Exception( 'No primary dataset file was available for composite upload' ) - dataset.primary_file = temp_name - dataset.is_multi_byte = is_multi_byte - dataset.space_to_tab = space_to_tab - dataset.precreated_name = precreated_name - dataset.name = dataset_name - dataset.info = dataset_info - dataset.warnings.extend( warnings ) - dataset.register_temp_file( temp_name ) - keys = [ value.name for value in writable_files.values() ] for i, group_incoming in enumerate( groups_incoming[ writable_files_offset : ] ): key = keys[ i + writable_files_offset ] @@ -377,37 +314,22 @@ dataset.warnings.append( "A required composite file (%s) was not specified." % ( key ) ) dataset.composite_files[ key ] = None else: - temp_name, precreated_name, is_multi_byte, space_to_tab, dataset_name, dataset_info, warnings = get_one_filename( group_incoming ) - if temp_name: - dataset.composite_files[ key ] = Bunch( filename = temp_name, precreated_name = precreated_name, is_multi_byte = is_multi_byte, space_to_tab = space_to_tab, warnings = warnings, info = dataset_info, name = dataset_name ) - dataset.register_temp_file( temp_name ) + file_bunch, warnings = get_one_filename( group_incoming ) + if file_bunch.path: + dataset.composite_files[ key ] = file_bunch.__dict__ else: dataset.composite_files[ key ] = None if not writable_files[ writable_files.keys()[ keys.index( key ) ] ].optional: dataset.warnings.append( "A required composite file (%s) was not specified." % ( key ) ) return [ dataset ] else: + datasets = get_filenames( context[ self.name ][0] ) rval = [] - for temp_name, precreated_name, is_multi_byte, space_to_tab, dataset_name, dataset_info, in get_filenames( context[ self.name ][0] ): - dataset = UploadedDataset() + for dataset in datasets: dataset.file_type = file_type - dataset.datatype = d_type dataset.dbkey = dbkey - dataset.primary_file = temp_name - dataset.is_multi_byte = is_multi_byte - dataset.space_to_tab = space_to_tab - dataset.name = dataset_name - dataset.info = dataset_info - dataset.precreated_name = precreated_name - dataset.register_temp_file( temp_name ) rval.append( dataset ) - return rval - def remove_temp_file( self, filename ): - try: - os.unlink( filename ) - except Exception, e: - log.warning( str( e ) ) - + return rval class Conditional( Group ): type = "conditional" diff -r 5fa8803716fd -r f6e0863862ef lib/galaxy/util/__init__.py --- a/lib/galaxy/util/__init__.py Wed Aug 19 18:07:55 2009 -0400 +++ b/lib/galaxy/util/__init__.py Thu Aug 20 10:49:54 2009 -0400 @@ -3,7 +3,7 @@ """ import logging -import threading, random, string, re, binascii, pickle, time, datetime, math, re, os, sys +import threading, random, string, re, binascii, pickle, time, datetime, math, re, os, sys, tempfile # Older py compatibility try: @@ -454,6 +454,26 @@ out_dict[ str( key ) ] = value return out_dict +def mkstemp_ln( src, prefix='mkstemp_ln_' ): + """ + From tempfile._mkstemp_inner, generate a hard link in the same dir with a + random name. Created so we can persist the underlying file of a + NamedTemporaryFile upon its closure. + """ + dir = os.path.dirname(src) + names = tempfile._get_candidate_names() + for seq in xrange(tempfile.TMP_MAX): + name = names.next() + file = os.path.join(dir, prefix + name) + try: + linked_path = os.link( src, file ) + return (os.path.abspath(file)) + except OSError, e: + if e.errno == errno.EEXIST: + continue # try again + raise + raise IOError, (errno.EEXIST, "No usable temporary file name found") + galaxy_root_path = os.path.join(__path__[0], "..","..","..") dbnames = read_dbnames( os.path.join( galaxy_root_path, "tool-data", "shared", "ucsc", "builds.txt" ) ) #this list is used in edit attributes and the upload tool ucsc_build_sites = read_build_sites( os.path.join( galaxy_root_path, "tool-data", "shared", "ucsc", "ucsc_build_sites.txt" ) ) #this list is used in history.tmpl diff -r 5fa8803716fd -r f6e0863862ef lib/galaxy/web/controllers/tool_runner.py --- a/lib/galaxy/web/controllers/tool_runner.py Wed Aug 19 18:07:55 2009 -0400 +++ b/lib/galaxy/web/controllers/tool_runner.py Thu Aug 20 10:49:54 2009 -0400 @@ -136,6 +136,7 @@ """ Precreate datasets for asynchronous uploading. """ + permissions = trans.app.security_agent.history_get_default_permissions( trans.history ) def create_dataset( name, history ): data = trans.app.model.HistoryDatasetAssociation( create_dataset = True ) data.name = name @@ -143,6 +144,7 @@ data.history = history data.flush() history.add_dataset( data ) + trans.app.security_agent.set_all_dataset_permissions( data.dataset, permissions ) return data tool = self.get_toolbox().tools_by_id.get( tool_id, None ) if not tool: diff -r 5fa8803716fd -r f6e0863862ef lib/galaxy/web/framework/base.py --- a/lib/galaxy/web/framework/base.py Wed Aug 19 18:07:55 2009 -0400 +++ b/lib/galaxy/web/framework/base.py Thu Aug 20 10:49:54 2009 -0400 @@ -212,6 +212,17 @@ else: return None +# For request.params, override cgi.FieldStorage.make_file to create persistent +# tempfiles. Necessary for externalizing the upload tool. It's a little hacky +# but for performance reasons it's way better to use Paste's tempfile than to +# create a new one and copy. +import cgi +class FieldStorage( cgi.FieldStorage ): + def make_file(self, binary=None): + import tempfile + return tempfile.NamedTemporaryFile() +cgi.FieldStorage = FieldStorage + class Request( webob.Request ): """ Encapsulates an HTTP request. diff -r 5fa8803716fd -r f6e0863862ef templates/base_panels.mako --- a/templates/base_panels.mako Wed Aug 19 18:07:55 2009 -0400 +++ b/templates/base_panels.mako Thu Aug 20 10:49:54 2009 -0400 @@ -72,9 +72,6 @@ <script type="text/javascript"> jQuery( function() { $("iframe#galaxy_main").load( function() { - ##$(this.contentDocument).find("input[galaxy-ajax-upload]").each( function() { - ##$("iframe")[0].contentDocument.body.innerHTML = "HELLO" - ##$(this.contentWindow.document).find("input[galaxy-ajax-upload]").each( function() { $(this).contents().find("form").each( function() { if ( $(this).find("input[galaxy-ajax-upload]").length > 0 ){ $(this).submit( function() { diff -r 5fa8803716fd -r f6e0863862ef test/base/twilltestcase.py --- a/test/base/twilltestcase.py Wed Aug 19 18:07:55 2009 -0400 +++ b/test/base/twilltestcase.py Thu Aug 20 10:49:54 2009 -0400 @@ -93,6 +93,8 @@ valid_hid = int( hid ) except: raise AssertionError, "Invalid hid (%s) created when uploading file %s" % ( hid, filename ) + # Wait for upload processing to finish (TODO: this should be done in each test case instead) + self.wait() def upload_url_paste( self, url_paste, ftype='auto', dbkey='unspecified (?)' ): """Pasted data in the upload utility""" self.visit_page( "tool_runner/index?tool_id=upload1" ) @@ -112,6 +114,8 @@ valid_hid = int( hid ) except: raise AssertionError, "Invalid hid (%s) created when pasting %s" % ( hid, url_paste ) + # Wait for upload processing to finish (TODO: this should be done in each test case instead) + self.wait() # Functions associated with histories def check_history_for_errors( self ): diff -r 5fa8803716fd -r f6e0863862ef tools/data_source/upload.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/data_source/upload.py Thu Aug 20 10:49:54 2009 -0400 @@ -0,0 +1,280 @@ +#!/usr/bin/env python +#Processes uploads from the user. + +# WARNING: Changes in this tool (particularly as related to parsing) may need +# to be reflected in galaxy.web.controllers.tool_runner and galaxy.tools + +import urllib, sys, os, gzip, tempfile, shutil, re, gzip, zipfile +from galaxy import eggs +# need to import model before sniff to resolve a circular import dependency +import galaxy.model +from galaxy.datatypes import sniff +from galaxy import util +from galaxy.util.json import * + +assert sys.version_info[:2] >= ( 2, 4 ) + +def stop_err( msg, ret=1 ): + sys.stderr.write( msg ) + sys.exit( ret ) + +def file_err( msg, dataset, json_file ): + json_file.write( to_json_string( dict( type = 'dataset', + ext = 'data', + dataset_id = dataset.dataset_id, + stderr = msg ) ) + "\n" ) + try: + os.remove( dataset.path ) + except: + pass + +def safe_dict(d): + """ + Recursively clone json structure with UTF-8 dictionary keys + http://mellowmachines.com/blog/2009/06/exploding-dictionary-with-unicode-key... + """ + if isinstance(d, dict): + return dict([(k.encode('utf-8'), safe_dict(v)) for k,v in d.iteritems()]) + elif isinstance(d, list): + return [safe_dict(x) for x in d] + else: + return d + +def check_html( temp_name, chunk=None ): + if chunk is None: + temp = open(temp_name, "U") + else: + temp = chunk + regexp1 = re.compile( "<A\s+[^>]*HREF[^>]+>", re.I ) + regexp2 = re.compile( "<IFRAME[^>]*>", re.I ) + regexp3 = re.compile( "<FRAMESET[^>]*>", re.I ) + regexp4 = re.compile( "<META[^>]*>", re.I ) + lineno = 0 + for line in temp: + lineno += 1 + matches = regexp1.search( line ) or regexp2.search( line ) or regexp3.search( line ) or regexp4.search( line ) + if matches: + if chunk is None: + temp.close() + return True + if lineno > 100: + break + if chunk is None: + temp.close() + return False + +def check_binary( temp_name, chunk=None ): + if chunk is None: + temp = open( temp_name, "U" ) + else: + temp = chunk + lineno = 0 + for line in temp: + lineno += 1 + line = line.strip() + if line: + for char in line: + if ord( char ) > 128: + if chunk is None: + temp.close() + return True + if lineno > 10: + break + if chunk is None: + temp.close() + return False + +def check_gzip( temp_name ): + temp = open( temp_name, "U" ) + magic_check = temp.read( 2 ) + temp.close() + if magic_check != util.gzip_magic: + return ( False, False ) + CHUNK_SIZE = 2**15 # 32Kb + gzipped_file = gzip.GzipFile( temp_name ) + chunk = gzipped_file.read( CHUNK_SIZE ) + gzipped_file.close() + if check_html( temp_name, chunk=chunk ) or check_binary( temp_name, chunk=chunk ): + return( True, False ) + return ( True, True ) + +def check_zip( temp_name ): + if not zipfile.is_zipfile( temp_name ): + return ( False, False, None ) + zip_file = zipfile.ZipFile( temp_name, "r" ) + # Make sure the archive consists of valid files. The current rules are: + # 1. Archives can only include .ab1, .scf or .txt files + # 2. All file extensions within an archive must be the same + name = zip_file.namelist()[0] + test_ext = name.split( "." )[1].strip().lower() + if not ( test_ext == 'scf' or test_ext == 'ab1' or test_ext == 'txt' ): + return ( True, False, test_ext ) + for name in zip_file.namelist(): + ext = name.split( "." )[1].strip().lower() + if ext != test_ext: + return ( True, False, test_ext ) + return ( True, True, test_ext ) + +def add_file( dataset, json_file ): + data_type = None + line_count = None + + if dataset.type == 'url': + try: + temp_name, is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' ) + except Exception, e: + file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file ) + return + dataset.path = temp_name + dataset.is_multi_byte = is_multi_byte + + # See if we have an empty file + if not os.path.exists( dataset.path ): + file_err( 'Uploaded temporary file (%s) does not exist. Please' % dataset.path, dataset, json_file ) + return + if not os.path.getsize( dataset.path ) > 0: + file_err( 'The uploaded file is empty', dataset, json_file ) + return + if 'is_multi_byte' not in dir( dataset ): + dataset.is_multi_byte = util.is_multi_byte( open( dataset.path, 'r' ).read( 1024 )[:100] ) + if dataset.is_multi_byte: + ext = sniff.guess_ext( dataset.path, is_multi_byte=True ) + data_type = ext + else: + # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress + is_gzipped, is_valid = check_gzip( dataset.path ) + if is_gzipped and not is_valid: + file_err( 'The uploaded file contains inappropriate content', dataset, json_file ) + return + elif is_gzipped and is_valid: + # We need to uncompress the temp_name file + CHUNK_SIZE = 2**20 # 1Mb + fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ) ) + gzipped_file = gzip.GzipFile( dataset.path ) + while 1: + try: + chunk = gzipped_file.read( CHUNK_SIZE ) + except IOError: + os.close( fd ) + os.remove( uncompressed ) + file_err( 'Problem decompressing gzipped data', dataset, json_file ) + return + if not chunk: + break + os.write( fd, chunk ) + os.close( fd ) + gzipped_file.close() + # Replace the gzipped file with the decompressed file + shutil.move( uncompressed, dataset.path ) + dataset.name = dataset.name.rstrip( '.gz' ) + data_type = 'gzip' + if not data_type: + # See if we have a zip archive + is_zipped, is_valid, test_ext = check_zip( dataset.path ) + if is_zipped and not is_valid: + file_err( 'The uploaded file contains inappropriate content', dataset, json_file ) + return + elif is_zipped and is_valid: + # Currently, we force specific tools to handle this case. We also require the user + # to manually set the incoming file_type + if ( test_ext == 'ab1' or test_ext == 'scf' ) and dataset.file_type != 'binseq.zip': + file_err( "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'", dataset, json_file ) + return + elif test_ext == 'txt' and dataset.file_type != 'txtseq.zip': + file_err( "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'", dataset, json_file ) + return + if not ( dataset.file_type == 'binseq.zip' or dataset.file_type == 'txtseq.zip' ): + file_err( "You must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files", dataset, json_file ) + return + data_type = 'zip' + ext = dataset.file_type + if not data_type: + if check_binary( dataset.path ): + if dataset.is_binary is not None: + data_type = 'binary' + ext = dataset.file_type + else: + parts = dataset.name.split( "." ) + if len( parts ) > 1: + ext = parts[1].strip().lower() + if not( ext == 'ab1' or ext == 'scf' ): + file_err( 'The uploaded file contains inappropriate content', dataset, json_file ) + return + if ext == 'ab1' and dataset.file_type != 'ab1': + file_err( "You must manually set the 'File Format' to 'Ab1' when uploading ab1 files.", dataset, json_file ) + return + elif ext == 'scf' and dataset.file_type != 'scf': + file_err( "You must manually set the 'File Format' to 'Scf' when uploading scf files.", dataset, json_file ) + return + data_type = 'binary' + if not data_type: + # We must have a text file + if check_html( dataset.path ): + file_err( 'The uploaded file contains inappropriate content', dataset, json_file ) + return + if data_type != 'binary' and data_type != 'zip': + if dataset.space_to_tab: + line_count = sniff.convert_newlines_sep2tabs( dataset.path ) + else: + line_count = sniff.convert_newlines( dataset.path ) + if dataset.file_type == 'auto': + ext = sniff.guess_ext( dataset.path ) + else: + ext = dataset.file_type + data_type = ext + # Save job info for the framework + info = dict( type = 'dataset', + dataset_id = dataset.dataset_id, + path = dataset.path, + ext = ext, + stdout = 'uploaded %s file' % data_type, + name = dataset.name, + line_count = line_count ) + json_file.write( to_json_string( info ) + "\n" ) + +def add_composite_file( dataset, json_file ): + if dataset.composite_files: + os.mkdir( dataset.extra_files_path ) + for name, value in dataset.composite_files.iteritems(): + value = util.bunch.Bunch( **value ) + if dataset.composite_file_paths[ value.name ] is None and not value.optional: + file_err( 'A required composite data file was not provided (%s)' % name, dataset, json_file ) + break + elif dataset.composite_file_paths[value.name] is not None: + if not value.is_binary: + if uploaded_dataset.composite_files[ value.name ].space_to_tab: + sniff.convert_newlines_sep2tabs( dataset.composite_file_paths[ value.name ][ 'path' ] ) + else: + sniff.convert_newlines( dataset.composite_file_paths[ value.name ][ 'path' ] ) + shutil.move( dataset.composite_file_paths[ value.name ][ 'path' ], os.path.join( dataset.extra_files_path, name ) ) + info = dict( type = 'dataset', + dataset_id = dataset.dataset_id, + path = dataset.primary_file, + stdout = 'uploaded %s file' % dataset.file_type ) + json_file.write( to_json_string( info ) + "\n" ) + +def __main__(): + + if len( sys.argv ) != 2: + print >>sys.stderr, 'usage: upload.py <json paramfile>' + sys.exit( 1 ) + + json_file = open( 'galaxy.json', 'w' ) + + for line in open( sys.argv[1], 'r' ): + dataset = from_json_string( line ) + dataset = util.bunch.Bunch( **safe_dict( dataset ) ) + + if dataset.type == 'composite': + add_composite_file( dataset, json_file ) + else: + add_file( dataset, json_file ) + + # clean up paramfile + try: + os.remove( sys.argv[1] ) + except: + pass + +if __name__ == '__main__': + __main__() diff -r 5fa8803716fd -r f6e0863862ef tools/data_source/upload.xml --- a/tools/data_source/upload.xml Wed Aug 19 18:07:55 2009 -0400 +++ b/tools/data_source/upload.xml Thu Aug 20 10:49:54 2009 -0400 @@ -1,10 +1,13 @@ <?xml version="1.0"?> -<tool name="Upload File" id="upload1" version="1.0.2"> +<tool name="Upload File" id="upload1" version="1.0.3"> <description> from your computer </description> <action module="galaxy.tools.actions.upload" class="UploadToolAction"/> + <command interpreter="python"> + upload.py $paramfile + </command> <inputs> <param name="file_type" type="select" label="File Format" help="Which format? See help below"> <options from_parameter="tool.app.datatypes_registry.upload_file_formats" transform_lines="[ "%s%s%s" % ( line, self.separator, line ) for line in obj ]">