commit/galaxy-central: dan: Special case genomespace file browser import tool's handling of GenomeSpace format identifier 'http://www.genomespace.org/datamanager/dataformat/unknown' to cause datatype auto-detection via sniff with a fallback to using filename extension.
1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/10ac38a44815/ Changeset: 10ac38a44815 User: dan Date: 2014-01-09 16:54:58 Summary: Special case genomespace file browser import tool's handling of GenomeSpace format identifier 'http://www.genomespace.org/datamanager/dataformat/unknown' to cause datatype auto-detection via sniff with a fallback to using filename extension. Affected #: 1 file diff -r 110a0aabd293fd1393d1296cb220a0b153d69123 -r 10ac38a448152b9a927c34f36334724dfd173bfc tools/genomespace/genomespace_file_browser.py --- a/tools/genomespace/genomespace_file_browser.py +++ b/tools/genomespace/genomespace_file_browser.py @@ -8,11 +8,16 @@ pkg_resources.require( "simplejson" ) import simplejson +import galaxy.model # need to import model before sniff to resolve a circular import dependency +from galaxy.datatypes import sniff +from galaxy.datatypes.registry import Registry + GENOMESPACE_API_VERSION_STRING = "v1.0" GENOMESPACE_SERVER_URL_PROPERTIES = "https://dm.genomespace.org/config/%s/serverurl.properties" % ( GENOMESPACE_API_VERSION_STRING ) CHUNK_SIZE = 2**20 #1mb +AUTO_GALAXY_EXT = "auto" DEFAULT_GALAXY_EXT = "data" #genomespace format identifier is the URL @@ -39,6 +44,9 @@ 'gmt': 'gmt', 'gct': 'gct'} +GENOMESPACE_UNKNOWN_FORMAT_KEY = 'unknown' +GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN = None + VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ' def chunk_write( source_stream, target_stream, source_method = "read", target_method="write" ): @@ -67,7 +75,7 @@ ext = GENOMESPACE_EXT_TO_GALAXY_EXT.get( ext, None ) if ext is None: #could check content type, etc here - ext = DEFAULT_GALAXY_EXT + ext = AUTO_GALAXY_EXT return ext def get_genomespace_site_urls(): @@ -90,6 +98,8 @@ genomespace_formats = simplejson.loads( opened_gs_request.read() ) for format in genomespace_formats: GENOMESPACE_FORMAT_IDENTIFIER_TO_GENOMESPACE_EXT[ format['url'] ] = format['name'] + global 
GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN + GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN = dict( map( lambda x: ( x[1], x[0] ) , GENOMESPACE_FORMAT_IDENTIFIER_TO_GENOMESPACE_EXT.iteritems() ) ).get( GENOMESPACE_UNKNOWN_FORMAT_KEY, GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN ) def download_from_genomespace_file_browser( json_parameter_file, genomespace_site ): json_params = simplejson.loads( open( json_parameter_file, 'r' ).read() ) @@ -108,6 +118,11 @@ file_url_prefix = "fileUrl" file_type_prefix = "fileFormat" metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' ) + + #setup datatypes registry for sniffing + datatypes_registry = Registry() + datatypes_registry.load_datatypes( root_dir = json_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config = json_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] ) + file_numbers = [] for name in datasource_params.keys(): if name.startswith( file_url_prefix ): @@ -143,28 +158,47 @@ filename = urllib.unquote_plus( parsed_url[2].split( '/' )[-1] ) if not filename: filename = download_url + metadata_dict = None + original_filename = filename if output_filename is None: - original_filename = filename filename = ''.join( c in VALID_CHARS and c or '-' for c in filename ) while filename in used_filenames: filename = "-%s" % filename used_filenames.append( filename ) output_filename = os.path.join( datasource_params['__new_file_path__'], 'primary_%i_%s_visible_%s' % ( hda_id, filename, galaxy_ext ) ) - metadata_parameter_file.write( "%s\n" % simplejson.dumps( dict( type = 'new_primary_dataset', - base_dataset_id = dataset_id, - ext = galaxy_ext, - filename = output_filename, - name = "GenomeSpace import on %s" % ( original_filename ) ) ) ) + + metadata_dict = dict( type = 'new_primary_dataset', + base_dataset_id = dataset_id, + ext = galaxy_ext, + filename = output_filename, + name = "GenomeSpace import on %s" % ( original_filename ) ) else: if dataset_id is not None: - metadata_parameter_file.write( 
"%s\n" % simplejson.dumps( dict( type = 'dataset', - dataset_id = dataset_id, - ext = galaxy_ext, - name = "GenomeSpace import on %s" % ( filename ) ) ) ) + metadata_dict = dict( type = 'dataset', + dataset_id = dataset_id, + ext = galaxy_ext, + name = "GenomeSpace import on %s" % ( filename ) ) output_file = open( output_filename, 'wb' ) chunk_write( target_download_url, output_file ) output_file.close() + + if ( galaxy_ext == AUTO_GALAXY_EXT or filetype_url == GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN ) and metadata_dict: + #try to sniff datatype + try: + galaxy_ext = sniff.handle_uploaded_dataset_file( output_filename, datatypes_registry ) + except: + #sniff failed + galaxy_ext = original_filename.rsplit( '.', 1 )[-1] + if galaxy_ext not in datatypes_registry.datatypes_by_extension: + galaxy_ext = DEFAULT_GALAXY_EXT + metadata_dict[ 'ext' ] = galaxy_ext + output_filename = None #only have one filename available + + #write out metadata info + if metadata_dict: + metadata_parameter_file.write( "%s\n" % simplejson.dumps( metadata_dict ) ) + metadata_parameter_file.close() return True Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
commits-noreply@bitbucket.org