1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/0733c87123cb/ Changeset: 0733c87123cb User: martenson Date: 2015-01-26 18:08:16+00:00 Summary: initial version of TS search API and Index builder script beware: always returns jsonp for now Affected #: 7 files diff -r 418633a366083e171f2a3583b591d81a7b333c41 -r 0733c87123cbc2f7009a125c5e57d630bb8296a6 lib/galaxy/webapps/tool_shed/api/search.py --- /dev/null +++ b/lib/galaxy/webapps/tool_shed/api/search.py @@ -0,0 +1,34 @@ +"""API for searching the toolshed repositories""" +from galaxy import exceptions +from galaxy import eggs +from galaxy.web import _future_expose_api as expose_api +# from galaxy.web import _future_expose_api_anonymous as expose_api_anonymous +from galaxy.web import _future_expose_api_raw_anonymous as expose_api_raw_anonymous +from galaxy.web.base.controller import BaseAPIController +from galaxy.webapps.tool_shed.search.repo_search import RepoSearch +import json +import logging +log = logging.getLogger( __name__ ) + + +class SearchController ( BaseAPIController ): + + @expose_api_raw_anonymous + def search( self, trans, search_term, **kwd ): + """ + Perform a search over the Whoosh index. + The index has to be pre-created with build_ts_whoosh_index.sh. + TS config option toolshed_search_on has to be turned on and + toolshed_whoosh_index_dir has to be specified and existing. + """ + if not self.app.config.toolshed_search_on: + raise exceptions.ConfigDoesNotAllowException( 'Searching the TS through the API is turned off for this instance.' ) + if not self.app.config.toolshed_whoosh_index_dir: + raise exceptions.ConfigDoesNotAllowException( 'There is no directory for the search index specified. Please ontact the administrator.' ) + + repo_search = RepoSearch() + results = repo_search.search( trans, search_term ) + + response = '%s(%s);' % ( kwd.get('callback'), json.dumps(results) ) + log.debug(response) + return response diff -r 418633a366083e171f2a3583b591d81a7b333c41 -r 0733c87123cbc2f7009a125c5e57d630bb8296a6 lib/galaxy/webapps/tool_shed/buildapp.py --- a/lib/galaxy/webapps/tool_shed/buildapp.py +++ b/lib/galaxy/webapps/tool_shed/buildapp.py @@ -85,6 +85,11 @@ controller='authenticate', action='get_tool_shed_api_key', conditions=dict( method=[ "GET" ] ) ) + webapp.mapper.connect( 'repo_search', + '/api/search/', + controller='search', + action='search', + conditions=dict( method=[ "GET" ] ) ) webapp.mapper.resource( 'category', 'categories', controller='categories', diff -r 418633a366083e171f2a3583b591d81a7b333c41 -r 0733c87123cbc2f7009a125c5e57d630bb8296a6 lib/galaxy/webapps/tool_shed/config.py --- a/lib/galaxy/webapps/tool_shed/config.py +++ b/lib/galaxy/webapps/tool_shed/config.py @@ -40,6 +40,9 @@ self.database_connection = kwargs.get( "database_connection", False ) self.database_engine_options = get_database_engine_options( kwargs ) self.database_create_tables = string_as_bool( kwargs.get( "database_create_tables", "True" ) ) + # Whoosh search + self.toolshed_search_on = string_as_bool( kwargs.get( "toolshed_search_on", False ) ) + self.toolshed_whoosh_index_dir = kwargs.get( "toolshed_whoosh_index_dir", None ) # Analytics self.ga_code = kwargs.get( "ga_code", None ) # Where dataset files are stored diff -r 418633a366083e171f2a3583b591d81a7b333c41 -r 0733c87123cbc2f7009a125c5e57d630bb8296a6 lib/galaxy/webapps/tool_shed/search/repo_search.py --- /dev/null +++ b/lib/galaxy/webapps/tool_shed/search/repo_search.py @@ -0,0 +1,98 @@ +"""Module for searching the toolshed repositories""" +from galaxy import exceptions +from galaxy import eggs +from galaxy.web.base.controller import BaseAPIController +from galaxy.webapps.tool_shed import model +from tool_shed.util.shed_util_common import generate_sharable_link_for_repository_in_tool_shed + + +import logging +log = logging.getLogger( __name__ ) + +# Whoosh is compatible with Python 2.5+ +# Try to import Whoosh and set flag to indicate whether +# the tool search is ready. +try: + eggs.require( "Whoosh" ) + import whoosh.index + from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT + from whoosh.scoring import BM25F + from whoosh.qparser import MultifieldParser + from whoosh.index import Index + search_ready = True + + schema = Schema( + id=STORED, + name=TEXT, + description=TEXT, + long_description=TEXT, + repo_type=TEXT, + homepage_url=TEXT, + remote_repository_url=TEXT, + repo_owner_username=TEXT ) +except ImportError, e: + search_ready = False + schema = None + +class RepoSearch ( object ): + + def search( self, trans, search_term, **kwd ): + """ + Perform the search on the given search_term + + :param search_term: unicode encoded string with the search term(s) + + :returns results: dictionary containing number of hits, + hits themselves and matched terms for each + """ + if search_ready: + toolshed_whoosh_index_dir = trans.app.config.toolshed_whoosh_index_dir + index_exists = whoosh.index.exists_in( toolshed_whoosh_index_dir ) + if index_exists: + index = whoosh.index.open_dir( toolshed_whoosh_index_dir ) + try: + # Some literature about BM25F: + # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf + # http://en.wikipedia.org/wiki/Okapi_BM25 + # Basically the higher number the bigger weight. + searcher = index.searcher( weighting=BM25F( field_B={ + 'name_B' : 0.9, + 'description_B' : 0.6, + 'long_description_B' : 0.5, + 'homepage_url_B' : 0.3, + 'remote_repository_url_B' : 0.2, + 'repo_owner_username' : 0.3, + 'repo_type_B' : 0.1 } ) ) + parser = MultifieldParser( [ + 'name', + 'description', + 'long_description', + 'repo_type', + 'homepage_url', + 'remote_repository_url', + 'repo_owner_username' ], schema=schema ) + + hits = searcher.search( parser.parse( '*' + search_term + '*' ), terms = True ) + results = {} + results[ 'length'] = len( hits ) + results[ 'hits' ] = [] + for hit in hits: + repo = trans.sa_session.query( model.Repository ).filter_by( id=hit[ 'id' ] ).one() + approved = 'no' + for review in repo.reviews: + if review.approved == 'yes': + approved = 'yes' + break + hit_dict = repo.to_dict( view='element', value_mapper={ 'id': trans.security.encode_id, 'user_id': trans.security.encode_id } ) + hit_dict[ 'url'] = generate_sharable_link_for_repository_in_tool_shed( repo ) + hit_dict[ 'last_updated' ] = repo.update_time.strftime( "%Y-%m-%d %I:%M %p" ) + hit_dict[ 'times_downloaded' ] = repo.times_downloaded + hit_dict[ 'approved' ] = approved + results[ 'hits' ].append( {'repository': hit_dict, 'matched_terms': hit.matched_terms() } ) + return results + finally: + searcher.close() + else: + raise exceptions.InternalServerError( 'The search index file is missing.' ) + else: + raise exceptions.InternalServerError( 'Could not initialize search.' ) diff -r 418633a366083e171f2a3583b591d81a7b333c41 -r 0733c87123cbc2f7009a125c5e57d630bb8296a6 scripts/tool_shed/build_ts_whoosh_index.py --- /dev/null +++ b/scripts/tool_shed/build_ts_whoosh_index.py @@ -0,0 +1,101 @@ +# !/usr/bin/env python +""" Build indeces for searching the TS """ +import sys, os, csv, urllib, urllib2, ConfigParser + +new_path = [ os.path.join( os.getcwd(), "lib" ) ] +new_path.extend( sys.path[1:] ) # remove scripts/ from the path +sys.path = new_path + +from galaxy import eggs + +# Whoosh is compatible with Python 2.5+ Try to import Whoosh and set flag to indicate whether search is enabled. +try: + eggs.require( "Whoosh" ) + import pkg_resources + pkg_resources.require( "SQLAlchemy >= 0.4" ) + + import whoosh.index + import galaxy.webapps.tool_shed.model.mapping + from whoosh.filedb.filestore import FileStorage + from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT + from whoosh.scoring import BM25F + from whoosh.qparser import MultifieldParser + from whoosh.index import Index + from galaxy.webapps.tool_shed import config, model + + whoosh_ready = True + schema = Schema( + id=STORED, + name=TEXT, + description=TEXT, + long_description=TEXT, + repo_type=TEXT, + homepage_url=TEXT, + remote_repository_url=TEXT, + repo_owner_username=TEXT ) + +except ImportError, e: + print 'import error' + whoosh_ready = False + schema = None + +def build_index( sa_session, toolshed_whoosh_index_dir ): + storage = FileStorage( toolshed_whoosh_index_dir ) + index = storage.create_index( schema ) + writer = index.writer() + def to_unicode( a_basestr ): + if type( a_basestr ) is str: + return unicode( a_basestr, 'utf-8' ) + else: + return a_basestr + + repos_indexed = 0 + for id, name, description, long_description, repo_type, homepage_url, remote_repository_url, repo_owner_username in get_repos( sa_session ): + writer.add_document( id=id, + name=to_unicode( name ), + description=to_unicode( description ), + long_description=to_unicode( long_description ), + repo_type=to_unicode( repo_type ), + homepage_url=to_unicode( homepage_url ), + remote_repository_url=to_unicode( remote_repository_url ), + repo_owner_username=to_unicode( repo_owner_username ) ) + repos_indexed += 1 + writer.commit() + print "Number of repos indexed: ", repos_indexed + +def get_repos( sa_session ): + for repo in sa_session.query( model.Repository ).filter_by( deleted=False ).filter_by( deprecated=False ).filter( model.Repository.type != 'tool_dependency_definition' ): + id = repo.id + name = repo.name + description = repo.description + long_description = repo.long_description + repo_type = repo.type + homepage_url = repo.homepage_url + remote_repository_url = repo.remote_repository_url + + repo_owner_username = "" + if repo.user_id is not None: + user = sa_session.query( model.User ).filter( model.User.id == repo.user_id ).one() + repo_owner_username = user.username + + yield id, name, description, long_description, repo_type, homepage_url, remote_repository_url, repo_owner_username + +def get_sa_session_and_needed_config_settings( ini_file ): + conf_parser = ConfigParser.ConfigParser( { 'here' : os.getcwd() } ) + conf_parser.read( ini_file ) + kwds = dict() + for key, value in conf_parser.items( "app:main" ): + kwds[ key ] = value + config_settings = config.Configuration( **kwds ) + db_con = config_settings.database_connection + if not db_con: + db_con = "sqlite:///%s?isolation_level=IMMEDIATE" % config_settings.database + model = galaxy.webapps.tool_shed.model.mapping.init( config_settings.file_path, db_con, engine_options={}, create_tables=False ) + return model.context.current, config_settings + +if __name__ == "__main__": + if whoosh_ready: + ini_file = sys.argv[ 1 ] + sa_session, config_settings = get_sa_session_and_needed_config_settings( ini_file ) + toolshed_whoosh_index_dir = config_settings.get( 'toolshed_whoosh_index_dir', None ) + build_index( sa_session, toolshed_whoosh_index_dir ) diff -r 418633a366083e171f2a3583b591d81a7b333c41 -r 0733c87123cbc2f7009a125c5e57d630bb8296a6 scripts/tool_shed/build_ts_whoosh_index.sh --- /dev/null +++ b/scripts/tool_shed/build_ts_whoosh_index.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +cd `dirname $0`/../.. +# Make sure your config is at config/tool_shed.ini +# and that you specified toolshed_whoosh_index_dir in it. +python scripts/tool_shed/build_ts_whoosh_index.py config/tool_shed.ini Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.