commit/galaxy-central: martenson: way better search; dont query DB; repair final scoring functions
1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/bbebacb3a4e5/ Changeset: bbebacb3a4e5 User: martenson Date: 2015-01-28 20:24:10+00:00 Summary: way better search; dont query DB; repair final scoring functions Affected #: 3 files diff -r bc4403d1f81b560c8e0685eb1093b83ff6b348e4 -r bbebacb3a4e57063ad911d980c67182b6fe44522 lib/galaxy/webapps/tool_shed/api/search.py --- a/lib/galaxy/webapps/tool_shed/api/search.py +++ b/lib/galaxy/webapps/tool_shed/api/search.py @@ -6,7 +6,9 @@ from galaxy.web import _future_expose_api_raw_anonymous as expose_api_raw_anonymous from galaxy.web.base.controller import BaseAPIController from galaxy.webapps.tool_shed.search.repo_search import RepoSearch +from galaxy.web import url_for import json + import logging log = logging.getLogger( __name__ ) @@ -28,7 +30,6 @@ repo_search = RepoSearch() results = repo_search.search( trans, search_term ) - - response = '%s(%s);' % ( kwd.get('callback'), json.dumps(results) ) - log.debug(response) + results[ 'hostname' ] = url_for( '/', qualified = True ) + response = '%s(%s);' % ( kwd.get( 'callback' ), json.dumps( results ) ) return response diff -r bc4403d1f81b560c8e0685eb1093b83ff6b348e4 -r bbebacb3a4e57063ad911d980c67182b6fe44522 lib/galaxy/webapps/tool_shed/search/repo_search.py --- a/lib/galaxy/webapps/tool_shed/search/repo_search.py +++ b/lib/galaxy/webapps/tool_shed/search/repo_search.py @@ -2,10 +2,7 @@ import datetime from galaxy import exceptions from galaxy import eggs -from galaxy.util import pretty_print_time_interval -from galaxy.web.base.controller import BaseAPIController from galaxy.webapps.tool_shed import model -from tool_shed.util.shed_util_common import generate_sharable_link_for_repository_in_tool_shed import logging log = logging.getLogger( __name__ ) @@ -27,11 +24,11 @@ name = TEXT( field_boost = 1.7 ), description = TEXT( field_boost = 1.5 ), long_description = TEXT, - repo_type = TEXT, homepage_url = TEXT, remote_repository_url = TEXT, repo_owner_username = TEXT, - times_downloaded = STORED ) + times_downloaded = STORED, + approved = STORED ) except ImportError, e: search_ready = False @@ -46,13 +43,25 @@ use_final = True def final( self, searcher, docnum, score ): + log.debug('score before: ' + str(score) ) + + # Arbitrary for now + reasonable_hits = 100.0 + times_downloaded = int( searcher.stored_fields( docnum )[ "times_downloaded" ] ) + # Add 1 to prevent 0 being divided + if times_downloaded == 0: + times_downloaded = 1 + popularity_modifier = ( times_downloaded / reasonable_hits ) + log.debug('popularity_modifier: ' + str(popularity_modifier) ) - maxhits = 300 - hitcount = searcher.stored_fields( docnum )[ "times_downloaded" ] - log.debug( 'hitcount: ' + str( hitcount ) ) + cert_modifier = 2 if searcher.stored_fields( docnum )[ "approved" ] == 'yes' else 1 + log.debug('cert_modifier: ' + str(cert_modifier) ) - # Multiply the computed score for this document by the popularity - return score * ( hitcount / maxhits ) + # Adjust the computed score for this document by the popularity + # and by the certification level. + final_score = score * popularity_modifier * cert_modifier + log.debug('score after: ' + str( final_score ) ) + return final_score class RepoSearch( object ): @@ -80,8 +89,7 @@ 'long_description_B' : 0.5, 'homepage_url_B' : 0.3, 'remote_repository_url_B' : 0.2, - 'repo_owner_username' : 0.3, - 'repo_type_B' : 0.1 } ) + 'repo_owner_username' : 0.3 } ) # log.debug(repo_weighting.__dict__) searcher = index.searcher( weighting = repo_weighting ) @@ -90,7 +98,6 @@ 'name', 'description', 'long_description', - 'repo_type', 'homepage_url', 'remote_repository_url', 'repo_owner_username' ], schema = schema ) @@ -99,30 +106,28 @@ user_query = parser.parse( '*' + search_term + '*' ) hits = searcher.search( user_query, terms = True ) + # hits = searcher.search( user_query ) # hits = searcher.search_page( user_query, 1, pagelen = 1, terms = True ) log.debug( 'searching for: #' + str( search_term ) ) log.debug( 'total hits: ' + str( len( hits ) ) ) log.debug( 'scored hits: ' + str( hits.scored_length() ) ) results = {} - results[ 'total_results'] = len( hits ) + results[ 'total_results'] = str( hits.scored_length() ) results[ 'hits' ] = [] for hit in hits: - repo = trans.sa_session.query( model.Repository ).filter_by( id = hit[ 'id' ] ).one() - approved = 'no' - for review in repo.reviews: - if review.approved == 'yes': - approved = 'yes' - break - hit_dict = repo.to_dict( view = 'element', value_mapper = { 'id': trans.security.encode_id, 'user_id': trans.security.encode_id } ) - hit_dict[ 'url'] = generate_sharable_link_for_repository_in_tool_shed( repo ) - - # Format the time since last update to be nicely readable. - time_ago = pretty_print_time_interval( repo.update_time ) - hit_dict[ 'last_updated' ] = time_ago - hit_dict[ 'full_last_updated' ] = repo.update_time.strftime( "%Y-%m-%d %I:%M %p" ) - hit_dict[ 'times_downloaded' ] = repo.times_downloaded - hit_dict[ 'approved' ] = approved - results[ 'hits' ].append( {'repository': hit_dict, 'matched_terms': hit.matched_terms() } ) + hit_dict = {} + hit_dict[ 'id' ] = trans.security.encode_id( hit.get( 'id' ) ) + hit_dict[ 'repo_owner_username' ] = hit.get( 'repo_owner_username' ) + hit_dict[ 'name' ] = hit.get( 'name' ) + hit_dict[ 'long_description' ] = hit.get( 'long_description' ) + hit_dict[ 'remote_repository_url' ] = hit.get( 'remote_repository_url' ) + hit_dict[ 'homepage_url' ] = hit.get( 'homepage_url' ) + hit_dict[ 'description' ] = hit.get( 'description' ) + hit_dict[ 'last_updated' ] = hit.get( 'last_updated' ) + hit_dict[ 'full_last_updated' ] = hit.get( 'full_last_updated' ) + hit_dict[ 'approved' ] = hit.get( 'approved' ) + hit_dict[ 'times_downloaded' ] = hit.get( 'times_downloaded' ) + results[ 'hits' ].append( {'repository': hit_dict, 'matched_terms': hit.matched_terms(), 'score': hit.score } ) return results finally: searcher.close() diff -r bc4403d1f81b560c8e0685eb1093b83ff6b348e4 -r bbebacb3a4e57063ad911d980c67182b6fe44522 scripts/tool_shed/build_ts_whoosh_index.py --- a/scripts/tool_shed/build_ts_whoosh_index.py +++ b/scripts/tool_shed/build_ts_whoosh_index.py @@ -1,21 +1,26 @@ # !/usr/bin/env python """ Build indeces for searching the TS """ -import sys, os, csv, urllib, urllib2, ConfigParser +import sys +import os +import csv +import urllib +import urllib2 +import ConfigParser new_path = [ os.path.join( os.getcwd(), "lib" ) ] new_path.extend( sys.path[1:] ) # remove scripts/ from the path sys.path = new_path +from galaxy.util import pretty_print_time_interval from galaxy import eggs +eggs.require( "SQLAlchemy" ) + +import galaxy.webapps.tool_shed.model.mapping # Whoosh is compatible with Python 2.5+ Try to import Whoosh and set flag to indicate whether search is enabled. try: eggs.require( "Whoosh" ) - import pkg_resources - pkg_resources.require( "SQLAlchemy >= 0.4" ) - import whoosh.index - import galaxy.webapps.tool_shed.model.mapping from whoosh.filedb.filestore import FileStorage from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT, STORED from whoosh.scoring import BM25F @@ -26,14 +31,16 @@ whoosh_ready = True schema = Schema( id = STORED, - name = TEXT( field_boost = 1.7 ), - description = TEXT( field_boost = 1.5 ), - long_description = TEXT, - repo_type = TEXT, - homepage_url = TEXT, - remote_repository_url = TEXT, - repo_owner_username = TEXT, - times_downloaded = STORED ) + name = TEXT( field_boost = 1.7, stored = True ), + description = TEXT( field_boost = 1.5, stored = True ), + long_description = TEXT( stored = True ), + homepage_url = TEXT( stored = True ), + remote_repository_url = TEXT( stored = True ), + repo_owner_username = TEXT( stored = True ), + times_downloaded = STORED, + approved = STORED, + last_updated = STORED, + full_last_updated = STORED ) except ImportError, e: print 'import error' @@ -51,16 +58,29 @@ return a_basestr repos_indexed = 0 - for id, name, description, long_description, repo_type, homepage_url, remote_repository_url, repo_owner_username, times_downloaded in get_repos( sa_session ): + for ( id, + name, + description, + long_description, + homepage_url, + remote_repository_url, + repo_owner_username, + times_downloaded, + approved, + last_updated, + full_last_updated ) in get_repos( sa_session ): + writer.add_document( id = id, name = to_unicode( name ), description = to_unicode( description ), long_description = to_unicode( long_description ), - repo_type = to_unicode( repo_type ), homepage_url = to_unicode( homepage_url ), remote_repository_url = to_unicode( remote_repository_url ), repo_owner_username = to_unicode( repo_owner_username ), - times_downloaded = times_downloaded ) + times_downloaded = times_downloaded, + approved = approved, + last_updated = last_updated, + full_last_updated = full_last_updated ) repos_indexed += 1 writer.commit() print "Number of repos indexed: ", repos_indexed @@ -71,17 +91,36 @@ name = repo.name description = repo.description long_description = repo.long_description - repo_type = repo.type homepage_url = repo.homepage_url remote_repository_url = repo.remote_repository_url times_downloaded = repo.times_downloaded - repo_owner_username = "" + repo_owner_username = '' if repo.user_id is not None: user = sa_session.query( model.User ).filter( model.User.id == repo.user_id ).one() repo_owner_username = user.username - yield id, name, description, long_description, repo_type, homepage_url, remote_repository_url, repo_owner_username, times_downloaded + approved = 'no' + for review in repo.reviews: + if review.approved == 'yes': + approved = 'yes' + break + + # Format the time since last update to be nicely readable. + last_updated = pretty_print_time_interval( repo.update_time ) + full_last_updated = repo.update_time.strftime( "%Y-%m-%d %I:%M %p" ) + + yield ( id, + name, + description, + long_description, + homepage_url, + remote_repository_url, + repo_owner_username, + times_downloaded, + approved, + last_updated, + full_last_updated ) def get_sa_session_and_needed_config_settings( ini_file ): conf_parser = ConfigParser.ConfigParser( { 'here' : os.getcwd() } ) Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
commits-noreply@bitbucket.org