[hg] galaxy 2768: More improvement in cleanup_datasets script - ...
details: http://www.bx.psu.edu/hg/galaxy/rev/35dd55a7898e changeset: 2768:35dd55a7898e user: Greg Von Kuster <greg@bx.psu.edu> date: Thu Sep 24 16:52:15 2009 -0400 description: More improvement in cleanup_datasets script - will take significantly less disk, and may be faster. 1 file(s) affected in this change: scripts/cleanup_datasets/cleanup_datasets.py diffs (75 lines): diff -r 93b8b571a9fa -r 35dd55a7898e scripts/cleanup_datasets/cleanup_datasets.py --- a/scripts/cleanup_datasets/cleanup_datasets.py Thu Sep 24 16:16:56 2009 -0400 +++ b/scripts/cleanup_datasets/cleanup_datasets.py Thu Sep 24 16:52:15 2009 -0400 @@ -203,6 +203,7 @@ print "Elapsed time: ", stop - start, "\n" def delete_datasets( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ): + import sqlalchemy as sa # Marks datasets as deleted if associated items are all deleted. print "######### Starting delete_datasets #########\n" start = time.clock() @@ -211,23 +212,33 @@ .filter( app.model.HistoryDatasetAssociation.table.c.update_time < cutoff_time ).all() library_datasets = app.model.Dataset.options( eagerload( "library_associations" ) ) \ .filter( app.model.LibraryDatasetDatasetAssociation.table.c.update_time < cutoff_time ).all() - else: - history_datasets = app.model.Dataset.filter_by( deleted = False ) \ - .options( eagerload( "history_associations" ) ) \ - .filter( and_( app.model.HistoryDatasetAssociation.table.c.update_time < cutoff_time, - app.model.HistoryDatasetAssociation.table.c.deleted==True ) ).all() - library_datasets = app.model.Dataset.filter_by( deleted = False ) \ - .options( eagerload( "library_associations" ) ) \ - .filter( and_( app.model.LibraryDatasetDatasetAssociation.table.c.update_time < cutoff_time, - app.model.LibraryDatasetDatasetAssociation.table.c.deleted==True ) ).all() - print "Time to query history and library datasets: ", time.clock() - start, "\n" - print "Processing ", len( history_datasets ), " history datasets and ", len( library_datasets ), " library datasets...\n\n" - datasets = history_datasets + library_datasets + else: + # We really only need the id column here, but sqlalchemy barfs when trying to select only 1 column + history_dataset_ids_query = sa.select( ( app.model.Dataset.table.c.id, + app.model.Dataset.table.c.state ), + whereclause = sa.and_( app.model.Dataset.table.c.deleted == False, + app.model.HistoryDatasetAssociation.table.c.update_time < cutoff_time, + app.model.HistoryDatasetAssociation.table.c.deleted == True ), + from_obj = [ sa.outerjoin( app.model.Dataset.table, + app.model.HistoryDatasetAssociation.table ) ] ) + history_dataset_ids = [ row.id for row in history_dataset_ids_query.execute() ] + print "Time to retrieve ", len( history_dataset_ids ), " history dataset ids: ", time.clock() - start + library_dataset_ids_query = sa.select( ( app.model.Dataset.table.c.id, + app.model.Dataset.table.c.state ), + whereclause = sa.and_( app.model.Dataset.table.c.deleted == False, + app.model.LibraryDatasetDatasetAssociation.table.c.update_time < cutoff_time, + app.model.LibraryDatasetDatasetAssociation.table.c.deleted == True ), + from_obj = [ sa.outerjoin( app.model.Dataset.table, + app.model.LibraryDatasetDatasetAssociation.table ) ] ) + library_dataset_ids = [ row.id for row in library_dataset_ids_query.execute() ] + print "Time to retrieve ", len( library_dataset_ids ), " library dataset ids: ", time.clock() - start + dataset_ids = history_dataset_ids + library_dataset_ids skip = [] deleted_dataset_count = 0 deleted_instance_count = 0 - for dataset in datasets: - print "Processing dataset id:", dataset.id, "\n" + for dataset_id in dataset_ids: + print "Processing dataset id:", dataset_id, "\n" + dataset = app.model.Dataset.get( id ) if dataset.id not in skip and _dataset_is_deletable( dataset ): deleted_dataset_count += 1 for dataset_instance in dataset.history_associations + dataset.library_associations: @@ -375,6 +386,13 @@ self.file_path = file_path # Setup the database engine and ORM self.model = galaxy.model.mapping.init( self.file_path, self.database_connection, engine_options={}, create_tables=False ) + @property + def sa_session( self ): + """ + Returns a SQLAlchemy session -- currently just gets the current + session from the threadlocal session context, but this is provided + to allow migration toward a more SQLAlchemy 0.4 style of use. + """ + return self.model.context.current -if __name__ == "__main__": - main() +if __name__ == "__main__": main()
participants (1)
-
Nate Coraor