details:   http://www.bx.psu.edu/hg/galaxy/rev/aff67d57fd38
changeset: 2771:aff67d57fd38
user:      Greg Von Kuster <greg@bx.psu.edu>
date:      Thu Sep 24 21:17:32 2009 -0400
description:
Improve logging for cleanup_datasets.py script.

1 file(s) affected in this change:

scripts/cleanup_datasets/cleanup_datasets.py

diffs (376 lines):

diff -r 80735c6b6932 -r aff67d57fd38 scripts/cleanup_datasets/cleanup_datasets.py
--- a/scripts/cleanup_datasets/cleanup_datasets.py	Thu Sep 24 19:58:35 2009 -0400
+++ b/scripts/cleanup_datasets/cleanup_datasets.py	Thu Sep 24 21:17:32 2009 -0400
@@ -16,6 +16,7 @@
 from optparse import OptionParser
 
 import galaxy.model.mapping
+import sqlalchemy as sa
 from galaxy.model.orm import and_, eagerload
 
 assert sys.version_info[:2] >= ( 2, 4 )
@@ -26,20 +27,13 @@
     parser.add_option( "-r", "--remove_from_disk", action="store_true", dest="remove_from_disk", help="remove datasets from disk when purged", default=False )
     parser.add_option( "-i", "--info_only", action="store_true", dest="info_only", help="info about the requested action", default=False )
     parser.add_option( "-f", "--force_retry", action="store_true", dest="force_retry", help="performs the requested actions, but ignores whether it might have been done before. Useful when -r wasn't used, but should have been", default=False )
-    parser.add_option( "-1", "--delete_userless_histories", action="store_true", dest="delete_userless_histories", default=False, help="delete userless histories and datasets" )
-    parser.add_option( "-2", "--purge_histories", action="store_true", dest="purge_histories", default=False, help="purge deleted histories" )
-    parser.add_option( "-3", "--purge_datasets", action="store_true", dest="purge_datasets", default=False, help="purge deleted datasets" )
-    parser.add_option( "-4", "--purge_libraries", action="store_true", dest="purge_libraries", default=False, help="purge deleted libraries" )
-    parser.add_option( "-5", "--purge_folders", action="store_true", dest="purge_folders", default=False, help="purge deleted library folders" )
-    parser.add_option( "-6", "--delete_datasets", action="store_true", dest="delete_datasets", default=False, help="mark deletable datasets as deleted and purge associated dataset instances" )
-
-
+
     ( options, args ) = parser.parse_args()
     ini_file = args[0]
@@ -67,14 +61,15 @@
     cutoff_time = datetime.utcnow() - timedelta( days=options.days )
     now = strftime( "%Y-%m-%d %H:%M:%S" )
 
-    print "\n# %s - Handling stuff older than %i days\n" % ( now, options.days )
+    print "##########################################"
+    print "\n# %s - Handling stuff older than %i days" % ( now, options.days )
 
     if options.info_only:
         print "# Displaying info only ( --info_only )\n"
     elif options.remove_from_disk:
-        print "# Datasets will be removed from disk.\n"
+        print "Datasets will be removed from disk.\n"
     else:
-        print "# Datasets will NOT be removed from disk.\n"
+        print "Datasets will NOT be removed from disk.\n"
 
     if options.delete_userless_histories:
         delete_userless_histories( app, cutoff_time, info_only = options.info_only, force_retry = options.force_retry )
@@ -96,8 +91,7 @@
     # The purge history script will handle marking DatasetInstances as deleted.
     # Nothing is removed from disk yet.
     history_count = 0
-    print '# The following datasets and associated userless histories have been deleted'
-    start = time.clock()
+    start = time.time()
     if force_retry:
         histories = app.model.History.filter( and_( app.model.History.table.c.user_id==None,
                                                     app.model.History.table.c.update_time < cutoff_time ) ).all()
@@ -107,14 +101,14 @@
                                                     app.model.History.table.c.update_time < cutoff_time ) ).all()
     for history in histories:
         if not info_only:
+            print "Deleting history id ", history.id
             history.deleted = True
-            print "%d" % history.id
         history_count += 1
     app.model.flush()
-    stop = time.clock()
-    print "# Deleted %d histories.\n" % ( history_count )
-    print "Elapsed time: ", stop - start, "\n"
-
+    stop = time.time()
+    print "Deleted %d histories" % history_count
+    print "Elapsed time: ", stop - start
+    print "##########################################"
 
 def purge_histories( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
     # Purges deleted histories whose update_time is older than the cutoff_time.
@@ -123,17 +117,16 @@
     # history.purged == True simply means that it can no longer be undeleted
     # i.e. all associated datasets are marked as deleted
     history_count = 0
-    print '# The following datasets and associated deleted histories have been purged'
-    start = time.clock()
+    start = time.time()
     if force_retry:
         histories = app.model.History.filter( and_( app.model.History.table.c.deleted==True,
-                                                    app.model.History.table.c.update_time < cutoff_time ) ) \
-                                     .options( eagerload( 'datasets' ) ).all()
+                                                     app.model.History.table.c.update_time < cutoff_time ) ) \
+                                      .options( eagerload( 'datasets' ) ).all()
     else:
         histories = app.model.History.filter( and_( app.model.History.table.c.deleted==True,
-                                                    app.model.History.table.c.purged==False,
-                                                    app.model.History.table.c.update_time < cutoff_time ) ) \
-                                     .options( eagerload( 'datasets' ) ).all()
+                                                     app.model.History.table.c.purged==False,
+                                                     app.model.History.table.c.update_time < cutoff_time ) ) \
+                                      .options( eagerload( 'datasets' ) ).all()
     for history in histories:
         for dataset_assoc in history.datasets:
             _purge_dataset_instance( dataset_assoc, app, remove_from_disk, info_only = info_only ) #mark a DatasetInstance as deleted, clear associated files, and mark the Dataset as deleted if it is deletable
@@ -143,13 +136,14 @@
             # if we should ever delete info like this from the db though, so commented out for now...
             #for dhp in history.default_permissions:
             #    dhp.delete()
+            print "Purging history id ", history.id
             history.purged = True
-            print "%d" % history.id
         history_count += 1
     app.model.flush()
-    stop = time.clock()
-    print '# Purged %d histories.' % ( history_count ), '\n'
-    print "Elapsed time: ", stop - start, "\n"
+    stop = time.time()
+    print 'Purged %d histories.' % history_count
+    print "Elapsed time: ", stop - start
+    print "##########################################"
 
 def purge_libraries( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
     # Purges deleted libraries whose update_time is older than the cutoff_time.
@@ -158,8 +152,7 @@
     # library.purged == True simply means that it can no longer be undeleted
     # i.e. all associated LibraryDatasets/folders are marked as deleted
     library_count = 0
-    print '# The following libraries and associated folders have been purged'
-    start = time.clock()
+    start = time.time()
     if force_retry:
         libraries = app.model.Library.filter( and_( app.model.Library.table.c.deleted==True,
                                                     app.model.Library.table.c.update_time < cutoff_time ) ).all()
@@ -170,13 +163,14 @@
     for library in libraries:
         _purge_folder( library.root_folder, app, remove_from_disk, info_only = info_only )
         if not info_only:
+            print "Purging library id ", library.id
             library.purged = True
-            print "%d" % library.id
         library_count += 1
     app.model.flush()
-    stop = time.clock()
-    print '# Purged %d libraries .' % ( library_count ), '\n'
-    print "Elapsed time: ", stop - start, "\n"
+    stop = time.time()
+    print '# Purged %d libraries .' % library_count
+    print "Elapsed time: ", stop - start
+    print "##########################################"
 
 def purge_folders( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
     # Purges deleted folders whose update_time is older than the cutoff_time.
@@ -185,8 +179,7 @@
     # libraryFolder.purged == True simply means that it can no longer be undeleted
     # i.e. all associated LibraryDatasets/folders are marked as deleted
     folder_count = 0
-    print '# The following folders have been purged'
-    start = time.clock()
+    start = time.time()
     if force_retry:
         folders = app.model.LibraryFolder.filter( and_( app.model.LibraryFolder.table.c.deleted==True,
                                                         app.model.LibraryFolder.table.c.update_time < cutoff_time ) ).all()
@@ -196,22 +189,26 @@
                                                         app.model.LibraryFolder.table.c.update_time < cutoff_time ) ).all()
     for folder in folders:
         _purge_folder( folder, app, remove_from_disk, info_only = info_only )
-        print "%d" % folder.id
         folder_count += 1
-    stop = time.clock()
-    print '# Purged %d folders.' % ( folder_count ), '\n'
-    print "Elapsed time: ", stop - start, "\n"
+    stop = time.time()
+    print '# Purged %d folders.' % folder_count
+    print "Elapsed time: ", stop - start
+    print "##########################################"
 
 def delete_datasets( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
-    import sqlalchemy as sa
     # Marks datasets as deleted if associated items are all deleted.
-    print "######### Starting delete_datasets #########\n"
-    start = time.clock()
+    start = time.time()
     if force_retry:
-        history_datasets = app.model.Dataset.options( eagerload( "history_associations" ) ) \
-                                            .filter( app.model.HistoryDatasetAssociation.table.c.update_time < cutoff_time ).all()
-        library_datasets = app.model.Dataset.options( eagerload( "library_associations" ) ) \
-                                            .filter( app.model.LibraryDatasetDatasetAssociation.table.c.update_time < cutoff_time ).all()
+        history_dataset_ids_query = sa.select( ( app.model.Dataset.table.c.id,
+                                                 app.model.Dataset.table.c.state ),
+                                               whereclause = app.model.HistoryDatasetAssociation.table.c.update_time < cutoff_time,
+                                               from_obj = [ sa.outerjoin( app.model.Dataset.table,
+                                                                          app.model.HistoryDatasetAssociation.table ) ] )
+        library_dataset_ids_query = sa.select( ( app.model.Dataset.table.c.id,
+                                                 app.model.Dataset.table.c.state ),
+                                               whereclause = app.model.LibraryDatasetDatasetAssociation.table.c.update_time < cutoff_time,
+                                               from_obj = [ sa.outerjoin( app.model.Dataset.table,
+                                                                          app.model.LibraryDatasetDatasetAssociation.table ) ] )
     else:
         # We really only need the id column here, but sqlalchemy barfs when trying to select only 1 column
         history_dataset_ids_query = sa.select( ( app.model.Dataset.table.c.id,
@@ -221,8 +218,6 @@
                                                  app.model.Dataset.table.c.state ),
                                                whereclause = sa.and_( app.model.Dataset.table.c.deleted == False,
                                                                       app.model.HistoryDatasetAssociation.table.c.deleted == True ),
                                               from_obj = [ sa.outerjoin( app.model.Dataset.table,
                                                                          app.model.HistoryDatasetAssociation.table ) ] )
-        history_dataset_ids = [ row.id for row in history_dataset_ids_query.execute() ]
-        print "Time to retrieve ", len( history_dataset_ids ), " history dataset ids: ", time.clock() - start
         library_dataset_ids_query = sa.select( ( app.model.Dataset.table.c.id,
                                                  app.model.Dataset.table.c.state ),
                                                whereclause = sa.and_( app.model.Dataset.table.c.deleted == False,
@@ -230,35 +225,33 @@
                                                                      app.model.LibraryDatasetDatasetAssociation.table.c.deleted == True ),
                                               from_obj = [ sa.outerjoin( app.model.Dataset.table,
                                                                          app.model.LibraryDatasetDatasetAssociation.table ) ] )
-        library_dataset_ids = [ row.id for row in library_dataset_ids_query.execute() ]
-        print "Time to retrieve ", len( library_dataset_ids ), " library dataset ids: ", time.clock() - start
+    history_dataset_ids = [ row.id for row in history_dataset_ids_query.execute() ]
+    library_dataset_ids = [ row.id for row in library_dataset_ids_query.execute() ]
     dataset_ids = history_dataset_ids + library_dataset_ids
     skip = []
     deleted_dataset_count = 0
     deleted_instance_count = 0
     for dataset_id in dataset_ids:
-        print "Processing dataset id:", dataset_id, "\n"
-        dataset = app.model.Dataset.get( id )
+        print "######### Processing dataset id:", dataset_id
+        dataset = app.model.Dataset.get( dataset_id )
         if dataset.id not in skip and _dataset_is_deletable( dataset ):
            deleted_dataset_count += 1
            for dataset_instance in dataset.history_associations + dataset.library_associations:
-                print "Associated Dataset instance: ", dataset_instance.__class__.__name__, dataset_instance.id, "\n"
+                print "Associated Dataset instance: ", dataset_instance.__class__.__name__, dataset_instance.id
                _purge_dataset_instance( dataset_instance, app, remove_from_disk, include_children=True, info_only=info_only, is_deletable=True )
                deleted_instance_count += 1
        skip.append( dataset.id )
-        print "Time to process dataset id: ", dataset.id, " - ", time.clock() - start, "\n\n"
-    print "Time to mark datasets deleted: ", time.clock() - start, "\n\n"
-    print "Examined %d datasets, marked %d as deleted and purged %d dataset instances\n" % ( len( skip ), deleted_dataset_count, deleted_instance_count )
-    print "Total elapsed time: ", time.clock() - start, "\n"
-    print "######### Finished delete_datasets #########\n"
+    stop = time.time()
+    print "Examined %d datasets, marked %d as deleted and purged %d dataset instances" % ( len( skip ), deleted_dataset_count, deleted_instance_count )
+    print "Total elapsed time: ", stop - start
+    print "##########################################"
 
 def purge_datasets( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
     # Purges deleted datasets whose update_time is older than cutoff_time.  Files may or may
     # not be removed from disk.
     dataset_count = 0
     disk_space = 0
-    print '# The following deleted datasets have been purged'
-    start = time.clock()
+    start = time.time()
     if force_retry:
         datasets = app.model.Dataset.filter( and_( app.model.Dataset.table.c.deleted==True,
                                                    app.model.Dataset.table.c.purgable==True,
@@ -276,24 +269,25 @@
             disk_space += file_size
         except:
             pass
-    stop = time.clock()
-    print '# %d datasets purged\n' % dataset_count
+    stop = time.time()
+    print 'Purged %d datasets' % dataset_count
     if remove_from_disk:
-        print '# Freed disk space: ', disk_space, '\n'
-    print "Elapsed time: ", stop - start, "\n"
-
+        print 'Freed disk space: ', disk_space
+    print "Elapsed time: ", stop - start
+    print "##########################################"
 
 def _purge_dataset_instance( dataset_instance, app, remove_from_disk, include_children=True, info_only=False, is_deletable=False ):
     # A dataset_instance is either a HDA or an LDDA.  Purging a dataset instance marks the instance as deleted,
     # and marks the associated dataset as deleted if it is not associated with another active DatsetInstance.
     if not info_only:
+        print "Deleting dataset_instance ", str( dataset_instance ), " id ", dataset_instance.id
         dataset_instance.mark_deleted( include_children = include_children )
         dataset_instance.clear_associated_files()
         dataset_instance.flush()
         dataset_instance.dataset.refresh()
     if is_deletable or _dataset_is_deletable( dataset_instance.dataset ):
         # Calling methods may have already checked _dataset_is_deletable, if so, is_deletable should be True
-        _delete_dataset( dataset_instance.dataset, app, remove_from_disk, info_only = info_only )
+        _delete_dataset( dataset_instance.dataset, app, remove_from_disk, info_only=info_only, is_deletable=is_deletable )
     #need to purge children here
     if include_children:
         for child in dataset_instance.children:
@@ -303,11 +297,11 @@
     #a dataset is deletable when it no longer has any non-deleted associations
     return not bool( dataset.active_history_associations or dataset.active_library_associations )
 
-def _delete_dataset( dataset, app, remove_from_disk, info_only = False ):
+def _delete_dataset( dataset, app, remove_from_disk, info_only=False, is_deletable=False ):
     #marks a base dataset as deleted, hdas/ldas associated with dataset can no longer be undeleted
     #metadata files attached to associated dataset Instances is removed now
-    if not _dataset_is_deletable( dataset ):
-        print "# This Dataset (%i) is not deletable, associated Metadata Files will not be removed.\n" % ( dataset.id )
+    if not is_deletable and not _dataset_is_deletable( dataset ):
+        print "This Dataset (%i) is not deletable, associated Metadata Files will not be removed.\n" % ( dataset.id )
     else:
         # Mark all associated MetadataFiles as deleted and purged and remove them from disk
         metadata_files = []
@@ -319,18 +313,19 @@
             for metadata_file in app.model.MetadataFile.filter( app.model.MetadataFile.table.c.lda_id==lda.id ).all():
                 metadata_files.append( metadata_file )
         for metadata_file in metadata_files:
-            print "# The following metadata files attached to associations of Dataset '%s' have been purged:" % dataset.id
+            print "The following metadata files attached to associations of Dataset '%s' have been purged:" % dataset.id
             if not info_only:
                 if remove_from_disk:
                     try:
+                        print "Removing disk file ", metadata_file.file_name
                        os.unlink( metadata_file.file_name )
                    except Exception, e:
-                        print "# Error, exception: %s caught attempting to purge metadata file %s\n" %( str( e ), metadata_file.file_name )
+                        print "Error, exception: %s caught attempting to purge metadata file %s\n" %( str( e ), metadata_file.file_name )
                 metadata_file.purged = True
                 metadata_file.deleted = True
                 #metadata_file.flush()
             print "%s" % metadata_file.file_name
-        print
+        print "Deleting dataset id", dataset.id
         dataset.deleted = True
         app.model.flush()
 
@@ -338,32 +333,34 @@
     if dataset.deleted:
         try:
             if dataset.purgable and _dataset_is_deletable( dataset ):
-                print "%s" % dataset.file_name
                 if not info_only:
                     # Remove files from disk and update the database
                     if remove_from_disk:
                         # TODO: should permissions on the dataset be deleted here?
+                        print "Removing disk, file ", dataset.file_name
                         os.unlink( dataset.file_name )
                         # Remove associated extra files from disk if they exist
                         if dataset.extra_files_path and os.path.exists( dataset.extra_files_path ):
                             shutil.rmtree( dataset.extra_files_path ) #we need to delete the directory and its contents; os.unlink would always fail on a directory
+                    print "Purging dataset id", dataset.id
                     dataset.purged = True
                    dataset.flush()
             else:
-                print "# This dataset (%i) is not purgable, the file (%s) will not be removed.\n" % ( dataset.id, dataset.file_name )
+                print "This dataset (%i) is not purgable, the file (%s) will not be removed.\n" % ( dataset.id, dataset.file_name )
         except OSError, exc:
-            print "# Error, file has already been removed: %s" % str( exc )
+            print "Error, dataset file has already been removed: %s" % str( exc )
+            print "Purging dataset id", dataset.id
             dataset.purged = True
             dataset.flush()
         except Exception, exc:
-            print "# Error, exception: %s caught attempting to purge %s\n" %( str( exc ), dataset.file_name )
+            print "Error attempting to purge data file: ", dataset.file_name, " error: ", str( exc )
     else:
-        print "# Error: '%s' has not previously been deleted, so it cannot be purged\n" % dataset.file_name
-        print ""
+        print "Error: '%s' has not previously been deleted, so it cannot be purged\n" % dataset.file_name
 
 def _purge_folder( folder, app, remove_from_disk, info_only = False ):
     """Purges a folder and its contents, recursively"""
     for ld in folder.datasets:
+        print "Deleting library dataset id ", ld.id
         ld.deleted = True
         for ldda in [ld.library_dataset_dataset_association] + ld.expired_datasets:
             _purge_dataset_instance( ldda, app, remove_from_disk, info_only = info_only ) #mark a DatasetInstance as deleted, clear associated files, and mark the Dataset as deleted if it is deletable
@@ -371,6 +368,7 @@
         _purge_folder( sub_folder, app, remove_from_disk, info_only = info_only )
     if not info_only:
         # TODO: should the folder permissions be deleted here?
+        print "Purging folder id ", folder.id
         folder.purged = True
         folder.flush()
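
Notes on the changes above:

Every timing pair in the script moves from time.clock() to time.time(). On Unix, time.clock() returns CPU time, which badly under-reports a cleanup pass that spends most of its wall-clock life waiting on the database and the filesystem; time.time() returns wall-clock seconds. A minimal sketch of the idiom the script now uses (Python 2, matching the script's 2.4+ requirement; the sleep is only a stand-in for real work):

    import time

    start = time.time()
    # Stand-in for a slow, I/O-bound cleanup pass.  On Unix, time.clock()
    # would report nearly zero here because almost no CPU time is consumed.
    time.sleep( 2 )
    stop = time.time()
    print "Elapsed time: ", stop - start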
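The delete_datasets() rewrite also stops loading full ORM objects up front: candidate ids are harvested with a table-level select over an outer join (hence the new `import sqlalchemy as sa` at the top of the script), and each Dataset is then fetched individually via Dataset.get(). A self-contained sketch of that query shape against throwaway tables -- the table and column names here are illustrative, not Galaxy's schema, and the select( columns, whereclause=..., from_obj=... ) signature is the SQLAlchemy 0.4/0.5-era API the script targets:

    import sqlalchemy as sa

    metadata = sa.MetaData()
    dataset = sa.Table( 'dataset', metadata,
                        sa.Column( 'id', sa.Integer, primary_key=True ),
                        sa.Column( 'state', sa.String( 64 ) ),
                        sa.Column( 'deleted', sa.Boolean, default=False ) )
    hda = sa.Table( 'history_dataset_association', metadata,
                    sa.Column( 'id', sa.Integer, primary_key=True ),
                    sa.Column( 'dataset_id', sa.Integer, sa.ForeignKey( 'dataset.id' ) ),
                    sa.Column( 'deleted', sa.Boolean, default=False ) )
    engine = sa.create_engine( 'sqlite://' )
    metadata.create_all( engine )

    # Same shape as the script's query: select ( id, state ) from dataset
    # outer-joined to its association table, filtering on association columns.
    ids_query = sa.select( ( dataset.c.id, dataset.c.state ),
                           whereclause = sa.and_( dataset.c.deleted == False,
                                                  hda.c.deleted == True ),
                           from_obj = [ sa.outerjoin( dataset, hda ) ] )
    dataset_ids = [ row.id for row in engine.execute( ids_query ) ]

Selecting only ids and hydrating objects one at a time keeps memory flat on large tables, at the cost of one extra query per dataset.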
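Two smaller fixes ride along: dataset = app.model.Dataset.get( id ) was passing the id builtin instead of the loop variable and now reads get( dataset_id ), and _delete_dataset() gained an is_deletable flag so a caller that has already run _dataset_is_deletable() can say so instead of forcing a second check. A stripped-down sketch of that flag pass-through, using a stub class rather than Galaxy's model (all names here are illustrative):

    def _dataset_is_deletable( dataset ):
        # Deletable when no non-deleted associations remain.
        return not ( dataset.active_history_associations or dataset.active_library_associations )

    def _delete_dataset( dataset, is_deletable=False ):
        # Trust the caller's verdict when is_deletable is True; otherwise re-check.
        if not is_deletable and not _dataset_is_deletable( dataset ):
            print "Dataset %i is not deletable" % dataset.id
            return
        dataset.deleted = True

    class _StubDataset( object ):
        # Minimal stand-in exposing just the attributes the helpers touch.
        def __init__( self, id ):
            self.id = id
            self.deleted = False
            self.active_history_associations = []
            self.active_library_associations = []

    ds = _StubDataset( 42 )
    _delete_dataset( ds, is_deletable=True )   # caller already checked; no re-check
    print ds.deleted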