details:   http://www.bx.psu.edu/hg/galaxy/rev/aff67d57fd38
changeset: 2771:aff67d57fd38
user:      Greg Von Kuster <greg@bx.psu.edu>
date:      Thu Sep 24 21:17:32 2009 -0400
description:
Improve logging for cleanup_datasets.py script.

1 file(s) affected in this change:

scripts/cleanup_datasets/cleanup_datasets.py

diffs (376 lines):

diff -r 80735c6b6932 -r aff67d57fd38 scripts/cleanup_datasets/cleanup_datasets.py
--- a/scripts/cleanup_datasets/cleanup_datasets.py	Thu Sep 24 19:58:35 2009 -0400
+++ b/scripts/cleanup_datasets/cleanup_datasets.py	Thu Sep 24 21:17:32 2009 -0400
@@ -16,6 +16,7 @@
 from optparse import OptionParser
 
 import galaxy.model.mapping
+import sqlalchemy as sa
 from galaxy.model.orm import and_, eagerload
 
 assert sys.version_info[:2] >= ( 2, 4 )
@@ -26,20 +27,13 @@
     parser.add_option( "-r", "--remove_from_disk", action="store_true", dest="remove_from_disk", help="remove datasets from disk when purged", default=False )
     parser.add_option( "-i", "--info_only", action="store_true", dest="info_only", help="info about the requested action", default=False )
     parser.add_option( "-f", "--force_retry", action="store_true", dest="force_retry", help="performs the requested actions, but ignores whether it might have been done before. Useful when -r wasn't used, but should have been", default=False )
-    parser.add_option( "-1", "--delete_userless_histories", action="store_true", dest="delete_userless_histories", default=False, help="delete userless histories and datasets" )
-    parser.add_option( "-2", "--purge_histories", action="store_true", dest="purge_histories", default=False, help="purge deleted histories" )
-    parser.add_option( "-3", "--purge_datasets", action="store_true", dest="purge_datasets", default=False, help="purge deleted datasets" )
-    parser.add_option( "-4", "--purge_libraries", action="store_true", dest="purge_libraries", default=False, help="purge deleted libraries" )
-    parser.add_option( "-5", "--purge_folders", action="store_true", dest="purge_folders", default=False, help="purge deleted library folders" )
-    parser.add_option( "-6", "--delete_datasets", action="store_true", dest="delete_datasets", default=False, help="mark deletable datasets as deleted and purge associated dataset instances" )
-
-
+
     ( options, args ) = parser.parse_args()
     ini_file = args[0]
@@ -67,14 +61,15 @@
     cutoff_time = datetime.utcnow() - timedelta( days=options.days )
     now = strftime( "%Y-%m-%d %H:%M:%S" )
 
-    print "\n# %s - Handling stuff older than %i days\n" % ( now, options.days )
+    print "##########################################"
+    print "\n# %s - Handling stuff older than %i days" % ( now, options.days )
 
     if options.info_only:
         print "# Displaying info only ( --info_only )\n"
     elif options.remove_from_disk:
-        print "# Datasets will be removed from disk.\n"
+        print "Datasets will be removed from disk.\n"
     else:
-        print "# Datasets will NOT be removed from disk.\n"
+        print "Datasets will NOT be removed from disk.\n"
 
     if options.delete_userless_histories:
         delete_userless_histories( app, cutoff_time, info_only = options.info_only, force_retry = options.force_retry )
@@ -96,8 +91,7 @@
     # The purge history script will handle marking DatasetInstances as deleted.
     # Nothing is removed from disk yet.
     history_count = 0
-    print '# The following datasets and associated userless histories have been deleted'
-    start = time.clock()
+    start = time.time()
     if force_retry:
         histories = app.model.History.filter( and_( app.model.History.table.c.user_id==None,
                                                     app.model.History.table.c.update_time < cutoff_time ) ).all()
@@ -107,14 +101,14 @@
                                                     app.model.History.table.c.update_time < cutoff_time ) ).all()
     for history in histories:
         if not info_only:
+            print "Deleting history id ", history.id
             history.deleted = True
-            print "%d" % history.id
         history_count += 1
     app.model.flush()
-    stop = time.clock()
-    print "# Deleted %d histories.\n" % ( history_count )
-    print "Elapsed time: ", stop - start, "\n"
-
+    stop = time.time()
+    print "Deleted %d histories" % history_count
+    print "Elapsed time: ", stop - start
+    print "##########################################"
 
 def purge_histories( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
     # Purges deleted histories whose update_time is older than the cutoff_time.
@@ -123,17 +117,16 @@
     # history.purged == True simply means that it can no longer be undeleted
     # i.e. all associated datasets are marked as deleted
     history_count = 0
-    print '# The following datasets and associated deleted histories have been purged'
-    start = time.clock()
+    start = time.time()
     if force_retry:
         histories = app.model.History.filter( and_( app.model.History.table.c.deleted==True,
-                                                    app.model.History.table.c.update_time < cutoff_time ) ) \
-                                     .options( eagerload( 'datasets' ) ).all()
+                                                     app.model.History.table.c.update_time < cutoff_time ) ) \
+                                      .options( eagerload( 'datasets' ) ).all()
     else:
         histories = app.model.History.filter( and_( app.model.History.table.c.deleted==True,
-                                                    app.model.History.table.c.purged==False,
-                                                    app.model.History.table.c.update_time < cutoff_time ) ) \
-                                     .options( eagerload( 'datasets' ) ).all()
+                                                     app.model.History.table.c.purged==False,
+                                                     app.model.History.table.c.update_time < cutoff_time ) ) \
+                                      .options( eagerload( 'datasets' ) ).all()
     for history in histories:
         for dataset_assoc in history.datasets:
             _purge_dataset_instance( dataset_assoc, app, remove_from_disk, info_only = info_only ) #mark a DatasetInstance as deleted, clear associated files, and mark the Dataset as deleted if it is deletable
@@ -143,13 +136,14 @@
             # if we should ever delete info like this from the db though, so commented out for now...
             #for dhp in history.default_permissions:
             #    dhp.delete()
+            print "Purging history id ", history.id
             history.purged = True
-            print "%d" % history.id
         history_count += 1
     app.model.flush()
-    stop = time.clock()
-    print '# Purged %d histories.' % ( history_count ), '\n'
-    print "Elapsed time: ", stop - start, "\n"
+    stop = time.time()
+    print 'Purged %d histories.' % history_count
+    print "Elapsed time: ", stop - start
+    print "##########################################"
 
 def purge_libraries( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
     # Purges deleted libraries whose update_time is older than the cutoff_time.
@@ -158,8 +152,7 @@
     # library.purged == True simply means that it can no longer be undeleted
     # i.e. all associated LibraryDatasets/folders are marked as deleted
     library_count = 0
-    print '# The following libraries and associated folders have been purged'
-    start = time.clock()
+    start = time.time()
     if force_retry:
         libraries = app.model.Library.filter( and_( app.model.Library.table.c.deleted==True,
                                                     app.model.Library.table.c.update_time < cutoff_time ) ).all()
@@ -170,13 +163,14 @@
     for library in libraries:
         _purge_folder( library.root_folder, app, remove_from_disk, info_only = info_only )
         if not info_only:
+            print "Purging library id ", library.id
             library.purged = True
-            print "%d" % library.id
         library_count += 1
     app.model.flush()
-    stop = time.clock()
-    print '# Purged %d libraries .' % ( library_count ), '\n'
-    print "Elapsed time: ", stop - start, "\n"
+    stop = time.time()
+    print '# Purged %d libraries .' % library_count
+    print "Elapsed time: ", stop - start
+    print "##########################################"
 
 def purge_folders( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
     # Purges deleted folders whose update_time is older than the cutoff_time.
@@ -185,8 +179,7 @@
     # libraryFolder.purged == True simply means that it can no longer be undeleted
     # i.e. all associated LibraryDatasets/folders are marked as deleted
     folder_count = 0
-    print '# The following folders have been purged'
-    start = time.clock()
+    start = time.time()
     if force_retry:
         folders = app.model.LibraryFolder.filter( and_( app.model.LibraryFolder.table.c.deleted==True,
                                                         app.model.LibraryFolder.table.c.update_time < cutoff_time ) ).all()
@@ -196,22 +189,26 @@
                                                         app.model.LibraryFolder.table.c.update_time < cutoff_time ) ).all()
     for folder in folders:
         _purge_folder( folder, app, remove_from_disk, info_only = info_only )
-        print "%d" % folder.id
         folder_count += 1
-    stop = time.clock()
-    print '# Purged %d folders.' % ( folder_count ), '\n'
-    print "Elapsed time: ", stop - start, "\n"
+    stop = time.time()
+    print '# Purged %d folders.' % folder_count
+    print "Elapsed time: ", stop - start
+    print "##########################################"
 
 def delete_datasets( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
-    import sqlalchemy as sa
     # Marks datasets as deleted if associated items are all deleted.
-    print "######### Starting delete_datasets #########\n"
-    start = time.clock()
+    start = time.time()
     if force_retry:
-        history_datasets = app.model.Dataset.options( eagerload( "history_associations" ) ) \
-                                            .filter( app.model.HistoryDatasetAssociation.table.c.update_time < cutoff_time ).all()
-        library_datasets = app.model.Dataset.options( eagerload( "library_associations" ) ) \
-                                            .filter( app.model.LibraryDatasetDatasetAssociation.table.c.update_time < cutoff_time ).all()
+        history_dataset_ids_query = sa.select( ( app.model.Dataset.table.c.id,
+                                                 app.model.Dataset.table.c.state ),
+                                               whereclause = app.model.HistoryDatasetAssociation.table.c.update_time < cutoff_time,
+                                               from_obj = [ sa.outerjoin( app.model.Dataset.table,
+                                                                          app.model.HistoryDatasetAssociation.table ) ] )
+        library_dataset_ids_query = sa.select( ( app.model.Dataset.table.c.id,
+                                                 app.model.Dataset.table.c.state ),
+                                               whereclause = app.model.LibraryDatasetDatasetAssociation.table.c.update_time < cutoff_time,
+                                               from_obj = [ sa.outerjoin( app.model.Dataset.table,
+                                                                          app.model.LibraryDatasetDatasetAssociation.table ) ] )
     else:
         # We really only need the id column here, but sqlalchemy barfs when trying to select only 1 column
         history_dataset_ids_query = sa.select( ( app.model.Dataset.table.c.id,
@@ -221,8 +218,6 @@
                                                  app.model.Dataset.table.c.state ),
                                                whereclause = sa.and_( app.model.Dataset.table.c.deleted == False,
                                                                       app.model.HistoryDatasetAssociation.table.c.deleted == True ),
                                               from_obj = [ sa.outerjoin( app.model.Dataset.table,
                                                                          app.model.HistoryDatasetAssociation.table ) ] )
-        history_dataset_ids = [ row.id for row in history_dataset_ids_query.execute() ]
-        print "Time to retrieve ", len( history_dataset_ids ), " history dataset ids: ", time.clock() - start
         library_dataset_ids_query = sa.select( ( app.model.Dataset.table.c.id,
                                                  app.model.Dataset.table.c.state ),
                                                whereclause = sa.and_( app.model.Dataset.table.c.deleted == False,
@@ -230,35 +225,33 @@
                                                                      app.model.LibraryDatasetDatasetAssociation.table.c.deleted == True ),
                                               from_obj = [ sa.outerjoin( app.model.Dataset.table,
                                                                          app.model.LibraryDatasetDatasetAssociation.table ) ] )
-        library_dataset_ids = [ row.id for row in library_dataset_ids_query.execute() ]
-        print "Time to retrieve ", len( library_dataset_ids ), " library dataset ids: ", time.clock() - start
+    history_dataset_ids = [ row.id for row in history_dataset_ids_query.execute() ]
+    library_dataset_ids = [ row.id for row in library_dataset_ids_query.execute() ]
     dataset_ids = history_dataset_ids + library_dataset_ids
     skip = []
     deleted_dataset_count = 0
     deleted_instance_count = 0
     for dataset_id in dataset_ids:
-        print "Processing dataset id:", dataset_id, "\n"
-        dataset = app.model.Dataset.get( id )
+        print "######### Processing dataset id:", dataset_id
+        dataset = app.model.Dataset.get( dataset_id )
         if dataset.id not in skip and _dataset_is_deletable( dataset ):
            deleted_dataset_count += 1
            for dataset_instance in dataset.history_associations + dataset.library_associations:
-                print "Associated Dataset instance: ", dataset_instance.__class__.__name__, dataset_instance.id, "\n"
+                print "Associated Dataset instance: ", dataset_instance.__class__.__name__, dataset_instance.id
                _purge_dataset_instance( dataset_instance, app, remove_from_disk, include_children=True, info_only=info_only, is_deletable=True )
                deleted_instance_count += 1
        skip.append( dataset.id )
-        print "Time to process dataset id: ", dataset.id, " - ", time.clock() - start, "\n\n"
-    print "Time to mark datasets deleted: ", time.clock() - start, "\n\n"
-    print "Examined %d datasets, marked %d as deleted and purged %d dataset instances\n" % ( len( skip ), deleted_dataset_count, deleted_instance_count )
-    print "Total elapsed time: ", time.clock() - start, "\n"
-    print "######### Finished delete_datasets #########\n"
+    stop = time.time()
+    print "Examined %d datasets, marked %d as deleted and purged %d dataset instances" % ( len( skip ), deleted_dataset_count, deleted_instance_count )
+    print "Total elapsed time: ", stop - start
+    print "##########################################"
 
 def purge_datasets( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
     # Purges deleted datasets whose update_time is older than cutoff_time.  Files may or may
     # not be removed from disk.
     dataset_count = 0
     disk_space = 0
-    print '# The following deleted datasets have been purged'
-    start = time.clock()
+    start = time.time()
     if force_retry:
         datasets = app.model.Dataset.filter( and_( app.model.Dataset.table.c.deleted==True,
                                                    app.model.Dataset.table.c.purgable==True,
@@ -276,24 +269,25 @@
             disk_space += file_size
         except:
             pass
-    stop = time.clock()
-    print '# %d datasets purged\n' % dataset_count
+    stop = time.time()
+    print 'Purged %d datasets' % dataset_count
     if remove_from_disk:
-        print '# Freed disk space: ', disk_space, '\n'
-    print "Elapsed time: ", stop - start, "\n"
-
+        print 'Freed disk space: ', disk_space
+    print "Elapsed time: ", stop - start
+    print "##########################################"
 
 def _purge_dataset_instance( dataset_instance, app, remove_from_disk, include_children=True, info_only=False, is_deletable=False ):
     # A dataset_instance is either a HDA or an LDDA.  Purging a dataset instance marks the instance as deleted,
     # and marks the associated dataset as deleted if it is not associated with another active DatsetInstance.
     if not info_only:
+        print "Deleting dataset_instance ", str( dataset_instance ), " id ", dataset_instance.id
         dataset_instance.mark_deleted( include_children = include_children )
         dataset_instance.clear_associated_files()
         dataset_instance.flush()
         dataset_instance.dataset.refresh()
     if is_deletable or _dataset_is_deletable( dataset_instance.dataset ):
         # Calling methods may have already checked _dataset_is_deletable, if so, is_deletable should be True
-        _delete_dataset( dataset_instance.dataset, app, remove_from_disk, info_only = info_only )
+        _delete_dataset( dataset_instance.dataset, app, remove_from_disk, info_only=info_only, is_deletable=is_deletable )
     #need to purge children here
     if include_children:
         for child in dataset_instance.children:
@@ -303,11 +297,11 @@
     #a dataset is deletable when it no longer has any non-deleted associations
     return not bool( dataset.active_history_associations or dataset.active_library_associations )
 
-def _delete_dataset( dataset, app, remove_from_disk, info_only = False ):
+def _delete_dataset( dataset, app, remove_from_disk, info_only=False, is_deletable=False ):
     #marks a base dataset as deleted, hdas/ldas associated with dataset can no longer be undeleted
     #metadata files attached to associated dataset Instances is removed now
-    if not _dataset_is_deletable( dataset ):
-        print "# This Dataset (%i) is not deletable, associated Metadata Files will not be removed.\n" % ( dataset.id )
+    if not is_deletable and not _dataset_is_deletable( dataset ):
+        print "This Dataset (%i) is not deletable, associated Metadata Files will not be removed.\n" % ( dataset.id )
     else:
         # Mark all associated MetadataFiles as deleted and purged and remove them from disk
         metadata_files = []
@@ -319,18 +313,19 @@
             for metadata_file in app.model.MetadataFile.filter( app.model.MetadataFile.table.c.lda_id==lda.id ).all():
                 metadata_files.append( metadata_file )
         for metadata_file in metadata_files:
-            print "# The following metadata files attached to associations of Dataset '%s' have been purged:" % dataset.id
+            print "The following metadata files attached to associations of Dataset '%s' have been purged:" % dataset.id
             if not info_only:
                 if remove_from_disk:
                     try:
+                        print "Removing disk file ", metadata_file.file_name
                        os.unlink( metadata_file.file_name )
                    except Exception, e:
-                        print "# Error, exception: %s caught attempting to purge metadata file %s\n" %( str( e ), metadata_file.file_name )
+                        print "Error, exception: %s caught attempting to purge metadata file %s\n" %( str( e ), metadata_file.file_name )
                 metadata_file.purged = True
                 metadata_file.deleted = True
                 #metadata_file.flush()
             print "%s" % metadata_file.file_name
-        print
+        print "Deleting dataset id", dataset.id
         dataset.deleted = True
         app.model.flush()
 
@@ -338,32 +333,34 @@
     if dataset.deleted:
         try:
             if dataset.purgable and _dataset_is_deletable( dataset ):
-                print "%s" % dataset.file_name
                 if not info_only:
                     # Remove files from disk and update the database
                     if remove_from_disk:
                         # TODO: should permissions on the dataset be deleted here?
+                        print "Removing disk, file ", dataset.file_name
                         os.unlink( dataset.file_name )
                         # Remove associated extra files from disk if they exist
                         if dataset.extra_files_path and os.path.exists( dataset.extra_files_path ):
                             shutil.rmtree( dataset.extra_files_path ) #we need to delete the directory and its contents; os.unlink would always fail on a directory
+                    print "Purging dataset id", dataset.id
                     dataset.purged = True
                    dataset.flush()
             else:
-                print "# This dataset (%i) is not purgable, the file (%s) will not be removed.\n" % ( dataset.id, dataset.file_name )
+                print "This dataset (%i) is not purgable, the file (%s) will not be removed.\n" % ( dataset.id, dataset.file_name )
         except OSError, exc:
-            print "# Error, file has already been removed: %s" % str( exc )
+            print "Error, dataset file has already been removed: %s" % str( exc )
+            print "Purging dataset id", dataset.id
             dataset.purged = True
             dataset.flush()
         except Exception, exc:
-            print "# Error, exception: %s caught attempting to purge %s\n" %( str( exc ), dataset.file_name )
+            print "Error attempting to purge data file: ", dataset.file_name, " error: ", str( exc )
     else:
-        print "# Error: '%s' has not previously been deleted, so it cannot be purged\n" % dataset.file_name
-        print ""
+        print "Error: '%s' has not previously been deleted, so it cannot be purged\n" % dataset.file_name
 
 def _purge_folder( folder, app, remove_from_disk, info_only = False ):
     """Purges a folder and its contents, recursively"""
     for ld in folder.datasets:
+        print "Deleting library dataset id ", ld.id
         ld.deleted = True
         for ldda in [ld.library_dataset_dataset_association] + ld.expired_datasets:
             _purge_dataset_instance( ldda, app, remove_from_disk, info_only = info_only ) #mark a DatasetInstance as deleted, clear associated files, and mark the Dataset as deleted if it is deletable
@@ -371,6 +368,7 @@
         _purge_folder( sub_folder, app, remove_from_disk, info_only = info_only )
     if not info_only:
         # TODO: should the folder permissions be deleted here?
+        print "Purging folder id ", folder.id
         folder.purged = True
         folder.flush()
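
Notes on the changes above:

Every timing pair in the script moves from time.clock() to time.time(). On Unix, time.clock() returns CPU time, which badly under-reports a cleanup pass that spends most of its wall-clock life waiting on the database and the filesystem; time.time() returns wall-clock seconds. A minimal sketch of the idiom the script now uses (Python 2, matching the script's 2.4+ requirement; the sleep is only a stand-in for real work):

    import time

    start = time.time()
    # Stand-in for a slow, I/O-bound cleanup pass.  On Unix, time.clock()
    # would report nearly zero here because almost no CPU time is consumed.
    time.sleep( 2 )
    stop = time.time()
    print "Elapsed time: ", stop - start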
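The delete_datasets() rewrite also stops loading full ORM objects up front: candidate ids are harvested with a table-level select over an outer join (hence the new `import sqlalchemy as sa` at the top of the script), and each Dataset is then fetched individually via Dataset.get(). A self-contained sketch of that query shape against throwaway tables -- the table and column names here are illustrative, not Galaxy's schema, and the select( columns, whereclause=..., from_obj=... ) signature is the SQLAlchemy 0.4/0.5-era API the script targets:

    import sqlalchemy as sa

    metadata = sa.MetaData()
    dataset = sa.Table( 'dataset', metadata,
                        sa.Column( 'id', sa.Integer, primary_key=True ),
                        sa.Column( 'state', sa.String( 64 ) ),
                        sa.Column( 'deleted', sa.Boolean, default=False ) )
    hda = sa.Table( 'history_dataset_association', metadata,
                    sa.Column( 'id', sa.Integer, primary_key=True ),
                    sa.Column( 'dataset_id', sa.Integer, sa.ForeignKey( 'dataset.id' ) ),
                    sa.Column( 'deleted', sa.Boolean, default=False ) )
    engine = sa.create_engine( 'sqlite://' )
    metadata.create_all( engine )

    # Same shape as the script's query: select ( id, state ) from dataset
    # outer-joined to its association table, filtering on association columns.
    ids_query = sa.select( ( dataset.c.id, dataset.c.state ),
                           whereclause = sa.and_( dataset.c.deleted == False,
                                                  hda.c.deleted == True ),
                           from_obj = [ sa.outerjoin( dataset, hda ) ] )
    dataset_ids = [ row.id for row in engine.execute( ids_query ) ]

Selecting only ids and hydrating objects one at a time keeps memory flat on large tables, at the cost of one extra query per dataset.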
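Two smaller fixes ride along: dataset = app.model.Dataset.get( id ) was passing the id builtin instead of the loop variable and now reads get( dataset_id ), and _delete_dataset() gained an is_deletable flag so a caller that has already run _dataset_is_deletable() can say so instead of forcing a second check. A stripped-down sketch of that flag pass-through, using a stub class rather than Galaxy's model (all names here are illustrative):

    def _dataset_is_deletable( dataset ):
        # Deletable when no non-deleted associations remain.
        return not ( dataset.active_history_associations or dataset.active_library_associations )

    def _delete_dataset( dataset, is_deletable=False ):
        # Trust the caller's verdict when is_deletable is True; otherwise re-check.
        if not is_deletable and not _dataset_is_deletable( dataset ):
            print "Dataset %i is not deletable" % dataset.id
            return
        dataset.deleted = True

    class _StubDataset( object ):
        # Minimal stand-in exposing just the attributes the helpers touch.
        def __init__( self, id ):
            self.id = id
            self.deleted = False
            self.active_history_associations = []
            self.active_library_associations = []

    ds = _StubDataset( 42 )
    _delete_dataset( ds, is_deletable=True )   # caller already checked; no re-check
    print ds.deleted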