details: http://www.bx.psu.edu/hg/galaxy/rev/330bf412f311 changeset: 1616:330bf412f311 user: Greg Von Kuster <greg@bx.psu.edu> date: Mon Nov 10 15:30:07 2008 -0500 description: Fix for purging dataset - add check for pre-history_dataset_association approach to sharing. 1 file(s) affected in this change: scripts/cleanup_datasets/cleanup_datasets.py diffs (122 lines): diff -r 2d7f872ddaf8 -r 330bf412f311 scripts/cleanup_datasets/cleanup_datasets.py --- a/scripts/cleanup_datasets/cleanup_datasets.py Mon Nov 10 14:22:01 2008 -0500 +++ b/scripts/cleanup_datasets/cleanup_datasets.py Mon Nov 10 15:30:07 2008 -0500 @@ -79,7 +79,7 @@ # Provide info about the histories and datasets that will be affected if the delete_userless_histories function is executed. history_count = 0 dataset_count = 0 - where = ( h.table.c.user_id==None ) & ( h.table.c.deleted=='f' ) & ( h.table.c.update_time < cutoff_time ) + where = ( h.table.c.user_id==None ) & ( h.table.c.deleted==False ) & ( h.table.c.update_time < cutoff_time ) histories = h.query().filter( where ).options( eagerload( 'active_datasets' ) ).all() print '# The following datasets and associated userless histories will be deleted' @@ -102,7 +102,7 @@ # The datasets associated with each history are also deleted. Nothing is removed from disk. history_count = 0 dataset_count = 0 - h_where = ( h.table.c.user_id==None ) & ( h.table.c.deleted=='f' ) & ( h.table.c.update_time < cutoff_time ) + h_where = ( h.table.c.user_id==None ) & ( h.table.c.deleted==False ) & ( h.table.c.update_time < cutoff_time ) print '# The following datasets and associated userless histories have been deleted' start = time.clock() @@ -136,7 +136,7 @@ history_count = 0 dataset_count = 0 disk_space = 0 - h_where = ( h.table.c.deleted=='t' ) & ( h.table.c.purged=='f' ) & ( h.table.c.update_time < cutoff_time ) + h_where = ( h.table.c.deleted==True ) & ( h.table.c.purged==False ) & ( h.table.c.update_time < cutoff_time ) print '# The following datasets and associated deleted histories will be purged' start = time.clock() @@ -169,7 +169,7 @@ disk_space = 0 file_size = 0 errors = False - h_where = ( h.table.c.deleted=='t' ) & ( h.table.c.purged=='f' ) & ( h.table.c.update_time < cutoff_time ) + h_where = ( h.table.c.deleted==True ) & ( h.table.c.purged==False ) & ( h.table.c.update_time < cutoff_time ) print '# The following datasets and associated deleted histories have been purged' start = time.clock() @@ -187,7 +187,7 @@ dataset.file_size = 0 if remove_from_disk: dataset.flush() - errmsg = purge_dataset( dataset, m ) + errmsg = purge_dataset( dataset, d, m ) if errmsg: errors = True print errmsg @@ -221,7 +221,7 @@ # Provide info about the datasets that will be affected if the purge_datasets function is executed. dataset_count = 0 disk_space = 0 - where = ( d.table.c.deleted=='t' ) & ( d.table.c.purgable=='t' ) & ( d.table.c.purged=='f' ) & ( d.table.c.update_time < cutoff_time ) + where = ( d.table.c.deleted==True ) & ( d.table.c.purgable==True ) & ( d.table.c.purged==False ) & ( d.table.c.update_time < cutoff_time ) print '# The following deleted datasets will be purged' start = time.clock() @@ -243,7 +243,7 @@ dataset_count = 0 disk_space = 0 file_size = 0 - where = ( d.table.c.deleted=='t' ) & ( d.table.c.purgable=='t' ) & ( d.table.c.purged=='f' ) & ( d.table.c.update_time < cutoff_time ) + where = ( d.table.c.deleted==True ) & ( d.table.c.purgable==True ) & ( d.table.c.purged==False ) & ( d.table.c.update_time < cutoff_time ) print '# The following deleted datasets have been purged' start = time.clock() @@ -251,7 +251,7 @@ for dataset in datasets: file_size = dataset.file_size if remove_from_disk: - errmsg = purge_dataset( dataset, m ) + errmsg = purge_dataset( dataset, d, m ) if errmsg: print errmsg else: @@ -280,23 +280,33 @@ print '# Freed disk space: ', disk_space, '\n' print "Elapsed time: ", stop - start, "\n" -def purge_dataset( dataset, m ): +def purge_dataset( dataset, d, m ): # Removes the file from disk and updates the database accordingly. if dataset.deleted: + purgable = True # Remove files from disk and update the database try: - dataset.purged = True - dataset.file_size = 0 - dataset.flush() - for shared_data in dataset.history_associations: - # Check to see if another dataset is using this file. This happens when a user shares - # their history with another user. In this case, a new record is created in the dataset - # table for each dataset, but the dataset records point to the same data file on disk. So - # if 1 of the 2 users deletes the dataset from their history but the other doesn't, we need - # to keep the dataset on disk for the 2nd user. - if not shared_data.deleted: - break #only purge when not shared - else: + # See if the dataset has been shared + if dataset.external_filename: + # This check handles the pre-history_dataset_association approach to sharing. + shared_data = d.filter( and_( d.table.c.external_filename==dataset.external_filename, d.table.c.deleted==False ) ).all() + if shared_data: + purgable = False + if purgable: + # This check handles the history_dataset_association approach to sharing. + for shared_data in dataset.history_associations: + # Check to see if another dataset is using this file. This happens when a user shares + # their history with another user. In this case, a new record is created in the dataset + # table for each dataset, but the dataset records point to the same data file on disk. So + # if 1 of the 2 users deletes the dataset from their history but the other doesn't, we need + # to keep the dataset on disk for the 2nd user. + if not shared_data.deleted: + purgable = False + break + if purgable: + dataset.purged = True + dataset.file_size = 0 + dataset.flush() # Remove dataset file from disk os.unlink( dataset.file_name ) print "%s" % dataset.file_name