1 new changeset in galaxy-central: http://bitbucket.org/galaxy/galaxy-central/changeset/2a3b27d59bd3/ changeset: r5077:2a3b27d59bd3 user: dannon date: 2011-02-17 18:05:39 summary: Adjustment to dataset line estimation. Estimate is now rounded to only the first two digits, like: ~65,000 lines, to make it clearer that it's an estimate. Will no longer try to estimate for small files, however they ended up without lines set. affected #: 3 files (757 bytes) --- a/lib/galaxy/datatypes/data.py Thu Feb 17 08:45:09 2011 -0500 +++ b/lib/galaxy/datatypes/data.py Thu Feb 17 12:05:39 2011 -0500 @@ -388,7 +388,23 @@ return 'text/plain' def set_meta( self, dataset, **kwd ): """ - Set the number of lines of data in dataset, + Set the number of lines of data in dataset. + """ + dataset.metadata.data_lines = self.count_data_lines(dataset) + def estimate_file_lines( self, dataset ): + """ + Perform a rough estimate by extrapolating number of lines from a small read. + """ + sample_size = 1048576 + dataset_fh = open( dataset.file_name ) + dataset_read = dataset_fh.read(sample_size) + dataset_fh.close() + sample_lines = dataset_read.count('\n') + est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) + return est_lines + def count_data_lines(self, dataset): + """ + Count the number of lines of data in dataset, skipping all blank lines and comments. """ data_lines = 0 @@ -396,16 +412,7 @@ line = line.strip() if line and not line.startswith( '#' ): data_lines += 1 - dataset.metadata.data_lines = data_lines - def estimate_file_lines( self, dataset ): - # Perform a rough estimate by extrapolating number of lines from a small read. - sample_size = 1048576 - dataset_fh = open( dataset.file_name ) - dataset_read = dataset_fh.read(sample_size) - dataset_fh.close() - sample_lines = dataset_read.count('\n') - est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) - return est_lines + return data_lines def set_peek( self, dataset, line_count=None, is_multi_byte=False ): if not dataset.dataset.purged: # The file must exist on disk for the get_file_peek() method @@ -418,8 +425,13 @@ # Number of lines is not known ( this should not happen ), and auto-detect is # needed to set metadata # This can happen when the file is larger than max_optional_metadata_filesize. - est_lines = self.estimate_file_lines(dataset) - dataset.blurb = "~%s %s" % ( util.commaify(str(est_lines)), inflector.cond_plural(est_lines, "line") ) + if int(dataset.get_size()) <= 1048576: + #Small dataset, recount all lines and reset peek afterward. + dataset.metadata.data_lines = self.count_data_lines(dataset) + self.set_peek(dataset) + else: + est_lines = self.estimate_file_lines(dataset) + dataset.blurb = "~%s %s" % ( util.commaify(util.roundify(str(est_lines))), inflector.cond_plural(est_lines, "line") ) else: dataset.blurb = "%s %s" % util.commaify( str(line_count) ), inflector.cond_plural(line_count, "line") else: --- a/lib/galaxy/datatypes/interval.py Thu Feb 17 08:45:09 2011 -0500 +++ b/lib/galaxy/datatypes/interval.py Thu Feb 17 12:05:39 2011 -0500 @@ -71,7 +71,7 @@ else: # Number of lines is not known ( this should not happen ), and auto-detect is # needed to set metadata - dataset.blurb = "~%s regions" % util.commaify( str( self.estimate_file_lines(dataset) ) ) + dataset.blurb = "~%s regions" % util.commaify(util.roundify(str(self.estimate_file_lines(dataset)))) else: dataset.blurb = "%s regions" % util.commaify( str( line_count ) ) else: --- a/lib/galaxy/util/__init__.py Thu Feb 17 08:45:09 2011 -0500 +++ b/lib/galaxy/util/__init__.py Thu Feb 17 12:05:39 2011 -0500 @@ -298,7 +298,16 @@ return new else: return commaify(new) - + +def roundify(amount, sfs = 2): + """ + Take a number in string form and truncate to 'sfs' significant figures. + """ + if len(amount) <= sfs: + return amount + else: + return amount[0:sfs] + '0'*(len(amount) - sfs) + def object_to_string( obj ): return binascii.hexlify( pickle.dumps( obj, 2 ) ) Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.