[hg] galaxy 1654: Update Tabular.set_meta() to use column types as guessed between rows, when column data is missing
details: http://www.bx.psu.edu/hg/galaxy/rev/f77ec6315c7c changeset: 1654:f77ec6315c7c user: Dan Blankenberg <dan@bx.psu.edu> date: Tue Dec 09 14:53:10 2008 -0500 description: Update Tabular.set_meta() to use column types as guessed between rows, when column data is missing. Also better handling of the presumed header line (first line is treated as a header, but information is retained and used if needed, whereas this line would be previously tossed out.) 2 file(s) affected in this change: lib/galaxy/datatypes/interval.py lib/galaxy/datatypes/tabular.py diffs (212 lines): diff -r 06c63a161985 -r f77ec6315c7c lib/galaxy/datatypes/interval.py --- a/lib/galaxy/datatypes/interval.py Fri Dec 05 11:39:38 2008 -0500 +++ b/lib/galaxy/datatypes/interval.py Tue Dec 09 14:53:10 2008 -0500 @@ -758,6 +758,9 @@ """Initialize interval datatype, by adding UCSC display app""" Tabular.__init__(self, **kwd) self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' ) + def set_readonly_meta( self, dataset, skip=1, **kwd ): + """Resets the values of readonly metadata elements.""" + Tabular.set_readonly_meta( self, dataset, skip = skip, **kwd ) def set_meta( self, dataset, overwrite = True, **kwd ): Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 1 ) def display_peek( self, dataset ): @@ -869,7 +872,11 @@ """Initialize datatype, by adding GBrowse display app""" Tabular.__init__(self, **kwd) self.add_display_app ('elegans', 'display in GBrowse', 'as_gbrowse_display_file', 'gbrowse_links' ) - + + def set_readonly_meta( self, dataset, skip=1, **kwd ): + """Resets the values of readonly metadata elements.""" + Tabular.set_readonly_meta( self, dataset, skip = skip, **kwd ) + def set_meta( self, dataset, overwrite = True, **kwd ): Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 1 ) diff -r 06c63a161985 -r f77ec6315c7c lib/galaxy/datatypes/tabular.py --- a/lib/galaxy/datatypes/tabular.py Fri Dec 05 11:39:38 2008 -0500 +++ 
b/lib/galaxy/datatypes/tabular.py Tue Dec 09 14:53:10 2008 -0500 @@ -23,75 +23,125 @@ def init_meta( self, dataset, copy_from=None ): data.Text.init_meta( self, dataset, copy_from=copy_from ) - def set_readonly_meta( self, dataset, skip=1, **kwd ): + def set_readonly_meta( self, dataset, skip=None, **kwd ): """Resets the values of readonly metadata elements.""" Tabular.set_meta( self, dataset, overwrite = True, skip = skip ) - def set_meta( self, dataset, overwrite = True, skip = 1, **kwd ): + def set_meta( self, dataset, overwrite = True, skip = None, **kwd ): """ Tries to determine the number of columns as well as those columns that contain numerical values in the dataset. A skip parameter is used because various tabular data types reuse this function, and their data type classes are responsible to determine how many invalid - comment lines should be skipped. + comment lines should be skipped. Using None for skip will cause skip + to be zero, but the first line will be processed as a header. 
""" #we treat 'overwrite' as always True (we always want to set tabular metadata when called) + #if a tabular file has no data, it will have one column of type str + + num_check_lines = 100 #we will only check up to this many lines into the file + requested_skip = skip #store original skip value to check with later + if skip is None: + skip = 0 + + column_type_set_order = [ 'int', 'float', 'list', 'str' ] #Order to set column types in + default_column_type = column_type_set_order[-1] # Default column type is lowest in list + column_type_compare_order = list( column_type_set_order ) #Order to compare column types + column_type_compare_order.reverse() + def type_overrules_type( column_type1, column_type2 ): + if column_type1 is None or column_type1 == column_type2: + return False + if column_type2 is None: + return True + for column_type in column_type_compare_order: + if column_type1 == column_type: + return True + if column_type2 == column_type: + return False + #neither column type was found in our ordered list, this cannot happen + raise "Tried to compare unknown column types" + def is_int( column_text ): + try: + int( column_text ) + return True + except: + return False + def is_float( column_text ): + try: + float( column_text ) + return True + except: + if column_text.strip().lower() == 'na': + return True #na is special cased to be a float + return False + def is_list( column_text ): + return "," in column_text + def is_str( column_text ): + #anything, except an empty string, is True + if column_text == "": + return False + return True + is_column_type = {} #Dict to store column type string to checking function + for column_type in column_type_set_order: + is_column_type[column_type] = locals()[ "is_%s" % ( column_type ) ] + def guess_column_type( column_text ): + for column_type in column_type_set_order: + if is_column_type[column_type]( column_text ): + return column_type + return None + + column_types = [] + first_line_column_types = [default_column_type] 
# default value is one column of type str if dataset.has_data(): - column_types = [] - + #NOTE: if skip > num_check_lines, we won't detect any metadata, and will use default for i, line in enumerate( file ( dataset.file_name ) ): - if i < skip: + line = line.rstrip('\r\n') + if i < skip or not line or line.startswith( '#' ): continue - line = line.rstrip('\r\n') - if line and not line.startswith( '#' ): - elems = line.split( '\t' ) - elems_len = len( elems ) - if elems_len > 0: - # Set the columns metadata attribute - if elems_len != dataset.metadata.columns: - dataset.metadata.columns = elems_len - # Set the column_types metadata attribute - for col in range( 0, elems_len ): - col_type = None - val = elems[ col ] - if not val: - if i == 100: - # We're about to end our loop, so default col_type to 'str' - col_type = 'str' - else: - # Missing a column value, so go to the next line - column_types = [] - break - if not col_type and val.find( '.' ) < 0: - try: - int( val ) - col_type = 'int' - except: - pass - if not col_type: - try: - float( val ) - col_type = 'float' - except: - if val.strip().lower() == 'na': - col_type = 'float' - if not col_type: - val_elems = val.split( ',' ) - if len( val_elems ) > 1: - col_type = 'list' - if not col_type: - # All parameters are strings, so this will be the default - col_type = 'str' - if col_type: - column_types.append( col_type ) - else: - # Couldn't determine column type, so go to the next line - column_types = [] - break - if column_types: - break - if i > 100: - break # Hopefully we never get here... 
- dataset.metadata.column_types = column_types + + fields = line.split( '\t' ) + for field_count, field in enumerate( fields ): + if field_count >= len( column_types ): #found a previously unknown column, we append None + column_types.append( None ) + column_type = guess_column_type( field ) + if type_overrules_type( column_type, column_types[field_count] ): + column_types[field_count] = column_type + + if i == 0 and requested_skip is None: + #this is our first line, people seem to like to upload files that have a header line, but do not start with '#' (i.e. all column types would then most likely be detected as str) + #we will assume that the first line is always a header (this was previous behavior - it was always skipped) when the requested skip is None + #we only use the data from the first line if we have no other data for a column + #this is far from perfect, as: + #1,2,3 1.1 2.2 qwerty + #0 0 1,2,3 + #will detect as + #"column_types": ["int", "int", "float", "list"] + #instead of: + #"column_types": ["list", "float", "float", "str"] *** would seem to be the 'Truth' by manual observation that the first line should be included as data + #old method would have detected as: + #"column_types": ["int", "int", "str", "list"] + first_line_column_types = column_types + column_types = [ None for col in first_line_column_types ] + elif ( column_types and None not in column_types ) or i > num_check_lines: + #found and set all known columns, or we exceeded our max check lines + break + + #we error on the larger number of columns + #first we pad our column_types by using data from first line + if len( first_line_column_types ) > len( column_types ): + for column_type in first_line_column_types[len( column_types ):]: + column_types.append( column_type ) + + #Now we fill any unknown (None) column_types with data from first line + for i in range( len( column_types ) ): + if column_types[i] is None: + if first_line_column_types[i] is None: + column_types[i] = 
default_column_type + else: + column_types[i] = first_line_column_types[i] + + dataset.metadata.column_types = column_types + dataset.metadata.columns = len( column_types ) + def make_html_table( self, dataset, skipchars=[] ): """Create HTML table, used for displaying peek""" out = ['<table cellspacing="0" cellpadding="3">']
participants (1)
-
Greg Von Kuster