[galaxy-dev] [hg] galaxy 1654: Update Tabular.set_meta() to use column types ...

10 Dec 2008

details:   http://www.bx.psu.edu/hg/galaxy/rev/f77ec6315c7c
changeset: 1654:f77ec6315c7c
user:      Dan Blankenberg <dan@bx.psu.edu>
date:      Tue Dec 09 14:53:10 2008 -0500
description:
Update Tabular.set_meta() to use column types guessed across multiple rows when column data is missing in a given row. Also handle the presumed header line better: the first line is still treated as a header, but its information is now retained and used if needed, whereas previously this line was simply tossed out.

2 file(s) affected in this change:

lib/galaxy/datatypes/interval.py
lib/galaxy/datatypes/tabular.py

diffs (212 lines):

diff -r 06c63a161985 -r f77ec6315c7c lib/galaxy/datatypes/interval.py

--- a/lib/galaxy/datatypes/interval.py	Fri Dec 05 11:39:38 2008 -0500
+++ b/lib/galaxy/datatypes/interval.py	Tue Dec 09 14:53:10 2008 -0500
@@ -758,6 +758,9 @@
         """Initialize interval datatype, by adding UCSC display app"""
         Tabular.__init__(self, **kwd)
         self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
+    def set_readonly_meta( self, dataset, skip=1, **kwd ):
+        """Resets the values of readonly metadata elements."""
+        Tabular.set_readonly_meta( self, dataset, skip = skip, **kwd )
     def set_meta( self, dataset, overwrite = True, **kwd ):
         Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 1 )
     def display_peek( self, dataset ):
@@ -869,7 +872,11 @@
         """Initialize datatype, by adding GBrowse display app"""
         Tabular.__init__(self, **kwd)
         self.add_display_app ('elegans', 'display in GBrowse', 'as_gbrowse_display_file', 'gbrowse_links' )
-
+    
+    def set_readonly_meta( self, dataset, skip=1, **kwd ):
+        """Resets the values of readonly metadata elements."""
+        Tabular.set_readonly_meta( self, dataset, skip = skip, **kwd )
+    
     def set_meta( self, dataset, overwrite = True, **kwd ):
         Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 1 )
     
diff -r 06c63a161985 -r f77ec6315c7c lib/galaxy/datatypes/tabular.py
--- a/lib/galaxy/datatypes/tabular.py	Fri Dec 05 11:39:38 2008 -0500
+++ b/lib/galaxy/datatypes/tabular.py	Tue Dec 09 14:53:10 2008 -0500
@@ -23,75 +23,125 @@
 
     def init_meta( self, dataset, copy_from=None ):
         data.Text.init_meta( self, dataset, copy_from=copy_from )
-    def set_readonly_meta( self, dataset, skip=1, **kwd ):
+    def set_readonly_meta( self, dataset, skip=None, **kwd ):
         """Resets the values of readonly metadata elements."""
         Tabular.set_meta( self, dataset, overwrite = True, skip = skip )
-    def set_meta( self, dataset, overwrite = True, skip = 1, **kwd ):
+    def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
         """
         Tries to determine the number of columns as well as those columns
         that contain numerical values in the dataset.  A skip parameter is
         used because various tabular data types reuse this function, and
         their data type classes are responsible to determine how many invalid
-        comment lines should be skipped.
+        comment lines should be skipped. Using None for skip will cause skip 
+        to be zero, but the first line will be processed as a header.
         """
         #we treat 'overwrite' as always True (we always want to set tabular metadata when called)
+        #if a tabular file has no data, it will have one column of type str
+        
+        num_check_lines = 100 #we will only check up to this many lines into the file
+        requested_skip = skip #store original skip value to check with later
+        if skip is None:
+            skip = 0
+        
+        column_type_set_order = [ 'int', 'float', 'list', 'str'  ] #Order to set column types in
+        default_column_type = column_type_set_order[-1] # Default column type is lowest in list
+        column_type_compare_order = list( column_type_set_order ) #Order to compare column types
+        column_type_compare_order.reverse() 
+        def type_overrules_type( column_type1, column_type2 ):
+            if column_type1 is None or column_type1 == column_type2:
+                return False
+            if column_type2 is None:
+                return True
+            for column_type in column_type_compare_order:
+                if column_type1 == column_type:
+                    return True
+                if column_type2 == column_type:
+                    return False
+            #neither column type was found in our ordered list, this cannot happen
+            raise "Tried to compare unknown column types"
+        def is_int( column_text ):
+            try:
+                int( column_text )
+                return True
+            except: 
+                return False
+        def is_float( column_text ):
+            try:
+                float( column_text )
+                return True
+            except: 
+                if column_text.strip().lower() == 'na':
+                    return True #na is special cased to be a float
+                return False
+        def is_list( column_text ):
+            return "," in column_text
+        def is_str( column_text ):
+            #anything, except an empty string, is True
+            if column_text == "":
+                return False
+            return True
+        is_column_type = {} #Dict to store column type string to checking function
+        for column_type in column_type_set_order:
+            is_column_type[column_type] = locals()[ "is_%s" % ( column_type ) ]
+        def guess_column_type( column_text ):
+            for column_type in column_type_set_order:
+                if is_column_type[column_type]( column_text ):
+                    return column_type
+            return None
+        
+        column_types = []
+        first_line_column_types = [default_column_type] # default value is one column of type str
         if dataset.has_data():
-            column_types = []
- 
+            #NOTE: if skip > num_check_lines, we won't detect any metadata, and will use default
             for i, line in enumerate( file ( dataset.file_name ) ):
-                if i < skip:
+                line = line.rstrip('\r\n')
+                if i < skip or not line or line.startswith( '#' ):
                     continue
-                line = line.rstrip('\r\n')
-                if line and not line.startswith( '#' ):
-                    elems = line.split( '\t' )
-                    elems_len = len( elems )
-                    if elems_len > 0:
-                        # Set the columns metadata attribute
-                        if elems_len != dataset.metadata.columns:
-                            dataset.metadata.columns = elems_len
-                        # Set the column_types metadata attribute
-                        for col in range( 0, elems_len ):
-                            col_type = None
-                            val = elems[ col ]
-                            if not val:
-                                if i == 100:
-                                    # We're about to end our loop, so default col_type to 'str'
-                                    col_type = 'str'
-                                else:
-                                    # Missing a column value, so go to the next line
-                                    column_types = []
-                                    break
-                            if not col_type and val.find( '.' ) < 0:
-                                try:
-                                    int( val )
-                                    col_type = 'int'
-                                except: 
-                                    pass
-                            if not col_type:
-                                try:
-                                    float( val )
-                                    col_type = 'float'
-                                except:
-                                    if val.strip().lower() == 'na':
-                                        col_type = 'float'
-                            if not col_type:
-                                val_elems = val.split( ',' )
-                                if len( val_elems ) > 1:
-                                    col_type = 'list'
-                            if not col_type:
-                                # All parameters are strings, so this will be the default
-                                col_type = 'str'
-                            if col_type:
-                                column_types.append( col_type )
-                            else:
-                                # Couldn't determine column type, so go to the next line
-                                column_types = []
-                                break
-                        if column_types:
-                            break
-                if i > 100:
-                    break # Hopefully we never get here...
-            dataset.metadata.column_types = column_types
+                
+                fields = line.split( '\t' )
+                for field_count, field in enumerate( fields ):
+                    if field_count >= len( column_types ): #found a previously unknown column, we append None
+                        column_types.append( None )
+                    column_type = guess_column_type( field )
+                    if type_overrules_type( column_type, column_types[field_count] ):
+                        column_types[field_count] = column_type
+                
+                if i == 0 and requested_skip is None:
+                    #this is our first line, people seem to like to upload files that have a header line, but do not start with '#' (i.e. all column types would then most likely be detected as str)
+                    #we will assume that the first line is always a header (this was previous behavior - it was always skipped) when the requested skip is None
+                    #we only use the data from the first line if we have no other data for a column
+                    #this is far from perfect, as:
+                    #1,2,3	1.1	2.2	qwerty
+                    #0	0		1,2,3
+                    #will detect as
+                    #"column_types": ["int", "int", "float", "list"]
+                    #instead of:
+                    #"column_types": ["list", "float", "float", "str"]  *** would seem to be the 'Truth' by manual observation that the first line should be included as data
+                    #old method would have detected as:
+                    #"column_types": ["int", "int", "str", "list"]
+                    first_line_column_types = column_types
+                    column_types = [ None for col in first_line_column_types ]
+                elif ( column_types and None not in column_types ) or i > num_check_lines:
+                    #found and set all known columns, or we exceeded our max check lines
+                    break
+        
+        #we error on the larger number of columns
+        #first we pad our column_types by using data from first line
+        if len( first_line_column_types ) > len( column_types ):
+            for column_type in first_line_column_types[len( column_types ):]:
+                column_types.append( column_type )
+        
+        #Now we fill any unknown (None) column_types with data from first line
+        for i in range( len( column_types ) ):
+            if column_types[i] is None:
+                if first_line_column_types[i] is None:
+                    column_types[i] = default_column_type
+                else:
+                    column_types[i] = first_line_column_types[i]
+        
+        dataset.metadata.column_types = column_types
+        dataset.metadata.columns = len( column_types )
+        
     def make_html_table( self, dataset, skipchars=[] ):
         """Create HTML table, used for displaying peek"""
         out = ['<table cellspacing="0" cellpadding="3">']

    

[galaxy-dev] [hg] galaxy 1654: Update Tabular.set_meta() to use column types ...

Greg Von Kuster