commit/galaxy-central: carlfeberhard: Dataproviders, column: allow filtering data by parsed column values (numeric, list, and string)

9 Feb 2015

1 new commit in galaxy-central:

https://bitbucket.org/galaxy/galaxy-central/commits/bb9de7bc4656/
Changeset:   bb9de7bc4656
User:        carlfeberhard
Date:        2015-02-09 15:36:43+00:00
Summary:     Dataproviders, column: allow filtering data by parsed column values (numeric, list, and string)
Affected #:  1 file

diff -r 9d795f2108c14bb7b8486be0b949b251cb0fd94c -r bb9de7bc465630eb3f8d055df573a9007f670914 lib/galaxy/datatypes/dataproviders/column.py

--- a/lib/galaxy/datatypes/dataproviders/column.py
+++ b/lib/galaxy/datatypes/dataproviders/column.py
@@ -3,6 +3,9 @@
 is further subdivided into multiple data (e.g. columns from a line).
 """
 
+import urllib
+import re
+
 import line
 
 _TODO = """
@@ -34,12 +37,13 @@
         'column_count'  : 'int',
         'column_types'  : 'list:str',
         'parse_columns' : 'bool',
-        'deliminator'   : 'str'
+        'deliminator'   : 'str',
+        'filters'       : 'list:str'
     }
 
     def __init__( self, source, indeces=None,
             column_count=None, column_types=None, parsers=None, parse_columns=True,
-            deliminator='\t', **kwargs ):
+            deliminator='\t', filters=None, **kwargs ):
         """
         :param indeces: a list of indeces of columns to gather from each row
             Optional: will default to `None`.
@@ -103,6 +107,104 @@
             # overwrite with user desired parsers
             self.parsers.update( parsers or {} )
 
+        filters = filters or []
+        self.column_filters = []
+        for filter_ in filters:
+            parsed = self.parse_filter( filter_ )
+            #TODO: might be better to error on bad filter/None here
+            if callable( parsed ):
+                self.column_filters.append( parsed )
+
+    def parse_filter( self, filter_param_str ):
+        split = filter_param_str.split( '-', 3 )
+        if not len( split ) == 3:
+            return None
+        column, op, val = split
+
+        # better checking v. len and indeces
+        column = int( column )
+        if column > len( self.column_types ):
+            return None
+        if self.column_types[ column ] in ( 'float', 'int' ):
+            return self.create_numeric_filter( column, op, val )
+        if self.column_types[ column ] in ( 'str' ):
+            return self.create_string_filter( column, op, val )
+        if self.column_types[ column ] in ( 'list' ):
+            return self.create_list_filter( column, op, val )
+        return None
+
+    def create_numeric_filter( self, column, op, val ):
+        """
+        Return an anonymous filter function that will be passed the array
+        of parsed columns. Return None if no filter function can be 
+        created for the given params.
+
+        The function will compare the column at index `column` against `val`
+        using the given op where op is one of: 
+            lt: less than, le: less than or equal to,
+            eq: equal to, ne: not equal to,
+            ge: greater than or equal to, gt: greater than
+
+        `val` is cast to float here; this method returns None if that parsing fails.
+        """
+        try:
+            val = float( val )
+        except ValueError:
+            return None
+        if   'lt' == op:
+            return lambda d: d[column] < val
+        elif 'le' == op:
+            return lambda d: d[column] <= val
+        elif 'eq' == op:
+            return lambda d: d[column] == val
+        elif 'ne' == op:
+            return lambda d: d[column] != val
+        elif 'ge' == op:
+            return lambda d: d[column] >= val
+        elif 'gt' == op:
+            return lambda d: d[column] > val
+        return None
+
+    def create_string_filter( self, column, op, val ):
+        """
+        Return an anonymous filter function that will be passed the array
+        of parsed columns. Return None if no filter function can be 
+        created for the given params.
+
+        The function will compare the column at index `column` against `val`
+        using the given op where op is one of: 
+            eq: exactly matches,
+            has: the column contains the substring `val`,
+            re: the column matches the regular expression in `val`
+        """
+        if   'eq' == op:
+            return lambda d: d[column] == val
+        elif 'has' == op:
+            return lambda d: val in d[column]
+        elif 're' == op:
+            val = urllib.unquote_plus( val )
+            val = re.compile( val )
+            return lambda d: val.match( d[column] ) is not None
+        return None
+
+    def create_list_filter( self, column, op, val ):
+        """
+        Return an anonymous filter function that will be passed the array
+        of parsed columns. Return None if no filter function can be 
+        created for the given params.
+
+        The function will compare the column at index `column` against `val`
+        using the given op where op is one of: 
+            eq: the list `val` exactly matches the list in the column,
+            has: the list in the column contains the sublist `val`,
+        """
+        if   'eq' == op:
+            val = self.parse_value( val, 'list' )
+            return lambda d: d[column] == val
+        elif 'has' == op:
+            return lambda d: val in d[column]
+        return None
+
     def get_default_parsers( self ):
         """
         Return parser dictionary keyed for each columnar type
@@ -140,6 +242,39 @@
             #'gffstrand': # -, +, ?, or '.' for None, etc.
         }
 
+    def filter( self, line ):
+        line = super( ColumnarDataProvider, self ).filter( line )
+        if line == None:
+            return line
+        columns = self.parse_columns_from_line( line )
+        return self.filter_by_columns( columns )
+
+    def parse_columns_from_line( self, line ):
+        """
+        Returns a list of the desired, parsed columns.
+        :param line: the line to parse
+        :type line: str
+        """
+        #TODO: too much going on in this loop - the above should all be precomputed AMAP...
+        all_columns = line.split( self.deliminator )
+        # if no indeces were passed to init, return all columns
+        selected_indeces = self.selected_column_indeces or list( xrange( len( all_columns ) ) )
+        parsed_columns = []
+        for parser_index, column_index in enumerate( selected_indeces ):
+            parsed_columns.append( self.parse_column_at_index( all_columns, parser_index, column_index ) )
+        return parsed_columns
+
+    def parse_column_at_index( self, columns, parser_index, index ):
+        """
+        Get the column type for the parser from `self.column_types` or `None`
+        if the type is unavailable.
+        """
+        try:
+            return self.parse_value( columns[ index ], self.get_column_type( parser_index ) )
+        # if a selected index is not within columns, return None
+        except IndexError, index_err:
+            return None
+
     def parse_value( self, val, type ):
         """
         Attempt to parse and return the given value based on the given type.
@@ -173,51 +308,11 @@
         except IndexError, ind_err:
             return None
 
-    def parse_column_at_index( self, columns, parser_index, index ):
-        """
-        Get the column type for the parser from `self.column_types` or `None`
-        if the type is unavailable.
-        """
-        try:
-            return self.parse_value( columns[ index ], self.get_column_type( parser_index ) )
-        # if a selected index is not within columns, return None
-        except IndexError, index_err:
-            return None
-
-    def parse_columns_from_line( self, line ):
-        """
-        Returns a list of the desired, parsed columns.
-        :param line: the line to parse
-        :type line: str
-        """
-        #TODO: too much going on in this loop - the above should all be precomputed AMAP...
-        all_columns = line.split( self.deliminator )
-        # if no indeces were passed to init, return all columns
-        selected_indeces = self.selected_column_indeces or list( xrange( len( all_columns ) ) )
-        parsed_columns = []
-        for parser_index, column_index in enumerate( selected_indeces ):
-            parsed_columns.append( self.parse_column_at_index( all_columns, parser_index, column_index ) )
-        return parsed_columns
-
-    def __iter__( self ):
-        parent_gen = super( ColumnarDataProvider, self ).__iter__()
-        for line in parent_gen:
-            columns = self.parse_columns_from_line( line )
-            yield columns
-
-    #TODO: implement column filters here and not below - flatten hierarchy
-
-class FilteredByColumnDataProvider( ColumnarDataProvider ):
-    """
-    Data provider that provide a list of columns from the lines of its source
-    _only_ if they pass a given filter function.
-
-    e.g. column #3 is type int and > N
-    """
-    # TODO: how to do this and still have limit and offset work?
-    def __init__( self, source, **kwargs ):
-        raise NotImplementedError()
-        super( FilteredByColumnDataProvider, self ).__init__( source, **kwargs )
+    def filter_by_columns( self, columns ):
+        for filter_fn in self.column_filters:
+            if not filter_fn( columns ):
+                return None
+        return columns
 
 
 class DictDataProvider( ColumnarDataProvider ):

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.

    

commits-noreply＠bitbucket.org

tags

participants (1)