commit/galaxy-central: carlfeberhard: Dataproviders, column: allow filtering data by parsed column values (numeric, list, and string)
1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/bb9de7bc4656/ Changeset: bb9de7bc4656 User: carlfeberhard Date: 2015-02-09 15:36:43+00:00 Summary: Dataproviders, column: allow filtering data by parsed column values (numeric, list, and string) Affected #: 1 file diff -r 9d795f2108c14bb7b8486be0b949b251cb0fd94c -r bb9de7bc465630eb3f8d055df573a9007f670914 lib/galaxy/datatypes/dataproviders/column.py --- a/lib/galaxy/datatypes/dataproviders/column.py +++ b/lib/galaxy/datatypes/dataproviders/column.py @@ -3,6 +3,9 @@ is further subdivided into multiple data (e.g. columns from a line). """ +import urllib +import re + import line _TODO = """ @@ -34,12 +37,13 @@ 'column_count' : 'int', 'column_types' : 'list:str', 'parse_columns' : 'bool', - 'deliminator' : 'str' + 'deliminator' : 'str', + 'filters' : 'list:str' } def __init__( self, source, indeces=None, column_count=None, column_types=None, parsers=None, parse_columns=True, - deliminator='\t', **kwargs ): + deliminator='\t', filters=None, **kwargs ): """ :param indeces: a list of indeces of columns to gather from each row Optional: will default to `None`. @@ -103,6 +107,104 @@ # overwrite with user desired parsers self.parsers.update( parsers or {} ) + filters = filters or [] + self.column_filters = [] + for filter_ in filters: + parsed = self.parse_filter( filter_ ) + #TODO: might be better to error on bad filter/None here + if callable( parsed ): + self.column_filters.append( parsed ) + + def parse_filter( self, filter_param_str ): + split = filter_param_str.split( '-', 3 ) + if not len( split ) == 3: + return None + column, op, val = split + + # better checking v. len and indeces + column = int( column ) + if column > len( self.column_types ): + return None + if self.column_types[ column ] in ( 'float', 'int' ): + return self.create_numeric_filter( column, op, val ) + if self.column_types[ column ] in ( 'str' ): + return self.create_string_filter( column, op, val ) + if self.column_types[ column ] in ( 'list' ): + return self.create_list_filter( column, op, val ) + return None + + def create_numeric_filter( self, column, op, val ): + """ + Return an anonymous filter function that will be passed the array + of parsed columns. Return None if no filter function can be + created for the given params. + + The function will compare the column at index `column` against `val` + using the given op where op is one of: + lt: less than, le: less than or equal to, + eq: equal to, ne: not equal to, + ge: greather than or equal to, gt: greater than + + `val` is cast as float here and will return None if there's a parsing error. + """ + try: + val = float( val ) + except ValueError: + return None + if 'lt' == op: + return lambda d: d[column] < val + elif 'le' == op: + return lambda d: d[column] <= val + elif 'eq' == op: + return lambda d: d[column] == val + elif 'ne' == op: + return lambda d: d[column] != val + elif 'ge' == op: + return lambda d: d[column] >= val + elif 'gt' == op: + return lambda d: d[column] > val + return None + + def create_string_filter( self, column, op, val ): + """ + Return an anonymous filter function that will be passed the array + of parsed columns. Return None if no filter function can be + created for the given params. + + The function will compare the column at index `column` against `val` + using the given op where op is one of: + eq: exactly matches, + has: the column contains the substring `val`, + re: the column matches the regular expression in `val` + """ + if 'eq' == op: + return lambda d: d[column] == val + elif 'has' == op: + return lambda d: val in d[column] + elif 're' == op: + val = urllib.unquote_plus( val ) + val = re.compile( val ) + return lambda d: val.match( d[column] ) is not None + return None + + def create_list_filter( self, column, op, val ): + """ + Return an anonymous filter function that will be passed the array + of parsed columns. Return None if no filter function can be + created for the given params. + + The function will compare the column at index `column` against `val` + using the given op where op is one of: + eq: the list `val` exactly matches the list in the column, + has: the list in the column contains the sublist `val`, + """ + if 'eq' == op: + val = self.parse_value( val, 'list' ) + return lambda d: d[column] == val + elif 'has' == op: + return lambda d: val in d[column] + return None + def get_default_parsers( self ): """ Return parser dictionary keyed for each columnar type @@ -140,6 +242,39 @@ #'gffstrand': # -, +, ?, or '.' for None, etc. } + def filter( self, line ): + line = super( ColumnarDataProvider, self ).filter( line ) + if line == None: + return line + columns = self.parse_columns_from_line( line ) + return self.filter_by_columns( columns ) + + def parse_columns_from_line( self, line ): + """ + Returns a list of the desired, parsed columns. + :param line: the line to parse + :type line: str + """ + #TODO: too much going on in this loop - the above should all be precomputed AMAP... + all_columns = line.split( self.deliminator ) + # if no indeces were passed to init, return all columns + selected_indeces = self.selected_column_indeces or list( xrange( len( all_columns ) ) ) + parsed_columns = [] + for parser_index, column_index in enumerate( selected_indeces ): + parsed_columns.append( self.parse_column_at_index( all_columns, parser_index, column_index ) ) + return parsed_columns + + def parse_column_at_index( self, columns, parser_index, index ): + """ + Get the column type for the parser from `self.column_types` or `None` + if the type is unavailable. + """ + try: + return self.parse_value( columns[ index ], self.get_column_type( parser_index ) ) + # if a selected index is not within columns, return None + except IndexError, index_err: + return None + def parse_value( self, val, type ): """ Attempt to parse and return the given value based on the given type. @@ -173,51 +308,11 @@ except IndexError, ind_err: return None - def parse_column_at_index( self, columns, parser_index, index ): - """ - Get the column type for the parser from `self.column_types` or `None` - if the type is unavailable. - """ - try: - return self.parse_value( columns[ index ], self.get_column_type( parser_index ) ) - # if a selected index is not within columns, return None - except IndexError, index_err: - return None - - def parse_columns_from_line( self, line ): - """ - Returns a list of the desired, parsed columns. - :param line: the line to parse - :type line: str - """ - #TODO: too much going on in this loop - the above should all be precomputed AMAP... - all_columns = line.split( self.deliminator ) - # if no indeces were passed to init, return all columns - selected_indeces = self.selected_column_indeces or list( xrange( len( all_columns ) ) ) - parsed_columns = [] - for parser_index, column_index in enumerate( selected_indeces ): - parsed_columns.append( self.parse_column_at_index( all_columns, parser_index, column_index ) ) - return parsed_columns - - def __iter__( self ): - parent_gen = super( ColumnarDataProvider, self ).__iter__() - for line in parent_gen: - columns = self.parse_columns_from_line( line ) - yield columns - - #TODO: implement column filters here and not below - flatten hierarchy - -class FilteredByColumnDataProvider( ColumnarDataProvider ): - """ - Data provider that provide a list of columns from the lines of its source - _only_ if they pass a given filter function. - - e.g. column #3 is type int and > N - """ - # TODO: how to do this and still have limit and offset work? - def __init__( self, source, **kwargs ): - raise NotImplementedError() - super( FilteredByColumnDataProvider, self ).__init__( source, **kwargs ) + def filter_by_columns( self, columns ): + for filter_fn in self.column_filters: + if not filter_fn( columns ): + return None + return columns class DictDataProvider( ColumnarDataProvider ): Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
commits-noreply@bitbucket.org