commit/galaxy-central: carlfeberhard: DataProviders: add hierarchy module and xml provider, add provider to datatype
1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/bf6bf3b5edfd/ Changeset: bf6bf3b5edfd User: carlfeberhard Date: 2013-08-08 18:59:34 Summary: DataProviders: add hierarchy module and xml provider, add provider to datatype Affected #: 5 files diff -r f46505803e8f0efd26be16f9e18353fbdc969205 -r bf6bf3b5edfdc409f235cb91edcc9d91246df816 lib/galaxy/datatypes/dataproviders/__init__.py --- a/lib/galaxy/datatypes/dataproviders/__init__.py +++ b/lib/galaxy/datatypes/dataproviders/__init__.py @@ -23,6 +23,7 @@ import base import chunk import line +import hierarchy import column import external import dataset diff -r f46505803e8f0efd26be16f9e18353fbdc969205 -r bf6bf3b5edfdc409f235cb91edcc9d91246df816 lib/galaxy/datatypes/dataproviders/decorators.py --- a/lib/galaxy/datatypes/dataproviders/decorators.py +++ b/lib/galaxy/datatypes/dataproviders/decorators.py @@ -134,6 +134,7 @@ Parse the values in `query_kwargs` from strings to the proper types listed in the same key in `settings`. """ + #TODO: this was a relatively late addition: review and re-think def list_from_query_string( s ): # assume csv return s.split( ',' ) @@ -155,10 +156,11 @@ #TODO: this would be the place to sanitize any strings query_value = query_kwargs[ key ] needed_type = settings[ key ] - try: - query_kwargs[ key ] = parsers[ needed_type ]( query_value ) - except ( KeyError, ValueError ): - del query_kwargs[ key ] + if needed_type != 'str': + try: + query_kwargs[ key ] = parsers[ needed_type ]( query_value ) + except ( KeyError, ValueError ): + del query_kwargs[ key ] #TODO:?? do we want to remove query_kwarg entries NOT in settings? return query_kwargs diff -r f46505803e8f0efd26be16f9e18353fbdc969205 -r bf6bf3b5edfdc409f235cb91edcc9d91246df816 lib/galaxy/datatypes/dataproviders/hierarchy.py --- /dev/null +++ b/lib/galaxy/datatypes/dataproviders/hierarchy.py @@ -0,0 +1,142 @@ +""" +Dataproviders that iterate over lines from their sources. +""" + +import line +import xml.etree.ElementTree as elementtree + +_TODO = """ +""" + +import logging +log = logging.getLogger( __name__ ) + + +# ----------------------------------------------------------------------------- hierarchal/tree data providers +class HierarchalDataProvider( line.BlockDataProvider ): + """ + Class that uses formats where a datum may have a parent or children + data. + + e.g. XML, HTML, GFF3, Phylogenetic + """ + def __init__( self, source, **kwargs ): + #TODO: (and defer to better (than I can write) parsers for each subtype) + super( HierarchalDataProvider, self ).__init__( source, **kwargs ) + + +# ----------------------------------------------------------------------------- xml +class XMLDataProvider( HierarchalDataProvider ): + """ + Data provider that converts selected XML elements to dictionaries. + """ + # using elementtree's iterparse method to keep mem down + #TODO: this, however (AFAIK), prevents the use of xpath + settings = { + 'selector' : 'str', #urlencoded + 'max_depth' : 'int', + } + ITERPARSE_ALL_EVENTS = ( 'start', 'end', 'start-ns', 'end-ns' ) + #TODO: move appropo into super + + def __init__( self, source, selector=None, max_depth=None, **kwargs ): + """ + :param selector: some partial string in the desired tags to return + :param max_depth: the number of generations of descendents to return + """ + self.selector = selector + self.max_depth = max_depth + self.namespaces = {} + + super( XMLDataProvider, self ).__init__( source, **kwargs ) + + def matches_selector( self, element, selector=None ): + """ + Returns true if the ``element`` matches the ``selector``. + + :param element: an XML ``ElementTree.Element`` + :param selector: some partial string in the desired tags to return + + Change point for more sophisticated selectors. + """ + # search for partial match of selector to the element tag + #TODO: add more flexibility here w/o re-implementing xpath + #TODO: fails with '#' - browser thinks it's anchor - use urlencode + #TODO: need removal/replacement of etree namespacing here - then move to string match + return bool( ( selector == None ) + or ( isinstance( element, elementtree.Element ) and selector in element.tag ) ) + + def element_as_dict( self, element ): + """ + Converts an XML element (its text, tag, and attributes) to dictionary form. + + :param element: an XML ``ElementTree.Element`` + """ + #TODO: Key collision is unlikely here, but still should be better handled + return { + 'tag' : element.tag, + 'text' : element.text.strip() if element.text else None, + # needs shallow copy to protect v. element.clear() + 'attrib' : dict( element.attrib ) + } + + def get_children( self, element, max_depth=None ): + """ + Yield all children of element (and their children - recursively) + in dictionary form. + :param element: an XML ``ElementTree.Element`` + :param max_depth: the number of generations of descendents to return + """ + if not isinstance( max_depth, int ) or max_depth >= 1: + for child in element.getchildren(): + child_data = self.element_as_dict( child ) + + next_depth = max_depth - 1 if isinstance( max_depth, int ) else None + grand_children = list( self.get_children( child, next_depth ) ) + if grand_children: + child_data[ 'children' ] = grand_children + + yield child_data + + def __iter__( self ): + context = elementtree.iterparse( self.source, events=self.ITERPARSE_ALL_EVENTS ) + context = iter( context ) + + selected_element = None + for event, element in context: + #print 'iterparse, event:', event + #print 'iterparse, element:', element, ( element.tag if hasattr( element, 'tag' ) else '' ) + + if event == 'start-ns': + ns, uri = element + self.namespaces[ ns ] = uri + + elif event == 'start': + if( ( selected_element == None ) + and ( self.matches_selector( element, self.selector ) ) ): + # start tag of selected element - wait for 'end' to emit/yield + selected_element = element + + elif event == 'end': + if( ( selected_element != None ) + and ( element == selected_element ) ): + self.num_valid_data_read += 1 + + # offset + if self.num_valid_data_read > self.offset: + # convert to dict and yield + selected_element_dict = self.element_as_dict( selected_element ) + children = list( self.get_children( selected_element, self.max_depth ) ) + if children: + selected_element_dict[ 'children' ] = children + yield selected_element_dict + + # limit + self.num_data_returned += 1 + if self.limit is not None and self.num_data_returned >= self.limit: + break + + selected_element.clear() + selected_element = None + + self.num_data_read += 1 diff -r f46505803e8f0efd26be16f9e18353fbdc969205 -r bf6bf3b5edfdc409f235cb91edcc9d91246df816 lib/galaxy/datatypes/dataproviders/line.py --- a/lib/galaxy/datatypes/dataproviders/line.py +++ b/lib/galaxy/datatypes/dataproviders/line.py @@ -132,7 +132,7 @@ e.g. Fasta, GenBank, MAF, hg log Note: mem intensive (gathers list of lines before output) """ - def __init__( self, source, new_block_delim_fn, block_filter_fn=None, **kwargs ): + def __init__( self, source, new_block_delim_fn=None, block_filter_fn=None, **kwargs ): """ :param new_block_delim_fn: T/F function to determine whether a given line is the start of a new block. @@ -214,7 +214,7 @@ """ if self.new_block_delim_fn: return self.new_block_delim_fn( line ) - return False + return True # NOTE: # some formats have one block attr per line @@ -251,17 +251,3 @@ if self.block_filter_fn: return self.block_filter_fn( block ) return block - - -# ----------------------------------------------------------------------------- hierarchal/tree data providers -class HierarchalDataProvider( BlockDataProvider ): - """ - Class that uses formats where a datum may have a parent or children - data. - - e.g. XML, HTML, GFF3, Phylogenetic - """ - def __init__( self, source, **kwargs ): - #TODO: (and defer to better (than I can write) parsers for each subtype) - raise NotImplementedError( 'Abstract class' ) - super( HierarchalDataProvider, self ).__init__( source, **kwargs ) diff -r f46505803e8f0efd26be16f9e18353fbdc969205 -r bf6bf3b5edfdc409f235cb91edcc9d91246df816 lib/galaxy/datatypes/xml.py --- a/lib/galaxy/datatypes/xml.py +++ b/lib/galaxy/datatypes/xml.py @@ -4,9 +4,11 @@ import data import logging from galaxy.datatypes.sniff import * +import dataproviders log = logging.getLogger(__name__) +@dataproviders.decorators.has_dataproviders class GenericXml( data.Text ): """Base format class for any XML file.""" file_ext = "xml" @@ -47,6 +49,12 @@ data.Text.merge(split_files, output_file) merge = staticmethod(merge) + @dataproviders.decorators.dataprovider_factory( 'xml', dataproviders.hierarchy.XMLDataProvider.settings ) + def xml_dataprovider( self, dataset, **settings ): + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return dataproviders.hierarchy.XMLDataProvider( dataset_source, **settings ) + + class MEMEXml( GenericXml ): """MEME XML Output data""" file_ext = "memexml" @@ -62,6 +70,7 @@ def sniff( self, filename ): return False + class CisML( GenericXml ): """CisML XML data""" #see: http://www.ncbi.nlm.nih.gov/pubmed/15001475 file_ext = "cisml" @@ -77,6 +86,7 @@ def sniff( self, filename ): return False + class Phyloxml( GenericXml ): """Format for defining phyloxml data http://www.phyloxml.org/""" file_ext = "phyloxml" @@ -105,4 +115,4 @@ Returns a list of visualizations for datatype. """ - return [ 'phyloviz' ] \ No newline at end of file + return [ 'phyloviz' ] Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
commits-noreply@bitbucket.org