1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/3778811f053a/ Changeset: 3778811f053a User: carlfeberhard Date: 2013-08-08 21:04:53 Summary: Graph datatype: add data providers for SIF & XGMML; simple graph data structure to util Affected #: 2 files diff -r 2e0abb7f9b04540e616458a90a87191ed98a3ba4 -r 3778811f053a5f71c68408a9e8beb627f6ccdede lib/galaxy/datatypes/graph.py --- a/lib/galaxy/datatypes/graph.py +++ b/lib/galaxy/datatypes/graph.py @@ -2,12 +2,18 @@ Graph content classes. """ -import data, tabular, xml +import data +import tabular +import xml + +import dataproviders +from galaxy.util import simplegraph import logging log = logging.getLogger( __name__ ) +@dataproviders.decorators.has_dataproviders class Xgmml( xml.GenericXml ): """ XGMML graph format @@ -48,7 +54,13 @@ #For one file only, use base class method (move/copy) data.Text.merge( split_files, output_file ) + @dataproviders.decorators.dataprovider_factory( 'node-edge', dataproviders.hierarchy.XMLDataProvider.settings ) + def node_edge_dataprovider( self, dataset, **settings ): + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return XGMMLGraphDataProvider( dataset_source, **settings ) + +@dataproviders.decorators.has_dataproviders class Sif( tabular.Tabular ): """ SIF graph format @@ -75,7 +87,6 @@ """ Determines whether the file is SIF """ - print '---------------------------------------- sniffing Siffing' line = '' with open( filename ) as infile: correct = True @@ -92,6 +103,11 @@ def merge( split_files, output_file ): data.Text.merge( split_files, output_file ) + @dataproviders.decorators.dataprovider_factory( 'node-edge', dataproviders.column.ColumnarDataProvider.settings ) + def node_edge_dataprovider( self, dataset, **settings ): + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return SIFGraphDataProvider( dataset_source, **settings ) + #TODO: we might want to look at rdflib or a similar, larger lib/egg class 
Rdf( xml.GenericXml ): @@ -108,3 +124,75 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + + #TODO: won't be as simple + #@dataproviders.decorators.dataprovider_factory( 'node-edge', dataproviders.column.ColumnarDataProvider.settings ) + #def node_edge_dataprovider( self, dataset, **settings ): + # dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + # return None + + +# ----------------------------------------------------------------------------- graph specific data providers +class XGMMLGraphDataProvider( dataproviders.hierarchy.XMLDataProvider ): + """ + Provide two lists: nodes, edges:: + + 'nodes': contains objects of the form: + { 'id' : <some string id>, 'data': <any extra data> } + 'edges': contains objects of the form: + { 'source' : <an index into nodes>, 'target': <an index into nodes>, 'data': <any extra data> } + """ + def __iter__( self ): + # use simple graph to store nodes and links, later providing them as a dict + # essentially this is a form of aggregation + graph = simplegraph.SimpleGraph() + + parent_gen = super( XGMMLGraphDataProvider, self ).__iter__() + for graph_elem in parent_gen: + if 'children' not in graph_elem: + continue + for elem in graph_elem[ 'children' ]: + # use endswith to work around Elementtree namespaces + if elem[ 'tag' ].endswith( 'node' ): + node_id = elem[ 'attrib' ][ 'id' ] + # pass the entire, parsed xml element as the data + graph.add_node( node_id, **elem ) + + elif elem[ 'tag' ].endswith( 'edge' ): + source_id = elem[ 'attrib' ][ 'source' ] + target_id = elem[ 'attrib' ][ 'target' ] + graph.add_edge( source_id, target_id, **elem ) + + yield graph.as_dict() + + +class SIFGraphDataProvider( dataproviders.column.ColumnarDataProvider ): + """ + Provide two lists: nodes, edges:: + + 'nodes': contains objects of the form: + { 'id' : <some string id>, 'data': <any extra data> } + 'edges': contains objects of the form: + { 'source' : <an index into nodes>, 'target': 
<an index into nodes>, 'data': <any extra data> } + """ + def __iter__( self ): + # use simple graph to store nodes and links, later providing them as a dict + # essentially this is a form of aggregation + graph = simplegraph.SimpleGraph() + # SIF is tabular with the source, link-type, and all targets in the columns + parent_gen = super( SIFGraphDataProvider, self ).__iter__() + for columns in parent_gen: + if columns: + source_id = columns[0] + # there's no extra data for nodes (or links) in the examples I've seen + graph.add_node( source_id ) + + # targets are the (variadic) remaining columns + if len( columns ) >= 3: + relation = columns[1] + targets = columns[2:] + for target_id in targets: + graph.add_node( target_id ) + graph.add_edge( source_id, target_id, type=relation ) + + yield graph.as_dict() diff -r 2e0abb7f9b04540e616458a90a87191ed98a3ba4 -r 3778811f053a5f71c68408a9e8beb627f6ccdede lib/galaxy/util/simplegraph.py --- /dev/null +++ b/lib/galaxy/util/simplegraph.py @@ -0,0 +1,127 @@ +""" +Fencepost-simple graph structure implementation. +""" +# Currently (2013.7.12) only used in easing the parsing of graph datatype data. + +from galaxy.util.odict import odict + + +class SimpleGraphNode( object ): + """ + Node representation. + """ + def __init__( self, index, **data ): + """ + :param index: index of this node in some parent list + :type index: int + :param data: any extra data that needs to be saved + :type data: (variadic dictionary) + """ + # a bit application specific (could be 'id') + self.index = index + self.data = data + + +class SimpleGraphEdge( object ): + """ + Edge representation. 
+ """ + def __init__( self, source_index, target_index, **data ): + """ + :param source_index: index of the edge's source node in some parent list + :type source_index: int + :param target_index: index of the edge's target node in some parent list + :type target_index: int + :param data: any extra data that needs to be saved + :type data: (variadic dictionary) + """ + self.source_index = source_index + self.target_index = target_index + self.data = data + + +class SimpleGraph( object ): + """ + Each node is unique (by id) and stores its own index in the node list/odict. + Each edge is represented as two indices into the node list/odict. + Both nodes and edges allow storing extra information if needed. + + Allows: + multiple edges between two nodes + self referential edges (an edge from a node to itself) + + These graphs are not specifically directed but since source and targets on the + edges are listed - it could easily be used that way. + """ + def __init__( self, nodes=None, edges=None ): + # use an odict so that edge indices actually match the final node list indices + self.nodes = nodes or odict() + self.edges = edges or [] + + def add_node( self, node_id, **data ): + """ + Adds a new node only if it doesn't already exist. + :param node_id: some unique identifier + :type node_id: (hashable) + :param data: any extra data that needs to be saved + :type data: (variadic dictionary) + :returns: the new node (or the existing node if `node_id` is already present) + """ + if node_id in self.nodes: + return self.nodes[ node_id ] + node_index = len( self.nodes ) + new_node = SimpleGraphNode( node_index, **data ) + self.nodes[ node_id ] = new_node + return new_node + + def add_edge( self, source_id, target_id, **data ): + """ + Adds a new edge from source to target, adding either node if it doesn't already exist. 
+ :param source_id: the id of the source node + :type source_id: (hashable) + :param target_id: the id of the target node + :type target_id: (hashable) + :param data: any extra data that needs to be saved for the edge + :type data: (variadic dictionary) + :returns: the new edge + + ..note: that, although this will create new nodes if necessary, there's + no way to pass `data` to them - so if you need to assoc. more data with + the nodes, use `add_node` first. + """ + # adds target_id to source_id's edge list + # adding source_id and/or target_id to nodes if not there already + if source_id not in self.nodes: + self.add_node( source_id ) + if target_id not in self.nodes: + self.add_node( target_id ) + new_edge = SimpleGraphEdge( self.nodes[ source_id ].index, self.nodes[ target_id ].index, **data ) + self.edges.append( new_edge ) + return new_edge + + def gen_node_dicts( self ): + """ + Returns a generator that yields node dictionaries in the form: + { 'id': <the node's unique id>, 'data': <any additional node data> } + """ + for node_id, node in self.nodes.items(): + yield { 'id': node_id, 'data': node.data } + + def gen_edge_dicts( self ): + """ + Returns a generator that yields edge dictionaries in the form: + { + 'source': <the index of the source node in the graph's node list>, + 'target': <the index of the target node in the graph's node list>, + 'data' : <any additional edge data> + } + """ + for edge in self.edges: + yield { 'source': edge.source_index, 'target': edge.target_index, 'data': edge.data } + + def as_dict( self ): + """ + Returns a dictionary of the form + { 'nodes': <a list of node dictionaries>, 'edges': <a list of edge dictionaries> } + """ + return { 'nodes': list( self.gen_node_dicts() ), 'edges': list( self.gen_edge_dicts() ) } Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.