#! /usr/bin/python
"""
Find differences between two RDF graphs, using
functional and inverse functional properties to identify bnodes
in the patch file.

--from=uri      -f uri     from-file
--to=uri        -t uri     file against which to check for differences
--meta=uri      -m uri     file with metadata to be assumed (in addition to schemas)
--help          -h         print this help message
--verbose       -v         verbose mode (two for extra)
--granularity=  -g 0       g=0 - lots of little diffs.
                           g=1, fewer diffs (default)

If from-file but not to-file is given, from-file is smushed and output
Uris are relative to present working directory.

For motivation and explanation, see

$Id: delta.py,v 1.2 2004/11/09 01:08:28 syosi Exp $
http://www.w3.org/2000/10/swap/diff.py
"""

import string, getopt
from sets import Set    # Python2.3 and on
import string
import sys

# http://www.w3.org/2000/10/swap/
try:
    from swap import llyn, diag
    from swap.myStore import loadMany
    from swap.diag import verbosity, setVerbosity, progress
    from swap import notation3    # N3 parsers and generators
    from swap.RDFSink import FORMULA, LITERAL, ANONYMOUS, Logic_NS
    from swap import uripath
    from swap.uripath import base
    from swap.myStore import Namespace
    from swap import myStore
    from swap.notation3 import RDF_NS_URI
    from swap.llyn import Formula, CONTEXT, PRED, SUBJ, OBJ
except ImportError:
    import llyn, diag
    from myStore import loadMany
    from diag import verbosity, setVerbosity, progress
    import notation3    # N3 parsers and generators
    from RDFSink import FORMULA, LITERAL, ANONYMOUS, Logic_NS
    import uripath
    from uripath import base
    from myStore import Namespace
    import myStore
    from notation3 import RDF_NS_URI
    from llyn import Formula, CONTEXT, PRED, SUBJ, OBJ

#daml = Namespace("http://www.daml.org/2001/03/daml+oil#")
OWL = Namespace("http://www.w3.org/2002/07/owl#")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
LOG = Namespace("http://www.w3.org/2000/10/swap/log#")
DELTA = Namespace("http://www.w3.org/2004/delta#")

# Defaults for the module-level switches.  main() overrides them from the
# command line; defining them here lets the functions below be called when
# this file is imported as a library instead of run as a script.
verbose = 0     # 0 = quiet, 1 = trace, 2 = also list every node
lumped = 1      # non-zero: consolidate patches which share a left hand side


def debugBnode(n, f):
    """Report, via progress(), every statement in f which has n as its
    subject or as its object."""
    progress("For node %s" % repr(n))
    for s in f.statementsMatching(subj=n):
        progress(" %s %s; # could be ifp?"
                 % (repr(s.predicate()), repr(s.object())))
    for s in f.statementsMatching(obj=n):
        progress(" is %s of %s; # could be fp?"
                 % (repr(s.predicate()), repr(s.subject())))


def lookUp(predicates, assumptions=None):
    """Look up all the schemas for the predicates given

    predicates   iterable of terms providing uriref()
    assumptions  optional iterable of extra URIs to load as well

    The schema URI of a predicate is the part of its URI before the "#".
    Returns the formula produced by loadMany() over the assumptions plus
    the schema URIs, or a new empty formula if there is nothing to load.
    The assumptions argument is copied, never modified.
    """
    # Work on a private copy: the original added schema URIs to the set it
    # was handed (and to a shared default argument), so every call leaked
    # state into the next one.
    schemas = Set(assumptions or [])
    for pred in predicates:
        if verbose: progress("Predicate: %s" % repr(pred))
        u = pred.uriref()
        hashPos = u.find("#")
        if hashPos < 0:
            if verbose:
                progress("Warning: Predicate <%s> looks like web resource not Property" % u)
        else:
            schemas.add(u[:hashPos])
    if verbose:
        for r in schemas:
            progress("Metadata to be loaded: ", r)
    if schemas:
        return loadMany(list(schemas))
    return myStore.store.newFormula()  # Empty formula


def nailFormula(f, assumptions=None):
    """Smush the formula.

    Work out, for each blank node of f, a definition which identifies it
    in terms of another node, using [inverse] functional properties where
    possible.

    f            the formula to work on; it is reopened and owl:sameAs
                 statements are added to it for equivalent nodes
    assumptions  optional iterable of metadata URIs, passed on to lookUp()

    Returns (bnodes, definitions):
      bnodes       Set of the nodes of f which are generated or are
                   existentials of f
      definitions  list of tuples (x, inverse, pred, y): node x is defined
                   by the statement "y pred x", or "x pred y" when inverse
                   is true

    Raises ValueError when some blank node cannot be given a definition.
    """
    cc, predicates, ss, oo = getParts(f)
    nodes = ss | oo
    bnodes = Set()
    for node in nodes:
        if node.generated() or node in f.existentials():
            bnodes.add(node)
            if verbose >= 2: progress("Blank node: %s" % repr(node))
        else:
            if verbose >= 2: progress("Fixed node: %s" % repr(node))

    meta = lookUp(predicates, assumptions)
    ifps = predicates & Set(meta.each(pred=RDF.type,
                                      obj=OWL.InverseFunctionalProperty))
    fps = predicates & Set(meta.each(pred=RDF.type,
                                     obj=OWL.FunctionalProperty))
    if verbose:
        for p in fps: progress("Functional Property:", p)
        for p in ifps: progress("Inverse Functional: ", p)

    definitions = []
    if not bnodes:
        if verbose: progress("No bnodes in graph")
        return bnodes, definitions

    a = float(len(bnodes)) / len(nodes)
    if verbose: progress("Proportion of bodes which are blank: %f" % a)

    loose = bnodes.copy()   # blank nodes which have no definition yet
    # Sets of nodes which share the same anchor and property.  This relies
    # on sets.Set accepting a Set as a member.
    equivs = Set()

    # Note possible optmization: First pass only like this,
    # future passes work from newNodes.
    while loose:
        newNailed = Set()
        for preds, inverse, char in ((fps, 0, "!"), (ifps, 1, "^")):
            for pred in preds:
                if verbose: progress("Predicate", pred)
                for s in f.statementsMatching(pred=pred):
                    if inverse:
                        y, x = s.object(), s.subject()
                    else:
                        x, y = s.object(), s.subject()
                    if not x.generated():
                        continue  # Only anchor bnodes
                    if y in loose:
                        continue  # y is the possible anchor: must be fixed
                    defi = (x, inverse, pred, y)
                    if x in loose:   # this node
                        if verbose:
                            progress(" Nailed %s as %s%s%s"
                                     % (repr(x), repr(y), repr(char), repr(pred)))
                        loose.discard(x)
                        newNailed.add(x)
                    else:
                        if verbose:
                            progress(" (ignored %s as %s%s%s)"
                                     % (repr(x), repr(y), repr(char), repr(pred)))
                    definitions.append(defi)
                    if inverse:
                        equivalentSet = Set(f.each(obj=y, pred=pred))
                    else:
                        equivalentSet = Set(f.each(subj=y, pred=pred))
                    if len(equivalentSet) > 1:
                        equivs.add(equivalentSet)

        if not newNailed:   # A whole pass without progress: stop
            if verbose:
                progress("Failed to nail nodes:", loose)
                for n in loose:
                    debugBnode(n, f)
            break

    # At this point if we still have loose nodes, we have failed with ifps
    # and fps.  Diff may not be strong.
    # (It might still be: the diffs might not involve weak definitions)
    weak = loose.copy()  # Remember
    if verbose and loose:
        progress("\nFailed to directly nail everything, looking for weak nailings:")
    while loose:
        newNailed = Set()
        if verbose: progress("\nPass: loose = %s" % loose)
        for x in loose.copy():
            if verbose: progress("Checking weakly node %s" % x)
            # First try statements which have x as their object ...
            for s in f.statementsMatching(obj=x):
                pred, y = s.predicate(), s.subject()
                if y in loose:
                    if verbose: progress("Can't nail to loose %s" % y)
                    continue    # Can't nail to something loose
                others = f.each(subj=y, pred=pred)
                # @@ Should ignore redirected equivalent nodes in others
                if len(others) != 1:
                    continue    # Defn would be ambiguous in this graph
                defi = (x, 0, pred, y)
                if verbose:
                    progress(" Weakly-nailed %s as %s%s%s" % (x, y, "!", pred))
                loose.discard(x)
                newNailed.add(x)
                definitions.append(defi)
                break   # progress
            else:
                # ... and only if none of those worked, statements which
                # have x as their subject.
                for s in f.statementsMatching(subj=x):
                    pred, obj = s.predicate(), s.object()
                    if obj in loose:
                        if verbose: progress("Can't nail to loose %s" % obj)
                        continue    # Can't nail to something loose
                    others = f.each(obj=obj, pred=pred)
                    # @@ Should ignore redirected equivalent nodes in others
                    if len(others) != 1:
                        continue    # Defn would be ambiguous in this graph
                    defi = (x, 1, pred, obj)
                    if verbose:
                        progress(" Weakly-nailed %s as %s%s%s"
                                 % (repr(x), repr(obj), "^", repr(pred)))
                    loose.discard(x)
                    newNailed.add(x)
                    definitions.append(defi)
                    break   # progress

        if not newNailed:
            if verbose:
                progress("Failed to even weakly nail nodes:", loose)
                for n in loose:
                    debugBnode(n, f)
            raise ValueError("Graph insufficiently labelled for nodes: %s" % loose)

    if verbose and not weak: progress("Graph is solid.")
    if verbose and weak: progress("Graph is NOT solid.")

    f.reopen()
    for es in equivs:
        if verbose: progress("Equivalent: ", es)
        prev = None
        for x in es:
            # Compare with None explicitly so that a node which happens to
            # test false cannot break the chain of sameAs statements.
            if prev is not None:
                f.add(x, OWL.sameAs, prev)
            prev = x
    return bnodes, definitions


def removeCommon(f, g, match):
    """Find common statements from f and g

    match gives the dictionary mapping bnodes in f to bnodes in g.

    Returns (only_f, only_g): the Set of statements of f for which no
    single corresponding statement was found in g, and the Set of
    statements of g which did not correspond to any statement of f.
    """
    only_f, common_g = Set(), Set()
    for st in f.statements[:]:
        s, p, o = st.spo()
        assert s not in f._redirections
        assert o not in f._redirections
        # Generated nodes are translated through match; an unmatched one
        # yields None, so the statement can only be in f.
        if s.generated():
            sg = match.get(s, None)
        else:
            sg = s
        if o.generated():
            og = match.get(o, None)
        else:
            og = o
        if og is not None and sg is not None:
            gsts = g.statementsMatching(subj=sg, pred=p, obj=og)
            if len(gsts) == 1:
                if verbose: progress("Statement in both", st)
                common_g.add(gsts[0])
                continue
        only_f.add(st)
    return only_f, Set(g.statements) - common_g


def patches(delta, f, only_f, originalBnodes, definitions, deleting=0):
    """Generate patches in patch formula, for the remaining statements in f
    given the bnodes and definitions for f.

    delta           formula to which the patch statements are added
    f               the graph the statements come from
    only_f          Set of statements of f to be patched
    originalBnodes  Set of bnodes treated as already present in the other
                    graph; used only when inserting
    definitions     list of (x, inverse, pred, y) from nailFormula()
    deleting        true: DELTA.deletion patches, else DELTA.insertion

    Raises RuntimeError if a bnode used in a patch has no definition.
    """
    todo = only_f.copy()
    if deleting:
        patchVerb = DELTA.deletion
    else:
        patchVerb = DELTA.insertion
    if verbose: progress("Patch:", patchVerb)

    while todo:
        # find a contiguous subgraph defined in the given graph
        bnodesToDo = Set()
        bnodes = Set()
        rhs = delta.newFormula()
        lhs = delta.newFormula()  # left hand side of patch
        newStatements = Set()
        for seed in todo:
            break  # pick one #@2 fn?
        statementsToDo = Set([seed])
        if verbose: progress("Seed:", seed)
        subgraph = statementsToDo
        # Alternate between collecting the bnodes of the new statements
        # and collecting the not-yet-patched statements of those bnodes,
        # until neither turns up anything new.
        while statementsToDo or bnodesToDo:
            for st in statementsToDo:
                s, p, o = st.spo()
                for x in s, p, o:
                    if x.generated() and x not in bnodes:
                        if verbose: progress(" Bnode ", x)
                        bnodesToDo.add(x)
                        bnodes.add(x)
                rhs.add(s, p, o)
            statementsToDo = Set()
            for x in bnodesToDo:
                ss = (f.statementsMatching(subj=x)
                      + f.statementsMatching(pred=x)
                      + f.statementsMatching(obj=x))
                for z in ss:
                    if z in only_f:
                        newStatements.add(z)
                if verbose:
                    progress(" New statements from %s: %s" % (x, newStatements))
                statementsToDo = statementsToDo | newStatements
                subgraph = subgraph | newStatements
            bnodesToDo = Set()

        if verbose:
            progress("Subgraph of %i statements (%i left):\n\t%s\n"
                     % (len(subgraph), len(todo), subgraph))
        todo = todo - subgraph

        undefined = bnodes.copy()
        for x, inverse, pred, y in definitions:
            if x not in undefined:
                continue
            if inverse:
                s, p, o = x, pred, y
            else:
                s, p, o = y, pred, x
            if deleting or x in originalBnodes:
                # The node is looked up through its definition, which goes
                # on the left hand side of the patch.
                delta.declareUniversal(x)
                lhs.add(subj=s, pred=p, obj=o)
            else:   # inserting a new node
                rhs.declareExistential(x)
            if y.generated():
                undefined.add(y)    # the anchor needs a definition too
            undefined.discard(x)

        if undefined:
            progress("Still haven't defined bnodes %s" % undefined)
            for n in undefined:
                debugBnode(n, f)
            raise RuntimeError("BNodes still undefined", undefined)
        delta.add(subj=lhs.close(), pred=patchVerb, obj=rhs.close())
    return


def consolidate(delta, patchVerb):
    """Consolidate patches

    Where the same left hand side applies to more than 1 RHS formula,
    roll those RHS formulae into one, to make the dif file more readable
    and faster to execute in some implementations
    """
    agenda = {}     # left hand side -> list of its patch statements
    if verbose: progress("Consolidating %s" % patchVerb)
    for s in delta.statementsMatching(pred=patchVerb):
        agenda.setdefault(s.subject(), []).append(s)
    for lhs, patchList in agenda.items():
        if verbose: progress("Patches lhs= %s: %s" % (lhs, patchList))
        if len(patchList) > 1:
            rhs = delta.newFormula()
            for s in patchList:
                delta.store.copyFormula(s.object(), rhs)
                delta.removeStatement(s)
            delta.add(subj=lhs, pred=patchVerb, obj=rhs.close())


def differences(f, g, assumptions):
    """Smush the formulae.  Compare them, generating patch instructions.

    f, g         the "from" and "to" graphs; both are nailed with
                 nailFormula()
    assumptions  iterable of metadata URIs, passed on to nailFormula()

    Returns a formula made with f.newFormula(), holding the deletion and
    insertion patches; it has no statements when the graphs were found to
    have the same statements.

    Raises RuntimeError if a definition matches more than one node of g.
    """
    # Cross-map nodes:
    g_bnodes, g_definitions = nailFormula(g, assumptions)
    bnodes, definitions = nailFormula(f, assumptions)
    definitions.reverse()    # go back down list @@@ reverse the g list too? @@@
    g_definitions.reverse()  # @@ needed for the patch generation

    unmatched = bnodes.copy()
    match = {}  # Mapping of nodes in f to nodes in g
    for x, inverse, pred, y in definitions:
        if x in match:
            continue  # done already
        if x in f._redirections:
            if verbose:
                progress("Redirected %s to %s. Ignoring"
                         % (repr(x), repr(f._redirections[x])))
            unmatched.discard(x)
            continue
        if verbose:
            progress("Definition %s = %s%s%s"
                     % (repr(x), repr(pred), ".^"[inverse], repr(y)))

        if y.generated():
            while y in f._redirections:
                y = f._redirections[y]
                if verbose:
                    progress(" redirected to %s = %s%s%s"
                             % (repr(x), repr(pred), ".^"[inverse], repr(y)))
            yg = match.get(y, None)
            if yg is None:
                if verbose:
                    progress(" Had definition for %s in terms of %s which is not matched"
                             % (repr(x), repr(y)))
                continue
        else:
            yg = y

        if inverse:  # Inverse functional property like ssn
            matches = Set(g.each(obj=yg, pred=pred))
        else:
            matches = Set(g.each(subj=yg, pred=pred))
        if not matches:
            continue  # This is normal - the node does not exist in the other graph
        if len(matches) > 1:
            # The format string used to read "%s: 6s", which made this
            # line raise TypeError instead of the intended RuntimeError.
            raise RuntimeError("More than 1 match for %s: %s"
                               % (repr(x), repr(matches)))
        z = list(matches)[0]    # exactly one member at this point
        if verbose:
            progress("Found match for %s in %s " % (repr(x), repr(z)))
        match[x] = z
        unmatched.discard(x)

    if unmatched:
        if verbose:
            progress("Failed to match all nodes:", unmatched)
            for n in unmatched:
                debugBnode(n, f)

    # Find common parts
    only_f, only_g = removeCommon(f, g, match)

    delta = f.newFormula()
    if not only_f and not only_g:
        return delta

    f = f.close()  # We are not going to mess with them any more
    g = g.close()

    common = Set(match.values())
    if verbose: progress("Comon nodes (as in g)", common)
    patches(delta, f, only_f, Set(), definitions, deleting=1)
    patches(delta, g, only_g, common, g_definitions, deleting=0)
    if lumped:
        # NOTE(review): patches() uses DELTA.insertion / DELTA.deletion as
        # the patch verbs.  Confirm that delta.store.insertion and
        # delta.store.deletion are those same terms; if they are not,
        # consolidate() will not find any patch statements.
        consolidate(delta, delta.store.insertion)
        consolidate(delta, delta.store.deletion)
    return delta


def getParts(f, meta=None):
    """Make lists of all node IDs and arc types

    Returns a list of four Sets.  Each statement s of f contributes
    s[SUBJ], s[PRED] and s[OBJ] to the Sets at indexes SUBJ, PRED and OBJ.
    nailFormula() unpacks the result as (cc, predicates, ss, oo).
    The meta argument is not used.
    """
    values = [Set(), Set(), Set(), Set()]
    for s in f.statements:
        for part in SUBJ, PRED, OBJ:
            values[part].add(s[part])
    return values


def loadFiles(files):
    """Load all the given files into one new formula and return it."""
    graph = myStore.formula()
    graph.setClosureMode("e")    # Implement sameAs by smushing
    graph = myStore.loadMany(files, openFormula=graph)
    if verbose: progress("Loaded", graph, graph.__class__)
    return graph


def usage():
    """Write the module documentation to standard error."""
    sys.stderr.write(__doc__)


def main():
    """Command line entry point; see the module documentation for options.

    Exits with status 2 on a usage error and 0 otherwise.
    """
    testFiles = []
    diffFiles = []
    assumptions = Set()
    global ploughOn  # even if error
    ploughOn = 0
    global verbose
    global lumped
    verbose = 0
    lumped = 1
    try:
        # "g:" - the granularity option takes a value.  Without the colon
        # "-g 0" was parsed as a bare flag followed by a stray argument.
        opts, args = getopt.getopt(sys.argv[1:], "hf:t:m:vg:",
            ["help", "from=", "to=", "meta=", "verbose", "granularity="])
    except getopt.GetoptError:
        # print help information and exit:
        usage()
        sys.exit(2)
    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
            sys.exit()
        if o in ("-v", "--verbose"):
            verbose += 1
        if o in ("-g", "--granularity"):    # used to test for "-l"
            try:
                lumped = int(a)
            except ValueError:
                usage()
                sys.exit(2)
        if o in ("-f", "--from"):
            testFiles.append(a)
        if o in ("-t", "--to"):
            diffFiles.append(a)
        if o in ("-m", "--meta"):
            assumptions.add(a)

    if not testFiles:
        usage()
        sys.exit(2)
    graph = loadFiles(testFiles)
    version = "$Id: delta.py,v 1.2 2004/11/09 01:08:28 syosi Exp $"[1:-1]

    if not diffFiles:
        # Only a from-file: smush it and write it out.
        nailFormula(graph, assumptions)
        if verbose: print("# Smush by " + version)
        print(graph.close().n3String(base=base(), flags="a"))
        sys.exit(0)

    graph2 = loadFiles(diffFiles)
    delta = differences(graph, graph2, assumptions)
    if verbose: print("# Differences by " + version)
    print(delta.close().n3String(base=base()))
    sys.exit(0)  # didn't crash


if __name__ == "__main__":
    main()