#!/usr/local/bin/python
"""
$Id: notation3.py,v 1.166 2004/10/28 17:41:59 timbl Exp $

This module implements basic sources and sinks for RDF data.
It defines a stream interface for such data.
It has a command line interface, can work as a web query engine,
and has a built-in test(), all of which demonstrate how it is used.

To make a new RDF processor, subclass RDFSink.

See also:

Notation 3
http://www.w3.org/DesignIssues/Notation3

Closed World Machine - and RDF Processor
http://www.w3.org/2000/10/swap/cwm

To Do: see also "@@" in comments

Internationalization:
- Decode incoming N3 file as unicode
- Encode outgoing file
- unicode \u (??) escapes in parse
- unicode \u (??) escapes in string output

Note: currently unicode strings work in this code but fail when they
are output into the python debugger interactive window.

______________________________________________

Module originally by Dan Connolly, including notation3 parser and
RDF generator.  TimBL added RDF stream model and N3 generation, and
replaced the stream model with use of the common store/formula API.

DWC: oops... I'm not doing qname expansion as described there
(i.e. adding a # if it's not already there).

I allow unprefixed qnames, so not all barenames are keywords.

---- hmmmm ... not expandable - a bit of a trap.

I haven't done quoting yet.

idea: migrate toward CSS notation?

idea: use notation3 for wiki record keeping.

"""

import types, sys
import string
import codecs    # python 2-ism; for writing utf-8 in RDF/xml output
import urllib
import re

from diag import verbosity, setVerbosity, progress
from uripath import refTo, join
import uripath

import RDFSink
from RDFSink import CONTEXT, PRED, SUBJ, OBJ, PARTS, ALL4
from RDFSink import LITERAL, ANONYMOUS, SYMBOL
from RDFSink import Logic_NS
import diag

from why import BecauseOfData, FormulaReason

N3_forSome_URI = RDFSink.forSomeSym
N3_forAll_URI = RDFSink.forAllSym

# Magic resources we know about

from RDFSink import RDF_type_URI, RDF_NS_URI, DAML_sameAs_URI, parsesTo_URI
from RDFSink import RDF_spec, List_NS, uniqueURI

from decimal import Decimal

ADDED_HASH = "#"    # Stop where we use this in case we want to remove it!
                    # This is the hash on namespace URIs

RDF_type = (SYMBOL, RDF_type_URI)
DAML_sameAs = (SYMBOL, DAML_sameAs_URI)

from RDFSink import N3_first, N3_rest, N3_nil, N3_List, N3_Empty

LOG_implies_URI = "http://www.w3.org/2000/10/swap/log#implies"

INTEGER_DATATYPE = "http://www.w3.org/2001/XMLSchema#integer"
FLOAT_DATATYPE = "http://www.w3.org/2001/XMLSchema#double"
DECIMAL_DATATYPE = "http://www.w3.org/2001/XMLSchema#decimal"

option_noregen = 0    # If set, do not regenerate genids on output

# @@ I18n - the notname chars need extending for well-known unicode
# non-text characters.  The XML spec switched to assuming unknown things
# were name characters.
# _namechars = string.lowercase + string.uppercase + string.digits + '_-'
_notQNameChars = "\t\r\n !\"#$%&'()*.,+/;<=>?@[\\]^`{|}~"  # Assume anything else is a valid qname char :-/
_notNameChars = _notQNameChars + ":"                       # Assume anything else is a valid name char :-/
_rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'

N3CommentCharacter = "#"    # For unix script #! compatibility

########################################## Parse string to sink
#
# Regular expressions:
eol = re.compile(r'[ \t]*(#[^\n]*)?\r?\n')      # end of line, poss. w/comment
eof = re.compile(r'[ \t]*(#[^\n]*)?$')          # end of file, poss. w/comment
ws = re.compile(r'[ \t]*')                      # Whitespace, not including newline
                                                # @@ (what about unicode NL? ask MartinD)
signed_integer = re.compile(r'[-+]?[0-9]+')     # integer
number_syntax = re.compile(r'([-+]?[0-9]+)(\.[0-9]+)?(e[-+]?[0-9]+)?')
digitstring = re.compile(r'[0-9]+')             # Unsigned integer
interesting = re.compile(r'[\\\r\n\"]')
langcode = re.compile(r'[a-zA-Z0-9]+(-[a-zA-Z0-9]+)?')  # "


class SinkParser:
    def __init__(self, store, openFormula=None, thisDoc="", baseURI=None,
                 genPrefix="", metaURI=None, flags="", why=None):
        """note: namespace names should *not* end in #;
        the # will get added during qname processing"""

        self._bindings = {}
        if thisDoc != "":
            assert ':' in thisDoc, "Document URI if any must be absolute: <%s>" % thisDoc
            self._bindings[""] = thisDoc + "#"    # default

        self._store = store
        if genPrefix: store.setGenPrefix(genPrefix)    # pass it on

        self._thisDoc = thisDoc
        self.lines = 0              # for error handling
        self.startOfLine = 0        # For calculating character number
        self._genPrefix = genPrefix
        self.keywords = ['a', 'this', 'bind', 'has', 'is', 'of']
        self.keywordsSet = 0        # When and only when they have been set can others be considered qnames
        self._anonymousNodes = {}   # Dict of anon nodes already declared  ln: Term
        self._reason = why          # Why the parser was asked to parse this
        self._reason2 = None        # Why these triples
        if diag.tracking:
            self._reason2 = BecauseOfData(store.newSymbol(thisDoc), because=self._reason)

        if baseURI:
            self._baseURI = baseURI
        else:
            if thisDoc:
                self._baseURI = thisDoc
            else:
                self._baseURI = None

        assert not self._baseURI or ':' in self._baseURI

        if not self._genPrefix:
            if self._thisDoc:
                self._genPrefix = self._thisDoc + "#_g"
            else:
                self._genPrefix = uniqueURI()

        if openFormula == None:
            if self._thisDoc:
                self._formula = store.newFormula(thisDoc + "#_formula")
            else:
                self._formula = store.newFormula()
        else:
            self._formula = openFormula

        if diag.tracking:
            progress("@@@@@@ notation3 167 loading ", thisDoc, why, self._formula)
            proof = self._formula.collector
            if proof == None:
                proof = FormulaReason(self._formula)
                progress("\tAllocating new FormulaReason")
            progress("\tNow: 169", why, proof, self._formula)

        self._context = self._formula
        self._parentContext = None

        if metaURI:
            # @@ Legacy path, apparently unused.  makeStatement() takes a single
            # (context, pred, subj, obj) quadruple.
            self.makeStatement(((SYMBOL, metaURI),          # relate document to parse tree
                                (SYMBOL, parsesTo_URI),     # pred
                                (SYMBOL, thisDoc),          # subj
                                self._context))             # obj
            self.makeStatement(((SYMBOL, metaURI),          # quantifiers - use inverse?
                                (SYMBOL, N3_forSome_URI),   # pred
                                self._context,              # subj
                                subj))                      # obj  @@ 'subj' is undefined here

    def here(self, i):
        """String generated from position in file

        This is for repeatability when referring people to bnodes in a
        document.  This has diagnostic uses less formally, as it should point
        one to which bnode the arbitrary identifier actually is.  It gives the
        line and character number of the '[' character or path character which
        introduced the blank node.  The first blank node is boringly _L1C1.

        It used to be used only for tracking, but for tests in general it
        makes the canonical ordering of bnodes repeatable."""

        # if not diag.tracking: return None
        return "%s_L%iC%i" % (self._genPrefix, self.lines, i - self.startOfLine + 1)

    def formula(self):
        return self._formula

    def loadStream(self, stream):
        return self.loadBuf(stream.read())    # Not ideal

    def loadBuf(self, buf):
        """Parses a buffer and returns its top level formula"""
        self.startDoc()
        self.feed(buf)
        return self.endDoc()    # self._formula

    def feed(self, octets):
        """Feed an octet stream to the parser

        if BadSyntax is raised, the string passed in the exception object
        is the remainder after any statements have been parsed.
So if there is more data to feed to the parser, it should be straightforward to recover.""" str = octets.decode('utf-8') i = 0 while i >= 0: j = self.skipSpace(str, i) if j<0: return i = self.directiveOrStatement(str,j) if i<0: print "# next char: ", `str[j]` raise BadSyntax(self._thisDoc, self.lines, str, j, "expected directive or statement") def directiveOrStatement(self, str,h): i = self.skipSpace(str, h) if i<0: return i # EOF j = self.directive(str, i) if j>=0: return self.checkDot(str,j) j = self.statement(str, i) if j>=0: return self.checkDot(str,j) return j #@@I18N global _notNameChars #_namechars = string.lowercase + string.uppercase + string.digits + '_-' def tok(self, tok, str, i): """Check for keyword. Space must have been stripped on entry and we must not be at end of file.""" assert tok[0] not in _notNameChars # not for punctuation whitespace = '\t\n\x0b\x0c\r ' # string.whitespace was '\t\n\x0b\x0c\r \xa0' not ascii if str[i:i+1] == "@": i = i+1 else: if tok not in self.keywords: return -1 # Nope, this has neither keywords declaration nor "@" if (str[i:i+len(tok)] == tok and (str[i+len(tok)] in _notQNameChars )): i = i + len(tok) return i else: return -1 def directive(self, str, i): j = self.skipSpace(str, i) if j<0: return j # eof res = [] j = self.tok('bind', str, i) # implied "#". Obsolete. if j>0: raise BadSyntax(self._thisDoc, self.lines, str, i, "keyword bind is obsolete: use @prefix") j = self.tok('keywords', str, i) if j>0: i = self.commaSeparatedList(str, j, res, self.bareWord) if i < 0: raise BadSyntax(self._thisDoc, self.lines, str, i, "'@keywords' needs comma separated list of words") self.setKeywords(res[:]) if diag.chatty_flag > 80: progress("Keywords ", self.keywords) return i j = self.tok('forAll', str, i) if j > 0: i = self.commaSeparatedList(str, j, res, self.uri_ref2) if i <0: raise BadSyntax(self._thisDoc, self.lines, str, i, "Bad variable list after @forAll") for x in res: self._context.declareUniversal(x) return i j = self.tok('forSome', str, i) if j > 0: i = self. commaSeparatedList(str, j, res, self.uri_ref2) if i <0: raise BadSyntax(self._thisDoc, self.lines, str, i, "Bad variable list after @forSome") for x in res: self._context.declareExistential(x) return i j=self.tok('prefix', str, i) # no implied "#" if j<0: return -1 t = [] i = self.qname(str, j, t) if i<0: raise BadSyntax(self._thisDoc, self.lines, str, j, "expected qname after @prefix") j = self.uri_ref2(str, i, t) if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "expected after @prefix _qname_") if isinstance(t[1], types.TupleType): ns = t[1][1] # old system for --pipe else: ns = t[1].uriref() if self._baseURI: ns = join(self._baseURI, ns) else: assert ":" in ns, "With no base URI, cannot handle relative URI" assert ':' in ns # must be absolute self._bindings[t[0][0]] = ns self.bind(t[0][0], hexify(ns)) return j def bind(self, qn, uri): assert isinstance(uri, types.StringType), "Any unicode must be %x-encoded already" if qn == "": self._store.setDefaultNamespace(uri) else: self._store.bind(qn, uri) def setKeywords(self, k): "Takes a list of strings" if k == None: self.keywordsSet = 0 else: self.keywords = k self.keywordsSet = 1 def startDoc(self): self._store.startDoc() def endDoc(self): """Signal end of document and stop parsing. 
returns formula""" self._store.endDoc(self._formula) # don't canonicalize yet return self._formula def makeStatement(self, quadruple): # print "# Parser output: ", `triple` self._store.makeStatement(quadruple, why=self._reason2) def statement(self, str, i): r = [] i = self.object(str, i, r) # Allow literal for subject - This extends RDF model if i<0: return i j = self.property_list(str, i, r[0]) if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "expected propertylist") return j def subject(self, str, i, res): return self.item(str, i, res) def verb(self, str, i, res): """ has _prop_ is _prop_ of a = _prop_ >- prop -> <- prop -< _operator_""" j = self.skipSpace(str, i) if j<0:return j # eof r = [] j = self.tok('has', str, i) if j>=0: i = self.prop(str, j, r) if i < 0: raise BadSyntax(self._thisDoc, self.lines, str, j, "expected property after 'has'") res.append(('->', r[0])) return i j = self.tok('is', str, i) if j>=0: i = self.prop(str, j, r) if i < 0: raise BadSyntax(self._thisDoc, self.lines, str, j, "expected after 'is'") j = self.skipSpace(str, i) if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "EOF found, expected property after 'is'") return j # eof i=j j = self.tok('of', str, i) if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "expected 'of' after 'is' ") res.append(('<-', r[0])) return j j = self.tok('a', str, i) if j>=0: res.append(('->', RDF_type)) return j if str[i:i+2] == "<=": res.append(('<-', self._store.newSymbol(Logic_NS+"implies"))) return i+2 if str[i:i+1] == "=": if str[i+1:i+2] == ">": res.append(('->', self._store.newSymbol(Logic_NS+"implies"))) return i+2 res.append(('->', DAML_sameAs)) return i+1 if str[i:i+2] == ":=": res.append(('->', Logic_NS+"becomes")) # patch file relates two formulae, uses this return i+2 j = self.prop(str, i, r) if j >= 0: res.append(('->', r[0])) return j if str[i:i+2] == ">-" or str[i:i+2] == "<-": raise BadSyntax(self._thisDoc, self.lines, str, j, ">- ... -> syntax is obsolete.") return -1 def prop(self, str, i, res): return self.item(str, i, res) def item(self, str, i, res): return self.path(str, i, res) def path(self, str, i, res): """Parse the path production. """ j = self.nodeOrLiteral(str, i, res) if j<0: return j # nope while str[j:j+1] in "!^.": # no spaces, must follow exactly (?) ch = str[j:j+1] # @@ Allow "." followed IMMEDIATELY by a node. if ch == ".": ahead = str[j+1:j+2] if not ahead or (ahead in _notNameChars and ahead not in ":?<[{("): break subj = res.pop() obj = self._store.newBlankNode(self._context, uri=self.here(j), why=self._reason2) j = self.node(str, j+1, res) if j<0: raise BadSyntax(self._thisDoc, self.lines, str, j, "EOF found in middle of path syntax") pred = res.pop() if ch == "^": # Reverse traverse self.makeStatement((self._context, pred, obj, subj)) else: self.makeStatement((self._context, pred, subj, obj)) res.append(obj) return j def anonymousNode(self, ln): """Remember or generate a term for one of these _: anonymous nodes""" term = self._anonymousNodes.get(ln, None) if term != None: return term term = self._store.newBlankNode(self._context, why=self._reason2) # term = self._store.newExistential(self._context, self._genPrefix + ln, why=self._reason2) self._anonymousNodes[ln] = term return term def node(self, str, i, res, subjectAlready=None): """Parse the production. Space is now skipped once at the beginning instead of in multipe calls to self.skipSpace(). 
""" subj = subjectAlready j = self.skipSpace(str,i) if j<0: return j #eof i=j ch = str[i:i+1] # Quick 1-character checks first: if ch == "[": bnodeID = self.here(i) j=self.skipSpace(str,i+1) if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "EOF after '['") if str[j:j+1] == "=": # Hack for "is" binding name to anon node i = j+1 objs = [] j = self.objectList(str, i, objs); if j>=0: subj = objs[0] if len(objs)>1: for obj in objs: self.makeStatement((self._context, DAML_sameAs, subj, obj)) j = self.skipSpace(str, j) if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "EOF when objectList expected after [ = ") if str[j:j+1] == ";": j=j+1 else: raise BadSyntax(self._thisDoc, self.lines, str, i, "objectList expected after [= ") if subj is None: subj=self._store.newBlankNode( self._context,uri= bnodeID, why=self._reason2) i = self.property_list(str, j, subj) if i<0: raise BadSyntax(self._thisDoc, self.lines, str, j, "property_list expected") j = self.skipSpace(str, i) if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "EOF when ']' expected after [ ") if str[j:j+1] != "]": raise BadSyntax(self._thisDoc, self.lines, str, j, "']' expected") res.append(subj) return j+1 if ch == "{": j=i+1 oldParentContext = self._parentContext self._parentContext = self._context parentAnonymousNodes = self._anonymousNodes self._anonymousNodes = {} if subj is None: subj = self._store.newFormula() self._context = subj while 1: i = self.skipSpace(str, j) if i<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "needed '}', found end.") if str[i:i+1] == "}": j = i+1 break j = self.directiveOrStatement(str,i) if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "expected statement or '}'") self._anonymousNodes = parentAnonymousNodes self._context = self._parentContext self._parentContext = oldParentContext res.append(subj.close()) # Must not actually use the formula until it has been closed return j if ch == "(": j=i+1 ## ###################################### ## #The below code should be hit with a sledgehammer multiple times. ## #It is creating firsts and nexts, only to have the formula _slowly_ ## #go through them and recreate the list. This will change ## previous = None # remember value to return ## while 1: ## i = self.skipSpace(str, j) ## if i<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "needed ')', found end.") ## if str[i:i+1] == ')': ## j = i+1 ## break ## ## item = [] ## j = self.item(str,i, item) #@@@@@ should be path, was object ## if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "expected item in list or ')'") ## this = self._context.newBlankNode(uri=self.here(i), why=self._reason2) ## if previous: ## self.makeStatement((self._context, N3_rest, previous, this )) ## else: ## head = this ## self.makeStatement((self._context, N3_first, this, item[0])) ## previous = this ## previousvalue = item[0] ## ## if not previous: ## res.append(N3_nil) ## return j ## self.makeStatement((self._context, N3_rest, previous, N3_nil )) ## res.append(head) ## #There is a far better way to do the above. 
See below ## ########################################## List = [] while 1: i = self.skipSpace(str, j) if i<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "needed ')', found end.") if str[i:i+1] == ')': j = i+1 break item = [] j = self.item(str,i, item) #@@@@@ should be path, was object if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "expected item in list or ')'") List.append(self._store.intern(item[0])) res.append(self._store.newList(List)) return j j = self.tok('this', str, i) # This context if j>=0: res.append(self._context) return j if subj is None: # If this can be a named node, then check for a name. j = self.uri_ref2(str, i, res) if j >= 0: return j return -1 def property_list(self, str, i, subj): """Parse property list Leaves the terminating punctuation in the buffer """ while 1: j = self.skipSpace(str, i) if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "EOF found when expected verb in property list") return j #eof if str[j:j+2] ==":-": i = j + 2 res = [] j = self.node(str, i, res, subj) if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "bad {} or () or [] node after :- ") i=j continue i=j v = [] j = self.verb(str, i, v) if j<=0: return i # void but valid objs = [] i = self.objectList(str, j, objs) if i<0: raise BadSyntax(self._thisDoc, self.lines, str, j, "objectList expected") for obj in objs: dir, sym = v[0] if dir == '->': self.makeStatement((self._context, sym, subj, obj)) else: self.makeStatement((self._context, sym, obj, subj)) j = self.skipSpace(str, i) if j<0: raise BadSyntax(self._thisDoc, self.lines, str, j, "EOF found in list of objects") return j #eof if str[i:i+1] != ";": return i i = i+1 # skip semicolon and continue def commaSeparatedList(self, str, j, res, what): """return value: -1 bad syntax; >1 new position in str res has things found appended """ i = self.skipSpace(str, j) if i<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "EOF found expecting comma sep list") return i if str[i] == ".": return j # empty list is OK i = what(str, i, res) if i<0: return -1 while 1: # progress("^^^^^", res, `what`, "\n >>>%s" % str[i:i+40]) j = self.skipSpace(str, i) if j<0: return j # eof ch = str[j:j+1] if ch != ",": if ch != ".": return -1 return j # Found but not swallowed "." i = what(str, j+1, res) if i<0: raise BadSyntax(self._thisDoc, self.lines, str, i, "bad list content") return i def objectList(self, str, i, res): i = self.object(str, i, res) if i<0: return -1 while 1: j = self.skipSpace(str, i) if j<0: raise BadSyntax(self._thisDoc, self.lines, str, j, "EOF found after object") return j #eof if str[j:j+1] != ",": return j # Found something else! i = self.object(str, j+1, res) if i<0: return i def checkDot(self, str, i): j = self.skipSpace(str, i) if j<0: return j #eof if str[j:j+1] == ".": return j+1 # skip if str[j:j+1] == "}": return j # don't skip it if str[j:j+1] == "]": return j raise BadSyntax(self._thisDoc, self.lines, str, j, "expected '.' or '}' or ']' at end of statement") return i def uri_ref2(self, str, i, res): """Generate uri from n3 representation. Note that the RDF convention of directly concatenating NS and local name is now used though I prefer inserting a '#' to make the namesapces look more like what XML folks expect. """ qn = [] j = self.qname(str, i, qn) if j>=0: pfx, ln = qn[0] if pfx is None: assert 0, "not used?" 
ns = self._baseURI + ADDED_HASH else: try: ns = self._bindings[pfx] except KeyError: if pfx == "_": # Magic prefix added 2001/05/30, can be overridden res.append(self.anonymousNode(ln)) return j raise BadSyntax(self._thisDoc, self.lines, str, i, "Prefix %s not bound" % (pfx)) res.append(self._store.newSymbol(ns + ln)) # @@@ "#" CONVENTION if not string.find(ns, "#"):progress("Warning: no # on NS %s," % ns) return j i = self.skipSpace(str, i) if i<0: return -1 if str[i] == "?": v = [] j = self.variable(str,i,v) if j>0: #Forget varibles as a class, only in context. res.append(v[0]) return j return -1 elif str[i]=="<": i = i + 1 st = i while i < len(str): if str[i] == ">": uref = str[st:i] # the join should dealt with "": if self._baseURI: uref = uripath.join(self._baseURI, uref) else: assert ":" in uref, "With no base URI, cannot deal with relative URIs" if str[i-1:i]=="#" and not uref[-1:]=="#": uref = uref + "#" # She meant it! Weirdness in urlparse? res.append(self._store.newSymbol(uref)) return i+1 i = i + 1 raise BadSyntax(self._thisDoc, self.lines, str, j, "unterminated URI reference") elif self.keywordsSet: v = [] j = self.bareWord(str,i,v) if j<0: return -1 #Forget varibles as a class, only in context. if v[0] in self.keywords: raise BadSyntax(self._thisDoc, self.lines, str, i, 'Keyword "%s" not allowed here.' % v[0]) res.append(self._store.newSymbol(self._bindings[""]+v[0])) return j else: return -1 def skipSpace(self, str, i): """Skip white space, newlines and comments. return -1 if EOF, else position of first non-ws character""" while 1: m = eol.match(str, i) if m == None: break self.lines = self.lines + 1 i = m.end() # Point to first character unmatched self.startOfLine = i m = ws.match(str, i) if m != None: i = m.end() m = eof.match(str, i) if m != None: return -1 return i def variable(self, str, i, res): """ ?abc -> variable(:abc) """ j = self.skipSpace(str, i) if j<0: return -1 if str[j:j+1] != "?": return -1 j=j+1 i = j if str[j] in "0123456789-": raise BadSyntax(self._thisDoc, self.lines, str, j, "Varible name can't start with '%s'" % str[j]) return -1 while i >" % str[j:i] return i def bareWord(self, str, i, res): """ abc -> :abc """ j = self.skipSpace(str, i) if j<0: return -1 if str[j] in "0123456789-" or str[j] in _notNameChars: return -1 i = j while i ('xyz', 'def') If not in keywords and keywordsSet: def -> ('', 'def') :def -> ('', 'def') """ i = self.skipSpace(str, i) if i<0: return -1 c = str[i] if c in "0123456789-+": return -1 if c not in _notNameChars: ln = c i = i + 1 while i < len(str): c = str[i] if c not in _notNameChars: ln = ln + c i = i + 1 else: break else: # First character is non-alpha ln = '' # Was: None - TBL (why? useful?) 
if i= 0: return j else: j = self.skipSpace(str, i) if j<0: return -1 else: i=j if str[i]=='"': if str[i:i+3] == '"""': delim = '"""' else: delim = '"' i = i + len(delim) j, s = self.strconst(str, i, delim) res.append(self._store.newLiteral(s)) progress("New string const ", s, j) return j else: return -1 def nodeOrLiteral(self, str, i, res): j = self.node(str, i, res) if j>= 0: return j else: j = self.skipSpace(str, i) if j<0: return -1 else: i=j ch = str[i] if ch in "-+0987654321": m = number_syntax.match(str, i) if m == None: raise BadSyntax(self._thisDoc, self.lines, str, i, "Bad number syntax") j = m.end() if m.group(3) != None: # includes decimal exponent res.append(float(str[i:j])) # res.append(self._store.newLiteral(str[i:j], # self._store.newSymbol(FLOAT_DATATYPE))) elif m.group(2) != None: res.append(float(str[i:j])) else: res.append(long(str[i:j])) # res.append(self._store.newLiteral(str[i:j], # self._store.newSymbol(INTEGER_DATATYPE))) return j if str[i]=='"': if str[i:i+3] == '"""': delim = '"""' else: delim = '"' i = i + len(delim) dt = None j, s = self.strconst(str, i, delim) lang = None if str[j:j+1] == "@": # Language? m = langcode.match(str, j+1) if m == None: raise BadSyntax(self._thisDoc, startline, str, i, "Bad language code syntax on string literal, after @") i = m.end() lang = str[j+1:i] j = i if str[j:j+2] == "^^": res2 = [] j = self.uri_ref2(str, j+2, res2) # Read datatype URI dt = res2[0] res.append(self._store.newLiteral(s, dt, lang)) return j else: return -1 def strconst(self, str, i, delim): """parse an N3 string constant delimited by delim. return index, val """ j = i ustr = u"" # Empty unicode string startline = self.lines # Remember where for error messages while j>>>", m.group(0), "<<< in string >>>>>", m.string, "<<<<<<" assert m , "Quote expected in string at ^ in %s^%s" %( str[j-20:j], str[j:j+20]) # we at least have to find a quote # print "@@@ old i = ",i, " j=",j, "m.start=", m.start(),"m.end=", m.end(), # print ">>>>>>>", m.string[:j+m.start()], "|||||||", m.string[j+m.start(): j+m.end()], "<<<<<<<" i = m.start() try: ustr = ustr + str[j:i] except UnicodeError: err = "" for c in str[j:i]: err = err + (" %02x" % ord(c)) streason = sys.exc_info()[1].__str__() raise BadSyntax(self._thisDoc, startline, str, j, "Unicode error appending characters %s to string, because\n\t%s" % (err, streason)) # print "@@@ i = ",i, " j=",j, "m.end=", m.end() ch = str[i] if ch == '"': j = i continue elif ch == "\r": # Strip carriage returns j = i+1 continue elif ch == "\n": if delim == '"': raise BadSyntax(self._thisDoc, startline, str, i, "newline found in string literal") self.lines = self.lines + 1 ustr = ustr + ch j = i + 1 self.startOfLine = j elif ch == "\\": j = i + 1 ch = str[j:j+1] # Will be empty if string ends if not ch: raise BadSyntax(self._thisDoc, startline, str, i, "unterminated string literal (2)") k = string.find('abfrtvn\\"', ch) if k >= 0: uch = '\a\b\f\r\t\v\n\\"'[k] ustr = ustr + uch j = j + 1 elif ch == "u": j, ch = self.uEscape(str, j+1, startline) ustr = ustr + ch else: raise BadSyntax(self._thisDoc, self.lines, str, i, "bad escape") raise BadSyntax(self._thisDoc, self.lines, str, i, "unterminated string literal") def uEscape(self, str, i, startline): j = i count = 0 value = 0 while count < 4: # Get 4 more characters ch = str[j:j+1].lower() # sbp http://ilrt.org/discovery/chatlogs/rdfig/2002-07-05 j = j + 1 if ch == "": raise BadSyntax(self._thisDoc, startline, str, i, "unterminated string literal(3)") k = string.find("0123456789abcdef", ch) if k < 
0: raise BadSyntax(self._thisDoc, startline, str, i,
                                "bad string literal hex escape")
            value = value * 16 + k
            count = count + 1
        uch = unichr(value)
        return j, uch

# If we are going to do operators then they should generate
#  [ is operator:plus of ( \1 \2 ) ]


class BadSyntax(SyntaxError):
    def __init__(self, uri, lines, str, i, why):
        self._str = str.encode('utf-8')    # Better go back to strings for errors
        self._i = i
        self._why = why
        self.lines = lines
        self._uri = uri

    def __str__(self):
        str = self._str
        i = self._i
        st = 0
        if i > 60:
            pre = "..."
            st = i - 60
        else:
            pre = ""
        if len(str) - i > 60:
            post = "..."
        else:
            post = ""
        return 'Line %i of <%s>: Bad syntax (%s) at ^ in:\n"%s%s^%s%s"' \
               % (self.lines + 1, self._uri, self._why, pre,
                  str[st:i], str[i:i+60], post)


def stripCR(str):
    res = ""
    for ch in str:
        if ch != "\r":
            res = res + ch
    return res


def dummyWrite(x):
    pass

############################################################################################


class ToN3(RDFSink.RDFSink):
    """Serializer output sink for N3.

    Keeps track of the most recent subject and predicate and reuses them.
    Adapted from Dan's ToRDFParser(Parser).
    """

    flagDocumentation = """Flags for N3 output are as follows:-

        a   Anonymous nodes should be output using the _: convention (p flag or not).
        d   Don't use default namespace (empty prefix)
        e   Escape literals --- use \u notation
        i   Use identifiers from store - don't regen on output
        l   List syntax suppression. Don't use (..)
        n   No numeric syntax - use strings typed with ^^ syntax
        p   Prefix suppression - don't use them, always URIs in <> instead of qnames.
        q   Quiet - don't make comments about the environment in which processing was done.
        r   Relative URI suppression. Always use absolute URIs.
        s   Subject must be explicit for every statement. Don't use ";" shorthand.
        t   "this" and "()" special syntax should be suppressed.
        u   Use \u for unicode escaping in URIs instead of utf-8 %XX
        v   Use "this log:forAll" instead of @forAll, and "this log:forSome" for "@forSome".
    """  # "

    # A word about regenerated Ids.
    #
    # Within the program, the URI of a resource is kept the same, and in fact
    # tampering with it would risk all kinds of inconsistencies.  However, on
    # output, where there are URIs whose values are irrelevant, such as
    # variables and generated IDs from anonymous nodes, it makes the document
    # very much more readable to regenerate the IDs.
    # We use here a convention that underscores at the start of fragment IDs
    # are reserved for generated Ids.  The caller can change that.
    #
    # Now there is a new way of generating these, with the "_" prefix for
    # anonymous nodes.

    def __init__(self, write, base=None, genPrefix=None,
                 noLists=0, quiet=0, flags=""):
        gp = genPrefix
        if gp == None:
            gp = "#_g"
            if base != None:
                try:
                    gp = uripath.join(base, "#_g")
                except ValueError:
                    pass    # bogus: base eg
        RDFSink.RDFSink.__init__(self, gp)

        self._write = self.writeEncoded
        self._writeRaw = write
        self._quiet = quiet or "q" in flags
        self._flags = flags
        self._subj = None
        self.prefixes = {}              # Look up prefix conventions
        self.defaultNamespace = None
        self.indent = 1                 # Level of nesting of output
        self.base = base
        # self.nextId = 0               # Regenerate Ids on output
        self.regen = {}                 # Mapping of regenerated Ids
        # self.genPrefix = genPrefix    # Prefix for generated URIs on output
        self.stack = [0]                # List mode?
        self.noLists = noLists          # Suppress generation of lists?
self._anodeName = {} # For "a" flag self._anodeId = {} # For "a" flag - reverse mapping if "l" in self._flags: self.noLists = 1 def dummyClone(self): "retun a version of myself which will only count occurrences" return ToN3(write=dummyWrite, base=self.base, genPrefix=self._genPrefix, \ noLists=self.noLists, quiet=self._quiet, flags=self._flags ) def writeEncoded(self, str): """Write a possibly unicode string out to the output""" return self._writeRaw(str.encode('utf-8')) def setDefaultNamespace(self, uri): return self.bind("", uri) def bind(self, prefixString, uri): """ Just accepting a convention here """ assert ':' in uri # absolute URI references only if "p" in self._flags: return # Ignore the prefix system completely if not prefixString: raise RuntimError("Please use setDefaultNamespace instead") if (uri == self.defaultNamespace and "d" not in self._flags): return # don't duplicate ?? self._endStatement() self.prefixes[uri] = prefixString self._write(" @prefix %s: <%s> ." % (prefixString, refTo(self.base, uri)) ) self._newline() def setDefaultNamespace(self, uri): if "d" in self._flags or "p" in self._flags: return # Ignore the prefix system completely self._endStatement() self.defaultNamespace = uri if self.base: # Sometimes there is none, and nowadays refTo is intolerant x = refTo(self.base, uri) else: x = uri self._write(" @prefix : <%s> ." % x ) self._newline() def startDoc(self): if not self._quiet: # Suppress stuff which will confuse test diffs self._write("\n# Notation3 generation by\n") idstring = "$Id: notation3.py,v 1.166 2004/10/28 17:41:59 timbl Exp $" # CVS CHANGES THIS self._write("# " + idstring[5:-2] + "\n\n") # Strip $s in case result is checked in if self.base: self._write("# Base was: " + self.base + "\n") self._write(" " * self.indent) self._subj = None # self._nextId = 0 def endDoc(self, rootFormulaPair=None): self._endStatement() self._write("\n") if not self._quiet: self._write("#ENDS\n") return # No formula returned - this is not a store def makeComment(self, str): for line in string.split(str, "\n"): self._write("#" + line + "\n") # Newline order??@@ self._write(" " * self.indent + " ") def _newline(self, extra=0): self._write("\n"+ " " * (self.indent+extra)) def makeStatement(self, triple, why=None, aIsPossible=1): # triple = tuple([a.asPair() for a in triple2]) if self.stack[-1]: if triple[PRED] == N3_first: self._write(self.representationOf(triple[CONTEXT], triple[OBJ])+" ") elif triple[PRED] == RDF_type and triple[OBJ] == N3_List: pass # We knew elif triple[PRED] == RDF_type and triple[OBJ] == N3_Empty: pass # not how we would have put it but never mind elif triple[PRED] != N3_rest: raise RuntimeError ("Should only see %s and %s in list mode" %(N3_first, N3_rest), triple) return if ("a" in self._flags and triple[PRED] == (SYMBOL, N3_forSome_URI) and triple[CONTEXT] == triple[SUBJ]) : # and # We assume the output is flat @@@ true, we should not try: aIsPossible = aIsPossible() except TypeError: aIsPossible = 1 if aIsPossible: ty, value = triple[OBJ] i = len(value) while i > 0 and value[i-1] not in _notNameChars+"_": i = i - 1 str2 = value[i:] if self._anodeName.get(str2, None) != None: j = 1 while 1: str3 = str2 + `j` if self._anodeName.get(str3, None) == None: break j = j +1 str2 = str3 if str2[0] in "0123456789": str2 = "a"+str2 if diag.chatty_flag > 60: progress("Anode %s means %s" % (str2, value)) self._anodeName[str2] = value self._anodeId[value] = str2 return self._makeSubjPred(triple[CONTEXT], triple[SUBJ], triple[PRED]) 
self._write(self.representationOf(triple[CONTEXT], triple[OBJ])) # Below is for writing an anonymous node which is the object of only one arc def startAnonymous(self, triple, isList=0): if isList and not self.noLists: wasList = self.stack[-1] if wasList and (triple[PRED]==N3_rest) : # rest p of existing list self.stack.append(2) # just a rest - no parens self._subj = triple[OBJ] else: self._makeSubjPred(triple[CONTEXT], triple[SUBJ], triple[PRED]) self.stack.append(1) # New list self._write(" (") self.indent = self.indent + 1 self._pred = N3_first else: self._makeSubjPred(triple[CONTEXT], triple[SUBJ], triple[PRED]) self.stack.append(0) self._write(" [") self.indent = self.indent + 1 self._pred = None self._newline() self._subj = triple[OBJ] # The object is now the current subject def endAnonymous(self, subject, verb): # Remind me where we are wasList = self.stack.pop() isList = self.stack[-1] if wasList: if not isList or wasList == 1: self._write(" )") self.indent = self.indent - 1 else: self._write(" ]") self.indent = self.indent - 1 self._subj = subject self._pred = verb # Below we do anonymous top level node - arrows only leave this circle def startAnonymousNode(self, subj, isList=0): self.stack.append(isList) if self._subj: self._write(" .") self._newline() self.indent = self.indent + 1 if isList: self._write(" ( ") else: self._write(" [ ") self._subj = subj # The object is not the subject context self._pred = None def endAnonymousNode(self, subj=None): # Remove default subject isList = self.stack.pop() if isList: self._write(" )") else: self._write(" ]") if not subj: self._write(".") self.indent = self.indent - 1 self._newline() self._subj = subj self._pred = None # Below we notate a nested bag of statements def startBagSubject(self, context): self.stack.append(0) if self._subj != context: self._endStatement() self.indent = self.indent + 1 self._write("{") self._newline() self._subj = None self._pred = None def endBagSubject(self, subj): # Remove context self.stack.append(0) self.stack.pop() self._endStatement() # @@@@@@@@ remove in syntax change to implicit self._newline() self.indent = self.indent - 1 self._write("}") self._subj = subj self._pred = None def startBagNamed(self, context, subj): if self._subj != subj: self._endStatement() if self.indent == 1: # Top level only - extra newline self._newline() self._write(self.representationOf(context, subj)) self._subj = subj self._pred = None if self._pred is not None: self._write(";") # self._makeSubjPred(somecontext, subj, ( SYMBOL, DAML_sameAs_URI )) self.stack.append(0) self.indent = self.indent + 1 self._write(" :- {") self._newline() self._subj = None self._pred = None def endBagNamed(self, subj): # Remove context self.stack.pop() self._endStatement() # @@@@@@@@ remove in syntax change to implicit self._newline() self._write("}") self._subj = subj self._pred = None self.indent = self.indent - 1 def startBagObject(self, triple): self._makeSubjPred(triple[CONTEXT], triple[SUBJ], triple[PRED]) self.stack.append(0) self.indent = self.indent + 1 self._write("{") self._subj = None self._pred = None def endBagObject(self, pred, subj): # Remove context self.stack.pop() self._endStatement() # @@@@@@@@ remove in syntax change to implicit self.indent = self.indent - 1 self._write("}") # self._newline() self._subj = subj self._pred = pred def _makeSubjPred(self, context, subj, pred): if self.stack[-1]: return # If we are in list mode, don't need to. 
varDecl = (subj == context and "v" not in self._flags and ( pred == (SYMBOL, N3_forAll_URI) or pred == (SYMBOL, N3_forSome_URI))) if self._subj != subj or "s" in self._flags: self._endStatement() if self.indent == 1: # Top level only - extra newline self._newline() if "v" in self._flags or subj != context: self._write(self.representationOf(context, subj)) else: # "this" suppressed if (pred != (SYMBOL, N3_forAll_URI) and pred != (SYMBOL, N3_forSome_URI)): raise ValueError("On N3 output, 'this' used with bad predicate: %s" % pred) self._subj = subj self._pred = None if self._pred != pred: if self._pred: if "v" not in self._flags and ( self._pred== (SYMBOL, N3_forAll_URI) or self._pred == (SYMBOL, N3_forSome_URI)): self._write(".") else: self._write(";") self._newline(1) # Indent predicate from subject elif not varDecl: self._write(" ") if varDecl: if pred == (SYMBOL, N3_forAll_URI): self._write( " @forAll ") else: self._write( " @forSome ") elif pred == ( SYMBOL, DAML_sameAs_URI ) and "t" not in self._flags: self._write(" = ") elif pred == ( SYMBOL, RDF_type_URI ) and "t" not in self._flags: self._write(" a ") else : self._write( " %s " % self.representationOf(context, pred)) self._pred = pred else: self._write(",") self._newline(3) # Same subject and pred => object list def _endStatement(self): if self._subj: self._write(" .") self._newline() self._subj = None def representationOf(self, context, pair): """ Representation of a thing in the output stream Regenerates genids if required. Uses prefix dictionary to use qname syntax if possible. """ if "t" not in self._flags: if pair == context: return "this" if pair == N3_nil and not self.noLists: return"()" ty, value = pair if ty == LITERAL: singleLine = "n" in self._flags if type(value) is not types.TupleType: # simple old-fashioned string return stringToN3(value, singleLine=singleLine, flags = self._flags) s, dt, lang = value if dt != None and "n" not in self._flags: dt_uri = dt.uriref() if (dt_uri == INTEGER_DATATYPE): return str(long(s)) if (dt_uri == FLOAT_DATATYPE): return str(float(s)) # numeric value python-normalized if (dt_uri == DECIMAL_DATATYPE): return str(Decimal(s)) st = stringToN3(s, singleLine= singleLine, flags=self._flags) if lang != None: st = st + "@" + lang if dt != None: return st + "^^" + self.representationOf(context, dt.asPair()) return st aid = self._anodeId.get(pair[1], None) if aid != None: # "a" flag only return "_:" + aid # Must start with alpha as per NTriples spec. if ((ty == ANONYMOUS) and not option_noregen and "i" not in self._flags ): x = self.regen.get(value, None) if x == None: x = self.genId() self.regen[value] = x value = x # return "<"+x+">" j = string.rfind(value, "#") if j<0 and "/" in self._flags: j=string.rfind(value, "/") # Allow "/" namespaces as a second best if (j>=0 and "p" not in self._flags): # Suppress use of prefixes? # and value[j+1:].find(".") <0 ): # Can't use prefix if localname includes "." # print "|%s|%s|"%(self.defaultNamespace, value[:j+1]) for ch in value[j+1:]: # Examples: "." ";" we can't have in qname if ch in _notQNameChars: if verbosity() > 0: progress("Cannot have character %i in local name." 
% ord(ch)) break else: namesp = value[:j+1] if (self.defaultNamespace and self.defaultNamespace == namesp and "d" not in self._flags): return ":"+value[j+1:] self.countNamespace(namesp) prefix = self.prefixes.get(namesp, None) # @@ #CONVENTION if prefix != None : return prefix + ":" + value[j+1:] if value[:j] == self.base: # If local to output stream, return "<#" + value[j+1:] + ">" # use local frag id (@@ lone word?) if "r" not in self._flags and self.base != None: value = refTo(self.base, value) elif "u" in self._flags: value = backslashUify(value) else: value = hexify(value) return "<" + value + ">" # Everything else ################################################### # # Utilities # Escapes = {'a': '\a', 'b': '\b', 'f': '\f', 'r': '\r', 't': '\t', 'v': '\v', 'n': '\n', '\\': '\\', '"': '"'} forbidden1 = re.compile(ur'[\\\"\a\b\f\r\v\u0080-\uffff]') forbidden2 = re.compile(ur'[\\\"\a\b\f\r\v\t\n\u0080-\uffff]') #" def stringToN3(str, singleLine=0, flags=""): res = '' if (len(str) > 20 and str[-1] <> '"' and not singleLine and (string.find(str, "\n") >=0 or string.find(str, '"') >=0)): delim= '"""' forbidden = forbidden1 # (allow tabs too now) else: delim = '"' forbidden = forbidden2 i = 0 while i < len(str): m = forbidden.search(str, i) if not m: break j = m.start() res = res + str[i:j] ch = m.group(0) if ch == '"' and delim == '"""' and str[j:j+3] != '"""': #" res = res + ch else: k = string.find('\a\b\f\r\t\v\n\\"', ch) if k >= 0: res = res + "\\" + 'abfrtvn\\"'[k] else: if 'e' in flags: # res = res + ('\\u%04x' % ord(ch)) res = res + ('\\u%04X' % ord(ch)) # http://www.w3.org/TR/rdf-testcases/#ntriples else: res = res + ch i = j + 1 return delim + res + str[i:] + delim def backslashUify(ustr): """Use URL encoding to return an ASCII string corresponding to the given unicode""" # progress("String is "+`ustr`) # s1=ustr.encode('utf-8') str = "" for ch in ustr: # .encode('utf-8'): if ord(ch) > 126: ch = "\\u%04X" % ord(ch) else: ch = "%c" % ord(ch) str = str + ch return str def hexify(ustr): """Use URL encoding to return an ASCII string corresponding to the given unicode""" # progress("String is "+`ustr`) # s1=ustr.encode('utf-8') str = "" for ch in ustr: # .encode('utf-8'): if ord(ch) > 126: ch = "%%%02X" % ord(ch) else: ch = "%c" % ord(ch) str = str + ch return str def dummy(): res = "" if len(str) > 20 and (string.find(str, "\n") >=0 or string.find(str, '"') >=0): delim= '"""' forbidden = "\\\"\a\b\f\r\v" # (allow tabs too now) else: delim = '"' forbidden = "\\\"\a\b\f\r\v\t\n" for i in range(len(str)): ch = str[i] j = string.find(forbidden, ch) if ch == '"' and delim == '"""' and i+1 < len(str) and str[i+1] != '"': j=-1 # Single quotes don't need escaping in long format if j>=0: ch = "\\" + '\\"abfrvtn'[j] elif ch not in "\n\t" and (ch < " " or ch > "}"): ch = "[[" + `ch` + "]]" #[2:-1] # Use python res = res + ch return delim + res + delim #ends
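
# ----------------------------------------------------------------------------
# A minimal usage sketch (not part of the original module, and not exercised
# by its tests).  It shows how the serializer half of this file is driven:
# quadruples follow the (context, pred, subj, obj) order used throughout the
# module, each term being a (type, value) pair such as (SYMBOL, uri) or
# (LITERAL, string).  The example.org URIs are made up for illustration, and
# the sketch assumes the RDFSink base class from this codebase behaves as the
# module expects (e.g. countNamespace()).  Parsing, by contrast, needs a
# store/formula implementation from outside this module (in cwm that is
# llyn.py), so the SinkParser half is only indicated in comments:
#
#     store = ...                               # e.g. llyn.RDFStore()
#     p = SinkParser(store, thisDoc="file:doc.n3")
#     f = p.loadBuf(open("doc.n3").read())      # returns the top-level formula
#
def _exampleToN3(write=None):
    """Serialize one hand-built statement as N3 via the given write function."""
    if write is None:
        write = sys.stdout.write
    out = ToN3(write, base="http://example.org/doc", flags="q")    # q: no comments
    out.startDoc()
    out.bind("ex", "http://example.org/ns#")
    doc = (SYMBOL, "http://example.org/doc")
    out.makeStatement((doc,                                         # context
                       (SYMBOL, "http://example.org/ns#title"),     # pred
                       (SYMBOL, "http://example.org/doc#it"),       # subj
                       (LITERAL, "Hello, world")))                  # obj
    out.endDoc()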