""" mathish_regex.py Starting to play around with a "formal language" version of regular expressions ... Taking advantage of utf8, the notation I'll use is : Σ alphabet {a,b,c,...,x,y,z} ∪ {0,1,...,8,9} a set with just a {a} A,B,C,... set names variables are upper case letters ∅ empty set {} ε empty string "" ; {ε} in a regex, like {a} AB concatenation A ○ B A|B set union A ∪ B (AB)C parens for grouping A* any number of A with these "syntatic sugar" extensions : . Σ any single alphabet symbol A+ A(A*) one or more of A A? (A|ε) zero or one of A [abc] (a|b|c) one of these (explicit symbols) [^abc] (d|e|f|...|8|9) one of the others (explicit symbols) Therefore a regex will be encoded as a string containing made up of : lowercase letters some of the alphabet symbols numbers the rest of the alphabet symbols . any single character (i.e. letter or number) ε empty string * zero or more of preceding thing + one or more of preceding thing ? zero or one of preceding thing ( ) grouping (must balance) [ ] or [^ ] character class (only contains alphabet symbols) The regex will always be applied to the entire string to be matched. In the normal programming use of python regular expressions, the regex string is usuall a raw string r'xxxx', to avoid backslash complications such as "\n" which is in a python string is one newline character, not the bytes '\' and 'n'. Since I'm not using backslash, here that raw string stuff is moot. The API is valid(regex) # True if regex is a valid # regular expression as described above. accept(regex, string) # True if regex matches the whole string. test(regex, yes=[t1, t2, ...], # True if t1,t2,... accept no =[f1, f2, ...]) # and f1,f2,... don't. For example >>> valid("a(b.)*[^zy]") True >>> valid("a(") False >>> r = "a(1|2)(3|4)b" # 'a' then one of ('13','14','23','24') then 'b' >>> accept(r, "a13b") True >>> accept(r, "xxxx") False >>> test(r, yes=["a13b", "a24b"], no=["xxxx", "01"]) True Jim Mahoney | cs.marlboro.college | Sep 2019 | MIT License """ # Use python 3.5 or higher. import sys version = str(sys.version_info[0]) + '.' + str(sys.version_info[1]) assert(version >= '3.5') # See e.g. https://docs.python.org/3.5/library/re.html import re def valid(regex): """ return True if regex is a valid regular expression >>> valid('a+[^b]?') True >>> valid('X') False """ # # Quick quiz: # Can this be done with only math-ish regex? # In other words, is the set {s|valid(s)} a regular language? # if None == re.fullmatch(r'[a-z0-9()\[\]+?ε^.*]*', regex): # If there's an illegal character .... return False else: # TODO - also fail on things like "(a))", "*b", ... return True def accept(regex, string): """ return True if the regex matches the whole string >>> accept('a+b*', 'aaaab') True >>> accept('a+b*', 'b') False """ # See https://docs.python.org/3/library/re.html#re.Pattern.match . # Note that # - re.fullmatch tests the regex against the entire string # - re.fullmatch returns None if it doesn't match. return None != re.fullmatch(regex, string) def test(regex, yes=[], no=[]): """ return True only if all the 'yes' strings are accepted and all the 'no' strings are not accepted. >>> test('a+b*', yes=['aaab', 'aaa'], no=['', 'abc']) True """ # (It might make more sense to return some sort of summary # information about which tests passed or failed.) for string in yes: if not accept(regex, string): return False for string in no: if accept(regex, string): return False return True if __name__ == "__main__": import doctest doctest.testmod()