mathish_regex.py

"""
 mathish_regex.py

 Starting to play around with a "formal language" 
 version of regular expressions ...

 Taking advantage of utf8, the notation I'll use is :

    Σ            alphabet               {a,b,c,...,x,y,z} ∪ {0,1,...,8,9}
    a            set with just a        {a}
    A,B,C,...    set names              variables are upper case letters
    ∅            empty set              {}
    ε            empty string           "" ; {ε} in a regex, like {a}
    AB           concatenation          A ○ B
    A|B          set union              A ∪ B
    (AB)C        parens for grouping 
    A*           any number of A        
 
 with these "syntatic sugar" extensions :  

    .            Σ                      any single alphabet symbol
    A+           A(A*)                  one or more of A
    A?           (A|ε)                  zero or one of A
    [abc]        (a|b|c)                one of these (explicit symbols)
    [^abc]       (d|e|f|...|8|9)        one of the others (explicit symbols)

 Therefore a regex will be encoded as a string containing made up of :

    lowercase letters       some of the alphabet symbols
    numbers                 the rest of the alphabet symbols
    .                       any single character (i.e. letter or number)
    ε                       empty string
    *                       zero or more of preceding thing
    +                       one or more of preceding thing
    ?                       zero or one of preceding thing
    ( )                     grouping (must balance)
    [ ] or [^ ]             character class (only contains alphabet symbols)

 The regex will always be applied to the entire string to be matched.

 In the normal programming use of python regular expressions, 
 the regex string is usuall a raw string r'xxxx', to avoid
 backslash complications such as "\n" which is in a python 
 string is one newline character, not the bytes '\' and 'n'.
 Since I'm not using backslash, here that raw string stuff is moot.

 The API is 

    valid(regex)                    #  True if regex is a valid
                                    #  regular expression as described above.

    accept(regex, string)           #  True if regex matches the whole string.

    test(regex, yes=[t1, t2, ...],  #  True if t1,t2,... accept
                no =[f1, f2, ...])  #       and f1,f2,... don't.

 For example

   >>> valid("a(b.)*[^zy]")
   True
   >>> valid("a(")
   False
   
   >>> r = "a(1|2)(3|4)b"   # 'a' then one of ('13','14','23','24') then 'b'

   >>> accept(r, "a13b")
   True
   >>> accept(r, "xxxx")
   False

   >>> test(r, yes=["a13b", "a24b"], no=["xxxx", "01"])
   True

 Jim Mahoney | cs.marlboro.college | Sep 2019 | MIT License 
"""

# Use python 3.5 or higher.
import sys
version = str(sys.version_info[0]) + '.' + str(sys.version_info[1])
assert(version >= '3.5')

# See e.g. https://docs.python.org/3.5/library/re.html
import re

def valid(regex):
    """ return True if regex is a valid regular expression 
        >>> valid('a+[^b]?')
        True
        >>> valid('X')
        False
    """
    #
    # Quick quiz:
    #    Can this be done with only math-ish regex?
    #    In other words, is the set {s|valid(s)} a regular language?
    #
    if None == re.fullmatch(r'[a-z0-9()\[\]+?ε^.*]*', regex):
        # If there's an illegal character ....
        return False
    else:
        # TODO - also fail on things like "(a))", "*b", ...
        return True

def accept(regex, string):
    """ return True if the regex matches the whole string 
        >>> accept('a+b*', 'aaaab')
        True
        >>> accept('a+b*', 'b')
        False
    """
    # See https://docs.python.org/3/library/re.html#re.Pattern.match .
    # Note that
    #  - re.fullmatch tests the regex against the entire string
    #  - re.fullmatch returns None if it doesn't match.
    return None != re.fullmatch(regex, string)

def test(regex, yes=[], no=[]):
    """ return True only if all the 'yes' strings are accepted
        and all the 'no' strings are not accepted. 
        >>> test('a+b*', yes=['aaab', 'aaa'], no=['', 'abc'])
        True
    """
    # (It might make more sense to return some sort of summary
    # information about which tests passed or failed.)
    for string in yes:
        if not accept(regex, string):
            return False
    for string in no:
        if accept(regex, string):
            return False
    return True

if __name__ == "__main__":
    import doctest
    doctest.testmod()