# url_regex.py

"""
 url_regex.py

 Playing around with a regular expression that recognizes
 a subset of the legal URLs.

 Jim Mahoney | cs.marlboro.college | Sep 2019 | MIT License
"""

# The full definition as of 2005 is RFC3986 e.g.
# https://www.ietf.org/rfc/rfc3986.txt
#
# examples (even without utf8 stuff)
#
#      ftp://ftp.is.co.za/rfc/rfc1808.txt
#      http://www.ietf.org/rfc/rfc2396.txt
#      ldap://[2001:db8::7]/c=GB?objectClass?one
#      mailto:John.Doe@example.com
#      news:comp.infosystems.www.servers.unix
#      tel:+1-816-555-1212
#      telnet://192.0.2.16:80/
#      urn:oasis:names:specification:docbook:dtd:xml:4.1.2

# See e.g. https://en.wikipedia.org/wiki/URL
#   Here [] means "optional" , following a BNF-ish notation
#   e.g. https://en.wikipedia.org/wiki/Extended_Backus-Naur_form
#
#   URI        = [scheme:][//authority]path[?query][#fragment]
#   authority  = [userinfo@]host[:port]
#   userinfo   = user[:password]
#   host       = domain | IP_address   i.e.  google.com | 12.233.12.32
#   domain     = label.label...
#                label is a-z, A-Z, 0-9, -, (but not - first or last)
#                see https://en.wikipedia.org/wiki/Domain_name
#   path       = /foo/bar/baz/...
#                "if authority given, must start with / or be empty"
#   query      = key=value&key=value...
#   fragment   = name
#
# domain names are *not* case sensitive
#    whorepresents   = WhoRepresents   = WhorePresents   (oops)
#    expertsexchange = ExpertsExchange = ExpertSexChange (oops)
# which is why the dash (experts-exchange) can be good, eh?
# By the way : example.com is set aside for fictitious use in RFC2606.
#
# OK, I'm going to massively simplify, looking at just full web URLs,
# and not at all trying to capture everything.
#
# In particular, I would need to allow more characters in several
# of these - domain_names, file_names - at the very least the %xxx
# "percent encoding"; see https://en.wikipedia.org/wiki/Percent-encoding
# 
# TODO :
#   give variable names to some of these character classes,
#   then use string interpolation to fill those into the regex.
#
# Note that I'm using the re.VERBOSE flag, which
#   * ignores whitespace (except inside a character class or when escaped)
#   * ignores everything after an unescaped # comment sign
#

import re

# Verbose-mode pattern for a simplified subset of http/https URLs.
#
# It must be compiled with re.VERBOSE: the layout whitespace and the
# trailing "# ..." notes inside the literal are part of the string and are
# only ignored by the regex engine under that flag.
#
# Capture groups, numbered by opening parenthesis.  Optional groups that
# do not take part in a match are None; a repeated group reports only its
# last repetition.
#
#    1  userinfo, including the trailing "@"
#    2  username
#    3  ":" + password
#    4  password
#    5  whole domain (at least two dot-separated labels)
#    6  first domain label
#    7  last ".label" repetition
#    8  ":" + port
#    9  whole path (empty string when the URL has no path)
#   10  last "/segment" repetition
#   11  name of that last segment (may be empty)
#   12  trailing "/" (the greedy repetition of group 10 normally consumes
#       a trailing slash first, leaving this group unset)
#   13  "?" + query
#   14  first key[=value] pair
#   15  "=value" of the first pair
#   16  last "&key[=value]" repetition
#   17  "=value" of that last pair
#   18  "#" + fragment
regex = r"""https?                   # scheme     |
            ://                      #            |
            (([a-zA-Z0-9]+)          # username          | userinfo
               (\:([a-zA-Z0-9]+))?   # password          |
            \@)?                     #                   |
            (([a-zA-Z0-9]            # label_first_letter      | domain
              [a-zA-Z0-9-]*)         # label following letter  | (chars?)
             (\.                     # between labels          |
             [a-zA-Z0-9]             # label_first_letter      |
             [a-zA-Z0-9\-]*          # label following letter  |
             )+)                     #                         |
             (:                      # port   | port
              [0-9]+                 #        |
            )?                       #        |
            ((/                      # path                       | path
             ([a-zA-Z0-9_\.\-\%]*)   # file_or_folder             | (chars?)
            )*                       #                            |
            (/)?)                    # optional trailing slash    |
            (\?                      # query                        | query
              ([a-zA-Z0-9_]+         #                              |
                (=[a-zA-Z0-9_]+)?)   #  first key=value pair        |
              (\&[a-zA-Z0-9_]+       #                              |
                (=[a-zA-Z0-9_]+)?)*  #  more key=value pairs        |
            )?                       #                              |
            (\#                      # fragment                       | fragment
              [a-zA-Z0-9_]+          #   (chars?)                     |
            )?                       #                                |   
         """

# Sample inputs for the pattern.  The last three are expected to be
# rejected; every other entry is expected to match in full.
tests = ["http://aa.bb.cc",
         "http://foo:bar@aa.bb.cc",
         "https://joe@example.coms/",
         "http://aa.bb.cc/",
         "http://aa.bb.cc:80/",
         "http://aa.bb.cc/one",
         "http://aa.bb.cc/one/two",
         "http://aa.bb.cc/one1.cgi/two",
         "http://aa.bb.cc/one1.cgi-beta/two",
         "http://aa.bb.cc/one1.cgi-beta_gamma/two",
         "https://cs.marlboro.college/cours/fall2019/formal_languages/notes/dfa",
         "https://example.com:400/cours/foo.py?a=b",
         "https://example.com/cours/foo/bar?html",
         "https://example.com/cours/foo/bar?alpha=beta&zero=112",
         "http://aa.bb.cc/one1.cgi-beta/two#one",
         "www.google.com",                          # no http://
         "test",                                    # no http://
         "http://www google",                       # space
         ]

# Compile once, outside the loop; re.VERBOSE is required by the layout of
# the pattern literal above.
url_regex = re.compile(regex, re.VERBOSE)

# Report, for each sample, either its capture groups or NO MATCH.
# fullmatch (rather than match/search) requires the entire string to be a
# URL, so trailing junk is rejected.
for test in tests:
    result = url_regex.fullmatch(test)
    if result:
        # Print the loop variable `test`; the name `url` was never defined
        # and raised NameError on the first iteration.
        print(test, ' => ', result.groups())
    else:
        print(test, ' => NO MATCH')