"""
 regex_url.py

 Playing around with a regular expression that
 recognizes a subset of the legal URLs.

 Jim Mahoney | cs.marlboro.college | Sep 2019 | MIT License
"""
# The full definition as of 2005 is RFC3986 e.g.
# https://www.ietf.org/rfc/rfc3986.txt
#
# examples (even without utf8 stuff)
#
#   ftp://ftp.is.co.za/rfc/rfc1808.txt
#   http://www.ietf.org/rfc/rfc2396.txt
#   ldap://[2001:db8::7]/c=GB?objectClass?one
#   mailto:John.Doe@example.com
#   news:comp.infosystems.www.servers.unix
#   tel:+1-816-555-1212
#   telnet://192.0.2.16:80/
#   urn:oasis:names:specification:docbook:dtd:xml:4.1.2
#
# See e.g. https://en.wikipedia.org/wiki/URL
# Here [] means "optional", following a BNF-ish notation
# e.g. https://en.wikipedia.org/wiki/Extended_Backus-Naur_form
#
#   URI = [scheme:][//authority]path[?query][#fragment]
#   authority = [userinfo@]host[:port]
#   userinfo = user[:password]
#   host = domain | IP_address    e.g. google.com | 12.233.12.32
#   domain = label.label...
#       label is a-z, A-Z, 0-9, -, (but not - first or last)
#       see https://en.wikipedia.org/wiki/Domain_name
#   path = /foo/bar/baz/...
#       "if authority given, must start with / or be empty"
#   query = key=value&key=value...
#   fragment = name
#
# domain names are *not* case sensitive
#     whorepresents = WhoRepresents = WhorePresents (oops)
#     expertsexchange = ExpertsExchange = ExpertSexChange (oops)
# which is why the dash (experts-exchange) can be good, eh?
# By the way : example.com is set aside for fictitious use in RFC2606.
#
# OK, I'm going to massively simplify, looking at just full web URLs,
# and not at all trying to capture everything.
#
# In particular, I would need to allow more characters in several
# of these - domain_names, file_names - at the very least the %xxx
# "percent encoding"; see https://en.wikipedia.org/wiki/Percent-encoding
#
# TODO :
#   give variable names to some of these character classes,
#   then use string interpolation to fill those into the regex.
#
# Note that I'm using the re.VERBOSE flag, which
#   * ignores whitespace in the pattern
#   * ignores everything after the # comment sign (up to end of line)
#
# Because of the second point, every fragment of the pattern below MUST
# stay on its own line: joining lines would turn the rest of the pattern
# into one long comment.
#

import re

# Simplified pattern for full http/https web URLs.  The right-hand column
# names the URL component each fragment belongs to.  Capture groups, in
# order: 1 userinfo, 2 username, 3 ":password", 4 password, 5 domain,
# 6 first label, 7 last ".label", 8 ":port", 9 path, 10 last "/segment",
# 11 last segment name, 12 trailing slash, 13 "?query", 14 first pair,
# 15 first "=value", 16 last "&pair", 17 last "=value", 18 "#fragment".
regex = r"""https?                  # scheme                  |
            ://                     #                         |
            (([a-zA-Z0-9]+)         # username                | userinfo
            (\:([a-zA-Z0-9]+))?     # password                |
            \@)?                    #                         |
            (([a-zA-Z0-9]           # label_first_letter      | domain
            [a-zA-Z0-9-]*)          # label following letter  | (chars?)
            (\.                     # between labels          |
            [a-zA-Z0-9]             # label_first_letter      |
            [a-zA-Z0-9\-]*          # label following letter  |
            )+)                     #                         |
            (:                      # port                    | port
            [0-9]+                  #                         |
            )?                      #                         |
            ((/                     # path                    | path
            ([a-zA-Z0-9_\.\-\%]*)   # file_or_folder          | (chars?)
            )*                      #                         |
            (/)?)                   # optional trailing slash |
            (\?                     # query                   | query
            ([a-zA-Z0-9_]+          #                         |
            (=[a-zA-Z0-9_]+)?)      # first key=value pair    |
            (\&[a-zA-Z0-9_]+        #                         |
            (=[a-zA-Z0-9_]+)?)*     # more key=value pairs    |
            )?                      #                         |
            (\#                     # fragment                | fragment
            [a-zA-Z0-9_]+           # (chars?)                |
            )?                      #                         |
         """

# Sample inputs: the first fifteen should match, the last three should not.
tests = ["http://aa.bb.cc",
         "http://foo:bar@aa.bb.cc",
         "https://joe@example.coms/",
         "http://aa.bb.cc/",
         "http://aa.bb.cc:80/",
         "http://aa.bb.cc/one",
         "http://aa.bb.cc/one/two",
         "http://aa.bb.cc/one1.cgi/two",
         "http://aa.bb.cc/one1.cgi-beta/two",
         "http://aa.bb.cc/one1.cgi-beta_gamma/two",
         "https://cs.marlboro.college/cours/fall2019/formal_languages/notes/dfa",
         "https://example.com:400/cours/foo.py?a=b",
         "https://example.com/cours/foo/bar?html",
         "https://example.com/cours/foo/bar?alpha=beta&zero=112",
         "http://aa.bb.cc/one1.cgi-beta/two#one",
         "www.google.com",      # no http://
         "test",                # no http://
         "http://www google"    # space
         ]

url_regex = re.compile(regex, re.VERBOSE)

# Report, for each sample, either the captured groups or NO MATCH.
# fullmatch() is used so the whole string must be a URL, not just a prefix.
# The loop variable is named `url` to match the print calls below; the
# original bound `test` and then printed the undefined name `url`.
for url in tests:
    result = url_regex.fullmatch(url)
    if result:
        print(url, ' => ', result.groups())
    else:
        print(url, ' => NO MATCH')