#!/usr/bin/env python
"""
WebtoWeb.py
Patrick Kennedy
Artificial Intelligence assignment: Fall 2007
Bi-Directional Search

I would have liked to implement some sort of intelligence with my
bi-directional search -- maybe a degree of closeness in link names. The
value would increase or decrease as parts of one address were visible in
another link. As my program stands now, it will find the link, but it has
to scan through countless other external sites. And as I watch the sites
move in order according to a queue, I see that many branches of the tree
go down paths totally irrelevant to the url it is searching for.
"""

# NOTE(review): the text this module was reviewed from is damaged. Substrings
# that look like HTML markup are missing, which destroyed the bodies of
# getLinkAddress() and inLine() and, apparently, the definitions of
# expandSouth() and traceLink(). Those spots are flagged below; restore them
# from the original file rather than from this copy.

import sys

try:  # Python 3
    from urllib.parse import urljoin
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen
    from urlparse import urljoin


# Search state shared by the functions below: one dict per pole that maps a
# cleaned-up url to its url_node, plus a 'fringe' list of nodes waiting to be
# expanded.
# NOTE(review): no definition of `table` survives in the reviewed text; this
# shape is inferred from how the functions index it -- confirm against the
# original file.
table = {
    'north': {'fringe': []},
    'south': {'fringe': []},
}


#---- MAIN PROGRAM: THE NITTY-GRITTY ----#
def main():
    """Finds the common link between 2 urls, using a bi-directional
    search method"""
    print("\n\n########WebtoWeb.py#########")
    url1 = "cs.marlboro.edu/home/links.html"
    url2 = "cs.marlboro.edu/courses/marlboro_wiki/home"
    findLink(url1, url2)
    # Other urls that were tried:
    # Jim: http://www.in-snec.com/home/index.htm
    # Jim: http://www.flyinggoose.com/
    #wool.fm/upcomingmusicdata.html
    #http://www.flyinggoose.com/
    #decor8.blogspot.com/2007/02/international-magazine-swap-sign-up.html


def findLink(url1, url2):
    """Find the common link by expanding the fringe of each pole in turn.

    url1 becomes the root of the 'north' pole and url2 the root of the
    'south' pole. This function never returns: the loop has no exit
    condition of its own and relies on addItem() calling sys.exit() once a
    url has been reached from both poles.
    """
    assignPoles(url1, url2)
    while True:
        expandNorth('north')
        # NOTE(review): expandSouth() is not defined anywhere in the reviewed
        # text (presumably lost with the damaged section) and must be
        # restored before this loop can run.
        expandSouth()


def assignPoles(url1, url2):
    """Assign each starting url a 'pole' and a url_node for storage of
    information.

    Each root node is appended to its pole's fringe and then registered in
    `table` through addItem(), north first.
    """
    for address, root_tag, pole in ((url1, "NRoot", 'north'),
                                    (url2, "SRoot", 'south')):
        root = url_node(address, root_tag)
        table[pole]['fringe'].append(root)
        addItem(root, pole)


def addItem(node, pole):
    """Register `node` under `pole` in the dictionary object 'table'.

    Returns False, without storing anything, if this pole has already seen
    the node's name, and True once the node has been stored. If the other
    pole has already seen the name, a link has been found: the match is
    reported and the program ends through sys.exit().
    """
    name = node.name
    if name in table[pole]:
        # seen this link before
        return False

    linked = checkLink(node, pole)
    if linked is not False:
        # found a common link
        formating = "--" * 10
        print("\n%sLINK FOUND%s" % (formating, formating))
        print("%s == %s" % (node.name, linked.name))
        print("%sLINK FOUND%s\n" % (formating, formating))
        # NOTE(review): traceLink() is not defined anywhere in the reviewed
        # text (presumably lost with the damaged section); restore it.
        traceLink(node, linked)
        sys.exit()

    # The literal ends in two spaces on purpose: it reproduces the output of
    # the old two-argument print statement.
    print("adding node:  %s" % (name,))
    table[pole][name] = node
    return True


def checkLink(node, pole):
    """Check whether the node's name has been seen by the opposite pole.

    Returns the url_node stored under the opposite pole when the name is
    known there, otherwise False. Nothing is added to the table here.
    """
    if pole == 'north':
        opposite = 'south'
    else:
        opposite = 'north'
    return table[opposite].get(node.name, False)


def test(url):
    """test(url)
    Tests whether the address of a url_node can be opened.

    Returns True if urlopen() succeeds on url.name; otherwise prints a
    message and returns False.
    """
    name = url.name
    try:
        page = urlopen(name)
    except Exception:
        # Best effort: any failure to open means "not usable". Unlike a bare
        # except, this still lets Ctrl-C and sys.exit() through.
        print("I CAN'T OPEN THIS URL:\n %s" % (name))
        return False
    page.close()
    return True


# The name starts with "test" but this is not a unit test; keep test runners
# from collecting it.
test.__test__ = False


def searchLinks(url, pole):
    """Return the number that follows the last "Inlinks" marker on a page.

    Reads the page behind `url`, takes the first line containing "Inlinks",
    and converts the text between 17 characters past the last "Inlinks" on
    that line and the last ")" to an integer. Returns None when no line
    contains the marker; raises ValueError when the sliced text is not a
    number.

    NOTE(review): `pole` is unused, and expandNorth() calls this expecting
    it to search for the links off a page. The reviewed text looks like two
    functions merged by the damage described at the top of the file --
    confirm against the original.
    """
    for line in accessUrl(url):
        if "Inlinks" in line:
            # html marker of total links
            start = line.rfind("Inlinks") + 17
            end = line.rfind(")")
            # return the string as an integer
            return int(line[start:end])


def getLinkAddress(link_page_text):
    """Returns the name of the link address, so that the additional links
    that are on different pages can be seen.

    Currently always raises NotImplementedError; see the note below.
    """
    # NOTE(review): the body of this function is damaged in the reviewed
    # text: the string markers it searched for are missing and `end` is
    # never assigned. Surviving fragment (<lost> marks the gaps):
    #
    #     for line in link_page_text:
    #         if "<lost>1" in line:
    #             start = line.find("<lost>")
    #             link = line[:end]
    #             return link
    #
    # Fail loudly instead of guessing at the markers.
    raise NotImplementedError(
        "getLinkAddress: body lost from the source; restore it from the "
        "original file")


def incrementLinkAddress(link_page, COUNTER):
    """Increments the end of the link address to reveal more links on
    another page.

    When COUNTER is 1 the address is returned unchanged. Otherwise the
    number after the last "=" is increased by 10. Raises ValueError if the
    text after the last "=" is not an integer.
    """
    print("link page:  %s" % (link_page,))
    if COUNTER == 1:
        return link_page
    marker = link_page.rfind("=")
    value = int(link_page[marker + 1:]) + 10
    # Slice rather than str.strip(): strip() removes a *set of characters*
    # from both ends, so it could also eat leading digits of the address.
    return link_page[:marker + 1] + str(value)


#----- THE NORTH DIRECTION: working forward -----#
def expandNorth(pole):
    """Expands a pole by popping an item off its fringe and searching the
    links off it.

    NOTE(review): list.pop() takes the most recently appended node (a
    stack), while the header comment talks about a queue -- confirm which
    order is intended.
    """
    print("\nExpanding the fringe...")
    url = table[pole]['fringe'].pop()
    # Test to see if the url actually works
    if test(url):
        print("*---- Searching: %s ----*" % (url.name))
        # search for all the links off it.
        searchLinks(url, pole)
    else:
        print("#---- Can't Access: %s ----#" % (url.name))


def inLine(line, pole, url):
    """Scan the line of html code and return only the link addresses.

    Currently always raises NotImplementedError; see the note below.
    """
    # NOTE(review): the body of this function is damaged in the reviewed
    # text. Surviving fragment (<lost> marks the gaps):
    #
    #     link_count = line.count('<lost>
    #     <lost>") -1  #find the end of the link
    #     line = line[:end]  #strip all html after the link
    #     link = line
    #     #if 'multi' is set to True return the start of the link so that
    #     #the line can be broken down one link at a time.
    #     if multi: return (link, start)
    #     return link
    #
    # `multi`, `start` and `end` are not parameters of inLine(), so the tail
    # presumably belongs to a second helper whose `def` line was lost.
    raise NotImplementedError(
        "inLine: body lost from the source; restore it from the original "
        "file")


def accessUrl(url):
    """Open a url; read the text; and return the text as a list of lines.

    `url` is a url_node; its `name` is the address that gets opened.
    """
    page = urlopen(url.name)
    try:
        raw_lines = page.readlines()
    finally:
        page.close()
    if bytes is str:
        # Python 2: the lines are already native strings.
        return raw_lines
    # Python 3 hands back bytes; decode so callers can search with str
    # markers. The page's declared charset is not consulted -- utf-8 with
    # replacement is an assumption.
    return [raw.decode('utf-8', 'replace') for raw in raw_lines]


#----- URL_NODE: STORAGE -----#
class url_node(object):
    """
    A class for storage of link information: a link's parent url_node and
    a few functions to return that information.

    It would have been better to implement a hash table, but I couldn't get
    one running in time. Instead I used a python dictionary instance as a
    sort of hash. Instead of an algorithm to generate a number from a name,
    I used the name as the hash key in a dictionary. It is more costly and
    cumbersome, but works to some extent.

    name   -- the address, made uniform by cleanLink()
    parent -- the url_node this link was found on, or the string 'NRoot' /
              'SRoot' for the two starting urls
    """

    def __init__(self, name, parent):
        self.name = name
        self.parent = parent
        self.cleanLink()

    def cleanLink(self):
        """Make links uniform so that duplicates can be found and the link
        can be located.

        Drops one trailing "/", resolves a link that starts with "/"
        against the parent's address, and prefixes "http://" when the name
        does not already start with a scheme.
        """
        name = self.name
        if name.endswith("/"):
            name = name[:-1]
        if name.startswith("/"):
            # Root nodes carry a plain string as parent and have nothing to
            # resolve against.
            parent_name = getattr(self.parent, "name", None)
            if parent_name is not None:
                # urljoin resolves against the parent's host instead of
                # gluing the path onto the parent's full address.
                name = urljoin(parent_name, name)
        if not name.startswith(("http://", "https://")):
            name = "http://" + name
        self.name = name

    def getName(self):
        """Return the cleaned-up address of this node."""
        return self.name

    def getParent(self):
        """Return the parent's address, or the root marker ('NRoot' or
        'SRoot') for a starting url."""
        if self.parent != 'NRoot' and self.parent != 'SRoot':
            return self.parent.name
        return self.parent


if __name__ == "__main__":
    main()