#!/usr/bin/env python

"""
WebtoWeb.py
Patrick Kennedy
Artificial Intelligence assignment: Fall 2007
Bi-Directional Search
"""

"""
I would have liked to implement some sort of intelligence with my
bi-diectional search.  Maybe a degree of closeness in link names.  The
value would increase or decease as parts of one address were visible
in another link.  Because as my program stands now, it will find the
link, but it has to scan through countless other external sites.  And
as I watch the sites move in order according to a que, I see that many
branches of the tree go down paths totaly irrelivent to the url it is
searching for.
"""

import sys
from urllib import urlopen

	
#---- MAIN PROGRAM: THE NITTY-GRITTY ----#
	
def main():
	"""Print the program banner and search for a link joining two fixed urls.

	The first address seeds the north (forward) search and the second
	seeds the south (backward) search; both are handed to findLink().
	"""
	banner = "\n\n########WebtoWeb.py#########"
	print(banner)
	north_start, south_start = (
		"cs.marlboro.edu/home/links.html",
		"cs.marlboro.edu/courses/marlboro_wiki/home",
	)
	findLink(north_start, south_start)
	# Jim: http://www.in-snec.com/home/index.htm
	# Jim: http://www.flyinggoose.com/
	#wool.fm/upcomingmusicdata.html
	#http://www.flyinggoose.com/
	#decor8.blogspot.com/2007/02/international-magazine-swap-sign-up.html

	
def findLink(url1, url2):
	"""Find the common link by alternately expanding the north and south fringes.

	url1 seeds the north pole and url2 seeds the south pole.  A successful
	search never returns from here: addItem() prints the result and calls
	sys.exit() as soon as one pole reaches a url the other pole has seen.

	A pole is only expanded while its fringe still holds a node, because
	the expand functions pop from the fringe and an empty list would raise
	IndexError.  When both fringes are exhausted the search gives up and
	the function returns None.
	"""
	assignPoles(url1, url2)
	north_fringe = table['north']['fringe']
	south_fringe = table['south']['fringe']
	while north_fringe or south_fringe:
		if north_fringe:
			expandNorth('north')
		if south_fringe:
			expandSouth()
	print("No common link found: both fringes are exhausted.")


def assignPoles(url1,url2):
	"""Seed each pole with the url_node for its starting address.

	url1 becomes the north root (parent marker "NRoot") and url2 the south
	root (parent marker "SRoot").  Each root is pushed onto its pole's
	fringe and then registered in the table through addItem().
	"""
	starting_points = (
		('north', url1, "NRoot"),
		('south', url2, "SRoot"),
	)
	for pole, address, root_marker in starting_points:
		root = url_node(address, root_marker)
		table[pole]['fringe'].append(root)
		addItem(root, pole)
	

def addItem(node, pole):
	"""Record `node` in table[pole] unless that pole has already seen it.

	Returns False when the pole already holds an entry under the node's
	name (the node is simply not stored), and True after a new entry has
	been stored.

	If the opposite pole already holds an entry with the same name, the
	two searches have met: the match and the path joining the two roots
	are printed and the program ends through sys.exit(), so in that case
	the call does not return.
	"""
	name = node.name
	if name in table[pole]:        #seen this link before
		return False

	linked = checkLink(node, pole)
	if linked is not False:        #found a common link
		formating = "--" * 10
		print("\n%sLINK FOUND%s" % (formating, formating))
		print("%s == %s" % (node.name, linked.name))
		print("%sLINK FOUND%s\n" % (formating, formating))
		traceLink(node, linked)
		sys.exit()

	print("adding node:  %s" % (name,))    #add the node to the table
	table[pole][name] = node
	return True


def checkLink(node, pole):
	"""Look the node's name up in the table of the opposite pole.

	Returns the entry the other pole stored under the same name when there
	is one, and False otherwise.  Any `pole` other than 'north' is checked
	against the north table.  Nothing is added to either table here.
	"""
	other_pole = 'south' if pole == 'north' else 'north'
	# dict.get replaces the old try/bare-except, which also swallowed
	# unrelated errors such as KeyboardInterrupt.
	return table[other_pole].get(node.name, False)
			
			
def test(url):
	"""test(url)
	Report whether the address held by a url_node can be opened.

	`url` is a url_node; its `name` is passed to urlopen().  Returns True
	when the open succeeds and False (after printing a message) when it
	fails.  The address is used exactly as stored in the node; this
	function does not rewrite it.
	"""
	name = url.name
	try:
		page = urlopen(name)
	except Exception:
		# Best effort: any failure to open just means "skip this url".
		# Exception (not a bare except) lets Ctrl-C and sys.exit() through.
		print("I CAN'T OPEN THIS URL:\n %s" % (name,))
		return False
	page.close()    #only reachability matters, so release the connection
	return True


def searchLinks(url, pole):
	"""Fetch the page behind `url` and hand every line that contains an
	'<a href=' anchor to inLine(), which registers the links for `pole`."""
	anchor_lines = [row for row in accessUrl(url) if "<a href=" in row]
	for row in anchor_lines:
		inLine(row, pole, url)
			
def traceLink(node, node2):
	"""Print the chain of url names that joins the two search roots.

	The names leading from node2 back to its root are listed root-first,
	followed by the names from `node` back to its own root.
	"""
	far_side = makeTree(node2)
	far_side.reverse()          #root of node2's pole comes first
	path = far_side + makeTree(node)
	print("-----TREE-----")
	for name in path:
		print(name)
		
			
def makeTree(node):
	"""Return the names from `node` up to the root node of its pole.

	The list starts with node.name and follows `parent` references until
	a node whose parent is one of the root markers 'NRoot' / 'SRoot'.
	"""
	root_markers = ('NRoot', 'SRoot')
	names = [node.name]
	current = node.parent
	while current not in root_markers:
		names.append(current.name)
		current = current.parent
	return names
	

###############################
# Shared search state, one dictionary per pole.  Each pole maps url names
# to the url_node stored for them, plus the key 'fringe' holding the list
# of nodes still waiting to be expanded.
table = {
	'north': {'fringe': []},
	'south': {'fringe': []},
}

	
#-----  THE SOUTH DIRECTION: working backward  -----#	

def expandSouth():
	"""Expand the south pole by one node, working backward.

	Pops a node from the south fringe, asks the Yahoo Site Explorer search
	page about that node's address, and passes the returned html lines to
	openLinkPage(), which follows the "Inlinks" link found on that page.
	"""
	url = table['south']['fringe'].pop()
	#must access the yahoo site explorer website to use their
	#inlink application to generate all the links to a website.
	# NOTE(review): the query string carries literal '&amp;' separators and
	# url.name is inserted without url-encoding -- confirm this is what the
	# service expects, and that the service is still reachable.
	app_address = "https://siteexplorer.search.yahoo.com/search?p=%s&amp;bwm=i&amp;bwms=p&amp;bwmf=u&amp;fr=FP-tab-web-t&amp;fr2=seo-rd-se" %(url.name)
	app = urlopen(app_address)         #open the app
	try:
		app_text = app.readlines()     #read the html
	finally:
		app.close()                    #always release the connection
	openLinkPage(app_text, url)        #the inlinks are found in a link on the apps page


def openLinkPage(app_text, url):
	"""Follow the "Inlinks" link on the app page and collect the inlinks.

	For every line of `app_text` that mentions "Inlinks", the address of
	the last '<a href' before that word is opened.  The total number of
	links and the address of the paged result list are read from the page
	and passed to getAllLinks() together with `url`, the node whose
	inlinks are being gathered.

	Lines that mention "Inlinks" without an anchor in front of it, and
	pages on which the total or the paging address cannot be found, are
	skipped instead of being fetched or paged through with bad values.
	"""
	for line in app_text:
		if "Inlinks" not in line:              #an html marker that marks the link page
			continue
		end = line.rfind("Inlinks") - 2
		line = line[:end]
		anchor = line.rfind("<a href")
		if anchor == -1:                       #no link in front of the marker
			continue

		link_page = urlopen(line[anchor + 9:])         #open link page
		try:
			link_page_text = link_page.readlines()     #read the link page
		finally:
			link_page.close()
		total_links = totalLinks(link_page_text)       #count the total links
		link_page_address = getLinkAddress(link_page_text)
		if total_links is None or link_page_address is None:
			continue                           #page layout not recognised
		getAllLinks(link_page_address, total_links, url)
	
	
def totalLinks(link_page_text):
	"""Return the inlink total shown on the link page as an integer.

	The first line containing '<strong>Inlinks' is used: the text between
	17 characters past the last such marker and the last ')</strong>' is
	converted with int().  Returns None when no line carries the marker.
	"""
	marker = "<strong>Inlinks"     #html marker of total links
	for row in link_page_text:
		if marker not in row:
			continue
		first = row.rfind(marker) + 17
		last = row.rfind(")</strong>")
		return int(row[first:last])
	return None
	
	
def getLinkAddress(link_page_text):
	"""Return the address found next to the page-1 marker of the result
	list, so that the additional links on the other pages can be reached.

	The first line containing '<div id=yschpg><span>1</span>' is used.
	Returns None when no line carries that marker.
	"""
	pager_marker = "<div id=yschpg><span>1</span>"
	for row in link_page_text:
		if pager_marker not in row:
			continue
		#strip all html before the start of the link
		tail = row[row.find("<a href=") + 9:]
		# NOTE(review): the cut is made one character before the closing
		# quote, so the last character of the href is dropped -- confirm
		# that this is intended.
		return tail[:tail.find("\"") - 1]
	return None


def getAllLinks(LINK_PAGE_ADDRESS, LINKS, url):
	"""Find all the links to the south url by paging through the results.

	LINK_PAGE_ADDRESS is the address of the paged result list, LINKS the
	total number of inlinks reported for `url`.  Each pass advances the
	page address with incrementLinkAddress() and lets cycleThroughLinks()
	register the links on that page; the running count it returns is
	compared with LINKS to decide when to stop.

	Paging also stops as soon as a page contributes no links.  Without
	that check the count never passes LINKS once every link has been seen,
	and the loop would keep requesting further pages forever.
	"""
	counted = 0
	link_page = LINK_PAGE_ADDRESS

	while counted <= LINKS:         #a loop stopper
		link_page = incrementLinkAddress(link_page, counted)
		new_count = cycleThroughLinks(link_page, counted, url)
		if new_count == counted:    #empty page: nothing left to collect
			break
		counted = new_count

			
def cycleThroughLinks(link_page, COUNTER, url):
	"""Walks through all the links on one result page.

	Opens `link_page`, and for every line containing '<a class="yschttl'
	builds a url_node (with `url` as its parent) from the cleaned link and
	registers it for the south pole.  A node is pushed onto the south
	fringe only when addItem() reports it as new, the same rule inLine()
	applies, so an already-seen url is not queued for expansion again.

	Returns COUNTER increased by the number of result lines found on the
	page, whether or not their links were new.
	"""
	page = urlopen(link_page)
	try:
		link_page_text = page.readlines()
	finally:
		page.close()
	for line in link_page_text:
		if "<a class=\"yschttl" not in line:
			continue
		COUNTER += 1
		link = url_node(cleanLink(line), url)
		if addItem(link, 'south'):
			table['south']['fringe'].append(link)
	return COUNTER
			

def cleanLink(line):
	"""Cleans the html off a link.

	Returns the text that starts 6 characters after the first 'href=' and
	ends just before the last '">' that follows it.
	"""
	tail = line[line.find("href=") + 6:]
	return tail[:tail.rfind("\">")]
			

def incrementLinkAddress(link_page, COUNTER):
	"""Increments the end of the link address to reveal more links on another page.

	The address is printed, then returned unchanged when COUNTER is 1.
	Otherwise the number after the last '=' is raised by 10 and the
	address is returned with the new number in its place.  Raises
	ValueError when the text after the last '=' is not an integer.
	"""
	print("link page:  %s" % (link_page,))
	if COUNTER == 1:
		return link_page
	marker = link_page.rfind("=")
	# Slice at the marker instead of str.strip(value): strip() removes a
	# *set of characters* from both ends, so it could also eat the start
	# of the address.
	prefix = link_page[:marker + 1]
	value = link_page[marker + 1:]
	return prefix + str(int(value) + 10)
		
		
#expandSouth("marlboro.edu"):	


#-----  THE NORTH DIRECTION: working forward  -----#

def expandNorth(pole):
	"""Expand `pole` by one node, working forward.

	Pops the most recently added node from the pole's fringe.  If test()
	reports that its address can be opened, the page is searched and the
	links found on it are added to the fringe; otherwise a notice is
	printed and the node is dropped.
	"""
	print("\nExpanding the fringe...")
	url = table[pole]['fringe'].pop()
	if not test(url):         #Test to see if the url actually works
		print("#---- Can't Access: %s ----#" % (url.name,))
		return
	print("*---- Searching: %s ----*" % (url.name,))
	searchLinks(url, pole)    #search for all the links off it.

	
def inLine(line, pole, url):
	"""Register every usable link address found in one line of html.

	The line is consumed one '<a href' anchor at a time.  Each address
	returned by purifyLink() becomes a url_node with `url` as its parent;
	nodes that addItem() reports as new are pushed onto the fringe of
	`pole`.

	An address is ignored when it still contains a double quote (the
	anchor carried more than a bare href), or when it is empty or a single
	character.  An ignored address no longer ends the scan of the line:
	the anchors after it are still examined.  Scanning stops early only
	when purifyLink() finds no further anchor it can parse.  Returns None.
	"""
	for _ in range(line.count('<a href')):
		link, start = purifyLink(line, True)
		if link is False:                  #nothing parsable is left
			return
		line = line[start:]                #move past this anchor
		if len(link) <= 1 or "\"" in link: #block stupid html garbage
			continue
		node = url_node(link, url)
		if addItem(node, pole):
			table[pole]['fringe'].append(node)
	

def purifyLink(line, multi = False):
	"""purifyLink(line, multi = False)

	Strips all the useless html from a line, leaving only the link address
	to be passed into a url_node.  The first anchor in the line is used,
	whether its href is written with single or with double quotes.

	The address runs from just after the opening quote up to the character
	before the next '>' (the closing quote is dropped), so an anchor with
	extra attributes yields text that still contains a quote.

	Returns the address as a string, or, when `multi` is True, the tuple
	(address, start) where `start` is the index in `line` at which the
	address begins, so that the caller can break the line down one link at
	a time.  Returns (False, False) when the line holds no anchor.
	"""
	#check for variations on link writing in html
	positions = [line.find("<a href='"), line.find("<a href=\"")]
	found = [pos for pos in positions if pos != -1]
	if not found:                 #if there are no more links stop the function
		return (False, False)
	start = min(found) + 9        #the address starts after the opening quote

	link = line[start:]                  #strip all html before the start of the link
	link = link[:link.find(">") - 1]     #strip the closing quote and all html after it

	if multi:
		return (link, start)
	return link
	
		
def accessUrl(url):
	"""Open the address of a url_node and return the page as a list of lines.

	The connection is closed before returning, also when reading fails.
	Errors raised by urlopen() or by reading are passed on to the caller.
	"""
	page = urlopen(url.name)
	try:
		return page.readlines()
	finally:
		page.close()

			
#-----  URL_NODE: STORAGE -----#
class url_node:
	"""
	Storage for one link: its address (`name`) and the url_node it was
	reached from (`parent`).  The two starting nodes carry the marker
	string 'NRoot' or 'SRoot' as their parent instead of a url_node.

	The address is normalised on construction so that duplicates can be
	found: the nodes are kept in a plain dictionary keyed by `name`.
	"""
	def __init__(self, name, parent):
		self.name = name
		self.parent = parent
		self.cleanLink()

	def _siteRoot(self, address):
		"""Return `address` cut off before its path, e.g. 'http://host'."""
		scheme_end = address.find("://")
		host_start = scheme_end + 3 if scheme_end != -1 else 0
		path_start = address.find("/", host_start)
		if path_start == -1:
			return address
		return address[:path_start]

	def cleanLink(self):
		"""Make links uniform so that duplicates can be found and the link can be located.

		* '//host/path' is given the 'http:' scheme.
		* '/path' is resolved against the site root of the parent's
		  address (not appended to the parent's whole address); this
		  needs a url_node parent.
		* one trailing '/' is removed.
		* 'http://' is put in front unless the address already starts
		  with 'http://' or 'https://'.

		An empty name no longer raises IndexError.
		"""
		name = self.name
		if name.startswith("//"):
			name = "http:" + name
		elif name.startswith("/"):
			name = self._siteRoot(self.parent.name) + name
		if name.endswith("/"):
			name = name[:-1]
		if not name.startswith(("http://", "https://")):
			name = "http://" + name
		self.name = name

	def getName(self):
		"""Return the normalised address of this node."""
		return self.name

	def getParent(self):
		"""Return the parent's address, or the root marker for a starting node."""
		if self.parent != 'NRoot' and self.parent != 'SRoot':
			return self.parent.name
		else:
			return self.parent
		
		
# Start the search only when the file is run as a script, so that
# importing it does not begin crawling.
if __name__ == "__main__":
	main()