#!/bin/env perl
###############################################
#
# stfw_online.cgi -                           #
#
# Search the web from a web form.             #
#
# Copyright(c) 2000,  Jim Mahoney             #
###############################################
#my $VERSION = '0.06';	# 11/8/2000
my $VERSION = '0.07';	# 11/2004
my $DEBUG = 0;
use warnings;
use strict;

# --- global variables  -----------------------------------------
my $tree;		# root of recursive web-pages-searched  data tree.
my %allUrls       = ();	# hash of urls => times_it_appeared
my $webpages_seen = 0;  # total number of distinct webpages examined.
my $results       = "";

# --- cgi parameters from the URL ---------------------------

use CGI qw(:standard);

my $default_start_url	= "http://cs.marlboro.edu/term/fall04/web_perl/code/stfw/test_search.html";
my $default_regexp	= "Larry";
my $default_max_depth	= 2;

my $start_url  = param('start_url') || $default_start_url;
my $regexp     = param('regexp')    || $default_regexp;
my $max_depth  = param('max_depth') || $default_max_depth;
my $submit     = param('submit')    || "";

# Keep things from blowing up - it takes too long otherwise
$max_depth = 4 if $max_depth > 4;

# define strings for the printing of the web page.
my $top_of_webpage = <<"END_OF_PAGE_TOP";
<html>
 <head><title>Search the Web</title></head>
 <body>
     <h1>Search the Web</h1>

     This does a recursive search of the web:<br>
     starting at a given web page, we look for the given regexp on that page
     and on each link on that web page, and so on.<p>

     Warning: this currently returns entire lines of text - that may be a lot 
     of text, depending on the web page.
     <p>

     See 
    <a href="http://www.perldoc.com/perl5.8.0/lib/LWP/Simple.html">LWP::Simple</a> 
    and <a href="http://www.perldoc.com/perl5.6.1/lib/HTML/LinkExtor.html">HTML::LinkExtor</a>
     <p>

     <form method=POST action="stfw_online.cgi">

      starting url:      
      <input type="TEXT" name="start_url" value="$start_url" size=80><br>

      search expression: 
      <input type="TEXT" name="regexp"    value="$regexp"    size=64><br>

      depth (max 4):             
      <input type="TEXT" name="max_depth" value="$max_depth" size=6><br>

      <input type="SUBMIT" name="submit" value="do it">

     </form>
END_OF_PAGE_TOP

my $bottom_of_webpage = <<"END_OF_PAGE_BOTTOM";
 <hr noshade size=1>
 Jim Mahoney (mahoney\@marlboro.edu)
 </body>
</html>
END_OF_PAGE_BOTTOM

# -----------------------------------------------------------
# --- main  -------------------------------------------------
# First, make sure that anything printed is treated as html.
print header;

# Second, if the "submit" button has been pressed,
# do the websearch and save the results in the 
# $search_results string.

if ($submit) {
  firstTimeSeen($start_url);	# Include root in allUrls, $webpages_seen
  $tree = newNode($start_url);	# Initialize data structure.
  searchTheWeb($tree, 0);
  makeResultsString($tree);
}

# Third, print out the webpage.
print $top_of_webpage;
use Data::Dumper;

if ($DEBUG) {
  print qq{
    <hr noshade size=1>
    <pre>
    Parameters set to                        
     start_url = '$start_url'                 
     regexp    = '$regexp'                    
     max depth = '$max_depth'
    </pre>
  };
}

if ($submit) {
  print qq{
    <hr noshade size=1>
    <b>Search results:</b> <p>
   $results
   };
}

if ($DEBUG){
 print " allUrls : <br>" . join("<br>",keys(%allUrls)) . "<br>";
 print "<pre>";
 print " tree = " . Dumper($tree);
 print "</pre>";
}

print $bottom_of_webpage;

exit; # Done.

#========================================================
#== subrs ===============================================
#========================================================

# These are pretty much the same as
# the ones in the command line version stfw, 
# except that I've changed
# the printing to store things in $results so
# that I can output it all at once.

# --- global parameters  ----------------------------------------
# These shouldn't change once they're initialized from command line options.
# See the documentation below for details
my $show_progress       = 1;	# 1 => print links as found, 0 => don't.
my $only_these_urls     = "";   # if present, only visit urls that match this.
my $exclude_these_urls  = "";   # if present, exclude urls that match this.

#########################################################
# Recursive search of web pages.
# This routine is the real workhorse of the whole program;
# everything else is preparation or clean up.
sub searchTheWeb {
  use LWP::Simple;             # import the $content=get($url) interface to the web
  my ($node, $depth) = @_;
  my $url = getUrlOfNode($node);
  my $content = get($url) or return;

  # Look for lines that have the given regular expression, and save them.
  # (Note that this can find at most one match within a given line.)
  my @lines_matched = ($content =~ /^(.*$regexp.*)$/gm);
  appendMatchToNode($node, @lines_matched);

  return if $depth >= $max_depth;	# Limit depth of search.

  my @links = getLinks( $content, $url );
  foreach my $link (@links) {

    # Print out the $link under consideration,
    # indented by how deep we are, with the
    # how many times we've already seen it or "no" in
    # parens afterward.
    my $will_search = urlWeWantToSearch($link);
    if ($show_progress) {
      my $times = $allUrls{$link} ? " ($allUrls{$link})"
                                  : $will_search ? "" : " (no)";
      print " ". " "x$depth . $link . $times . "\n";
    }

    # If we haven't searched this link before,
    # and if it is one that we want to search, continue
    # our search recursively at a lower depth.
    if ( firstTimeSeen($link) and $will_search ) {
      my $subnode = appendLinkToNode($node, $link);
      searchTheWeb($subnode, $depth+1);
    }
  }
}

####################################################
# Show 'em what we got.
sub makeResultsString {
  my ($tree) = @_;
  my $s = $webpages_seen==1 ? "" : "s";
  $results .= "   $webpages_seen web page$s searched. <br>";
  $results .= "<br>-------- tree of websites searched ------------- <br>";
  printUrlTree( $tree, 0 );
  $results .= "<br>-------- matches found ------------------------- <br>";
  printMatches( $tree, 0 );
  $results .= " <br>";
}

###########################
# Recursively print url for each node.  Indent according to depth.
sub printUrlTree {
  my ($node, $depth) = @_;
  $results .= "  "x$depth . " url='" . getUrlOfNode($node) . "' at depth=" . $depth . "<br>";
  foreach my $subnode (getSubnodesOfNode($node)) {
    printUrlTree( $subnode, $depth+1);
  }
}

###########################
# Recursively search for and print stored successful matches.
sub printMatches {
  my ($node, $depth) = @_;
  my $matches = matchesFoundAtNode($node);
  if ($matches) {
    $results .= "url: " . getUrlOfNode($node). "<br>";
    $results .= $matches . "<br>";
  }
  foreach my $subnode (getSubnodesOfNode($node)) {
    printMatches( $subnode, $depth+1);
  }
}


##########################################
# Return true if this url is one that we do want to search.
sub urlWeWantToSearch {
  my ($url) = @_;
  if ($only_these_urls and $exclude_these_urls) {
    return ($url =~ m/$only_these_urls/ and $url !~ m/$exclude_these_urls/);
  }
  elsif ($only_these_urls) {
    return $url =~ m/$only_these_urls/;
  }
  elsif ($exclude_these_urls) {
    return $url !~ m/$exclude_these_urls/;
  }
  else {
    return 1;
  }
}

########################################
# Return a list of links for the given content and url.
# This is a big regular expression exercise, which
# here I avoid by calling some CPAN routines to do the job.
sub getLinks {
  use HTML::SimpleLinkExtor;	# extracts links from web pages
  use URI;			# converts URL's to absolute form
  my ($content, $url) = @_;
  my $p = HTML::SimpleLinkExtor->new();
  $p->parse( $content );
  my @raw_links = $p->href;	# get all links in href tags.
  my @links = ();
  foreach my $link (@raw_links) {
    # don't follow mail links, pictures, pdf files, etc.
    next if $link =~ m/^mailto:/;
    next if $link =~ m/\.(jpg|jpeg|gif|pdf|png|css)$/i;
    # Any file.html#
    push @links, URI->new_abs( $link, $url );  # absoulte URL
  }
  return @links;
}

#################################################
# Keep track of how many times we see each url while searching.
# Also increment total number of pages seen.
# Return true if this is the first time we've seen this one,
# return false if we've seen it before.
sub firstTimeSeen {
  my ($candidateUrl) = @_;
  $allUrls{$candidateUrl}++;	# increment times we've seen this.
  if ( $allUrls{$candidateUrl}==1 ) {
    $webpages_seen++;
    return 1;
  }
  else {
    return 0;
  }
}

#------- start tree/node routines -------------------------

# We're traversing the web as a tree.
# So why not use a similar data structure to save what we've done?
# Let node of this data structure be a hashref,
# representing one web page, and containing
#   * the url of that web page
#   * matches for the string we're looking for (ref to list of strings)
#   * links (urls) that the page contains (ref to list of hashrefs
#
#
# Here's a picture of what this data structure looks like.
# (For you fans of ascii art.)
#
# $node
# |
# |  -----------------------------------
# |  | hash named  "" (anonymous)      |
# |  |                                 |
# |->|  KEY        |  VALUE            |      
#    |  -------------------------------| |--> ( "match1", "match2", ... )
#    |  itsUrl     | www.marlboro.edu/ | |
#    |  itsMatches | ref to list  -------- 
#    |  itsLinks   | ref to list  ---------->( $subnode1, $subnode2, ...)
#    |                                 |       |          |
#    -----------------------------------       |          |
#                                              |          |
#                                              |          |
#               -------------------            |          |
#               | hash            |<-----------|          |
#               |  KEY  | VALUE   |                       |
#               |itsUrl | ~markf/ |    -----------        |
#               |  etc....        |    | hash    |<-------|
#               -------------------    |  etc... |
#       			       -----------
#
#

##################################
# Return a new node for a given url.
sub newNode {
  my ($url) = @_;
  return {
	  itsUrl		=> $url,
	  itsMatches		=> [],
	  itsLinks		=> [],
	 }
}

##################################
# Return the url for a given node.
sub getUrlOfNode {
  my ($node) = @_;
  return $node->{itsUrl};
}

##################################
# Append a match (a string) to
# the list of matches found in a node.
sub appendMatchToNode {
  my ($node, @stuff_found) = @_;
  push @{$node->{itsMatches}}, @stuff_found;
  return;
}

###################################
# return a string of lines that matched for this url,
# or the empty string if there was no match.
sub matchesFoundAtNode {
  my ($node) = @_;
  my $result = "";
  foreach my $match (@{$node->{itsMatches}}) {
    chomp($match);
    $match =~ s/>/&gt;/g;     # convert < > to &lt; and &gt;  
    $match =~ s/</&lt;/g;
    $result .= "  => $match <br>";
  }
  return $result;
}

##################################
# Given a url and a parent node,
# create a new subnode with that url
# and append the subnode to the parent.
# Return the new subnode.
sub appendLinkToNode {
  my ($node, $url) = @_;
  my $subnode = newNode($url);
  push @{$node->{itsLinks}}, $subnode;
  return $subnode;
}

##################################
# return list of subnodes
sub getSubnodesOfNode {
  my ($node) = @_;
  return @{$node->{itsLinks}};
}

# ----------- end of tree/node routines -----------------------

###################################################
# Return a short string with the name and version of this program.
sub versionString {
  return "stfw_online, version $VERSION  -  search the web via cgi form";
}

