#!/usr/bin/perl -w
###############################################
# stfw_online.cgi -                           #
# Search the web from a web form.             #
# See stfw for more comments                  #
# and a command-line interface.               #
# Copyright(c) 2000,  Jim Mahoney             #
###############################################
my $VERSION = '0.06';	# 11/8/2000
use strict;

# --- global parameters  ----------------------------------------
# These shouldn't change once they're initialized from command line options.
# See the documentation below for details
my $show_debug    = 0;	# 1 => print debugging details, 0 => don't.
my $show_progress = 0;	# 1 => print links as found, 0 => don't.
my $only_these_urls     = ""; # if present, only visit urls that match this.
my $exclude_these_urls  = ""; # if present, exclude urls that match this.

# --- global variables  -----------------------------------------
my $tree;		# root of recursive web-pages-searched  data tree.
my %allUrls = ();	# hash of urls => times_it_appeared
my $webpages_seen = 0;  # total number of distinct webpages examined.
my $results       = ""; # output of search results put here.

# These are passed in as cgi parameters.
# I can't use "my" here, because I'm going to set them
# using a  ${"name"} which refers to a main package global.
use vars qw( $start_url $regexp $max_depth $submit );
$start_url = "";
$regexp = "";
$max_depth = "";
$submit = "";

my $default_start_url	= "http://www.marlboro.edu/~mahoney/misc/test_search.html";
my $default_regexp	= "Larry";
my $default_max_depth	= 2;

# define strings for the printing of the web page.
my $top_of_webpage = <<"END_OF_PAGE_TOP";
<html>
 <head><title>Search the Web</title></head>
 <body>
     <h1>Search the Web</h1>

     <form method=POST action=stfw_online.cgi>

      starting url:      
      <input type=TEXT name=start_url value='$default_start_url' size=64><br>

      search expression: 
      <input type=TEXT name=regexp    value='$default_regexp' size=32><br>

      depth:             
      <input type=TEXT name=max_depth value='$default_max_depth' size=6><br>

      <input type=SUBMIT name="submit">

     </form>
END_OF_PAGE_TOP

my $bottom_of_webpage = <<"END_OF_PAGE_BOTTOM";
 <hr noshade size=1>
 Jim Mahoney (mahoney\@marlboro.edu)
 </body>
</html>
END_OF_PAGE_BOTTOM

# -----------------------------------------------------------
# --- main  -------------------------------------------------
# Make sure that anything printed is treated as html.
print "Content-type: text/html\n\n";

# (Note: the CGI module does what I'm doing here *much* easier. 
#  I'm reading in the parameters and translating them by hand
#  only to illustrate what's really going on.  In practice,
#  you'd never do this stuff by hand, as I am here.
#  Instead you'd do something like
#     use CGI;
#     my $cgi = CGI->new;
#     $submit = $cgi->param("submit");
#  and so on.)

# First, get search parameters passed to this cgi script.
my $raw_cgi_parameters = <STDIN>;
if ($raw_cgi_parameters) {
  my @key_value_pairs = split "&", $raw_cgi_parameters;
  foreach my $pair (@key_value_pairs) {
    my ($k, $v) = split "=", $pair;
    # translate some of the funky characters in cgi parameter strings
    # This list is not complete; I've only covered ~ : /
    for ($v) {
      s/%3A/:/g;
      s/%2F/\//g;
      s/%7E/~/g;
    }
    no strict;
    ${$k} = $v;     # Set the value of variable $k to $v.
    # print " key '$k', value '$v',  soft '${$k}' <br> \n";
  }
}

# Second, if the "submit" button has been pressed,
# do the websearch and save the results in the 
# $search_results string.

if ($submit) {
  firstTimeSeen($start_url);	# Include root in allUrls, $webpages_seen
  $tree = newNode($start_url);	# Initialize data structure.
  searchTheWeb($tree, 0);
  setResultsString($tree);
}

# Third, print out the webpage.
print $top_of_webpage;
if ($raw_cgi_parameters) {
  print " <hr noshade size=1> \n";
  print " Raw cgi inputs were '$raw_cgi_parameters' <br> \n";
  print " Parameters set to <br>";
  print " start_url = '$start_url' <br> \n";
  print " regexp    = '$regexp'    <br> \n";
  print " max depth = '$max_depth' <br> \n";
  print " submit = '$submit' <br> \n";
}
if ($submit) {
  print " <hr noshade size=1> \n";
  print " Search results: <p> \n";
  print $results;
}
print $bottom_of_webpage;
exit; # Done.

#========================================================
#== subrs ===============================================
#========================================================

# These are pretty much the same as
# the ones in stfw, except that I've changed
# the printing to store things in $results so
# that I can output it all at once.

#########################################################
# Recursive search of web pages.
# This routine is the real workhorse of the whole program;
# everything else is preparation or clean up.
sub searchTheWeb {
  use LWP::Simple;  # import the $content=get($url) interface to the web
  my ($node, $depth) = @_;
  my $url = getUrlOfNode($node);
  my $content = get($url) or return;

  # Look for lines that have the given regular expression, and save them.
  # (Note that this can find at most one match within a given line.)
  my @lines_matched = ($content =~ /^(.*$regexp.*)$/gm);
  appendMatchToNode($node, @lines_matched);

  return if $depth >= $max_depth;	# Limit depth of search.

  my @links = getLinks( $content, $url );
  foreach my $link (@links) {

    # Print out the $link under consideration,
    # indented by how deep we are, with the
    # how many times we've already seen it or "no" in
    # parens afterward.
    my $will_search = urlWeWantToSearch($link);
    if ($show_progress) {
      my $times = $allUrls{$link} ? " ($allUrls{$link})"
                                  : $will_search ? "" : " (no)";
      print " ". " "x$depth . $link . $times . "\n";
    }

    # If we haven't searched this link before,
    # and if it is one that we want to search, continue
    # our search recursively at a lower depth.
    if ( firstTimeSeen($link) and $will_search ) {
      my $subnode = appendLinkToNode($node, $link);
      searchTheWeb($subnode, $depth+1);
    }
  }
}

####################################################
# Show 'em what we got.
sub setResultsString {
  my $s = $webpages_seen==1 ? "" : "s";
  $results .= "   $webpages_seen web page$s searched. <br>";
  $results .= "<br>-------- tree of websites searched ------------- <br>";
  printUrlTree( $tree, 0 );
  $results .= "<br>-------- matches found ------------------------- <br>";
  printMatches( $tree, 0 );
  $results .= " <br>";
}

###########################
# Recursively print url for each node.  Indent according to depth.
sub printUrlTree {
  my ($node, $depth) = @_;
  $results .= "  "x$depth . " url='" . getUrlOfNode($node) . "' at depth=" . $depth . "<br>";
  foreach my $subnode (getSubnodesOfNode($node)) {
    printUrlTree( $subnode, $depth+1);
  }
}

###########################
# Recursively search for and print stored successful matches.
sub printMatches {
  my ($node, $depth) = @_;
  my $matches = matchesFoundAtNode($node);
  if ($matches) {
    $results .= "url: " . getUrlOfNode($node). "<br>";
    $results .= $matches . "<br>";
  }
  foreach my $subnode (getSubnodesOfNode($node)) {
    printMatches( $subnode, $depth+1);
  }
}


##########################################
# Return true if this url is one that we do want to search.
sub urlWeWantToSearch {
  my ($url) = @_;
  if ($only_these_urls and $exclude_these_urls) {
    return ($url =~ m/$only_these_urls/ and $url !~ m/$exclude_these_urls/);
  }
  elsif ($only_these_urls) {
    return $url =~ m/$only_these_urls/;
  }
  elsif ($exclude_these_urls) {
    return $url !~ m/$exclude_these_urls/;
  }
  else {
    return 1;
  }
}

########################################
# Return a list of links for the given content and url.
# This is a big regular expression exercise, which
# here I avoid by calling some CPAN routines to do the job.
# (See stfw
sub getLinks {
  use HTML::SimpleLinkExtor;	# extracts links from web pages
  use URI;			# converts URL's to absolute form
  my ($content, $url) = @_;
  my $p = HTML::SimpleLinkExtor->new();
  $p->parse( $content );
  my @raw_links = $p->href;	# get all links in href tags.
  my @links = ();
  foreach my $link (@raw_links) {
    # don't follow mail links, pictures, pdf files, etc.
    next if $link =~ m/^mailto:/;
    next if $link =~ m/\.(jpg|jpeg|gif|pdf|png|css)$/i;
    # Any file.html#
    push @links, URI->new_abs( $link, $url );  # absoulte URL
  }
  return @links;
}

#################################################
# Keep track of how many times we see each url while searching.
# Also increment total number of pages seen.
# Return true if this is the first time we've seen this one,
# return false if we've seen it before.
sub firstTimeSeen {
  my ($candidateUrl) = @_;
  $allUrls{$candidateUrl}++;	# increment times we've seen this.
  if ( $allUrls{$candidateUrl}==1 ) {
    $webpages_seen++;
    return 1;
  }
  else {
    return 0;
  }
}

#------- start tree/node routines -------------------------

# We're traversing the web as a tree.
# So why not use a similar data structure to save what we've done?
# Let node of this data structure be a hashref,
# representing one web page, and containing
#   * the url of that web page
#   * matches for the string we're looking for (ref to list of strings)
#   * links (urls) that the page contains (ref to list of hashrefs
#
#
# Here's a picture of what this data structure looks like.
# (For you fans of ascii art.)
#
# $node
# |
# |  -----------------------------------
# |  | hash named  "" (anonymous)      |
# |  |                                 |
# |->|  KEY        |  VALUE            |      
#    |  -------------------------------| |--> ( "match1", "match2", ... )
#    |  itsUrl     | www.marlboro.edu/ | |
#    |  itsMatches | ref to list  -------- 
#    |  itsLinks   | ref to list  ---------->( $subnode1, $subnode2, ...)
#    |                                 |       |          |
#    -----------------------------------       |          |
#                                              |          |
#                                              |          |
#               -------------------            |          |
#               | hash            |<-----------|          |
#               |  KEY  | VALUE   |                       |
#               |itsUrl | ~markf/ |    -----------        |
#               |  etc....        |    | hash    |<-------|
#               -------------------    |  etc... |
#       			       -----------
#
#

##################################
# Return a new node for a given url.
sub newNode {
  my ($url) = @_;
  return {
	  itsUrl		=> $url,
	  itsMatches		=> [],
	  itsLinks		=> [],
	 }
}

##################################
# Return the url for a given node.
sub getUrlOfNode {
  my ($node) = @_;
  return $node->{itsUrl};
}

##################################
# Append a match (a string) to
# the list of matches found in a node.
sub appendMatchToNode {
  my ($node, @stuff_found) = @_;
  push @{$node->{itsMatches}}, @stuff_found;
  return;
}

###################################
# return a string of lines that matched for this url,
# or the empty string if there was no match.
sub matchesFoundAtNode {
  my ($node) = @_;
  my $result = "";
  foreach my $match (@{$node->{itsMatches}}) {
    chomp($match);
    $result .= "  => $match <br>";
  }
  return $result;
}

##################################
# Given a url and a parent node,
# create a new subnode with that url
# and append the subnode to the parent.
# Return the new subnode.
sub appendLinkToNode {
  my ($node, $url) = @_;
  my $subnode = newNode($url);
  push @{$node->{itsLinks}}, $subnode;
  return $subnode;
}

##################################
# return list of subnodes
sub getSubnodesOfNode {
  my ($node) = @_;
  return @{$node->{itsLinks}};
}

# ----------- end of tree/node routines -----------------------

###################################################
# Return a short string with the name and version of this program.
sub versionString {
  return "stfw_online, version $VERSION  -  search the web via cgi form";
}

