#!/usr/bin/perl -w ############################################### # stfw_online.cgi - # # Search the web from a web form. # # See stfw for more comments # # and a command-line interface. # # Copyright(c) 2000, Jim Mahoney # ############################################### my $VERSION = '0.06'; # 11/8/2000 use strict; # --- global parameters ---------------------------------------- # These shouldn't change once they're initialized from command line options. # See the documentation below for details my $show_debug = 0; # 1 => print debugging details, 0 => don't. my $show_progress = 0; # 1 => print links as found, 0 => don't. my $only_these_urls = ""; # if present, only visit urls that match this. my $exclude_these_urls = ""; # if present, exclude urls that match this. # --- global variables ----------------------------------------- my $tree; # root of recursive web-pages-searched data tree. my %allUrls = (); # hash of urls => times_it_appeared my $webpages_seen = 0; # total number of distinct webpages examined. my $results = ""; # output of search results put here. # These are passed in as cgi parameters. # I can't use "my" here, because I'm going to set them # using a ${"name"} which refers to a main package global. use vars qw( $start_url $regexp $max_depth $submit ); $start_url = ""; $regexp = ""; $max_depth = ""; $submit = ""; my $default_start_url = "http://www.marlboro.edu/~mahoney/misc/test_search.html"; my $default_regexp = "Larry"; my $default_max_depth = 2; # define strings for the printing of the web page. my $top_of_webpage = <<"END_OF_PAGE_TOP"; Search the Web

Search the Web

END_OF_PAGE_TOP my $bottom_of_webpage = <<"END_OF_PAGE_BOTTOM";

Jim Mahoney (mahoney\@marlboro.edu) END_OF_PAGE_BOTTOM # ----------------------------------------------------------- # --- main ------------------------------------------------- # Make sure that anything printed is treated as html. print "Content-type: text/html\n\n"; # (Note: the CGI module does what I'm doing here *much* easier. # I'm reading in the parameters and translating them by hand # only to illustrate what's really going on. In practice, # you'd never do this stuff by hand, as I am here. # Instead you'd do something like # use CGI; # my $cgi = CGI->new; # $submit = $cgi->param("submit"); # and so on.) # First, get search parameters passed to this cgi script. my $raw_cgi_parameters = ; if ($raw_cgi_parameters) { my @key_value_pairs = split "&", $raw_cgi_parameters; foreach my $pair (@key_value_pairs) { my ($k, $v) = split "=", $pair; # translate some of the funky characters in cgi parameter strings # This list is not complete; I've only covered ~ : / for ($v) { s/%3A/:/g; s/%2F/\//g; s/%7E/~/g; } no strict; ${$k} = $v; # Set the value of variable $k to $v. # print " key '$k', value '$v', soft '${$k}'
\n"; } } # Second, if the "submit" button has been pressed, # do the websearch and save the results in the # $search_results string. if ($submit) { firstTimeSeen($start_url); # Include root in allUrls, $webpages_seen $tree = newNode($start_url); # Initialize data structure. searchTheWeb($tree, 0); setResultsString($tree); } # Third, print out the webpage. print $top_of_webpage; if ($raw_cgi_parameters) { print "

\n"; print " Raw cgi inputs were '$raw_cgi_parameters'
\n"; print " Parameters set to
"; print " start_url = '$start_url'
\n"; print " regexp = '$regexp'
\n"; print " max depth = '$max_depth'
\n"; print " submit = '$submit'
\n"; } if ($submit) { print "

\n"; print " Search results:

\n"; print $results; } print $bottom_of_webpage; exit; # Done. #======================================================== #== subrs =============================================== #======================================================== # These are pretty much the same as # the ones in stfw, except that I've changed # the printing to store things in $results so # that I can output it all at once. ######################################################### # Recursive search of web pages. # This routine is the real workhorse of the whole program; # everything else is preparation or clean up. sub searchTheWeb { use LWP::Simple; # import the $content=get($url) interface to the web my ($node, $depth) = @_; my $url = getUrlOfNode($node); my $content = get($url) or return; # Look for lines that have the given regular expression, and save them. # (Note that this can find at most one match within a given line.) my @lines_matched = ($content =~ /^(.*$regexp.*)$/gm); appendMatchToNode($node, @lines_matched); return if $depth >= $max_depth; # Limit depth of search. my @links = getLinks( $content, $url ); foreach my $link (@links) { # Print out the $link under consideration, # indented by how deep we are, with the # how many times we've already seen it or "no" in # parens afterward. my $will_search = urlWeWantToSearch($link); if ($show_progress) { my $times = $allUrls{$link} ? " ($allUrls{$link})" : $will_search ? "" : " (no)"; print " ". " "x$depth . $link . $times . "\n"; } # If we haven't searched this link before, # and if it is one that we want to search, continue # our search recursively at a lower depth. if ( firstTimeSeen($link) and $will_search ) { my $subnode = appendLinkToNode($node, $link); searchTheWeb($subnode, $depth+1); } } } #################################################### # Show 'em what we got. sub setResultsString { my $s = $webpages_seen==1 ? "" : "s"; $results .= " $webpages_seen web page$s searched.
"; $results .= "
-------- tree of websites searched -------------
"; printUrlTree( $tree, 0 ); $results .= "
-------- matches found -------------------------
"; printMatches( $tree, 0 ); $results .= "
"; } ########################### # Recursively print url for each node. Indent according to depth. sub printUrlTree { my ($node, $depth) = @_; $results .= " "x$depth . " url='" . getUrlOfNode($node) . "' at depth=" . $depth . "
"; foreach my $subnode (getSubnodesOfNode($node)) { printUrlTree( $subnode, $depth+1); } } ########################### # Recursively search for and print stored successful matches. sub printMatches { my ($node, $depth) = @_; my $matches = matchesFoundAtNode($node); if ($matches) { $results .= "url: " . getUrlOfNode($node). "
"; $results .= $matches . "
"; } foreach my $subnode (getSubnodesOfNode($node)) { printMatches( $subnode, $depth+1); } } ########################################## # Return true if this url is one that we do want to search. sub urlWeWantToSearch { my ($url) = @_; if ($only_these_urls and $exclude_these_urls) { return ($url =~ m/$only_these_urls/ and $url !~ m/$exclude_these_urls/); } elsif ($only_these_urls) { return $url =~ m/$only_these_urls/; } elsif ($exclude_these_urls) { return $url !~ m/$exclude_these_urls/; } else { return 1; } } ######################################## # Return a list of links for the given content and url. # This is a big regular expression exercise, which # here I avoid by calling some CPAN routines to do the job. # (See stfw sub getLinks { use HTML::SimpleLinkExtor; # extracts links from web pages use URI; # converts URL's to absolute form my ($content, $url) = @_; my $p = HTML::SimpleLinkExtor->new(); $p->parse( $content ); my @raw_links = $p->href; # get all links in href tags. my @links = (); foreach my $link (@raw_links) { # don't follow mail links, pictures, pdf files, etc. next if $link =~ m/^mailto:/; next if $link =~ m/\.(jpg|jpeg|gif|pdf|png|css)$/i; # Any file.html# push @links, URI->new_abs( $link, $url ); # absoulte URL } return @links; } ################################################# # Keep track of how many times we see each url while searching. # Also increment total number of pages seen. # Return true if this is the first time we've seen this one, # return false if we've seen it before. sub firstTimeSeen { my ($candidateUrl) = @_; $allUrls{$candidateUrl}++; # increment times we've seen this. if ( $allUrls{$candidateUrl}==1 ) { $webpages_seen++; return 1; } else { return 0; } } #------- start tree/node routines ------------------------- # We're traversing the web as a tree. # So why not use a similar data structure to save what we've done? # Let node of this data structure be a hashref, # representing one web page, and containing # * the url of that web page # * matches for the string we're looking for (ref to list of strings) # * links (urls) that the page contains (ref to list of hashrefs # # # Here's a picture of what this data structure looks like. # (For you fans of ascii art.) # # $node # | # | ----------------------------------- # | | hash named "" (anonymous) | # | | | # |->| KEY | VALUE | # | -------------------------------| |--> ( "match1", "match2", ... ) # | itsUrl | www.marlboro.edu/ | | # | itsMatches | ref to list -------- # | itsLinks | ref to list ---------->( $subnode1, $subnode2, ...) # | | | | # ----------------------------------- | | # | | # | | # ------------------- | | # | hash |<-----------| | # | KEY | VALUE | | # |itsUrl | ~markf/ | ----------- | # | etc.... | | hash |<-------| # ------------------- | etc... | # ----------- # # ################################## # Return a new node for a given url. sub newNode { my ($url) = @_; return { itsUrl => $url, itsMatches => [], itsLinks => [], } } ################################## # Return the url for a given node. sub getUrlOfNode { my ($node) = @_; return $node->{itsUrl}; } ################################## # Append a match (a string) to # the list of matches found in a node. sub appendMatchToNode { my ($node, @stuff_found) = @_; push @{$node->{itsMatches}}, @stuff_found; return; } ################################### # return a string of lines that matched for this url, # or the empty string if there was no match. sub matchesFoundAtNode { my ($node) = @_; my $result = ""; foreach my $match (@{$node->{itsMatches}}) { chomp($match); $result .= " => $match
"; } return $result; } ################################## # Given a url and a parent node, # create a new subnode with that url # and append the subnode to the parent. # Return the new subnode. sub appendLinkToNode { my ($node, $url) = @_; my $subnode = newNode($url); push @{$node->{itsLinks}}, $subnode; return $subnode; } ################################## # return list of subnodes sub getSubnodesOfNode { my ($node) = @_; return @{$node->{itsLinks}}; } # ----------- end of tree/node routines ----------------------- ################################################### # Return a short string with the name and version of this program. sub versionString { return "stfw_online, version $VERSION - search the web via cgi form"; }