#!/bin/env perl ############################################### # # stfw_online.cgi - # # # Search the web from a web form. # # # Copyright(c) 2000, Jim Mahoney # ############################################### #my $VERSION = '0.06'; # 11/8/2000 my $VERSION = '0.07'; # 11/2004 my $DEBUG = 0; use warnings; use strict; # --- global variables ----------------------------------------- my $tree; # root of recursive web-pages-searched data tree. my %allUrls = (); # hash of urls => times_it_appeared my $webpages_seen = 0; # total number of distinct webpages examined. my $results = ""; # --- cgi parameters from the URL --------------------------- use CGI qw(:standard); my $default_start_url = "http://cs.marlboro.edu/term/fall04/web_perl/code/stfw/test_search.html"; my $default_regexp = "Larry"; my $default_max_depth = 2; my $start_url = param('start_url') || $default_start_url; my $regexp = param('regexp') || $default_regexp; my $max_depth = param('max_depth') || $default_max_depth; my $submit = param('submit') || ""; # Keep things from blowing up - it takes too long otherwise $max_depth = 4 if $max_depth > 4; # define strings for the printing of the web page. my $top_of_webpage = <<"END_OF_PAGE_TOP"; Search the Web

Search the Web

This does a recursive search of the web:
starting at a given web page, we look for the given regexp on that page and on each link on that web page, and so on.

Warning: this currently returns entire lines of text - that may be a lot of text, depending on the web page.

See LWP::Simple and HTML::LinkExtor

END_OF_PAGE_TOP my $bottom_of_webpage = <<"END_OF_PAGE_BOTTOM";

Jim Mahoney (mahoney\@marlboro.edu) END_OF_PAGE_BOTTOM # ----------------------------------------------------------- # --- main ------------------------------------------------- # First, make sure that anything printed is treated as html. print header; # Second, if the "submit" button has been pressed, # do the websearch and save the results in the # $search_results string. if ($submit) { firstTimeSeen($start_url); # Include root in allUrls, $webpages_seen $tree = newNode($start_url); # Initialize data structure. searchTheWeb($tree, 0); makeResultsString($tree); } # Third, print out the webpage. print $top_of_webpage; use Data::Dumper; if ($DEBUG) { print qq{

    Parameters set to                        
     start_url = '$start_url'                 
     regexp    = '$regexp'                    
     max depth = '$max_depth'

}; } if ($submit) { print qq{

Search results:

$results }; } if ($DEBUG){ print " allUrls :
" . join("
",keys(%allUrls)) . "
"; print "

";
 print " tree = " . Dumper($tree);
 print "

"; } print $bottom_of_webpage; exit; # Done. #======================================================== #== subrs =============================================== #======================================================== # These are pretty much the same as # the ones in the command line version stfw, # except that I've changed # the printing to store things in $results so # that I can output it all at once. # --- global parameters ---------------------------------------- # These shouldn't change once they're initialized from command line options. # See the documentation below for details my $show_progress = 1; # 1 => print links as found, 0 => don't. my $only_these_urls = ""; # if present, only visit urls that match this. my $exclude_these_urls = ""; # if present, exclude urls that match this. ######################################################### # Recursive search of web pages. # This routine is the real workhorse of the whole program; # everything else is preparation or clean up. sub searchTheWeb { use LWP::Simple; # import the $content=get($url) interface to the web my ($node, $depth) = @_; my $url = getUrlOfNode($node); my $content = get($url) or return; # Look for lines that have the given regular expression, and save them. # (Note that this can find at most one match within a given line.) my @lines_matched = ($content =~ /^(.*$regexp.*)$/gm); appendMatchToNode($node, @lines_matched); return if $depth >= $max_depth; # Limit depth of search. my @links = getLinks( $content, $url ); foreach my $link (@links) { # Print out the $link under consideration, # indented by how deep we are, with the # how many times we've already seen it or "no" in # parens afterward. my $will_search = urlWeWantToSearch($link); if ($show_progress) { my $times = $allUrls{$link} ? " ($allUrls{$link})" : $will_search ? "" : " (no)"; print " ". " "x$depth . $link . $times . "\n"; } # If we haven't searched this link before, # and if it is one that we want to search, continue # our search recursively at a lower depth. if ( firstTimeSeen($link) and $will_search ) { my $subnode = appendLinkToNode($node, $link); searchTheWeb($subnode, $depth+1); } } } #################################################### # Show 'em what we got. sub makeResultsString { my ($tree) = @_; my $s = $webpages_seen==1 ? "" : "s"; $results .= " $webpages_seen web page$s searched.
"; $results .= "
-------- tree of websites searched -------------
"; printUrlTree( $tree, 0 ); $results .= "
-------- matches found -------------------------
"; printMatches( $tree, 0 ); $results .= "
"; } ########################### # Recursively print url for each node. Indent according to depth. sub printUrlTree { my ($node, $depth) = @_; $results .= " "x$depth . " url='" . getUrlOfNode($node) . "' at depth=" . $depth . "
"; foreach my $subnode (getSubnodesOfNode($node)) { printUrlTree( $subnode, $depth+1); } } ########################### # Recursively search for and print stored successful matches. sub printMatches { my ($node, $depth) = @_; my $matches = matchesFoundAtNode($node); if ($matches) { $results .= "url: " . getUrlOfNode($node). "
"; $results .= $matches . "
"; } foreach my $subnode (getSubnodesOfNode($node)) { printMatches( $subnode, $depth+1); } } ########################################## # Return true if this url is one that we do want to search. sub urlWeWantToSearch { my ($url) = @_; if ($only_these_urls and $exclude_these_urls) { return ($url =~ m/$only_these_urls/ and $url !~ m/$exclude_these_urls/); } elsif ($only_these_urls) { return $url =~ m/$only_these_urls/; } elsif ($exclude_these_urls) { return $url !~ m/$exclude_these_urls/; } else { return 1; } } ######################################## # Return a list of links for the given content and url. # This is a big regular expression exercise, which # here I avoid by calling some CPAN routines to do the job. sub getLinks { use HTML::SimpleLinkExtor; # extracts links from web pages use URI; # converts URL's to absolute form my ($content, $url) = @_; my $p = HTML::SimpleLinkExtor->new(); $p->parse( $content ); my @raw_links = $p->href; # get all links in href tags. my @links = (); foreach my $link (@raw_links) { # don't follow mail links, pictures, pdf files, etc. next if $link =~ m/^mailto:/; next if $link =~ m/\.(jpg|jpeg|gif|pdf|png|css)$/i; # Any file.html# push @links, URI->new_abs( $link, $url ); # absoulte URL } return @links; } ################################################# # Keep track of how many times we see each url while searching. # Also increment total number of pages seen. # Return true if this is the first time we've seen this one, # return false if we've seen it before. sub firstTimeSeen { my ($candidateUrl) = @_; $allUrls{$candidateUrl}++; # increment times we've seen this. if ( $allUrls{$candidateUrl}==1 ) { $webpages_seen++; return 1; } else { return 0; } } #------- start tree/node routines ------------------------- # We're traversing the web as a tree. # So why not use a similar data structure to save what we've done? # Let node of this data structure be a hashref, # representing one web page, and containing # * the url of that web page # * matches for the string we're looking for (ref to list of strings) # * links (urls) that the page contains (ref to list of hashrefs # # # Here's a picture of what this data structure looks like. # (For you fans of ascii art.) # # $node # | # | ----------------------------------- # | | hash named "" (anonymous) | # | | | # |->| KEY | VALUE | # | -------------------------------| |--> ( "match1", "match2", ... ) # | itsUrl | www.marlboro.edu/ | | # | itsMatches | ref to list -------- # | itsLinks | ref to list ---------->( $subnode1, $subnode2, ...) # | | | | # ----------------------------------- | | # | | # | | # ------------------- | | # | hash |<-----------| | # | KEY | VALUE | | # |itsUrl | ~markf/ | ----------- | # | etc.... | | hash |<-------| # ------------------- | etc... | # ----------- # # ################################## # Return a new node for a given url. sub newNode { my ($url) = @_; return { itsUrl => $url, itsMatches => [], itsLinks => [], } } ################################## # Return the url for a given node. sub getUrlOfNode { my ($node) = @_; return $node->{itsUrl}; } ################################## # Append a match (a string) to # the list of matches found in a node. sub appendMatchToNode { my ($node, @stuff_found) = @_; push @{$node->{itsMatches}}, @stuff_found; return; } ################################### # return a string of lines that matched for this url, # or the empty string if there was no match. sub matchesFoundAtNode { my ($node) = @_; my $result = ""; foreach my $match (@{$node->{itsMatches}}) { chomp($match); $match =~ s/>/>/g; # convert < > to < and > $match =~ s/ $match
"; } return $result; } ################################## # Given a url and a parent node, # create a new subnode with that url # and append the subnode to the parent. # Return the new subnode. sub appendLinkToNode { my ($node, $url) = @_; my $subnode = newNode($url); push @{$node->{itsLinks}}, $subnode; return $subnode; } ################################## # return list of subnodes sub getSubnodesOfNode { my ($node) = @_; return @{$node->{itsLinks}}; } # ----------- end of tree/node routines ----------------------- ################################################### # Return a short string with the name and version of this program. sub versionString { return "stfw_online, version $VERSION - search the web via cgi form"; }