#!/usr/bin/perl use integer; # No use for floating-point numbers here. my ( $WORDS, %WORDS ); SCAN_WORDS: { # Locate a stem word list: now very Unix-dependent. my ( $words_dir ); foreach $words_dir ( qw(/usr/share/dict /usr/dict .) ) { $WORDS = "$words_dir/words"; last SCAN_WORDS if -f $WORDS; } } die "$0: failed to find the stop list database.\n" unless -f $WORDS; print "Found the stop list database at '$WORDS'.\n"; open( WORDS, $WORDS ) or die "$0: failed to open file '$WORDS': $!\n"; sub find_word { my $word = $_[0]; # The word to be looked for. use Search::Dict; unless ( exists $WORDS{ $word } ) { # If $word has not yet ever been tried. my $pos = look( *WORDS, $word, 0, 1 ); if ( $pos < 0 ) { # If the $word was tried but not found. $WORDS{ $word } = 0; } else { my $line = ; chomp( $line ); # If the $word was tried, 1 if found, 0 if not found. $WORDS{ $word } = lc( $line ) eq lc( $word ); } } return $WORDS{ $word }; } sub backderive { # The word to backderive, the derivation rules, # and the derivation so far. my ( $word, $rules, $path ) = @_; @$path = ( $word ) unless defined $path; if ( find_word( $word ) ) { print "@$path\n"; return; } my ( $i, $work ); for ( $i = 0; $i < @$rules; $i += 2 ) { my $src = $rules->[ $i ]; my $dst = $rules->[ $i+1 ]; $work = $word; if ( $dst =~ /\$/ ) { # Complex rule, one more /e. while ( $work =~ s/$src/$dst/eex ) { backderive( $work, $rules, [ @$path, $work ] ); } } else { # Simple rule. while ( $work =~ s/$src/$dst/ex ) { backderive( $work, $rules, [ @$path, $work ] ); } } } return; } # The rules have two parts: "before" and "after", in s/// terms. # Simple rules. my @RULES = split(/\s*,\s*/, <<'__RULES__', -1); ^bi , , ^de , , ^dis , , ^hyper , , ^mal , , ^mega , , ^mid , , ^re , , ^sub , , ^super , , ^tri , , ^un , , able$ , , al$ , , d$ , , ed$ , , est$ , , ful$ , , hood$ , , ian$ , , ic$ , , ing$ , , on$ , , ise$ , , ist$ , , ity$ , , ive$ , , ize$ , , less$ , , like$ , , ly$ , , ment$ , , ness$ , , s$ , , worthy$ , , iable$ , y, ian$ , y, ic$ , y, ial$ , y, iation$ , y, ier$ , y, iest$ , y, iful$ , y, ihood$ , y, iless$ , y, ily$ , y, iness$ , y, ist$ , y, able$ , e, ation$ , e, ing$ , e, ion$ , e, ise$ , e, ism$ , e, ist$ , e, ity$ , e, ize$ , e, ce$ , t, cy$ , t __RULES__ # Drop accidental trailing empty field. pop( @RULES ) if @RULES % 2 == 1; # Complex rules. my $C = '[bcdfghjklmnpqrstvwxz]'; push( @RULES, "($C)".'\1(?: ing|ed)$', '$1' ); # Cleanup rules from whitespace. foreach ( @RULES ) { s/^\s+//; s/\s+$//; } # Do the stem. while ( ) { chomp; backderive( $_, \@RULES ); }