#!/usr/local/bin/perl -w
#####
#
#  Trim the list of bloglines categories to remove semantic dupes
#  The list is here http://www.usabilityviews.com/bloglines/bloglines_folders.csv
#
#  USAGE:
#
#          % narrow <filename>
#
#  TODO:
#        Use a thesaurus. Unfortunately this probably means needing an ISO format English thesaurus
# 
#
#  Copyright, 2005 - Simon Wistow <simon@thegestalt.org>
#  Released under the same terms as Perl itself
#
#####


use strict;
use Lingua::Stem;
use Lingua::EN::Tagger;
use Lingua::EN::FindNumber;
use Lingua::EN::Inflect::Number qw(to_S);

my %got;
my $count     = 1;
my @stopwords = map { chomp; $_ } <DATA>;
my %stopwords = map { $_ => 1 } @stopwords;

my $stemmer   = Lingua::Stem->new(-locale => 'EN'); $stemmer->stem_caching({ -level => 2 });
my $p         = Lingua::EN::Tagger->new( stem => 1 );

$|++; # We don't need no steeenking buffering

while (<>) {
    $count++;
    chomp;             # trim
    s/\s+\d+\s*$//;    # get rid of the number


    $_ = lc($_);       # lower case

    s/[^\sa-z\d]//g;   # normalise to numbers letters and spaces
    s/\s+/ /;          # multiple spaces
    $_ = numify($_);   # turn all words to numbers
    my $out = "";       
    my %word_list = $p->get_words( $_ );    

    foreach my $part (keys %word_list) {       # check each word
        next if $stopwords{$part};             # eliminate stopwords            
        my $word = to_S($part);                # singularise it
        $word = $stemmer->stem($word)->[0];    # stem it ( note, are the last two steps redundant? )
        $out .= $word;
    }
    print "Count: $count\r";                   # print out how far we've got
	next if $out =~ /^\s*$/;
    $got{$out}++;

}
print "\n";

my @top = sort { $got{$b} <=> $got{$a} } keys %got;

for (splice @top, 0, 10) {
	print $got{$_}." - '$_'\n";
}

my $kept = scalar(keys %got);
print "Kept: $kept (saved ".($count-$kept).")\n";
__DATA__
after
also    
an    
and
as    
at    
be
because
before
between    
but    
before
for
however
from    
if
in
into    
of
or
other    
out
since
such
than
that    
the
these
there
this
those
to
under
upon
when
where
whether
which
with
within
without