#!/usr/local/bin/perl -w ##### # # Trim the list of bloglines categories to remove semantic dupes # The list is here http://www.usabilityviews.com/bloglines/bloglines_folders.csv # # USAGE: # # % narrow # # TODO: # Use a thesaurus. Unfortunately this probably means needing an ISO format English thesaurus # # # Copyright, 2005 - Simon Wistow # Released under the same terms as Perl itself # ##### use strict; use Lingua::Stem; use Lingua::EN::Tagger; use Lingua::EN::FindNumber; use Lingua::EN::Inflect::Number qw(to_S); my %got; my $count = 1; my @stopwords = map { chomp; $_ } ; my %stopwords = map { $_ => 1 } @stopwords; my $stemmer = Lingua::Stem->new(-locale => 'EN'); $stemmer->stem_caching({ -level => 2 }); my $p = Lingua::EN::Tagger->new( stem => 1 ); $|++; # We don't need no steeenking buffering while (<>) { $count++; chomp; # trim s/\s+\d+\s*$//; # get rid of the number $_ = lc($_); # lower case s/[^\sa-z\d]//g; # normalise to numbers letters and spaces s/\s+/ /; # multiple spaces $_ = numify($_); # turn all words to numbers my $out = ""; my %word_list = $p->get_words( $_ ); foreach my $part (keys %word_list) { # check each word next if $stopwords{$part}; # eliminate stopwords my $word = to_S($part); # singularise it $word = $stemmer->stem($word)->[0]; # stem it ( note, are the last two steps redundant? ) $out .= $word; } print "Count: $count\r"; # print out how far we've got next if $out =~ /^\s*$/; $got{$out}++; } print "\n"; my @top = sort { $got{$b} <=> $got{$a} } keys %got; for (splice @top, 0, 10) { print $got{$_}." - '$_'\n"; } my $kept = scalar(keys %got); print "Kept: $kept (saved ".($count-$kept).")\n"; __DATA__ after also an and as at be because before between but before for however from if in into of or other out since such than that the these there this those to under upon when where whether which with within without