use strict; use warnings; package Text::WordCounter; { $Text::WordCounter::VERSION = '0.001'; } use namespace::autoclean; use Moose; use Lingua::ZH::MMSEG; use Unicode::UCD qw(charinfo); use URI::Find; use Lingua::Stem; has stemming => (is => 'rw', isa => 'Int', default => 0); has stopwords => (is => 'ro', isa => 'HashRef', default => sub { {} }); sub is_stop_word { my( $self, $word, $script ) = @_; return 0 if( $script eq 'Han' ); return 1 if exists $self->stopwords->{lc $word}; return length($word) <= 3; } sub normalize { my ($self, $word) = @_; if ($self->stemming) { my $stemmed = Lingua::Stem::stem($word)->[0]; if ($stemmed ne '') { return $stemmed; } } return lc $word } my %char_cache = (); sub split_scripts { my ( $self, $text ) = @_; my @parts; while ( $text =~ /(\X)/g ) { my $part = $1; my $pos = pos( $text ); my $ord = ord $part; unless ($char_cache{$ord}) { if (scalar(keys(%char_cache)) > 5000) { # XXX: Some LRU cache would be more appropriate, but this cleaning # will probably happen very rarely or never, so there's (hopefully) no # need to bother about it too much undef %char_cache; } $char_cache{$ord} = charinfo($ord); } my $charinfo = $char_cache{$ord}; if( ! defined $charinfo ){ warn "$1 does not look like good UTF8 - no charinfo"; next; } my $script = $charinfo->{script}; if( ! defined $script ){ warn "$1 does not look like good UTF8 - no script"; next; } next if $script eq 'Common'; $text=~ /((\p{$script}|[-0-9:])*)/g; $part .= $1; push @parts, { text => $part, script => $script }; } # warn join ' | ', map { $_->{text} } @parts; return @parts; } sub word_count { my ( $self, $text, $features ) = @_; $features ||= {}; for my $part ( $self->split_scripts( $text ) ){ my @words = ( $part->{text} ); if( $part->{script} eq 'Han' ){ @words = mmseg( $part->{text} ); } for my $word ( @words ){ next if $self->is_stop_word( $word, $part->{script} ); $features->{ $self->normalize( $word ) }++; } } return $features; } __PACKAGE__->meta()->make_immutable(); 1; # ABSTRACT: counting words in multilingual texts __END__ =pod =head1 NAME Text::WordCounter - counting words in multilingual texts =head1 VERSION version 0.001 =head1 SYNOPSIS my $counter = Text::WordCounter->new(); my $word_count = $counter->word_count( $text ) =head1 DESCRIPTION It is quite heuristic, for example '-' and digits inside word characters are treated as a word character, see the tests to find out how all the special cases are resolved, The features parameter should be a hashref and is an accumulator for found features. =head1 ATTRIBUTES =head2 stemming If set stemming via Lingua::Stem is performed on the words. We never managed to make it sanely in multilingual texts. =head2 stopwords A hashref with words to discard. =head1 INSTANCE METHODS =head2 C =head2 C Lowercases words and stemms them if the C attribute is true. =head2 C =head2 C Returns a hashref with word counts. =head1 LIMITATIONS From languages that don't use spaces only Chinese is currently supported (using Lingua::ZH::MMSEG). =head1 SEE ALSO __END__ =head1 AUTHORS =over 4 =item * Zbigniew Lukasiak =item * Tadeusz SoĊ›nierz, tsosnierz@opera.com =back =head1 COPYRIGHT AND LICENSE This software is Copyright (c) 2012 by Opera Software ASA. This is free software, licensed under: The Artistic License 2.0 (GPL Compatible) =cut