package Lingua::YALI;
# ABSTRACT: YALI - Yet Another Language Identifier.

use strict;
use warnings;

use Carp;

our $VERSION = '0.016'; # VERSION

# TODO: refactor - remove bzcat

# _capture_command(@command)
#
# Runs @command without going through a shell and returns everything it
# wrote to STDOUT as one string. Returns the empty string when the command
# cannot be started or prints nothing; this is a best-effort probe, so the
# command's exit status is deliberately ignored.
sub _capture_command
{
    my (@command) = @_;

    # List-form pipe open passes the arguments straight to exec, so file
    # names containing quotes or shell metacharacters are harmless.
    open(my $out, '-|', @command) or return '';

    my $text = do { local $/; <$out> };
    close($out);

    return defined $text ? $text : '';
}

# _open($f)
#
# Opens the file $f for reading in byte mode and returns the file handle.
# Input is piped through `zcat` when the name ends in ".gz" or file(1)
# reports gzip data, and through `bzcat` when the name ends in ".bz2" or
# file(1) reports bzip2 data. Any other file is opened directly.
#
# Croaks when $f does not exist or when the file/pipe cannot be opened.
sub _open
{
    my ($f) = @_;

    croak("Not found: $f") if !-e $f;

    # file might not recognize some files! (hence the extension checks below)
    # -b keeps the file name itself out of the description being matched.
    my $ft = _capture_command('file', '-b', '--', $f);

    my $decompressor;
    if ( $f =~ /\.gz$/ || $ft =~ /gzip compressed data/ ) {
        $decompressor = 'zcat';
    }
    elsif ( $f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/ ) {
        $decompressor = 'bzcat';
    }

    my $hdl;
    if ( defined $decompressor ) {
        # A pipe must be opened with the '-|' mode; with mode '<' the whole
        # "zcat FILE |" string would be looked up as a literal file name.
        my @command = ($decompressor, '--', $f);
        open($hdl, '-|', @command)
            or croak("Can't open '@command |': $!");
    }
    else {
        open($hdl, '<', $f) or croak("Can't open '$f': $!");
    }

    binmode $hdl, ":bytes";

    return $hdl;
}

# _identify_handle($identifier, $fh, $format, $languages, $each_line)
#
# Identifies the text readable from $fh and prints the outcome through
# _print_result. When $each_line is true, every line (with its trailing
# newline removed) is passed separately to $identifier->identify_string;
# otherwise the whole handle is passed to $identifier->identify_handle.
# $format and $languages are forwarded unchanged to _print_result.
sub _identify_handle
{
    my ($identifier, $fh, $format, $languages, $each_line) = @_;

    if ( $each_line ) {
        # A lexical line variable keeps the text safe from anything the
        # identifier does with the global $_.
        while ( my $line = <$fh> ) {
            chomp $line;
            _identify_string($identifier, $line, $format, $languages);
        }
    }
    else {
        my $result = $identifier->identify_handle($fh);
        _print_result($result, $format, $languages);
    }

    return;
}

# _identify($identifier, $file, $format, $languages, $each_line)
#
# Opens $file with _open (transparently handling gzip/bzip2 input) and
# processes it with _identify_handle. Croaks when the file cannot be opened.
sub _identify
{
    my ($identifier, $file, $format, $languages, $each_line) = @_;

    my $fh = Lingua::YALI::_open($file);
    _identify_handle($identifier, $fh, $format, $languages, $each_line);

    return;
}

# _identify_string($identifier, $string, $format, $languages)
#
# Passes $string to $identifier->identify_string and prints the outcome
# through _print_result.
sub _identify_string
{
    my ($identifier, $string, $format, $languages) = @_;

    my $result = $identifier->identify_string($string);
    _print_result($result, $format, $languages);

    return;
}

# _print_result($result, $format, $languages)
#
# Prints one line describing $result to the currently selected output handle.
# $result is a reference to an array of two-element array references; the
# first element is used as a label and the second as its value (presumably
# language code and score - confirm against the identifier classes).
#
# Supported values of $format:
#   single - label of the first entry (empty line when there are no entries)
#   all    - all labels, tab separated
#   all_p  - all "label:value" pairs, tab separated
#   tabbed - values for the labels listed in @$languages, in that order,
#            tab separated; a label missing from $result gives an empty cell
#
# Croaks on any other format.
sub _print_result
{
    my ($result, $format, $languages) = @_;

    my $line = "";

    if ( $format eq "single" ) {
        if ( scalar @{$result} > 0 ) {
            $line = $result->[0]->[0];
        }
    }
    elsif ( $format eq "all" ) {
        $line = join("\t", map { $_->[0] } @{$result});
    }
    elsif ( $format eq "all_p" ) {
        $line = join("\t", map { $_->[0] . ":" . $_->[1] } @{$result});
    }
    elsif ( $format eq "tabbed" ) {
        my %value_of;
        for my $entry ( @{$result} ) {
            $value_of{ $entry->[0] } = $entry->[1];
        }

        # Explicit empty cell for absent labels: same output as before,
        # but without "uninitialized value" warnings.
        $line = join(
            "\t",
            map { defined $value_of{$_} ? $value_of{$_} : '' } @{$languages}
        );
    }
    else {
        croak("Unsupported format $format");
    }

    print $line . "\n";

    return;
}

1;

__END__

=pod

=encoding UTF-8

=head1 NAME

Lingua::YALI - YALI - Yet Another Language Identifier.

=head1 VERSION

version 0.016

=head1 SYNOPSIS

The YALI package is a collection of modules and tools for language
identification. It was developed at the
L<Institute of Formal and Applied Linguistics|http://ufal.mff.cuni.cz/>
at Charles University in Prague.

More information can be found at the
L<YALI homepage|http://ufal.mff.cuni.cz/~majlis/yali/>.

=head2 Modules

=over

=item *

L<Lingua::YALI::Examples> - contains examples.

=item *

L<Lingua::YALI::LanguageIdentifier> - is a language identification module
capable of identifying 122 languages.

=item *

L<Lingua::YALI::Builder> - is a module used to train custom language models.

=item *

L<Lingua::YALI::Identifier> - allows you to use your own models for
identification.

=back

=head2 Tools

=over

=item *

L<yali-language-identifier> - tool for language identification with
pretrained models.

=item *

L<yali-builder> - tool for building custom language models.

=item *

L<yali-identifier> - tool for language identification with custom language
models.

=back

=head1 WHY TO USE YALI

=over

=item *

Contains pretrained models for identifying 122 languages.

=item *

Allows you to create your own models, trained on texts from a specific
domain, which outperform the pretrained ones.

=item *

It is based on a published paper; see the
L<YALI homepage|http://ufal.mff.cuni.cz/~majlis/yali/>.

=back

=head1 COMPARISON WITH OTHERS

=over

=item *

L<Lingua::Lid> can recognize 45 languages and returns only the most probable
result without any weight.

=item *

L<Lingua::Ident> requires training files, so it is similar to
L<Lingua::YALI::Builder>, but it does not provide any options for
constructing models.

=item *

L<Lingua::Identify> can recognize 33 languages, but it does not allow you to
use different models.

=back

=head1 AUTHOR

Martin Majlis

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2012 by Martin Majlis.

This is free software, licensed under:

  The (three-clause) BSD License

=head1 AUTHOR

Martin Majlis

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2012 by Martin Majlis.

This is free software, licensed under:

  The (three-clause) BSD License

=cut