#!/usr/bin/env perl
# ABSTRACT: uses MinHash & SpeedyFx to compare large text data
# PODNAME: minhash_cmp
use autodie;
use strict;
use utf8;
use warnings;

our $VERSION = '0.013'; # TRIAL VERSION

use Getopt::Long;
use List::MoreUtils qw(distinct);
use Pod::Usage;
use Text::SpeedyFx;

# Parse the command line; an unknown or malformed option prints the short
# usage and exits.
GetOptions(
    q(help)      => \my $help,
    q(epsilon=f) => \my $e,
    q(k=i)       => \my $k,
    q(seed=i)    => \my $seed,
    q(bits=i)    => \my $bits,
) or pod2usage(-verbose => 1);

# Exactly two file names are required; otherwise show the full manual.
pod2usage(-verbose => 2)
    if $help or @ARGV != 2;

# Defaults. A zero/absent --epsilon, --k or --bits falls back to the default.
$e = 0.05 unless $e;
unless ($k) {
    # k = round(1 / epsilon^2). A large epsilon would round down to zero
    # and later cause a division by zero, so keep at least one function.
    $k = 0 + sprintf(q(%0.0f), 1 / ($e ** 2));
    $k = 1 if $k < 1;
}
pod2usage(
    -message => q(--k must be a positive integer),
    -verbose => 1,
) if $k < 1;

# Zero is a legitimate seed, so only substitute the default when the
# option was not given at all.
$seed = 0x4c53_4820 unless defined $seed;
$bits = 8 unless $bits;

# Read the whole file at the given path and return its contents as a
# single string. Failures to open/close are fatal (autodie).
sub slurp_file {
    my ($path) = @_;

    # The :mmap PerlIO layer maps the file into memory instead of copying
    # it through a read buffer.
    open my $fh, q(<:mmap), $path;
    my $data = do { local $/ = undef; <$fh> };
    close $fh;

    return $data;
}

# Draw the per-function seeds up front, in the same order as before, so
# that a given --seed keeps producing the same set of hash functions.
srand $seed;
my @hash_seeds = map { int rand(2 ** 32) } 1 .. $k;

my @text = map { slurp_file($_) } @ARGV;

# Count the hash functions for which both texts yield the same hash_min()
# value; the fraction of such functions is reported as the similarity.
# Each hasher is built only when needed and released after its iteration,
# so at most one of them is alive at any time.
my $match = 0;
for my $hash_seed (@hash_seeds) {
    my $hash = Text::SpeedyFx->new($hash_seed, $bits);
    ++$match
        if 1 == distinct map { $hash->hash_min($_); } @text;
}

printf qq(k=%d; similarity=%0.5f\n), $k, $match / $k;

__END__

=pod

=encoding UTF-8

=head1 NAME

minhash_cmp - uses MinHash & SpeedyFx to compare large text data

=head1 VERSION

version 0.013

=head1 SYNOPSIS

    minhash_cmp [options] FILE1 FILE2

=head1 DESCRIPTION

MinHash (or the min-wise independent permutations locality sensitive hashing
scheme) is a technique for quickly estimating how similar two sets are.

=head1 OPTIONS

=over 4

=item --help

This.

=item --epsilon

Expected error value used to compute the number of different hash functions
(default: 0.05). At least one hash function is always used.

=item --k

Number of different hash functions to use (default: 400; overrides
C<--epsilon>). Must be a positive integer.

=item --seed

Custom seed (integer).

=item --bits

How many bits represent one character.
The default value, 8, sacrifices Unicode handling but is fast and low on
memory footprint.
The value of 18 encompasses I<Basic Multilingual>,
I<Supplementary Multilingual> and I<Supplementary Ideographic> planes.

=back

=head1 CAVEATS

Under C<--bits 18> setting, each initialized hash function consumes ~500KB.

=head1 SEE ALSO

=over 4

=item *

L<Text::SpeedyFx>

=item *

L<MinHash on Wikipedia|https://en.wikipedia.org/wiki/MinHash>

=back

=head1 AUTHOR

Stanislaw Pusep

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2021 by Stanislaw Pusep.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut