package ElasticSearch;

use strict;
use warnings FATAL => 'all';
use ElasticSearch::Transport();
use ElasticSearch::Error();
use ElasticSearch::RequestParser;
use ElasticSearch::Util qw(throw parse_params);

our $VERSION = '0.68';
our $DEBUG   = 0;

# Constructor.  Accepts the argument forms understood by parse_params()
# (imported from ElasticSearch::Util).  The resulting params hashref is
# handed to ElasticSearch::Transport->new(); afterwards every key still
# present in it is invoked as a same-named method on the new object with the
# corresponding value (eg builder_class, use_index, use_type).
# NOTE(review): presumably the transport constructor removes the keys it
# consumes from $params - confirm in ElasticSearch::Transport.
#===================================
sub new {
#===================================
    my ( $proto, $params ) = parse_params(@_);
    my $self = {
        _base_qs       => {},
        _default       => {},
        _builder_class => 'ElasticSearch::SearchBuilder'
    };

    # Works both as a class method and as an instance method (clone-style).
    bless $self, ref $proto || $proto;
    $self->{_transport} = ElasticSearch::Transport->new($params);
    $self->$_( $params->{$_} ) for keys %$params;
    return $self;
}

# Get/set the name of the class used to build queries.  Setting a new class
# discards any cached builder instance so that the next call to builder()
# instantiates the new class.
#===================================
sub builder_class {
#===================================
    my $self = shift;
    if (@_) {
        $self->{_builder_class} = shift;
        delete $self->{_builder};
    }
    return $self->{_builder_class};
}

# Returns the cached builder object, loading and instantiating
# builder_class on first use.  Any arguments are passed to the builder's
# constructor, but only on the call that actually creates it.
# Throws 'Param' if no builder_class is set and 'Internal' if the class
# cannot be loaded.
#===================================
sub builder {
#===================================
    my $self = shift;
    unless ( $self->{_builder} ) {
        my $class = $self->{_builder_class}
            or $self->throw( 'Param', "No builder_class specified" );

        # Require the module by file path inside a block eval rather than
        # interpolating the class name into a string eval: builder_class is
        # caller-supplied, and a string eval would execute anything
        # embedded in it.
        ( my $file = "$class.pm" ) =~ s{::}{/}g;
        eval { require $file; 1 }
            or $self->throw( 'Internal',
            "Couldn't load class $class: " . ( $@ || 'Unknown error' ) );

        $self->{_builder} = $class->new(@_);
    }
    return $self->{_builder};
}

# Passes the parsed params straight through to the transport's request()
# method and returns whatever it returns.
#===================================
sub request {
#===================================
    my ( $self, $params ) = parse_params(@_);
    return $self->transport->request($params);
}

# Get/set the default index, stored in $self->{_default}{index}.
#===================================
sub use_index {
#===================================
    my $self = shift;
    if (@_) {
        $self->{_default}{index} = shift;
    }
    return $self->{_default}{index};
}

# Get/set the default type, stored in $self->{_default}{type}.
#===================================
sub use_type {
#===================================
    my $self = shift;
    if (@_) {
        $self->{_default}{type} = shift;
    }
    return $self->{_default}{type};
}

# Pulls docs from a source iterator and writes them back in batches via
# bulk_index().
#
# Params:
#   source       - required; object providing total() and the iterator
#                  method named by _method_name
#   transform    - optional coderef called with each doc; a false return
#                  value skips that doc (default: pass the doc through)
#   quiet        - if true, suppress progress output
#   dest_index   - if true, overrides each doc's _index
#   bulk_size    - docs per bulk_index() call (default 1000)
#   on_conflict,
#   on_error     - passed through to bulk_index()
#   _method_name - iterator method to call on source (default 'next')
#
# Throws 'Param' if source is missing, and 'Request' (listing at most the
# first five errors) if bulk_index() reports errors.
#===================================
sub reindex {
#===================================
    my ( $self, $params ) = parse_params(@_);

    my $source = $params->{source}
        or $self->throw( 'Param', 'Missing source param' );

    my $transform  = $params->{transform} || sub { shift() };
    my $verbose    = !$params->{quiet};
    my $dest_index = $params->{dest_index};
    my $bulk_size  = $params->{bulk_size} || 1000;
    my $method     = $params->{_method_name} || 'next';

    # Unbuffer STDOUT while verbose so progress dots appear immediately.
    local $| = $verbose;
    printf( "Reindexing %d docs\n", $source->total )
        if $verbose;

    my @docs;
    while (1) {
        my $doc = $source->$method();

        # Flush when the source is exhausted or the batch is full.  The
        # doc just fetched (if any) is added to the next batch below.
        if ( !$doc or @docs == $bulk_size ) {
            my $results = $self->bulk_index(
                docs => \@docs,
                map { $_ => $params->{$_} } qw(on_conflict on_error),
            );

            # A non-hash result that is an AnyEvent::CondVar is resolved
            # by blocking on it.
            $results = $results->recv
                if ref $results ne 'HASH'
                    && $results->isa('AnyEvent::CondVar');

            if ( my $err = $results->{errors} ) {
                my @errors = splice @$err, 0, 5;
                push @errors, sprintf "...and %d more", scalar @$err
                    if @$err;
                $self->throw( 'Request',
                    "Errors occurred while reindexing:", \@errors );
            }
            @docs = ();
            print "." if $verbose;
        }
        last unless $doc;

        $doc = $transform->($doc) or next;

        # Docs carrying a _version are indexed with external versioning.
        $doc->{version_type} = 'external'
            if defined $doc->{_version};

        # The fields entry is removed; its _parent value (if defined) is
        # carried over as the doc's parent.
        if ( my $fields = delete $doc->{fields} ) {
            $doc->{parent} = $fields->{_parent}
                if defined $fields->{_parent};
        }
        $doc->{_index} = $dest_index
            if $dest_index;
        push @docs, $doc;
    }

    print "\nDone\n" if $verbose;
}

# transport() returns the transport object created in new(); the other
# three accessors delegate to the same-named methods on that object.
#===================================
sub transport       { shift()->{_transport} }
sub trace_calls     { shift->transport->trace_calls(@_) }
sub timeout         { shift->transport->timeout(@_) }
sub refresh_servers { shift->transport->refresh_servers(@_) }
#===================================

# Loads ElasticSearch::QueryParser on demand and returns a new instance,
# constructed with the remaining arguments.  Callable as a class or an
# instance method.
#===================================
sub query_parser {
#===================================
    require ElasticSearch::QueryParser;
    shift;    # drop class/$self
    ElasticSearch::QueryParser->new(@_);
}

=head1 NAME

ElasticSearch - DEPRECATED: An API for communicating with ElasticSearch

=head1 VERSION

Version 0.68, tested against ElasticSearch server version 0.90.0.

=head1 DEPRECATION

This module is being deprecated in favour of the new official client
L<Search::Elasticsearch> and will be removed from CPAN in 2015.

=head1 DESCRIPTION

ElasticSearch is an Open Source (Apache 2 license), distributed, RESTful
Search Engine based on Lucene, and built for the cloud, with a JSON API.

Check out its features: L<http://www.elasticsearch.org/>

This module is a thin API which makes it easy to communicate with an
ElasticSearch cluster.

It maintains a list of all servers/nodes in the ElasticSearch cluster, and
spreads the load across these nodes in round-robin fashion.
If the current active node disappears, then it attempts to connect to another
node in the list.

Forking a process triggers a server list refresh, and a new connection to
a randomly chosen node in the list.

=cut

=head1 SYNOPSIS

    use ElasticSearch;
    my $es = ElasticSearch->new(
        servers      => 'search.foo.com:9200',  # default '127.0.0.1:9200'
        transport    => 'http'                  # default 'http'
                        | 'httplite'
                        | 'httptiny'
                        | 'curl'
                        | 'aehttp'
                        | 'aecurl'
                        | 'thrift',
        max_requests => 10_000,                 # default 10_000
        trace_calls  => 'log_file',
        no_refresh   => 0 | 1,
    );

    $es->index(
        index => 'twitter',
        type  => 'tweet',
        id    => 1,
        data  => {
            user        => 'kimchy',
            post_date   => '2009-11-15T14:12:12',
            message     => 'trying out Elastic Search'
        }
    );

    $data = $es->get(
        index => 'twitter',
        type  => 'tweet',
        id    => 1
    );

    # native elasticsearch query language
    $results = $es->search(
        index => 'twitter',
        type  => 'tweet',
        query => {
            text => { user => 'kimchy' }
        }
    );

    # ElasticSearch::SearchBuilder Perlish query language
    $results = $es->search(
        index  => 'twitter',
        type   => 'tweet',
        queryb => {
            message   => 'Perl API',
            user      => 'kimchy',
            post_date => {
                '>'   => '2010-01-01',
                '<='  => '2011-01-01',
            }
        }
    );

    $dodgy_qs = "foo AND AND bar";
    $results = $es->search(
        index => 'twitter',
        type  => 'tweet',
        query => {
            query_string => {
                query => $es->query_parser->filter($dodgy_qs)
            },
        }
    );

See the C<examples/> directory for a simple working example.

=cut

=head1 GETTING ElasticSearch

You can download the latest released version of ElasticSearch from
L<http://www.elasticsearch.org/download/>.

See here for setup instructions:
L<http://www.elasticsearch.org/tutorials/2010/07/01/setting-up-elasticsearch.html>

=cut

=head1 CALLING CONVENTIONS

I've tried to follow the same terminology as used in the ElasticSearch docs
when naming methods, so it should be easy to tie the two together.

Some methods require a specific C<index> and a specific C<type>, while others
allow a list of indices or types, or allow you to specify all indices or
types. I distinguish between them as follows:

   $es->method( index => multi, type => single, ...)


C<single> values must be a scalar, and are required parameters

      type  => 'tweet'

C<multi> values can be:

      index   => 'twitter'          # specific index
      index   => ['twitter','user'] # list of indices
      index   => undef              # (or not specified) = all indices

C<multi_req> values work like C<multi> values, but at least one value is
required, so:

      index   => 'twitter'          # specific index
      index   => ['twitter','user'] # list of indices
      index   => '_all'             # all indices

      index   => []                 # error
      index   => undef              # error

Also, see L</"use_index()/use_type()">.

=head2 as_json

If you pass C<< as_json => 1 >> to any request to the ElasticSearch server,
it will return the raw UTF8-decoded JSON response, rather than a Perl
datastructure.

=cut

=head1 RETURN VALUES AND EXCEPTIONS

Methods that query the ElasticSearch cluster return the raw data structure
that the cluster returns.  This may change in the future, but as these
data structures are still in flux, I thought it safer not to try to interpret.

Anything that is known to be an error throws an exception, eg trying to delete
a non-existent index.

=cut

=head1 INTEGRATION WITH ElasticSearch::SearchBuilder

L<ElasticSearch::SearchBuilder> provides a concise Perlish
L<SQL::Abstract>-style query language, which gets translated into the native
Query DSL that ElasticSearch uses.

For instance:

    {
        content => 'search keywords',
        -filter => {
            tags        => ['perl','ruby'],
            date        => {
                '>'     => '2010-01-01',
                '<='    => '2011-01-01'
            },
        }
    }

Would be translated to:

    { query => {
        filtered => {
            query  => { text => { content => "search keywords" } },
            filter => {
                and => [
                    { terms => { tags => ["perl", "ruby"] } },
                    { numeric_range => {
                        date => {
                            gt  => "2010-01-01",
                            lte => "2011-01-01"
                    }}},
                ],
            }
    }}}

All you have to do to start using L<ElasticSearch::SearchBuilder> is to change
your C<query> or C<filter> parameter to C<queryb> or C<filterb> (where the
extra C<b> stands for C<builder>):

    $es->search(
        queryb => { content => 'keywords' }
    )

If you want to see what your SearchBuilder-style query is being converted into,
you can either use L</"trace_calls()"> or access it directly with:

    $native_query  = $es->builder->query( $query )
    $native_filter = $es->builder->filter( $filter )

See the L<ElasticSearch::SearchBuilder> docs for more information about
the syntax.

=head1 METHODS

=head2 Creating a new ElasticSearch instance

=head3 new()

    $es = ElasticSearch->new(
            transport    =>  'http',
            servers      =>  '127.0.0.1:9200'              # single server
                              | ['es1.foo.com:9200',
                                 'es2.foo.com:9200'],      # multiple servers
            trace_calls  => 1 | '/path/to/log/file' | $fh,
            timeout      => 30,
            max_requests => 10_000,                        # refresh server list
                                                           # after max_requests

            no_refresh   => 0 | 1                          # don't retrieve the live
                                                           # server list. Instead, use
                                                           # just the servers specified
     );

C<servers> can be either a single server or an ARRAY ref with a list of servers.
If not specified, then it defaults to C<localhost> and the port for the
specified transport (eg C<9200> for C<http> or C<9500> for C<thrift>).

These servers are used in a round-robin fashion. If any server fails to
connect, then the other servers in the list are tried, and if any
succeeds, then a list of all servers/nodes currently known to the
ElasticSearch cluster are retrieved and stored.

Every C<max_requests> (default 10,000) this list of known nodes is refreshed
automatically.  To disable this automatic refresh, you can set C<max_requests>
to C<0>.

To force a lookup of live nodes, you can do: $es->refresh_servers(); =head4 no_refresh() Regardless of the C setting, a list of live nodes will still be retrieved on the first request. This may not be desirable behaviour if, for instance, you are connecting to remote servers which use internal IP addresses, or which don't allow remote C requests. If you want to disable this behaviour completely, set C to C<1>, in which case the transport module will round robin through the C list only. Failed nodes will be removed from the list (but added back in every C or when all nodes have failed). =head4 Transport Backends There are various C backends that ElasticSearch can use: C (the default, based on LWP), C (based on L), C (based on L), C (based on L), C (based on L), C (based on L) and C (which uses the Thrift protocol). Although the C interface has the right buzzwords (binary, compact, sockets), the generated Perl code is very slow. Until that is improved, I recommend one of the C backends instead. The C backend is about 30% faster than the default C backend, and will probably become the default after more testing in production. The C backend is 1% faster again than C. See also: L, L, L, L and L =cut =head2 Document-indexing methods =head3 index() $result = $es->index( index => single, type => single, id => $document_id, # optional, otherwise auto-generated data => { key => value, ... }, # optional consistency => 'quorum' | 'one' | 'all', create => 0 | 1, parent => $parent, percolate => $percolate, refresh => 0 | 1, replication => 'sync' | 'async', routing => $routing, timeout => eg '1m' or '10s' version => int, version_type => 'internal' | 'external', ); eg: $result = $es->index( index => 'twitter', type => 'tweet', id => 1, data => { user => 'kimchy', post_date => '2009-11-15T14:12:12', message => 'trying out Elastic Search' }, ); Used to add a document to a specific C as a specific C with a specific C. 
If the C combination already exists, then that document is updated, otherwise it is created. Note: =over =item * If the C is not specified, then ElasticSearch autogenerates a unique ID and a new document is always created. =item * If C is passed, and the current version in ElasticSearch is different, then a C error will be thrown. =item * C can also be a raw JSON encoded string (but ensure that it is correctly encoded, otherwise you see errors when trying to retrieve it from ElasticSearch). $es->index( index => 'foo', type => 'bar', id => 1, data => '{"foo":"bar"}' ); =item * C for all CRUD methods and L is a query timeout, specifying the amount of time ElasticSearch will spend (roughly) processing a query. Units can be concatenated with the integer value, e.g., C<500ms> or C<1s>. See also: L Note: this is distinct from the transport timeout, see L. =back See also: L, L and L =head3 set() C is a synonym for L =head3 create() $result = $es->create( index => single, type => single, id => $document_id, # optional, otherwise auto-generated data => { key => value, ... }, # optional consistency => 'quorum' | 'one' | 'all', parent => $parent, percolate => $percolate, refresh => 0 | 1, replication => 'sync' | 'async', routing => $routing, timeout => eg '1m' or '10s', version => int, version_type => 'internal' | 'external', ); eg: $result = $es->create( index => 'twitter', type => 'tweet', id => 1, data => { user => 'kimchy', post_date => '2009-11-15T14:12:12', message => 'trying out Elastic Search' }, ); Used to add a NEW document to a specific C as a specific C with a specific C. If the C combination already exists, then a C error is thrown. If the C is not specified, then ElasticSearch autogenerates a unique ID. If you pass a C parameter to C, then it must be C<0> unless you also set C to C. 
See also: L =head3 update() $result = $es->update( index => single, type => single, id => single, # required script => $script, | doc => $doc # optional params => { params }, upsert => { new_doc }, consistency => 'quorum' | 'one' | 'all', fields => ['_source'], ignore_missing => 0 | 1, parent => $parent, percolate => $percolate, retry_on_conflict => 2, routing => $routing, timeout => '10s', replication => 'sync' | 'async' ) The C method accepts a C