package Treex::Core::Scenario; $Treex::Core::Scenario::VERSION = '2.20210102'; use Moose; use Treex::Core::Common; use File::Basename; use File::Slurp; use File::chdir; use Digest::MD5 qw(md5_hex); #use Parse::RecDescent 1.967003; now using standalone version has from_file => ( is => 'ro', isa => 'Str', predicate => '_has_from_file', documentation => q(Path to file with scenario), ); has from_string => ( is => 'ro', isa => 'Str', predicate => '_has_from_string', documentation => q(String with scenario), ); has scenario_string => ( is => 'ro', isa => 'Str', builder => '_build_scenario_string', lazy => 1, ); has block_items => ( is => 'ro', isa => 'ArrayRef[HashRef]', builder => 'parse_scenario_string', init_arg => undef, lazy => 1, ); has loaded_blocks => ( is => 'ro', isa => 'ArrayRef[Treex::Core::Block]', builder => '_build_loaded_blocks', predicate => 'is_initialized', lazy => 1, init_arg => undef, ); has document_reader => ( is => 'rw', does => 'Treex::Core::DocumentReader', predicate => '_has_document_reader', writer => '_set_document_reader', init_arg => undef, documentation => 'DocumentReader starts every scenario and reads a stream of documents.' ); has writers => ( is => 'rw', does => 'ArrayRef[Treex::Block::Write::BaseWriter]', default => sub { [] } ); has _global_params => ( is => 'ro', isa => 'HashRef[Str]', traits => ['Hash'], default => sub { {} }, handles => { get_global_param => 'get', set_global_param => 'set', #get_global_param_names => 'keys', #set_verbose => [ set => 'verbose' ], #get_verbose => [ get => 'verbose' ], #set_language => [ set => 'language' ], #get_language => [ get => 'language' ], #... ? }, ); has parser => ( is => 'ro', isa => 'Parse::RecDescent::_Runtime', init_arg => undef, builder => '_build_parser', documentation => q{Parses treex scenarios} ); has runner => ( is => 'ro', isa => 'Treex::Core::Run', writer => '_set_runner', weak_ref => 1, documentation => 'Treex::Core::Run instance in which the scenario is running', ); has cache => ( is => 'rw', isa => 'Maybe[Cache::Memcached]', builder => '_build_cache', ); sub _build_scenario_string { my $self = shift; if ( $self->_has_from_file ) { return $self->_load_scenario_file( $self->from_file ); } elsif ( $self->_has_from_string ) { return $self->from_string; } log_fatal("You have to provide from_file or from_string attribute"); } my %sequence = (); sub _build_loaded_blocks { my $self = shift; my @block_items = @{ $self->block_items }; my $block_count = scalar @block_items; my $i = 0; my @loaded_blocks; my $sequence_from = 0; my $sequence_hash = ""; foreach my $block_item (@block_items) { $i++; my $params = ''; if ( $block_item->{block_parameters} ) { $params = join ' ', @{ $block_item->{block_parameters} }; } log_info("Loading block $block_item->{block_name} $params ($i/$block_count)"); my $new_block = $self->_load_block($block_item); if ( $new_block->does('Treex::Core::DocumentReader') ) { log_fatal("Only one DocumentReader per scenario is permitted ($block_item->{block_name})") if $self->_has_document_reader; $self->_set_document_reader($new_block); } elsif ( $new_block->isa('Treex::Block::Write::BaseWriter') ) { push( @{ $self->writers }, $new_block ); push @loaded_blocks, $new_block; # duplicity } else { if ( ref($new_block) eq "Treex::Core::CacheBlock" ) { $sequence{$sequence_from}{from} = $sequence_from; $sequence{$sequence_from}{to} = $i; $sequence{$sequence_from}{hash} = $sequence_hash; $sequence{$i}{_from} = $sequence_from; $sequence_from = $i; push( @{ $sequence{$sequence_from}{block} }, $new_block->get_hash() ); $sequence_hash = $new_block->get_hash(); } elsif ($self->cache) { $sequence_hash = md5_hex( $sequence_hash . $new_block->get_hash() ); if ( defined( $sequence{$sequence_from} ) ) { push( @{ $sequence{$sequence_from}{block} }, $new_block->get_hash() ); } } push @loaded_blocks, $new_block; } } log_info('ALL BLOCKS SUCCESSFULLY LOADED.'); return \@loaded_blocks; } sub _load_parser { my $self = shift; require Treex::Core::ScenarioParser; return Treex::Core::ScenarioParser->new(); } sub _my_dir { return dirname( (caller)[1] ); } sub _build_parser { my $self = shift; my $parser; eval { $parser = $self->_load_parser(); 1; } and return $parser; log_info("Cannot find precompiled scenario parser, trying to build it from grammar"); my $dir = $self->_my_dir(); #get module's directory my $file = "$dir/ScenarioParser.rdg"; #find grammar file log_fatal("Cannot find grammar file") if !-e $file; #in fact we should never reach this log_warn('We should NOT reach this place. Treex distribution may be corrupted.'); my $grammar = read_file($file); #load it eval { log_info("Trying to precompile it for you"); require Parse::RecDescent; local $CWD = $dir; Parse::RecDescent->Precompile( { -standalone => 1 }, $grammar, 'Treex::Core::ScenarioParser' ); $parser = $self->_load_parser(); 1; } or eval { log_info("Cannot precompile, loading directly from grammar. Consider precompiling it manually"); require Parse::RecDescent; $parser = Parse::RecDescent->new($grammar); #create parser 1; } or log_fatal("Cannot create Scenario parser"); return $parser; } sub _build_cache { my $self = shift; if ( $self->runner && $self->runner->cache ) { require Treex::Core::CacheBlock; require Treex::Tool::Memcached::Memcached; return Treex::Tool::Memcached::Memcached::get_connection( "documents-cache" ); } return; } sub _load_scenario_file { my ( $self, $scenario_filename ) = @_; log_info "Loading scenario description $scenario_filename"; my $scenario_string = read_file( $scenario_filename, binmode => ':utf8', err_mode => 'quiet' ) or log_fatal "Can't open scenario file $scenario_filename"; return $scenario_string; } sub parse_scenario_string { my $self = shift; my $scenario_string = $self->scenario_string; my $from_file = $self->from_file; my $parsed = $self->parser->startrule( $scenario_string, 1, $from_file ); log_fatal("Cannot parse the scenario: $scenario_string") if !defined $parsed; return $parsed; } # reverse of parse_scenario_string, used in Treex::Core::Run for treex --dump sub construct_scenario_string { my $self = shift; my %args = @_; my $multiline = $args{multiline}; my @block_items = @{ $self->block_items }; my $delim = $multiline ? qq{\n} : q{ }; my @block_strings; foreach my $block_item (@block_items) { my $name = $block_item->{block_name}; my @parameters = map { _add_quotes($_) } @{ $block_item->{block_parameters} }; $name =~ s{^Treex::Block::}{} or $name = "::$name"; #strip leading Treex::Block:: or add leading :: my $params; if ( scalar @parameters ) { $params = q{ } . join q{ }, @parameters; } else { $params = q{}; } push @block_strings, $name . $params; } return join $delim, @block_strings; } sub get_required_files { my $self = shift; my @block_items = @{ $self->block_items }; my @required_files; foreach my $block_item (@block_items) { my $block = $self->_load_block($block_item); push @required_files, map { $block_item->{block_name} . "\t" . $_; } $block->get_required_share_files(); } return @required_files; } sub _add_quotes { # adding quotes only if param. value contains a space my ($block_parameter) = @_; my ( $name, $value ) = split /=/, $block_parameter, 2; if ( $value =~ /\s/ ) { my $res_string = "$name="; if ( $value =~ /'/ && $value !~ /"/ ) { $res_string .= '"' . $value . '"'; } else { $value =~ s/'/\\'/g; $res_string .= "'" . $value . "'"; } return $res_string; } return $block_parameter; } sub load_blocks { my $self = shift; $self->loaded_blocks; #just access lazy attribute return; } sub init { my $self = shift; $self->load_blocks(); return; } sub _load_block { my ( $self, $block_item ) = @_; my $block_name = $block_item->{block_name}; my $new_block; # Initialize with global (scenario) parameters my %params = ( %{ $self->_global_params }, scenario => $self ); # which can be overriden by (local) block parameters. foreach my $param ( @{ $block_item->{block_parameters} } ) { my ( $name, $value ) = split /=/, $param, 2; $params{$name} = $value; } eval "use $block_name; 1;" or log_fatal "Can't use block $block_name !\n$@\n"; eval { $new_block = $block_name->new( \%params ); 1; } or log_fatal "Treex::Core::Scenario->new: error when initializing block $block_name\n\nEVAL ERROR:\t$@"; if ( $self->cache && $params{'use_cache'} ) { $new_block = Treex::Core::CacheBlock->new( { block => $new_block, cache => $self->cache } ); } return $new_block; } sub run { my ($self) = @_; my $number_of_blocks = @{ $self->loaded_blocks }; log_fatal('No DocumentReader supplied') if !$self->_has_document_reader; my $reader = $self->document_reader; my $number_of_documents = $reader->number_of_documents_per_this_job() || '?'; my $document_number = 0; #if ( $self->cache ) { # $document_number = $self->_run_with_cache( $reader, $number_of_blocks, $number_of_documents ); #} #else { $document_number = $self->_run_without_cache( $reader, $number_of_blocks, $number_of_documents ); #} log_info "Processed $document_number document" . ( $document_number == 1 ? '' : 's' ); return 1; } sub _run_with_cache { my ( $self, $reader, $number_of_blocks, $number_of_documents ) = @_; my $document_number = 0; while ( my $document = $reader->next_document_for_this_job() ) { $document_number++; my $doc_name = $document->full_filename; my $doc_from = $document->loaded_from; log_info "Document $document_number/$number_of_documents $doc_name loaded from $doc_from"; my $block_number = 0; my $skip_to = 0; my $process = 0; my $skip_from = 0; my $from_hash = ""; my $document_last_hash = ""; foreach my $block ( @{ $self->loaded_blocks } ) { $block_number++; $process = 1; if ( $block_number < $skip_to ) { # we know that there are identical, so we can skip them log_info "Skipping block $block_number/$number_of_blocks " . ref($block); $process = 0; } elsif ( $block_number == $skip_to ) { # this is border Cache block -> we have to check whether next sequence is also same $skip_from = $block_number + 1; # following sequence is same => we can continue with skipping if ($sequence{$skip_from}{'to'} && $self->_is_known_sequence( $sequence{$skip_from}{'hash'}, $document->get_hash() ) ) { #log_warn("\tskip from " . $sequence{$skip_from}{from} . ' to ' . $sequence{$skip_from}{to}); $skip_to = $sequence{$skip_from}{to} - 1; $from_hash = $document->get_hash(); $process = 0; } else { $document_last_hash = $document->get_hash(); #$document->set_hash(md5_hex($document->get_hash() . $block->get_hash())); my $full_hash = $document->get_hash(); $document = $self->cache->get($full_hash); if ( !$document ) { log_fatal("Document - $full_hash is missing!!!"); } $process = 2; } } if ( $process == 1 ) { log_info "Applying block $block_number/$number_of_blocks " . ref($block); $block->process_start if !$block->is_started; $block->_set_is_started(1); #log_info("Document-hash: " . $document->get_hash()); $skip_from = $block_number + 1; my $status = $block->process_document($document); if (defined($status) && $status == $Treex::Core::Block::DOCUMENT_FROM_CACHE && $sequence{$skip_from}{'to'} && $self->_is_known_sequence( $sequence{$skip_from}{'hash'}, $document->get_hash() ) ) { #log_warn("\tskip from " . $sequence{$skip_from}{from} . ' to ' . $sequence{$skip_from}{to}); $skip_to = $sequence{$skip_from}{to} - 1; $skip_from = $block_number + 1; $from_hash = $document->get_hash(); } } $document_last_hash = $document->get_hash(); $document->set_hash( md5_hex( $document->get_hash() . $block->get_hash() ) ); if ( ref($block) eq "Treex::Core::CacheBlock" ) { # cache block => mark this path as known my $id = $block_number + 1; my $from = $sequence{$id}{'_from'}; # the first sequence has no document if ( defined( $sequence{$from}{'document'} ) ) { $self->_set_known_sequence( $sequence{$from}{'hash'}, $sequence{$from}{'document'} ); } $sequence{$id}{'document'} = $document_last_hash; } } # this actually marks the document as successfully done in parallel processing (if this line # does not appear in the output, the parallel process will fail -- it must appear at any errorlevel, # therefore not using log_info or similiar) if ( $self->document_reader->jobindex ) { print STDERR "Document $document_number/$number_of_documents $doc_name: [success].\n"; } } log_info "Applying process_end"; foreach my $block ( @{ $self->loaded_blocks } ) { $block->process_end() if ( $block->is_started ); } return $document_number; } sub _is_known_sequence { my ( $self, $sequence_hash, $document_hash ) = @_; my $hash = md5_hex( $sequence_hash, $document_hash ); return $self->cache->get($hash); } sub _set_known_sequence { my ( $self, $sequence_hash, $document_hash ) = @_; my $hash = md5_hex( $sequence_hash, $document_hash ); $self->cache->set( $hash, 1 ); return; } sub _run_without_cache { my ( $self, $reader, $number_of_blocks, $number_of_documents ) = @_; my $document_number = 0; $self->start(); while ( my $document = $reader->next_document_for_this_job() ) { $document_number++; my $doc_name = $document->full_filename; my $doc_from = $document->loaded_from; log_info "Document $document_number/$number_of_documents $doc_name loaded from $doc_from"; my $block_number = 0; foreach my $block ( @{ $self->loaded_blocks } ) { $block_number++; log_info "Applying block $block_number/$number_of_blocks " . ref($block); $block->process_document($document); } # this actually marks the document as successfully done in parallel processing (if this line # does not appear in the output, the parallel process will fail -- it must appear at any errorlevel, # therefore not using log_info or similiar) if ( $self->document_reader->jobindex ) { print STDERR "Document $document_number/$number_of_documents $doc_name: [success].\n"; } } $self->end(); log_info "Processed $document_number document" . ( $document_number == 1 ? '' : 's' ); return $document_number; } # Apply process_start to all blocks for which this has not yet been applied sub start { my ($self) = @_; log_info "Applying process_start"; foreach my $block ( @{ $self->loaded_blocks } ) { $block->process_start() if !$block->is_started; $block->_set_is_started(1); } return; } # Apply the scenario to documents given in parameter sub apply_to_documents { my ( $self, @documents ) = @_; my $number_of_blocks = @{ $self->loaded_blocks }; my $block_number = 0; foreach my $document (@documents){ log_info "Processing document" . $document->full_filename; foreach my $block ( @{ $self->loaded_blocks } ) { $block_number++; log_info "Applying block $block_number/$number_of_blocks " . ref($block); $block->process_document($document); } } return; } # Apply process_end to all blocks for which this has not yet been applied sub end { my ($self) = @_; log_info "Applying process_end"; foreach my $block ( @{ $self->loaded_blocks } ) { $block->process_end() if ( $block->is_started ); } return; } use Module::Reload; sub restart { my ($self) = @_; my $changed_modules = Module::Reload->check; log_info "Number of reloaded modules = $changed_modules"; log_info "reseting the document reader\n"; $self->document_reader->restart(); # TODO rebuild the reloaded blocks return; } 1; __END__ =for Pod::Coverage BUILD =encoding utf-8 =head1 NAME Treex::Core::Scenario - a larger Treex processing unit, composed of blocks =head1 VERSION version 2.20210102 =head1 SYNOPSIS use Treex::Core; my $doc1, $doc2; my $scenario = Treex::Core::Scenario->new(from_file => 'myscenario.scen' ); $scenario->run; $scenario = Treex::Core::Scenario->new(from_string => 'W2A::EN::Segment language=en'); $scenario->start(); $scenario->apply_to_documents($doc1, $doc2); $scenario->end(); =head1 DESCRIPTION A Treex scenario consists of a sequence of (possibly parametrized) Treex blocks. Scenarios can be described by a simple textual format, which is either passed directly to the scenario construction, or is contained in a text file whose name is passed. The string description of scenarios looks as follows. 1) It contains a list of block names from which their 'C' prefixes were removed. 2) The block names are separated by one or more white spaces. 3) The block names are listed in the same order in which they should be applied on data. 4) For each block, there can be one or more parameters specified, using the C form. 5) Comments start with 'C<#>' and end with the nearest newline character. Scenario example: # morphological analysis of an English text Util::SetGlobal language=en selector=src Read::Text W2A::ResegmentSentences W2A::EN::Tokenize W2A::EN::NormalizeForms W2A::EN::FixTokenization W2A::EN::TagMorce =head1 METHODS =head2 Constructor =over 4 =item my $scenario = Treex::Core::Scenario->new(from_string => 'W2A::Tokenize language=en W2A::Lemmatize' ); Constructor parameter C specifies the names of blocks which are to be executed (in the specified order) when the scenario is applied on a L object. =item my $scenario = Treex::Core::Scenario->new(from_file => 'myscenario.scen' ); The scenario description is loaded from the file. =back =head2 Running the scenario =over 4 =item $scenario->run(); Run the scenario. One of the blocks (usually the first one) must be the document reader (see L) that produces the documents on which this scenario is applied. =item $scenario->apply_to_documents($treex_doc); Apply this scenario to a L instance obtained from elsewhere. Please note that C must be called before the first call to this method and C after the last call to this method. The scenario does not need to contain a document reader if documents are given explicitly. =item $scenario->start(); Apply C to all blocks in the scenario. This is called automatically by C, but must be called before C. =item $scenario->end(); Apply C to all blocks in the scenario. This is called automatically by C, but must be called after calls to C. =back =head2 Internal methods for loading scenarios =over 4 =item _load_scenario_file($filename) loads a scenario description from a file =item parse_scenario_string parses a textual description of a scenario =item construct_scenario_string constructs a scenario textual description from an existing scenario instance accepts named parameter multiline - when set, blocks are separated by newline instead of space =item load_blocks use blocks and call their constructors can be used for preloading blocks for e.g. server applications when running scenario blocks are loaded automatically =item init do all initialization so after this method scenario is ready to run currently just load blocks =item restart resets the document reader, in future it will rebuild reloaded blocks =back =head1 SEE ALSO L L =head1 AUTHORS Zdeněk Žabokrtský Martin Popel David Mareček Tomáš Kraut Martin Majliš Ondřej Dušek =head1 COPYRIGHT AND LICENSE Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.