# # Copyright (c) 2013-2020 Christian Jaeger, copying@christianjaeger.ch # # This is free software, offered under either the same terms as perl 5 # or the terms of the Artistic License version 2 or the terms of the # MIT License (Expat version). See the file COPYING.md that came # bundled with this file. # =head1 NAME PXML::Serialize =head1 SYNOPSIS =head1 DESCRIPTION =head1 SPECIAL VALUES There are some special values that the serializer will evaluate transparently: =over 4 =item promises from FP::Lazy are Cd =item code references are called with no arguments =item objects In body context, their `pxml_serialized_body_string` method (if available) will be called, in attribute context, `pxml_serialized_attribute_string`, in both cases the string is inserted into the output without escaping (see `PXML::Preserialize` for an example that uses this). Missing those, `string` will be called if available and the result escaped, otherwise an exception is thrown. =back =head1 NOTE This is alpha software! Read the status section in the package README or on the L. =cut package PXML::Serialize; use strict; use warnings; use warnings FATAL => 'uninitialized'; use Exporter "import"; our @EXPORT = qw(pxml_xhtml_print); our @EXPORT_OK = qw(pxml_print pxml_print_fragment pxml_xhtml_print_fast pxml_print_fragment_fast putxmlfile puthtmlfile attribute_escape content_escape ); our %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); use FP::Show; use PXML::Element; use PXML qw(is_pxml_element is_pxmlflush); use FP::Lazy; use FP::List; use FP::Stream; use Chj::xperlfunc qw(xprint xprintln); use FP::Weak 'weaken'; # instead of from Scalar::Util so that it can # be turned off globally (and we depend on FP # anyway) use Scalar::Util qw(blessed refaddr); use FP::Carp; sub is_somearray { @_ == 1 or fp_croak_arity 1; my $r = ref($_[0]); # XX mess, make this a proper dependency $r eq "ARRAY" or $r eq "PXML::Body" } sub is_empty_string { @_ == 1 or fp_croak_arity 1; defined $_[0] and !length ref $_[0] and $_[0] eq "" } my %attribute_escape = ('&' => '&', '<' => '<', '>' => '>', '"' => '"'); sub attribute_escape { my ($str) = @_; return "" unless defined $str; # XX or should attributes with undefined value be dropped? (Or, # OTOH, should list processing be done?) $str =~ s/([&<>"])/$attribute_escape{$1}/sg; $str } my %content_escape = ('&' => '&', '<' => '<', '>' => '>'); sub content_escape { my ($str) = @_; $str =~ s/([&<>])/$content_escape{$1}/sg; $str } sub pxmlforce; sub pxmlforce { @_ == 1 or fp_croak_arity 1; my ($v) = @_; if (my $r = ref $v) { if ($r eq "CODE") { pxmlforce(&$r()) } else { force $v } } else { $v } } sub object_force_escape { @_ == 4 or fp_croak_arity 4; my ($v, $string_method_for_context, $escape, $fh) = @_; # $v is certain to be a reference (XX ehr, not even that?!), but # not necessarily blessed. if (defined blessed $v) { if (my $m = $v->can($string_method_for_context)) { # no escaping return &$m($v, $fh); } elsif ( $m = # XX should this instead simply stringify using # '"$v"'? That would not show up errors with # context. But it would be less interruptive # perhaps? Just issue a warning? Ideal would # probably be to do the '""', but give a warning # if it was Perl's default stringification. How to # do this? $v->can("string") ) { return &$escape(&$m($v)); } } die "unexpected type of reference that doesn't have a 'string' method: " . (show $v); } # XXX hack, the code is really hopeless, should use ~same code as for # body parts. sub _attribute_val_to_string { my ($v, $fh) = @_; my $ref = ref $v; if (length($ref)) { if ($ref eq "ARRAY") { join("", map { _attribute_val_to_string($_, $fh) } @$v) } elsif (is_pxmlflush($v)) { flush $fh or die $!; "" } else { object_force_escape(pxmlforce($v), "pxml_serialized_attribute_string", \&attribute_escape, $fh) } } else { # fast path: attribute_escape $v } } sub _pxml_print_fragment_fast { @_ == 4 or fp_croak_arity 4; my ($v, $fh, $html5compat, $void_element_h) = @_; weaken $_[0] # necessary since we're also called with strings: if ref $_[0]; LP: { ## **NOTE**: this has seen some evil optimizations; before ## working on the code, please undo them first by using git ## revert. if (my $ref = ref $v) { if (defined(my $class = blessed $v)) { if ( $ref eq "PXML::Element" or $ref eq "PXML::_::XHTML" # ^ is this a worthwhile optimization? or $v->isa("PXML::Element") ) { PXML: my $n = $v->name; print $fh "<$n" or die $!; if (my $attrs = $v->maybe_attributes) { for my $k (sort keys %$attrs) { print $fh " $k=\"" or die $!; my $str = _attribute_val_to_string $$attrs{$k}, $fh; print $fh "$str\"" or die $!; } } my $body = $v->body; my $looksempty = # fast path ( not defined $body # XX allow undef or don't? Please # finally settle this! or (not ref $body and length($body) == 0) or ( is_somearray($body) and ( not @$body or ( @$body == 1 and ( # XX remove undef check here now, too? OK?--nope, necessary not defined $$body[0] or (is_somearray($$body[0]) and not @{ $$body[0] }) or is_empty_string($$body[0]) ) ) ) ) ); my $selfreferential; if ($html5compat) { if ($$void_element_h{$n}) { if ($looksempty) { $selfreferential = 1; } else { my $isempty = # slow path is_null(stream_mixed_flatten($body)); $selfreferential = $isempty; warn "html5 compatible serialization requested " . "but got void element '$n' that is not empty" if not $isempty; } } else { $selfreferential = 0; } } else { $selfreferential = $looksempty; } if ($selfreferential) { print $fh "/>" or die $!; } else { print $fh ">" or die $!; no warnings "recursion"; # hu. _pxml_print_fragment_fast($body, $fh, $html5compat, $void_element_h); print $fh "" or die $!; } } elsif (my $car_and_cdr = $v->can("car_and_cdr")) { PAIR: #my $a; ($a, $v) = &$car_and_cdr($v); _pxml_print_fragment_fast($a, $fh, $html5compat, $void_element_h); #_pxml_print_fragment_fast (cdr $v, $fh); redo LP; } elsif (my $for_each = $v->can("for_each")) { # catches null, too. Well. &$for_each( $v, sub { my ($a) = @_; _pxml_print_fragment_fast($a, $fh, $html5compat, $void_element_h); } ); } else { my $v2 = force($v, 1); # ^XX why pass nocache flag? (Was this to avoid # memory retention issues?) my $addr2 = refaddr($v2); if (defined($addr2) and $addr2 != refaddr($v)) { $v = $v2; redo LP; } elsif (is_somearray($v)) { # COPY-PASTE. Really should refactor # _pxml_print_fragment_fast into local hash-table # based dispatcher. no warnings "recursion"; # hu. for (@$v) { # XXX use Keep around $_ to prevent mutation of tree? # nope, can't, will prevent streaming. _pxml_print_fragment_fast($_, $fh, $html5compat, $void_element_h); } } elsif (is_pxmlflush $v) { flush $fh or die $! } else { # Fallback for references, XX copy-paste print $fh object_force_escape($v, "pxml_serialized_body_string", \&content_escape, $fh) or die $!; } } } else { if (is_somearray($v)) { no warnings "recursion"; # hu. for (@$v) { # XXX use Keep around $_ to prevent mutation of tree? # nope, can't, will prevent streaming. _pxml_print_fragment_fast($_, $fh, $html5compat, $void_element_h); } } # 'force' doesn't evaluate CODE (probably rightly so), # thus need to be explicit if we want 'late binding' # (e.g. reference to dynamic variables) during # serialization elsif ($ref eq "CODE") { $v = &$v(); redo LP; } elsif (is_null $v) { die "OBSOLETE?"; # end of linked list, nothing # XX obsolete now, since $v->can("for_each") above # will catch it already. } else { warn "XXX when does this happen?"; #use FP::Repl; #repl; # slow fallback... again, see above **NOTE** re # evil. $ref or die "BUG"; # we're in the if ref scope, right? goto PXML if $v->isa("PXML::Element"); goto PAIR if is_pair $v; # goto PROMISE if is_promise $v; # Fallback for references, XX copy-paste print $fh object_force_escape($v, "pxml_serialized_body_string", \&content_escape, $fh) or die $!; } } } elsif (not defined $v) { # (previously end of linked list marker) nothing; XX # should this give exception (to point out any issue with # deleted streams, the reason why I changed from using # undef to null)? But exception won't show a good # backtrace anyway at this point. #warn "warning: ignoring undef in PXML datastructure"; # XXX what to do about this? } else { #print $fh content_escape($v) or die $!; $v =~ s/([&<>])/$content_escape{$1}/sg; print $fh $v or die $!; } } } sub pxml_print_fragment_fast { @_ == 2 or fp_croak_arity 2; my ($v, $fh) = @_; weaken $_[0] if ref $_[0]; # ref check perhaps unnecessary here my $no_element = sub { @_ = ($v, $fh, undef, undef); goto \&_pxml_print_fragment_fast; }; my $with_first_element = sub { my ($firstel) = @_; weaken $_[0] if ref $_[0]; my $html5compat = $firstel->require_printing_nonvoid_elements_nonselfreferential; @_ = ($v, $fh, $html5compat, ($html5compat and $firstel->void_element_h)); goto \&_pxml_print_fragment_fast; }; if (length(my $r = ref $v)) { if (defined blessed $v and $v->isa("PXML::XHTML")) { @_ = ($v); goto &$with_first_element; } else { my $s = force(stream_mixed_flatten($v)->filter(\&is_pxml_element)); if (is_null $s) { goto &$no_element } else { @_ = (car $s); goto &$with_first_element; } } } else { goto &$no_element } } sub pxml_xhtml_print_fast { @_ >= 2 and @_ <= 3 or fp_croak_arity "2-3"; my ($v, $fh, $maybe_lang) = @_; weaken $_[0] if ref $_[0]; # ref check perhaps unnecessary here if (not ref $v or (defined(blessed $v) and not $v->isa("PXML::Element"))) { die "not an element: " . (show $v); } if (not "html" eq $v->name) { die "not an 'html' element: " . (show $v); } xprint($fh, "\n"); xprint($fh, "\n" ); # add attributes to toplevel element my $v2 = $v->maybe_attributes ? $v : $v->attributes_set( do { my $lang = $maybe_lang or die "missing 'lang' attribute from html element and no lang option given"; +{ xmlns => "http://www.w3.org/1999/xhtml", "xml:lang" => $lang, lang => $lang } } ); @_ = ($v2, $fh); goto \&pxml_print_fragment_fast; } # for now, sub pxml_xhtml_print; *pxml_xhtml_print = \&pxml_xhtml_print_fast; use Chj::xopen "xopen_write"; sub pxml_print { @_ == 2 or fp_croak_arity 2; my ($v, $fh) = @_; weaken $_[0] if ref $_[0]; # ref check perhaps unnecessary here xprintln($fh, q{}); pxml_print_fragment_fast($v, $fh); } sub putxmlfile { @_ == 2 or fp_croak_arity 2; my ($path, $xml) = @_; weaken $_[1] if ref $_[0]; # ref check perhaps unnecessary here my $f = xopen_write $path; binmode($f, ":utf8") or die "binmode"; # ^ XX should this use ":encoding(UTF-8)"? To validate in-memory # strings? Shouldn't we just check all *inputs*? pxml_print($xml, $f); $f->xclose; } sub PXML::Element::xmlfile { my ($v, $path) = @_; weaken $_[0]; putxmlfile($path, $v) } sub puthtmlfile { @_ >= 2 and @_ <= 3 or fp_croak_arity "2-3"; my ($path, $v, $maybe_lang) = @_; weaken $_[1] if ref $_[0]; # ref check perhaps unnecessary here #xmkdir_p dirname $path; my $out = xopen_write($path); binmode $out, ":utf8" or die "binmode"; # ^ XX dito, see comment in putxmlfile pxml_xhtml_print_fast($v, $out, $maybe_lang || "en"); $out->xclose; } sub PXML::Element::htmlfile { my ($v, $path, $maybe_lang) = @_; weaken $_[0]; puthtmlfile($path, $v, $maybe_lang) } 1