package CodeGen::Cpppp::Enum; our $VERSION = '0.005'; # VERSION # ABSTRACT: Helper for enumerations and generating related utility functions use v5.20; use warnings; use Carp; use experimental 'signatures', 'lexical_subs', 'postderef'; use Scalar::Util 'looks_like_number'; use List::Util 'any', 'min', 'max'; BEGIN { *uniqstr= List::Util->can('uniqstr') // sub { my %seen; grep !$seen{$_}++, @_ } } use CodeGen::Cpppp::CParser; sub new($class, %attrs) { my $self= bless {}, $class; # apply num_format first because it affects set_values $self->num_format(delete $attrs{num_format}) if exists $attrs{num_format}; $self->$_($attrs{$_}) for keys %attrs; return $self; } sub prefix($self, @val) { if (@val) { $self->{prefix}= $val[0]; return $self } $self->{prefix} // '' } sub macro_prefix($self, @val) { if (@val) { $self->{macro_prefix}= $val[0]; return $self } $self->{macro_prefix} // uc($self->prefix); } sub symbol_prefix($self, @val) { if (@val) { $self->{symbol_prefix}= $val[0]; return $self } $self->{symbol_prefix} // lc($self->prefix); } sub type($self, @val) { if (@val) { $self->{type}= $val[0]; return $self; } $self->{type} // 'int'; } sub values($self, @val) { return $self->set_values(@val) if @val; @{ $self->{values} // [] } } sub set_values($self, @spec) { my @values; for (@spec == 1 && ref $spec[0]? @{$spec[0]} : @spec) { if ('ARRAY' eq ref) { push @values, [ @$_ ]; } elsif (/^\w+$/) { push @values, [ $_ ]; } else { defined $values[-1] or croak "Got an enum value '$_' before a name"; defined $values[-1][1] and croak "'$_' is not a valid enum name"; $values[-1][1]= $_; } } # Fill in missing values with next sequential integer my $prev= $values[0][1] //= 0; for (@values[1..$#values]) { if (!defined $_->[1]) { my ($base, $ofs, $fmt)= $self->_parse_value_expr($prev); $_->[1]= sprintf $fmt, $ofs+1; } $prev= $_->[1]; } $self->{values}= \@values; $self->{_analysis}= undef; $self; } sub value_table_var($self, @val) { if (@val) { $self->{value_table_var}= $val[0]; return $self; } $self->{value_table_var} // $self->symbol_prefix . 'value_table'; } sub indent($self, @val) { if (@val) { $self->{indent}= $val[0]; return $self; } $self->{indent} // ' '; } sub _current_indent { $CodeGen::Cpppp::INDENT // shift->indent; } sub num_format($self, @val) { if (@val) { $self->{num_format}= $val[0]; $self->{_analysis}= undef; return $self; } $self->{num_format} // '%d'; } sub max_waste_factor($self, @val) { if (@val) { $self->{max_waste_factor}= $val[0]; $self->{_analysis}= undef; return $self; } $self->{max_waste_factor} // 2; } our %_algorithm= map +( $_ => 1 ), qw( bsearch hashtable switch ); sub algorithm($self, @val) { if (@val) { !defined $val[0] or $_algorithm{$val[0]} or croak "Unknown parse_design '$val[0]', expected one of ".join(', ', keys %_algorithm); $self->{algorithm}= $val[0]; return $self; } $self->{algorithm} } sub _parse_value_expr($self, $val) { # Make the common case fast return '', +$val, '%d' if $val =~ /^[-+]?(?:0|[1-9][0-9]*)\Z/; # else need to parse the expression my @tokens= CodeGen::Cpppp::CParser->tokenize($val); my $type_pattern= join '', map $_->type, @tokens; # Recognize patterns where a +N occurs at the end of the expression # Else, the whole value is the expression and will get '+N' appended. return $val, 0, "($val+".($self->{num_format}//'%d').")" unless $type_pattern =~ /(^|[-+])integer\W*$/; my $context= $1; # walk backward to last 'integer' token my $i= $#tokens; $i-- while $tokens[$i]->type ne 'integer'; # could be start of string, -N, +N, EXPR-N, EXPR+N, or EXPR OP -N my $fmt_str= $val; my ($pos, $pos2)= ($tokens[$i]->src_pos, $tokens[$i]->src_pos+$tokens[$i]->src_len); my $n= $tokens[$i]->value; # If start of string or preceeded by '+', nothing to do. # If preceeded by '-', need to convert that to '+' in format string if ($context eq '-') { $n= -$n; $pos= $tokens[$i-1]->src_pos + 1; substr($fmt_str, $tokens[$i-1]->src_pos, 1, '+'); } my $num_str= substr($val, $tokens[$i]->src_pos, $tokens[$i]->src_len); my $notation= $self->{num_format} // $num_str =~ /^-?0x[0-9A-F]+$/? 'X' : $num_str =~ /^-?0x[0-9a-f]+$/? 'x' : $num_str =~ /^-?0[0-9]+/? 'o' : 'd'; substr($fmt_str, $pos, $pos2-$pos, '%'.($pos2-$pos).$notation); # The "base" is everying to the left of the number minus the number of "(" # to match the number of ")" to the right of the number my $rparen= grep $_->type eq ')', @tokens[$i..$#tokens]; shift @tokens while $tokens[0]->type eq '(' && $rparen--; my $base= substr($val, $tokens[0]->src_pos, $pos-$tokens[0]->src_pos); return ($base, $n, $fmt_str); } sub is_symbolic($self) { $self->_analysis->{base_expr} ne ''; } sub is_sequential($self) { $self->_analysis->{is_seq} } sub is_nearly_sequential($self) { $self->_analysis->{is_nearly_seq} } sub _analysis($self) { $self->{_analysis} //= do { my @vals= map +[ $_->[0], $self->_parse_value_expr($_->[1]) ], $self->values; my $base_expr= $vals[0][1]; my %seen_ofs= ( $vals[0][2] => 1 ); for (@vals[1..$#vals]) { # Can't be sequential unless they share a symbolic base expression $base_expr= undef, last unless $_->[1] eq $base_expr; $seen_ofs{$_->[2]}++; } my %info= ( vals => \@vals ); if (defined $base_expr) { # Find the min/max my ($min, $max)= (min(keys %seen_ofs), max(keys %seen_ofs)); # Is it sequential? my ($is_seq, $is_nearly_seq, $gap); # don't iterate unless the range is reasonable if (($max - $min - @vals) <= $self->max_waste_factor * @vals) { $gap= 0; for ($min .. $max) { $gap++ unless $seen_ofs{$_}; } $is_seq= $gap == 0; $is_nearly_seq= $gap <= $self->max_waste_factor * ($max-$min+1-$gap); } $info{is_seq}= $is_seq; $info{is_nearly_seq}= $is_nearly_seq; $info{gap}= $gap; $info{min}= $min; $info{max}= $max; $info{base_expr}= $base_expr; } \%info }; } sub generate_declaration($self, %options) { return join "\n", $self->_generate_declaration_macros(\%options); } sub _generate_declaration_macros($self, $options) { my @vals= $self->values; my $name_width= max map length($_->[0]), @vals; my $prefix= $self->macro_prefix; my $fmt= "#define $prefix%-${name_width}s %s"; return map sprintf($fmt, $_->[0], $_->[1]), @vals; } sub generate_static_tables($self, %options) { return join "\n", _generate_enum_table($self, \%options); } sub _generate_enum_table($self, $options) { my $prefix= $self->prefix; my @names= map $prefix . $_->[0], $self->values; my $name_width= max map length, @names; my $indent= $self->_current_indent; my $fmt= $indent.$indent.'{ "%s",%*s %s },'; my @code= ( "const struct { const char *name; const ".$self->type." value; }", $indent . $self->value_table_var . "[] = {", (map sprintf($fmt, $_, $name_width-length, '', $_), @names), $indent . '};' ); substr($code[-2], -1, 1, ''); # remove trailing comma return @code; } sub generate_lookup_by_value($self, %options) { return join "\n", $self->_generate_lookup_by_value_switch(\%options); } sub _generate_lookup_by_value_switch($self, $options) { my @vals= $self->values; my $name_width= max map length($_->[0]), @vals; my $info= $self->_analysis; my $val_variable= 'value'; my $prefix= $self->macro_prefix; my $enum_table= $self->value_table_var; # Generate a switch() table to look them up my @code= "switch ($val_variable) {"; my $fmt= "case $prefix%s:%*s return ${enum_table}[%d].name;"; for (0..$#vals) { push @code, sprintf($fmt, $vals[$_][0], $name_width - length($vals[$_][0]), '', $_); } push @code, 'default: return NULL;', '}'; return @code; } sub generate_lookup_by_name($self, %options) { return join "\n", $self->_generate_lookup_by_name_switch(\%options); } sub _generate_lookup_by_name_switch($self, $options) { my @vals= $self->values; my $info= $self->_analysis; my $caseless= $options->{caseless}; my $prefixless= $options->{prefixless}; my $prefixlen= length($self->macro_prefix); my $indent= $self->_current_indent; my $len_var= $options->{len_var} // 'len'; my $str_ptr= $options->{str_ptr} // 'str'; my $enum_table= $self->value_table_var; my $strcmp= $caseless? "strcasecmp" : "strcmp"; my $idx_type= @vals <= 0x7F? 'int8_t' : @vals <= 0x7FFF? 'int16_t' : @vals <= 0x7FFFFFFF? 'int32_t' : 'int64_t'; my @search_set; for (0..$#vals) { push @search_set, [ $self->macro_prefix . $vals[$_][0], $_ ]; push @search_set, [ $vals[$_][0], -$_ ] if $prefixless; } my %by_len; for (@search_set) { push @{ $by_len{length $_->[0]} }, $_; } my $longest= max(keys %by_len); my @code= ( "$idx_type test_el= 0;", ("char str_buf[$longest+1];")x!!$caseless, "switch ($len_var) {", ); # Generate one binary decision tree for each string length for (sort { $a <=> $b } keys %by_len) { my %pivot_pos; my @split_expr= $self->_binary_split($by_len{$_}, $caseless, $caseless? 'str_buf' : $str_ptr, \%pivot_pos); push @code, "case $_:", ($caseless? ( map "${indent}str_buf[$_]= tolower(${str_ptr}[$_]);", sort { $a <=> $b } keys %pivot_pos ) : ()), (map "$indent$_", @split_expr), "${indent}break;", } push @code, "default:", "${indent}return false;", "}"; # If allowing prefixless match, some test_el will be negative, meaning to # test str+prefixlen if ($prefixless) { push @code, "if (test_el < 0) {", "${indent}if ($strcmp($str_ptr, ${enum_table}[-test_el].name + $prefixlen) == 0) {", "${indent}${indent}if (value_out) *value_out= ${enum_table}[-test_el].value;", "${indent}${indent}return true;", "${indent}}", "${indent}return false;", "}"; } push @code, "if ($strcmp($str_ptr, ${enum_table}[test_el].name) == 0) {", "${indent}if (value_out) *value_out= ${enum_table}[test_el].value;", "${indent}return true;", "}", "return false;"; return @code; } sub _binary_split($self, $vals, $caseless, $str_var, $pivot_pos) { # Stop at length 1 return qq{test_el= $vals->[0][1];} if @$vals == 1; # Find a character comparison that splits the list roughly in half. my $goal= .5 * scalar @$vals; # Test every possible character and keep track of the best. my ($best_i, $best_ch, $best_less); for (my $i= 0; $i < length $vals->[0][0]; ++$i) { if (!$caseless) { for my $ch (uniqstr map substr($_->[0], $i, 1), @$vals) { my @less= grep substr($_->[0], $i, 1) lt $ch, @$vals; ($best_i, $best_ch, $best_less)= ($i, $ch, \@less) if !defined $best_i || abs($goal - @less) < abs($goal - @$best_less); } } else { for my $ch (uniqstr map lc substr($_->[0], $i, 1), @$vals) { my @less= grep +(lc(substr($_->[0], $i, 1)) lt $ch), @$vals; ($best_i, $best_ch, $best_less)= ($i, $ch, \@less) if !defined $best_i || abs($goal - @less) < abs($goal - @$best_less); } } } $pivot_pos->{$best_i}++; # inform caller of which chars were used # Binary split the things less than the pivot character my @less_src= $self->_binary_split($best_less, $caseless, $str_var, $pivot_pos); # Binary split the things greater-or-equal to the pivot character my %less= map +($_->[0] => 1), @$best_less; my @ge_src= $self->_binary_split([ grep !$less{$_->[0]}, @$vals ], $caseless, $str_var, $pivot_pos); my $indent= $self->_current_indent; return ( "if (${str_var}[$best_i] < '$best_ch') {", (map $indent.$_, @less_src), (@ge_src > 1 # combine "else { if" ? ( '} else '.$ge_src[0], @ge_src[1..$#ge_src] ) # else { statement } : ( '} else {', (map $indent.$_, @ge_src), '}' ) ) ); } 1; __END__ =pod =encoding UTF-8 =head1 NAME CodeGen::Cpppp::Enum - Helper for enumerations and generating related utility functions =head1 SYNOPSIS ## my $enum= CodeGen::Cpppp::Enum->new( ## prefix => 'MYENUM_', type => 'int', values => q( ## VAL_1 1 ## VAL_2 ## ALT_VAL ## SOME_VAL ## OTHER_VAL 1<<10 ## NONE -1 ## )); ## section PUBLIC; ${{ $enum->generate_declaration }} extern const char * myenum_name(int value); extern bool myenum_parse(const char *str, size_t len, int *value_out); ## section PRIVATE; ${{ $enum->generate_static_tables }} const char * myenum_name(int value) { ${{ $enum->generate_lookup_by_value }} } bool myenum_parse(const char *str, size_t len, int *value_out) { ${{ $enum->generate_lookup_by_name }} } =head1 DESCRIPTION This utility module helps you generate C code for enumerations. First, you create an instance of this object and load it with a list of enum values. These can come from a supplied list, or you can parse a header to find them. This module supports multiple names with the same value, but not multiple values with the same name. Next, you can generate the definition of the enum as either C<#define> macros or a C++ C<< enum { ... } >> syntax. This also can generate code that looks up the name by the value, or parses a name to get the value. The header would generally look like: #define EXAMPLE_1 1 #define EXAMPLE_2 2 // Returns true if found a match extern bool example_parse(const char *ch, int len, int *value_out); extern const char *example_name(int enum_val); There are several implementations to choose from, and a sensible one will be chosen by default based on the patterns in your enum values. =head1 CONSTRUCTOR =head2 new Standard constructor. Pass values for any of the non-readonly attributes below. =head1 ATTRIBUTES =head2 prefix String to be prefixed onto each name of the enum. You may then optionally allow name lookups that match strings without the prefix as well as ones that include it. =head2 type Defaults to 'int'. =head2 values Returns a list of C<< [ $name, $value ] >>. You can initialize it with a list or arrayref containing C<$name> or C<< [ $name, $value ] >>. Any element without a value will get the next sequential value from the previous entry, starting from 0. =head2 value_table_var C Variable name of the constant which holds the official list of enum values. =head2 indent Set to a literal string to use for each level of indent in generated code. =head2 num_format Controls formatting of integers, as decimal or hex. Can be any single placeholder known to sprintf, like '%d', '%x', or '%X'. =head2 max_waste_ratio Permissible ratio of empty lookup table elements vs. populated elements. If there would be more empty cells than that, a different algorithm will be chosen. The default is 2, meaning that the table must be at least 1/3 populated. =head2 algorithm One of 'bsearch', 'hashtable', 'switch'. For each, the enum is stored in an array of string/value pairs (unless the values match the array index, then they get omitted). If this happens to be the sorted order of the names or sorted order of the values, those optimizations are taken into account. Otherwise, the algorithm operates as follows: =over =item bsearch Secondary arrays of integers are built that refer back to the enum array. These arrays are searched during name or value lookups using the C library C function. This results in the least space usage. =item hashtable Hash tables will be built that refer back to the enum array. Hash function constants will be chosen to provide the fewest number of collisions possible. This results in good performance without much added code. =item switch The lookup functions will be built using a combination of C 'switch' statements and binary character-comparison 'if' statements. This results in the absolute fastest performance, but may generate a fair amount of parsing code. =back =head2 is_symbolic True if one if your values is an expression instead of a constant integer. In this case, only the switch algorithm can be used for lookups. =head2 is_sequential Read-only, calculated by whether all the values have adjacent numeric values. =head2 is_sequentialish Read-only, calculated by whether all the values can be stored in an array at the offset of their numeric value without wasting more than C elements. =head2 generate_declaration @code_lines= $enum->generate_declaration(%options); Return either a list of C<< #define NAME ... >> or the lines of a C++ C<< enum Foo { ... } >>. =head2 generate_static_tables @code_lines= $enum->generate_static_tables(%options); Return lines of code that declare the tables needed for the lookups. This always includes a table in the same order as the declaration of the enum values, but may also include additional hash tables or lookup tables to represent the values in alphabetical order. =head2 generate_lookup_by_value @code_lines= $enum->generate_lookup_by_value(%options); Return lines of code that return a C<< const char * >> of the name associated with a value. The variable holding the value is named 'value' by default, and the implementation returns NULL if this value does not have a name. =head2 generate_lookup_by_name @code_lines= $enum->generate_lookup_by_name(%options); Return lines of code that take a name in a variable C, C bytes long, and store the enum value for that name into a variable C and then return true (if found) or false if not found. =head1 AUTHOR Michael Conrad =head1 VERSION version 0.005 =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2024 by Michael Conrad. This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut