From 3588d5186ef7321fa573adbb62f42b05d7a138be Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Fri, 18 Sep 2020 14:05:34 +0300 Subject: - gettext: merge patch from Sunil Mohan Adapa which rewrites plural parser to not use eval() - fix typo in aforementioned patch which caused plurals to never load - update code again to newer PHP constructor syntax --- lib/gettext/gettext.inc.php | 8 +- lib/gettext/gettext.php | 74 ++----- lib/gettext/plurals.php | 461 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 484 insertions(+), 59 deletions(-) create mode 100644 lib/gettext/plurals.php diff --git a/lib/gettext/gettext.inc.php b/lib/gettext/gettext.inc.php index c9f7dc016..ed5be6bbd 100644 --- a/lib/gettext/gettext.inc.php +++ b/lib/gettext/gettext.inc.php @@ -69,10 +69,10 @@ function get_list_of_locales($locale) { * sr_CS.UTF-8@latin, sr_CS@latin, sr@latin, sr_CS.UTF-8, sr_CS, sr. */ $locale_names = array(); - $lang = NULL; - $country = NULL; - $charset = NULL; - $modifier = NULL; + $lang = null; + $country = null; + $charset = null; + $modifier = null; if ($locale) { if (preg_match("/^(?P[a-z]{2,3})" // language code ."(?:_(?P[A-Z]{2}))?" // country code diff --git a/lib/gettext/gettext.php b/lib/gettext/gettext.php index edbd93304..173d4c448 100755 --- a/lib/gettext/gettext.php +++ b/lib/gettext/gettext.php @@ -21,6 +21,8 @@ */ +require('plurals.php'); + /** * Provides a simple gettext replacement that works independently from * the system's gettext abilities. 
@@ -39,16 +41,16 @@ class gettext_reader { //private: var $BYTEORDER = 0; // 0: low endian, 1: big endian - var $STREAM = NULL; + var $STREAM = null; var $short_circuit = false; var $enable_cache = false; - var $originals = NULL; // offset of original table - var $translations = NULL; // offset of translation table - var $pluralheader = NULL; // cache header field for plural forms + var $originals = null; // offset of original table + var $translations = null; // offset of translation table + var $pluralheader = null; // cache header field for plural forms var $total = 0; // total string count - var $table_originals = NULL; // table for original strings (offsets) - var $table_translations = NULL; // table for translated strings (offsets) - var $cache_translations = NULL; // original -> translation mapping + var $table_originals = null; // table for original strings (offsets) + var $table_translations = null; // table for translated strings (offsets) + var $cache_translations = null; // original -> translation mapping /* Methods */ @@ -269,41 +271,6 @@ class gettext_reader { } } - /** - * Sanitize plural form expression for use in PHP eval call. - * - * @access private - * @return string sanitized plural form expression - */ - function sanitize_plural_expression($expr) { - // Get rid of disallowed characters. - $expr = preg_replace('@[^a-zA-Z0-9_:;\(\)\?\|\&=!<>+*/\%-]@', '', $expr); - - // Add parenthesis for tertiary '?' operator. - $expr .= ';'; - $res = ''; - $p = 0; - for ($i = 0; $i < strlen($expr); $i++) { - $ch = $expr[$i]; - switch ($ch) { - case '?': - $res .= ' ? ('; - $p++; - break; - case ':': - $res .= ') : ('; - break; - case ';': - $res .= str_repeat( ')', $p) . ';'; - $p = 0; - break; - default: - $res .= $ch; - } - } - return $res; - } - /** * Parse full PO header and extract only plural forms line. * @@ -327,17 +294,17 @@ class gettext_reader { function get_plural_forms() { // lets assume message number 0 is header // this is true, right? 
- $this->load_tables(); + $this->load_tables(); // cache header field for plural forms - if (! is_string($this->pluralheader)) { + if ($this->pluralheader === null) { if ($this->enable_cache) { $header = $this->cache_translations[""]; } else { $header = $this->get_translation_string(0); } $expr = $this->extract_plural_forms_header_from_po_header($header); - $this->pluralheader = $this->sanitize_plural_expression($expr); + $this->pluralheader = new PluralHeader($expr); } return $this->pluralheader; } @@ -353,17 +320,14 @@ class gettext_reader { if (!is_int($n)) { throw new InvalidArgumentException( "Select_string only accepts integers: " . $n); - } - $string = $this->get_plural_forms(); - $string = str_replace('nplurals',"\$total",$string); - $string = str_replace("n",$n,$string); - $string = str_replace('plural',"\$plural",$string); + } + + $plural_header = $this->get_plural_forms(); + $plural = $plural_header->expression->evaluate($n); - $total = 0; - $plural = 0; + if ($plural < 0) $plural = 0; + if ($plural >= $plural_header->total) $plural = $plural_header->total - 1; - eval("$string"); - if ($plural >= $total) $plural = $total - 1; return $plural; } @@ -387,7 +351,7 @@ class gettext_reader { // find out the appropriate form $select = $this->select_string($number); - // this should contains all strings separated by NULLs + // this should contains all strings separated by nulls $key = $single . chr(0) . $plural; diff --git a/lib/gettext/plurals.php b/lib/gettext/plurals.php new file mode 100644 index 000000000..dbf912c37 --- /dev/null +++ b/lib/gettext/plurals.php @@ -0,0 +1,461 @@ + + + Drop in replacement for native gettext. + + This file is part of PHP-gettext. + + PHP-gettext is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + PHP-gettext is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PHP-gettext; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +*/ + +/** + * Lexical analyzer for gettext plurals expression. Takes a string to parse + * during construction and returns a single token every time peek() or + * fetch_token() are called. The special string '__END__' is returned if there + * are no more tokens to be read. Spaces are ignored during tokenization. + */ +class PluralsLexer { + private $string; + private $position; + + /** + * Constructor + * + * @param string string Contains the gettext plurals expression to + * analyze. + */ + public function __construct(string $string) { + $this->string = $string; + $this->position = 0; + } + + /** + * Return the next token and the length to advance the read position without + * actually advancing the read position. Tokens for operators and variables + * are simple strings containing the operator or variable. If there are no + * more tokens to provide, the special value ['__END__', 0] is returned. If + * there was an unexpected input an Exception is raised. + * + * @access private + * @throws Exception If there is unexpected input in the provided string. + * @return array The next token and length to advance the current position. + */ + private function _tokenize() { + $buf = $this->string; + + // Consume all spaces until the next token + $index = $this->position; + while ($index < strlen($buf) && $buf[$index] == ' ') { + $index++; + } + $this->position = $index; + + // Return special token if end of the string is reached. 
+ if (strlen($buf) - $index == 0) { + return ['__END__', 0]; + } + + // Operators with two characters + $doubles = ['==', '!=', '>=', '<=', '&&', '||']; + $next = substr($buf, $index, 2); + if (in_array($next, $doubles)) { + return [$next, 2]; + } + + // Operators with single character or variable 'n'. + $singles = [ + 'n', '(', ')', '?', ':', '+', '-', '*', '/', '%', '!', '>', '<']; + if (in_array($buf[$index], $singles)) { + return [$buf[$index], 1]; + } + + // Whole number constants, return an integer. + $digits = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']; + $pos = $index; + while ($pos < strlen($buf) && in_array($buf[$pos], $digits)) { + $pos++; + } + if ($pos != $index) { + $length = $pos - $index; + return [(int)substr($buf, $index, $length), $length]; + } + + // Throw an exception for all other unexpected input in the string. + throw new Exception('Lexical analysis failed'); + } + + /** + * Return the next token without actually advancing the read position. + * Tokens for operators and variables are simple strings containing the + * operator or variable. If there are no more tokens to provide, the special + * value '__END__' is returned. If there was an unexpected input an + * Exception is raised. + * + * @throws Exception If there is unexpected input in the provided string. + * @return string The next token. + */ + public function peek() { + list($token, $length) = $this->_tokenize(); + return $token; + } + + /** + * Return the next token after advancing the read position. Tokens for + * operators and variables are simple strings containing the operator or + * variable. If there are no more tokens to provide, the special value + * '__END__' is returned. If there was an unexpected input an Exception is + * raised. + * + * @throws Exception If there is unexpected input in the provided string. + * @return string The next token. 
+ */ + public function fetch_token() { + list($token, $length) = $this->_tokenize(); + $this->position += $length; + return $token; + } +} + +/** + * A parsed representation of the gettext plural expression. This is a tree + * containing further expressions depending on how nested the given input is. + * Calling the evaluate() function computes the value of the expression if the + * variable 'n' is set to a certain value. This is used to decide which plural + * string translation to use based on the number of items at hand. + */ +class PluralsExpression { + private $operator; + private $operands; + + const BINARY_OPERATORS = [ + '==', '!=', '>=', '<=', '&&', '||', '+', '-', '*', '/', '%', '>', '<']; + const UNARY_OPERATORS = ['!']; + + /** + * Constructor + * + * @param string Operator for the expression. + * @param (int|string|PluralsExpression)[] Variable number of operands of the + * expression. One int operand is expected in case the operator is 'const'. + * One string operand with value 'n' is expected in case the operator is + * 'var'. For all other operators, the operands must be objects of type + * PluralsExpression. Unary operators expect one operand, binary operators + * expect two operands and ternary operators expect three operands. + */ + public function __construct($operator, ...$operands) { + $this->operator = $operator; + $this->operands = $operands; + } + + /** + * Return a parenthesized string representation of the expression for + * debugging purposes. + * + * @return string A string representation of the expression. 
+ */ + public function to_string() { + if ($this->operator == 'const' || $this->operator == 'var') { + return $this->operands[0]; + } elseif (in_array($this->operator, self::BINARY_OPERATORS)) { + return sprintf( + "(%s %s %s)", $this->operands[0]->to_string(), $this->operator, + $this->operands[1]->to_string()); + } elseif (in_array($this->operator, self::UNARY_OPERATORS)) { + return sprintf( + "(%s %s)", $this->operator, $this->operands[0]->to_string()); + } elseif ($this->operator == '?') { + return sprintf( + "(%s ? %s : %s)", $this->operands[0]->to_string(), + $this->operands[1]->to_string(), + $this->operands[2]->to_string()); + } + } + + /** + * Return the computed value of the expression if the variable 'n' is set to + * a certain value. + * + * @param int The value of the variable n to use when evaluating. + * @throws Exception If the expression has been constructed incorrectly. + * @return int The value of the expression after evaluation. + */ + public function evaluate($n) { + if (!in_array($this->operator, ['const', 'var'])) { + $operand1 = $this->operands[0]->evaluate($n); + } + if (in_array($this->operator, self::BINARY_OPERATORS) || + $this->operator == '?') { + $operand2 = $this->operands[1]->evaluate($n); + } + if ($this->operator == '?') { + $operand3 = $this->operands[2]->evaluate($n); + } + + switch ($this->operator) { + case 'const': + return $this->operands[0]; + case 'var': + return $n; + case '!': + return !($operand1); + case '==': + return $operand1 == $operand2; + case '!=': + return $operand1 != $operand2; + case '>=': + return $operand1 >= $operand2; + case '<=': + return $operand1 <= $operand2; + case '>': + return $operand1 > $operand2; + case '<': + return $operand1 < $operand2; + case '&&': + return $operand1 && $operand2; + case '||': + return $operand1 || $operand2; + case '+': + return $operand1 + $operand2; + case '-': + return $operand1 - $operand2; + case '*': + return $operand1 * $operand2; + case '/': + return 
(int)($operand1 / $operand2); + case '%': + return $operand1 % $operand2; + case '?': + return $operand1 ? $operand2 : $operand3; + default: + throw new Exception('Invalid expression'); + } + } +} + +/** + * A simple operator-precedence parser for gettext plural expressions. Takes a + * string during construction and returns a PluralsExpression tree when + * parse() is called. + */ +class PluralsParser { + private $lexer; + + /* + * Operator precedence. The parsing only happens with minimum precedence of + * 0. However, ':' and ')' exist here to make sure that parsing does not + * proceed beyond them when they are not to be parsed. + */ + private const PREC = [ + ':' => -1, '?' => 0, '||' => 1, '&&' => 2, '==' => 3, '!=' => 3, + '>' => 4, '<' => 4, '>=' => 4, '<=' => 4, '+' => 5, '-' => 5, '*' => 6, + '/' => 6, '%' => 6, '!' => 7, '__END__' => -1, ')' => -1 + ]; + + // List of right associative operators + private const RIGHT_ASSOC = ['?']; + + /** + * Constructor + * + * @param string string the plural expression to be parsed. + */ + public function __construct(string $string) { + $this->lexer = new PluralsLexer($string); + } + + /** + * Expect a primary next for parsing and return a PluralsExpression or throw + * an exception otherwise. A primary can be the variable 'n', a whole + * number constant, a unary operator expression string with '!', or a + * parenthesis expression. + * + * @throws Exception If the next token is not a primary or if parenthesis + * expression is not closed properly with ')'. + * @return PluralsExpression That is constructed from the parsed primary. 
+ */ + private function _parse_primary() { + $token = $this->lexer->fetch_token(); + if ($token === 'n') { + return new PluralsExpression('var', 'n'); + } elseif (is_int($token)) { + return new PluralsExpression('const', (int)$token); + } elseif ($token === '!') { + return new PluralsExpression('!', $this->_parse_primary()); + } elseif ($token === '(') { + $result = $this->_parse($this->_parse_primary(), 0); + if ($this->lexer->fetch_token() != ')') { + throw new Exception('Mismatched parenthesis'); + } + return $result; + } + + throw new Exception('Primary expected'); + } + + /** + * Fetch an operator from the lexical analyzer and test for it. Optionally + * advance the position of the lexical analyzer to next token. Raise + * exception if the token retrieved is not an operator. + * + * @access private + * @param bool peek A flag to indicate whether the position of the lexical + * analyzer should *not* be advanced. If false, the lexical analyzer is + * advanced by one token. + * @throws Exception If the token read is not an operator. + * @return string The operator that has been fetched from the lexical + * analyzer. + */ + private function _parse_operator($peek) { + if ($peek) { + $token = $this->lexer->peek(); + } else { + $token = $this->lexer->fetch_token(); + } + + if ($token !== null && !array_key_exists($token, self::PREC)) { + throw new Exception('Operator expected'); + } + return $token; + } + + /** + * A parsing method suitable for recursion. + * + * @access private + * @param PluralsExpression left_side A pre-parsed left-hand side expression + * of the final expression to be constructed. This helps with recursion. + * @param int min_precedence The minimum value of precedence for the + * operators to be considered for parsing. Parsing will stop and current + * expression is returned if an operator of a lower precedence is + * encountered. + * @throws Exception If the input string does not conform to the grammar of + * the gettext plural expression. 
+ * @return PluralsExpression A complete expression after parsing. + */ + private function _parse($left_side, $min_precedence) { + $next_token = $this->_parse_operator(true); + + while (self::PREC[$next_token] >= $min_precedence) { + $operator = $this->_parse_operator(false); + $right_side = $this->_parse_primary(); + + $next_token = $this->_parse_operator(true); + + /* + * Consume (recursively) into right hand side all expressions of higher + * precedence. + */ + while ((self::PREC[$operator] < self::PREC[$next_token]) || + ((self::PREC[$operator] == self::PREC[$next_token]) && + in_array($operator, self::RIGHT_ASSOC))) { + $right_side = $this->_parse( + $right_side, self::PREC[$next_token]); + $next_token = $this->_parse_operator(true); + } + + if ($operator != '?') { + /* + * Handling for all binary operators. Consume into left hand side all + * expressions of equal precedence. + */ + $left_side = new PluralsExpression($operator, $left_side, $right_side); + } else { + // Special handling for (a ? b : c) expression + $operator = $this->lexer->fetch_token(); + if ($operator != ':') { + throw new Exception('Invalid ? expression'); + } + + $right_side2 = $this->_parse( + $this->_parse_primary(), self::PREC[$operator] + 1); + $next_token = $this->_parse_operator(true); + $left_side = new PluralsExpression( + '?', $left_side, $right_side, $right_side2); + } + } + return $left_side; + } + + /** + * A simple implementation of an operator-precedence parser. See: + * https://en.wikipedia.org/wiki/Operator-precedence_parser for an analysis + * of the algorithm. + * + * @throws Exception If the input string does not conform to the grammar of + * the gettext plural expression. + * @return PluralsExpression A complete expression after parsing. + */ + public function parse() { + $expression = $this->_parse($this->_parse_primary(), 0); + // Special handling for an extra ')' at the end. 
+ if ($this->lexer->peek() != '__END__') { + throw new Exception('Could not parse completely'); + } + return $expression; + } +} + +/** + * Provides a class to parse the value of the 'Plural-Forms:' header in the + * gettext translation files. Holds the expression tree and the number of + * plurals after parsing. Parsing happens during construction which takes as + * its only argument the string to parse. Errors during parsing are silently + * suppressed and the fallback behavior is used with the value for Germanic + * languages as follows: "nplurals=2; plural=n == 1 ? 0 : 1;". + */ +class PluralHeader { + public $total; + public $expression; + + /** + * Constructor + * + * @param string The value of the Plural-Forms: header as seen in .po files. + */ + function __construct($string) { + try { + list($total, $expression) = $this->parse($string); + } catch (Exception $e) { + $string = "nplurals=2; plural=n == 1 ? 0 : 1;"; + list($total, $expression) = $this->parse($string); + } + $this->total = $total; + $this->expression = $expression; + } + + /** + * Return the number of plural forms and the parsed expression tree. + * + * @access private + * @param string string The value of the Plural-Forms: header. + * @throws Exception If the string could not be parsed. + * @return array The number of plural forms and parsed expression tree. + */ + private function parse($string) { + $regex = "/^\s*nplurals\s*=\s*(\d+)\s*;\s*plural\s*=([^;]+);/i"; + if (preg_match($regex, $string, $matches)) { + $total = (int)$matches[1]; + $expression_string = $matches[2]; + } else { + throw new Exception('Invalid header value'); + } + + $parser = new PluralsParser($expression_string); + $expression = $parser->parse(); + return [$total, $expression]; + } +} -- cgit v1.2.3