- gettext: merge patch from Sunil Mohan Adapa which rewrites plural parser to not use eval()

- fix typo in aforementioned patch which caused plurals to never load
- update code again to newer PHP constructor syntax
author: Andrew Dolgov <[email protected]> 2020-09-18 14:05:34 +0300
committer: Andrew Dolgov <[email protected]> 2020-09-18 14:05:34 +0300
commit: 3588d5186ef7321fa573adbb62f42b05d7a138be (patch)
tree: 14d49460bfebe2d370a1e874915f3670efba25f5 /lib
parent: 4f5ae94b62b0e949adda5a5e2672cc79a193c02d (diff)
3 files changed, 484 insertions, 59 deletions
diff --git a/lib/gettext/gettext.inc.php b/lib/gettext/gettext.inc.php
index c9f7dc016..ed5be6bbd 100644
--- a/lib/gettext/gettext.inc.php
+++ b/lib/gettext/gettext.inc.php
@@ -69,10 +69,10 @@ function get_list_of_locales($locale) {
    * sr_CS.UTF-8@latin, sr_CS@latin, sr@latin, sr_CS.UTF-8, sr_CS, sr.
    */
   $locale_names = array();
-  $lang = NULL;
-  $country = NULL;
-  $charset = NULL;
-  $modifier = NULL;
+  $lang = null;
+  $country = null;
+  $charset = null;
+  $modifier = null;
   if ($locale) {
     if (preg_match("/^(?P<lang>[a-z]{2,3})"              // language code
                    ."(?:_(?P<country>[A-Z]{2}))?"           // country code
diff --git a/lib/gettext/gettext.php b/lib/gettext/gettext.php
index edbd93304..173d4c448 100755
--- a/lib/gettext/gettext.php
+++ b/lib/gettext/gettext.php
@@ -21,6 +21,8 @@
 
 */
 
+require('plurals.php');
+
 /**
  * Provides a simple gettext replacement that works independently from
  * the system's gettext abilities.
@@ -39,16 +41,16 @@ class gettext_reader {
 
    //private:
   var $BYTEORDER = 0;        // 0: low endian, 1: big endian
-  var $STREAM = NULL;
+  var $STREAM = null;
   var $short_circuit = false;
   var $enable_cache = false;
-  var $originals = NULL;      // offset of original table
-  var $translations = NULL;    // offset of translation table
-  var $pluralheader = NULL;    // cache header field for plural forms
+  var $originals = null;      // offset of original table
+  var $translations = null;    // offset of translation table
+  var $pluralheader = null;    // cache header field for plural forms
   var $total = 0;          // total string count
-  var $table_originals = NULL;  // table for original strings (offsets)
-  var $table_translations = NULL;  // table for translated strings (offsets)
-  var $cache_translations = NULL;  // original -> translation mapping
+  var $table_originals = null;  // table for original strings (offsets)
+  var $table_translations = null;  // table for translated strings (offsets)
+  var $cache_translations = null;  // original -> translation mapping
 
 
   /* Methods */
@@ -270,41 +272,6 @@ class gettext_reader {
   }
 
   /**
-   * Sanitize plural form expression for use in PHP eval call.
-   *
-   * @access private
-   * @return string sanitized plural form expression
-   */
-  function sanitize_plural_expression($expr) {
-    // Get rid of disallowed characters.
-    $expr = preg_replace('@[^a-zA-Z0-9_:;\(\)\?\|\&=!<>+*/\%-]@', '', $expr);
-
-    // Add parenthesis for tertiary '?' operator.
-    $expr .= ';';
-    $res = '';
-    $p = 0;
-    for ($i = 0; $i < strlen($expr); $i++) {
-      $ch = $expr[$i];
-      switch ($ch) {
-      case '?':
-        $res .= ' ? (';
-        $p++;
-        break;
-      case ':':
-        $res .= ') : (';
-        break;
-      case ';':
-        $res .= str_repeat( ')', $p) . ';';
-        $p = 0;
-        break;
-      default:
-        $res .= $ch;
-      }
-    }
-    return $res;
-  }
-
-  /**
    * Parse full PO header and extract only plural forms line.
    *
    * @access private
@@ -327,17 +294,17 @@ class gettext_reader {
   function get_plural_forms() {
     // lets assume message number 0 is header
     // this is true, right?
-    $this->load_tables();
+		$this->load_tables();
 
     // cache header field for plural forms
-    if (! is_string($this->pluralheader)) {
+    if ($this->pluralheader === null) {
       if ($this->enable_cache) {
         $header = $this->cache_translations[""];
       } else {
         $header = $this->get_translation_string(0);
       }
       $expr = $this->extract_plural_forms_header_from_po_header($header);
-      $this->pluralheader = $this->sanitize_plural_expression($expr);
+      $this->pluralheader = new PluralHeader($expr);
     }
     return $this->pluralheader;
   }
@@ -353,17 +320,14 @@ class gettext_reader {
     if (!is_int($n)) {
       throw new InvalidArgumentException(
         "Select_string only accepts integers: " . $n);
-    }
-    $string = $this->get_plural_forms();
-    $string = str_replace('nplurals',"\$total",$string);
-    $string = str_replace("n",$n,$string);
-    $string = str_replace('plural',"\$plural",$string);
+		}
+
+		$plural_header = $this->get_plural_forms();
+    $plural = $plural_header->expression->evaluate($n);
 
-    $total = 0;
-    $plural = 0;
+    if ($plural < 0) $plural = 0;
+    if ($plural >= $plural_header->total) $plural = $plural_header->total - 1;
 
-    eval("$string");
-    if ($plural >= $total) $plural = $total - 1;
     return $plural;
   }
 
@@ -387,7 +351,7 @@ class gettext_reader {
     // find out the appropriate form
     $select = $this->select_string($number);
 
-    // this should contains all strings separated by NULLs
+    // this should contain all strings separated by NUL bytes
     $key = $single . chr(0) . $plural;
 
 
diff --git a/lib/gettext/plurals.php b/lib/gettext/plurals.php
new file mode 100644
index 000000000..dbf912c37
--- /dev/null
+++ b/lib/gettext/plurals.php
@@ -0,0 +1,461 @@
+<?php
+/*
+   Copyright (c) 2020 Sunil Mohan Adapa <sunil at medhas dot org>
+
+   Drop in replacement for native gettext.
+
+   This file is part of PHP-gettext.
+
+   PHP-gettext is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   PHP-gettext is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with PHP-gettext; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+*/
+
+/**
+ * Lexical analyzer for gettext plural expressions. Takes a string to parse
+ * during construction and returns a single token every time peek() or
+ * fetch_token() is called. The special string '__END__' is returned if there
+ * are no more tokens to be read. Only ' ' is skipped between tokens.
+ */
+class PluralsLexer {
+  private $string;
+  private $position;
+
+  /**
+   * Constructor
+   *
+   * @param string $string The gettext plural expression to
+   * analyze.
+   */
+  public function __construct(string $string) {
+    $this->string = $string;
+    $this->position = 0;
+  }
+
+  /**
+   * Return the next token and its length. The read position is moved past
+   * leading spaces only, never past the token itself. Operator and variable
+   * tokens are strings; whole-number tokens are ints. If there are no more
+   * tokens to provide, the special value ['__END__', 0] is returned. If
+   * there is unexpected input, an Exception is thrown.
+   *
+   * @access private
+   * @throws Exception If there is unexpected input in the provided string.
+   * @return array The next token and length to advance the current position.
+   */
+  private function _tokenize() {
+    $buf = $this->string;
+
+    // Consume all spaces until the next token
+    $index = $this->position;
+    while ($index < strlen($buf) && $buf[$index] == ' ') {
+      $index++;
+    }
+    $this->position = $index;
+
+    // Return special token if the end of the string is reached.
+    if (strlen($buf) - $index == 0) {
+      return ['__END__', 0];
+    }
+
+    // Operators with two characters (checked before single-character ones)
+    $doubles = ['==', '!=', '>=', '<=', '&&', '||'];
+    $next = substr($buf, $index, 2);
+    if (in_array($next, $doubles)) {
+      return [$next, 2];
+    }
+
+    // Operators with single character or variable 'n'.
+    $singles = [
+      'n', '(', ')', '?', ':', '+', '-', '*', '/', '%', '!', '>', '<'];
+    if (in_array($buf[$index], $singles)) {
+      return [$buf[$index], 1];
+    }
+
+    // Whole number constants, return an integer.
+    $digits = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];
+    $pos = $index;
+    while ($pos < strlen($buf) && in_array($buf[$pos], $digits)) {
+      $pos++;
+    }
+    if ($pos != $index) {
+      $length = $pos - $index;
+      return [(int)substr($buf, $index, $length), $length];
+    }
+
+    // Throw an exception for all other unexpected input in the string.
+    throw new Exception('Lexical analysis failed');
+  }
+
+  /**
+   * Return the next token without advancing the read position past it.
+   * Tokens for operators and variables are simple strings containing the
+   * operator or variable; whole-number tokens are ints. If there are no more
+   * tokens to provide, the special value '__END__' is returned. If there is
+   * unexpected input, an Exception is thrown.
+   *
+   * @throws Exception If there is unexpected input in the provided string.
+   * @return string|int The next token.
+   */
+  public function peek() {
+    list($token, $length) = $this->_tokenize();
+    return $token;
+  }
+
+  /**
+   * Return the next token after advancing the read position. Tokens for
+   * operators and variables are simple strings containing the operator or
+   * variable; whole-number tokens are ints. If there are no more tokens to
+   * provide, the special value '__END__' is returned. If there is unexpected
+   * input, an Exception is thrown.
+   *
+   * @throws Exception If there is unexpected input in the provided string.
+   * @return string|int The next token.
+   */
+  public function fetch_token() {
+    list($token, $length) = $this->_tokenize();
+    $this->position += $length;
+    return $token;
+  }
+}
+
+/**
+ * A parsed representation of the gettext plural expression. This is a tree
+ * containing further expressions depending on how nested the given input is.
+ * Calling the evaluate() function computes the value of the expression if the
+ * variable 'n' is set to a certain value. This is used to decide which plural
+ * string translation to use based on the number of items at hand.
+ */
+class PluralsExpression {
+  private $operator;
+  private $operands;
+
+  const BINARY_OPERATORS = [
+    '==', '!=', '>=', '<=', '&&', '||', '+', '-', '*', '/', '%', '>', '<'];
+  const UNARY_OPERATORS = ['!'];
+
+  /**
+   * Constructor
+   *
+   * @param string Operator for the expression.
+   * @param (int|string|PluralsExpression)[] Variable number of operands of the
+   * expression. One int operand is expected in case the operator is 'const'.
+   * One string operand with value 'n' is expected in case the operator is
+   * 'var'. For all other operators, the operands must be objects of type
+   * PluralsExpression. Unary operators expect one operand, binary operators
+   * expect two operands and the ternary '?' operator expects three operands.
+   */
+  public function __construct($operator, ...$operands) {
+    $this->operator = $operator;
+    $this->operands = $operands;
+  }
+
+  /**
+   * Return a parenthesized string representation of the expression for
+   * debugging purposes. 'const' and 'var' nodes return their operand as is.
+   *
+   * @return string A string representation of the expression.
+   */
+  public function to_string() {
+    if ($this->operator == 'const' || $this->operator == 'var') {
+      return $this->operands[0];
+    } elseif (in_array($this->operator, self::BINARY_OPERATORS)) {
+      return sprintf(
+        "(%s %s %s)", $this->operands[0]->to_string(), $this->operator,
+        $this->operands[1]->to_string());
+    } elseif (in_array($this->operator, self::UNARY_OPERATORS)) {
+      return sprintf(
+        "(%s %s)", $this->operator, $this->operands[0]->to_string());
+    } elseif ($this->operator == '?') {
+      return sprintf(
+        "(%s ? %s : %s)", $this->operands[0]->to_string(),
+        $this->operands[1]->to_string(),
+        $this->operands[2]->to_string());
+    }
+  }
+
+  /**
+   * Return the computed value of the expression for a given value of 'n'.
+   * Operands are evaluated eagerly, including both branches of '?'.
+   *
+   * @param int The value of the variable n to use when evaluating.
+   * @throws Exception If the operator of this node is not a known one.
+   * @return int|bool Comparison, '!', '&&' and '||' nodes yield PHP booleans.
+   */
+  public function evaluate($n) {
+    if (!in_array($this->operator, ['const', 'var'])) {
+      $operand1 = $this->operands[0]->evaluate($n);
+    }
+    if (in_array($this->operator, self::BINARY_OPERATORS) ||
+        $this->operator == '?') {
+      $operand2 = $this->operands[1]->evaluate($n);
+    }
+    if ($this->operator == '?') {
+      $operand3 = $this->operands[2]->evaluate($n);
+    }
+
+    switch ($this->operator) {
+      case 'const':
+        return $this->operands[0];
+      case 'var':
+        return $n;
+      case '!':
+        return !($operand1);
+      case '==':
+        return $operand1 == $operand2;
+      case '!=':
+        return $operand1 != $operand2;
+      case '>=':
+        return $operand1 >= $operand2;
+      case '<=':
+        return $operand1 <= $operand2;
+      case '>':
+        return $operand1 > $operand2;
+      case '<':
+        return $operand1 < $operand2;
+      case '&&':
+        return $operand1 && $operand2;
+      case '||':
+        return $operand1 || $operand2;
+      case '+':
+        return $operand1 + $operand2;
+      case '-':
+        return $operand1 - $operand2;
+      case '*':
+        return $operand1 * $operand2;
+      case '/':
+        return (int)($operand1 / $operand2);
+      case '%':
+        return $operand1 % $operand2;
+      case '?':
+        return $operand1 ? $operand2 : $operand3;
+      default:
+        throw new Exception('Invalid expression');
+    }
+  }
+}
+
+/**
+ * A simple operator-precedence parser for gettext plural expressions. Takes a
+ * string during construction and returns a PluralsExpression tree when
+ * parse() is called.
+ */
+class PluralsParser {
+  private $lexer;
+
+  /*
+   * Operator precedence. The parsing only happens with minimum precedence of
+   * 0. However, ':', ')' and '__END__' are listed (as -1) so that parsing
+   * does not proceed beyond them when they are not to be parsed.
+   */
+  private const PREC = [
+    ':' => -1, '?' => 0, '||' => 1, '&&' => 2, '==' => 3, '!=' => 3,
+    '>' => 4, '<' => 4, '>=' => 4, '<=' => 4, '+' => 5, '-' => 5, '*' => 6,
+    '/' => 6, '%' => 6, '!' => 7, '__END__' => -1, ')' => -1
+  ];
+
+  // List of right associative operators
+  private const RIGHT_ASSOC = ['?'];
+
+  /**
+   * Constructor
+   *
+   * @param string $string The plural expression to be parsed.
+   */
+  public function __construct(string $string) {
+    $this->lexer = new PluralsLexer($string);
+  }
+
+  /**
+   * Expect a primary next for parsing and return a PluralsExpression or throw
+   * an exception otherwise. A primary can be the variable 'n', a whole
+   * number constant, a unary operator expression starting with '!', or a
+   * parenthesized expression.
+   *
+   * @throws Exception If the next token is not a primary or if a parenthesized
+   * expression is not closed properly with ')'.
+   * @return PluralsExpression That is constructed from the parsed primary.
+   */
+  private function _parse_primary() {
+    $token = $this->lexer->fetch_token();
+    if ($token === 'n') {
+      return new PluralsExpression('var', 'n');
+    } elseif (is_int($token)) {
+      return new PluralsExpression('const', (int)$token);
+    } elseif ($token === '!') {
+      return new PluralsExpression('!', $this->_parse_primary());
+    } elseif ($token === '(') {
+      $result = $this->_parse($this->_parse_primary(), 0);
+      if ($this->lexer->fetch_token() != ')') {
+        throw new Exception('Mismatched parenthesis');
+      }
+      return $result;
+    }
+
+    throw new Exception('Primary expected');
+  }
+
+  /**
+   * Fetch a token from the lexical analyzer and require it to be a key of
+   * PREC (an operator, ':', ')' or '__END__'). Optionally advance the position
+   * of the lexical analyzer past that token.
+   *
+   * @access private
+   * @param bool $peek A flag to indicate whether the position of the lexical
+   * analyzer should *not* be advanced. If false, the lexical analyzer is
+   * advanced by one token.
+   * @throws Exception If the token read is not a key of PREC.
+   * @return string The token that has been fetched from the lexical
+   * analyzer.
+   */
+  private function _parse_operator($peek) {
+    if ($peek) {
+      $token = $this->lexer->peek();
+    } else {
+        $token = $this->lexer->fetch_token();
+    }
+
+    if ($token !== null && !array_key_exists($token, self::PREC)) {
+      throw new Exception('Operator expected');
+    }
+    return $token;
+  }
+
+  /**
+   * A parsing method suitable for recursion.
+   *
+   * @access private
+   * @param PluralsExpression $left_side A pre-parsed left-hand side expression
+   * of the final expression to be constructed. This helps with recursion.
+   * @param int $min_precedence The minimum value of precedence for the
+   * operators to be considered for parsing. Parsing will stop and the current
+   * expression is returned if an operator of a lower precedence is
+   * encountered.
+   * @throws Exception If the input string does not conform to the grammar of
+   * the gettext plural expression.
+   * @return PluralsExpression A complete expression after parsing.
+   */
+  private function _parse($left_side, $min_precedence) {
+    $next_token = $this->_parse_operator(true);
+
+    while (self::PREC[$next_token] >= $min_precedence) {
+      $operator = $this->_parse_operator(false);
+      $right_side = $this->_parse_primary();
+
+      $next_token = $this->_parse_operator(true);
+
+      /*
+       * Consume (recursively) into right hand side all expressions of higher
+       * precedence.
+       */
+      while ((self::PREC[$operator] < self::PREC[$next_token]) ||
+             ((self::PREC[$operator] == self::PREC[$next_token]) &&
+              in_array($operator, self::RIGHT_ASSOC))) {
+        $right_side = $this->_parse(
+            $right_side, self::PREC[$next_token]);
+        $next_token = $this->_parse_operator(true);
+      }
+
+      if ($operator != '?') {
+        /*
+         * Handling for all binary operators. Consume into left hand side all
+         * expressions of equal precedence.
+         */
+        $left_side = new PluralsExpression($operator, $left_side, $right_side);
+      } else {
+        // Special handling for (a ? b : c) expression
+        $operator = $this->lexer->fetch_token();
+        if ($operator != ':') {
+          throw new Exception('Invalid ? expression');
+        }
+
+        $right_side2 = $this->_parse(
+          $this->_parse_primary(), self::PREC[$operator] + 1);
+        $next_token = $this->_parse_operator(true);
+        $left_side = new PluralsExpression(
+            '?', $left_side, $right_side, $right_side2);
+      }
+    }
+    return $left_side;
+  }
+
+ /**
+   * A simple implementation of an operator-precedence parser. See:
+   * https://en.wikipedia.org/wiki/Operator-precedence_parser for an analysis
+   * of the algorithm.
+   *
+   * @throws Exception If the input string does not conform to the grammar of
+   * the gettext plural expression.
+   * @return PluralsExpression A complete expression after parsing.
+   */
+  public function parse() {
+    $expression = $this->_parse($this->_parse_primary(), 0);
+    // Reject leftover input, e.g. an extra ')' at the end.
+    if ($this->lexer->peek() != '__END__') {
+      throw new Exception('Could not parse completely');
+    }
+    return $expression;
+  }
+}
+
+/**
+ * Provides a class to parse the value of the 'Plural-Forms:' header in the
+ * gettext translation files. Holds the expression tree and the number of
+ * plurals after parsing. Parsing happens during construction which takes as
+ * its only argument the string to parse. Exceptions during parsing are silently
+ * suppressed and the fallback behavior is used with the value for Germanic
+ * languages as follows: "nplurals=2; plural=n == 1 ? 0 : 1;".
+ */
+class PluralHeader {
+  public $total;
+  public $expression;
+
+  /**
+   * Constructor
+   *
+   * @param string $string The value of the Plural-Forms: header as seen in .po files.
+   */
+  function __construct($string) {
+    try {
+      list($total, $expression) = $this->parse($string);
+    } catch (Exception $e) {
+      $string = "nplurals=2; plural=n == 1 ? 0 : 1;";
+      list($total, $expression) = $this->parse($string);
+    }
+    $this->total = $total;
+    $this->expression = $expression;
+  }
+
+  /**
+   * Return the number of plural forms and the parsed expression tree.
+   *
+   * @access private
+   * @param string $string The value of the Plural-Forms: header.
+   * @throws Exception If the string could not be parsed.
+   * @return array The number of plural forms and parsed expression tree.
+   */
+  private function parse($string) {
+    $regex = "/^\s*nplurals\s*=\s*(\d+)\s*;\s*plural\s*=([^;]+);/i";
+    if (preg_match($regex, $string, $matches)) {
+      $total = (int)$matches[1];
+      $expression_string = $matches[2];
+    } else {
+      throw new Exception('Invalid header value');
+    }
+
+    $parser = new PluralsParser($expression_string);
+    $expression = $parser->parse();
+    return [$total, $expression];
+  }
+}
author	Andrew Dolgov <[email protected]>	2020-09-18 14:05:34 +0300
committer	Andrew Dolgov <[email protected]>	2020-09-18 14:05:34 +0300
commit	3588d5186ef7321fa573adbb62f42b05d7a138be (patch)
tree	14d49460bfebe2d370a1e874915f3670efba25f5 /lib
parent	4f5ae94b62b0e949adda5a5e2672cc79a193c02d (diff)