summaryrefslogtreecommitdiff
path: root/src/HTML5/Data.php
blob: 86b4b0f5e4c3c05442e41bc4082a892663757d1e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
<?php

namespace HTML5;

// warning: this file is encoded in UTF-8!


/**
 * Character data.
 *
 * Static helpers for resolving HTML character references: a remapping
 * table for numeric references, lazy access to the serialized table of
 * named references, and a codepoint-to-UTF-8 encoder.
 */
class Data {

  /**
   * Numeric character reference remapping table.
   *
   * Keys are the codepoints as written in the document, values are the
   * Unicode codepoints that should be used in their place.
   *
   * At some point this should be moved to a .ser file. Another
   * possible optimization is to give UTF-8 bytes, not Unicode
   * codepoints.
   *
   * XXX: Not quite sure why it's named this; this is
   * actually the numeric entity dereference table.
   */
  protected static $realCodepointTable = array(
    0x00 => 0xFFFD, // REPLACEMENT CHARACTER
    0x0D => 0x000A, // LINE FEED (LF)
    0x80 => 0x20AC, // EURO SIGN ('€')
    0x81 => 0x0081, // <control>
    0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
    0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
    0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
    0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
    0x86 => 0x2020, // DAGGER ('†')
    0x87 => 0x2021, // DOUBLE DAGGER ('‡')
    0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
    0x89 => 0x2030, // PER MILLE SIGN ('‰')
    0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
    0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
    0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
    0x8D => 0x008D, // <control>
    0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
    0x8F => 0x008F, // <control>
    0x90 => 0x0090, // <control>
    0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
    0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
    0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
    0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
    0x95 => 0x2022, // BULLET ('•')
    0x96 => 0x2013, // EN DASH ('–')
    0x97 => 0x2014, // EM DASH ('—')
    0x98 => 0x02DC, // SMALL TILDE ('˜')
    0x99 => 0x2122, // TRADE MARK SIGN ('™')
    0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
    0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
    0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
    0x9D => 0x009D, // <control>
    0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
    0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
  );

  /**
   * Cache for the unserialized named character reference table; filled
   * on the first call to getNamedCharacterReferences().
   */
  protected static $namedCharacterReferences;

  /**
   * Not read or written anywhere in this class; presumably reserved for
   * the length of the longest named reference (verify against callers
   * or subclasses before relying on it).
   */
  protected static $namedCharacterReferenceMaxLength;

  /**
   * Returns the "real" Unicode codepoint of a malformed character
   * reference.
   *
   * @param int $ref Codepoint taken from a numeric character reference.
   * @return int|false The replacement codepoint, or false when $ref has
   *     no entry in the remapping table.
   */
  public static function getRealCodepoint($ref) {
    return isset(self::$realCodepointTable[$ref])
      ? self::$realCodepointTable[$ref]
      : false;
  }

  /**
   * Returns the named character reference table, loading it from
   * named-character-references.ser (next to this file) on first use.
   *
   * @return mixed Whatever unserialize() yields for that file; false if
   *     the file cannot be read or decoded.
   */
  public static function getNamedCharacterReferences() {
    // Danger Will Robinson: This will prevent the opcode cache from
    // caching the entity references.
    // The load is retried on every call for as long as the cached value
    // is falsy (i.e. after a failed read).
    if (!self::$namedCharacterReferences) {
      // unserialize() is only acceptable here because the path is fixed
      // and points at a file shipped alongside this class; never feed it
      // externally supplied data.
      self::$namedCharacterReferences = unserialize(
        file_get_contents(__DIR__ . '/named-character-references.ser'));
    }
    return self::$namedCharacterReferences;
  }

  /**
   * Converts a Unicode codepoint to sequence of UTF-8 bytes.
   *
   * Values that are not Unicode scalar values (negative numbers,
   * surrogates U+D800..U+DFFF, anything above U+10FFFF) cannot be
   * represented in well-formed UTF-8; for those the encoding of
   * U+FFFD REPLACEMENT CHARACTER is returned instead of emitting
   * overlong or otherwise malformed byte sequences.
   *
   * @note Shamelessly stolen from HTML Purifier, which is also
   *     shamelessly stolen from Feyd (which is in public domain).
   *
   * @param int $code Unicode codepoint.
   * @return string One to four bytes of UTF-8.
   */
  public static function utf8chr($code) {
    if ($code < 0x0 || $code > 0x10FFFF ||
        ($code >= 0xD800 && $code <= 0xDFFF)) {
      // outside the range of Unicode scalar values
      return "\xEF\xBF\xBD";
    }

    if ($code < 0x80) {
      // regular ASCII character
      return chr($code);
    }

    // every multi-byte sequence ends with the low six bits
    $tail = chr(($code & 0x3F) | 0x80);

    if ($code < 0x800) {
      // two bytes: 110xxxxx 10xxxxxx
      return chr(($code >> 6) | 0xC0) . $tail;
    }

    $mid = chr((($code >> 6) & 0x3F) | 0x80);

    if ($code < 0x10000) {
      // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
      return chr(($code >> 12) | 0xE0) . $mid . $tail;
    }

    // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return chr(($code >> 18) | 0xF0)
      . chr((($code >> 12) & 0x3F) | 0x80)
      . $mid
      . $tail;
  }

}