summaryrefslogtreecommitdiff
path: root/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php
blob: 1d358c7b6bea2c36896c64ad99f2872b69bb0e9c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
<?php

/**
 * Proof-of-concept lexer that uses the PEAR package XML_HTMLSax3 to parse HTML.
 *
 * PEAR, not surprisingly, also has a SAX parser for HTML.  I don't know
 * very much about its implementation, but it's fairly well written.  However, that
 * abstraction comes at a price: performance. You need to have it installed,
 * and if the API changes, it might break our adapter. Not sure whether or not
 * it's UTF-8 aware, but it has some entity parsing trouble (in all areas,
 * text and attributes).
 *
 * Quite personally, I don't recommend using the PEAR class, and the defaults
 * don't use it. The unit tests do perform the tests on the SAX parser too, but
 * whatever it does for poorly formed HTML is up to it.
 *
 * @todo Generalize so that XML_HTMLSax is also supported.
 *
 * @warning Entity-resolution inside attributes is broken.
 */

class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
{

    /**
     * Internal accumulator array for SAX parsers.
     */
    protected $tokens = array();

    /**
     * True while the most recently emitted token is an
     * HTMLPurifier_Token_Empty; used by closeHandler() to drop the extra
     * close event that follows a self-closed tag.
     */
    protected $last_token_was_empty;

    /**
     * Error handler that was active before tokenizeHTML() installed
     * muteStrictErrorHandler(); null when there was none.
     */
    private $parent_handler;

    /**
     * Names of the currently open (non-empty) elements, innermost last.
     */
    private $stack = array();

    /**
     * Tokenizes an HTML string by feeding it through XML_HTMLSax3 and
     * collecting the tokens produced by the handler methods of this class.
     *
     * @param string $string HTML to tokenize
     * @param mixed $config Passed through to normalize()
     * @param mixed $context Passed through to normalize()
     * @return array List of HTMLPurifier_Token_* objects
     */
    public function tokenizeHTML($string, $config, $context) {

        // reset all per-document state, so nothing leaks from a previous call
        $this->tokens = array();
        $this->stack  = array();
        $this->last_token_was_empty = false;

        $string = $this->normalize($string, $config, $context);

        $this->parent_handler = set_error_handler(array($this, 'muteStrictErrorHandler'));

        try {
            $parser = new XML_HTMLSax3();
            $parser->set_object($this);
            $parser->set_element_handler('openHandler','closeHandler');
            $parser->set_data_handler('dataHandler');
            $parser->set_escape_handler('escapeHandler');

            // doesn't seem to work correctly for attributes
            $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);

            $parser->parse($string);
        } catch (Exception $e) {
            // do not leave our temporary error handler installed on failure
            restore_error_handler();
            throw $e;
        }

        restore_error_handler();

        return $this->tokens;

    }

    /**
     * Open tag event handler, interface is defined by PEAR package.
     *
     * @param object $parser Parser that fired the event (unused)
     * @param string $name Element name
     * @param array $attrs Attribute name => raw attribute value
     * @param bool $closed Whether the tag was self-closed
     * @return bool Always true
     */
    public function openHandler(&$parser, $name, $attrs, $closed) {
        // entities are not resolved in attrs
        foreach ($attrs as $key => $attr) {
            $attrs[$key] = $this->parseData($attr);
        }
        if ($closed) {
            // A self-closed element never becomes an open ancestor, so it is
            // not pushed on the stack: its extra close event is discarded in
            // closeHandler() without a matching pop.
            $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
            $this->last_token_was_empty = true;
        } else {
            $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
            $this->last_token_was_empty = false;
            $this->stack[] = $name;
        }
        return true;
    }

    /**
     * Close tag event handler, interface is defined by PEAR package.
     *
     * @param object $parser Parser that fired the event (unused)
     * @param string $name Element name
     * @return bool Always true
     */
    public function closeHandler(&$parser, $name) {
        // HTMLSax3 seems to always send empty tags an extra close tag
        // check and ignore if you see it:
        // [TESTME] to make sure it doesn't overreach
        if ($this->last_token_was_empty) {
            $this->last_token_was_empty = false;
            return true;
        }
        $this->tokens[] = new HTMLPurifier_Token_End($name);
        if (!empty($this->stack)) {
            array_pop($this->stack);
        }
        return true;
    }

    /**
     * Data event handler, interface is defined by PEAR package.
     *
     * @param object $parser Parser that fired the event (unused)
     * @param string $data Character data
     * @return bool Always true
     */
    public function dataHandler(&$parser, $data) {
        $this->last_token_was_empty = false;
        $this->tokens[] = new HTMLPurifier_Token_Text($data);
        return true;
    }

    /**
     * Escaped text handler, interface is defined by PEAR package.
     *
     * Only escapes starting with a double-dash (comments) produce a token;
     * anything else is ignored here.
     *
     * @param object $parser Parser that fired the event (unused)
     * @param string $data Contents of the escape
     * @return bool Always true
     */
    public function escapeHandler(&$parser, $data) {
        if (strpos($data, '--') === 0) {
            // remove trailing and leading double-dashes
            $data = substr($data, 2);
            if (strlen($data) >= 2 && substr($data, -2) === '--') {
                $data = substr($data, 0, -2);
            }
            $depth = count($this->stack);
            if ($depth > 0 && $this->stack[$depth - 1] === 'style') {
                // a comment directly inside <style> is kept as text
                $this->tokens[] = new HTMLPurifier_Token_Text($data);
            } else {
                $this->tokens[] = new HTMLPurifier_Token_Comment($data);
            }
            $this->last_token_was_empty = false;
        }
        // CDATA is handled elsewhere, but if it was handled here:
        //if (strpos($data, '[CDATA[') === 0) {
        //    $this->tokens[] = new HTMLPurifier_Token_Text(
        //        substr($data, 7, strlen($data) - 9) );
        //}
        return true;
    }

    /**
     * An error handler that mutes strict errors
     *
     * Everything else is forwarded to the handler that was active before
     * tokenizeHTML() was called; if there was none, false is returned so
     * that PHP's standard error handling takes over.
     *
     * @param int $errno Error level
     * @param string $errstr Error message
     * @param string|null $errfile File the error was raised in
     * @param int|null $errline Line the error was raised on
     * @param array|null $errcontext Symbol table at the point of the error
     * @return mixed Null for muted errors, false when there is no previous
     *               handler, otherwise the previous handler's return value
     */
    public function muteStrictErrorHandler($errno, $errstr, $errfile=null, $errline=null, $errcontext=null) {
        if ($errno === E_STRICT) {
            return;
        }
        if ($this->parent_handler === null) {
            // set_error_handler() reported no previous handler to delegate to
            return false;
        }
        return call_user_func($this->parent_handler, $errno, $errstr, $errfile, $errline, $errcontext);
    }

}

// vim: et sw=4 sts=4