diff options
author | Titouan Galopin <[email protected]> | 2018-11-25 00:58:42 +0100 |
---|---|---|
committer | Titouan Galopin <[email protected]> | 2018-11-25 14:58:29 +0100 |
commit | d829a30e092ea5d868b93a258724b12f9aa313fd (patch) | |
tree | 1c8819b825738ce95d87d96e6e2cafeb2c7f14c8 /src/HTML5/Parser | |
parent | d39a98a478c95e0df86ed564650f7326244116e2 (diff) |
Fix coding style
Diffstat (limited to 'src/HTML5/Parser')
-rw-r--r-- | src/HTML5/Parser/CharacterReference.php | 32 | ||||
-rw-r--r-- | src/HTML5/Parser/DOMTreeBuilder.php | 178 | ||||
-rw-r--r-- | src/HTML5/Parser/EventHandler.php | 36 | ||||
-rw-r--r-- | src/HTML5/Parser/FileInputStream.php | 7 | ||||
-rw-r--r-- | src/HTML5/Parser/InputStream.php | 26 | ||||
-rw-r--r-- | src/HTML5/Parser/ParseError.php | 1 | ||||
-rw-r--r-- | src/HTML5/Parser/Scanner.php | 74 | ||||
-rw-r--r-- | src/HTML5/Parser/StringInputStream.php | 65 | ||||
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 332 | ||||
-rw-r--r-- | src/HTML5/Parser/TreeBuildingRules.php | 33 | ||||
-rw-r--r-- | src/HTML5/Parser/UTF8Utils.php | 23 |
11 files changed, 400 insertions, 407 deletions
diff --git a/src/HTML5/Parser/CharacterReference.php b/src/HTML5/Parser/CharacterReference.php index c1617e7..3ecfba4 100644 --- a/src/HTML5/Parser/CharacterReference.php +++ b/src/HTML5/Parser/CharacterReference.php @@ -1,4 +1,5 @@ <?php + namespace Masterminds\HTML5\Parser; use Masterminds\HTML5\Entities; @@ -6,25 +7,22 @@ use Masterminds\HTML5\Entities; /** * Manage entity references. * - * This is a simple resolver for HTML5 character reference entitites. - * See \Masterminds\HTML5\Entities for the list of supported entities. + * This is a simple resolver for HTML5 character reference entitites. See Entities for the list of supported entities. */ class CharacterReference { - protected static $numeric_mask = array( 0x0, 0x2FFFF, 0, - 0xFFFF + 0xFFFF, ); /** - * Given a name (e.g. - * 'amp'), lookup the UTF-8 character ('&') + * Given a name (e.g. 'amp'), lookup the UTF-8 character ('&'). + * + * @param string $name The name to look up * - * @param string $name - * The name to look up. * @return string The character sequence. In UTF-8 this may be more than one byte. */ public static function lookupName($name) @@ -34,20 +32,16 @@ class CharacterReference } /** - * Given a Unicode codepoint, return the UTF-8 character. - * - * (NOT USED ANYWHERE) - */ - /* - * public static function lookupCode($codePoint) { return 'POINT'; } - */ - - /** * Given a decimal number, return the UTF-8 character. + * + * @param $int + * + * @return false|string|string[]|null */ public static function lookupDecimal($int) { $entity = '&#' . $int . ';'; + // UNTESTED: This may fail on some planes. Couldn't find full documentation // on the value of the mask array. return mb_decode_numericentity($entity, static::$numeric_mask, 'utf-8'); @@ -55,6 +49,10 @@ class CharacterReference /** * Given a hexidecimal number, return the UTF-8 character. + * + * @param $hexdec + * + * @return false|string|string[]|null */ public static function lookupHex($hexdec) { diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index 7f0e16a..365bb75 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -1,7 +1,9 @@ <?php + namespace Masterminds\HTML5\Parser; use Masterminds\HTML5\Elements; +use Masterminds\HTML5\InstructionProcessor; /** * Create an HTML5 DOM tree from events. @@ -24,7 +26,7 @@ use Masterminds\HTML5\Elements; class DOMTreeBuilder implements EventHandler { /** - * Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0 + * Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0. */ const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml'; @@ -45,14 +47,14 @@ class DOMTreeBuilder implements EventHandler const OPT_IMPLICIT_NS = 'implicit_namespaces'; /** - * Holds the HTML5 element names that causes a namespace switch + * Holds the HTML5 element names that causes a namespace switch. * * @var array */ protected $nsRoots = array( 'html' => self::NAMESPACE_HTML, 'svg' => self::NAMESPACE_SVG, - 'math' => self::NAMESPACE_MATHML + 'math' => self::NAMESPACE_MATHML, ); /** @@ -63,7 +65,7 @@ class DOMTreeBuilder implements EventHandler protected $implicitNamespaces = array( 'xml' => self::NAMESPACE_XML, 'xmlns' => self::NAMESPACE_XMLNS, - 'xlink' => self::NAMESPACE_XLINK + 'xlink' => self::NAMESPACE_XLINK, ); /** @@ -146,15 +148,15 @@ class DOMTreeBuilder implements EventHandler protected $insertMode = 0; /** - * Track if we are in an element that allows only inline child nodes + * Track if we are in an element that allows only inline child nodes. + * * @var string|null */ protected $onlyInline; /** * Quirks mode is enabled by default. - * Any document that is missing the - * DT will be considered to be in quirks mode. + * Any document that is missing the DT will be considered to be in quirks mode. */ protected $quirks = true; @@ -175,24 +177,23 @@ class DOMTreeBuilder implements EventHandler // $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); $this->doc = $impl->createDocument(null, null, $dt); } + $this->errors = array(); $this->current = $this->doc; // ->documentElement; // Create a rules engine for tags. - $this->rules = new TreeBuildingRules($this->doc); + $this->rules = new TreeBuildingRules(); $implicitNS = array(); if (isset($this->options[self::OPT_IMPLICIT_NS])) { $implicitNS = $this->options[self::OPT_IMPLICIT_NS]; - } elseif (isset($this->options["implicitNamespaces"])) { - $implicitNS = $this->options["implicitNamespaces"]; + } elseif (isset($this->options['implicitNamespaces'])) { + $implicitNS = $this->options['implicitNamespaces']; } // Fill $nsStack with the defalut HTML5 namespaces, plus the "implicitNamespaces" array taken form $options - array_unshift($this->nsStack, $implicitNS + array( - '' => self::NAMESPACE_HTML - ) + $this->implicitNamespaces); + array_unshift($this->nsStack, $implicitNS + array('' => self::NAMESPACE_HTML) + $this->implicitNamespaces); if ($isFragment) { $this->insertMode = static::IM_IN_BODY; @@ -229,8 +230,10 @@ class DOMTreeBuilder implements EventHandler * * This is used for handling Processor Instructions as they are * inserted. If omitted, PI's are inserted directly into the DOM tree. + * + * @param InstructionProcessor $proc */ - public function setInstructionProcessor(\Masterminds\HTML5\InstructionProcessor $proc) + public function setInstructionProcessor(InstructionProcessor $proc) { $this->processor = $proc; } @@ -242,7 +245,7 @@ class DOMTreeBuilder implements EventHandler $this->quirks = $quirks; if ($this->insertMode > static::IM_INITIAL) { - $this->parseError("Illegal placement of DOCTYPE tag. Ignoring: " . $name); + $this->parseError('Illegal placement of DOCTYPE tag. Ignoring: ' . $name); return; } @@ -256,27 +259,32 @@ class DOMTreeBuilder implements EventHandler * @todo - XMLNS namespace handling (we need to parse, even if it's not valid) * - XLink, MathML and SVG namespace handling * - Omission rules: 8.1.2.4 Optional tags + * + * @param string $name + * @param array $attributes + * @param bool $selfClosing + * + * @return int */ public function startTag($name, $attributes = array(), $selfClosing = false) { - // fprintf(STDOUT, $name); $lname = $this->normalizeTagName($name); // Make sure we have an html element. - if (! $this->doc->documentElement && $name !== 'html' && ! $this->frag) { + if (!$this->doc->documentElement && 'html' !== $name && !$this->frag) { $this->startTag('html'); } // Set quirks mode if we're at IM_INITIAL with no doctype. - if ($this->insertMode == static::IM_INITIAL) { + if ($this->insertMode === static::IM_INITIAL) { $this->quirks = true; - $this->parseError("No DOCTYPE specified."); + $this->parseError('No DOCTYPE specified.'); } // SPECIAL TAG HANDLING: // Spec says do this, and "don't ask." // find the spec where this is defined... looks problematic - if ($name == 'image' && !($this->insertMode === static::IM_IN_SVG || $this->insertMode === static::IM_IN_MATHML)) { + if ('image' === $name && !($this->insertMode === static::IM_IN_SVG || $this->insertMode === static::IM_IN_MATHML)) { $name = 'img'; } @@ -292,7 +300,7 @@ class DOMTreeBuilder implements EventHandler break; case 'head': if ($this->insertMode > static::IM_BEFORE_HEAD) { - $this->parseError("Unexpected head tag outside of head context."); + $this->parseError('Unexpected head tag outside of head context.'); } else { $this->insertMode = static::IM_IN_HEAD; } @@ -307,14 +315,14 @@ class DOMTreeBuilder implements EventHandler $this->insertMode = static::IM_IN_MATHML; break; case 'noscript': - if ($this->insertMode == static::IM_IN_HEAD) { + if ($this->insertMode === static::IM_IN_HEAD) { $this->insertMode = static::IM_IN_HEAD_NOSCRIPT; } break; } // Special case handling for SVG. - if ($this->insertMode == static::IM_IN_SVG) { + if ($this->insertMode === static::IM_IN_SVG) { $lname = Elements::normalizeSvgElement($lname); } @@ -322,62 +330,58 @@ class DOMTreeBuilder implements EventHandler // when we found a tag thats appears inside $nsRoots, we have to switch the defalut namespace if (isset($this->nsRoots[$lname]) && $this->nsStack[0][''] !== $this->nsRoots[$lname]) { array_unshift($this->nsStack, array( - '' => $this->nsRoots[$lname] + '' => $this->nsRoots[$lname], ) + $this->nsStack[0]); - $pushes ++; + ++$pushes; } $needsWorkaround = false; - if (isset($this->options["xmlNamespaces"]) && $this->options["xmlNamespaces"]) { + if (isset($this->options['xmlNamespaces']) && $this->options['xmlNamespaces']) { // when xmlNamespaces is true a and we found a 'xmlns' or 'xmlns:*' attribute, we should add a new item to the $nsStack foreach ($attributes as $aName => $aVal) { - if ($aName === 'xmlns') { + if ('xmlns' === $aName) { $needsWorkaround = $aVal; array_unshift($this->nsStack, array( - '' => $aVal + '' => $aVal, ) + $this->nsStack[0]); - $pushes ++; - } elseif ((($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : '') === 'xmlns') { + ++$pushes; + } elseif ('xmlns' === (($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : '')) { array_unshift($this->nsStack, array( - substr($aName, $pos + 1) => $aVal + substr($aName, $pos + 1) => $aVal, ) + $this->nsStack[0]); - $pushes ++; + ++$pushes; } } } if ($this->onlyInline && Elements::isA($lname, Elements::BLOCK_TAG)) { - $this->autoclose($this->onlyInline); - $this->onlyInline = null; + $this->autoclose($this->onlyInline); + $this->onlyInline = null; } try { $prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : ''; - - if ($needsWorkaround!==false) { - - $xml = "<$lname xmlns=\"$needsWorkaround\" ".(strlen($prefix) && isset($this->nsStack[0][$prefix])?("xmlns:$prefix=\"".$this->nsStack[0][$prefix]."\""):"")."/>"; + if (false !== $needsWorkaround) { + $xml = "<$lname xmlns=\"$needsWorkaround\" " . (strlen($prefix) && isset($this->nsStack[0][$prefix]) ? ("xmlns:$prefix=\"" . $this->nsStack[0][$prefix] . '"') : '') . '/>'; $frag = new \DOMDocument('1.0', 'UTF-8'); $frag->loadXML($xml); $ele = $this->doc->importNode($frag->documentElement, true); - } else { - if (!isset($this->nsStack[0][$prefix]) || ($prefix === "" && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) { + if (!isset($this->nsStack[0][$prefix]) || ('' === $prefix && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) { $ele = $this->doc->createElement($lname); } else { $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname); } } - } catch (\DOMException $e) { $this->parseError("Illegal tag name: <$lname>. Replaced with <invalid>."); $ele = $this->doc->createElement('invalid'); } if (Elements::isA($lname, Elements::BLOCK_ONLY_INLINE)) { - $this->onlyInline = $lname; + $this->onlyInline = $lname; } // When we add some namespacess, we have to track them. Later, when "endElement" is invoked, we have to remove them. @@ -396,23 +400,23 @@ class DOMTreeBuilder implements EventHandler foreach ($attributes as $aName => $aVal) { // xmlns attributes can't be set - if ($aName === 'xmlns') { + if ('xmlns' === $aName) { continue; } - if ($this->insertMode == static::IM_IN_SVG) { + if ($this->insertMode === static::IM_IN_SVG) { $aName = Elements::normalizeSvgAttribute($aName); - } elseif ($this->insertMode == static::IM_IN_MATHML) { + } elseif ($this->insertMode === static::IM_IN_MATHML) { $aName = Elements::normalizeMathMlAttribute($aName); } try { $prefix = ($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : false; - if ($prefix==='xmlns') { - $ele->setAttributeNs(self::NAMESPACE_XMLNS, $aName, $aVal); - } elseif ($prefix!==false && isset($this->nsStack[0][$prefix])) { - $ele->setAttributeNs($this->nsStack[0][$prefix], $aName, $aVal); + if ('xmlns' === $prefix) { + $ele->setAttributeNS(self::NAMESPACE_XMLNS, $aName, $aVal); + } elseif (false !== $prefix && isset($this->nsStack[0][$prefix])) { + $ele->setAttributeNS($this->nsStack[0][$prefix], $aName, $aVal); } else { $ele->setAttribute($aName, $aVal); } @@ -422,19 +426,19 @@ class DOMTreeBuilder implements EventHandler } // This is necessary on a non-DTD schema, like HTML5. - if ($aName == 'id') { + if ('id' === $aName) { $ele->setIdAttribute('id', true); } } - // Some elements have special processing rules. Handle those separately. - if ($this->rules->hasRules($name) && $this->frag !== $this->current) { + if ($this->frag !== $this->current && $this->rules->hasRules($name)) { + // Some elements have special processing rules. Handle those separately. $this->current = $this->rules->evaluate($ele, $this->current); - } // Otherwise, it's a standard element. - else { + } else { + // Otherwise, it's a standard element. $this->current->appendChild($ele); - if (! Elements::isA($name, Elements::VOID_TAG)) { + if (!Elements::isA($name, Elements::VOID_TAG)) { $this->current = $ele; } @@ -448,7 +452,7 @@ class DOMTreeBuilder implements EventHandler // This is sort of a last-ditch attempt to correct for cases where no head/body // elements are provided. - if ($this->insertMode <= static::IM_BEFORE_HEAD && $name != 'head' && $name != 'html') { + if ($this->insertMode <= static::IM_BEFORE_HEAD && 'head' !== $name && 'html' !== $name) { $this->insertMode = static::IM_IN_BODY; } @@ -456,7 +460,7 @@ class DOMTreeBuilder implements EventHandler // but we have to remove the namespaces pushed to $nsStack. if ($pushes > 0 && Elements::isA($name, Elements::VOID_TAG)) { // remove the namespaced definded by current node - for ($i = 0; $i < $pushes; $i ++) { + for ($i = 0; $i < $pushes; ++$i) { array_shift($this->nsStack); } } @@ -485,7 +489,7 @@ class DOMTreeBuilder implements EventHandler 'html', 'br', 'head', - 'title' + 'title', ))) { $this->startTag('html'); $this->endTag($name); @@ -495,13 +499,13 @@ class DOMTreeBuilder implements EventHandler } // Ignore the tag. - $this->parseError("Illegal closing tag at global scope."); + $this->parseError('Illegal closing tag at global scope.'); return; } // Special case handling for SVG. - if ($this->insertMode == static::IM_IN_SVG) { + if ($this->insertMode === static::IM_IN_SVG) { $lname = Elements::normalizeSvgElement($lname); } @@ -512,39 +516,33 @@ class DOMTreeBuilder implements EventHandler $cid = spl_object_hash($this->current); } - // XXX: Not sure whether we need this anymore. - // if ($name != $lname) { - // return $this->quirksTreeResolver($lname); - // } - // XXX: HTML has no parent. What do we do, though, // if this element appears in the wrong place? - if ($lname == 'html') { + if ('html' === $lname) { return; } // remove the namespaced definded by current node if (isset($this->pushes[$cid])) { - for ($i = 0; $i < $this->pushes[$cid][0]; $i ++) { + for ($i = 0; $i < $this->pushes[$cid][0]; ++$i) { array_shift($this->nsStack); } unset($this->pushes[$cid]); } - if (! $this->autoclose($lname)) { + if (!$this->autoclose($lname)) { $this->parseError('Could not find closing tag for ' . $lname); } - // switch ($this->insertMode) { switch ($lname) { - case "head": + case 'head': $this->insertMode = static::IM_AFTER_HEAD; break; - case "body": + case 'body': $this->insertMode = static::IM_AFTER_BODY; break; - case "svg": - case "mathml": + case 'svg': + case 'mathml': $this->insertMode = static::IM_IN_BODY; break; } @@ -566,9 +564,9 @@ class DOMTreeBuilder implements EventHandler // practical as most documents contain these characters. Other text is not // expected here so recording a parse error is necessary. $dataTmp = trim($data, " \t\n\r\f"); - if (! empty($dataTmp)) { + if (!empty($dataTmp)) { // fprintf(STDOUT, "Unexpected insert mode: %d", $this->insertMode); - $this->parseError("Unexpected text. Ignoring: " . $dataTmp); + $this->parseError('Unexpected text. Ignoring: ' . $dataTmp); } return; @@ -585,7 +583,7 @@ class DOMTreeBuilder implements EventHandler public function parseError($msg, $line = 0, $col = 0) { - $this->errors[] = sprintf("Line %d, Col %d: %s", $line, $col, $msg); + $this->errors[] = sprintf('Line %d, Col %d: %s', $line, $col, $msg); } public function getErrors() @@ -602,15 +600,14 @@ class DOMTreeBuilder implements EventHandler public function processingInstruction($name, $data = null) { // XXX: Ignore initial XML declaration, per the spec. - if ($this->insertMode == static::IM_INITIAL && 'xml' == strtolower($name)) { + if ($this->insertMode === static::IM_INITIAL && 'xml' === strtolower($name)) { return; } - // Important: The processor may modify the current DOM tree however - // it sees fit. - if (isset($this->processor)) { + // Important: The processor may modify the current DOM tree however it sees fit. + if ($this->processor instanceof InstructionProcessor) { $res = $this->processor->process($this->current, $name, $data); - if (! empty($res)) { + if (!empty($res)) { $this->current = $res; } @@ -632,9 +629,9 @@ class DOMTreeBuilder implements EventHandler * * See sections 2.9 and 8.1.2. * - * @param string $name - * The tag name. - * @return string The normalized tag name. + * @param string $name The tag name + * + * @return string the normalized tag name */ protected function normalizeTagName($name) { @@ -646,7 +643,7 @@ class DOMTreeBuilder implements EventHandler protected function quirksTreeResolver($name) { - throw new \Exception("Not implemented."); + throw new \Exception('Not implemented.'); } /** @@ -660,15 +657,16 @@ class DOMTreeBuilder implements EventHandler { $working = $this->current; do { - if ($working->nodeType != XML_ELEMENT_NODE) { + if (XML_ELEMENT_NODE !== $working->nodeType) { return false; } - if ($working->tagName == $tagName) { + if ($working->tagName === $tagName) { $this->current = $working->parentNode; return true; } } while ($working = $working->parentNode); + return false; } @@ -685,8 +683,8 @@ class DOMTreeBuilder implements EventHandler protected function isAncestor($tagName) { $candidate = $this->current; - while ($candidate->nodeType === XML_ELEMENT_NODE) { - if ($candidate->tagName == $tagName) { + while (XML_ELEMENT_NODE === $candidate->nodeType) { + if ($candidate->tagName === $tagName) { return true; } $candidate = $candidate->parentNode; @@ -704,6 +702,6 @@ class DOMTreeBuilder implements EventHandler */ protected function isParent($tagName) { - return $this->current->tagName == $tagName; + return $this->current->tagName === $tagName; } } diff --git a/src/HTML5/Parser/EventHandler.php b/src/HTML5/Parser/EventHandler.php index 3da71a3..cb0109b 100644 --- a/src/HTML5/Parser/EventHandler.php +++ b/src/HTML5/Parser/EventHandler.php @@ -1,4 +1,5 @@ <?php + namespace Masterminds\HTML5\Parser; /** @@ -20,7 +21,6 @@ namespace Masterminds\HTML5\Parser; */ interface EventHandler { - const DOCTYPE_NONE = 0; const DOCTYPE_PUBLIC = 1; @@ -30,15 +30,11 @@ interface EventHandler /** * A doctype declaration. * - * @param string $name - * The name of the root element. - * @param int $idType - * One of DOCTYPE_NONE, DOCTYPE_PUBLIC, or DOCTYPE_SYSTEM. - * @param string $id - * The identifier. For DOCTYPE_PUBLIC, this is the public ID. If DOCTYPE_SYSTEM, - * then this is a system ID. - * @param boolean $quirks - * Indicates whether the builder should enter quirks mode. + * @param string $name The name of the root element + * @param int $idType One of DOCTYPE_NONE, DOCTYPE_PUBLIC, or DOCTYPE_SYSTEM + * @param string $id The identifier. For DOCTYPE_PUBLIC, this is the public ID. If DOCTYPE_SYSTEM, + * then this is a system ID. + * @param bool $quirks Indicates whether the builder should enter quirks mode */ public function doctype($name, $idType = 0, $id = null, $quirks = false); @@ -63,13 +59,11 @@ interface EventHandler * The textmode is automatically reset to Tokenizer::TEXTMODE_NORMAL when the * closing tag is encounter. **This behavior may change.** * - * @param string $name - * The tag name. - * @param array $attributes - * An array with all of the tag's attributes. - * @param boolean $selfClosing - * An indicator of whether or not this tag is self-closing (<foo/>) - * @return int One of the Tokenizer::TEXTMODE_* constants. + * @param string $name The tag name + * @param array $attributes An array with all of the tag's attributes + * @param bool $selfClosing An indicator of whether or not this tag is self-closing (<foo/>) + * + * @return int one of the Tokenizer::TEXTMODE_* constants */ public function startTag($name, $attributes = array(), $selfClosing = false); @@ -104,7 +98,7 @@ interface EventHandler * A CDATA section. * * @param string $data - * The unparsed character data. + * The unparsed character data */ public function cdata($data); @@ -113,10 +107,8 @@ interface EventHandler * * While user agents don't get PIs, server-side does. * - * @param string $name - * The name of the processor (e.g. 'php'). - * @param string $data - * The unparsed data. + * @param string $name The name of the processor (e.g. 'php'). + * @param string $data The unparsed data */ public function processingInstruction($name, $data = null); } diff --git a/src/HTML5/Parser/FileInputStream.php b/src/HTML5/Parser/FileInputStream.php index 76bd17b..f176422 100644 --- a/src/HTML5/Parser/FileInputStream.php +++ b/src/HTML5/Parser/FileInputStream.php @@ -1,4 +1,5 @@ <?php + namespace Masterminds\HTML5\Parser; /** @@ -18,9 +19,9 @@ class FileInputStream extends StringInputStream implements InputStream /** * Load a file input stream. * - * @param string $data The file or url path to load. - * @param string $encoding The encoding to use for the data. - * @param string $debug A fprintf format to use to echo the data on stdout. + * @param string $data the file or url path to load + * @param string $encoding the encoding to use for the data + * @param string $debug a fprintf format to use to echo the data on stdout */ public function __construct($data, $encoding = 'UTF-8', $debug = '') { diff --git a/src/HTML5/Parser/InputStream.php b/src/HTML5/Parser/InputStream.php index e4a106a..cf279d8 100644 --- a/src/HTML5/Parser/InputStream.php +++ b/src/HTML5/Parser/InputStream.php @@ -29,7 +29,7 @@ interface InputStream extends \Iterator * * @TODO Move this to the scanner. * - * @return int The column number. + * @return int the column number */ public function columnOffset(); @@ -49,12 +49,12 @@ interface InputStream extends \Iterator * and returns the matched substring. * * @see strcspn - * @param string $bytes - * Bytes to match. - * @param int $max - * Maximum number of bytes to scan. + * + * @param string $bytes Bytes to match + * @param int $max Maximum number of bytes to scan + * * @return mixed Index or false if no match is found. You should use strong - * equality when checking the result, since index could be 0. + * equality when checking the result, since index could be 0. */ public function charsUntil($bytes, $max = null); @@ -65,20 +65,18 @@ interface InputStream extends \Iterator * and returns the matched substring. * * @see strspn - * @param string $bytes - * A mask of bytes to match. If ANY byte in this mask matches the - * current char, the pointer advances and the char is part of the - * substring. - * @param int $max - * The max number of chars to read. + * + * @param string $bytes A mask of bytes to match. If ANY byte in this mask matches the + * current char, the pointer advances and the char is part of the + * substring. + * @param int $max The max number of chars to read */ public function charsWhile($bytes, $max = null); /** * Unconsume one character. * - * @param int $howMany - * The number of characters to move the pointer back. + * @param int $howMany The number of characters to move the pointer back */ public function unconsume($howMany = 1); diff --git a/src/HTML5/Parser/ParseError.php b/src/HTML5/Parser/ParseError.php index 86498a1..640e516 100644 --- a/src/HTML5/Parser/ParseError.php +++ b/src/HTML5/Parser/ParseError.php @@ -1,4 +1,5 @@ <?php + namespace Masterminds\HTML5\Parser; /** diff --git a/src/HTML5/Parser/Scanner.php b/src/HTML5/Parser/Scanner.php index cec9a13..7bea1ae 100644 --- a/src/HTML5/Parser/Scanner.php +++ b/src/HTML5/Parser/Scanner.php @@ -1,4 +1,5 @@ <?php + namespace Masterminds\HTML5\Parser; use Masterminds\HTML5\Exception; @@ -18,7 +19,7 @@ class Scanner private $data; /** - * The current integer byte position we are in $data + * The current integer byte position we are in $data. */ private $char; @@ -35,10 +36,10 @@ class Scanner /** * Create a new Scanner. * - * @param string $data Data to parse - * @param string $encoding The encoding to use for the data. + * @param string $data Data to parse + * @param string $encoding the encoding to use for the data * - * @throws Exception If the given data cannot be encoded to UTF-8. + * @throws Exception if the given data cannot be encoded to UTF-8 */ public function __construct($data, $encoding = 'UTF-8') { @@ -75,20 +76,21 @@ class Scanner * '</script>' string. * * @param string $sequence - * @param bool $caseSensitive + * @param bool $caseSensitive * * @return bool */ public function sequenceMatches($sequence, $caseSensitive = true) { $portion = substr($this->data, $this->char, strlen($sequence)); - return $caseSensitive ? $portion === $sequence : strcasecmp($portion, $sequence) === 0; + + return $caseSensitive ? $portion === $sequence : 0 === strcasecmp($portion, $sequence); } /** * Get the current position. * - * @return int The current intiger byte position. + * @return int the current intiger byte position */ public function position() { @@ -98,7 +100,7 @@ class Scanner /** * Take a peek at the next character in the data. * - * @return string The next character. + * @return string the next character */ public function peek() { @@ -114,11 +116,11 @@ class Scanner * * Note: This advances the pointer. * - * @return string The next character. + * @return string the next character */ public function next() { - $this->char++; + ++$this->char; if ($this->char < $this->EOF) { return $this->data[$this->char]; @@ -132,7 +134,7 @@ class Scanner * * Note, this does not advance the pointer. * - * @return string The current character. + * @return string the current character */ public function current() { @@ -157,13 +159,12 @@ class Scanner * Unconsume some of the data. * This moves the data pointer backwards. * - * @param int $howMany - * The number of characters to move the pointer back. + * @param int $howMany The number of characters to move the pointer back */ public function unconsume($howMany = 1) { if (($this->char - $howMany) >= 0) { - $this->char = $this->char - $howMany; + $this->char -= $howMany; } } @@ -173,7 +174,7 @@ class Scanner * Note, along with getting the characters the pointer in the data will be * moved as well. * - * @return string The next group that is hex characters. + * @return string the next group that is hex characters */ public function getHex() { @@ -186,7 +187,7 @@ class Scanner * Note, along with getting the characters the pointer in the data will be * moved as well. * - * @return string The next group of ASCII alpha characters. + * @return string the next group of ASCII alpha characters */ public function getAsciiAlpha() { @@ -199,7 +200,7 @@ class Scanner * Note, along with getting the characters the pointer in the data will be * moved as well. * - * @return string The next group of ASCII alpha characters and numbers. + * @return string the next group of ASCII alpha characters and numbers */ public function getAsciiAlphaNum() { @@ -212,7 +213,7 @@ class Scanner * Note, along with getting the characters the pointer in the data will be * moved as well. * - * @return string The next group of numbers. + * @return string the next group of numbers */ public function getNumeric() { @@ -242,11 +243,11 @@ class Scanner /** * Returns the current line that is being consumed. * - * @return int The current line number. + * @return int the current line number */ public function currentLine() { - if (empty($this->EOF) || $this->char == 0) { + if (empty($this->EOF) || 0 === $this->char) { return 1; } @@ -284,12 +285,12 @@ class Scanner * * Newlines are column 0. The first char after a newline is column 1. * - * @return int The column number. + * @return int the column number */ public function columnOffset() { // Short circuit for the first char. - if ($this->char == 0) { + if (0 === $this->char) { return 0; } @@ -303,7 +304,7 @@ class Scanner // However, for here we want the length up until the next byte to be // processed, so add one to the current byte ($this->char). - if ($lastLine !== false) { + if (false !== $lastLine) { $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine); } else { // After a newline. @@ -318,7 +319,7 @@ class Scanner * * This consumes characters until the EOF. * - * @return int The number of characters remaining. + * @return int the number of characters remaining */ public function remainingChars() { @@ -351,7 +352,7 @@ class Scanner $crlfTable = array( "\0" => "\xEF\xBF\xBD", "\r\n" => "\n", - "\r" => "\n" + "\r" => "\n", ); return strtr($data, $crlfTable); @@ -365,12 +366,11 @@ class Scanner * Matches as far as possible until we reach a certain set of bytes * and returns the matched substring. * - * @param string $bytes - * Bytes to match. - * @param int $max - * Maximum number of bytes to scan. + * @param string $bytes Bytes to match + * @param int $max Maximum number of bytes to scan + * * @return mixed Index or false if no match is found. You should use strong - * equality when checking the result, since index could be 0. + * equality when checking the result, since index could be 0. */ private function doCharsUntil($bytes, $max = null) { @@ -378,7 +378,7 @@ class Scanner return false; } - if ($max === 0 || $max) { + if (0 === $max || $max) { $len = strcspn($this->data, $bytes, $this->char, $max); } else { $len = strcspn($this->data, $bytes, $this->char); @@ -396,12 +396,10 @@ class Scanner * Matches as far as possible with a certain set of bytes * and returns the matched substring. * - * @param string $bytes - * A mask of bytes to match. If ANY byte in this mask matches the - * current char, the pointer advances and the char is part of the - * substring. - * @param int $max - * The max number of chars to read. + * @param string $bytes A mask of bytes to match. If ANY byte in this mask matches the + * current char, the pointer advances and the char is part of the + * substring. + * @param int $max The max number of chars to read * * @return string */ @@ -411,7 +409,7 @@ class Scanner return false; } - if ($max === 0 || $max) { + if (0 === $max || $max) { $len = strspn($this->data, $bytes, $this->char, $max); } else { $len = strspn($this->data, $bytes, $this->char); diff --git a/src/HTML5/Parser/StringInputStream.php b/src/HTML5/Parser/StringInputStream.php index 0118468..2281990 100644 --- a/src/HTML5/Parser/StringInputStream.php +++ b/src/HTML5/Parser/StringInputStream.php @@ -2,6 +2,7 @@ /** * Loads a string to be parsed. */ + namespace Masterminds\HTML5\Parser; /* @@ -50,7 +51,7 @@ class StringInputStream implements InputStream private $data; /** - * The current integer byte position we are in $data + * The current integer byte position we are in $data. */ private $char; @@ -67,9 +68,9 @@ class StringInputStream implements InputStream /** * Create a new InputStream wrapper. * - * @param string $data Data to parse - * @param string $encoding The encoding to use for the data. - * @param string $debug A fprintf format to use to echo the data on stdout. + * @param string $data Data to parse + * @param string $encoding the encoding to use for the data + * @param string $debug a fprintf format to use to echo the data on stdout */ public function __construct($data, $encoding = 'UTF-8', $debug = '') { @@ -110,7 +111,7 @@ class StringInputStream implements InputStream $crlfTable = array( "\0" => "\xEF\xBF\xBD", "\r\n" => "\n", - "\r" => "\n" + "\r" => "\n", ); return strtr($data, $crlfTable); @@ -121,7 +122,7 @@ class StringInputStream implements InputStream */ public function currentLine() { - if (empty($this->EOF) || $this->char == 0) { + if (empty($this->EOF) || 0 === $this->char) { return 1; } // Add one to $this->char because we want the number for the next @@ -130,9 +131,7 @@ class StringInputStream implements InputStream } /** - * * @deprecated - * */ public function getCurrentLine() { @@ -144,12 +143,12 @@ class StringInputStream implements InputStream * * Newlines are column 0. The first char after a newline is column 1. * - * @return int The column number. + * @return int the column number */ public function columnOffset() { // Short circuit for the first char. - if ($this->char == 0) { + if (0 === $this->char) { return 0; } // strrpos is weird, and the offset needs to be negative for what we @@ -162,7 +161,7 @@ class StringInputStream implements InputStream // However, for here we want the length up until the next byte to be // processed, so add one to the current byte ($this->char). - if ($lastLine !== false) { + if (false !== $lastLine) { $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine); } else { // After a newline. @@ -173,9 +172,7 @@ class StringInputStream implements InputStream } /** - * * @deprecated - * */ public function getColumnOffset() { @@ -185,7 +182,7 @@ class StringInputStream implements InputStream /** * Get the current character. * - * @return string The current character. + * @return string the current character */ public function current() { @@ -198,7 +195,7 @@ class StringInputStream implements InputStream */ public function next() { - $this->char ++; + ++$this->char; } /** @@ -212,15 +209,11 @@ class StringInputStream implements InputStream /** * Is the current pointer location valid. * - * @return bool Is the current pointer location valid. + * @return bool is the current pointer location valid */ public function valid() { - if ($this->char < $this->EOF) { - return true; - } - - return false; + return $this->char < $this->EOF; } /** @@ -232,7 +225,7 @@ class StringInputStream implements InputStream * @note This performs bounds checking * * @return string Returns the remaining text. If called when the InputStream is - * already exhausted, it returns an empty string. + * already exhausted, it returns an empty string. */ public function remainingChars() { @@ -254,12 +247,11 @@ class StringInputStream implements InputStream * Matches as far as possible until we reach a certain set of bytes * and returns the matched substring. * - * @param string $bytes - * Bytes to match. - * @param int $max - * Maximum number of bytes to scan. + * @param string $bytes Bytes to match + * @param int $max Maximum number of bytes to scan + * * @return mixed Index or false if no match is found. You should use strong - * equality when checking the result, since index could be 0. + * equality when checking the result, since index could be 0. */ public function charsUntil($bytes, $max = null) { @@ -267,7 +259,7 @@ class StringInputStream implements InputStream return false; } - if ($max === 0 || $max) { + if (0 === $max || $max) { $len = strcspn($this->data, $bytes, $this->char, $max); } else { $len = strcspn($this->data, $bytes, $this->char); @@ -285,12 +277,10 @@ class StringInputStream implements InputStream * Matches as far as possible with a certain set of bytes * and returns the matched substring. * - * @param string $bytes - * A mask of bytes to match. If ANY byte in this mask matches the - * current char, the pointer advances and the char is part of the - * substring. - * @param int $max - * The max number of chars to read. + * @param string $bytes A mask of bytes to match. If ANY byte in this mask matches the + * current char, the pointer advances and the char is part of the + * substring. + * @param int $max The max number of chars to read * * @return string */ @@ -300,7 +290,7 @@ class StringInputStream implements InputStream return false; } - if ($max === 0 || $max) { + if (0 === $max || $max) { $len = strspn($this->data, $bytes, $this->char, $max); } else { $len = strspn($this->data, $bytes, $this->char); @@ -314,13 +304,12 @@ class StringInputStream implements InputStream /** * Unconsume characters. * - * @param int $howMany - * The number of characters to unconsume. + * @param int $howMany The number of characters to unconsume */ public function unconsume($howMany = 1) { if (($this->char - $howMany) >= 0) { - $this->char = $this->char - $howMany; + $this->char -= $howMany; } } diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index ba9de52..cfd0e43 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -1,4 +1,5 @@ <?php + namespace Masterminds\HTML5\Parser; use Masterminds\HTML5\Elements; @@ -25,7 +26,6 @@ use Masterminds\HTML5\Elements; */ class Tokenizer { - protected $scanner; protected $events; @@ -56,12 +56,9 @@ class Tokenizer * it a scanner (input) and an event handler (output), and then calling * the Tokenizer::parse() method.` * - * @param \Masterminds\HTML5\Parser\Scanner $scanner - * A scanner initialized with an input stream. - * @param \Masterminds\HTML5\Parser\EventHandler $eventHandler - * An event handler, initialized and ready to receive - * events. - * @param string $mode + * @param Scanner $scanner A scanner initialized with an input stream + * @param EventHandler $eventHandler An event handler, initialized and ready to receive events + * @param string $mode */ public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML) { @@ -103,11 +100,9 @@ class Tokenizer * Normally, setting is done by the event handler via a special return code on * startTag(), but it can also be set manually using this function. * - * @param integer $textmode - * One of Elements::TEXT_* - * @param string $untilTag - * The tag that should stop RAW or RCDATA mode. Normal mode does not - * use this indicator. + * @param int $textmode One of Elements::TEXT_* + * @param string $untilTag The tag that should stop RAW or RCDATA mode. Normal mode does not + * use this indicator. */ public function setTextMode($textmode, $untilTag = null) { @@ -117,13 +112,13 @@ class Tokenizer /** * Consume a character and make a move. - * HTML5 8.2.4.1 + * HTML5 8.2.4.1. */ protected function consumeData() { $tok = $this->scanner->current(); - if ($tok === '&') { + if ('&' === $tok) { // Character reference $ref = $this->decodeCharacterReference(); $this->buffer($ref); @@ -132,7 +127,7 @@ class Tokenizer } // Parse tag - if ($tok === '<') { + if ('<' === $tok) { // Any buffered text data can go out now. $this->flushBuffer(); @@ -143,7 +138,7 @@ class Tokenizer || $this->processingInstruction() || $this->tagName() // This always returns false. - || $this->parseError("Illegal tag opening") + || $this->parseError('Illegal tag opening') || $this->characterData(); $tok = $this->scanner->current(); @@ -153,7 +148,7 @@ class Tokenizer $this->eof($tok); // Parse character - if ($tok !== false) { + if (false !== $tok) { switch ($this->textMode) { case Elements::TEXT_RAW: $this->rawText($tok); @@ -164,10 +159,10 @@ class Tokenizer break; default: - if (!strspn($tok, "<&")) { + if (!strspn($tok, '<&')) { // NULL character - if ($tok === "\00") { - $this->parseError("Received null character."); + if ("\00" === $tok) { + $this->parseError('Received null character.'); } $this->text .= $tok; @@ -189,7 +184,7 @@ class Tokenizer protected function characterData() { $tok = $this->scanner->current(); - if ($tok === false) { + if (false === $tok) { return false; } switch ($this->textMode) { @@ -198,9 +193,10 @@ class Tokenizer case Elements::TEXT_RCDATA: return $this->rcdata($tok); default: - if (strspn($tok, "<&")) { + if (strspn($tok, '<&')) { return false; } + return $this->text($tok); } } @@ -208,20 +204,20 @@ class Tokenizer /** * This buffers the current token as character data. * - * @param string $tok The current token. + * @param string $tok the current token * * @return bool */ protected function text($tok) { // This should never happen... - if ($tok === false) { + if (false === $tok) { return false; } // NULL character - if ($tok === "\00") { - $this->parseError("Received null character."); + if ("\00" === $tok) { + $this->parseError('Received null character.'); } $this->buffer($tok); @@ -233,7 +229,7 @@ class Tokenizer /** * Read text in RAW mode. * - * @param string $tok The current token. + * @param string $tok the current token * * @return bool */ @@ -254,7 +250,7 @@ class Tokenizer /** * Read text in RCDATA mode. * - * @param string $tok The current token. + * @param string $tok the current token * * @return bool */ @@ -268,8 +264,8 @@ class Tokenizer $txt = ''; $caseSensitive = !Elements::isHtml5Element($this->untilTag); - while ($tok !== false && ! ($tok == '<' && ($this->scanner->sequenceMatches($sequence, $caseSensitive)))) { - if ($tok == '&') { + while (false !== $tok && !('<' == $tok && ($this->scanner->sequenceMatches($sequence, $caseSensitive)))) { + if ('&' == $tok) { $txt .= $this->decodeCharacterReference(); $tok = $this->scanner->current(); } else { @@ -280,8 +276,8 @@ class Tokenizer $len = strlen($sequence); $this->scanner->consume($len); $len += $this->scanner->whitespace(); - if ($this->scanner->current() !== '>') { - $this->parseError("Unclosed RCDATA end tag"); + if ('>' !== $this->scanner->current()) { + $this->parseError('Unclosed RCDATA end tag'); } $this->scanner->unconsume($len); @@ -296,7 +292,7 @@ class Tokenizer */ protected function eof($tok) { - if ($tok === false) { + if (false === $tok) { // fprintf(STDOUT, "EOF"); $this->flushBuffer(); $this->events->eof(); @@ -313,40 +309,37 @@ class Tokenizer */ protected function markupDeclaration($tok) { - if ($tok != '!') { + if ('!' != $tok) { return false; } $tok = $this->scanner->next(); // Comment: - if ($tok == '-' && $this->scanner->peek() == '-') { + if ('-' == $tok && '-' == $this->scanner->peek()) { $this->scanner->next(); // Consume the other '-' $this->scanner->next(); // Next char. return $this->comment(); - } - - elseif ($tok == 'D' || $tok == 'd') { // Doctype + } elseif ('D' == $tok || 'd' == $tok) { // Doctype return $this->doctype(); - } - - elseif ($tok == '[') { // CDATA section + } elseif ('[' == $tok) { // CDATA section return $this->cdataSection(); } // FINISH - $this->parseError("Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s", $tok); + $this->parseError('Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s', $tok); $this->bogusComment('<!'); + return true; } /** * Consume an end tag. - * 8.2.4.9 + * 8.2.4.9. */ protected function endTag() { - if ($this->scanner->current() != '/') { + if ('/' != $this->scanner->current()) { return false; } $tok = $this->scanner->next(); @@ -355,21 +348,22 @@ class Tokenizer // > -> parse error // EOF -> parse error // -> parse error - if (! ctype_alpha($tok)) { + if (!ctype_alpha($tok)) { $this->parseError("Expected tag name, got '%s'", $tok); - if ($tok == "\0" || $tok === false) { + if ("\0" == $tok || false === $tok) { return false; } + return $this->bogusComment('</'); } $name = $this->scanner->charsUntil("\n\f \t>"); - $name = $this->mode === self::CONFORMANT_XML ? $name: strtolower($name); + $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name); // Trash whitespace. $this->scanner->whitespace(); $tok = $this->scanner->current(); - if ($tok != '>') { + if ('>' != $tok) { $this->parseError("Expected >, got '%s'", $tok); // We just trash stuff until we get to the next tag close. $this->scanner->charsUntil('>'); @@ -377,23 +371,24 @@ class Tokenizer $this->events->endTag($name); $this->scanner->next(); + return true; } /** * Consume a tag name and body. - * 8.2.4.10 + * 8.2.4.10. */ protected function tagName() { $tok = $this->scanner->current(); - if (! ctype_alpha($tok)) { + if (!ctype_alpha($tok)) { return false; } // We know this is at least one char. - $name = $this->scanner->charsWhile(":_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - $name = $this->mode === self::CONFORMANT_XML ? $name : strtolower($name); + $name = $this->scanner->charsWhile(':_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'); + $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name); $attributes = array(); $selfClose = false; @@ -403,7 +398,7 @@ class Tokenizer do { $this->scanner->whitespace(); $this->attribute($attributes); - } while (! $this->isTagEnd($selfClose)); + } while (!$this->isTagEnd($selfClose)); } catch (ParseError $e) { $selfClose = false; } @@ -425,30 +420,34 @@ class Tokenizer protected function isTagEnd(&$selfClose) { $tok = $this->scanner->current(); - if ($tok == '/') { + if ('/' == $tok) { $this->scanner->next(); $this->scanner->whitespace(); $tok = $this->scanner->current(); - if ($tok == '>') { + if ('>' == $tok) { $selfClose = true; + return true; } - if ($tok === false) { - $this->parseError("Unexpected EOF inside of tag."); + if (false === $tok) { + $this->parseError('Unexpected EOF inside of tag.'); + return true; } // Basically, we skip the / token and go on. // See 8.2.4.43. $this->parseError("Unexpected '%s' inside of a tag.", $tok); + return false; } - if ($tok == '>') { + if ('>' == $tok) { return true; } - if ($tok === false) { - $this->parseError("Unexpected EOF inside of tag."); + if (false === $tok) { + $this->parseError('Unexpected EOF inside of tag.'); + return true; } @@ -467,23 +466,23 @@ class Tokenizer protected function attribute(&$attributes) { $tok = $this->scanner->current(); - if ($tok == '/' || $tok == '>' || $tok === false) { + if ('/' == $tok || '>' == $tok || false === $tok) { return false; } - if ($tok == '<') { + if ('<' == $tok) { $this->parseError("Unexpected '<' inside of attributes list."); // Push the < back onto the stack. $this->scanner->unconsume(); // Let the caller figure out how to handle this. - throw new ParseError("Start tag inside of attribute."); + throw new ParseError('Start tag inside of attribute.'); } $name = strtolower($this->scanner->charsUntil("/>=\n\f\t ")); - if (strlen($name) == 0) { + if (0 == strlen($name)) { $tok = $this->scanner->current(); - $this->parseError("Expected an attribute name, got %s.", $tok); + $this->parseError('Expected an attribute name, got %s.', $tok); // Really, only '=' can be the char here. Everything else gets absorbed // under one rule or another. $name = $tok; @@ -497,18 +496,17 @@ class Tokenizer // see issue #23: https://github.com/Masterminds/html5-php/issues/23 // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) { - $this->parseError("Unexpected characters in attribute name: %s", $name); + $this->parseError('Unexpected characters in attribute name: %s', $name); $isValidAttribute = false; } // There is no limitation for 1st character in HTML5. // But method "DOMElement::setAttribute" is throwing exception for the // characters below so they have to be filtered. // see issue #23: https://github.com/Masterminds/html5-php/issues/23 // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name - else - if (preg_match("/^[0-9.-]/u", $name)) { - $this->parseError("Unexpected character at the begining of attribute name: %s", $name); - $isValidAttribute = false; - } + elseif (preg_match('/^[0-9.-]/u', $name)) { + $this->parseError('Unexpected character at the begining of attribute name: %s', $name); + $isValidAttribute = false; + } // 8.1.2.3 $this->scanner->whitespace(); @@ -516,6 +514,7 @@ class Tokenizer if ($isValidAttribute) { $attributes[$name] = $val; } + return true; } @@ -527,7 +526,7 @@ class Tokenizer */ protected function attributeValue() { - if ($this->scanner->current() != '=') { + if ('=' != $this->scanner->current()) { return null; } $this->scanner->next(); @@ -538,21 +537,24 @@ class Tokenizer switch ($tok) { case "\n": case "\f": - case " ": + case ' ': case "\t": // Whitespace here indicates an empty value. return null; case '"': case "'": $this->scanner->next(); + return $this->quotedAttributeValue($tok); case '>': // case '/': // 8.2.4.37 seems to allow foo=/ as a valid attr. - $this->parseError("Expected attribute value, got tag end."); + $this->parseError('Expected attribute value, got tag end.'); + return null; case '=': case '`': - $this->parseError("Expecting quotes, got %s.", $tok); + $this->parseError('Expecting quotes, got %s.', $tok); + return $this->unquotedAttributeValue(); default: return $this->unquotedAttributeValue(); @@ -562,11 +564,11 @@ class Tokenizer /** * Get an attribute value string. * - * @param string $quote - * IMPORTANT: This is a series of chars! Any one of which will be considered - * termination of an attribute's value. E.g. "\"'" will stop at either - * ' or ". - * @return string The attribute value. + * @param string $quote IMPORTANT: This is a series of chars! Any one of which will be considered + * termination of an attribute's value. E.g. "\"'" will stop at either + * ' or ". + * + * @return string the attribute value */ protected function quotedAttributeValue($quote) { @@ -574,21 +576,22 @@ class Tokenizer $val = ''; while (true) { - $tokens = $this->scanner->charsUntil($stoplist.'&'); - if ($tokens !== false) { + $tokens = $this->scanner->charsUntil($stoplist . '&'); + if (false !== $tokens) { $val .= $tokens; } else { break; } $tok = $this->scanner->current(); - if ($tok == '&') { + if ('&' == $tok) { $val .= $this->decodeCharacterReference(true); continue; } break; } $this->scanner->next(); + return $val; } @@ -597,34 +600,34 @@ class Tokenizer $stoplist = "\t\n\f >"; $val = ''; $tok = $this->scanner->current(); - while (strspn($tok, $stoplist) == 0 && $tok !== false) { - if ($tok == '&') { + while (0 == strspn($tok, $stoplist) && false !== $tok) { + if ('&' == $tok) { $val .= $this->decodeCharacterReference(true); $tok = $this->scanner->current(); } else { if (strspn($tok, "\"'<=`") > 0) { - $this->parseError("Unexpected chars in unquoted attribute value %s", $tok); + $this->parseError('Unexpected chars in unquoted attribute value %s', $tok); } $val .= $tok; $tok = $this->scanner->next(); } } + return $val; } /** * Consume malformed markup as if it were a comment. - * 8.2.4.44 + * 8.2.4.44. * * The spec requires that the ENTIRE tag-like thing be enclosed inside of * the comment. So this will generate comments like: * * <!--</+foo>--> * - * @param string $leading - * Prepend any leading characters. This essentially - * negates the need to backtrack, but it's sort of - * a hack. + * @param string $leading Prepend any leading characters. This essentially + * negates the need to backtrack, but it's sort of + * a hack. * * @return bool */ @@ -632,11 +635,11 @@ class Tokenizer { $comment = $leading; $tokens = $this->scanner->charsUntil('>'); - if ($tokens !== false) { + if (false !== $tokens) { $comment .= $tokens; } $tok = $this->scanner->current(); - if ($tok !== false) { + if (false !== $tok) { $comment .= $tok; } @@ -660,25 +663,27 @@ class Tokenizer $comment = ''; // <!-->. Emit an empty comment because 8.2.4.46 says to. - if ($tok == '>') { + if ('>' == $tok) { // Parse error. Emit the comment token. $this->parseError("Expected comment data, got '>'"); $this->events->comment(''); $this->scanner->next(); + return true; } // Replace NULL with the replacement char. - if ($tok == "\0") { + if ("\0" == $tok) { $tok = UTF8Utils::FFFD; } - while (! $this->isCommentEnd()) { + while (!$this->isCommentEnd()) { $comment .= $tok; $tok = $this->scanner->next(); } $this->events->comment($comment); $this->scanner->next(); + return true; } @@ -692,24 +697,26 @@ class Tokenizer $tok = $this->scanner->current(); // EOF - if ($tok === false) { + if (false === $tok) { // Hit the end. - $this->parseError("Unexpected EOF in a comment."); + $this->parseError('Unexpected EOF in a comment.'); + return true; } // If it doesn't start with -, not the end. - if ($tok != '-') { + if ('-' != $tok) { return false; } // Advance one, and test for '->' - if ($this->scanner->next() == '-' && $this->scanner->peek() == '>') { + if ('-' == $this->scanner->next() && '>' == $this->scanner->peek()) { $this->scanner->next(); // Consume the last '>' return true; } // Unread '-'; $this->scanner->unconsume(1); + return false; } @@ -729,9 +736,10 @@ class Tokenizer return false; } // Check that string is DOCTYPE. - $chars = $this->scanner->charsWhile("DOCTYPEdoctype"); + $chars = $this->scanner->charsWhile('DOCTYPEdoctype'); if (strcasecmp($chars, 'DOCTYPE')) { $this->parseError('Expected DOCTYPE, got %s', $chars); + return $this->bogusComment('<!' . $chars); } @@ -739,14 +747,15 @@ class Tokenizer $tok = $this->scanner->current(); // EOF: die. - if ($tok === false) { + if (false === $tok) { $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true); + return $this->eof($tok); } // NULL char: convert. - if ($tok === "\0") { - $this->parseError("Unexpected null character in DOCTYPE."); + if ("\0" === $tok) { + $this->parseError('Unexpected null character in DOCTYPE.'); } $stop = " \n\f>"; @@ -757,23 +766,26 @@ class Tokenizer $tok = $this->scanner->current(); // If false, emit a parse error, DOCTYPE, and return. - if ($tok === false) { + if (false === $tok) { $this->parseError('Unexpected EOF in DOCTYPE declaration.'); $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true); + return true; } // Short DOCTYPE, like <!DOCTYPE html> - if ($tok == '>') { + if ('>' == $tok) { // DOCTYPE without a name. - if (strlen($doctypeName) == 0) { - $this->parseError("Expected a DOCTYPE name. Got nothing."); + if (0 == strlen($doctypeName)) { + $this->parseError('Expected a DOCTYPE name. Got nothing.'); $this->events->doctype($doctypeName, 0, null, true); $this->scanner->next(); + return true; } $this->events->doctype($doctypeName); $this->scanner->next(); + return true; } $this->scanner->whitespace(); @@ -782,36 +794,40 @@ class Tokenizer $white = $this->scanner->whitespace(); // Get ID, and flag it as pub or system. - if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) { + if (('PUBLIC' == $pub || 'SYSTEM' == $pub) && $white > 0) { // Get the sys ID. - $type = $pub == 'PUBLIC' ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM; + $type = 'PUBLIC' == $pub ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM; $id = $this->quotedString("\0>"); - if ($id === false) { + if (false === $id) { $this->events->doctype($doctypeName, $type, $pub, false); + return false; } // Premature EOF. - if ($this->scanner->current() === false) { - $this->parseError("Unexpected EOF in DOCTYPE"); + if (false === $this->scanner->current()) { + $this->parseError('Unexpected EOF in DOCTYPE'); $this->events->doctype($doctypeName, $type, $id, true); + return true; } // Well-formed complete DOCTYPE. $this->scanner->whitespace(); - if ($this->scanner->current() == '>') { + if ('>' == $this->scanner->current()) { $this->events->doctype($doctypeName, $type, $id, false); $this->scanner->next(); + return true; } // If we get here, we have <!DOCTYPE foo PUBLIC "bar" SOME_JUNK // Throw away the junk, parse error, quirks mode, return true. - $this->scanner->charsUntil(">"); - $this->parseError("Malformed DOCTYPE."); + $this->scanner->charsUntil('>'); + $this->parseError('Malformed DOCTYPE.'); $this->events->doctype($doctypeName, $type, $id, true); $this->scanner->next(); + return true; } @@ -819,35 +835,37 @@ class Tokenizer // Consume to > and trash. $this->scanner->charsUntil('>'); - $this->parseError("Expected PUBLIC or SYSTEM. Got %s.", $pub); + $this->parseError('Expected PUBLIC or SYSTEM. Got %s.', $pub); $this->events->doctype($doctypeName, 0, null, true); $this->scanner->next(); + return true; } /** * Utility for reading a quoted string. * - * @param string $stopchars - * Characters (in addition to a close-quote) that should stop the string. - * E.g. sometimes '>' is higher precedence than '"' or "'". + * @param string $stopchars Characters (in addition to a close-quote) that should stop the string. + * E.g. sometimes '>' is higher precedence than '"' or "'". * * @return mixed String if one is found (quotations omitted) */ protected function quotedString($stopchars) { $tok = $this->scanner->current(); - if ($tok == '"' || $tok == "'") { + if ('"' == $tok || "'" == $tok) { $this->scanner->next(); $ret = $this->scanner->charsUntil($tok . $stopchars); if ($this->scanner->current() == $tok) { $this->scanner->next(); } else { // Parse error because no close quote. - $this->parseError("Expected %s, got %s", $tok, $this->scanner->current()); + $this->parseError('Expected %s, got %s', $tok, $this->scanner->current()); } + return $ret; } + return false; } @@ -858,39 +876,43 @@ class Tokenizer */ protected function cdataSection() { - if ($this->scanner->current() != '[') { + if ('[' != $this->scanner->current()) { return false; } $cdata = ''; $this->scanner->next(); $chars = $this->scanner->charsWhile('CDAT'); - if ($chars != 'CDATA' || $this->scanner->current() != '[') { + if ('CDATA' != $chars || '[' != $this->scanner->current()) { $this->parseError('Expected [CDATA[, got %s', $chars); + return $this->bogusComment('<![' . $chars); } $tok = $this->scanner->next(); do { - if ($tok === false) { + if (false === $tok) { $this->parseError('Unexpected EOF inside CDATA.'); $this->bogusComment('<![CDATA[' . $cdata); + return true; } $cdata .= $tok; $tok = $this->scanner->next(); - } while (! $this->scanner->sequenceMatches(']]>')); + } while (!$this->scanner->sequenceMatches(']]>')); // Consume ]]> $this->scanner->consume(3); $this->events->cdata($cdata); + return true; } // ================================================================ // Non-HTML5 // ================================================================ + /** * Handle a processing instruction. * @@ -903,7 +925,7 @@ class Tokenizer */ protected function processingInstruction() { - if ($this->scanner->current() != '?') { + if ('?' != $this->scanner->current()) { return false; } @@ -912,21 +934,23 @@ class Tokenizer $white = $this->scanner->whitespace(); // If not a PI, send to bogusComment. - if (strlen($procName) == 0 || $white == 0 || $this->scanner->current() == false) { + if (0 == strlen($procName) || 0 == $white || false == $this->scanner->current()) { $this->parseError("Expected processing instruction name, got $tok"); $this->bogusComment('<?' . $tok . $procName); + return true; } $data = ''; // As long as it's not the case that the next two chars are ? and >. - while (! ($this->scanner->current() == '?' && $this->scanner->peek() == '>')) { + while (!('?' == $this->scanner->current() && '>' == $this->scanner->peek())) { $data .= $this->scanner->current(); $tok = $this->scanner->next(); - if ($tok === false) { - $this->parseError("Unexpected EOF in processing instruction."); + if (false === $tok) { + $this->parseError('Unexpected EOF in processing instruction.'); $this->events->processingInstruction($procName, $data); + return true; } } @@ -934,6 +958,7 @@ class Tokenizer $this->scanner->next(); // > $this->scanner->next(); // Next token. $this->events->processingInstruction($procName, $data); + return true; } @@ -955,7 +980,7 @@ class Tokenizer // Optimization for reading larger blocks faster. $first = substr($sequence, 0, 1); - while ($this->scanner->current() !== false) { + while (false !== $this->scanner->current()) { $buffer .= $this->scanner->charsUntil($first); // Stop as soon as we hit the stopping condition. @@ -967,7 +992,8 @@ class Tokenizer } // If we get here, we hit the EOF. - $this->parseError("Unexpected EOF during text read."); + $this->parseError('Unexpected EOF during text read.'); + return $buffer; } @@ -985,7 +1011,7 @@ class Tokenizer * '</script>' string. * * @param string $sequence - * @param bool $caseSensitive + * @param bool $caseSensitive * * @return bool */ @@ -1005,7 +1031,7 @@ class Tokenizer */ protected function flushBuffer() { - if ($this->text === '') { + if ('' === $this->text) { return; } $this->events->text($this->text); @@ -1055,9 +1081,8 @@ class Tokenizer * * If $inAttribute is set to true, a bare & will be returned as-is. * - * @param bool $inAttribute - * Set to true if the text is inside of an attribute value. - * false otherwise. + * @param bool $inAttribute Set to true if the text is inside of an attribute value. + * false otherwise. * * @return string */ @@ -1067,36 +1092,37 @@ class Tokenizer $tok = $this->scanner->next(); $start = $this->scanner->position(); - if ($tok == false) { + if (false === $tok) { return '&'; } // These indicate not an entity. We return just // the &. - if (strspn($tok, static::WHITE . "&<") == 1) { + if (1 === strspn($tok, static::WHITE . '&<')) { // $this->scanner->next(); return '&'; } // Numeric entity - if ($tok == '#') { + if ('#' === $tok) { $tok = $this->scanner->next(); // Hexidecimal encoding. // X[0-9a-fA-F]+; // x[0-9a-fA-F]+; - if ($tok == 'x' || $tok == 'X') { + if ('x' === $tok || 'X' === $tok) { $tok = $this->scanner->next(); // Consume x // Convert from hex code to char. $hex = $this->scanner->getHex(); if (empty($hex)) { - $this->parseError("Expected &#xHEX;, got &#x%s", $tok); + $this->parseError('Expected &#xHEX;, got &#x%s', $tok); // We unconsume because we don't know what parser rules might // be in effect for the remaining chars. For example. '&#>' // might result in a specific parsing rule inside of tag // contexts, while not inside of pcdata context. $this->scanner->unconsume(2); + return '&'; } $entity = CharacterReference::lookupHex($hex); @@ -1105,17 +1131,17 @@ class Tokenizer else { // Convert from decimal to char. $numeric = $this->scanner->getNumeric(); - if ($numeric === false) { - $this->parseError("Expected &#DIGITS;, got &#%s", $tok); + if (false === $numeric) { + $this->parseError('Expected &#DIGITS;, got &#%s', $tok); $this->scanner->unconsume(2); + return '&'; } $entity = CharacterReference::lookupDecimal($numeric); } - } elseif ($tok === '=' && $inAttribute) { + } elseif ('=' === $tok && $inAttribute) { return '&'; } else { // String entity. - // Attempt to consume a string up to a ';'. // [a-zA-Z0-9]+; $cname = $this->scanner->getAsciiAlphaNum(); @@ -1124,11 +1150,12 @@ class Tokenizer // When no entity is found provide the name of the unmatched string // and continue on as the & is not part of an entity. The & will // be converted to & elsewhere. - if ($entity == null) { - if (!$inAttribute || strlen($cname) === 0) { + if (null === $entity) { + if (!$inAttribute || '' === $cname) { $this->parseError("No match in entity table for '%s'", $cname); } $this->scanner->unconsume($this->scanner->position() - $start); + return '&'; } } @@ -1137,8 +1164,9 @@ class Tokenizer $tok = $this->scanner->current(); // We have an entity. We're done here. - if ($tok == ';') { + if (';' === $tok) { $this->scanner->next(); + return $entity; } @@ -1146,10 +1174,12 @@ class Tokenizer // entire string. Otherwise, failure to match is an error. if ($inAttribute) { $this->scanner->unconsume($this->scanner->position() - $start); + return '&'; } - $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok); + $this->parseError('Expected &ENTITY;, got &ENTITY%s (no trailing ;) ', $tok); + return '&' . $entity; } } diff --git a/src/HTML5/Parser/TreeBuildingRules.php b/src/HTML5/Parser/TreeBuildingRules.php index d092872..9b94185 100644 --- a/src/HTML5/Parser/TreeBuildingRules.php +++ b/src/HTML5/Parser/TreeBuildingRules.php @@ -1,4 +1,5 @@ <?php + namespace Masterminds\HTML5\Parser; /** @@ -14,7 +15,6 @@ namespace Masterminds\HTML5\Parser; */ class TreeBuildingRules { - protected static $tags = array( 'li' => 1, 'dd' => 1, @@ -29,21 +29,10 @@ class TreeBuildingRules 'tbody' => 1, 'table' => 1, 'optgroup' => 1, - 'option' => 1 + 'option' => 1, ); /** - * Build a new rules engine. - * - * @param \DOMDocument $doc - * The DOM document to use for evaluation and modification. - */ - public function __construct($doc) - { - $this->doc = $doc; - } - - /** * Returns true if the given tagname has special processing rules. */ public function hasRules($tagname) @@ -56,7 +45,7 @@ class TreeBuildingRules * * This may modify the existing DOM. * - * @return \DOMElement The new Current DOM element. + * @return \DOMElement the new Current DOM element */ public function evaluate($new, $current) { @@ -71,7 +60,7 @@ class TreeBuildingRules return $this->handleRT($new, $current); case 'optgroup': return $this->closeIfCurrentMatches($new, $current, array( - 'optgroup' + 'optgroup', )); case 'option': return $this->closeIfCurrentMatches($new, $current, array( @@ -79,13 +68,13 @@ class TreeBuildingRules )); case 'tr': return $this->closeIfCurrentMatches($new, $current, array( - 'tr' + 'tr', )); case 'td': case 'th': return $this->closeIfCurrentMatches($new, $current, array( 'th', - 'td' + 'td', )); case 'tbody': case 'thead': @@ -95,7 +84,7 @@ class TreeBuildingRules return $this->closeIfCurrentMatches($new, $current, array( 'thead', 'tfoot', - 'tbody' + 'tbody', )); } @@ -105,7 +94,7 @@ class TreeBuildingRules protected function handleLI($ele, $current) { return $this->closeIfCurrentMatches($ele, $current, array( - 'li' + 'li', )); } @@ -113,7 +102,7 @@ class TreeBuildingRules { return $this->closeIfCurrentMatches($ele, $current, array( 'dt', - 'dd' + 'dd', )); } @@ -121,13 +110,13 @@ class TreeBuildingRules { return $this->closeIfCurrentMatches($ele, $current, array( 'rt', - 'rp' + 'rp', )); } protected function closeIfCurrentMatches($ele, $current, $match) { - if (in_array($current->tagName, $match)) { + if (in_array($current->tagName, $match, true)) { $current->parentNode->appendChild($ele); } else { $current->appendChild($ele); diff --git a/src/HTML5/Parser/UTF8Utils.php b/src/HTML5/Parser/UTF8Utils.php index 451c155..77c2dfb 100644 --- a/src/HTML5/Parser/UTF8Utils.php +++ b/src/HTML5/Parser/UTF8Utils.php @@ -1,5 +1,7 @@ <?php + namespace Masterminds\HTML5\Parser; + /* * * Portions based on code from html5lib files with the following copyright: @@ -30,11 +32,10 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. use Masterminds\HTML5\Exception; /** - * UTF-8 Utilities + * UTF-8 Utilities. */ class UTF8Utils { - /** * The Unicode replacement character.. */ @@ -76,10 +77,8 @@ class UTF8Utils * This has not yet been tested with charactersets other than UTF-8. * It should work with ISO-8859-1/-13 and standard Latin Win charsets. * - * @param string $data - * The data to convert. - * @param string $encoding - * A valid encoding. Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php + * @param string $data The data to convert + * @param string $encoding A valid encoding. Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php * * @return string */ @@ -108,7 +107,7 @@ class UTF8Utils $data = mb_convert_encoding($data, 'UTF-8', $encoding); mb_substitute_character($save); } // @todo Get iconv running in at least some environments if that is possible. - elseif (function_exists('iconv') && $encoding != 'auto') { + elseif (function_exists('iconv') && 'auto' !== $encoding) { // fprintf(STDOUT, "iconv found\n"); // iconv has the following behaviors: // - Overlong representations are ignored. @@ -122,7 +121,7 @@ class UTF8Utils /* * One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present. */ - if (substr($data, 0, 3) === "\xEF\xBB\xBF") { + if ("\xEF\xBB\xBF" === substr($data, 0, 3)) { $data = substr($data, 3); } @@ -132,9 +131,9 @@ class UTF8Utils /** * Checks for Unicode code points that are not valid in a document. * - * @param string $data A string to analyze. + * @param string $data a string to analyze * - * @return array An array of (string) error messages produced by the scanning. + * @return array an array of (string) error messages produced by the scanning */ public static function checkForIllegalCodepoints($data) { @@ -144,7 +143,7 @@ class UTF8Utils /* * All U+0000 null characters in the input must be replaced by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such characters is a parse error. */ - for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i ++) { + for ($i = 0, $count = substr_count($data, "\0"); $i < $count; ++$i) { $errors[] = 'null-character'; } @@ -166,7 +165,7 @@ class UTF8Utils | [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16}) )/x', $data, $matches); - for ($i = 0; $i < $count; $i ++) { + for ($i = 0; $i < $count; ++$i) { $errors[] = 'invalid-codepoint'; } |