352 lines
11 KiB
PHP
352 lines
11 KiB
PHP
<?php
|
|
|
|
/**
|
|
* @file
|
|
* Contains \Drupal\Component\Utility\Html.
|
|
*/
|
|
|
|
namespace Drupal\Component\Utility;
|
|
|
|
/**
|
|
* Provides DOMDocument helpers for parsing and serializing HTML strings.
|
|
*
|
|
* @ingroup utility
|
|
*/
|
|
class Html {

  /**
   * An array of previously cleaned HTML classes.
   *
   * Keyed by the raw class name passed to getClass(); each value is the
   * corresponding cleaned class name.
   *
   * @var array
   */
  protected static $classes = array();

  /**
   * An array of the initial IDs used in one request.
   *
   * Used to seed static::$seenIds the first time getUniqueId() runs and again
   * after resetSeenIds() has been called.
   *
   * @var array
   */
  protected static $seenIdsInit;

  /**
   * An array of IDs, including incremented versions when an ID is duplicated.
   *
   * Keyed by cleaned ID; each value is the number of times that ID has been
   * requested through getUniqueId() so far.
   *
   * @var array
   */
  protected static $seenIds;

  /**
   * Stores whether the current request was sent via AJAX.
   *
   * @var bool
   */
  protected static $isAjax = FALSE;

  /**
   * Prepares a string for use as a valid class name.
   *
   * Do not pass one string containing multiple classes as they will be
   * incorrectly concatenated with dashes, i.e. "one two" will become "one-two".
   *
   * Results are memoized per raw input string in static::$classes.
   *
   * @param string $class
   *   The class name to clean.
   *
   * @return string
   *   The cleaned class name.
   */
  public static function getClass($class) {
    if (!isset(static::$classes[$class])) {
      static::$classes[$class] = static::cleanCssIdentifier(Unicode::strtolower($class));
    }
    return static::$classes[$class];
  }

  /**
   * Prepares a string for use as a CSS identifier (element, class, or ID name).
   *
   * http://www.w3.org/TR/CSS21/syndata.html#characters shows the syntax for
   * valid CSS identifiers (including element names, classes, and IDs in
   * selectors.)
   *
   * @param string $identifier
   *   The identifier to clean.
   * @param array $filter
   *   An array of string replacements to use on the identifier. Keys are the
   *   substrings to search for, values are their replacements. The default
   *   maps '__' to itself so that double underscores survive the single
   *   underscore replacement (strtr() prefers the longest matching key).
   *
   * @return string
   *   The cleaned identifier.
   */
  public static function cleanCssIdentifier($identifier, array $filter = array(
    ' ' => '-',
    '_' => '-',
    '__' => '__',
    '/' => '-',
    '[' => '-',
    ']' => ''
  )) {
    $identifier = strtr($identifier, $filter);

    // Characters kept in the identifier are:
    // - the hyphen (U+002D)
    // - 0-9 (U+0030 - U+0039)
    // - A-Z (U+0041 - U+005A)
    // - the underscore (U+005F)
    // - a-z (U+0061 - U+007A)
    // - ISO 10646 characters U+00A1 through U+FFFF
    // Any character not in the above list is stripped.
    $identifier = preg_replace('/[^\x{002D}\x{0030}-\x{0039}\x{0041}-\x{005A}\x{005F}\x{0061}-\x{007A}\x{00A1}-\x{FFFF}]/u', '', $identifier);

    // Identifiers cannot start with a digit, two hyphens, or a hyphen followed
    // by a digit. A leading digit becomes '_'; a leading '--' or '-<digit>'
    // becomes '__'.
    $identifier = preg_replace(array(
      '/^[0-9]/',
      '/^(-[0-9])|^(--)/'
    ), array('_', '__'), $identifier);

    return $identifier;
  }

  /**
   * Sets if this request is an Ajax request.
   *
   * @param bool $is_ajax
   *   TRUE if this request is an Ajax request, FALSE otherwise.
   */
  public static function setIsAjax($is_ajax) {
    static::$isAjax = $is_ajax;
  }

  /**
   * Prepares a string for use as a valid HTML ID and guarantees uniqueness.
   *
   * This function ensures that each passed HTML ID value only exists once on
   * the page. By tracking the already returned ids, this function enables
   * forms, blocks, and other content to be output multiple times on the same
   * page, without breaking (X)HTML validation.
   *
   * For already existing IDs, a counter is appended to the ID string.
   * Therefore, JavaScript and CSS code should not rely on any value that was
   * generated by this function and instead should rely on manually added CSS
   * classes or similarly reliable constructs.
   *
   * Two consecutive hyphens separate the suffix from the original ID. Because
   * getId() collapses consecutive hyphens in the passed $id into a single
   * hyphen, that delimiter cannot occur inside the base ID itself.
   *
   * When the request has been flagged as an Ajax request via setIsAjax(), the
   * seen-ID bookkeeping is skipped and a random suffix obtained from
   * Crypt::randomBytesBase64() is appended instead of a counter.
   *
   * @param string $id
   *   The ID to clean.
   *
   * @return string
   *   The cleaned ID.
   *
   * @see self::getId()
   * @see self::setIsAjax()
   */
  public static function getUniqueId($id) {
    // If this is an Ajax request, then content returned by this page request
    // will be merged with content already on the base page. The HTML IDs must
    // be unique for the fully merged content. Therefore use unique IDs.
    if (static::$isAjax) {
      return static::getId($id) . '--' . Crypt::randomBytesBase64(8);
    }

    // @todo Remove all that code once we switch over to random IDs only,
    // see https://www.drupal.org/node/1090592.
    if (!isset(static::$seenIdsInit)) {
      static::$seenIdsInit = array();
    }
    if (!isset(static::$seenIds)) {
      static::$seenIds = static::$seenIdsInit;
    }

    $id = static::getId($id);

    // Ensure IDs are unique by appending a counter after the first occurrence.
    // The counter needs to be appended with a delimiter that does not exist in
    // the base ID. Requiring a unique delimiter helps ensure that we really do
    // return unique IDs and also makes it possible to recover the base ID from
    // a suffixed one.
    if (isset(static::$seenIds[$id])) {
      $id = $id . '--' . ++static::$seenIds[$id];
    }
    else {
      static::$seenIds[$id] = 1;
    }
    return $id;
  }

  /**
   * Prepares a string for use as a valid HTML ID.
   *
   * Only use this function when you want to intentionally skip the uniqueness
   * guarantee of self::getUniqueId().
   *
   * @param string $id
   *   The ID to clean.
   *
   * @return string
   *   The cleaned ID.
   *
   * @see self::getUniqueId()
   */
  public static function getId($id) {
    $id = strtr(Unicode::strtolower($id), array(' ' => '-', '_' => '-', '[' => '-', ']' => ''));

    // As defined in http://www.w3.org/TR/html4/types.html#type-name, HTML IDs can
    // only contain letters, digits ([0-9]), hyphens ("-"), underscores ("_"),
    // colons (":"), and periods ("."). We strip out any character not in that
    // list. Note that the CSS spec doesn't allow colons or periods in identifiers
    // (http://www.w3.org/TR/CSS21/syndata.html#characters), so we strip those two
    // characters as well.
    $id = preg_replace('/[^A-Za-z0-9\-_]/', '', $id);

    // Removing multiple consecutive hyphens.
    $id = preg_replace('/\-+/', '-', $id);
    return $id;
  }

  /**
   * Resets the list of seen IDs.
   *
   * The next call to getUniqueId() re-seeds the list from
   * static::$seenIdsInit.
   */
  public static function resetSeenIds() {
    static::$seenIds = NULL;
  }

  /**
   * Normalizes an HTML snippet.
   *
   * This function is essentially \DOMDocument::normalizeDocument(), but
   * operates on an HTML string instead of a \DOMDocument.
   *
   * @param string $html
   *   The HTML string to normalize.
   *
   * @return string
   *   The normalized HTML string.
   */
  public static function normalize($html) {
    $document = static::load($html);
    return static::serialize($document);
  }

  /**
   * Parses an HTML snippet and returns it as a DOM object.
   *
   * This function loads the body part of a partial (X)HTML document and returns
   * a full \DOMDocument object that represents this document.
   *
   * Use \Drupal\Component\Utility\Html::serialize() to serialize this
   * \DOMDocument back to a string.
   *
   * @param string $html
   *   The partial (X)HTML snippet to load. Invalid markup will be corrected on
   *   import.
   *
   * @return \DOMDocument
   *   A \DOMDocument that represents the loaded (X)HTML snippet.
   */
  public static function load($html) {
    $document = <<<EOD
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>
<body>!html</body>
</html>
EOD;
    // PHP's \DOMDocument serialization adds stray whitespace in case the markup
    // of the wrapping document contains newlines, so ensure to remove all
    // newlines before injecting the actual HTML body to process. strtr() does
    // not re-scan replaced text, so newlines inside $html itself are kept.
    $document = strtr($document, array("\n" => '', '!html' => $html));

    $dom = new \DOMDocument();
    // Ignore warnings during HTML soup loading.
    @$dom->loadHTML($document);

    return $dom;
  }

  /**
   * Converts the body of a \DOMDocument back to an HTML snippet.
   *
   * The function serializes the body part of a \DOMDocument back to an (X)HTML
   * snippet. The resulting (X)HTML snippet will be properly formatted to be
   * compatible with HTML user agents.
   *
   * @param \DOMDocument $document
   *   A \DOMDocument object to serialize, only the tags below the first <body>
   *   node will be converted.
   *
   * @return string
   *   A valid (X)HTML snippet, as a string. An empty string is returned when
   *   the document contains no <body> element.
   */
  public static function serialize(\DOMDocument $document) {
    $body_node = $document->getElementsByTagName('body')->item(0);
    $html = '';

    // A document without a <body> (for example an empty \DOMDocument) has
    // nothing to serialize; bail out instead of dereferencing NULL below.
    if ($body_node === NULL) {
      return $html;
    }

    foreach ($body_node->getElementsByTagName('script') as $node) {
      static::escapeCdataElement($node);
    }
    foreach ($body_node->getElementsByTagName('style') as $node) {
      static::escapeCdataElement($node, '/*', '*/');
    }
    foreach ($body_node->childNodes as $node) {
      $html .= $document->saveXML($node);
    }
    return $html;
  }

  /**
   * Adds comments around a <![CDATA[ section in a \DOMNode.
   *
   * \DOMDocument::loadHTML() in \Drupal\Component\Utility\Html::load() makes
   * CDATA sections from the contents of inline script and style tags. This can
   * cause HTML4 browsers to throw exceptions.
   *
   * This function attempts to solve the problem by creating a
   * \DOMDocumentFragment to comment the CDATA tag.
   *
   * @param \DOMNode $node
   *   The element potentially containing a CDATA node.
   * @param string $comment_start
   *   (optional) A string to use as a comment start marker to escape the CDATA
   *   declaration. Defaults to '//'.
   * @param string $comment_end
   *   (optional) A string to use as a comment end marker to escape the CDATA
   *   declaration. Defaults to an empty string.
   */
  public static function escapeCdataElement(\DOMNode $node, $comment_start = '//', $comment_end = '') {
    if (!$node->hasChildNodes()) {
      return;
    }

    // The wrappers do not depend on the individual child, so build them once.
    $embed_prefix = "\n<!--{$comment_start}--><![CDATA[{$comment_start} ><!--{$comment_end}\n";
    $embed_suffix = "\n{$comment_start}--><!]]>{$comment_end}\n";

    // $node->childNodes is a live list and the loop body both appends to and
    // removes from $node. Iterate over a snapshot so that the traversal does
    // not depend on how the live list reacts to those mutations, and so that
    // the freshly appended (already escaped) CDATA section is never revisited.
    foreach (iterator_to_array($node->childNodes) as $child_node) {
      if ($child_node instanceof \DOMCdataSection) {
        // Prevent invalid cdata escaping as this would throw a DOM error.
        // This is the same behavior as found in libxml2.
        // Related W3C standard: http://www.w3.org/TR/REC-xml/#dt-cdsection
        // Fix explanation: http://en.wikipedia.org/wiki/CDATA#Nesting
        $data = str_replace(']]>', ']]]]><![CDATA[>', $child_node->data);

        $fragment = $node->ownerDocument->createDocumentFragment();
        // Only swap the nodes when the replacement markup could be parsed;
        // otherwise keep the original CDATA section rather than dropping the
        // element's content.
        if ($fragment->appendXML($embed_prefix . $data . $embed_suffix)) {
          $node->appendChild($fragment);
          $node->removeChild($child_node);
        }
      }
    }
  }

  /**
   * Decodes all HTML entities including numerical ones to regular UTF-8 bytes.
   *
   * Double-escaped entities will only be decoded once ("&amp;lt;" becomes
   * "&lt;", not "<"). Be careful when using this function, as it will revert
   * previous sanitization efforts (&lt;script&gt; will become <script>).
   *
   * @param string $text
   *   The text to decode entities in.
   *
   * @return string
   *   The input $text, with all HTML entities decoded once.
   */
  public static function decodeEntities($text) {
    return html_entity_decode($text, ENT_QUOTES, 'UTF-8');
  }

}
|