2015-08-17 17:00:26 -07:00
< ? php
namespace Drupal\Component\Utility ;
/**
 * Provides helper to filter for cross-site scripting.
 *
 * @ingroup utility
 */
class Xss {

  /**
   * The list of HTML tags allowed by filterAdmin().
   *
   * @var array
   *
   * @see \Drupal\Component\Utility\Xss::filterAdmin()
   */
  protected static $adminTags = array(
    'a', 'abbr', 'acronym', 'address', 'article', 'aside', 'b', 'bdi', 'bdo',
    'big', 'blockquote', 'br', 'caption', 'cite', 'code', 'col', 'colgroup',
    'command', 'dd', 'del', 'details', 'dfn', 'div', 'dl', 'dt', 'em',
    'figcaption', 'figure', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'header', 'hgroup', 'hr', 'i', 'img', 'ins', 'kbd', 'li', 'mark', 'menu',
    'meter', 'nav', 'ol', 'output', 'p', 'pre', 'progress', 'q', 'rp', 'rt',
    'ruby', 's', 'samp', 'section', 'small', 'span', 'strong', 'sub',
    'summary', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'time',
    'tr', 'tt', 'u', 'ul', 'var', 'wbr',
  );

  /**
   * The default list of HTML tags allowed by filter().
   *
   * @var array
   *
   * @see \Drupal\Component\Utility\Xss::filter()
   */
  protected static $htmlTags = array('a', 'em', 'strong', 'cite', 'blockquote', 'code', 'ul', 'ol', 'li', 'dl', 'dt', 'dd');

  /**
   * Filters HTML to prevent cross-site-scripting (XSS) vulnerabilities.
   *
   * Based on kses by Ulf Harnhammar, see http://sourceforge.net/projects/kses.
   * For examples of various XSS attacks, see: http://ha.ckers.org/xss.html.
   *
   * This code does four things:
   * - Removes characters and constructs that can trick browsers.
   * - Makes sure all HTML entities are well-formed.
   * - Makes sure all HTML tags and attributes are well-formed.
   * - Makes sure no HTML tags contain URLs with a disallowed protocol (e.g.
   *   javascript:).
   *
   * @param string $string
   *   The string with raw HTML in it. It will be stripped of everything that
   *   can cause an XSS attack.
   * @param array $html_tags
   *   (optional) An array of allowed HTML tag names. When omitted or NULL, the
   *   default list in static::$htmlTags is used.
   *
   * @return string
   *   An XSS safe version of $string, or an empty string if $string is not
   *   valid UTF-8.
   *
   * @see \Drupal\Component\Utility\Unicode::validateUtf8()
   *
   * @ingroup sanitization
   */
  public static function filter($string, array $html_tags = NULL) {
    if (is_null($html_tags)) {
      $html_tags = static::$htmlTags;
    }
    // Only operate on valid UTF-8 strings. This is necessary to prevent cross
    // site scripting issues on Internet Explorer 6.
    if (!Unicode::validateUtf8($string)) {
      return '';
    }
    // Remove NULL characters (ignored by some browsers).
    $string = str_replace(chr(0), '', $string);
    // Remove Netscape 4 JS entities.
    $string = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string);

    // Defuse all HTML entities: every ampersand becomes "&amp;" first, so that
    // only the well-formed entities matched below are turned back.
    $string = str_replace('&', '&amp;', $string);
    // Change back only well-formed entities in our whitelist:
    // Decimal numeric entities.
    $string = preg_replace('/&amp;#([0-9]+;)/', '&#\1', $string);
    // Hexadecimal numeric entities.
    $string = preg_replace('/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\1', $string);
    // Named entities.
    $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]*;)/', '&\1', $string);

    // Flip so that tag names are keys; needsRemoval() checks with isset().
    $html_tags = array_flip($html_tags);
    // Late static binding does not work inside anonymous functions.
    $class = get_called_class();
    $splitter = function ($matches) use ($html_tags, $class) {
      return $class::split($matches[1], $html_tags, $class);
    };
    // Strip any tags that are not in the whitelist.
    return preg_replace_callback('%
      (
      <(?=[^a-zA-Z!/])  # a lone <
      |                 # or
      <!--.*?-->        # a comment
      |                 # or
      <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
      |                 # or
      >                 # just a >
      )%x', $splitter, $string);
  }

  /**
   * Applies a very permissive XSS/HTML filter for admin-only use.
   *
   * Use only for fields where it is impractical to use the
   * whole filter system, but where some (mainly inline) mark-up
   * is desired (so \Drupal\Component\Utility\Html::escape() is
   * not acceptable).
   *
   * Allows all tags that can be used inside an HTML body, save
   * for scripts and styles.
   *
   * @param string $string
   *   The string to apply the filter to.
   *
   * @return string
   *   The filtered string.
   *
   * @ingroup sanitization
   *
   * @see \Drupal\Component\Utility\Xss::getAdminTagList()
   */
  public static function filterAdmin($string) {
    return static::filter($string, static::$adminTags);
  }

  /**
   * Processes an HTML tag.
   *
   * @param string $string
   *   The HTML tag to process.
   * @param array $html_tags
   *   An array where the keys are the allowed tags and the values are not
   *   used.
   * @param string $class
   *   The called class. This method is called from an anonymous function which
   *   breaks late static binding. See https://bugs.php.net/bug.php?id=66622 for
   *   more information.
   *
   * @return string
   *   If the element isn't allowed, an empty string. Otherwise, the cleaned up
   *   version of the HTML element. A lone "<" or ">" is returned as the
   *   corresponding HTML entity.
   */
  protected static function split($string, $html_tags, $class) {
    if (substr($string, 0, 1) != '<') {
      // We matched a lone ">" character.
      return '&gt;';
    }
    elseif (strlen($string) == 1) {
      // We matched a lone "<" character.
      return '&lt;';
    }

    if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9\-]+)\s*([^>]*)>?|(<!--.*?-->)$%', $string, $matches)) {
      // Seriously malformed.
      return '';
    }

    $slash = trim($matches[1]);
    $elem = $matches[2];
    $attrlist = $matches[3];
    // Group 4 is only present when the comment branch of the pattern matched.
    $comment = isset($matches[4]) ? $matches[4] : '';

    if ($comment) {
      // Comments are looked up in the tag list under the pseudo tag "!--".
      $elem = '!--';
    }

    // When in whitelist mode, an element is disallowed when not listed.
    if ($class::needsRemoval($html_tags, $elem)) {
      return '';
    }

    if ($comment) {
      return $comment;
    }

    if ($slash != '') {
      // Closing tags never carry attributes.
      return "</$elem>";
    }

    // Is there a closing XHTML slash at the end of the attributes?
    $attrlist = preg_replace('%(\s?)/\s*$%', '\1', $attrlist, -1, $count);
    $xhtml_slash = $count ? ' /' : '';

    // Clean up attributes.
    $attr2 = implode(' ', $class::attributes($attrlist));
    $attr2 = preg_replace('/[<>]/', '', $attr2);
    $attr2 = strlen($attr2) ? ' ' . $attr2 : '';

    return "<$elem$attr2$xhtml_slash>";
  }

  /**
   * Processes a string of HTML attributes.
   *
   * The string is consumed from the left by a small state machine:
   * - mode 0 expects an attribute name,
   * - mode 1 expects an equals sign or whitespace (valueless attribute),
   * - mode 2 expects an attribute value (double-quoted, single-quoted or
   *   unquoted).
   * Anything that does not fit the current mode is stripped and parsing
   * restarts in mode 0.
   *
   * Attributes named "style" or starting with "on" are dropped. Values are
   * passed through UrlHelper::filterBadProtocol() unless the attribute name
   * starts with "data-" or is one of title, alt, rel, property.
   *
   * @param string $attributes
   *   The HTML attribute string to process.
   *
   * @return array
   *   The cleaned up attributes, one array item per attribute, either as a
   *   bare name or as name="value" / name='value'.
   */
  protected static function attributes($attributes) {
    $attributes_array = array();
    $mode = 0;
    $attribute_name = '';
    $skip = FALSE;
    $skip_protocol_filtering = FALSE;

    while (strlen($attributes) != 0) {
      // Was the last operation successful?
      $working = 0;

      switch ($mode) {
        case 0:
          // Attribute name, href for instance.
          if (preg_match('/^([-a-zA-Z][-a-zA-Z0-9]*)/', $attributes, $match)) {
            $attribute_name = strtolower($match[1]);
            $skip = ($attribute_name == 'style' || substr($attribute_name, 0, 2) == 'on');

            // Values for attributes of type URI should be filtered for
            // potentially malicious protocols (for example, an href-attribute
            // starting with "javascript:"). However, for some non-URI
            // attributes performing this filtering causes valid and safe data
            // to be mangled. We prevent this by skipping protocol filtering on
            // such attributes.
            // @see \Drupal\Component\Utility\UrlHelper::filterBadProtocol()
            // @see http://www.w3.org/TR/html4/index/attributes.html
            $skip_protocol_filtering = substr($attribute_name, 0, 5) === 'data-' || in_array($attribute_name, array(
              'title',
              'alt',
              'rel',
              'property',
            ), TRUE);

            $working = $mode = 1;
            $attributes = preg_replace('/^[-a-zA-Z][-a-zA-Z0-9]*/', '', $attributes);
          }
          break;

        case 1:
          // Equals sign or valueless ("selected").
          if (preg_match('/^\s*=\s*/', $attributes)) {
            $working = 1;
            $mode = 2;
            $attributes = preg_replace('/^\s*=\s*/', '', $attributes);
            break;
          }

          if (preg_match('/^\s+/', $attributes)) {
            $working = 1;
            $mode = 0;
            if (!$skip) {
              $attributes_array[] = $attribute_name;
            }
            $attributes = preg_replace('/^\s+/', '', $attributes);
          }
          break;

        case 2:
          // Attribute value, a URL after href= for instance.
          // Double-quoted value.
          if (preg_match('/^"([^"]*)"(\s+|$)/', $attributes, $match)) {
            $thisval = $skip_protocol_filtering ? $match[1] : UrlHelper::filterBadProtocol($match[1]);

            if (!$skip) {
              $attributes_array[] = "$attribute_name=\"$thisval\"";
            }
            $working = 1;
            $mode = 0;
            $attributes = preg_replace('/^"[^"]*"(\s+|$)/', '', $attributes);
            break;
          }

          // Single-quoted value; the single quotes are kept in the output.
          if (preg_match('/^\'([^\']*)\'(\s+|$)/', $attributes, $match)) {
            $thisval = $skip_protocol_filtering ? $match[1] : UrlHelper::filterBadProtocol($match[1]);

            if (!$skip) {
              $attributes_array[] = "$attribute_name='$thisval'";
            }
            $working = 1;
            $mode = 0;
            $attributes = preg_replace('/^\'[^\']*\'(\s+|$)/', '', $attributes);
            break;
          }

          // Unquoted value; it is emitted wrapped in double quotes.
          if (preg_match('%^([^\s"\']+)(\s+|$)%', $attributes, $match)) {
            $thisval = $skip_protocol_filtering ? $match[1] : UrlHelper::filterBadProtocol($match[1]);

            if (!$skip) {
              $attributes_array[] = "$attribute_name=\"$thisval\"";
            }
            $working = 1;
            $mode = 0;
            $attributes = preg_replace('%^[^\s"\']+(\s+|$)%', '', $attributes);
          }
          break;
      }

      if ($working == 0) {
        // Not well formed; remove and try again.
        // NOTE(review): the alternation below contains an empty branch (the
        // trailing "|" after the single-quote branch). It is reproduced
        // unchanged on purpose: dropping it looks like it would change how
        // much malformed input a single pass consumes. Confirm against the
        // test suite before touching it.
        $attributes = preg_replace('/
          ^
          (
          "[^"]*("|$)     # - a string that starts with a double quote, up until the next double quote or the end of the string
          |               # or
          \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
          |               # or
          \S              # - a non-whitespace character
          )*              # any number of the above three
          \s*             # any number of whitespaces
          /x', '', $attributes);
        $mode = 0;
      }
    }

    // The attribute list ends with a valueless attribute like "selected".
    if ($mode == 1 && !$skip) {
      $attributes_array[] = $attribute_name;
    }
    return $attributes_array;
  }

  /**
   * Whether this element needs to be removed altogether.
   *
   * @param array $html_tags
   *   The list of HTML tags, keyed by lower-case tag name.
   * @param string $elem
   *   The name of the HTML element. It is lower-cased before the lookup.
   *
   * @return bool
   *   TRUE if this element needs to be removed.
   */
  protected static function needsRemoval($html_tags, $elem) {
    return !isset($html_tags[strtolower($elem)]);
  }

  /**
   * Gets the list of HTML tags allowed by Xss::filterAdmin().
   *
   * @return array
   *   The list of HTML tags allowed by filterAdmin().
   */
  public static function getAdminTagList() {
    return static::$adminTags;
  }

  /**
   * Gets the standard list of HTML tags allowed by Xss::filter().
   *
   * @return array
   *   The list of HTML tags allowed by Xss::filter().
   */
  public static function getHtmlTagList() {
    return static::$htmlTags;
  }

}