<?php

/**
 * @file
 * Contains \Drupal\Component\Utility\UrlHelper.
 */

namespace Drupal\Component\Utility ;
/**
 * Helper class for URL based methods.
 *
 * @ingroup utility
 */
class UrlHelper {

  /**
   * The list of allowed protocols.
   *
   * @var array
   */
  protected static $allowedProtocols = array('http', 'https');

  /**
   * Parses an array into a valid, rawurlencoded query string.
   *
   * rawurlencode() is RFC3986 compliant, and as a consequence RFC3987
   * compliant. The latter defines the required format of "URLs" in HTML5.
   * urlencode() is almost the same as rawurlencode(), except that it encodes
   * spaces as "+" instead of "%20". This makes its result non compliant to
   * RFC3986 and as a consequence non compliant to RFC3987 and as a consequence
   * not valid as a "URL" in HTML5.
   *
   * Empty nested arrays contribute nothing to the result, so they never
   * produce empty "&&" segments.
   *
   * @todo Remove this function once PHP 5.4 is required as we can use just
   *   http_build_query() directly.
   *
   * @param array $query
   *   The query parameter array to be processed; for instance,
   *   \Drupal::request()->query->all().
   * @param string $parent
   *   (optional) Internal use only. Used to build the $query array key for
   *   nested items. Defaults to an empty string.
   *
   * @return string
   *   A rawurlencoded string which can be used as or appended to the URL query
   *   string.
   *
   * @ingroup php_wrappers
   */
  public static function buildQuery(array $query, $parent = '') {
    // Compare against the empty string instead of relying on truthiness, so
    // that a parent key of '0' is still recognized as a parent.
    $has_parent = ((string) $parent !== '');
    $params = array();

    foreach ($query as $key => $value) {
      $encoded_key = rawurlencode((string) $key);
      $key = ($has_parent ? $parent . '[' . $encoded_key . ']' : $encoded_key);

      // Recurse into children.
      if (is_array($value)) {
        $nested = static::buildQuery($value, $key);
        // An empty child array yields an empty string; skip it.
        if ($nested !== '') {
          $params[] = $nested;
        }
      }
      // If a query parameter value is NULL, only append its key.
      elseif (!isset($value)) {
        $params[] = $key;
      }
      else {
        // For better readability of paths in query strings, we decode slashes.
        $params[] = $key . '=' . str_replace('%2F', '/', rawurlencode((string) $value));
      }
    }

    return implode('&', $params);
  }

  /**
   * Filters a URL query parameter array to remove unwanted elements.
   *
   * @param array $query
   *   An array to be processed.
   * @param array $exclude
   *   (optional) A list of $query array keys to remove. Use "parent[child]" to
   *   exclude nested items.
   * @param string $parent
   *   Internal use only. Used to build the $query array key for nested items.
   *
   * @return array
   *   An array containing query parameters.
   */
  public static function filterQueryParameters(array $query, array $exclude = array(), $parent = '') {
    // If $exclude is empty, there is nothing to filter.
    if (empty($exclude)) {
      return $query;
    }

    // Compare against the empty string instead of relying on truthiness, so
    // that a nested call whose parent key is 0 is not mistaken for the
    // top-level call.
    $is_nested = ((string) $parent !== '');
    if (!$is_nested) {
      // Only the top-level call receives the plain list; recursive calls get
      // the already flipped lookup map.
      $exclude = array_flip($exclude);
    }

    $params = array();
    foreach ($query as $key => $value) {
      $string_key = ($is_nested ? $parent . '[' . $key . ']' : (string) $key);
      if (isset($exclude[$string_key])) {
        continue;
      }

      if (is_array($value)) {
        $params[$key] = static::filterQueryParameters($value, $exclude, $string_key);
      }
      else {
        $params[$key] = $value;
      }
    }

    return $params;
  }

  /**
   * Parses a URL string into its path, query, and fragment components.
   *
   * This function splits both internal paths like @code node?b=c#d @endcode and
   * external URLs like @code https://example.com/a?b=c#d @endcode into their
   * component parts. See
   * @link http://tools.ietf.org/html/rfc3986#section-3 RFC 3986 @endlink for an
   * explanation of what the component parts are.
   *
   * Note that, unlike the RFC, when passed an external URL, this function
   * groups the scheme, authority, and path together into the path component.
   *
   * @param string $url
   *   The internal path or external URL string to parse.
   *
   * @return array
   *   An associative array containing:
   *   - path: The path component of $url. If $url is an external URL, this
   *     includes the scheme, authority, and path. NULL if no path could be
   *     determined (for instance for 'http://').
   *   - query: An array of query parameters from $url, if they exist.
   *   - fragment: The fragment component from $url, if it exists.
   *
   * @see \Drupal\Core\Utility\LinkGenerator
   * @see http://tools.ietf.org/html/rfc3986
   *
   * @ingroup php_wrappers
   */
  public static function parse($url) {
    $options = array(
      'path' => NULL,
      'query' => array(),
      'fragment' => '',
    );

    // External URLs: not using parse_url() here, so we do not have to rebuild
    // the scheme, host, and path without having any use for it.
    if (strpos($url, '://') !== FALSE) {
      // Split off the fragment first. The fragment is everything after the
      // first '#', whether or not a query string is present.
      if (strpos($url, '#') !== FALSE) {
        list($url, $options['fragment']) = explode('#', $url, 2);
      }

      // Split off everything before the query string into 'path'. Limit the
      // split so that a literal '?' inside the query string is preserved.
      $parts = explode('?', $url, 2);

      // Don't support URLs without a path, like 'http://'.
      list(, $path) = explode('://', $parts[0], 2);
      if ($path != '') {
        $options['path'] = $parts[0];
      }

      // If there is a query string, transform it into keyed query parameters.
      if (isset($parts[1])) {
        parse_str($parts[1], $options['query']);
      }
    }
    // Internal URLs.
    else {
      // parse_url() does not support relative URLs, so make it absolute. For
      // instance, the relative URL "foo/bar:1" isn't properly parsed.
      $parts = parse_url('http://example.com/' . $url);

      // parse_url() returns FALSE for seriously malformed URLs; in that case
      // the defaults are returned unchanged.
      if (is_array($parts)) {
        if (isset($parts['path'])) {
          // Strip the leading slash that was just added.
          $options['path'] = substr($parts['path'], 1);
        }
        if (isset($parts['query'])) {
          parse_str($parts['query'], $options['query']);
        }
        if (isset($parts['fragment'])) {
          $options['fragment'] = $parts['fragment'];
        }
      }
    }

    return $options;
  }

  /**
   * Encodes a Drupal path for use in a URL.
   *
   * For aesthetic reasons slashes are not escaped.
   *
   * @param string $path
   *   The Drupal path to encode.
   *
   * @return string
   *   The encoded path.
   */
  public static function encodePath($path) {
    return str_replace('%2F', '/', rawurlencode($path));
  }

  /**
   * Determines whether a path is external to Drupal.
   *
   * An example of an external path is http://example.com. If a path cannot be
   * assessed by Drupal's menu handler, then we must treat it as potentially
   * insecure.
   *
   * @param string $path
   *   The internal path or external URL being linked to, such as "node/34" or
   *   "http://example.com/foo".
   *
   * @return bool
   *   TRUE or FALSE, where TRUE indicates an external path.
   */
  public static function isExternal($path) {
    $colonpos = strpos($path, ':');
    // Some browsers treat \ as / so normalize to forward slashes. This swaps
    // one byte for one byte, so $colonpos remains valid afterwards.
    $path = str_replace('\\', '/', $path);
    // If the path starts with 2 slashes then it is always considered an
    // external URL without an explicit protocol part.
    return (strpos($path, '//') === 0)
      // Leading control characters may be ignored or mishandled by browsers,
      // so assume such a path may lead to an external location. The \p{C}
      // character class matches all UTF-8 control, unassigned, and private
      // characters.
      || (preg_match('/^\p{C}/u', $path) !== 0)
      // Avoid calling static::stripDangerousProtocols() if there is any slash
      // (/), hash (#) or question_mark (?) before the colon (:) occurrence -
      // if any - as this would clearly mean it is not a URL.
      || ($colonpos !== FALSE
        && !preg_match('![/?#]!', substr($path, 0, $colonpos))
        && static::stripDangerousProtocols($path) === $path);
  }

  /**
   * Determines if an external URL points to this installation.
   *
   * @param string $url
   *   A string containing an external URL, such as "http://example.com/foo".
   * @param string $base_url
   *   The base URL string to check against, such as "http://example.com/".
   *
   * @return bool
   *   TRUE if the URL has the same domain and base path.
   *
   * @throws \InvalidArgumentException
   *   Exception thrown when either $url or $base_url is not fully qualified.
   */
  public static function externalIsLocal($url, $base_url) {
    $url_parts = parse_url($url);
    $base_parts = parse_url($base_url);

    if (empty($base_parts['host']) || empty($url_parts['host'])) {
      throw new \InvalidArgumentException('A path was passed when a fully qualified domain was expected.');
    }

    // Use strict comparison: with "==", numeric-looking host strings such as
    // "1e3" and "1000" would be considered equal.
    $same_host = ($url_parts['host'] === $base_parts['host']);

    if (!isset($url_parts['path']) || !isset($base_parts['path'])) {
      return (!isset($base_parts['path']) || $base_parts['path'] === '/')
        && $same_host;
    }

    // When comparing base paths, we need a trailing slash to make sure a
    // partial URL match isn't occurring. Since base_path() always returns
    // with a trailing slash, we don't need to add the trailing slash here.
    return ($same_host && stripos($url_parts['path'], $base_parts['path']) === 0);
  }

  /**
   * Processes an HTML attribute value and strips dangerous protocols from URLs.
   *
   * @param string $string
   *   The string with the attribute value.
   *
   * @return string
   *   Cleaned up and HTML-escaped version of $string.
   */
  public static function filterBadProtocol($string) {
    // Get the plain text representation of the attribute value (i.e. its
    // meaning).
    $string = Html::decodeEntities($string);
    return Html::escape(static::stripDangerousProtocols($string));
  }

  /**
   * Gets the allowed protocols.
   *
   * @return array
   *   An array of protocols, for example http, https and irc.
   */
  public static function getAllowedProtocols() {
    return static::$allowedProtocols;
  }

  /**
   * Sets the allowed protocols.
   *
   * @param array $protocols
   *   An array of protocols, for example http, https and irc.
   */
  public static function setAllowedProtocols(array $protocols = array()) {
    static::$allowedProtocols = $protocols;
  }

  /**
   * Strips dangerous protocols (for example, 'javascript:') from a URI.
   *
   * This function must be called for all URIs within user-entered input prior
   * to being output to an HTML attribute value. It is often called as part of
   * \Drupal\Component\Utility\UrlHelper::filterBadProtocol() or
   * \Drupal\Component\Utility\Xss::filter(), but those functions return an
   * HTML-encoded string, so this function can be called independently when the
   * output needs to be a plain-text string for passing to functions that will
   * call Html::escape() separately. The exact behavior depends on the value:
   * - If the value is a well-formed (per RFC 3986) relative URL or
   *   absolute URL that does not use a dangerous protocol (like
   *   "javascript:"), then the URL remains unchanged. This includes all
   *   URLs generated via Url::toString() and UrlGeneratorTrait::url().
   * - If the value is a well-formed absolute URL with a dangerous protocol,
   *   the protocol is stripped. This process is repeated on the remaining URL
   *   until it is stripped down to a safe protocol.
   * - If the value is not a well-formed URL, the same sanitization behavior as
   *   for well-formed URLs will be invoked, which strips most substrings that
   *   precede a ":". The result can be used in URL attributes such as "href"
   *   or "src" (only after calling Html::escape() separately), but this may not
   *   produce valid HTML (for example, malformed URLs within "href" attributes
   *   fail HTML validation). This can be avoided by using
   *   Url::fromUri($possibly_not_a_url)->toString(), which either throws an
   *   exception or returns a well-formed URL.
   *
   * @param string $uri
   *   A plain-text URI that might contain dangerous protocols.
   *
   * @return string
   *   A plain-text URI stripped of dangerous protocols. As with all plain-text
   *   strings, this return value must not be output to an HTML page without
   *   being sanitized first. However, it can be passed to functions
   *   expecting plain-text strings.
   *
   * @see \Drupal\Component\Utility\Html::escape()
   * @see \Drupal\Core\Url::toString()
   * @see \Drupal\Core\Routing\UrlGeneratorTrait::url()
   * @see \Drupal\Core\Url::fromUri()
   */
  public static function stripDangerousProtocols($uri) {
    $allowed_protocols = array_flip(static::$allowedProtocols);

    // Iteratively remove any invalid protocol found.
    do {
      $before = $uri;
      $colonpos = strpos($uri, ':');
      if ($colonpos > 0) {
        // We found a colon, possibly a protocol. Verify.
        $protocol = substr($uri, 0, $colonpos);
        // If a colon is preceded by a slash, question mark or hash, it cannot
        // possibly be part of the URL scheme. This must be a relative URL, which
        // inherits the (safe) protocol of the base document.
        if (preg_match('![/?#]!', $protocol)) {
          break;
        }
        // Check if this is a disallowed protocol. Per RFC2616, section 3.2.3
        // (URI Comparison) scheme comparison must be case-insensitive.
        if (!isset($allowed_protocols[strtolower($protocol)])) {
          // Cast to string: on older PHP versions substr() returns FALSE when
          // nothing follows the colon.
          $uri = (string) substr($uri, $colonpos + 1);
        }
      }
    } while ($before !== $uri);

    return $uri;
  }

  /**
   * Verifies the syntax of the given URL.
   *
   * This function should only be used on actual URLs. It should not be used for
   * Drupal menu paths, which can contain arbitrary characters.
   * Valid values per RFC 3986.
   *
   * @param string $url
   *   The URL to verify.
   * @param bool $absolute
   *   Whether the URL is absolute (beginning with a scheme such as "http:").
   *
   * @return bool
   *   TRUE if the URL is in a valid format, FALSE otherwise.
   */
  public static function isValid($url, $absolute = FALSE) {
    if ($absolute) {
      return (bool) preg_match("
        /^                                                      # Start at the beginning of the text
        (?:ftp|https?|feed):\/\/                                # Look for ftp, http, https or feed schemes
        (?:                                                     # Userinfo (optional) which is typically
          (?:(?:[\w\.\-\+!$&'\(\)*\+,;=]|%[0-9a-f]{2})+:)*      # a username or a username and password
          (?:[\w\.\-\+%!$&'\(\)*\+,;=]|%[0-9a-f]{2})+@          # combination
        )?
        (?:
          (?:[a-z0-9\-\.]|%[0-9a-f]{2})+                        # A domain name or a IPv4 address
          |(?:\[(?:[0-9a-f]{0,4}:)*(?:[0-9a-f]{0,4})\])         # or a well formed IPv6 address
        )
        (?::[0-9]+)?                                            # Server port number (optional)
        (?:[\/|\?]
          (?:[\w#!:\.\?\+=&@$'~*,;\/\(\)\[\]\-]|%[0-9a-f]{2})   # The path and query (optional)
        *)?
      $/xi", $url);
    }
    else {
      return (bool) preg_match("/^(?:[\w#!:\.\?\+=&@$'~*,;\/\(\)\[\]\-]|%[0-9a-f]{2})+$/i", $url);
    }
  }

}