[ Index ]

PHP Cross Reference of phpBB-3.3.14-deutsch

title

Body

[close]

/vendor/s9e/text-formatter/src/Parser/AttributeFilters/ -> UrlFilter.php (source)

   1  <?php
   2  
   3  /**
   4  * @package   s9e\TextFormatter
   5  * @copyright Copyright (c) 2010-2022 The s9e authors
   6  * @license   http://www.opensource.org/licenses/mit-license.php The MIT License
   7  */
   8  namespace s9e\TextFormatter\Parser\AttributeFilters;
   9  
  10  use s9e\TextFormatter\Parser\Logger;
  11  
  12  class UrlFilter
  13  {
  14      /**
  15      * Filter a URL
  16      *
  17      * @param  mixed  $attrValue Original URL
  18      * @param  array  $urlConfig URL config
  19      * @param  Logger $logger    Parser's logger
  20      * @return mixed             Cleaned up URL if valid, FALSE otherwise
  21      */
  22  	public static function filter($attrValue, array $urlConfig, Logger $logger = null)
  23      {
  24          /**
  25          * Trim the URL to conform with HTML5 then parse it
  26          * @link http://dev.w3.org/html5/spec/links.html#attr-hyperlink-href
  27          */
  28          $p = self::parseUrl(trim($attrValue));
  29  
  30          $error = self::validateUrl($urlConfig, $p);
  31          if (!empty($error))
  32          {
  33              if (isset($logger))
  34              {
  35                  $p['attrValue'] = $attrValue;
  36                  $logger->err($error, $p);
  37              }
  38  
  39              return false;
  40          }
  41  
  42          return self::rebuildUrl($p);
  43      }
  44  
  45      /**
  46      * Parse a URL and return its components
  47      *
  48      * Similar to PHP's own parse_url() except that all parts are always returned
  49      *
  50      * @param  string $url Original URL
  51      * @return array
  52      */
  53  	protected static function parseUrl($url)
  54      {
  55          $regexp = '(^(?:([a-z][-+.\\w]*):)?(?://(?:([^:/?#]*)(?::([^/?#]*)?)?@)?(?:(\\[[a-f\\d:]+\\]|[^:/?#]+)(?::(\\d*))?)?(?![^/?#]))?([^?#]*)(\\?[^#]*)?(#.*)?$)Di';
  56  
  57          // NOTE: this regexp always matches because of the last three captures
  58          preg_match($regexp, $url, $m);
  59  
  60          $parts  = [];
  61          $tokens = ['scheme', 'user', 'pass', 'host', 'port', 'path', 'query', 'fragment'];
  62          foreach ($tokens as $i => $name)
  63          {
  64              $parts[$name] = $m[$i + 1] ?? '';
  65          }
  66  
  67          /**
  68          * @link http://tools.ietf.org/html/rfc3986#section-3.1
  69          *
  70          * 'An implementation should accept uppercase letters as equivalent to lowercase in
  71          * scheme names (e.g., allow "HTTP" as well as "http") for the sake of robustness but
  72          * should only produce lowercase scheme names for consistency.'
  73          */
  74          $parts['scheme'] = strtolower($parts['scheme']);
  75  
  76          /**
  77          * Normalize the domain label separators and remove trailing dots
  78          * @link http://url.spec.whatwg.org/#domain-label-separators
  79          */
  80          $parts['host'] = rtrim(preg_replace("/\xE3\x80\x82|\xEF(?:\xBC\x8E|\xBD\xA1)/s", '.', $parts['host']), '.');
  81  
  82          // Test whether host has non-ASCII characters and punycode it if possible
  83          if (preg_match('#[^[:ascii:]]#', $parts['host']) && function_exists('idn_to_ascii'))
  84          {
  85              $variant = (defined('INTL_IDNA_VARIANT_UTS46')) ? INTL_IDNA_VARIANT_UTS46 : 0;
  86              $parts['host'] = idn_to_ascii($parts['host'], 0, $variant);
  87          }
  88  
  89          return $parts;
  90      }
  91  
  92      /**
  93      * Rebuild a parsed URL
  94      *
  95      * @param  array  $p Parsed URL
  96      * @return string
  97      */
  98  	protected static function rebuildUrl(array $p)
  99      {
 100          $url = '';
 101          if ($p['scheme'] !== '')
 102          {
 103              $url .= $p['scheme'] . ':';
 104          }
 105          if ($p['host'] !== '')
 106          {
 107              $url .= '//';
 108  
 109              // Add the credentials if applicable
 110              if ($p['user'] !== '')
 111              {
 112                  // Reencode the credentials in case there are invalid chars in them, or suspicious
 113                  // characters such as : or @ that could confuse a browser into connecting to the
 114                  // wrong host (or at least, to a host that is different than the one we thought)
 115                  $url .= rawurlencode(urldecode($p['user']));
 116  
 117                  if ($p['pass'] !== '')
 118                  {
 119                      $url .= ':' . rawurlencode(urldecode($p['pass']));
 120                  }
 121  
 122                  $url .= '@';
 123              }
 124  
 125              $url .= $p['host'];
 126  
 127              // Append the port number (note that as per the regexp it can only contain digits)
 128              if ($p['port'] !== '')
 129              {
 130                  $url .= ':' . $p['port'];
 131              }
 132          }
 133          elseif ($p['scheme'] === 'file')
 134          {
 135              // Allow the file: scheme to not have a host and ensure it starts with slashes
 136              $url .= '//';
 137          }
 138  
 139          // Build the path, including the query and fragment parts
 140          $path = $p['path'] . $p['query'] . $p['fragment'];
 141  
 142          /**
 143          * "For consistency, URI producers and normalizers should use uppercase hexadecimal digits
 144          * for all percent- encodings."
 145          *
 146          * @link http://tools.ietf.org/html/rfc3986#section-2.1
 147          */
 148          $path = preg_replace_callback(
 149              '/%.?[a-f]/',
 150              function ($m)
 151              {
 152                  return strtoupper($m[0]);
 153              },
 154              $path
 155          );
 156  
 157          // Append the sanitized path to the URL
 158          $url .= self::sanitizeUrl($path);
 159  
 160          // Replace the first colon if there's no scheme and it could potentially be interpreted as
 161          // the scheme separator
 162          if (!$p['scheme'])
 163          {
 164              $url = preg_replace('#^([^/]*):#', '$1%3A', $url);
 165          }
 166  
 167          return $url;
 168      }
 169  
 170      /**
 171      * Sanitize a URL for safe use regardless of context
 172      *
 173      * This method URL-encodes some sensitive characters in case someone would want to use the URL in
 174      * some JavaScript thingy, or in CSS. We also encode characters that are not allowed in the path
 175      * of a URL as defined in RFC 3986 appendix A, including percent signs that are not immediately
 176      * followed by two hex digits.
 177      *
 178      * " and ' to prevent breaking out of quotes (JavaScript or otherwise)
 179      * ( and ) to prevent the use of functions in JavaScript (eval()) or CSS (expression())
 180      * < and > to prevent breaking out of <script>
 181      * \r and \n because they're illegal in JavaScript
 182      * [ and ] because the W3 validator rejects them and they "should" be escaped as per RFC 3986
 183      * Non-ASCII characters as per RFC 3986
 184      * Control codes and spaces, as per RFC 3986
 185      *
 186      * @link http://sla.ckers.org/forum/read.php?2,51478
 187      * @link http://timelessrepo.com/json-isnt-a-javascript-subset
 188      * @link http://www.ietf.org/rfc/rfc3986.txt
 189      * @link http://stackoverflow.com/a/1547922
 190      * @link http://tools.ietf.org/html/rfc3986#appendix-A
 191      *
 192      * @param  string $url Original URL
 193      * @return string      Sanitized URL
 194      */
 195  	public static function sanitizeUrl($url)
 196      {
 197          return preg_replace_callback(
 198              '/%(?![0-9A-Fa-f]{2})|[^!#-&*-;=?-Z_a-z~]/',
 199              function ($m)
 200              {
 201                  return rawurlencode($m[0]);
 202              },
 203              $url
 204          );
 205      }
 206  
 207      /**
 208      * Validate a parsed URL
 209      *
 210      * @param  array      $urlConfig URL config
 211      * @param  array      $p         Parsed URL
 212      * @return string|null           Error message if invalid, or NULL
 213      */
 214  	protected static function validateUrl(array $urlConfig, array $p)
 215      {
 216          if ($p['scheme'] !== '' && !preg_match($urlConfig['allowedSchemes'], $p['scheme']))
 217          {
 218              return 'URL scheme is not allowed';
 219          }
 220  
 221          if ($p['host'] !== '')
 222          {
 223              /**
 224              * Test whether the host is valid
 225              * @link http://tools.ietf.org/html/rfc1035#section-2.3.1
 226              * @link http://tools.ietf.org/html/rfc1123#section-2
 227              */
 228              $regexp = '/^(?!-)[-a-z0-9]{0,62}[a-z0-9](?:\\.(?!-)[-a-z0-9]{0,62}[a-z0-9])*$/i';
 229              if (!preg_match($regexp, $p['host']))
 230              {
 231                  // If the host invalid, retest as an IPv4 and IPv6 address (IPv6 in brackets)
 232                  if (!NetworkFilter::filterIpv4($p['host'])
 233                   && !NetworkFilter::filterIpv6(preg_replace('/^\\[(.*)\\]$/', '$1', $p['host'])))
 234                  {
 235                      return 'URL host is invalid';
 236                  }
 237              }
 238  
 239              if ((isset($urlConfig['disallowedHosts']) && preg_match($urlConfig['disallowedHosts'], $p['host']))
 240               || (isset($urlConfig['restrictedHosts']) && !preg_match($urlConfig['restrictedHosts'], $p['host'])))
 241              {
 242                  return 'URL host is not allowed';
 243              }
 244          }
 245          elseif (preg_match('(^(?:(?:f|ht)tps?)$)', $p['scheme']))
 246          {
 247              return 'Missing host';
 248          }
 249      }
 250  }


Generated: Mon Nov 25 19:05:08 2024 Cross-referenced by PHPXref 0.7.1