[ Index ] |
PHP Cross Reference of phpBB-3.3.14-deutsch |
[Summary view] [Print] [Text view]
<?php

/**
* @package   s9e\TextFormatter
* @copyright Copyright (c) 2010-2022 The s9e authors
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\TextFormatter\Parser\AttributeFilters;

use s9e\TextFormatter\Parser\Logger;

class UrlFilter
{
	/**
	* Filter a URL
	*
	* The value is trimmed, split into its components, validated against the URL config and,
	* if valid, rebuilt into a normalized and sanitized URL. If validation fails and a logger
	* was given, the error is logged along with the parsed components and the original value.
	*
	* @param  mixed       $attrValue Original URL
	* @param  array       $urlConfig URL config
	* @param  Logger|null $logger    Parser's logger
	* @return mixed                  Cleaned up URL if valid, FALSE otherwise
	*/
	public static function filter($attrValue, array $urlConfig, ?Logger $logger = null)
	{
		/**
		* Trim the URL to conform with HTML5 then parse it
		* @link http://dev.w3.org/html5/spec/links.html#attr-hyperlink-href
		*/
		$p = self::parseUrl(trim($attrValue));

		$error = self::validateUrl($urlConfig, $p);
		if (!empty($error))
		{
			if (isset($logger))
			{
				// Log the original value alongside the parsed components
				$p['attrValue'] = $attrValue;
				$logger->err($error, $p);
			}

			return false;
		}

		return self::rebuildUrl($p);
	}

	/**
	* Parse a URL and return its components
	*
	* Similar to PHP's own parse_url() except that all parts are always returned
	*
	* @param  string $url Original URL
	* @return array       Map with the keys scheme, user, pass, host, port, path, query and
	*                     fragment. Parts that are absent from the URL are empty strings
	*/
	protected static function parseUrl($url)
	{
		$regexp = '(^(?:([a-z][-+.\\w]*):)?(?://(?:([^:/?#]*)(?::([^/?#]*)?)?@)?(?:(\\[[a-f\\d:]+\\]|[^:/?#]+)(?::(\\d*))?)?(?![^/?#]))?([^?#]*)(\\?[^#]*)?(#.*)?$)Di';

		// NOTE: this regexp always matches because of the last three captures
		preg_match($regexp, $url, $m);

		$parts  = [];
		$tokens = ['scheme', 'user', 'pass', 'host', 'port', 'path', 'query', 'fragment'];
		foreach ($tokens as $i => $name)
		{
			// Capture N+1 holds token N; trailing captures that did not take part in the
			// match are missing from $m
			$parts[$name] = $m[$i + 1] ?? '';
		}

		/**
		* @link http://tools.ietf.org/html/rfc3986#section-3.1
		*
		* 'An implementation should accept uppercase letters as equivalent to lowercase in
		* scheme names (e.g., allow "HTTP" as well as "http") for the sake of robustness but
		* should only produce lowercase scheme names for consistency.'
		*/
		$parts['scheme'] = strtolower($parts['scheme']);

		/**
		* Normalize the domain label separators and remove trailing dots
		* @link http://url.spec.whatwg.org/#domain-label-separators
		*/
		$parts['host'] = rtrim(preg_replace("/\xE3\x80\x82|\xEF(?:\xBC\x8E|\xBD\xA1)/s", '.', $parts['host']), '.');

		// Test whether host has non-ASCII characters and punycode it if possible.
		// NOTE(review): idn_to_ascii() returns FALSE on failure, in which case the host is
		// stored as FALSE and is expected to be rejected by validateUrl() -- confirm
		if (preg_match('#[^[:ascii:]]#', $parts['host']) && function_exists('idn_to_ascii'))
		{
			$variant = (defined('INTL_IDNA_VARIANT_UTS46')) ? INTL_IDNA_VARIANT_UTS46 : 0;
			$parts['host'] = idn_to_ascii($parts['host'], 0, $variant);
		}

		return $parts;
	}

	/**
	* Rebuild a parsed URL
	*
	* @param  array  $p Parsed URL, as returned by parseUrl()
	* @return string
	*/
	protected static function rebuildUrl(array $p)
	{
		$url = '';
		if ($p['scheme'] !== '')
		{
			$url .= $p['scheme'] . ':';
		}
		if ($p['host'] !== '')
		{
			$url .= '//';

			// Add the credentials if applicable
			if ($p['user'] !== '')
			{
				// Reencode the credentials in case there are invalid chars in them, or suspicious
				// characters such as : or @ that could confuse a browser into connecting to the
				// wrong host (or at least, to a host that is different than the one we thought)
				$url .= rawurlencode(urldecode($p['user']));

				if ($p['pass'] !== '')
				{
					$url .= ':' . rawurlencode(urldecode($p['pass']));
				}

				$url .= '@';
			}

			$url .= $p['host'];

			// Append the port number (note that as per the regexp it can only contain digits)
			if ($p['port'] !== '')
			{
				$url .= ':' . $p['port'];
			}
		}
		elseif ($p['scheme'] === 'file')
		{
			// Allow the file: scheme to not have a host and ensure it starts with slashes
			$url .= '//';
		}

		// Build the path, including the query and fragment parts
		$path = $p['path'] . $p['query'] . $p['fragment'];

		/**
		* "For consistency, URI producers and normalizers should use uppercase hexadecimal digits
		* for all percent- encodings."
		*
		* Only well-formed percent-encodings (a percent sign followed by two hex digits) are
		* uppercased. A stray percent sign followed by other characters, e.g. "100%of", must
		* keep its original case because paths are case-sensitive
		*
		* @link http://tools.ietf.org/html/rfc3986#section-2.1
		*/
		$path = preg_replace_callback(
			'/%[0-9a-f]{2}/i',
			static function ($m)
			{
				return strtoupper($m[0]);
			},
			$path
		);

		// Append the sanitized path to the URL
		$url .= self::sanitizeUrl($path);

		// Replace the first colon if there's no scheme and it could potentially be interpreted as
		// the scheme separator
		if ($p['scheme'] === '')
		{
			$url = preg_replace('#^([^/]*):#', '$1%3A', $url);
		}

		return $url;
	}

	/**
	* Sanitize a URL for safe use regardless of context
	*
	* This method URL-encodes some sensitive characters in case someone would want to use the URL in
	* some JavaScript thingy, or in CSS. We also encode characters that are not allowed in the path
	* of a URL as defined in RFC 3986 appendix A, including percent signs that are not immediately
	* followed by two hex digits.
	*
	* " and ' to prevent breaking out of quotes (JavaScript or otherwise)
	* ( and ) to prevent the use of functions in JavaScript (eval()) or CSS (expression())
	* < and > to prevent breaking out of <script>
	* \r and \n because they're illegal in JavaScript
	* [ and ] because the W3 validator rejects them and they "should" be escaped as per RFC 3986
	* Non-ASCII characters as per RFC 3986
	* Control codes and spaces, as per RFC 3986
	*
	* @link http://sla.ckers.org/forum/read.php?2,51478
	* @link http://timelessrepo.com/json-isnt-a-javascript-subset
	* @link http://www.ietf.org/rfc/rfc3986.txt
	* @link http://stackoverflow.com/a/1547922
	* @link http://tools.ietf.org/html/rfc3986#appendix-A
	*
	* @param  string $url Original URL
	* @return string      Sanitized URL
	*/
	public static function sanitizeUrl($url)
	{
		return preg_replace_callback(
			'/%(?![0-9A-Fa-f]{2})|[^!#-&*-;=?-Z_a-z~]/',
			static function ($m)
			{
				return rawurlencode($m[0]);
			},
			$url
		);
	}

	/**
	* Validate a parsed URL
	*
	* @param  array       $urlConfig URL config. The allowedSchemes regexp is required whenever
	*                                the URL has a scheme; the disallowedHosts and restrictedHosts
	*                                regexps are optional
	* @param  array       $p         Parsed URL
	* @return string|null            Error message if invalid, or NULL
	*/
	protected static function validateUrl(array $urlConfig, array $p)
	{
		if ($p['scheme'] !== '' && !preg_match($urlConfig['allowedSchemes'], $p['scheme']))
		{
			return 'URL scheme is not allowed';
		}

		if ($p['host'] !== '')
		{
			/**
			* Test whether the host is valid
			* @link http://tools.ietf.org/html/rfc1035#section-2.3.1
			* @link http://tools.ietf.org/html/rfc1123#section-2
			*/
			$regexp = '/^(?!-)[-a-z0-9]{0,62}[a-z0-9](?:\\.(?!-)[-a-z0-9]{0,62}[a-z0-9])*$/i';
			if (!preg_match($regexp, $p['host']))
			{
				// If the host invalid, retest as an IPv4 and IPv6 address (IPv6 in brackets)
				if (!NetworkFilter::filterIpv4($p['host'])
				 && !NetworkFilter::filterIpv6(preg_replace('/^\\[(.*)\\]$/', '$1', $p['host'])))
				{
					return 'URL host is invalid';
				}
			}

			if ((isset($urlConfig['disallowedHosts']) && preg_match($urlConfig['disallowedHosts'], $p['host']))
			 || (isset($urlConfig['restrictedHosts']) && !preg_match($urlConfig['restrictedHosts'], $p['host'])))
			{
				return 'URL host is not allowed';
			}
		}
		elseif (preg_match('(^(?:(?:f|ht)tps?)$)', $p['scheme']))
		{
			// These schemes are meaningless without a host
			return 'Missing host';
		}
	}
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Mon Nov 25 19:05:08 2024 | Cross-referenced by PHPXref 0.7.1 |