diff options
Diffstat (limited to 'vendor/symfony/polyfill-intl-idn/Idn.php')
-rw-r--r-- | vendor/symfony/polyfill-intl-idn/Idn.php | 925 |
1 files changed, 925 insertions, 0 deletions
diff --git a/vendor/symfony/polyfill-intl-idn/Idn.php b/vendor/symfony/polyfill-intl-idn/Idn.php new file mode 100644 index 0000000..fee3026 --- /dev/null +++ b/vendor/symfony/polyfill-intl-idn/Idn.php @@ -0,0 +1,925 @@ +<?php + +/* + * This file is part of the Symfony package. + * + * (c) Fabien Potencier <fabien@symfony.com> and Trevor Rowbotham <trevor.rowbotham@pm.me> + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\Polyfill\Intl\Idn; + +use Exception; +use Normalizer; +use Symfony\Polyfill\Intl\Idn\Resources\unidata\DisallowedRanges; +use Symfony\Polyfill\Intl\Idn\Resources\unidata\Regex; + +/** + * @see https://www.unicode.org/reports/tr46/ + * + * @internal + */ +final class Idn +{ + public const ERROR_EMPTY_LABEL = 1; + public const ERROR_LABEL_TOO_LONG = 2; + public const ERROR_DOMAIN_NAME_TOO_LONG = 4; + public const ERROR_LEADING_HYPHEN = 8; + public const ERROR_TRAILING_HYPHEN = 0x10; + public const ERROR_HYPHEN_3_4 = 0x20; + public const ERROR_LEADING_COMBINING_MARK = 0x40; + public const ERROR_DISALLOWED = 0x80; + public const ERROR_PUNYCODE = 0x100; + public const ERROR_LABEL_HAS_DOT = 0x200; + public const ERROR_INVALID_ACE_LABEL = 0x400; + public const ERROR_BIDI = 0x800; + public const ERROR_CONTEXTJ = 0x1000; + public const ERROR_CONTEXTO_PUNCTUATION = 0x2000; + public const ERROR_CONTEXTO_DIGITS = 0x4000; + + public const INTL_IDNA_VARIANT_2003 = 0; + public const INTL_IDNA_VARIANT_UTS46 = 1; + + public const IDNA_DEFAULT = 0; + public const IDNA_ALLOW_UNASSIGNED = 1; + public const IDNA_USE_STD3_RULES = 2; + public const IDNA_CHECK_BIDI = 4; + public const IDNA_CHECK_CONTEXTJ = 8; + public const IDNA_NONTRANSITIONAL_TO_ASCII = 16; + public const IDNA_NONTRANSITIONAL_TO_UNICODE = 32; + + public const MAX_DOMAIN_SIZE = 253; + public const MAX_LABEL_SIZE = 63; + + public const BASE = 36; + public const TMIN = 1; + public const TMAX = 26; + public const SKEW = 38; + public const DAMP = 700; + public const INITIAL_BIAS = 72; + public const INITIAL_N = 128; + public const DELIMITER = '-'; + public const MAX_INT = 2147483647; + + /** + * Contains the numeric value of a basic code point (for use in representing integers) in the + * range 0 to BASE-1, or -1 if b is does not represent a value. + * + * @var array<int, int> + */ + private static $basicToDigit = [ + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, + + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + ]; + + /** + * @var array<int, int> + */ + private static $virama; + + /** + * @var array<int, string> + */ + private static $mapped; + + /** + * @var array<int, bool> + */ + private static $ignored; + + /** + * @var array<int, string> + */ + private static $deviation; + + /** + * @var array<int, bool> + */ + private static $disallowed; + + /** + * @var array<int, string> + */ + private static $disallowed_STD3_mapped; + + /** + * @var array<int, bool> + */ + private static $disallowed_STD3_valid; + + /** + * @var bool + */ + private static $mappingTableLoaded = false; + + /** + * @see https://www.unicode.org/reports/tr46/#ToASCII + * + * @param string $domainName + * @param int $options + * @param int $variant + * @param array $idna_info + * + * @return string|false + */ + public static function idn_to_ascii($domainName, $options = self::IDNA_DEFAULT, $variant = self::INTL_IDNA_VARIANT_UTS46, &$idna_info = []) + { + if (\PHP_VERSION_ID >= 70200 && self::INTL_IDNA_VARIANT_2003 === $variant) { + @trigger_error('idn_to_ascii(): INTL_IDNA_VARIANT_2003 is deprecated', \E_USER_DEPRECATED); + } + + $options = [ + 'CheckHyphens' => true, + 'CheckBidi' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 !== ($options & self::IDNA_CHECK_BIDI), + 'CheckJoiners' => self::INTL_IDNA_VARIANT_UTS46 === $variant && 0 !== ($options & self::IDNA_CHECK_CONTEXTJ), + 'UseSTD3ASCIIRules' => 0 !== ($options & self::IDNA_USE_STD3_RULES), + 'Transitional_Processing' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 === ($options & self::IDNA_NONTRANSITIONAL_TO_ASCII), + 'VerifyDnsLength' => true, + ]; + $info = new Info(); + $labels = self::process((string) $domainName, $options, $info); + + foreach ($labels as $i => $label) { + // Only convert labels to punycode that contain non-ASCII code points + if (1 === preg_match('/[^\x00-\x7F]/', $label)) { + try { + $label = 'xn--'.self::punycodeEncode($label); + } catch (Exception $e) { + $info->errors |= self::ERROR_PUNYCODE; + } + + $labels[$i] = $label; + } + } + + if ($options['VerifyDnsLength']) { + self::validateDomainAndLabelLength($labels, $info); + } + + $idna_info = [ + 'result' => implode('.', $labels), + 'isTransitionalDifferent' => $info->transitionalDifferent, + 'errors' => $info->errors, + ]; + + return 0 === $info->errors ? $idna_info['result'] : false; + } + + /** + * @see https://www.unicode.org/reports/tr46/#ToUnicode + * + * @param string $domainName + * @param int $options + * @param int $variant + * @param array $idna_info + * + * @return string|false + */ + public static function idn_to_utf8($domainName, $options = self::IDNA_DEFAULT, $variant = self::INTL_IDNA_VARIANT_UTS46, &$idna_info = []) + { + if (\PHP_VERSION_ID >= 70200 && self::INTL_IDNA_VARIANT_2003 === $variant) { + @trigger_error('idn_to_utf8(): INTL_IDNA_VARIANT_2003 is deprecated', \E_USER_DEPRECATED); + } + + $info = new Info(); + $labels = self::process((string) $domainName, [ + 'CheckHyphens' => true, + 'CheckBidi' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 !== ($options & self::IDNA_CHECK_BIDI), + 'CheckJoiners' => self::INTL_IDNA_VARIANT_UTS46 === $variant && 0 !== ($options & self::IDNA_CHECK_CONTEXTJ), + 'UseSTD3ASCIIRules' => 0 !== ($options & self::IDNA_USE_STD3_RULES), + 'Transitional_Processing' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 === ($options & self::IDNA_NONTRANSITIONAL_TO_UNICODE), + ], $info); + $idna_info = [ + 'result' => implode('.', $labels), + 'isTransitionalDifferent' => $info->transitionalDifferent, + 'errors' => $info->errors, + ]; + + return 0 === $info->errors ? $idna_info['result'] : false; + } + + /** + * @param string $label + * + * @return bool + */ + private static function isValidContextJ(array $codePoints, $label) + { + if (!isset(self::$virama)) { + self::$virama = require __DIR__.\DIRECTORY_SEPARATOR.'Resources'.\DIRECTORY_SEPARATOR.'unidata'.\DIRECTORY_SEPARATOR.'virama.php'; + } + + $offset = 0; + + foreach ($codePoints as $i => $codePoint) { + if (0x200C !== $codePoint && 0x200D !== $codePoint) { + continue; + } + + if (!isset($codePoints[$i - 1])) { + return false; + } + + // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; + if (isset(self::$virama[$codePoints[$i - 1]])) { + continue; + } + + // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C(Joining_Type:T)*(Joining_Type:{R,D})) Then + // True; + // Generated RegExp = ([Joining_Type:{L,D}][Joining_Type:T]*\u200C[Joining_Type:T]*)[Joining_Type:{R,D}] + if (0x200C === $codePoint && 1 === preg_match(Regex::ZWNJ, $label, $matches, \PREG_OFFSET_CAPTURE, $offset)) { + $offset += \strlen($matches[1][0]); + + continue; + } + + return false; + } + + return true; + } + + /** + * @see https://www.unicode.org/reports/tr46/#ProcessingStepMap + * + * @param string $input + * @param array<string, bool> $options + * + * @return string + */ + private static function mapCodePoints($input, array $options, Info $info) + { + $str = ''; + $useSTD3ASCIIRules = $options['UseSTD3ASCIIRules']; + $transitional = $options['Transitional_Processing']; + + foreach (self::utf8Decode($input) as $codePoint) { + $data = self::lookupCodePointStatus($codePoint, $useSTD3ASCIIRules); + + switch ($data['status']) { + case 'disallowed': + $info->errors |= self::ERROR_DISALLOWED; + + // no break. + + case 'valid': + $str .= mb_chr($codePoint, 'utf-8'); + + break; + + case 'ignored': + // Do nothing. + break; + + case 'mapped': + $str .= $data['mapping']; + + break; + + case 'deviation': + $info->transitionalDifferent = true; + $str .= ($transitional ? $data['mapping'] : mb_chr($codePoint, 'utf-8')); + + break; + } + } + + return $str; + } + + /** + * @see https://www.unicode.org/reports/tr46/#Processing + * + * @param string $domain + * @param array<string, bool> $options + * + * @return array<int, string> + */ + private static function process($domain, array $options, Info $info) + { + // If VerifyDnsLength is not set, we are doing ToUnicode otherwise we are doing ToASCII and + // we need to respect the VerifyDnsLength option. + $checkForEmptyLabels = !isset($options['VerifyDnsLength']) || $options['VerifyDnsLength']; + + if ($checkForEmptyLabels && '' === $domain) { + $info->errors |= self::ERROR_EMPTY_LABEL; + + return [$domain]; + } + + // Step 1. Map each code point in the domain name string + $domain = self::mapCodePoints($domain, $options, $info); + + // Step 2. Normalize the domain name string to Unicode Normalization Form C. + if (!Normalizer::isNormalized($domain, Normalizer::FORM_C)) { + $domain = Normalizer::normalize($domain, Normalizer::FORM_C); + } + + // Step 3. Break the string into labels at U+002E (.) FULL STOP. + $labels = explode('.', $domain); + $lastLabelIndex = \count($labels) - 1; + + // Step 4. Convert and validate each label in the domain name string. + foreach ($labels as $i => $label) { + $validationOptions = $options; + + if ('xn--' === substr($label, 0, 4)) { + try { + $label = self::punycodeDecode(substr($label, 4)); + } catch (Exception $e) { + $info->errors |= self::ERROR_PUNYCODE; + + continue; + } + + $validationOptions['Transitional_Processing'] = false; + $labels[$i] = $label; + } + + self::validateLabel($label, $info, $validationOptions, $i > 0 && $i === $lastLabelIndex); + } + + if ($info->bidiDomain && !$info->validBidiDomain) { + $info->errors |= self::ERROR_BIDI; + } + + // Any input domain name string that does not record an error has been successfully + // processed according to this specification. Conversely, if an input domain_name string + // causes an error, then the processing of the input domain_name string fails. Determining + // what to do with error input is up to the caller, and not in the scope of this document. + return $labels; + } + + /** + * @see https://tools.ietf.org/html/rfc5893#section-2 + * + * @param string $label + */ + private static function validateBidiLabel($label, Info $info) + { + if (1 === preg_match(Regex::RTL_LABEL, $label)) { + $info->bidiDomain = true; + + // Step 1. The first character must be a character with Bidi property L, R, or AL. + // If it has the R or AL property, it is an RTL label + if (1 !== preg_match(Regex::BIDI_STEP_1_RTL, $label)) { + $info->validBidiDomain = false; + + return; + } + + // Step 2. In an RTL label, only characters with the Bidi properties R, AL, AN, EN, ES, + // CS, ET, ON, BN, or NSM are allowed. + if (1 === preg_match(Regex::BIDI_STEP_2, $label)) { + $info->validBidiDomain = false; + + return; + } + + // Step 3. In an RTL label, the end of the label must be a character with Bidi property + // R, AL, EN, or AN, followed by zero or more characters with Bidi property NSM. + if (1 !== preg_match(Regex::BIDI_STEP_3, $label)) { + $info->validBidiDomain = false; + + return; + } + + // Step 4. In an RTL label, if an EN is present, no AN may be present, and vice versa. + if (1 === preg_match(Regex::BIDI_STEP_4_AN, $label) && 1 === preg_match(Regex::BIDI_STEP_4_EN, $label)) { + $info->validBidiDomain = false; + + return; + } + + return; + } + + // We are a LTR label + // Step 1. The first character must be a character with Bidi property L, R, or AL. + // If it has the L property, it is an LTR label. + if (1 !== preg_match(Regex::BIDI_STEP_1_LTR, $label)) { + $info->validBidiDomain = false; + + return; + } + + // Step 5. In an LTR label, only characters with the Bidi properties L, EN, + // ES, CS, ET, ON, BN, or NSM are allowed. + if (1 === preg_match(Regex::BIDI_STEP_5, $label)) { + $info->validBidiDomain = false; + + return; + } + + // Step 6.In an LTR label, the end of the label must be a character with Bidi property L or + // EN, followed by zero or more characters with Bidi property NSM. + if (1 !== preg_match(Regex::BIDI_STEP_6, $label)) { + $info->validBidiDomain = false; + + return; + } + } + + /** + * @param array<int, string> $labels + */ + private static function validateDomainAndLabelLength(array $labels, Info $info) + { + $maxDomainSize = self::MAX_DOMAIN_SIZE; + $length = \count($labels); + + // Number of "." delimiters. + $domainLength = $length - 1; + + // If the last label is empty and it is not the first label, then it is the root label. + // Increase the max size by 1, making it 254, to account for the root label's "." + // delimiter. This also means we don't need to check the last label's length for being too + // long. + if ($length > 1 && '' === $labels[$length - 1]) { + ++$maxDomainSize; + --$length; + } + + for ($i = 0; $i < $length; ++$i) { + $bytes = \strlen($labels[$i]); + $domainLength += $bytes; + + if ($bytes > self::MAX_LABEL_SIZE) { + $info->errors |= self::ERROR_LABEL_TOO_LONG; + } + } + + if ($domainLength > $maxDomainSize) { + $info->errors |= self::ERROR_DOMAIN_NAME_TOO_LONG; + } + } + + /** + * @see https://www.unicode.org/reports/tr46/#Validity_Criteria + * + * @param string $label + * @param array<string, bool> $options + * @param bool $canBeEmpty + */ + private static function validateLabel($label, Info $info, array $options, $canBeEmpty) + { + if ('' === $label) { + if (!$canBeEmpty && (!isset($options['VerifyDnsLength']) || $options['VerifyDnsLength'])) { + $info->errors |= self::ERROR_EMPTY_LABEL; + } + + return; + } + + // Step 1. The label must be in Unicode Normalization Form C. + if (!Normalizer::isNormalized($label, Normalizer::FORM_C)) { + $info->errors |= self::ERROR_INVALID_ACE_LABEL; + } + + $codePoints = self::utf8Decode($label); + + if ($options['CheckHyphens']) { + // Step 2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character + // in both the thrid and fourth positions. + if (isset($codePoints[2], $codePoints[3]) && 0x002D === $codePoints[2] && 0x002D === $codePoints[3]) { + $info->errors |= self::ERROR_HYPHEN_3_4; + } + + // Step 3. If CheckHyphens, the label must neither begin nor end with a U+002D + // HYPHEN-MINUS character. + if ('-' === substr($label, 0, 1)) { + $info->errors |= self::ERROR_LEADING_HYPHEN; + } + + if ('-' === substr($label, -1, 1)) { + $info->errors |= self::ERROR_TRAILING_HYPHEN; + } + } + + // Step 4. The label must not contain a U+002E (.) FULL STOP. + if (false !== strpos($label, '.')) { + $info->errors |= self::ERROR_LABEL_HAS_DOT; + } + + // Step 5. The label must not begin with a combining mark, that is: General_Category=Mark. + if (1 === preg_match(Regex::COMBINING_MARK, $label)) { + $info->errors |= self::ERROR_LEADING_COMBINING_MARK; + } + + // Step 6. Each code point in the label must only have certain status values according to + // Section 5, IDNA Mapping Table: + $transitional = $options['Transitional_Processing']; + $useSTD3ASCIIRules = $options['UseSTD3ASCIIRules']; + + foreach ($codePoints as $codePoint) { + $data = self::lookupCodePointStatus($codePoint, $useSTD3ASCIIRules); + $status = $data['status']; + + if ('valid' === $status || (!$transitional && 'deviation' === $status)) { + continue; + } + + $info->errors |= self::ERROR_DISALLOWED; + + break; + } + + // Step 7. If CheckJoiners, the label must satisify the ContextJ rules from Appendix A, in + // The Unicode Code Points and Internationalized Domain Names for Applications (IDNA) + // [IDNA2008]. + if ($options['CheckJoiners'] && !self::isValidContextJ($codePoints, $label)) { + $info->errors |= self::ERROR_CONTEXTJ; + } + + // Step 8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must + // satisfy all six of the numbered conditions in [IDNA2008] RFC 5893, Section 2. + if ($options['CheckBidi'] && (!$info->bidiDomain || $info->validBidiDomain)) { + self::validateBidiLabel($label, $info); + } + } + + /** + * @see https://tools.ietf.org/html/rfc3492#section-6.2 + * + * @param string $input + * + * @return string + */ + private static function punycodeDecode($input) + { + $n = self::INITIAL_N; + $out = 0; + $i = 0; + $bias = self::INITIAL_BIAS; + $lastDelimIndex = strrpos($input, self::DELIMITER); + $b = false === $lastDelimIndex ? 0 : $lastDelimIndex; + $inputLength = \strlen($input); + $output = []; + $bytes = array_map('ord', str_split($input)); + + for ($j = 0; $j < $b; ++$j) { + if ($bytes[$j] > 0x7F) { + throw new Exception('Invalid input'); + } + + $output[$out++] = $input[$j]; + } + + if ($b > 0) { + ++$b; + } + + for ($in = $b; $in < $inputLength; ++$out) { + $oldi = $i; + $w = 1; + + for ($k = self::BASE; /* no condition */; $k += self::BASE) { + if ($in >= $inputLength) { + throw new Exception('Invalid input'); + } + + $digit = self::$basicToDigit[$bytes[$in++] & 0xFF]; + + if ($digit < 0) { + throw new Exception('Invalid input'); + } + + if ($digit > intdiv(self::MAX_INT - $i, $w)) { + throw new Exception('Integer overflow'); + } + + $i += $digit * $w; + + if ($k <= $bias) { + $t = self::TMIN; + } elseif ($k >= $bias + self::TMAX) { + $t = self::TMAX; + } else { + $t = $k - $bias; + } + + if ($digit < $t) { + break; + } + + $baseMinusT = self::BASE - $t; + + if ($w > intdiv(self::MAX_INT, $baseMinusT)) { + throw new Exception('Integer overflow'); + } + + $w *= $baseMinusT; + } + + $outPlusOne = $out + 1; + $bias = self::adaptBias($i - $oldi, $outPlusOne, 0 === $oldi); + + if (intdiv($i, $outPlusOne) > self::MAX_INT - $n) { + throw new Exception('Integer overflow'); + } + + $n += intdiv($i, $outPlusOne); + $i %= $outPlusOne; + array_splice($output, $i++, 0, [mb_chr($n, 'utf-8')]); + } + + return implode('', $output); + } + + /** + * @see https://tools.ietf.org/html/rfc3492#section-6.3 + * + * @param string $input + * + * @return string + */ + private static function punycodeEncode($input) + { + $n = self::INITIAL_N; + $delta = 0; + $out = 0; + $bias = self::INITIAL_BIAS; + $inputLength = 0; + $output = ''; + $iter = self::utf8Decode($input); + + foreach ($iter as $codePoint) { + ++$inputLength; + + if ($codePoint < 0x80) { + $output .= \chr($codePoint); + ++$out; + } + } + + $h = $out; + $b = $out; + + if ($b > 0) { + $output .= self::DELIMITER; + ++$out; + } + + while ($h < $inputLength) { + $m = self::MAX_INT; + + foreach ($iter as $codePoint) { + if ($codePoint >= $n && $codePoint < $m) { + $m = $codePoint; + } + } + + if ($m - $n > intdiv(self::MAX_INT - $delta, $h + 1)) { + throw new Exception('Integer overflow'); + } + + $delta += ($m - $n) * ($h + 1); + $n = $m; + + foreach ($iter as $codePoint) { + if ($codePoint < $n && 0 === ++$delta) { + throw new Exception('Integer overflow'); + } + + if ($codePoint === $n) { + $q = $delta; + + for ($k = self::BASE; /* no condition */; $k += self::BASE) { + if ($k <= $bias) { + $t = self::TMIN; + } elseif ($k >= $bias + self::TMAX) { + $t = self::TMAX; + } else { + $t = $k - $bias; + } + + if ($q < $t) { + break; + } + + $qMinusT = $q - $t; + $baseMinusT = self::BASE - $t; + $output .= self::encodeDigit($t + ($qMinusT) % ($baseMinusT), false); + ++$out; + $q = intdiv($qMinusT, $baseMinusT); + } + + $output .= self::encodeDigit($q, false); + ++$out; + $bias = self::adaptBias($delta, $h + 1, $h === $b); + $delta = 0; + ++$h; + } + } + + ++$delta; + ++$n; + } + + return $output; + } + + /** + * @see https://tools.ietf.org/html/rfc3492#section-6.1 + * + * @param int $delta + * @param int $numPoints + * @param bool $firstTime + * + * @return int + */ + private static function adaptBias($delta, $numPoints, $firstTime) + { + // xxx >> 1 is a faster way of doing intdiv(xxx, 2) + $delta = $firstTime ? intdiv($delta, self::DAMP) : $delta >> 1; + $delta += intdiv($delta, $numPoints); + $k = 0; + + while ($delta > ((self::BASE - self::TMIN) * self::TMAX) >> 1) { + $delta = intdiv($delta, self::BASE - self::TMIN); + $k += self::BASE; + } + + return $k + intdiv((self::BASE - self::TMIN + 1) * $delta, $delta + self::SKEW); + } + + /** + * @param int $d + * @param bool $flag + * + * @return string + */ + private static function encodeDigit($d, $flag) + { + return \chr($d + 22 + 75 * ($d < 26 ? 1 : 0) - (($flag ? 1 : 0) << 5)); + } + + /** + * Takes a UTF-8 encoded string and converts it into a series of integer code points. Any + * invalid byte sequences will be replaced by a U+FFFD replacement code point. + * + * @see https://encoding.spec.whatwg.org/#utf-8-decoder + * + * @param string $input + * + * @return array<int, int> + */ + private static function utf8Decode($input) + { + $bytesSeen = 0; + $bytesNeeded = 0; + $lowerBoundary = 0x80; + $upperBoundary = 0xBF; + $codePoint = 0; + $codePoints = []; + $length = \strlen($input); + + for ($i = 0; $i < $length; ++$i) { + $byte = \ord($input[$i]); + + if (0 === $bytesNeeded) { + if ($byte >= 0x00 && $byte <= 0x7F) { + $codePoints[] = $byte; + + continue; + } + + if ($byte >= 0xC2 && $byte <= 0xDF) { + $bytesNeeded = 1; + $codePoint = $byte & 0x1F; + } elseif ($byte >= 0xE0 && $byte <= 0xEF) { + if (0xE0 === $byte) { + $lowerBoundary = 0xA0; + } elseif (0xED === $byte) { + $upperBoundary = 0x9F; + } + + $bytesNeeded = 2; + $codePoint = $byte & 0xF; + } elseif ($byte >= 0xF0 && $byte <= 0xF4) { + if (0xF0 === $byte) { + $lowerBoundary = 0x90; + } elseif (0xF4 === $byte) { + $upperBoundary = 0x8F; + } + + $bytesNeeded = 3; + $codePoint = $byte & 0x7; + } else { + $codePoints[] = 0xFFFD; + } + + continue; + } + + if ($byte < $lowerBoundary || $byte > $upperBoundary) { + $codePoint = 0; + $bytesNeeded = 0; + $bytesSeen = 0; + $lowerBoundary = 0x80; + $upperBoundary = 0xBF; + --$i; + $codePoints[] = 0xFFFD; + + continue; + } + + $lowerBoundary = 0x80; + $upperBoundary = 0xBF; + $codePoint = ($codePoint << 6) | ($byte & 0x3F); + + if (++$bytesSeen !== $bytesNeeded) { + continue; + } + + $codePoints[] = $codePoint; + $codePoint = 0; + $bytesNeeded = 0; + $bytesSeen = 0; + } + + // String unexpectedly ended, so append a U+FFFD code point. + if (0 !== $bytesNeeded) { + $codePoints[] = 0xFFFD; + } + + return $codePoints; + } + + /** + * @param int $codePoint + * @param bool $useSTD3ASCIIRules + * + * @return array{status: string, mapping?: string} + */ + private static function lookupCodePointStatus($codePoint, $useSTD3ASCIIRules) + { + if (!self::$mappingTableLoaded) { + self::$mappingTableLoaded = true; + self::$mapped = require __DIR__.'/Resources/unidata/mapped.php'; + self::$ignored = require __DIR__.'/Resources/unidata/ignored.php'; + self::$deviation = require __DIR__.'/Resources/unidata/deviation.php'; + self::$disallowed = require __DIR__.'/Resources/unidata/disallowed.php'; + self::$disallowed_STD3_mapped = require __DIR__.'/Resources/unidata/disallowed_STD3_mapped.php'; + self::$disallowed_STD3_valid = require __DIR__.'/Resources/unidata/disallowed_STD3_valid.php'; + } + + if (isset(self::$mapped[$codePoint])) { + return ['status' => 'mapped', 'mapping' => self::$mapped[$codePoint]]; + } + + if (isset(self::$ignored[$codePoint])) { + return ['status' => 'ignored']; + } + + if (isset(self::$deviation[$codePoint])) { + return ['status' => 'deviation', 'mapping' => self::$deviation[$codePoint]]; + } + + if (isset(self::$disallowed[$codePoint]) || DisallowedRanges::inRange($codePoint)) { + return ['status' => 'disallowed']; + } + + $isDisallowedMapped = isset(self::$disallowed_STD3_mapped[$codePoint]); + + if ($isDisallowedMapped || isset(self::$disallowed_STD3_valid[$codePoint])) { + $status = 'disallowed'; + + if (!$useSTD3ASCIIRules) { + $status = $isDisallowedMapped ? 'mapped' : 'valid'; + } + + if ($isDisallowedMapped) { + return ['status' => $status, 'mapping' => self::$disallowed_STD3_mapped[$codePoint]]; + } + + return ['status' => $status]; + } + + return ['status' => 'valid']; + } +} |