diff options
Diffstat (limited to 'xbmc/utils/RegExp.cpp')
-rw-r--r-- | xbmc/utils/RegExp.cpp | 651 |
1 files changed, 651 insertions, 0 deletions
diff --git a/xbmc/utils/RegExp.cpp b/xbmc/utils/RegExp.cpp new file mode 100644 index 0000000..9667b64 --- /dev/null +++ b/xbmc/utils/RegExp.cpp @@ -0,0 +1,651 @@ +/* + * Copyright (C) 2005-2018 Team Kodi + * This file is part of Kodi - https://kodi.tv + * + * SPDX-License-Identifier: GPL-2.0-or-later + * See LICENSES/README.md for more information. + */ + +#include "RegExp.h" + +#include "log.h" +#include "utils/StringUtils.h" +#include "utils/Utf8Utils.h" + +#include <algorithm> +#include <stdlib.h> +#include <string.h> + +using namespace PCRE; + +#ifndef PCRE_UCP +#define PCRE_UCP 0 +#endif // PCRE_UCP + +#ifdef PCRE_CONFIG_JIT +#define PCRE_HAS_JIT_CODE 1 +#endif + +#ifndef PCRE_STUDY_JIT_COMPILE +#define PCRE_STUDY_JIT_COMPILE 0 +#endif +#ifndef PCRE_INFO_JIT +// some unused number +#define PCRE_INFO_JIT 2048 +#endif +#ifndef PCRE_HAS_JIT_CODE +#define pcre_free_study(x) pcre_free((x)) +#endif + +int CRegExp::m_Utf8Supported = -1; +int CRegExp::m_UcpSupported = -1; +int CRegExp::m_JitSupported = -1; + + +CRegExp::CRegExp(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/) +{ + InitValues(caseless, utf8); +} + +void CRegExp::InitValues(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/) +{ + m_utf8Mode = utf8; + m_re = NULL; + m_sd = NULL; + m_iOptions = PCRE_DOTALL | PCRE_NEWLINE_ANY; + if(caseless) + m_iOptions |= PCRE_CASELESS; + if (m_utf8Mode == forceUtf8) + { + if (IsUtf8Supported()) + m_iOptions |= PCRE_UTF8; + if (AreUnicodePropertiesSupported()) + m_iOptions |= PCRE_UCP; + } + + m_offset = 0; + m_jitCompiled = false; + m_bMatched = false; + m_iMatchCount = 0; + m_jitStack = NULL; + + memset(m_iOvector, 0, sizeof(m_iOvector)); +} + +CRegExp::CRegExp(bool caseless, CRegExp::utf8Mode utf8, const char *re, studyMode study /*= NoStudy*/) +{ + if (utf8 == autoUtf8) + utf8 = requireUtf8(re) ? forceUtf8 : asciiOnly; + + InitValues(caseless, utf8); + RegComp(re, study); +} + +bool CRegExp::requireUtf8(const std::string& regexp) +{ + // enable UTF-8 mode if regexp string has UTF-8 multibyte sequences + if (CUtf8Utils::checkStrForUtf8(regexp) == CUtf8Utils::utf8string) + return true; + + // check for explicit Unicode Properties (\p, \P, \X) and for Unicode character codes (greater than 0xFF) in form \x{hhh..} + // note: PCRE change meaning of \w, \s, \d (and \W, \S, \D) when Unicode Properties are enabled, + // but in auto mode we enable UNP for US-ASCII regexp only if regexp contains explicit \p, \P, \X or Unicode character code + const char* const regexpC = regexp.c_str(); + const size_t len = regexp.length(); + size_t pos = 0; + + while (pos < len) + { + const char chr = regexpC[pos]; + if (chr == '\\') + { + const char nextChr = regexpC[pos + 1]; + + if (nextChr == 'p' || nextChr == 'P' || nextChr == 'X') + return true; // found Unicode Properties + else if (nextChr == 'Q') + pos = regexp.find("\\E", pos + 2); // skip all literals in "\Q...\E" + else if (nextChr == 'x' && regexpC[pos + 2] == '{') + { // Unicode character with hex code + if (readCharXCode(regexp, pos) >= 0x100) + return true; // found Unicode character code + } + else if (nextChr == '\\' || nextChr == '(' || nextChr == ')' + || nextChr == '[' || nextChr == ']') + pos++; // exclude next character from analyze + + } // chr != '\\' + else if (chr == '(' && regexpC[pos + 1] == '?' && regexpC[pos + 2] == '#') // comment in regexp + pos = regexp.find(')', pos); // skip comment + else if (chr == '[') + { + if (isCharClassWithUnicode(regexp, pos)) + return true; + } + + if (pos == std::string::npos) // check results of regexp.find() and isCharClassWithUnicode + return false; + + pos++; + } + + // no Unicode Properties was found + return false; +} + +inline int CRegExp::readCharXCode(const std::string& regexp, size_t& pos) +{ + // read hex character code in form "\x{hh..}" + // 'pos' must point to '\' + if (pos >= regexp.length()) + return -1; + const char* const regexpC = regexp.c_str(); + if (regexpC[pos] != '\\' || regexpC[pos + 1] != 'x' || regexpC[pos + 2] != '{') + return -1; + + pos++; + const size_t startPos = pos; // 'startPos' points to 'x' + const size_t closingBracketPos = regexp.find('}', startPos + 2); + if (closingBracketPos == std::string::npos) + return 0; // return character zero code, leave 'pos' at 'x' + + pos++; // 'pos' points to '{' + int chCode = 0; + while (++pos < closingBracketPos) + { + const int xdigitVal = StringUtils::asciixdigitvalue(regexpC[pos]); + if (xdigitVal >= 0) + chCode = chCode * 16 + xdigitVal; + else + { // found non-hexdigit + pos = startPos; // reset 'pos' to 'startPos', process "{hh..}" as non-code + return 0; // return character zero code + } + } + + return chCode; +} + +bool CRegExp::isCharClassWithUnicode(const std::string& regexp, size_t& pos) +{ + const char* const regexpC = regexp.c_str(); + const size_t len = regexp.length(); + if (pos > len || regexpC[pos] != '[') + return false; + + // look for Unicode character code "\x{hhh..}" and Unicode properties "\P", "\p" and "\X" + // find end (terminating ']') of character class (like "[a-h45]") + // detect nested POSIX classes like "[[:lower:]]" and escaped brackets like "[\]]" + bool needUnicode = false; + while (++pos < len) + { + if (regexpC[pos] == '[' && regexpC[pos + 1] == ':') + { // possible POSIX character class, like "[:alpha:]" + const size_t nextClosingBracketPos = regexp.find(']', pos + 2); // don't care about "\]", as it produce error if used inside POSIX char class + + if (nextClosingBracketPos == std::string::npos) + { // error in regexp: no closing ']' for character class + pos = std::string::npos; + return needUnicode; + } + else if (regexpC[nextClosingBracketPos - 1] == ':') + pos = nextClosingBracketPos; // skip POSIX character class + // if ":]" is not found, process "[:..." as part of normal character class + } + else if (regexpC[pos] == ']') + return needUnicode; // end of character class + else if (regexpC[pos] == '\\') + { + const char nextChar = regexpC[pos + 1]; + if (nextChar == ']' || nextChar == '[') + pos++; // skip next character + else if (nextChar == 'Q') + { + pos = regexp.find("\\E", pos + 2); + if (pos == std::string::npos) + return needUnicode; // error in regexp: no closing "\E" after "\Q" in character class + else + pos++; // skip "\E" + } + else if (nextChar == 'p' || nextChar == 'P' || nextChar == 'X') + needUnicode = true; // don't care about property name as it can contain only ASCII chars + else if (nextChar == 'x') + { + if (readCharXCode(regexp, pos) >= 0x100) + needUnicode = true; + } + } + } + pos = std::string::npos; // closing square bracket was not found + + return needUnicode; +} + + +CRegExp::CRegExp(const CRegExp& re) +{ + m_re = NULL; + m_sd = NULL; + m_jitStack = NULL; + m_utf8Mode = re.m_utf8Mode; + m_iOptions = re.m_iOptions; + *this = re; +} + +CRegExp& CRegExp::operator=(const CRegExp& re) +{ + size_t size; + Cleanup(); + m_jitCompiled = false; + m_pattern = re.m_pattern; + if (re.m_re) + { + if (pcre_fullinfo(re.m_re, NULL, PCRE_INFO_SIZE, &size) >= 0) + { + if ((m_re = (pcre*)malloc(size))) + { + memcpy(m_re, re.m_re, size); + memcpy(m_iOvector, re.m_iOvector, OVECCOUNT*sizeof(int)); + m_offset = re.m_offset; + m_iMatchCount = re.m_iMatchCount; + m_bMatched = re.m_bMatched; + m_subject = re.m_subject; + m_iOptions = re.m_iOptions; + } + else + CLog::Log(LOGFATAL, "{}: Failed to allocate memory", __FUNCTION__); + } + } + return *this; +} + +CRegExp::~CRegExp() +{ + Cleanup(); +} + +bool CRegExp::RegComp(const char *re, studyMode study /*= NoStudy*/) +{ + if (!re) + return false; + + m_offset = 0; + m_jitCompiled = false; + m_bMatched = false; + m_iMatchCount = 0; + const char *errMsg = NULL; + int errOffset = 0; + int options = m_iOptions; + if (m_utf8Mode == autoUtf8 && requireUtf8(re)) + options |= (IsUtf8Supported() ? PCRE_UTF8 : 0) | (AreUnicodePropertiesSupported() ? PCRE_UCP : 0); + + Cleanup(); + + m_re = pcre_compile(re, options, &errMsg, &errOffset, NULL); + if (!m_re) + { + m_pattern.clear(); + CLog::Log(LOGERROR, "PCRE: {}. Compilation failed at offset {} in expression '{}'", errMsg, + errOffset, re); + return false; + } + + m_pattern = re; + + if (study) + { + const bool jitCompile = (study == StudyWithJitComp) && IsJitSupported(); + const int studyOptions = jitCompile ? PCRE_STUDY_JIT_COMPILE : 0; + + m_sd = pcre_study(m_re, studyOptions, &errMsg); + if (errMsg != NULL) + { + CLog::Log(LOGWARNING, "{}: PCRE error \"{}\" while studying expression", __FUNCTION__, + errMsg); + if (m_sd != NULL) + { + pcre_free_study(m_sd); + m_sd = NULL; + } + } + else if (jitCompile) + { + int jitPresent = 0; + m_jitCompiled = (pcre_fullinfo(m_re, m_sd, PCRE_INFO_JIT, &jitPresent) == 0 && jitPresent == 1); + } + } + + return true; +} + +int CRegExp::RegFind(const char *str, unsigned int startoffset /*= 0*/, int maxNumberOfCharsToTest /*= -1*/) +{ + return PrivateRegFind(strlen(str), str, startoffset, maxNumberOfCharsToTest); +} + +int CRegExp::PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset /* = 0*/, int maxNumberOfCharsToTest /*= -1*/) +{ + m_offset = 0; + m_bMatched = false; + m_iMatchCount = 0; + + if (!m_re) + { + CLog::Log(LOGERROR, "PCRE: Called before compilation"); + return -1; + } + + if (!str) + { + CLog::Log(LOGERROR, "PCRE: Called without a string to match"); + return -1; + } + + if (startoffset > bufferLen) + { + CLog::Log(LOGERROR, "{}: startoffset is beyond end of string to match", __FUNCTION__); + return -1; + } + +#ifdef PCRE_HAS_JIT_CODE + if (m_jitCompiled && !m_jitStack) + { + m_jitStack = pcre_jit_stack_alloc(32*1024, 512*1024); + if (m_jitStack == NULL) + CLog::Log(LOGWARNING, "{}: can't allocate address space for JIT stack", __FUNCTION__); + + pcre_assign_jit_stack(m_sd, NULL, m_jitStack); + } +#endif + + if (maxNumberOfCharsToTest >= 0) + bufferLen = std::min<size_t>(bufferLen, startoffset + maxNumberOfCharsToTest); + + m_subject.assign(str + startoffset, bufferLen - startoffset); + int rc = pcre_exec(m_re, NULL, m_subject.c_str(), m_subject.length(), 0, 0, m_iOvector, OVECCOUNT); + + if (rc<1) + { + static const int fragmentLen = 80; // length of excerpt before erroneous char for log + switch(rc) + { + case PCRE_ERROR_NOMATCH: + return -1; + + case PCRE_ERROR_MATCHLIMIT: + CLog::Log(LOGERROR, "PCRE: Match limit reached"); + return -1; + +#ifdef PCRE_ERROR_SHORTUTF8 + case PCRE_ERROR_SHORTUTF8: + { + const size_t startPos = (m_subject.length() > fragmentLen) ? CUtf8Utils::RFindValidUtf8Char(m_subject, m_subject.length() - fragmentLen) : 0; + if (startPos != std::string::npos) + CLog::Log( + LOGERROR, + "PCRE: Bad UTF-8 character at the end of string. Text before bad character: \"{}\"", + m_subject.substr(startPos)); + else + CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character at the end of string"); + return -1; + } +#endif + case PCRE_ERROR_BADUTF8: + { + const size_t startPos = (m_iOvector[0] > fragmentLen) ? CUtf8Utils::RFindValidUtf8Char(m_subject, m_iOvector[0] - fragmentLen) : 0; + if (m_iOvector[0] >= 0 && startPos != std::string::npos) + CLog::Log(LOGERROR, + "PCRE: Bad UTF-8 character, error code: {}, position: {}. Text before bad " + "char: \"{}\"", + m_iOvector[1], m_iOvector[0], + m_subject.substr(startPos, m_iOvector[0] - startPos + 1)); + else + CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character, error code: {}, position: {}", + m_iOvector[1], m_iOvector[0]); + return -1; + } + case PCRE_ERROR_BADUTF8_OFFSET: + CLog::Log(LOGERROR, "PCRE: Offset is pointing to the middle of UTF-8 character"); + return -1; + + default: + CLog::Log(LOGERROR, "PCRE: Unknown error: {}", rc); + return -1; + } + } + m_offset = startoffset; + m_bMatched = true; + m_iMatchCount = rc; + return m_iOvector[0] + m_offset; +} + +int CRegExp::GetCaptureTotal() const +{ + int c = -1; + if (m_re) + pcre_fullinfo(m_re, NULL, PCRE_INFO_CAPTURECOUNT, &c); + return c; +} + +std::string CRegExp::GetReplaceString(const std::string& sReplaceExp) const +{ + if (!m_bMatched || sReplaceExp.empty()) + return ""; + + const char* const expr = sReplaceExp.c_str(); + + size_t pos = sReplaceExp.find_first_of("\\&"); + std::string result(sReplaceExp, 0, pos); + result.reserve(sReplaceExp.size()); // very rough estimate + + while(pos != std::string::npos) + { + if (expr[pos] == '\\') + { + // string is null-terminated and current char isn't null, so it's safe to advance to next char + pos++; // advance to next char + const char nextChar = expr[pos]; + if (nextChar == '&' || nextChar == '\\') + { // this is "\&" or "\\" combination + result.push_back(nextChar); // add '&' or '\' to result + pos++; + } + else if (isdigit(nextChar)) + { // this is "\0" - "\9" combination + int subNum = nextChar - '0'; + pos++; // advance to second next char + const char secondNextChar = expr[pos]; + if (isdigit(secondNextChar)) + { // this is "\00" - "\99" combination + subNum = subNum * 10 + (secondNextChar - '0'); + pos++; + } + result.append(GetMatch(subNum)); + } + } + else + { // '&' char + result.append(GetMatch(0)); + pos++; + } + + const size_t nextPos = sReplaceExp.find_first_of("\\&", pos); + result.append(sReplaceExp, pos, nextPos - pos); + pos = nextPos; + } + + return result; +} + +int CRegExp::GetSubStart(int iSub) const +{ + if (!IsValidSubNumber(iSub)) + return -1; + + return m_iOvector[iSub*2] + m_offset; +} + +int CRegExp::GetSubStart(const std::string& subName) const +{ + return GetSubStart(GetNamedSubPatternNumber(subName.c_str())); +} + +int CRegExp::GetSubLength(int iSub) const +{ + if (!IsValidSubNumber(iSub)) + return -1; + + return m_iOvector[(iSub*2)+1] - m_iOvector[(iSub*2)]; +} + +int CRegExp::GetSubLength(const std::string& subName) const +{ + return GetSubLength(GetNamedSubPatternNumber(subName.c_str())); +} + +std::string CRegExp::GetMatch(int iSub /* = 0 */) const +{ + if (!IsValidSubNumber(iSub)) + return ""; + + int pos = m_iOvector[(iSub*2)]; + int len = m_iOvector[(iSub*2)+1] - pos; + if (pos < 0 || len <= 0) + return ""; + + return m_subject.substr(pos, len); +} + +std::string CRegExp::GetMatch(const std::string& subName) const +{ + return GetMatch(GetNamedSubPatternNumber(subName.c_str())); +} + +bool CRegExp::GetNamedSubPattern(const char* strName, std::string& strMatch) const +{ + strMatch.clear(); + int iSub = pcre_get_stringnumber(m_re, strName); + if (!IsValidSubNumber(iSub)) + return false; + strMatch = GetMatch(iSub); + return true; +} + +int CRegExp::GetNamedSubPatternNumber(const char* strName) const +{ + return pcre_get_stringnumber(m_re, strName); +} + +void CRegExp::DumpOvector(int iLog /* = LOGDEBUG */) +{ + if (iLog < LOGDEBUG || iLog > LOGNONE) + return; + + std::string str = "{"; + int size = GetSubCount(); // past the subpatterns is junk + for (int i = 0; i <= size; i++) + { + std::string t = StringUtils::Format("[{},{}]", m_iOvector[(i * 2)], m_iOvector[(i * 2) + 1]); + if (i != size) + t += ","; + str += t; + } + str += "}"; + CLog::Log(iLog, "regexp ovector={}", str); +} + +void CRegExp::Cleanup() +{ + if (m_re) + { + pcre_free(m_re); + m_re = NULL; + } + + if (m_sd) + { + pcre_free_study(m_sd); + m_sd = NULL; + } + +#ifdef PCRE_HAS_JIT_CODE + if (m_jitStack) + { + pcre_jit_stack_free(m_jitStack); + m_jitStack = NULL; + } +#endif +} + +inline bool CRegExp::IsValidSubNumber(int iSub) const +{ + return iSub >= 0 && iSub <= m_iMatchCount && iSub <= m_MaxNumOfBackrefrences; +} + + +bool CRegExp::IsUtf8Supported(void) +{ + if (m_Utf8Supported == -1) + { + if (pcre_config(PCRE_CONFIG_UTF8, &m_Utf8Supported) != 0) + m_Utf8Supported = 0; + } + + return m_Utf8Supported == 1; +} + +bool CRegExp::AreUnicodePropertiesSupported(void) +{ +#if defined(PCRE_CONFIG_UNICODE_PROPERTIES) && PCRE_UCP != 0 + if (m_UcpSupported == -1) + { + if (pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &m_UcpSupported) != 0) + m_UcpSupported = 0; + } +#endif + + return m_UcpSupported == 1; +} + +bool CRegExp::LogCheckUtf8Support(void) +{ + bool utf8FullSupport = true; + + if (!CRegExp::IsUtf8Supported()) + { + utf8FullSupport = false; + CLog::Log(LOGWARNING, "UTF-8 is not supported in PCRE lib, support for national symbols is limited!"); + } + + if (!CRegExp::AreUnicodePropertiesSupported()) + { + utf8FullSupport = false; + CLog::Log(LOGWARNING, "Unicode properties are not enabled in PCRE lib, support for national symbols may be limited!"); + } + + if (!utf8FullSupport) + { + CLog::Log(LOGINFO, + "Consider installing PCRE lib version 8.10 or later with enabled Unicode properties " + "and UTF-8 support. Your PCRE lib version: {}", + PCRE::pcre_version()); +#if PCRE_UCP == 0 + CLog::Log(LOGINFO, "You will need to rebuild XBMC after PCRE lib update."); +#endif + } + + return utf8FullSupport; +} + +bool CRegExp::IsJitSupported(void) +{ + if (m_JitSupported == -1) + { +#ifdef PCRE_HAS_JIT_CODE + if (pcre_config(PCRE_CONFIG_JIT, &m_JitSupported) != 0) +#endif + m_JitSupported = 0; + } + + return m_JitSupported == 1; +} |