summaryrefslogtreecommitdiffstats
path: root/xbmc/utils/RegExp.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'xbmc/utils/RegExp.cpp')
-rw-r--r--xbmc/utils/RegExp.cpp651
1 files changed, 651 insertions, 0 deletions
diff --git a/xbmc/utils/RegExp.cpp b/xbmc/utils/RegExp.cpp
new file mode 100644
index 0000000..9667b64
--- /dev/null
+++ b/xbmc/utils/RegExp.cpp
@@ -0,0 +1,651 @@
+/*
+ * Copyright (C) 2005-2018 Team Kodi
+ * This file is part of Kodi - https://kodi.tv
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * See LICENSES/README.md for more information.
+ */
+
+#include "RegExp.h"
+
+#include "log.h"
+#include "utils/StringUtils.h"
+#include "utils/Utf8Utils.h"
+
+#include <algorithm>
+#include <stdlib.h>
+#include <string.h>
+
+using namespace PCRE;
+
+// Fallback definitions for symbols that the included PCRE header may not provide.
+
+// PCRE_UCP is OR-ed into the compile options; zero turns it into a no-op.
+#if !defined(PCRE_UCP)
+#define PCRE_UCP 0
+#endif // PCRE_UCP
+
+// JIT-specific calls in this file are compiled only when PCRE_HAS_JIT_CODE is defined.
+#if defined(PCRE_CONFIG_JIT)
+#define PCRE_HAS_JIT_CODE 1
+#endif
+
+#if !defined(PCRE_STUDY_JIT_COMPILE)
+#define PCRE_STUDY_JIT_COMPILE 0
+#endif
+
+#if !defined(PCRE_INFO_JIT)
+// some unused number
+#define PCRE_INFO_JIT 2048
+#endif
+
+// Without JIT code, study data is released through plain pcre_free().
+#if !defined(PCRE_HAS_JIT_CODE)
+#define pcre_free_study(x) pcre_free((x))
+#endif
+
+// Cached answers of the PCRE capability queries.
+// -1 means "not queried yet"; the Is*/Are* helpers below treat 1 as "supported".
+int CRegExp::m_Utf8Supported = -1; // see IsUtf8Supported()
+int CRegExp::m_UcpSupported = -1; // see AreUnicodePropertiesSupported()
+int CRegExp::m_JitSupported = -1; // see IsJitSupported()
+
+
+// Constructs an object that holds no compiled pattern yet.
+// 'caseless' selects case-insensitive matching, 'utf8' selects the UTF-8 handling mode;
+// both are only recorded here (via InitValues) and take effect in RegComp().
+CRegExp::CRegExp(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
+{
+  InitValues(caseless, utf8);
+}
+
+// Puts every member into its "nothing compiled, nothing matched" state and
+// derives the PCRE compile options from 'caseless' and 'utf8'.
+void CRegExp::InitValues(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
+{
+  // no compiled pattern, no study data, no JIT stack
+  m_re = NULL;
+  m_sd = NULL;
+  m_jitStack = NULL;
+  m_jitCompiled = false;
+
+  // no match has been performed
+  m_bMatched = false;
+  m_iMatchCount = 0;
+  m_offset = 0;
+  memset(m_iOvector, 0, sizeof(m_iOvector));
+
+  // compile options: '.' matches newlines, any newline convention is accepted
+  m_utf8Mode = utf8;
+  m_iOptions = PCRE_DOTALL | PCRE_NEWLINE_ANY;
+  if (caseless)
+    m_iOptions |= PCRE_CASELESS;
+
+  if (m_utf8Mode != forceUtf8)
+    return;
+
+  // forced UTF-8: enable whatever the PCRE library actually offers
+  if (IsUtf8Supported())
+    m_iOptions |= PCRE_UTF8;
+  if (AreUnicodePropertiesSupported())
+    m_iOptions |= PCRE_UCP;
+}
+
+// Constructs the object and immediately compiles 're'.
+// With utf8 == autoUtf8 the mode is decided here by inspecting the expression:
+// forceUtf8 when requireUtf8() says so, asciiOnly otherwise.
+// The result of the compilation is not reported by the constructor.
+CRegExp::CRegExp(bool caseless, CRegExp::utf8Mode utf8, const char *re, studyMode study /*= NoStudy*/)
+{
+  const utf8Mode effectiveMode =
+      (utf8 != autoUtf8) ? utf8 : (requireUtf8(re) ? forceUtf8 : asciiOnly);
+
+  InitValues(caseless, effectiveMode);
+  RegComp(re, study);
+}
+
+// Decides whether 'regexp' needs UTF-8 mode.
+// Returns true when the expression contains UTF-8 multibyte sequences, explicit
+// Unicode properties (\p, \P, \X) or a character code above 0xFF written as \x{hhh..}.
+// Note: PCRE changes the meaning of \w, \s, \d (and \W, \S, \D) when Unicode properties
+// are enabled, so for pure US-ASCII expressions they are enabled only on explicit request.
+bool CRegExp::requireUtf8(const std::string& regexp)
+{
+  if (CUtf8Utils::checkStrForUtf8(regexp) == CUtf8Utils::utf8string)
+    return true; // multibyte sequences present
+
+  const char* const text = regexp.c_str(); // null-terminated, so look-ahead by one or two is safe
+  const size_t len = regexp.length();
+
+  for (size_t pos = 0; pos < len; ++pos)
+  {
+    switch (text[pos])
+    {
+    case '\\':
+      switch (text[pos + 1])
+      {
+      case 'p':
+      case 'P':
+      case 'X':
+        return true; // found Unicode Properties
+
+      case 'Q':
+        pos = regexp.find("\\E", pos + 2); // skip all literals in "\Q...\E"
+        break;
+
+      case 'x':
+        // Unicode character with hex code; readCharXCode() advances 'pos'
+        if (text[pos + 2] == '{' && readCharXCode(regexp, pos) >= 0x100)
+          return true; // found Unicode character code
+        break;
+
+      case '\\':
+      case '(':
+      case ')':
+      case '[':
+      case ']':
+        ++pos; // exclude the escaped character from analysis
+        break;
+
+      default:
+        break;
+      }
+      break;
+
+    case '(':
+      if (text[pos + 1] == '?' && text[pos + 2] == '#') // comment in regexp
+        pos = regexp.find(')', pos); // skip comment
+      break;
+
+    case '[':
+      if (isCharClassWithUnicode(regexp, pos))
+        return true;
+      break;
+
+    default:
+      break;
+    }
+
+    // regexp.find() and isCharClassWithUnicode() signal an unterminated construct with npos
+    if (pos == std::string::npos)
+      return false;
+  }
+
+  // no Unicode Properties was found
+  return false;
+}
+
+// Reads a hexadecimal character code written as "\x{hh..}".
+//
+// 'pos' must point to the backslash. Return values and the resulting 'pos':
+//  * -1 : 'pos' is out of range or does not point to "\x{"; 'pos' is unchanged
+//  *  0 : no closing '}' found, or a non-hex digit inside the braces;
+//         'pos' is left on the 'x' so that "{...}" is processed as ordinary text
+//  * otherwise the decoded code, with 'pos' on the closing '}'
+//
+// The accumulated value saturates instead of overflowing 'int', so an absurdly long
+// digit sequence still yields a large positive code rather than undefined behaviour.
+inline int CRegExp::readCharXCode(const std::string& regexp, size_t& pos)
+{
+  if (pos >= regexp.length())
+    return -1;
+  const char* const regexpC = regexp.c_str();
+  if (regexpC[pos] != '\\' || regexpC[pos + 1] != 'x' || regexpC[pos + 2] != '{')
+    return -1;
+
+  pos++;
+  const size_t startPos = pos; // 'startPos' points to 'x'
+  const size_t closingBracketPos = regexp.find('}', startPos + 2);
+  if (closingBracketPos == std::string::npos)
+    return 0; // return character zero code, leave 'pos' at 'x'
+
+  // largest accumulator value that can take one more hex digit without exceeding 0x7FFFFFFF
+  static const int maxBeforeShift = 0x7FFFFFF;
+
+  pos++; // 'pos' points to '{'
+  int chCode = 0;
+  while (++pos < closingBracketPos)
+  {
+    const int xdigitVal = StringUtils::asciixdigitvalue(regexpC[pos]);
+    if (xdigitVal < 0)
+    { // found non-hexdigit
+      pos = startPos; // reset 'pos' to 'startPos', process "{hh..}" as non-code
+      return 0; // return character zero code
+    }
+
+    // keep validating the remaining digits, but stop accumulating once saturated
+    if (chCode <= maxBeforeShift)
+      chCode = chCode * 16 + xdigitVal;
+  }
+
+  return chCode;
+}
+
+// Scans one character class ("[...]") and reports whether it needs Unicode support.
+//
+// 'pos' must point to the opening '['. The class needs Unicode when it contains
+// "\p", "\P", "\X" or a character code >= 0x100 written as "\x{hhh..}".
+// Nested POSIX classes such as "[[:lower:]]" and escaped brackets such as "[\]]" are
+// recognised, and so is an escaped backslash ("[\\]"), so the real terminating ']' is found.
+//
+// On return 'pos' points to the terminating ']', or is std::string::npos when the class
+// (or a "\Q...\E" block inside it) is not terminated.
+// Returns false without touching 'pos' when 'pos' does not point to '['.
+bool CRegExp::isCharClassWithUnicode(const std::string& regexp, size_t& pos)
+{
+  const char* const regexpC = regexp.c_str();
+  const size_t len = regexp.length();
+  if (pos >= len || regexpC[pos] != '[')
+    return false;
+
+  bool needUnicode = false;
+  while (++pos < len)
+  {
+    if (regexpC[pos] == '[' && regexpC[pos + 1] == ':')
+    { // possible POSIX character class, like "[:alpha:]"
+      // don't care about "\]", as it produces an error if used inside a POSIX char class
+      const size_t nextClosingBracketPos = regexp.find(']', pos + 2);
+
+      if (nextClosingBracketPos == std::string::npos)
+      { // error in regexp: no closing ']' for character class
+        pos = std::string::npos;
+        return needUnicode;
+      }
+      else if (regexpC[nextClosingBracketPos - 1] == ':')
+        pos = nextClosingBracketPos; // skip POSIX character class
+      // if ":]" is not found, process "[:..." as part of normal character class
+    }
+    else if (regexpC[pos] == ']')
+      return needUnicode; // end of character class
+    else if (regexpC[pos] == '\\')
+    {
+      const char nextChar = regexpC[pos + 1];
+      if (nextChar == ']' || nextChar == '[' || nextChar == '\\')
+        pos++; // skip the escaped character; "\\" must not make the following ']' look escaped
+      else if (nextChar == 'Q')
+      {
+        pos = regexp.find("\\E", pos + 2);
+        if (pos == std::string::npos)
+          return needUnicode; // error in regexp: no closing "\E" after "\Q" in character class
+        else
+          pos++; // skip "\E"
+      }
+      else if (nextChar == 'p' || nextChar == 'P' || nextChar == 'X')
+        needUnicode = true; // don't care about property name as it can contain only ASCII chars
+      else if (nextChar == 'x')
+      {
+        // readCharXCode() checks for the '{' itself and advances 'pos' over the code
+        if (readCharXCode(regexp, pos) >= 0x100)
+          needUnicode = true;
+      }
+    }
+  }
+  pos = std::string::npos; // closing square bracket was not found
+
+  return needUnicode;
+}
+
+
+// Copy constructor.
+// Every member is given a defined value before delegating to operator=, because the
+// assignment copies the match state only when 're' holds a compiled pattern; without
+// this, copying an uncompiled object would leave the match state uninitialized.
+CRegExp::CRegExp(const CRegExp& re)
+{
+  m_re = NULL;
+  m_sd = NULL;
+  m_jitStack = NULL;
+  m_jitCompiled = false;
+
+  m_offset = 0;
+  m_bMatched = false;
+  m_iMatchCount = 0;
+  memset(m_iOvector, 0, sizeof(m_iOvector));
+
+  m_utf8Mode = re.m_utf8Mode;
+  m_iOptions = re.m_iOptions;
+  *this = re;
+}
+
+// Copy assignment.
+// Copies the pattern text, the options, the UTF-8 mode and, when 're' holds a compiled
+// pattern, a duplicate of the compiled code together with the current match state.
+// Study/JIT data is not copied: the copy is never marked as JIT-compiled.
+// If 're' has no compiled pattern, or duplicating it fails, this object ends up with
+// no compiled pattern and in the "not matched" state.
+CRegExp& CRegExp::operator=(const CRegExp& re)
+{
+  // Cleanup() below would destroy the very pattern we are about to copy
+  if (this == &re)
+    return *this;
+
+  Cleanup();
+  m_jitCompiled = false;
+  m_pattern = re.m_pattern;
+  m_utf8Mode = re.m_utf8Mode;
+  m_iOptions = re.m_iOptions;
+
+  // drop results of any earlier match; they are replaced below when the copy succeeds
+  m_offset = 0;
+  m_bMatched = false;
+  m_iMatchCount = 0;
+  m_subject.clear();
+
+  if (re.m_re)
+  {
+    size_t size = 0;
+    if (pcre_fullinfo(re.m_re, NULL, PCRE_INFO_SIZE, &size) >= 0)
+    {
+      // allocate with PCRE's allocator, as Cleanup() releases m_re with pcre_free()
+      m_re = static_cast<pcre*>(pcre_malloc(size));
+      if (m_re)
+      {
+        memcpy(m_re, re.m_re, size);
+        memcpy(m_iOvector, re.m_iOvector, OVECCOUNT*sizeof(int));
+        m_offset = re.m_offset;
+        m_iMatchCount = re.m_iMatchCount;
+        m_bMatched = re.m_bMatched;
+        m_subject = re.m_subject;
+      }
+      else
+        CLog::Log(LOGFATAL, "{}: Failed to allocate memory", __FUNCTION__);
+    }
+  }
+  return *this;
+}
+
+// Releases the compiled pattern, the study data and the JIT stack (see Cleanup()).
+CRegExp::~CRegExp()
+{
+  Cleanup();
+}
+
+// Compiles the regular expression 're', replacing any previously compiled pattern.
+// 'study' optionally requests a PCRE study pass, with JIT compilation when
+// StudyWithJitComp is given and JIT is supported.
+// Returns false when 're' is NULL or the expression does not compile; a failed study
+// pass is only logged and does not make the call fail.
+bool CRegExp::RegComp(const char *re, studyMode study /*= NoStudy*/)
+{
+  if (re == NULL)
+    return false;
+
+  // results of an earlier match are meaningless for the new pattern
+  m_bMatched = false;
+  m_iMatchCount = 0;
+  m_offset = 0;
+  m_jitCompiled = false;
+
+  // in "auto" mode the expression itself decides whether UTF-8/UCP gets enabled
+  int compileFlags = m_iOptions;
+  if (m_utf8Mode == autoUtf8 && requireUtf8(re))
+  {
+    if (IsUtf8Supported())
+      compileFlags |= PCRE_UTF8;
+    if (AreUnicodePropertiesSupported())
+      compileFlags |= PCRE_UCP;
+  }
+
+  Cleanup();
+
+  const char* errorText = NULL;
+  int errorPos = 0;
+  m_re = pcre_compile(re, compileFlags, &errorText, &errorPos, NULL);
+  if (m_re == NULL)
+  {
+    m_pattern.clear();
+    CLog::Log(LOGERROR, "PCRE: {}. Compilation failed at offset {} in expression '{}'", errorText,
+              errorPos, re);
+    return false;
+  }
+
+  m_pattern = re;
+
+  if (!study)
+    return true; // no study pass requested
+
+  const bool wantJit = (study == StudyWithJitComp) && IsJitSupported();
+  m_sd = pcre_study(m_re, wantJit ? PCRE_STUDY_JIT_COMPILE : 0, &errorText);
+
+  if (errorText != NULL)
+  {
+    CLog::Log(LOGWARNING, "{}: PCRE error \"{}\" while studying expression", __FUNCTION__,
+              errorText);
+    if (m_sd != NULL)
+    {
+      pcre_free_study(m_sd);
+      m_sd = NULL;
+    }
+  }
+  else if (wantJit)
+  {
+    // ask PCRE whether the JIT compilation really took place
+    int jitFlag = 0;
+    const int infoResult = pcre_fullinfo(m_re, m_sd, PCRE_INFO_JIT, &jitFlag);
+    m_jitCompiled = (infoResult == 0) && (jitFlag == 1);
+  }
+
+  return true;
+}
+
+// Searches the null-terminated string 'str' for the compiled pattern.
+// 'startoffset' is the position to start from; 'maxNumberOfCharsToTest' limits the number
+// of characters examined from there (negative means no limit).
+// Returns what PrivateRegFind() returns: the match position, or -1 when nothing matched
+// or an error occurred.
+// A NULL 'str' is passed on with length 0 so that PrivateRegFind() can report it,
+// instead of crashing in strlen().
+int CRegExp::RegFind(const char *str, unsigned int startoffset /*= 0*/, int maxNumberOfCharsToTest /*= -1*/)
+{
+  const size_t length = (str != NULL) ? strlen(str) : 0;
+  return PrivateRegFind(length, str, startoffset, maxNumberOfCharsToTest);
+}
+
+// Runs the compiled pattern against a buffer and records the match state.
+//
+//  bufferLen              number of bytes available in 'str'
+//  str                    subject buffer
+//  startoffset            position in 'str' where the search starts
+//  maxNumberOfCharsToTest when >= 0, at most this many bytes after 'startoffset' are examined
+//
+// The examined part of 'str' is copied into m_subject, so offsets in m_iOvector are
+// relative to 'startoffset', which is remembered in m_offset.
+// Returns the position of the match relative to the start of 'str', or -1 when there is
+// no match or an error occurred (errors other than "no match" are logged).
+//
+// The study data (m_sd) is handed to pcre_exec(), so the results of the study pass and of
+// JIT compilation, and the JIT stack assigned below, are actually used for matching.
+int CRegExp::PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset /* = 0*/, int maxNumberOfCharsToTest /*= -1*/)
+{
+  m_offset = 0;
+  m_bMatched = false;
+  m_iMatchCount = 0;
+
+  if (!m_re)
+  {
+    CLog::Log(LOGERROR, "PCRE: Called before compilation");
+    return -1;
+  }
+
+  if (!str)
+  {
+    CLog::Log(LOGERROR, "PCRE: Called without a string to match");
+    return -1;
+  }
+
+  if (startoffset > bufferLen)
+  {
+    CLog::Log(LOGERROR, "{}: startoffset is beyond end of string to match", __FUNCTION__);
+    return -1;
+  }
+
+#ifdef PCRE_HAS_JIT_CODE
+  // the JIT stack is created lazily, on the first search with a JIT-compiled pattern
+  if (m_jitCompiled && !m_jitStack)
+  {
+    m_jitStack = pcre_jit_stack_alloc(32*1024, 512*1024);
+    if (m_jitStack == NULL)
+      CLog::Log(LOGWARNING, "{}: can't allocate address space for JIT stack", __FUNCTION__);
+
+    pcre_assign_jit_stack(m_sd, NULL, m_jitStack);
+  }
+#endif
+
+  if (maxNumberOfCharsToTest >= 0)
+    bufferLen = std::min<size_t>(bufferLen, startoffset + maxNumberOfCharsToTest);
+
+  m_subject.assign(str + startoffset, bufferLen - startoffset);
+  // m_sd is NULL when the pattern was not studied, which pcre_exec() accepts
+  int rc = pcre_exec(m_re, m_sd, m_subject.c_str(), m_subject.length(), 0, 0, m_iOvector, OVECCOUNT);
+
+  if (rc<1)
+  {
+    static const int fragmentLen = 80; // length of excerpt before erroneous char for log
+    switch(rc)
+    {
+    case PCRE_ERROR_NOMATCH:
+      return -1;
+
+    case PCRE_ERROR_MATCHLIMIT:
+      CLog::Log(LOGERROR, "PCRE: Match limit reached");
+      return -1;
+
+#ifdef PCRE_ERROR_SHORTUTF8
+    case PCRE_ERROR_SHORTUTF8:
+      {
+        // log at most about 'fragmentLen' bytes from the end of the subject
+        const size_t startPos = (m_subject.length() > fragmentLen) ? CUtf8Utils::RFindValidUtf8Char(m_subject, m_subject.length() - fragmentLen) : 0;
+        if (startPos != std::string::npos)
+          CLog::Log(
+              LOGERROR,
+              "PCRE: Bad UTF-8 character at the end of string. Text before bad character: \"{}\"",
+              m_subject.substr(startPos));
+        else
+          CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character at the end of string");
+        return -1;
+      }
+#endif
+    case PCRE_ERROR_BADUTF8:
+      {
+        // m_iOvector[0] is logged as the position and m_iOvector[1] as the error code
+        const size_t startPos = (m_iOvector[0] > fragmentLen) ? CUtf8Utils::RFindValidUtf8Char(m_subject, m_iOvector[0] - fragmentLen) : 0;
+        if (m_iOvector[0] >= 0 && startPos != std::string::npos)
+          CLog::Log(LOGERROR,
+                    "PCRE: Bad UTF-8 character, error code: {}, position: {}. Text before bad "
+                    "char: \"{}\"",
+                    m_iOvector[1], m_iOvector[0],
+                    m_subject.substr(startPos, m_iOvector[0] - startPos + 1));
+        else
+          CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character, error code: {}, position: {}",
+                    m_iOvector[1], m_iOvector[0]);
+        return -1;
+      }
+    case PCRE_ERROR_BADUTF8_OFFSET:
+      CLog::Log(LOGERROR, "PCRE: Offset is pointing to the middle of UTF-8 character");
+      return -1;
+
+    default:
+      CLog::Log(LOGERROR, "PCRE: Unknown error: {}", rc);
+      return -1;
+    }
+  }
+  m_offset = startoffset;
+  m_bMatched = true;
+  m_iMatchCount = rc;
+  return m_iOvector[0] + m_offset;
+}
+
+// Returns the value PCRE reports for PCRE_INFO_CAPTURECOUNT for the compiled pattern,
+// or -1 when no pattern is compiled (or the query leaves the value untouched).
+int CRegExp::GetCaptureTotal() const
+{
+  int captureCount = -1;
+  if (m_re == NULL)
+    return captureCount;
+
+  pcre_fullinfo(m_re, NULL, PCRE_INFO_CAPTURECOUNT, &captureCount);
+  return captureCount;
+}
+
+// Expands a replacement expression using the results of the last match.
+//
+// Recognised sequences in 'sReplaceExp':
+//   &            the whole match (same as sub-match 0)
+//   \0 .. \99    the sub-match with that number (one or two decimal digits)
+//   \& and \\    a literal '&' or '\'
+// A backslash followed by any other character is dropped and the character is kept;
+// all remaining text is copied unchanged.
+// Returns an empty string when there is no current match or 'sReplaceExp' is empty.
+//
+// Digits are detected with plain range checks, which is locale-independent and well
+// defined for bytes >= 0x80 (isdigit() on a negative 'char' is undefined behaviour).
+std::string CRegExp::GetReplaceString(const std::string& sReplaceExp) const
+{
+  if (!m_bMatched || sReplaceExp.empty())
+    return "";
+
+  const char* const expr = sReplaceExp.c_str();
+
+  size_t pos = sReplaceExp.find_first_of("\\&");
+  std::string result(sReplaceExp, 0, pos);
+  result.reserve(sReplaceExp.size()); // very rough estimate
+
+  while(pos != std::string::npos)
+  {
+    if (expr[pos] == '\\')
+    {
+      // string is null-terminated and current char isn't null, so it's safe to advance to next char
+      pos++; // advance to next char
+      const char nextChar = expr[pos];
+      if (nextChar == '&' || nextChar == '\\')
+      { // this is "\&" or "\\" combination
+        result.push_back(nextChar); // add '&' or '\' to result
+        pos++;
+      }
+      else if (nextChar >= '0' && nextChar <= '9')
+      { // this is "\0" - "\9" combination
+        int subNum = nextChar - '0';
+        pos++; // advance to second next char
+        const char secondNextChar = expr[pos];
+        if (secondNextChar >= '0' && secondNextChar <= '9')
+        { // this is "\00" - "\99" combination
+          subNum = subNum * 10 + (secondNextChar - '0');
+          pos++;
+        }
+        result.append(GetMatch(subNum));
+      }
+      // any other character: the backslash is dropped, the character is copied below
+    }
+    else
+    { // '&' char
+      result.append(GetMatch(0));
+      pos++;
+    }
+
+    // copy the plain text up to the next special character (or to the end of the string)
+    const size_t nextPos = sReplaceExp.find_first_of("\\&", pos);
+    result.append(sReplaceExp, pos, nextPos - pos);
+    pos = nextPos;
+  }
+
+  return result;
+}
+
+// Start position of sub-match 'iSub', with the start offset of the last search added,
+// or -1 when 'iSub' is not a valid sub-match number.
+int CRegExp::GetSubStart(int iSub) const
+{
+  if (IsValidSubNumber(iSub))
+    return m_iOvector[iSub*2] + m_offset;
+
+  return -1;
+}
+
+// Same as GetSubStart(int), for the named sub-pattern 'subName'.
+int CRegExp::GetSubStart(const std::string& subName) const
+{
+  const int subNumber = GetNamedSubPatternNumber(subName.c_str());
+  return GetSubStart(subNumber);
+}
+
+// Length of sub-match 'iSub' (end offset minus start offset),
+// or -1 when 'iSub' is not a valid sub-match number.
+int CRegExp::GetSubLength(int iSub) const
+{
+  if (IsValidSubNumber(iSub))
+  {
+    const int first = m_iOvector[iSub*2];
+    const int pastLast = m_iOvector[iSub*2 + 1];
+    return pastLast - first;
+  }
+
+  return -1;
+}
+
+// Same as GetSubLength(int), for the named sub-pattern 'subName'.
+int CRegExp::GetSubLength(const std::string& subName) const
+{
+  const int subNumber = GetNamedSubPatternNumber(subName.c_str());
+  return GetSubLength(subNumber);
+}
+
+// Text of sub-match 'iSub' taken from the subject of the last search (0 is the whole match).
+// Returns an empty string when 'iSub' is not valid or the sub-match is unset or empty.
+std::string CRegExp::GetMatch(int iSub /* = 0 */) const
+{
+  if (!IsValidSubNumber(iSub))
+    return "";
+
+  const int first = m_iOvector[iSub*2];
+  const int length = m_iOvector[iSub*2 + 1] - first;
+  const bool hasText = (first >= 0) && (length > 0);
+
+  if (hasText)
+    return m_subject.substr(first, length);
+
+  return "";
+}
+
+// Same as GetMatch(int), for the named sub-pattern 'subName'.
+std::string CRegExp::GetMatch(const std::string& subName) const
+{
+  const int subNumber = GetNamedSubPatternNumber(subName.c_str());
+  return GetMatch(subNumber);
+}
+
+// Fetches the text matched by the named sub-pattern 'strName' into 'strMatch'.
+// 'strMatch' is always cleared first. Returns false when the name does not map to a
+// valid sub-match number; returns true otherwise, even if the matched text is empty.
+bool CRegExp::GetNamedSubPattern(const char* strName, std::string& strMatch) const
+{
+  strMatch.clear();
+
+  const int subNumber = pcre_get_stringnumber(m_re, strName);
+  const bool known = IsValidSubNumber(subNumber);
+  if (known)
+    strMatch = GetMatch(subNumber);
+
+  return known;
+}
+
+// Returns the result of pcre_get_stringnumber() for 'strName' in the compiled pattern;
+// callers pass it through IsValidSubNumber(), which rejects negative values.
+int CRegExp::GetNamedSubPatternNumber(const char* strName) const
+{
+  const int subNumber = pcre_get_stringnumber(m_re, strName);
+  return subNumber;
+}
+
+// Writes the offset pairs of the last match to the log at level 'iLog',
+// formatted as "{[start,end],[start,end],...}".
+// Nothing is logged when 'iLog' lies outside the LOGDEBUG..LOGNONE range.
+void CRegExp::DumpOvector(int iLog /* = LOGDEBUG */)
+{
+  const bool levelInRange = (iLog >= LOGDEBUG) && (iLog <= LOGNONE);
+  if (!levelInRange)
+    return;
+
+  const int lastPair = GetSubCount(); // past the subpatterns is junk
+
+  std::string dump("{");
+  for (int pair = 0; pair <= lastPair; ++pair)
+  {
+    if (pair > 0)
+      dump += ",";
+    dump += StringUtils::Format("[{},{}]", m_iOvector[pair * 2], m_iOvector[(pair * 2) + 1]);
+  }
+  dump += "}";
+
+  CLog::Log(iLog, "regexp ovector={}", dump);
+}
+
+// Releases everything obtained from PCRE and resets the pointers to NULL:
+// the compiled pattern, the study data and, when JIT code is available, the JIT stack.
+// Safe to call repeatedly; members that are already NULL are skipped.
+void CRegExp::Cleanup()
+{
+  if (m_re != NULL)
+  {
+    pcre_free(m_re);
+    m_re = NULL;
+  }
+
+  if (m_sd != NULL)
+  {
+    pcre_free_study(m_sd);
+    m_sd = NULL;
+  }
+
+#ifdef PCRE_HAS_JIT_CODE
+  if (m_jitStack != NULL)
+  {
+    pcre_jit_stack_free(m_jitStack);
+    m_jitStack = NULL;
+  }
+#endif
+}
+
+// A sub-match number is accepted when it is non-negative and exceeds neither the
+// count recorded by the last match nor m_MaxNumOfBackrefrences.
+inline bool CRegExp::IsValidSubNumber(int iSub) const
+{
+  if (iSub < 0)
+    return false;
+  if (iSub > m_iMatchCount)
+    return false;
+
+  return iSub <= m_MaxNumOfBackrefrences;
+}
+
+
+// Tells whether the PCRE library reports UTF-8 support.
+// The library is queried on the first call only; the answer is cached in m_Utf8Supported.
+bool CRegExp::IsUtf8Supported(void)
+{
+  const bool notQueriedYet = (m_Utf8Supported == -1);
+
+  // a failing query counts as "not supported"
+  if (notQueriedYet && pcre_config(PCRE_CONFIG_UTF8, &m_Utf8Supported) != 0)
+    m_Utf8Supported = 0;
+
+  return m_Utf8Supported == 1;
+}
+
+// Tells whether the PCRE library reports support for Unicode properties.
+// The query is compiled in only when the PCRE header provides both
+// PCRE_CONFIG_UNICODE_PROPERTIES and a non-zero PCRE_UCP; otherwise the cached
+// value is never set to 1 here and the function returns false.
+bool CRegExp::AreUnicodePropertiesSupported(void)
+{
+#if defined(PCRE_CONFIG_UNICODE_PROPERTIES) && PCRE_UCP != 0
+  const bool notQueriedYet = (m_UcpSupported == -1);
+
+  // a failing query counts as "not supported"
+  if (notQueriedYet && pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &m_UcpSupported) != 0)
+    m_UcpSupported = 0;
+#endif
+
+  return m_UcpSupported == 1;
+}
+
+// Checks UTF-8 and Unicode-properties support of the PCRE library and logs a warning
+// for each missing feature, followed by a hint about the PCRE version to install.
+// Returns true only when both features are available.
+bool CRegExp::LogCheckUtf8Support(void)
+{
+  const bool hasUtf8 = CRegExp::IsUtf8Supported();
+  if (!hasUtf8)
+    CLog::Log(LOGWARNING, "UTF-8 is not supported in PCRE lib, support for national symbols is limited!");
+
+  const bool hasUnicodeProperties = CRegExp::AreUnicodePropertiesSupported();
+  if (!hasUnicodeProperties)
+    CLog::Log(LOGWARNING, "Unicode properties are not enabled in PCRE lib, support for national symbols may be limited!");
+
+  if (hasUtf8 && hasUnicodeProperties)
+    return true;
+
+  CLog::Log(LOGINFO,
+            "Consider installing PCRE lib version 8.10 or later with enabled Unicode properties "
+            "and UTF-8 support. Your PCRE lib version: {}",
+            PCRE::pcre_version());
+#if PCRE_UCP == 0
+  CLog::Log(LOGINFO, "You will need to rebuild XBMC after PCRE lib update.");
+#endif
+
+  return false;
+}
+
+// Tells whether the PCRE library reports JIT support.
+// The answer is cached in m_JitSupported; when this file is built without JIT code,
+// or the query fails, the cached answer becomes "not supported".
+bool CRegExp::IsJitSupported(void)
+{
+  if (m_JitSupported != -1)
+    return m_JitSupported == 1; // already determined
+
+#ifdef PCRE_HAS_JIT_CODE
+  const bool queryFailed = (pcre_config(PCRE_CONFIG_JIT, &m_JitSupported) != 0);
+#else
+  const bool queryFailed = true;
+#endif
+  if (queryFailed)
+    m_JitSupported = 0;
+
+  return m_JitSupported == 1;
+}