summaryrefslogtreecommitdiffstats
path: root/src/lib/util/strutil.cc
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/lib/util/strutil.cc467
1 files changed, 467 insertions, 0 deletions
diff --git a/src/lib/util/strutil.cc b/src/lib/util/strutil.cc
new file mode 100644
index 0000000..55f5f97
--- /dev/null
+++ b/src/lib/util/strutil.cc
@@ -0,0 +1,467 @@
+// Copyright (C) 2011-2022 Internet Systems Consortium, Inc. ("ISC")
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <config.h>
+
+#include <util/encode/hex.h>
+#include <util/strutil.h>
+
+#include <boost/algorithm/string/classification.hpp>
+#include <boost/algorithm/string/constants.hpp>
+#include <boost/algorithm/string/split.hpp>
+
+#include <numeric>
+#include <iostream>
+#include <sstream>
+
+// Early versions of C++11 regex were buggy, so we use it only when we can;
+// otherwise we fall back to regcomp/regexec. For more info see:
+// https://stackoverflow.com/questions/12530406/is-gcc-4-8-or-earlier-buggy-about-regular-expressions
+#ifdef USE_REGEX
+#include <regex>
+#else
+#include <sys/types.h>
+#include <regex.h>
+#endif
+
+#include <string.h>
+
+using namespace std;
+
+namespace isc {
+namespace util {
+namespace str {
+
+// Normalize slashes: every backslash in the name becomes a forward slash.
+//
+// @param name string modified in place; an empty string is left untouched
+
+void
+normalizeSlash(std::string& name) {
+    for (char& ch : name) {
+        if (ch == '\\') {
+            ch = '/';
+        }
+    }
+}
+
+// Trim String: return a copy of the input without its leading and trailing
+// blanks. A blank is a space, a tab or a newline; any other character
+// (including a carriage return) is kept.
+
+std::string
+trim(const std::string& instring) {
+    static const char* const blank_chars = " \t\n";
+
+    // An empty or all-blank input contains no character to keep.
+    const size_t head = instring.find_first_not_of(blank_chars);
+    if (head == std::string::npos) {
+        return (std::string());
+    }
+
+    // A non-blank character exists, so the search from the back succeeds.
+    const size_t tail = instring.find_last_not_of(blank_chars);
+    return (instring.substr(head, tail - head + 1));
+}
+
+// Tokenize string. As noted in the header, this is locally written to avoid
+// another dependency on a Boost library.
+//
+// The text is cut at every run of delimiter characters; empty tokens are
+// never returned. When escape is true a backslash protects the next
+// character: an escaped delimiter or an escaped backslash is kept as a plain
+// character (the escaping backslash is dropped), while a backslash in front
+// of any other character, or at the very end of the text, is kept as is.
+// A delimiter test always comes first, so a backslash listed in delim is
+// treated as a delimiter.
+
+std::vector<std::string>
+tokens(const std::string& text, const std::string& delim, bool escape) {
+    std::vector<std::string> pieces;
+    std::string current;
+    bool in_token = false;
+    bool escaped = false;
+
+    for (const char ch : text) {
+        if (delim.find(ch) != std::string::npos) {
+            if (!in_token) {
+                // Run of delimiters outside of any token: skip it.
+                continue;
+            }
+            if (escaped) {
+                // Escaped delimiter: it belongs to the token.
+                escaped = false;
+                current.push_back(ch);
+                continue;
+            }
+            // Unescaped delimiter: the token is complete.
+            if (!current.empty()) {
+                pieces.push_back(current);
+            }
+            in_token = false;
+            current.clear();
+            continue;
+        }
+
+        // Anything which is not a delimiter starts or continues a token.
+        in_token = true;
+
+        if (escape && (ch == '\\')) {
+            // A second backslash in a row yields one literal backslash,
+            // a first one only arms the escape for the next character.
+            if (escaped) {
+                current.push_back(ch);
+            }
+            escaped = !escaped;
+            continue;
+        }
+
+        if (escaped) {
+            // The escape did not apply to a delimiter or a backslash:
+            // give the backslash back to the token.
+            escaped = false;
+            current.push_back('\\');
+        }
+        current.push_back(ch);
+    }
+
+    // End of input: a pending escape is a literal trailing backslash.
+    if (escaped) {
+        current.push_back('\\');
+    }
+    if (!current.empty()) {
+        pieces.push_back(current);
+    }
+
+    return (pieces);
+}
+
+// Local function to pass to accumulate() for summing up string lengths.
+
+namespace {
+
+size_t
+lengthSum(std::string::size_type curlen, const std::string& cur_string) {
+    return (curlen + cur_string.size());
+}
+
+}
+
+// Provide printf-style formatting.
+//
+// Each "%s" in the format string is replaced, from left to right, by the
+// next element of args. Surplus arguments are ignored and surplus "%s"
+// tokens are left in place. The text of an argument is copied verbatim: a
+// "%s" occurring inside an argument is never treated as a token.
+//
+// @param format format string
+// @param args replacement strings, used in order
+// @return the format string with the tokens substituted
+
+std::string
+format(const std::string& format, const std::vector<std::string>& args) {
+
+    static const std::string flag = "%s";
+
+    // Reserve an upper bound for the result: the format string plus all the
+    // arguments. Nothing is subtracted for the replaced tokens because the
+    // sizes are unsigned: with short arguments, or with more arguments than
+    // tokens, the subtraction would wrap around and reserve() would throw
+    // std::length_error.
+    std::string result;
+    result.reserve(std::accumulate(args.begin(), args.end(), format.size(),
+                                   lengthSum));
+
+    // Copy the format string piece by piece, inserting one argument in place
+    // of each token. Building the result by appending guarantees that the
+    // inserted text is never scanned for tokens.
+    std::string::size_type copied = 0;   // First format character not yet copied
+    for (const std::string& arg : args) {
+        const std::string::size_type tokenpos = format.find(flag, copied);
+        if (tokenpos == std::string::npos) {
+            // More arguments than tokens: ignore the remaining arguments.
+            break;
+        }
+        result.append(format, copied, tokenpos - copied);
+        result.append(arg);
+        copied = tokenpos + flag.size();
+    }
+
+    // Whatever follows the last replaced token (including surplus tokens).
+    result.append(format, copied, std::string::npos);
+
+    return (result);
+}
+
+// Extract the next whitespace-delimited token from the stream.
+//
+// @param iss stream to read from; its position and state are updated
+// @return the token which was read
+// @throw StringTokenError if the extraction fails
+std::string
+getToken(std::istringstream& iss) {
+    std::string word;
+    // A stream tests false when either its fail or its bad bit is set.
+    if (!(iss >> word)) {
+        isc_throw(StringTokenError, "could not read token from string");
+    }
+    return (word);
+}
+
+// Convert a string enclosed in single quotes into a vector of its bytes.
+//
+// Blanks around the quotes and blanks just inside the quotes are removed
+// with trim(). An empty vector is returned when the trimmed input does not
+// both start and end with a single quote.
+//
+// @param quoted_string text expected to look like 'some text'
+// @return bytes of the trimmed text found between the quotes, or an empty
+// vector
+std::vector<uint8_t>
+quotedStringToBinary(const std::string& quoted_string) {
+    // Remove whitespace before and after the quotes.
+    const std::string outer = trim(quoted_string);
+    const size_t outer_len = outer.length();
+
+    // Two quote characters are required, hence at least two characters.
+    // The length test comes first so that front()/back() are never applied
+    // to an empty string.
+    if ((outer_len < 2) || (outer.front() != '\'') || (outer.back() != '\'')) {
+        return (std::vector<uint8_t>());
+    }
+
+    // Drop the quotes, trim what they enclosed and hand back its bytes.
+    const std::string inner = trim(outer.substr(1, outer_len - 2));
+    return (std::vector<uint8_t>(inner.begin(), inner.end()));
+}
+
+// Decode a string of hexadecimal octets separated by colons.
+//
+// This is a convenience wrapper which forwards to decodeSeparatedHexString()
+// with ":" as the separator; errors reported by that function propagate to
+// the caller unchanged.
+//
+// @param hex_string text to decode
+// @param binary output vector passed through to decodeSeparatedHexString()
+void
+decodeColonSeparatedHexString(const std::string& hex_string,
+ std::vector<uint8_t>& binary) {
+ decodeSeparatedHexString(hex_string, ":", binary);
+}
+
+// Decode a string of hexadecimal octets separated by a separator character.
+//
+// The text is split at every character found in sep. Each piece must hold
+// one or two hexadecimal digits and yields one byte. An input without any
+// separator and without any digit (the empty string) yields no byte.
+//
+// @param hex_string text to decode
+// @param sep set of characters each of which acts as a separator
+// @param binary receives the decoded bytes; it is only modified when the
+// whole string was decoded successfully
+// @throw isc::BadValue if a piece is empty (leading, trailing or doubled
+// separator), is longer than two characters or holds a character which is
+// not a hexadecimal digit
+void
+decodeSeparatedHexString(const std::string& hex_string, const std::string& sep,
+                         std::vector<uint8_t>& binary) {
+    std::vector<std::string> split_text;
+    boost::split(split_text, hex_string, boost::is_any_of(sep),
+                 boost::algorithm::token_compress_off);
+
+    std::vector<uint8_t> binary_vec;
+    for (const std::string& octet : split_text) {
+
+        if (octet.empty()) {
+            // With several pieces an empty one means that a separator has
+            // nothing on one of its sides. This is not allowed.
+            if (split_text.size() > 1) {
+                isc_throw(isc::BadValue, "two consecutive separators ('" << sep << "') specified in"
+                          " a decoded string '" << hex_string << "'");
+            }
+            // A single empty piece is an empty input: nothing to decode.
+            continue;
+        }
+
+        // Between separators we expect at most two characters.
+        if (octet.size() > 2) {
+            isc_throw(isc::BadValue, "invalid format of the decoded string"
+                      << " '" << hex_string << "'");
+        }
+
+        std::stringstream s;
+        s << "0x";
+
+        for (const char digit : octet) {
+            // Check if we're dealing with hexadecimal digit. The cast is
+            // required: passing a negative char value to isxdigit() is
+            // undefined behavior.
+            if (!isxdigit(static_cast<unsigned char>(digit))) {
+                isc_throw(isc::BadValue, "'" << digit
+                          << "' is not a valid hexadecimal digit in"
+                          << " decoded string '" << hex_string << "'");
+            }
+            s << digit;
+        }
+
+        // The stream now has one or two hexadecimal digits. Convert them
+        // to a number and store it in the temporary vector.
+        unsigned int binary_value = 0;
+        s >> std::hex >> binary_value;
+
+        binary_vec.push_back(static_cast<uint8_t>(binary_value));
+    }
+
+    // All ok, replace the data in the output vector with a result.
+    binary.swap(binary_vec);
+}
+
+
+// Decode a string of hexadecimal digits given in one of three notations.
+//
+// A string holding a colon is decoded as colon-separated octets, otherwise a
+// string holding a space is decoded as space-separated octets. Any other
+// string is taken as a plain run of hexadecimal digits with an optional
+// "0x" prefix; an odd number of digits is completed with a leading '0'.
+//
+// @param hex_string text to decode
+// @param binary receives the decoded bytes
+// @throw isc::BadValue if the plain run of digits cannot be decoded; the
+// separated notations report their errors through
+// decodeSeparatedHexString()
+void
+decodeFormattedHexString(const std::string& hex_string,
+                         std::vector<uint8_t>& binary) {
+    // If there is at least one colon we assume that the string
+    // comprises octets separated by colons (e.g. MAC address notation).
+    if (hex_string.find(':') != std::string::npos) {
+        decodeSeparatedHexString(hex_string, ":", binary);
+        return;
+    }
+
+    // Same thing with spaces as the separator.
+    if (hex_string.find(' ') != std::string::npos) {
+        decodeSeparatedHexString(hex_string, " ", binary);
+        return;
+    }
+
+    // Plain notation. With an odd number of characters a '0' goes in front;
+    // the two characters of a "0x" prefix do not change the parity.
+    std::string digits;
+    if ((hex_string.length() % 2) != 0) {
+        digits.push_back('0');
+    }
+
+    // It is ok to use '0x' prefix in a string: it is left out of the digits.
+    const bool has_prefix = (hex_string.length() > 2) &&
+        (hex_string.compare(0, 2, "0x") == 0);
+    digits.append(hex_string, has_prefix ? 2 : 0, std::string::npos);
+
+    try {
+        // Decode the hex string.
+        encode::decodeHex(digits, binary);
+
+    } catch (...) {
+        // Whatever the decoder reported, callers get a BadValue.
+        isc_throw(isc::BadValue, "'" << hex_string << "' is not a valid"
+                  " string of hexadecimal digits");
+    }
+}
+
+/// @brief Implementation of the string sanitizer.
+///
+/// Holds a compiled regular expression describing the text to scrub and the
+/// text which replaces each match. When USE_REGEX is defined the expression
+/// is a C++11 std::regex, otherwise it is a POSIX regex_t compiled with
+/// regcomp() and released with regfree().
+class StringSanitizerImpl {
+public:
+    /// @brief Constructor.
+    ///
+    /// @param char_set extended regular expression matching the text to
+    /// replace
+    /// @param char_replacement text inserted in place of each match
+    ///
+    /// @throw isc::BadValue if one of the arguments is longer than
+    /// StringSanitizer::MAX_DATA_SIZE or if char_set cannot be compiled
+    StringSanitizerImpl(const std::string& char_set, const std::string& char_replacement)
+        : char_set_(char_set), char_replacement_(char_replacement) {
+        if (char_set.size() > StringSanitizer::MAX_DATA_SIZE) {
+            isc_throw(isc::BadValue, "char set size: '" << char_set.size()
+                      << "' exceeds max size: '"
+                      << StringSanitizer::MAX_DATA_SIZE << "'");
+        }
+
+        if (char_replacement.size() > StringSanitizer::MAX_DATA_SIZE) {
+            isc_throw(isc::BadValue, "char replacement size: '"
+                      << char_replacement.size() << "' exceeds max size: '"
+                      << StringSanitizer::MAX_DATA_SIZE << "'");
+        }
+#ifdef USE_REGEX
+        try {
+            scrub_exp_ = std::regex(char_set, std::regex::extended);
+        } catch (const std::exception& ex) {
+            isc_throw(isc::BadValue, "invalid regex: '"
+                      << char_set_ << "', " << ex.what());
+        }
+#else
+        int ec = regcomp(&scrub_exp_, char_set_.c_str(), REG_EXTENDED);
+        if (ec) {
+            char errbuf[512] = "";
+            static_cast<void>(regerror(ec, &scrub_exp_, errbuf, sizeof(errbuf)));
+            // No regfree() here: after a failed regcomp() the regex_t is not
+            // a compiled expression and POSIX leaves regfree() on such an
+            // object undefined. The destructor does not run either, as the
+            // constructor exits with an exception.
+            isc_throw(isc::BadValue, "invalid regex: '" << char_set_ << "', " << errbuf);
+        }
+#endif
+    }
+
+    /// @brief Copying is disabled.
+    ///
+    /// In the POSIX variant the destructor releases the compiled expression,
+    /// so two copies sharing one regex_t would release it twice.
+    StringSanitizerImpl(const StringSanitizerImpl&) = delete;
+
+    /// @brief Assignment is disabled for the same reason as copying.
+    StringSanitizerImpl& operator=(const StringSanitizerImpl&) = delete;
+
+    /// @brief Destructor.
+    ///
+    /// Releases the POSIX compiled expression; nothing to do for std::regex.
+    ~StringSanitizerImpl() {
+#ifndef USE_REGEX
+        regfree(&scrub_exp_);
+#endif
+    }
+
+    /// @brief Returns a scrubbed copy of a string.
+    ///
+    /// @param original the string to scrub
+    ///
+    /// @return copy of the original in which the text matched by the char
+    /// set expression is replaced by the char replacement
+    /// @throw isc::BadValue if std::regex_replace fails (USE_REGEX variant)
+    /// @throw isc::Unexpected if regexec() reports a match without an offset
+    /// (POSIX variant)
+    std::string scrub(const std::string& original) {
+#ifdef USE_REGEX
+        std::stringstream result;
+        try {
+            std::regex_replace(std::ostream_iterator<char>(result),
+                               original.begin(), original.end(),
+                               scrub_exp_, char_replacement_);
+        } catch (const std::exception& ex) {
+            isc_throw(isc::BadValue, "replacing '" << char_set_ << "' with '"
+                      << char_replacement_ << "' in '" << original << "' failed: ,"
+                      << ex.what());
+        }
+
+        return (result.str());
+#else
+        // In order to handle embedded nuls, we have to process in nul-terminated
+        // chunks. We iterate over the original data, doing pattern replacement
+        // on each chunk.
+        const char* orig_data = original.data();
+        const char* dead_end = orig_data + original.size();
+        const char* start_from = orig_data;
+        stringstream result;
+
+        while (start_from < dead_end) {
+            // Iterate over original string, match by match.
+            regmatch_t matches[2];  // n matches + 1
+            // The chunk ends at the next nul, either an embedded one or the
+            // terminator which follows the string data.
+            const char* end_at = start_from + strlen(start_from);
+
+            while (start_from < end_at) {
+                // Look for the next match
+                if (regexec(&scrub_exp_, start_from, 1, matches, 0) == REG_NOMATCH) {
+                    // No matches, so add in the remainder
+                    result << start_from;
+                    start_from = end_at + 1;
+                    break;
+                }
+
+                // Shouldn't happen, but one never knows eh?
+                if (matches[0].rm_so == -1) {
+                    isc_throw(isc::Unexpected, "matched but so is -1?");
+                }
+
+                // Add everything from starting point up to the current match
+                const char* match_at = start_from + matches[0].rm_so;
+                while (start_from < match_at) {
+                    result << *start_from;
+                    ++start_from;
+                }
+
+                // Add in the replacement
+                result << char_replacement_;
+
+                // Move past the match.
+                // NOTE(review): this advances by one character whatever the
+                // length of the match, so an expression matching several
+                // characters at once is replaced character by character,
+                // unlike in the std::regex variant - confirm this is
+                // intended.
+                ++start_from;
+            }
+
+            // if we have an embedded nul, replace it and continue
+            // NOTE(review): after the "no match" exit above start_from is
+            // already past the nul, so a nul which is the last character of
+            // the original is dropped rather than replaced - confirm this
+            // is intended.
+            if (start_from < dead_end) {
+                // Add in the replacement
+                result << char_replacement_;
+                start_from = end_at + 1;
+            }
+        }
+
+        return (result.str());
+#endif
+    }
+
+private:
+    /// @brief The char set data for regex.
+    std::string char_set_;
+
+    /// @brief The char replacement data for regex.
+    std::string char_replacement_;
+
+    /// @brief The compiled form of the char set expression.
+#ifdef USE_REGEX
+    regex scrub_exp_;
+#else
+    regex_t scrub_exp_;
+#endif
+};
+
+// Maximum size, in characters, which the StringSanitizerImpl constructor
+// accepts for the char set and for the char replacement strings.
+//
+// @note The regex engine is implemented using recursion and can cause
+// stack overflow if the input data is too large. An arbitrary size of
+// 4096 should be enough for all cases.
+const uint32_t StringSanitizer::MAX_DATA_SIZE = 4096;
+
+// Constructor: creates the implementation object which does the actual
+// work. Both arguments are handed over unchanged and any exception thrown
+// by the StringSanitizerImpl constructor propagates to the caller.
+StringSanitizer::StringSanitizer(const std::string& char_set,
+ const std::string& char_replacement)
+ : impl_(new StringSanitizerImpl(char_set, char_replacement)) {
+}
+
+// Destructor: has no work of its own.
+// NOTE(review): impl_ is presumably a smart pointer declared in the header
+// which releases the implementation object - confirm.
+StringSanitizer::~StringSanitizer() {
+}
+
+// Returns a scrubbed copy of the string: the call is forwarded to the
+// implementation object and its result, or its exception, is passed on
+// unchanged.
+std::string
+StringSanitizer::scrub(const std::string& original) {
+ return (impl_->scrub(original));
+}
+
+// Dump a buffer as text: each byte is printed as two lowercase hexadecimal
+// digits and the bytes are separated by colons (e.g. "01:ab:ff").
+//
+// @param data pointer to the first byte; not dereferenced when length is 0
+// @param length number of bytes to dump
+// @return textual form of the buffer, empty when length is 0
+std::string dumpAsHex(const uint8_t* data, size_t length) {
+    std::ostringstream output;
+
+    // The base and the fill character stay in effect once set; only the
+    // field width has to be given again for each byte.
+    output << std::hex << std::setfill('0');
+
+    // The index has the type of length so that it cannot wrap around
+    // before reaching the end of a very large buffer.
+    for (size_t i = 0; i < length; ++i) {
+        if (i > 0) {
+            output << ":";
+        }
+
+        // Print the byte as a number, not as a character.
+        output << std::setw(2) << static_cast<unsigned short>(data[i]);
+    }
+
+    return (output.str());
+}
+
+} // namespace str
+} // namespace util
+} // namespace isc