summary refs log tree commit diff stats
path: root/src/text_anonymizer.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/text_anonymizer.cc')
-rw-r--r-- src/text_anonymizer.cc 519
1 files changed, 519 insertions, 0 deletions
diff --git a/src/text_anonymizer.cc b/src/text_anonymizer.cc
new file mode 100644
index 0000000..32a7a13
--- /dev/null
+++ b/src/text_anonymizer.cc
@@ -0,0 +1,519 @@
+/**
+ * Copyright (c) 2022, Timothy Stack
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * * Neither the name of Timothy Stack nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "text_anonymizer.hh"
+
+#include <arpa/inet.h>
+#include <curl/curl.h>
+
+#include "animals-json.h"
+#include "config.h"
+#include "data_scanner.hh"
+#include "diseases-json.h"
+#include "ghc/filesystem.hpp"
+#include "lnav_util.hh"
+#include "pcrepp/pcre2pp.hh"
+#include "words-json.h"
+#include "yajlpp/yajlpp_def.hh"
+
+namespace lnav {
+
+/**
+ * A list of replacement strings that can be indexed without bound.
+ *
+ * Indexes past the end of the list wrap around and pick up a numeric suffix,
+ * so every index still maps to a distinct string.
+ */
+struct random_list {
+    /** The candidate replacement strings. */
+    std::vector<std::string> rl_data;
+
+    /**
+     * Get the replacement string for the given index.
+     *
+     * @param index Any index; it does not need to be smaller than the
+     *   number of entries in rl_data.
+     * @return The entry at (index % size).  Once the index has wrapped
+     *   around the list, the wrap count (index / size) is appended in
+     *   decimal, e.g. "cat", "dog", "cat1", "dog1", "cat2", ...  If the
+     *   list is empty, the decimal index itself is returned.
+     */
+    std::string at_index(size_t index) const
+    {
+        const auto count = this->rl_data.size();
+
+        if (count == 0) {
+            // Dividing by the size of an empty list is undefined behavior.
+            // Fall back to the bare index so the result is still unique
+            // for each index.
+            return std::to_string(index);
+        }
+
+        const auto wraps = index / count;
+        auto retval = this->rl_data[index % count];
+
+        if (wraps > 0) {
+            retval += std::to_string(wraps);
+        }
+        return retval;
+    }
+};
+
+// JSON handlers used by the load_*_list() functions to parse a word-list
+// document into a random_list.  The "data#" property is bound to
+// random_list::rl_data.
+// NOTE(review): the trailing '#' presumably marks an array-valued property
+// in yajlpp -- confirm against the yajlpp handler documentation.
+static const typed_json_path_container<random_list> random_list_handlers = {
+ yajlpp::property_handler("data#").for_field(&random_list::rl_data),
+};
+
+/**
+ * Parse one of the JSON assets declared by the "*-json.h" headers into a
+ * random_list.  Shared by the word, animal, and disease loaders so that the
+ * parsing logic lives in one place.
+ *
+ * @param asset The JSON asset; it must provide get_name() and
+ *   to_string_fragment().
+ * @return The parsed list.
+ */
+template<typename Asset>
+static random_list
+load_random_list(Asset& asset)
+{
+    // The asset's name is handed to the parser to identify the source.
+    const auto name = intern_string::lookup(asset.get_name());
+    auto parse_res = random_list_handlers.parser_for(name)
+                         .with_ignore_unused(false)
+                         .of(asset.to_string_fragment());
+
+    // NOTE(review): the result is unwrapped without checking for an error;
+    // presumably acceptable because the inputs are fixed assets -- confirm.
+    return parse_res.unwrap();
+}
+
+/** @return The list parsed from words_json. */
+static random_list
+load_word_list()
+{
+    return load_random_list(words_json);
+}
+
+/** @return The word list, parsed on first use and reused afterwards. */
+static const random_list&
+get_word_list()
+{
+    static const auto retval = load_word_list();
+
+    return retval;
+}
+
+/** @return The list parsed from animals_json. */
+static random_list
+load_animal_list()
+{
+    return load_random_list(animals_json);
+}
+
+/** @return The animal list, parsed on first use and reused afterwards. */
+static const random_list&
+get_animal_list()
+{
+    static const auto retval = load_animal_list();
+
+    return retval;
+}
+
+/** @return The list parsed from diseases_json. */
+static random_list
+load_disease_list()
+{
+    return load_random_list(diseases_json);
+}
+
+/** @return The disease list, parsed on first use and reused afterwards. */
+static const random_list&
+get_disease_list()
+{
+    static const auto retval = load_disease_list();
+
+    return retval;
+}
+
+/**
+ * Produce an anonymized copy of the given text.
+ *
+ * The text is split into tokens with data_scanner and each token is
+ * rewritten according to its type:
+ *
+ *   - URLs are rebuilt with every part (user, password, host, path, query,
+ *     fragment) replaced individually.
+ *   - File paths are anonymized one component at a time.
+ *   - Credit card numbers, hex dumps, and UUIDs are replaced by values
+ *     derived from a hash of the original text.
+ *   - MAC, IPv4, and IPv6 addresses are replaced by sequentially numbered
+ *     addresses starting at 00-00-5E-00-53-00, 10.0.0.1, and 2001:db8::1.
+ *   - Email addresses become "<animal>@<disease>.example.com".
+ *   - Words/symbols are split on separators and digits; pieces longer than
+ *     four characters are replaced by an entry from the word list.
+ *   - The contents of quoted strings and XML attribute values are
+ *     anonymized recursively.
+ *   - Every other token is copied unchanged.
+ *
+ * NOTE(review): get_default() is declared outside of this file.  It is
+ * assumed to remember the provider's result for each original text so that
+ * repeated inputs get the same replacement -- confirm in text_anonymizer.hh.
+ *
+ * @param line The text to anonymize.
+ * @return The anonymized text.
+ */
+std::string
+text_anonymizer::next(string_fragment line)
+{
+    data_scanner ds(line);
+    std::string retval;
+
+    while (true) {
+        auto tok_res = ds.tokenize2();
+        if (!tok_res) {
+            break;
+        }
+
+        switch (tok_res->tr_token) {
+            case DT_URL: {
+                auto url_str = tok_res->to_string();
+                auto_mem<CURLU> cu(curl_url_cleanup);
+                cu = curl_url();
+
+                if (curl_url_set(cu, CURLUPART_URL, url_str.c_str(), 0)
+                    != CURLUE_OK)
+                {
+                    retval += "<unparseable-url>";
+                } else {
+                    auto_mem<char> url_part(curl_free);
+
+                    if (curl_url_get(
+                            cu, CURLUPART_USER, url_part.out(), CURLU_URLDECODE)
+                        == CURLUE_OK)
+                    {
+                        auto anon_user = this->get_default(
+                            this->ta_user_names,
+                            url_part.in(),
+                            [](size_t size, auto& user) {
+                                return get_animal_list().at_index(size);
+                            });
+                        curl_url_set(cu,
+                                     CURLUPART_USER,
+                                     anon_user.c_str(),
+                                     CURLU_URLENCODE);
+                    }
+
+                    if (curl_url_get(cu,
+                                     CURLUPART_PASSWORD,
+                                     url_part.out(),
+                                     CURLU_URLDECODE)
+                        == CURLUE_OK)
+                    {
+                        // Passwords are replaced by a hash of the original.
+                        auto anon_pass
+                            = hasher()
+                                  .update(url_part.in(), strlen(url_part.in()))
+                                  .to_string();
+                        curl_url_set(cu,
+                                     CURLUPART_PASSWORD,
+                                     anon_pass.c_str(),
+                                     CURLU_URLENCODE);
+                    }
+
+                    if (curl_url_get(
+                            cu, CURLUPART_HOST, url_part.out(), CURLU_URLDECODE)
+                        == CURLUE_OK)
+                    {
+                        auto anon_host = this->get_default(
+                            this->ta_host_names,
+                            url_part.in(),
+                            [](size_t size, auto& hn) {
+                                const auto& diseases = get_disease_list();
+
+                                return fmt::format(
+                                    FMT_STRING("{}.example.com"),
+                                    diseases.at_index(size));
+                            });
+                        curl_url_set(cu,
+                                     CURLUPART_HOST,
+                                     anon_host.c_str(),
+                                     CURLU_URLENCODE);
+                    }
+
+                    if (curl_url_get(
+                            cu, CURLUPART_PATH, url_part.out(), CURLU_URLDECODE)
+                        == CURLUE_OK)
+                    {
+                        ghc::filesystem::path url_path(url_part.in());
+                        ghc::filesystem::path anon_path;
+
+                        for (const auto& comp : url_path) {
+                            // Keep the root ("/") as-is, anonymize the rest.
+                            if (comp == comp.root_path()) {
+                                anon_path = anon_path / comp;
+                                continue;
+                            }
+                            anon_path = anon_path / this->next(comp.string());
+                        }
+                        curl_url_set(cu,
+                                     CURLUPART_PATH,
+                                     anon_path.c_str(),
+                                     CURLU_URLENCODE);
+                    }
+
+                    if (curl_url_get(cu,
+                                     CURLUPART_QUERY,
+                                     url_part.out(),
+                                     CURLU_URLDECODE)
+                        == CURLUE_OK)
+                    {
+                        static const auto SPLIT_RE
+                            = lnav::pcre2pp::code::from_const(R"((&))");
+
+                        // Clear the query so the anonymized parameters can
+                        // be appended one at a time below.
+                        curl_url_set(cu, CURLUPART_QUERY, nullptr, 0);
+
+                        auto url_query
+                            = string_fragment::from_c_str(url_part.in());
+                        auto replacer = [this, &cu](const std::string& comp) {
+                            std::string anon_query;
+
+                            auto eq_index = comp.find('=');
+                            if (eq_index != std::string::npos) {
+                                // "key=value": anonymize each side on its
+                                // own so the '=' is preserved.
+                                auto new_key
+                                    = this->next(comp.substr(0, eq_index));
+                                auto new_value
+                                    = this->next(comp.substr(eq_index + 1));
+                                anon_query = fmt::format(
+                                    FMT_STRING("{}={}"), new_key, new_value);
+                            } else {
+                                anon_query = this->next(comp);
+                            }
+
+                            curl_url_set(cu,
+                                         CURLUPART_QUERY,
+                                         anon_query.c_str(),
+                                         CURLU_URLENCODE | CURLU_APPENDQUERY);
+                        };
+
+                        // Handle each parameter that is followed by a '&'
+                        // and then the text left over after the last '&'.
+                        auto loop_res
+                            = SPLIT_RE.capture_from(url_query).for_each(
+                                [&replacer](lnav::pcre2pp::match_data& md) {
+                                    replacer(md.leading().to_string());
+                                });
+                        if (loop_res.isOk()) {
+                            replacer(loop_res.unwrap().to_string());
+                        }
+                    }
+
+                    if (curl_url_get(cu,
+                                     CURLUPART_FRAGMENT,
+                                     url_part.out(),
+                                     CURLU_URLDECODE)
+                        == CURLUE_OK)
+                    {
+                        auto anon_frag = this->next(
+                            string_fragment::from_c_str(url_part.in()));
+
+                        curl_url_set(cu,
+                                     CURLUPART_FRAGMENT,
+                                     anon_frag.c_str(),
+                                     CURLU_URLENCODE);
+                    }
+
+                    auto_mem<char> anon_url(curl_free);
+                    if (curl_url_get(cu, CURLUPART_URL, anon_url.out(), 0)
+                        == CURLUE_OK)
+                    {
+                        retval.append(anon_url.in());
+                    } else {
+                        // Do not silently drop the token if the rewritten
+                        // URL cannot be put back together.
+                        retval += "<unparseable-url>";
+                    }
+                }
+                break;
+            }
+            case DT_PATH: {
+                ghc::filesystem::path inp_path(tok_res->to_string());
+                ghc::filesystem::path anon_path;
+
+                for (const auto& comp : inp_path) {
+                    auto comp_str = comp.string();
+                    // The root and a path made of a single component are
+                    // copied through unchanged.
+                    if (comp == comp.root_path() || comp == inp_path) {
+                        anon_path = anon_path / comp;
+                        continue;
+                    }
+                    anon_path = anon_path / this->next(comp_str);
+                }
+
+                retval += anon_path.string();
+                break;
+            }
+            case DT_CREDIT_CARD_NUMBER: {
+                auto cc = tok_res->to_string();
+                // More than 16 characters means the digits were written
+                // in space-separated groups.
+                auto has_spaces = cc.size() > 16;
+                auto new_end = std::remove_if(
+                    cc.begin(), cc.end(), [](auto ch) { return ch == ' '; });
+                cc.erase(new_end, cc.end());
+                auto anon_cc = hasher().update(cc).to_string().substr(0, 16);
+
+                if (has_spaces) {
+                    // Insert from the back so earlier offsets stay valid.
+                    anon_cc.insert(12, " ");
+                    anon_cc.insert(8, " ");
+                    anon_cc.insert(4, " ");
+                }
+
+                retval += anon_cc;
+                break;
+            }
+            case DT_MAC_ADDRESS: {
+                // 00-00-5E-00-53-00
+                auto mac_addr = tok_res->to_string();
+
+                retval += this->get_default(
+                    this->ta_mac_addresses,
+                    mac_addr,
+                    [](size_t size, auto& inp) {
+                        uint32_t base_mac = 0x5e005300;
+
+                        base_mac += size;
+                        auto anon_mac = byte_array<6>::from({
+                            0x00,
+                            0x00,
+                            (unsigned char) ((base_mac >> 24) & 0xff),
+                            (unsigned char) ((base_mac >> 16) & 0xff),
+                            (unsigned char) ((base_mac >> 8) & 0xff),
+                            (unsigned char) ((base_mac >> 0) & 0xff),
+                        });
+
+                        // inp[2] is the separator used by the original
+                        // address (the character after the first octet).
+                        return anon_mac.to_string(
+                            nonstd::make_optional(inp[2]));
+                    });
+                break;
+            }
+            case DT_HEX_DUMP: {
+                auto hex_str = tok_res->to_string();
+                // Reuse the original's separator (the third character).
+                auto hash_str = hasher().update(hex_str).to_array().to_string(
+                    nonstd::make_optional(hex_str[2]));
+                std::string anon_hex;
+
+                // Repeat the hash until it covers the original length and
+                // then trim it to exactly that length.
+                while (anon_hex.size() < hex_str.size()) {
+                    anon_hex += hash_str;
+                }
+                anon_hex.resize(hex_str.size());
+
+                retval += anon_hex;
+                break;
+            }
+            case DT_IPV4_ADDRESS: {
+                auto ipv4 = tok_res->to_string();
+                retval += this->get_default(
+                    this->ta_ipv4_addresses, ipv4, [](size_t size, auto& _) {
+                        char anon_ipv4[INET_ADDRSTRLEN];
+                        struct in_addr ia;
+
+                        // Number the replacements upward from 10.0.0.1.
+                        inet_aton("10.0.0.0", &ia);
+                        ia.s_addr = htonl(ntohl(ia.s_addr) + 1 + size);
+                        inet_ntop(AF_INET, &ia, anon_ipv4, sizeof(anon_ipv4));
+                        return std::string{anon_ipv4};
+                    });
+                break;
+            }
+            case DT_IPV6_ADDRESS: {
+                auto ipv6 = tok_res->to_string();
+                retval += this->get_default(
+                    this->ta_ipv6_addresses, ipv6, [](size_t size, auto& _) {
+                        char anon_ipv6[INET6_ADDRSTRLEN];
+                        struct in6_addr ia;
+                        uint32_t low_word;
+
+                        // Number the replacements upward from 2001:db8::1.
+                        inet_pton(AF_INET6, "2001:db8::", &ia);
+                        // Copy the last four bytes out and back in rather
+                        // than writing through a uint32_t* that aliases the
+                        // byte array.
+                        memcpy(&low_word, &ia.s6_addr[12], sizeof(low_word));
+                        low_word = htonl(ntohl(low_word) + 1 + size);
+                        memcpy(&ia.s6_addr[12], &low_word, sizeof(low_word));
+                        inet_ntop(AF_INET6, &ia, anon_ipv6, sizeof(anon_ipv6));
+                        return std::string{anon_ipv6};
+                    });
+                break;
+            }
+            case DT_EMAIL: {
+                auto email_addr = tok_res->to_string();
+                auto at_index = email_addr.find('@');
+
+                retval += fmt::format(
+                    FMT_STRING("{}@{}"),
+                    this->get_default(this->ta_user_names,
+                                      email_addr.substr(0, at_index),
+                                      [](auto size, const auto& inp) {
+                                          return get_animal_list().at_index(
+                                              size);
+                                      }),
+                    // Store the full "<disease>.example.com" name, which is
+                    // what the DT_URL case stores in ta_host_names, so a
+                    // host seen in both a URL and an email address is
+                    // rendered the same way in each.
+                    this->get_default(
+                        this->ta_host_names,
+                        email_addr.substr(at_index + 1),
+                        [](auto size, const auto& inp) {
+                            return fmt::format(
+                                FMT_STRING("{}.example.com"),
+                                get_disease_list().at_index(size));
+                        }));
+                break;
+            }
+            case DT_WORD:
+            case DT_SYMBOL: {
+                static const auto SPLIT_RE = lnav::pcre2pp::code::from_const(
+                    R"((\.|::|_|-|/|\\|\d+))");
+                auto symbol_frag = ds.to_string_fragment(tok_res->tr_capture);
+                auto sym_provider = [](auto size, const auto& inp) {
+                    // Short pieces are kept as they are.
+                    if (inp.size() <= 4) {
+                        return inp;
+                    }
+
+                    // Give the replacement word the same case style as
+                    // the original piece.
+                    auto comp_frag = string_fragment::from_str(inp);
+                    return string_fragment::from_str(
+                               get_word_list().at_index(size))
+                        .to_string_with_case_style(
+                            comp_frag.detect_text_case_style());
+                };
+
+                // Replace each piece in front of a separator/number and
+                // copy the separator/number itself unchanged.
+                auto cap_res
+                    = SPLIT_RE.capture_from(symbol_frag)
+                          .for_each([this, &retval, &sym_provider](
+                                        lnav::pcre2pp::match_data& md) {
+                              auto comp = md.leading().to_string();
+                              retval
+                                  += this->get_default(
+                                         this->ta_symbols, comp, sym_provider)
+                                  + md[0]->to_string();
+                          });
+                if (cap_res.isErr()) {
+                    retval += "<symbol>";
+                } else {
+                    // The text after the last separator/number.
+                    auto remaining = cap_res.unwrap().to_string();
+
+                    retval += this->get_default(
+                        this->ta_symbols, remaining, sym_provider);
+                }
+                break;
+            }
+            case DT_QUOTED_STRING: {
+                auto anon_inner = this->next(
+                    ds.to_string_fragment(tok_res->tr_inner_capture)
+                        .to_string());
+
+                // Output is: the original text from the start of the token
+                // up to the start of the inner capture, the anonymized
+                // inner text, then the token's last character.
+                retval += line.sub_range(tok_res->tr_capture.c_begin,
+                                         tok_res->tr_inner_capture.c_begin)
+                              .to_string()
+                    + anon_inner
+                    + ds.to_string_fragment(tok_res->tr_capture).back();
+                break;
+            }
+            case DT_XML_OPEN_TAG: {
+                auto open_tag = tok_res->to_string();
+                auto space_index = open_tag.find(' ');
+
+                if (space_index == std::string::npos) {
+                    // No space means no attributes to anonymize.
+                    retval += open_tag;
+                } else {
+                    static const auto ATTR_RE
+                        = lnav::pcre2pp::code::from_const(R"([\w\-]+=)");
+                    static thread_local auto md
+                        = lnav::pcre2pp::match_data::unitialized();
+
+                    auto remaining = string_fragment::from_str_range(
+                        open_tag, space_index, open_tag.size());
+
+                    // Copy the tag name and the space that follows it.
+                    retval += open_tag.substr(0, space_index + 1);
+                    while (!remaining.empty()) {
+                        auto cap_res = ATTR_RE.capture_from(remaining)
+                                           .into(md)
+                                           .matches()
+                                           .ignore_error();
+
+                        if (!cap_res) {
+                            break;
+                        }
+
+                        // Copy everything up to and including "name=".
+                        retval += md.leading();
+                        retval += md[0]->to_string();
+                        remaining = md.remaining();
+                        // A separate scanner reads the attribute value;
+                        // the outer scanner is left untouched.
+                        data_scanner attr_ds(remaining);
+                        auto attr_tok_res = attr_ds.tokenize2();
+                        if (!attr_tok_res) {
+                            continue;
+                        }
+                        retval += this->next(attr_tok_res->to_string());
+                        // NOTE(review): this assumes the value token starts
+                        // at the beginning of `remaining` -- confirm.
+                        remaining = remaining.substr(
+                            attr_tok_res->tr_capture.length());
+                    }
+
+                    // Whatever is left (e.g. the closing '>') is copied.
+                    retval += remaining.to_string();
+                }
+                break;
+            }
+            case DT_UUID: {
+                retval
+                    += hasher().update(tok_res->to_string()).to_uuid_string();
+                break;
+            }
+            default: {
+                retval += tok_res->to_string();
+                break;
+            }
+        }
+    }
+
+    return retval;
+}
+
+} // namespace lnav