diff options
Diffstat (limited to 'src/log_format_impls.cc')
-rw-r--r-- | src/log_format_impls.cc | 1775 |
1 files changed, 1775 insertions, 0 deletions
diff --git a/src/log_format_impls.cc b/src/log_format_impls.cc new file mode 100644 index 0000000..96d37f5 --- /dev/null +++ b/src/log_format_impls.cc @@ -0,0 +1,1775 @@ +/** + * Copyright (c) 2007-2017, Timothy Stack + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * * Neither the name of Timothy Stack nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @file log_format_impls.cc + */ + +#include <algorithm> +#include <utility> + +#include "log_format.hh" + +#include <stdio.h> + +#include "base/injector.bind.hh" +#include "base/opt_util.hh" +#include "config.h" +#include "formats/logfmt/logfmt.parser.hh" +#include "log_vtab_impl.hh" +#include "sql_util.hh" +#include "yajlpp/yajlpp.hh" + +class generic_log_format : public log_format { + static pcre_format* get_pcre_log_formats() + { + static pcre_format log_fmt[] = { + pcre_format( + "^(?:\\*\\*\\*\\s+)?(?<timestamp>@[0-9a-zA-Z]{16,24})(.*)"), + pcre_format( + "^(?:\\*\\*\\*\\s+)?(?<timestamp>[\\dTZ: +/\\-,\\.-]+)([^:]+)"), + pcre_format( + "^(?:\\*\\*\\*\\s+)?(?<timestamp>[\\w:+/\\.-]+) \\[\\w (.*)"), + pcre_format("^(?:\\*\\*\\*\\s+)?(?<timestamp>[\\w:,/\\.-]+) (.*)"), + pcre_format( + "^(?:\\*\\*\\*\\s+)?(?<timestamp>[\\w:,/\\.-]+) - (.*)"), + pcre_format( + "^(?:\\*\\*\\*\\s+)?(?<timestamp>[\\w: \\.,/-]+) - (.*)"), + pcre_format("^(?:\\*\\*\\*\\s+)?(?<timestamp>[\\w: " + "\\.,/-]+)\\[[^\\]]+\\](.*)"), + pcre_format("^(?:\\*\\*\\*\\s+)?(?<timestamp>[\\w: \\.,/-]+) (.*)"), + + pcre_format( + R"(^(?:\*\*\*\s+)?\[(?<timestamp>[\w: \.,+/-]+)\]\s*(\w+):?)"), + pcre_format( + "^(?:\\*\\*\\*\\s+)?\\[(?<timestamp>[\\w: \\.,+/-]+)\\] (.*)"), + pcre_format("^(?:\\*\\*\\*\\s+)?\\[(?<timestamp>[\\w: " + "\\.,+/-]+)\\] \\[(\\w+)\\]"), + pcre_format("^(?:\\*\\*\\*\\s+)?\\[(?<timestamp>[\\w: " + "\\.,+/-]+)\\] \\w+ (.*)"), + pcre_format("^(?:\\*\\*\\*\\s+)?\\[(?<timestamp>[\\w: ,+/-]+)\\] " + "\\(\\d+\\) (.*)"), + + pcre_format(), + }; + + return log_fmt; + } + + std::string get_pattern_regex(uint64_t line_number) const override + { + int pat_index = this->pattern_index_for_line(line_number); + return get_pcre_log_formats()[pat_index].name; + } + + const intern_string_t get_name() const override + { + return intern_string::lookup("generic_log"); + } + + scan_result_t scan(logfile& lf, + std::vector<logline>& dst, + const line_info& li, + shared_buffer_ref& sbr, + scan_batch_context& sbc) override + { + struct exttm log_time; + struct timeval log_tv; + string_fragment ts; + nonstd::optional<string_fragment> level; + const char* last_pos; + + if ((last_pos = this->log_scanf(dst.size(), + sbr.to_string_fragment(), + get_pcre_log_formats(), + nullptr, + &log_time, + &log_tv, + + &ts, + &level)) + != nullptr) + { + log_level_t level_val = log_level_t::LEVEL_UNKNOWN; + if (level) { + level_val = string2level(level->data(), level->length()); + } + + if (!((log_time.et_flags & ETF_DAY_SET) + && (log_time.et_flags & ETF_MONTH_SET) + && (log_time.et_flags & ETF_YEAR_SET))) + { + this->check_for_new_year(dst, log_time, log_tv); + } + + dst.emplace_back(li.li_file_range.fr_offset, log_tv, level_val); + return scan_match{0}; + } + + return scan_no_match{"no patterns matched"}; + } + + void annotate(uint64_t line_number, + string_attrs_t& sa, + logline_value_vector& values, + bool annotate_module) const override + { + auto& line = values.lvv_sbr; + int pat_index = this->pattern_index_for_line(line_number); + auto& fmt = get_pcre_log_formats()[pat_index]; + int prefix_len = 0; + auto md = fmt.pcre->create_match_data(); + auto match_res = fmt.pcre->capture_from(line.to_string_fragment()) + .into(md) + .matches(PCRE2_NO_UTF_CHECK) + .ignore_error(); + if (!match_res) { + return; + } + + auto lr = to_line_range(md[fmt.pf_timestamp_index].value()); + sa.emplace_back(lr, logline::L_TIMESTAMP.value()); + + prefix_len = lr.lr_end; + auto level_cap = md[2]; + if (level_cap) { + if (string2level(level_cap->data(), level_cap->length(), true) + != LEVEL_UNKNOWN) + { + prefix_len = level_cap->sf_end; + } + } + + lr.lr_start = 0; + lr.lr_end = prefix_len; + sa.emplace_back(lr, logline::L_PREFIX.value()); + + lr.lr_start = prefix_len; + lr.lr_end = line.length(); + sa.emplace_back(lr, SA_BODY.value()); + } + + std::shared_ptr<log_format> specialized(int fmt_lock) override + { + auto retval = std::make_shared<generic_log_format>(*this); + + retval->lf_specialized = true; + return retval; + } +}; + +std::string +from_escaped_string(const char* str, size_t len) +{ + std::string retval; + + for (size_t lpc = 0; lpc < len; lpc++) { + switch (str[lpc]) { + case '\\': + if ((lpc + 3) < len && str[lpc + 1] == 'x') { + int ch; + + if (sscanf(&str[lpc + 2], "%2x", &ch) == 1) { + retval.append(1, (char) ch & 0xff); + lpc += 3; + } + } + break; + default: + retval.append(1, str[lpc]); + break; + } + } + + return retval; +} + +nonstd::optional<const char*> +lnav_strnstr(const char* s, const char* find, size_t slen) +{ + char c, sc; + size_t len; + + if ((c = *find++) != '\0') { + len = strlen(find); + do { + do { + if (slen < 1 || (sc = *s) == '\0') { + return nonstd::nullopt; + } + --slen; + ++s; + } while (sc != c); + if (len > slen) { + return nonstd::nullopt; + } + } while (strncmp(s, find, len) != 0); + s--; + } + return s; +} + +struct separated_string { + const char* ss_str; + size_t ss_len; + const char* ss_separator; + size_t ss_separator_len; + + separated_string(const char* str, size_t len) + : ss_str(str), ss_len(len), ss_separator(","), + ss_separator_len(strlen(this->ss_separator)) + { + } + + separated_string& with_separator(const char* sep) + { + this->ss_separator = sep; + this->ss_separator_len = strlen(sep); + return *this; + } + + struct iterator { + const separated_string& i_parent; + const char* i_pos; + const char* i_next_pos; + size_t i_index; + + iterator(const separated_string& ss, const char* pos) + : i_parent(ss), i_pos(pos), i_next_pos(pos), i_index(0) + { + this->update(); + } + + void update() + { + const separated_string& ss = this->i_parent; + auto next_field + = lnav_strnstr(this->i_pos, + ss.ss_separator, + ss.ss_len - (this->i_pos - ss.ss_str)); + if (next_field) { + this->i_next_pos = next_field.value() + ss.ss_separator_len; + } else { + this->i_next_pos = ss.ss_str + ss.ss_len; + } + } + + iterator& operator++() + { + this->i_pos = this->i_next_pos; + this->update(); + this->i_index += 1; + + return *this; + } + + string_fragment operator*() + { + const auto& ss = this->i_parent; + int end; + + if (this->i_next_pos < (ss.ss_str + ss.ss_len)) { + end = this->i_next_pos - ss.ss_str - ss.ss_separator_len; + } else { + end = this->i_next_pos - ss.ss_str; + } + return string_fragment::from_byte_range( + ss.ss_str, this->i_pos - ss.ss_str, end); + } + + bool operator==(const iterator& other) const + { + return (&this->i_parent == &other.i_parent) + && (this->i_pos == other.i_pos); + } + + bool operator!=(const iterator& other) const + { + return !(*this == other); + } + + size_t index() const { return this->i_index; } + }; + + iterator begin() { return {*this, this->ss_str}; } + + iterator end() { return {*this, this->ss_str + this->ss_len}; } +}; + +class bro_log_format : public log_format { +public: + struct field_def { + logline_value_meta fd_meta; + std::string fd_collator; + nonstd::optional<size_t> fd_numeric_index; + + explicit field_def(const intern_string_t name, + int col, + log_format* format) + : fd_meta(name, value_kind_t::VALUE_TEXT, col, format) + { + } + + field_def& with_kind(value_kind_t kind, + bool identifier = false, + const std::string& collator = "") + { + this->fd_meta.lvm_kind = kind; + this->fd_meta.lvm_identifier = identifier; + this->fd_collator = collator; + return *this; + } + + field_def& with_numeric_index(size_t index) + { + this->fd_numeric_index = index; + return *this; + } + }; + + bro_log_format() + { + this->lf_is_self_describing = true; + this->lf_time_ordered = false; + } + + const intern_string_t get_name() const override + { + static const intern_string_t name(intern_string::lookup("bro")); + + return this->blf_format_name.empty() ? name : this->blf_format_name; + } + + void clear() override + { + this->log_format::clear(); + this->blf_format_name.clear(); + this->blf_field_defs.clear(); + } + + scan_result_t scan_int(std::vector<logline>& dst, + const line_info& li, + shared_buffer_ref& sbr) + { + static const intern_string_t STATUS_CODE + = intern_string::lookup("bro_status_code"); + static const intern_string_t TS = intern_string::lookup("bro_ts"); + static const intern_string_t UID = intern_string::lookup("bro_uid"); + + separated_string ss(sbr.get_data(), sbr.length()); + struct timeval tv; + struct exttm tm; + bool found_ts = false; + log_level_t level = LEVEL_INFO; + uint8_t opid = 0; + + ss.with_separator(this->blf_separator.get()); + + for (auto iter = ss.begin(); iter != ss.end(); ++iter) { + if (iter.index() == 0 && *iter == "#close") { + return scan_match{0}; + } + + if (iter.index() >= this->blf_field_defs.size()) { + break; + } + + const auto& fd = this->blf_field_defs[iter.index()]; + + if (TS == fd.fd_meta.lvm_name) { + string_fragment sf = *iter; + + if (this->lf_date_time.scan( + sf.data(), sf.length(), nullptr, &tm, tv)) + { + this->lf_timestamp_flags = tm.et_flags; + found_ts = true; + } + } else if (STATUS_CODE == fd.fd_meta.lvm_name) { + string_fragment sf = *iter; + + if (!sf.empty() && sf[0] >= '4') { + level = LEVEL_ERROR; + } + } else if (UID == fd.fd_meta.lvm_name) { + string_fragment sf = *iter; + + opid = hash_str(sf.data(), sf.length()); + } + + if (fd.fd_numeric_index) { + switch (fd.fd_meta.lvm_kind) { + case value_kind_t::VALUE_INTEGER: + case value_kind_t::VALUE_FLOAT: { + string_fragment sf = *iter; + char field_copy[sf.length() + 1]; + double val; + + if (sscanf(sf.to_string(field_copy), "%lf", &val) == 1) + { + this->lf_value_stats[fd.fd_numeric_index.value()] + .add_value(val); + } + break; + } + default: + break; + } + } + } + + if (found_ts) { + if (!this->lf_specialized) { + for (auto& ll : dst) { + ll.set_ignore(true); + } + } + dst.emplace_back(li.li_file_range.fr_offset, tv, level, 0, opid); + return scan_match{0}; + } + return scan_no_match{}; + } + + scan_result_t scan(logfile& lf, + std::vector<logline>& dst, + const line_info& li, + shared_buffer_ref& sbr, + scan_batch_context& sbc) override + { + static const auto SEP_RE + = lnav::pcre2pp::code::from_const(R"(^#separator\s+(.+))"); + + if (!this->blf_format_name.empty()) { + return this->scan_int(dst, li, sbr); + } + + if (dst.empty() || dst.size() > 20 || sbr.empty() + || sbr.get_data()[0] == '#') + { + return scan_no_match{}; + } + + auto line_iter = dst.begin(); + auto read_result = lf.read_line(line_iter); + + if (read_result.isErr()) { + return scan_no_match{"unable to read first line"}; + } + + auto line = read_result.unwrap(); + auto md = SEP_RE.create_match_data(); + + auto match_res = SEP_RE.capture_from(line.to_string_fragment()) + .into(md) + .matches(PCRE2_NO_UTF_CHECK) + .ignore_error(); + if (!match_res) { + return scan_no_match{"cannot read separator header"}; + } + + this->clear(); + + auto sep = from_escaped_string(md[1]->data(), md[1]->length()); + this->blf_separator = intern_string::lookup(sep); + + for (++line_iter; line_iter != dst.end(); ++line_iter) { + auto next_read_result = lf.read_line(line_iter); + + if (next_read_result.isErr()) { + return scan_no_match{"unable to read header line"}; + } + + line = next_read_result.unwrap(); + separated_string ss(line.get_data(), line.length()); + + ss.with_separator(this->blf_separator.get()); + auto iter = ss.begin(); + + string_fragment directive = *iter; + + if (directive.empty() || directive[0] != '#') { + continue; + } + + ++iter; + if (iter == ss.end()) { + continue; + } + + if (directive == "#set_separator") { + this->blf_set_separator = intern_string::lookup(*iter); + } else if (directive == "#empty_field") { + this->blf_empty_field = intern_string::lookup(*iter); + } else if (directive == "#unset_field") { + this->blf_unset_field = intern_string::lookup(*iter); + } else if (directive == "#path") { + auto full_name = fmt::format(FMT_STRING("bro_{}_log"), *iter); + this->blf_format_name = intern_string::lookup(full_name); + } else if (directive == "#fields" && this->blf_field_defs.empty()) { + do { + this->blf_field_defs.emplace_back( + intern_string::lookup("bro_" + sql_safe_ident(*iter)), + this->blf_field_defs.size(), + this); + ++iter; + } while (iter != ss.end()); + } else if (directive == "#types") { + static const char* KNOWN_IDS[] = { + "bro_conn_uids", + "bro_fuid", + "bro_host", + "bro_info_code", + "bro_method", + "bro_mime_type", + "bro_orig_fuids", + "bro_parent_fuid", + "bro_proto", + "bro_referrer", + "bro_resp_fuids", + "bro_service", + "bro_status_code", + "bro_uid", + "bro_uri", + "bro_user_agent", + "bro_username", + }; + + int numeric_count = 0; + + do { + string_fragment field_type = *iter; + auto& fd = this->blf_field_defs[iter.index() - 1]; + + if (field_type == "time") { + fd.with_kind(value_kind_t::VALUE_TIMESTAMP); + } else if (field_type == "string") { + bool ident = std::binary_search(std::begin(KNOWN_IDS), + std::end(KNOWN_IDS), + fd.fd_meta.lvm_name); + fd.with_kind(value_kind_t::VALUE_TEXT, ident); + } else if (field_type == "count") { + bool ident = std::binary_search(std::begin(KNOWN_IDS), + std::end(KNOWN_IDS), + fd.fd_meta.lvm_name); + fd.with_kind(value_kind_t::VALUE_INTEGER, ident) + .with_numeric_index(numeric_count); + numeric_count += 1; + } else if (field_type == "bool") { + fd.with_kind(value_kind_t::VALUE_BOOLEAN); + } else if (field_type == "addr") { + fd.with_kind( + value_kind_t::VALUE_TEXT, true, "ipaddress"); + } else if (field_type == "port") { + fd.with_kind(value_kind_t::VALUE_INTEGER, true); + } else if (field_type == "interval") { + fd.with_kind(value_kind_t::VALUE_FLOAT) + .with_numeric_index(numeric_count); + numeric_count += 1; + } + + ++iter; + } while (iter != ss.end()); + + this->lf_value_stats.resize(numeric_count); + } + } + + if (!this->blf_format_name.empty() && !this->blf_separator.empty() + && !this->blf_field_defs.empty()) + { + dst.clear(); + return this->scan_int(dst, li, sbr); + } + + this->blf_format_name.clear(); + this->lf_value_stats.clear(); + + return scan_no_match{}; + } + + void annotate(uint64_t line_number, + string_attrs_t& sa, + logline_value_vector& values, + bool annotate_module) const override + { + static const intern_string_t TS = intern_string::lookup("bro_ts"); + static const intern_string_t UID = intern_string::lookup("bro_uid"); + + auto& sbr = values.lvv_sbr; + separated_string ss(sbr.get_data(), sbr.length()); + + ss.with_separator(this->blf_separator.get()); + + for (auto iter = ss.begin(); iter != ss.end(); ++iter) { + if (iter.index() >= this->blf_field_defs.size()) { + return; + } + + const field_def& fd = this->blf_field_defs[iter.index()]; + string_fragment sf = *iter; + + if (sf == this->blf_empty_field) { + sf.clear(); + } else if (sf == this->blf_unset_field) { + sf.invalidate(); + } + + auto lr = line_range(sf.sf_begin, sf.sf_end); + + if (fd.fd_meta.lvm_name == TS) { + sa.emplace_back(lr, logline::L_TIMESTAMP.value()); + } else if (fd.fd_meta.lvm_name == UID) { + sa.emplace_back(lr, logline::L_OPID.value()); + } + + if (lr.is_valid()) { + values.lvv_values.emplace_back(fd.fd_meta, sbr, lr); + } else { + values.lvv_values.emplace_back(fd.fd_meta); + } + } + } + + const logline_value_stats* stats_for_value( + const intern_string_t& name) const override + { + const logline_value_stats* retval = nullptr; + + for (const auto& blf_field_def : this->blf_field_defs) { + if (blf_field_def.fd_meta.lvm_name == name) { + if (!blf_field_def.fd_numeric_index) { + break; + } + retval = &this->lf_value_stats[blf_field_def.fd_numeric_index + .value()]; + break; + } + } + + return retval; + } + + bool hide_field(const intern_string_t field_name, bool val) override + { + auto fd_iter + = std::find_if(this->blf_field_defs.begin(), + this->blf_field_defs.end(), + [field_name](const field_def& elem) { + return elem.fd_meta.lvm_name == field_name; + }); + if (fd_iter == this->blf_field_defs.end()) { + return false; + } + + fd_iter->fd_meta.lvm_user_hidden = val; + return true; + } + + std::shared_ptr<log_format> specialized(int fmt_lock = -1) override + { + auto retval = std::make_shared<bro_log_format>(*this); + + retval->lf_specialized = true; + return retval; + } + + class bro_log_table : public log_format_vtab_impl { + public: + explicit bro_log_table(const bro_log_format& format) + : log_format_vtab_impl(format), blt_format(format) + { + } + + void get_columns(std::vector<vtab_column>& cols) const override + { + for (const auto& fd : this->blt_format.blf_field_defs) { + std::pair<int, unsigned int> type_pair + = log_vtab_impl::logline_value_to_sqlite_type( + fd.fd_meta.lvm_kind); + + cols.emplace_back(fd.fd_meta.lvm_name.to_string(), + type_pair.first, + fd.fd_collator, + false, + "", + type_pair.second); + } + } + + void get_foreign_keys( + std::vector<std::string>& keys_inout) const override + { + this->log_vtab_impl::get_foreign_keys(keys_inout); + + for (const auto& fd : this->blt_format.blf_field_defs) { + if (fd.fd_meta.lvm_identifier) { + keys_inout.push_back(fd.fd_meta.lvm_name.to_string()); + } + } + } + + const bro_log_format& blt_format; + }; + + static std::map<intern_string_t, std::shared_ptr<bro_log_table>>& + get_tables() + { + static std::map<intern_string_t, std::shared_ptr<bro_log_table>> retval; + + return retval; + } + + std::shared_ptr<log_vtab_impl> get_vtab_impl() const override + { + if (this->blf_format_name.empty()) { + return nullptr; + } + + std::shared_ptr<bro_log_table> retval = nullptr; + + auto& tables = get_tables(); + auto iter = tables.find(this->blf_format_name); + if (iter == tables.end()) { + retval = std::make_shared<bro_log_table>(*this); + tables[this->blf_format_name] = retval; + } + + return retval; + } + + void get_subline(const logline& ll, + shared_buffer_ref& sbr, + bool full_message) override + { + } + + intern_string_t blf_format_name; + intern_string_t blf_separator; + intern_string_t blf_set_separator; + intern_string_t blf_empty_field; + intern_string_t blf_unset_field; + std::vector<field_def> blf_field_defs; +}; + +struct ws_separated_string { + const char* ss_str; + size_t ss_len; + + explicit ws_separated_string(const char* str = nullptr, size_t len = -1) + : ss_str(str), ss_len(len) + { + } + + struct iterator { + enum class state_t { + NORMAL, + QUOTED, + }; + + const ws_separated_string& i_parent; + const char* i_pos; + const char* i_next_pos; + size_t i_index{0}; + state_t i_state{state_t::NORMAL}; + + iterator(const ws_separated_string& ss, const char* pos) + : i_parent(ss), i_pos(pos), i_next_pos(pos) + { + this->update(); + } + + void update() + { + const auto& ss = this->i_parent; + bool done = false; + + while (!done && this->i_next_pos < (ss.ss_str + ss.ss_len)) { + switch (this->i_state) { + case state_t::NORMAL: + if (*this->i_next_pos == '"') { + this->i_state = state_t::QUOTED; + } else if (isspace(*this->i_next_pos)) { + done = true; + } + break; + case state_t::QUOTED: + if (*this->i_next_pos == '"') { + this->i_state = state_t::NORMAL; + } + break; + } + if (!done) { + this->i_next_pos += 1; + } + } + } + + iterator& operator++() + { + const auto& ss = this->i_parent; + + this->i_pos = this->i_next_pos; + while (this->i_pos < (ss.ss_str + ss.ss_len) + && isspace(*this->i_pos)) + { + this->i_pos += 1; + this->i_next_pos += 1; + } + this->update(); + this->i_index += 1; + + return *this; + } + + string_fragment operator*() + { + const auto& ss = this->i_parent; + int end = this->i_next_pos - ss.ss_str; + + return string_fragment(ss.ss_str, this->i_pos - ss.ss_str, end); + } + + bool operator==(const iterator& other) const + { + return (&this->i_parent == &other.i_parent) + && (this->i_pos == other.i_pos); + } + + bool operator!=(const iterator& other) const + { + return !(*this == other); + } + + size_t index() const { return this->i_index; } + }; + + iterator begin() { return {*this, this->ss_str}; } + + iterator end() { return {*this, this->ss_str + this->ss_len}; } +}; + +class w3c_log_format : public log_format { +public: + struct field_def { + const intern_string_t fd_name; + logline_value_meta fd_meta; + std::string fd_collator; + nonstd::optional<size_t> fd_numeric_index; + + explicit field_def(const intern_string_t name) + : fd_name(name), fd_meta(intern_string::lookup(sql_safe_ident( + name.to_string_fragment())), + value_kind_t::VALUE_TEXT) + { + } + + field_def(const intern_string_t name, logline_value_meta meta) + : fd_name(name), fd_meta(meta) + { + } + + field_def(int col, + const char* name, + value_kind_t kind, + bool ident = false, + std::string coll = "") + : fd_name(intern_string::lookup(name)), + fd_meta( + intern_string::lookup(sql_safe_ident(string_fragment(name))), + kind, + col), + fd_collator(std::move(coll)) + { + this->fd_meta.lvm_identifier = ident; + } + + field_def& with_kind(value_kind_t kind, + bool identifier = false, + const std::string& collator = "") + { + this->fd_meta.lvm_kind = kind; + this->fd_meta.lvm_identifier = identifier; + this->fd_collator = collator; + return *this; + } + + field_def& with_numeric_index(int index) + { + this->fd_numeric_index = index; + return *this; + } + }; + + struct field_to_struct_t { + field_to_struct_t(const char* prefix, const char* struct_name) + : fs_prefix(prefix), + fs_struct_name(intern_string::lookup(struct_name)) + { + } + + const char* fs_prefix; + intern_string_t fs_struct_name; + }; + + static const std::vector<field_def> KNOWN_FIELDS; + const static std::vector<field_to_struct_t> KNOWN_STRUCT_FIELDS; + + w3c_log_format() + { + this->lf_is_self_describing = true; + this->lf_time_ordered = false; + } + + const intern_string_t get_name() const override + { + static const intern_string_t name(intern_string::lookup("w3c")); + + return this->wlf_format_name.empty() ? name : this->wlf_format_name; + } + + void clear() override + { + this->log_format::clear(); + this->wlf_time_scanner.clear(); + this->wlf_format_name.clear(); + this->wlf_field_defs.clear(); + } + + scan_result_t scan_int(std::vector<logline>& dst, + const line_info& li, + shared_buffer_ref& sbr) + { + static const intern_string_t F_DATE = intern_string::lookup("date"); + static const intern_string_t F_DATE_LOCAL + = intern_string::lookup("date-local"); + static const intern_string_t F_DATE_UTC + = intern_string::lookup("date-UTC"); + static const intern_string_t F_TIME = intern_string::lookup("time"); + static const intern_string_t F_TIME_LOCAL + = intern_string::lookup("time-local"); + static const intern_string_t F_TIME_UTC + = intern_string::lookup("time-UTC"); + static const intern_string_t F_STATUS_CODE + = intern_string::lookup("sc-status"); + + ws_separated_string ss(sbr.get_data(), sbr.length()); + struct timeval date_tv { + 0, 0 + }, time_tv{0, 0}; + struct exttm date_tm, time_tm; + bool found_date = false, found_time = false; + log_level_t level = LEVEL_INFO; + + for (auto iter = ss.begin(); iter != ss.end(); ++iter) { + if (iter.index() >= this->wlf_field_defs.size()) { + level = LEVEL_INVALID; + break; + } + + const field_def& fd = this->wlf_field_defs[iter.index()]; + string_fragment sf = *iter; + + if (sf.startswith("#")) { + if (sf == "#Date:") { + auto sbr_sf_opt + = sbr.to_string_fragment().consume_n(sf.length()); + + if (sbr_sf_opt) { + auto sbr_sf = sbr_sf_opt.value().trim(); + date_time_scanner dts; + struct exttm tm; + struct timeval tv; + + if (dts.scan(sbr_sf.data(), + sbr_sf.length(), + nullptr, + &tm, + tv)) + { + this->lf_date_time.set_base_time(tv.tv_sec, + tm.et_tm); + this->wlf_time_scanner.set_base_time(tv.tv_sec, + tm.et_tm); + } + } + } + dst.emplace_back( + li.li_file_range.fr_offset, 0, 0, LEVEL_IGNORE, 0); + return scan_match{0}; + } + + sf = sf.trim("\" \t"); + if (F_DATE == fd.fd_name || F_DATE_LOCAL == fd.fd_name + || F_DATE_UTC == fd.fd_name) + { + if (this->lf_date_time.scan( + sf.data(), sf.length(), nullptr, &date_tm, date_tv)) + { + this->lf_timestamp_flags |= date_tm.et_flags; + found_date = true; + } + } else if (F_TIME == fd.fd_name || F_TIME_LOCAL == fd.fd_name + || F_TIME_UTC == fd.fd_name) + { + if (this->wlf_time_scanner.scan( + sf.data(), sf.length(), nullptr, &time_tm, time_tv)) + { + this->lf_timestamp_flags |= time_tm.et_flags; + found_time = true; + } + } else if (F_STATUS_CODE == fd.fd_name) { + if (!sf.empty() && sf[0] >= '4') { + level = LEVEL_ERROR; + } + } + + if (fd.fd_numeric_index) { + switch (fd.fd_meta.lvm_kind) { + case value_kind_t::VALUE_INTEGER: + case value_kind_t::VALUE_FLOAT: { + char field_copy[sf.length() + 1]; + double val; + + if (sscanf(sf.to_string(field_copy), "%lf", &val) == 1) + { + this->lf_value_stats[fd.fd_numeric_index.value()] + .add_value(val); + } + break; + } + default: + break; + } + } + } + + if (found_time) { + struct exttm tm = time_tm; + struct timeval tv; + + if (found_date) { + tm.et_tm.tm_year = date_tm.et_tm.tm_year; + tm.et_tm.tm_mday = date_tm.et_tm.tm_mday; + tm.et_tm.tm_mon = date_tm.et_tm.tm_mon; + tm.et_tm.tm_wday = date_tm.et_tm.tm_wday; + tm.et_tm.tm_yday = date_tm.et_tm.tm_yday; + } + + tv.tv_sec = tm2sec(&tm.et_tm); + tv.tv_usec = tm.et_nsec / 1000; + + if (!this->lf_specialized) { + for (auto& ll : dst) { + ll.set_ignore(true); + } + } + dst.emplace_back(li.li_file_range.fr_offset, tv, level, 0); + return scan_match{0}; + } + + return scan_no_match{}; + } + + scan_result_t scan(logfile& lf, + std::vector<logline>& dst, + const line_info& li, + shared_buffer_ref& sbr, + scan_batch_context& sbc) override + { + static const auto* W3C_LOG_NAME = intern_string::lookup("w3c_log"); + static const auto* X_FIELDS_NAME = intern_string::lookup("x_fields"); + static auto X_FIELDS_IDX = 0; + + if (li.li_partial) { + return scan_incomplete{}; + } + + if (!this->wlf_format_name.empty()) { + return this->scan_int(dst, li, sbr); + } + + if (dst.empty() || dst.size() > 20 || sbr.empty() + || sbr.get_data()[0] == '#') + { + return scan_no_match{}; + } + + this->clear(); + + for (auto line_iter = dst.begin(); line_iter != dst.end(); ++line_iter) + { + auto next_read_result = lf.read_line(line_iter); + + if (next_read_result.isErr()) { + return scan_no_match{"unable to read first line"}; + } + + auto line = next_read_result.unwrap(); + ws_separated_string ss(line.get_data(), line.length()); + auto iter = ss.begin(); + + string_fragment directive = *iter; + + if (directive.empty() || directive[0] != '#') { + continue; + } + + ++iter; + if (iter == ss.end()) { + continue; + } + + if (directive == "#Date:") { + date_time_scanner dts; + struct exttm tm; + struct timeval tv; + + if (dts.scan(line.get_data_at(directive.length() + 1), + line.length() - directive.length() - 1, + nullptr, + &tm, + tv)) + { + this->lf_date_time.set_base_time(tv.tv_sec, tm.et_tm); + this->wlf_time_scanner.set_base_time(tv.tv_sec, tm.et_tm); + } + } else if (directive == "#Fields:" && this->wlf_field_defs.empty()) + { + int numeric_count = 0; + + do { + auto sf = (*iter).trim(")"); + + auto field_iter = std::find_if( + begin(KNOWN_FIELDS), + end(KNOWN_FIELDS), + [&sf](auto elem) { return sf == elem.fd_name; }); + if (field_iter != end(KNOWN_FIELDS)) { + this->wlf_field_defs.emplace_back(*field_iter); + } else if (sf == "date" || sf == "time") { + this->wlf_field_defs.emplace_back( + intern_string::lookup(sf)); + } else { + const auto fs_iter = std::find_if( + begin(KNOWN_STRUCT_FIELDS), + end(KNOWN_STRUCT_FIELDS), + [&sf](auto elem) { + return sf.startswith(elem.fs_prefix); + }); + if (fs_iter != end(KNOWN_STRUCT_FIELDS)) { + auto field_name + = intern_string::lookup(sf.substr(3)); + this->wlf_field_defs.emplace_back( + field_name, + logline_value_meta( + field_name, + value_kind_t::VALUE_TEXT, + KNOWN_FIELDS.size() + 1 + + std::distance( + begin(KNOWN_STRUCT_FIELDS), + fs_iter), + this) + .with_struct_name(fs_iter->fs_struct_name)); + } else { + auto field_name = intern_string::lookup(sf); + this->wlf_field_defs.emplace_back( + field_name, + logline_value_meta( + field_name, + value_kind_t::VALUE_TEXT, + KNOWN_FIELDS.size() + X_FIELDS_IDX, + this) + .with_struct_name(X_FIELDS_NAME)); + } + } + auto& fd = this->wlf_field_defs.back(); + fd.fd_meta.lvm_format = nonstd::make_optional(this); + switch (fd.fd_meta.lvm_kind) { + case value_kind_t::VALUE_FLOAT: + case value_kind_t::VALUE_INTEGER: + fd.with_numeric_index(numeric_count); + numeric_count += 1; + break; + default: + break; + } + + ++iter; + } while (iter != ss.end()); + + this->wlf_format_name = W3C_LOG_NAME; + this->lf_value_stats.resize(numeric_count); + } + } + + if (!this->wlf_format_name.empty() && !this->wlf_field_defs.empty()) { + return this->scan_int(dst, li, sbr); + } + + this->wlf_format_name.clear(); + this->lf_value_stats.clear(); + + return scan_no_match{}; + } + + void annotate(uint64_t line_number, + string_attrs_t& sa, + logline_value_vector& values, + bool annotate_module) const override + { + auto& sbr = values.lvv_sbr; + ws_separated_string ss(sbr.get_data(), sbr.length()); + + for (auto iter = ss.begin(); iter != ss.end(); ++iter) { + string_fragment sf = *iter; + + if (iter.index() >= this->wlf_field_defs.size()) { + sa.emplace_back(line_range{sf.sf_begin, -1}, + SA_INVALID.value("extra fields detected")); + return; + } + + const field_def& fd = this->wlf_field_defs[iter.index()]; + + if (sf == "-") { + sf.invalidate(); + } + + auto lr = line_range(sf.sf_begin, sf.sf_end); + + if (lr.is_valid()) { + values.lvv_values.emplace_back(fd.fd_meta, sbr, lr); + if (sf.startswith("\"")) { + auto& meta = values.lvv_values.back().lv_meta; + + if (meta.lvm_kind == value_kind_t::VALUE_TEXT) { + meta.lvm_kind = value_kind_t::VALUE_W3C_QUOTED; + } else { + meta.lvm_kind = value_kind_t::VALUE_NULL; + } + } + } else { + values.lvv_values.emplace_back(fd.fd_meta); + } + } + } + + const logline_value_stats* stats_for_value( + const intern_string_t& name) const override + { + const logline_value_stats* retval = nullptr; + + for (const auto& wlf_field_def : this->wlf_field_defs) { + if (wlf_field_def.fd_meta.lvm_name == name) { + if (!wlf_field_def.fd_numeric_index) { + break; + } + retval = &this->lf_value_stats[wlf_field_def.fd_numeric_index + .value()]; + break; + } + } + + return retval; + } + + bool hide_field(const intern_string_t field_name, bool val) override + { + auto fd_iter + = std::find_if(this->wlf_field_defs.begin(), + this->wlf_field_defs.end(), + [field_name](const field_def& elem) { + return elem.fd_meta.lvm_name == field_name; + }); + if (fd_iter == this->wlf_field_defs.end()) { + return false; + } + + fd_iter->fd_meta.lvm_user_hidden = val; + return true; + } + + std::shared_ptr<log_format> specialized(int fmt_lock = -1) override + { + auto retval = std::make_shared<w3c_log_format>(*this); + + retval->lf_specialized = true; + return retval; + } + + class w3c_log_table : public log_format_vtab_impl { + public: + explicit w3c_log_table(const w3c_log_format& format) + : log_format_vtab_impl(format), wlt_format(format) + { + } + + void get_columns(std::vector<vtab_column>& cols) const override + { + for (const auto& fd : KNOWN_FIELDS) { + auto type_pair = log_vtab_impl::logline_value_to_sqlite_type( + fd.fd_meta.lvm_kind); + + cols.emplace_back(fd.fd_meta.lvm_name.to_string(), + type_pair.first, + fd.fd_collator, + false, + "", + type_pair.second); + } + cols.emplace_back("x_fields"); + cols.back().with_comment( + "A JSON-object that contains fields that are not first-class " + "columns"); + for (const auto& fs : KNOWN_STRUCT_FIELDS) { + cols.emplace_back(fs.fs_struct_name.to_string()); + } + }; + + void get_foreign_keys( + std::vector<std::string>& keys_inout) const override + { + this->log_vtab_impl::get_foreign_keys(keys_inout); + + for (const auto& fd : KNOWN_FIELDS) { + if (fd.fd_meta.lvm_identifier) { + keys_inout.push_back(fd.fd_meta.lvm_name.to_string()); + } + } + } + + const w3c_log_format& wlt_format; + }; + + static std::map<intern_string_t, std::shared_ptr<w3c_log_table>>& + get_tables() + { + static std::map<intern_string_t, std::shared_ptr<w3c_log_table>> retval; + + return retval; + } + + std::shared_ptr<log_vtab_impl> get_vtab_impl() const override + { + if (this->wlf_format_name.empty()) { + return nullptr; + } + + std::shared_ptr<w3c_log_table> retval = nullptr; + + auto& tables = get_tables(); + auto iter = tables.find(this->wlf_format_name); + if (iter == tables.end()) { + retval = std::make_shared<w3c_log_table>(*this); + tables[this->wlf_format_name] = retval; + } + + return retval; + } + + void get_subline(const logline& ll, + shared_buffer_ref& sbr, + bool full_message) override + { + } + + date_time_scanner wlf_time_scanner; + intern_string_t wlf_format_name; + std::vector<field_def> wlf_field_defs; +}; + +static int KNOWN_FIELD_INDEX = 0; +const std::vector<w3c_log_format::field_def> w3c_log_format::KNOWN_FIELDS = { + { + KNOWN_FIELD_INDEX++, + "cs-method", + value_kind_t::VALUE_TEXT, + true, + }, + { + KNOWN_FIELD_INDEX++, + "c-ip", + value_kind_t::VALUE_TEXT, + true, + "ipaddress", + }, + { + KNOWN_FIELD_INDEX++, + "cs-bytes", + value_kind_t::VALUE_INTEGER, + false, + }, + { + KNOWN_FIELD_INDEX++, + "cs-host", + value_kind_t::VALUE_TEXT, + true, + }, + { + KNOWN_FIELD_INDEX++, + "cs-uri-stem", + value_kind_t::VALUE_TEXT, + true, + "naturalnocase", + }, + { + KNOWN_FIELD_INDEX++, + "cs-uri-query", + value_kind_t::VALUE_TEXT, + false, + }, + { + KNOWN_FIELD_INDEX++, + "cs-username", + value_kind_t::VALUE_TEXT, + false, + }, + { + KNOWN_FIELD_INDEX++, + "cs-version", + value_kind_t::VALUE_TEXT, + true, + }, + { + KNOWN_FIELD_INDEX++, + "s-ip", + value_kind_t::VALUE_TEXT, + true, + "ipaddress", + }, + { + KNOWN_FIELD_INDEX++, + "s-port", + value_kind_t::VALUE_INTEGER, + true, + }, + { + KNOWN_FIELD_INDEX++, + "s-computername", + value_kind_t::VALUE_TEXT, + true, + }, + { + KNOWN_FIELD_INDEX++, + "s-sitename", + value_kind_t::VALUE_TEXT, + true, + }, + { + KNOWN_FIELD_INDEX++, + "sc-bytes", + value_kind_t::VALUE_INTEGER, + false, + }, + { + KNOWN_FIELD_INDEX++, + "sc-status", + value_kind_t::VALUE_INTEGER, + false, + }, + { + KNOWN_FIELD_INDEX++, + "sc-substatus", + value_kind_t::VALUE_INTEGER, + false, + }, + { + KNOWN_FIELD_INDEX++, + "time-taken", + value_kind_t::VALUE_FLOAT, + false, + }, +}; + +const std::vector<w3c_log_format::field_to_struct_t> + w3c_log_format::KNOWN_STRUCT_FIELDS = { + {"cs(", "cs_headers"}, + {"sc(", "sc_headers"}, + {"rs(", "rs_headers"}, + {"sr(", "sr_headers"}, +}; + +struct logfmt_pair_handler { + explicit logfmt_pair_handler(date_time_scanner& dts) : lph_dt_scanner(dts) + { + } + + bool process_value(const string_fragment& value_frag) + { + if (this->lph_key_frag == "time" || this->lph_key_frag == "ts") { + if (!this->lph_dt_scanner.scan(value_frag.data(), + value_frag.length(), + nullptr, + &this->lph_time_tm, + this->lph_tv)) + { + return false; + } + this->lph_found_time = true; + } else if (this->lph_key_frag == "level") { + this->lph_level + = string2level(value_frag.data(), value_frag.length()); + } + return true; + } + + date_time_scanner& lph_dt_scanner; + bool lph_found_time{false}; + struct exttm lph_time_tm {}; + struct timeval lph_tv { + 0, 0 + }; + log_level_t lph_level{log_level_t::LEVEL_INFO}; + string_fragment lph_key_frag{""}; +}; + +class logfmt_format : public log_format { +public: + const intern_string_t get_name() const override + { + const static auto NAME = intern_string::lookup("logfmt_log"); + + return NAME; + } + + class logfmt_log_table : public log_format_vtab_impl { + public: + logfmt_log_table(const log_format& format) + : log_format_vtab_impl(format) + { + } + + void get_columns(std::vector<vtab_column>& cols) const override + { + static const auto FIELDS = std::string("fields"); + + cols.emplace_back(FIELDS); + } + }; + + std::shared_ptr<log_vtab_impl> get_vtab_impl() const override + { + static auto retval = std::make_shared<logfmt_log_table>(*this); + + return retval; + } + + scan_result_t scan(logfile& lf, + std::vector<logline>& dst, + const line_info& li, + shared_buffer_ref& sbr, + scan_batch_context& sbc) override + { + auto p = logfmt::parser(sbr.to_string_fragment()); + scan_result_t retval = scan_no_match{}; + bool done = false; + logfmt_pair_handler lph(this->lf_date_time); + + while (!done) { + auto parse_result = p.step(); + + done = parse_result.match( + [](const logfmt::parser::end_of_input&) { return true; }, + [&lph](const logfmt::parser::kvpair& kvp) { + lph.lph_key_frag = kvp.first; + + return kvp.second.match( + [](const logfmt::parser::bool_value& bv) { + return false; + }, + [&lph](const logfmt::parser::float_value& fv) { + return lph.process_value(fv.fv_str_value); + }, + [&lph](const logfmt::parser::int_value& iv) { + return lph.process_value(iv.iv_str_value); + }, + [&lph](const logfmt::parser::quoted_value& qv) { + auto_mem<yajl_handle_t> handle(yajl_free); + yajl_callbacks cb; + + memset(&cb, 0, sizeof(cb)); + handle = yajl_alloc(&cb, nullptr, &lph); + cb.yajl_string = +[](void* ctx, + const unsigned char* str, + size_t len) -> int { + auto& lph = *((logfmt_pair_handler*) ctx); + string_fragment value_frag{str, 0, (int) len}; + + return lph.process_value(value_frag); + }; + + if (yajl_parse( + handle, + (const unsigned char*) qv.qv_value.data(), + qv.qv_value.length()) + != yajl_status_ok + || yajl_complete_parse(handle) + != yajl_status_ok) + { + log_debug("json parsing failed"); + string_fragment unq_frag{ + qv.qv_value.sf_string, + qv.qv_value.sf_begin + 1, + qv.qv_value.sf_end - 1, + }; + + return lph.process_value(unq_frag); + } + + return false; + }, + [&lph](const logfmt::parser::unquoted_value& uv) { + return lph.process_value(uv.uv_value); + }); + }, + [](const logfmt::parser::error& err) { + // log_error("logfmt parse error: %s", err.e_msg.c_str()); + return true; + }); + } + + if (lph.lph_found_time) { + dst.emplace_back( + li.li_file_range.fr_offset, lph.lph_tv, lph.lph_level); + retval = scan_match{0}; + } + + return retval; + } + + void annotate(uint64_t line_number, + string_attrs_t& sa, + logline_value_vector& values, + bool annotate_module) const override + { + static const auto FIELDS_NAME = intern_string::lookup("fields"); + + auto& sbr = values.lvv_sbr; + auto p = logfmt::parser(sbr.to_string_fragment()); + bool done = false; + + while (!done) { + auto parse_result = p.step(); + + done = parse_result.match( + [](const logfmt::parser::end_of_input&) { return true; }, + [this, &sa, &values](const logfmt::parser::kvpair& kvp) { + auto value_frag = kvp.second.match( + [this, &kvp, &values]( + const logfmt::parser::bool_value& bv) { + auto lvm = logline_value_meta{intern_string::lookup( + kvp.first), + value_kind_t:: + VALUE_INTEGER, + 0, + (log_format*) this} + .with_struct_name(FIELDS_NAME); + values.lvv_values.emplace_back(lvm, bv.bv_value); + + return bv.bv_str_value; + }, + [this, &kvp, &values]( + const logfmt::parser::int_value& iv) { + auto lvm = logline_value_meta{intern_string::lookup( + kvp.first), + value_kind_t:: + VALUE_INTEGER, + 0, + (log_format*) this} + .with_struct_name(FIELDS_NAME); + values.lvv_values.emplace_back(lvm, iv.iv_value); + + return iv.iv_str_value; + }, + [this, &kvp, &values]( + const logfmt::parser::float_value& fv) { + auto lvm = logline_value_meta{intern_string::lookup( + kvp.first), + value_kind_t:: + VALUE_INTEGER, + 0, + (log_format*) this} + .with_struct_name(FIELDS_NAME); + values.lvv_values.emplace_back(lvm, fv.fv_value); + + return fv.fv_str_value; + }, + [](const logfmt::parser::quoted_value& qv) { + return qv.qv_value; + }, + [](const logfmt::parser::unquoted_value& uv) { + return uv.uv_value; + }); + auto value_lr + = line_range{value_frag.sf_begin, value_frag.sf_end}; + + if (kvp.first == "time" || kvp.first == "ts") { + sa.emplace_back(value_lr, logline::L_TIMESTAMP.value()); + } else if (kvp.first == "level") { + } else if (kvp.first == "msg") { + sa.emplace_back(value_lr, SA_BODY.value()); + } else if (!kvp.second.is<logfmt::parser::int_value>() + && !kvp.second.is<logfmt::parser::bool_value>()) + { + auto lvm + = logline_value_meta{intern_string::lookup( + kvp.first), + value_frag.startswith("\"") + ? value_kind_t::VALUE_JSON + : value_kind_t::VALUE_TEXT, + 0, + (log_format*) this} + .with_struct_name(FIELDS_NAME); + values.lvv_values.emplace_back(lvm, value_frag); + } + + return false; + }, + [line_number, &sbr](const logfmt::parser::error& err) { + log_error("bad line %.*s", sbr.length(), sbr.get_data()); + log_error("%lld:logfmt parse error: %s", + line_number, + err.e_msg.c_str()); + return true; + }); + } + } + + std::shared_ptr<log_format> specialized(int fmt_lock) override + { + auto retval = std::make_shared<logfmt_format>(*this); + + retval->lf_specialized = true; + return retval; + } +}; + +static auto format_binder = injector::bind_multiple<log_format>() + .add<logfmt_format>() + .add<bro_log_format>() + .add<w3c_log_format>() + .add<generic_log_format>(); |