summaryrefslogtreecommitdiffstats
path: root/src/document.sections.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/document.sections.cc')
-rw-r--r--src/document.sections.cc540
1 files changed, 540 insertions, 0 deletions
diff --git a/src/document.sections.cc b/src/document.sections.cc
new file mode 100644
index 0000000..da9c33f
--- /dev/null
+++ b/src/document.sections.cc
@@ -0,0 +1,540 @@
+/**
+ * Copyright (c) 2022, Timothy Stack
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * * Neither the name of Timothy Stack nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <utility>
+#include <vector>
+
+#include "document.sections.hh"
+
+#include "base/enum_util.hh"
+#include "base/itertools.hh"
+#include "base/lnav_log.hh"
+#include "base/opt_util.hh"
+#include "data_scanner.hh"
+
+namespace lnav {
+namespace document {
+
+nonstd::optional<hier_node*>
+hier_node::lookup_child(section_key_t key) const
+{
+ return make_optional_from_nullable(key.match(
+ [this](const std::string& str) -> hier_node* {
+ auto iter = this->hn_named_children.find(str);
+ if (iter != this->hn_named_children.end()) {
+ return iter->second;
+ }
+ return nullptr;
+ },
+ [this](size_t index) -> hier_node* {
+ if (index < this->hn_children.size()) {
+ return this->hn_children[index].get();
+ }
+ return nullptr;
+ }));
+}
+
+nonstd::optional<const hier_node*>
+hier_node::lookup_path(const hier_node* root,
+ const std::vector<section_key_t>& path)
+{
+ auto retval = make_optional_from_nullable(root);
+
+ for (const auto& comp : path) {
+ if (!retval) {
+ break;
+ }
+
+ retval = retval.value()->lookup_child(comp);
+ }
+
+ if (!retval) {
+ return nonstd::nullopt;
+ }
+
+ return retval;
+}
+
+struct metadata_builder {
+ std::vector<section_interval_t> mb_intervals;
+ std::unique_ptr<hier_node> mb_root_node;
+
+ metadata to_metadata() &&
+ {
+ return {
+ std::move(this->mb_intervals),
+ std::move(this->mb_root_node),
+ };
+ }
+};
+
+static void
+discover_metadata_int(const attr_line_t& al, metadata_builder& mb)
+{
+ const auto& orig_attrs = al.get_attrs();
+ auto headers = orig_attrs
+ | lnav::itertools::filter_in([](const string_attr& attr) {
+ if (attr.sa_type != &VC_ROLE) {
+ return false;
+ }
+
+ auto role = attr.sa_value.get<role_t>();
+ switch (role) {
+ case role_t::VCR_H1:
+ case role_t::VCR_H2:
+ case role_t::VCR_H3:
+ case role_t::VCR_H4:
+ case role_t::VCR_H5:
+ case role_t::VCR_H6:
+ return true;
+ default:
+ return false;
+ }
+ })
+ | lnav::itertools::sort_by(&string_attr::sa_range);
+
+ // Remove headers from quoted text
+ for (const auto& orig_attr : orig_attrs) {
+ if (orig_attr.sa_type == &VC_ROLE
+ && orig_attr.sa_value.get<role_t>() == role_t::VCR_QUOTED_TEXT)
+ {
+ remove_string_attr(headers, orig_attr.sa_range);
+ }
+ }
+
+ auto& intervals = mb.mb_intervals;
+
+ struct open_interval_t {
+ open_interval_t(uint32_t level, file_off_t start, section_key_t id)
+ : oi_level(level), oi_start(start), oi_id(std::move(id))
+ {
+ }
+
+ int32_t oi_level;
+ file_off_t oi_start;
+ section_key_t oi_id;
+ std::unique_ptr<hier_node> oi_node{std::make_unique<hier_node>()};
+ };
+ std::vector<open_interval_t> open_intervals;
+ auto root_node = std::make_unique<hier_node>();
+
+ for (const auto& hdr_attr : headers) {
+ auto role = hdr_attr.sa_value.get<role_t>();
+ auto role_num = lnav::enums::to_underlying(role)
+ - lnav::enums::to_underlying(role_t::VCR_H1);
+ std::vector<open_interval_t> new_open_intervals;
+
+ for (auto& oi : open_intervals) {
+ if (oi.oi_level >= role_num) {
+ // close out this section
+ intervals.emplace_back(
+ oi.oi_start, hdr_attr.sa_range.lr_start - 1, oi.oi_id);
+ auto* node_ptr = oi.oi_node.get();
+ auto* parent_node = oi.oi_node->hn_parent;
+ if (parent_node != nullptr) {
+ parent_node->hn_children.emplace_back(
+ std::move(oi.oi_node));
+ parent_node->hn_named_children.insert({
+ oi.oi_id.get<std::string>(),
+ node_ptr,
+ });
+ }
+ } else {
+ new_open_intervals.emplace_back(std::move(oi));
+ }
+ }
+ auto* parent_node = new_open_intervals.empty()
+ ? root_node.get()
+ : new_open_intervals.back().oi_node.get();
+ new_open_intervals.emplace_back(role_num,
+ hdr_attr.sa_range.lr_start,
+ al.get_substring(hdr_attr.sa_range));
+ new_open_intervals.back().oi_node->hn_parent = parent_node;
+ new_open_intervals.back().oi_node->hn_start
+ = hdr_attr.sa_range.lr_start;
+
+ open_intervals = std::move(new_open_intervals);
+ }
+
+ for (auto& oi : open_intervals) {
+ // close out this section
+ intervals.emplace_back(oi.oi_start, al.length(), oi.oi_id);
+ auto* node_ptr = oi.oi_node.get();
+ auto* parent_node = oi.oi_node->hn_parent;
+ if (parent_node == nullptr) {
+ root_node = std::move(oi.oi_node);
+ } else {
+ parent_node->hn_children.emplace_back(std::move(oi.oi_node));
+ parent_node->hn_named_children.insert({
+ oi.oi_id.get<std::string>(),
+ node_ptr,
+ });
+ }
+ }
+
+ for (auto& interval : intervals) {
+ auto start_off_iter = find_string_attr_containing(
+ orig_attrs, &SA_ORIGIN_OFFSET, interval.start);
+ if (start_off_iter != orig_attrs.end()) {
+ interval.start += start_off_iter->sa_value.get<int64_t>();
+ }
+ auto stop_off_iter = find_string_attr_containing(
+ orig_attrs, &SA_ORIGIN_OFFSET, interval.stop - 1);
+ if (stop_off_iter != orig_attrs.end()) {
+ interval.stop += stop_off_iter->sa_value.get<int64_t>();
+ }
+ }
+
+ hier_node::depth_first(root_node.get(), [&orig_attrs](hier_node* node) {
+ auto off_opt
+ = get_string_attr(orig_attrs, &SA_ORIGIN_OFFSET, node->hn_start);
+
+ if (off_opt) {
+ node->hn_start += off_opt.value()->sa_value.get<int64_t>();
+ }
+ });
+
+ if (!root_node->hn_children.empty()
+ || !root_node->hn_named_children.empty())
+ {
+ mb.mb_root_node = std::move(root_node);
+ }
+}
+
+metadata
+discover_metadata(const attr_line_t& al)
+{
+ metadata_builder mb;
+
+ discover_metadata_int(al, mb);
+
+ return std::move(mb).to_metadata();
+}
+
+class structure_walker {
+public:
+ explicit structure_walker(attr_line_t& al, line_range lr)
+ : sw_line(al), sw_range(lr),
+ sw_scanner(string_fragment::from_str_range(
+ al.get_string(), lr.lr_start, lr.lr_end))
+ {
+ this->sw_interval_state.resize(1);
+ this->sw_hier_nodes.push_back(std::make_unique<hier_node>());
+ }
+
+ metadata walk()
+ {
+ metadata_builder mb;
+ size_t garbage_count = 0;
+
+ while (garbage_count < 1000) {
+ auto tokenize_res = this->sw_scanner.tokenize2();
+ if (!tokenize_res) {
+ break;
+ }
+
+ auto dt = tokenize_res->tr_token;
+ element el(tokenize_res->tr_token, tokenize_res->tr_capture);
+
+ switch (dt) {
+ case DT_XML_DECL_TAG:
+ case DT_XML_EMPTY_TAG:
+ this->sw_values.emplace_back(el);
+ break;
+ case DT_XML_OPEN_TAG:
+ this->flush_values();
+ this->sw_interval_state.back().is_start
+ = el.e_capture.c_begin;
+ this->sw_interval_state.back().is_line_number
+ = this->sw_line_number;
+ this->sw_interval_state.back().is_name
+ = tokenize_res->to_string();
+ this->sw_depth += 1;
+ this->sw_interval_state.resize(this->sw_depth + 1);
+ this->sw_hier_nodes.push_back(
+ std::make_unique<hier_node>());
+ break;
+ case DT_XML_CLOSE_TAG: {
+ auto term = this->flush_values();
+ if (this->sw_depth > 0) {
+ this->sw_depth -= 1;
+ this->append_child_node(term);
+ this->sw_interval_state.pop_back();
+ this->sw_hier_stage
+ = std::move(this->sw_hier_nodes.back());
+ this->sw_hier_nodes.pop_back();
+ }
+ this->append_child_node(el.e_capture);
+ this->flush_values();
+ break;
+ }
+ case DT_H1: {
+ this->sw_line.get_attrs().emplace_back(
+ line_range{
+ this->sw_range.lr_start + el.e_capture.c_begin + 1,
+ this->sw_range.lr_start + el.e_capture.c_end - 1,
+ },
+ VC_ROLE.value(role_t::VCR_H1));
+ this->sw_line_number += 2;
+ break;
+ }
+ case DT_LCURLY:
+ case DT_LSQUARE:
+ case DT_LPAREN: {
+ this->flush_values();
+ // this->append_child_node(term);
+ this->sw_depth += 1;
+ this->sw_interval_state.back().is_start
+ = el.e_capture.c_begin;
+ this->sw_interval_state.back().is_line_number
+ = this->sw_line_number;
+ this->sw_interval_state.resize(this->sw_depth + 1);
+ this->sw_hier_nodes.push_back(
+ std::make_unique<hier_node>());
+ break;
+ }
+ case DT_RCURLY:
+ case DT_RSQUARE:
+ case DT_RPAREN: {
+ auto term = this->flush_values();
+ if (this->sw_depth > 0) {
+ this->append_child_node(term);
+ this->sw_depth -= 1;
+ this->sw_interval_state.pop_back();
+ this->sw_hier_stage
+ = std::move(this->sw_hier_nodes.back());
+ this->sw_hier_nodes.pop_back();
+ if (this->sw_interval_state.back().is_start) {
+ data_scanner::capture_t obj_cap = {
+ static_cast<int>(this->sw_interval_state.back()
+ .is_start.value()),
+ el.e_capture.c_end,
+ };
+
+ auto sf
+ = this->sw_scanner.to_string_fragment(obj_cap);
+ if (!sf.find('\n')) {
+ this->sw_hier_stage->hn_named_children.clear();
+ this->sw_hier_stage->hn_children.clear();
+ while (!this->sw_intervals.empty()
+ && this->sw_intervals.back().start
+ > obj_cap.c_begin)
+ {
+ this->sw_intervals.pop_back();
+ }
+ }
+ }
+ }
+ this->sw_values.emplace_back(el);
+ break;
+ }
+ case DT_COMMA:
+ if (this->sw_depth > 0) {
+ auto term = this->flush_values();
+ this->append_child_node(term);
+ }
+ break;
+ case DT_LINE:
+ this->sw_line_number += 1;
+ break;
+ case DT_WHITE:
+ break;
+ default:
+ if (dt == DT_GARBAGE) {
+ garbage_count += 1;
+ }
+ this->sw_values.emplace_back(el);
+ break;
+ }
+ }
+ this->flush_values();
+
+ if (this->sw_hier_stage != nullptr) {
+ this->sw_hier_stage->hn_parent = this->sw_hier_nodes.back().get();
+ this->sw_hier_nodes.back()->hn_children.push_back(
+ std::move(this->sw_hier_stage));
+ }
+ this->sw_hier_stage = std::move(this->sw_hier_nodes.back());
+ this->sw_hier_nodes.pop_back();
+ if (this->sw_hier_stage->hn_children.size() == 1
+ && this->sw_hier_stage->hn_named_children.empty())
+ {
+ this->sw_hier_stage
+ = std::move(this->sw_hier_stage->hn_children.front());
+ this->sw_hier_stage->hn_parent = nullptr;
+ }
+
+ mb.mb_root_node = std::move(this->sw_hier_stage);
+ mb.mb_intervals = std::move(this->sw_intervals);
+
+ discover_metadata_int(this->sw_line, mb);
+
+ return std::move(mb).to_metadata();
+ }
+
+private:
+ struct element {
+ element(data_token_t token, data_scanner::capture_t& cap)
+ : e_token(token), e_capture(cap)
+ {
+ }
+
+ data_token_t e_token;
+ data_scanner::capture_t e_capture;
+ };
+
+ struct interval_state {
+ nonstd::optional<file_off_t> is_start;
+ size_t is_line_number{0};
+ std::string is_name;
+ };
+
+ nonstd::optional<data_scanner::capture_t> flush_values()
+ {
+ nonstd::optional<data_scanner::capture_t> last_key;
+ nonstd::optional<data_scanner::capture_t> retval;
+
+ if (!this->sw_values.empty()) {
+ if (!this->sw_interval_state.back().is_start) {
+ this->sw_interval_state.back().is_start
+ = this->sw_values.front().e_capture.c_begin;
+ this->sw_interval_state.back().is_line_number
+ = this->sw_line_number;
+ }
+ retval = this->sw_values.back().e_capture;
+ }
+ for (const auto& el : this->sw_values) {
+ switch (el.e_token) {
+ case DT_SYMBOL:
+ case DT_CONSTANT:
+ case DT_WORD:
+ case DT_QUOTED_STRING:
+ last_key = el.e_capture;
+ break;
+ case DT_COLON:
+ case DT_EQUALS:
+ if (last_key) {
+ this->sw_interval_state.back().is_name
+ = this->sw_scanner
+ .to_string_fragment(last_key.value())
+ .to_string();
+ if (!this->sw_interval_state.back().is_name.empty()) {
+ this->sw_interval_state.back().is_start
+ = static_cast<ssize_t>(
+ last_key.value().c_begin);
+ this->sw_interval_state.back().is_line_number
+ = this->sw_line_number;
+ }
+ last_key = nonstd::nullopt;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ this->sw_values.clear();
+
+ return retval;
+ }
+
+ void append_child_node(nonstd::optional<data_scanner::capture_t> terminator)
+ {
+ auto& ivstate = this->sw_interval_state.back();
+ if (!ivstate.is_start || !terminator || this->sw_depth == 0) {
+ ivstate.is_start = nonstd::nullopt;
+ ivstate.is_line_number = 0;
+ ivstate.is_name.clear();
+ return;
+ }
+
+ auto new_node = this->sw_hier_stage != nullptr
+ ? std::move(this->sw_hier_stage)
+ : std::make_unique<lnav::document::hier_node>();
+ auto iv_start = ivstate.is_start.value();
+ auto iv_stop = static_cast<ssize_t>(terminator.value().c_end);
+ auto* top_node = this->sw_hier_nodes.back().get();
+ auto new_key = ivstate.is_name.empty()
+ ? lnav::document::section_key_t{top_node->hn_children.size()}
+ : lnav::document::section_key_t{ivstate.is_name};
+ this->sw_intervals.emplace_back(iv_start, iv_stop, new_key);
+ auto* retval = new_node.get();
+ new_node->hn_parent = top_node;
+ new_node->hn_start = this->sw_intervals.back().start;
+ new_node->hn_line_number = ivstate.is_line_number;
+ if (!ivstate.is_name.empty()) {
+ top_node->hn_named_children.insert({
+ ivstate.is_name,
+ retval,
+ });
+ }
+ top_node->hn_children.emplace_back(std::move(new_node));
+ ivstate.is_start = nonstd::nullopt;
+ ivstate.is_line_number = 0;
+ ivstate.is_name.clear();
+ }
+
+ attr_line_t& sw_line;
+ line_range sw_range;
+ data_scanner sw_scanner;
+ int sw_depth{0};
+ size_t sw_line_number{0};
+ std::vector<element> sw_values{};
+ std::vector<interval_state> sw_interval_state;
+ std::vector<lnav::document::section_interval_t> sw_intervals;
+ std::vector<std::unique_ptr<lnav::document::hier_node>> sw_hier_nodes;
+ std::unique_ptr<lnav::document::hier_node> sw_hier_stage;
+};
+
+metadata
+discover_structure(attr_line_t& al, struct line_range lr)
+{
+ return structure_walker(al, lr).walk();
+}
+
+std::vector<breadcrumb::possibility>
+metadata::possibility_provider(const std::vector<section_key_t>& path)
+{
+ std::vector<breadcrumb::possibility> retval;
+ auto curr_node = lnav::document::hier_node::lookup_path(
+ this->m_sections_root.get(), path);
+ if (curr_node) {
+ auto* parent_node = curr_node.value()->hn_parent;
+
+ if (parent_node != nullptr) {
+ for (const auto& sibling : parent_node->hn_named_children) {
+ retval.template emplace_back(sibling.first);
+ }
+ }
+ }
+ return retval;
+}
+
+} // namespace document
+} // namespace lnav