summary refs log tree commit diff stats
path: root/src/parser/yaml_parser_base.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/parser/yaml_parser_base.cpp')
-rw-r--r-- src/parser/yaml_parser_base.cpp 512
1 files changed, 512 insertions, 0 deletions
diff --git a/src/parser/yaml_parser_base.cpp b/src/parser/yaml_parser_base.cpp
new file mode 100644
index 0000000..df4db23
--- /dev/null
+++ b/src/parser/yaml_parser_base.cpp
@@ -0,0 +1,512 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <orcus/yaml_parser_base.hpp>
+#include <orcus/cell_buffer.hpp>
+#include <orcus/parser_global.hpp>
+
+#include <mdds/sorted_string_map.hpp>
+
+#include <limits>
+#include <vector>
+#include <deque>
+#include <sstream>
+#include <algorithm>
+
+namespace orcus { namespace yaml {
+
+/**
+ * One entry of the parser's scope stack: the width recorded for the scope
+ * together with its type, which starts out as unset.
+ */
+struct scope
+{
+    size_t width;
+    detail::scope_t type = detail::scope_t::unset;
+
+    scope(size_t _width) : width(_width) {}
+};
+
+/**
+ * Private state of parser_base, held through mp_impl.  Every member other
+ * than the containers and the buffer starts from the default shown here.
+ */
+struct parser_base::impl
+{
+    // Scratch buffer used when merging buffered lines and when parsing
+    // quoted strings.
+    cell_buffer m_buffer;
+
+    // Stack of scopes; the last element is the current scope.
+    std::vector<scope> m_scopes;
+
+    // Lines queued by push_line_back() until they are popped or merged.
+    std::deque<std::string_view> m_line_buffer;
+
+    // Pointer stored by set_doc_hash() and returned by get_doc_hash().
+    const char* m_document = nullptr;
+
+    // Written by skip_comment(), cleared by reset_on_new_line().
+    size_t m_comment_length = 0;
+
+    bool m_in_literal_block = false;
+    bool m_parsed_to_end_of_line = false;
+
+    detail::parse_token_t m_last_token = detail::parse_token_t::unknown;
+};
+
+// Sentinel values taken from the top of the size_t range.  The first two are
+// returned by parse_indent() in place of an indent width; the third is
+// returned by get_scope() when the scope stack is empty.
+const size_t parser_base::parse_indent_blank_line{std::numeric_limits<size_t>::max()};
+const size_t parser_base::parse_indent_end_of_stream{std::numeric_limits<size_t>::max() - 1};
+const size_t parser_base::scope_empty{std::numeric_limits<size_t>::max() - 2};
+
+/**
+ * Hands the data pointer and size of the content to the orcus::parser_base
+ * base class and allocates the private implementation state.
+ */
+parser_base::parser_base(std::string_view content) :
+    orcus::parser_base(content.data(), content.size()),
+    mp_impl(std::make_unique<impl>())
+{
+}
+
+// Defined here, in the translation unit where impl is a complete type.
+parser_base::~parser_base() = default;
+
+/** Records t as the most recent parse token, replacing the previous one. */
+void parser_base::push_parse_token(detail::parse_token_t t)
+{
+    impl& state = *mp_impl;
+    state.m_last_token = t;
+}
+
+/** Returns the token most recently stored by push_parse_token(). */
+detail::parse_token_t parser_base::get_last_parse_token() const
+{
+    const impl& state = *mp_impl;
+    return state.m_last_token;
+}
+
+/**
+ * Computes the offset of the last character of the line that has just been
+ * consumed by parse_to_end_of_line(), skipping back over any comment length
+ * recorded for the line and over trailing spaces.
+ *
+ * Must only be called after parse_to_end_of_line(); the cursor is assumed to
+ * sit just past the line's '\n'.
+ *
+ * @return offset of the last non-space character, or 0 if the backward scan
+ *         reaches the beginning of the stream.
+ */
+size_t parser_base::offset_last_char_of_line() const
+{
+    assert(mp_impl->m_parsed_to_end_of_line);
+
+    const size_t comment_len = mp_impl->m_comment_length;
+
+    // Step from the position past the '\n' back onto the '\n' itself.
+    size_t pos = offset() - 1;
+
+    if (comment_len > 0)
+    {
+        // Step back over the comment length recorded by skip_comment().
+        assert(comment_len < pos);
+        pos -= comment_len;
+    }
+
+    --pos;
+
+    // Walk backward past trailing spaces, never going below offset 0.
+    while (pos > 0 && mp_begin[pos] == ' ')
+        --pos;
+
+    return pos;
+}
+
+/**
+ * Consumes the leading spaces of the current line and reports its indent.
+ *
+ * @return the number of leading spaces when the line has content;
+ *         parse_indent_blank_line when the line holds only spaces followed
+ *         by a comment or a line feed (both of which are consumed);
+ *         parse_indent_end_of_stream when the stream runs out first.
+ */
+size_t parser_base::parse_indent()
+{
+    size_t n_spaces = 0;
+
+    while (has_char())
+    {
+        const char c = cur_char();
+
+        if (c == '#')
+        {
+            skip_comment();
+            return parse_indent_blank_line;
+        }
+
+        if (c == '\n')
+        {
+            next();
+            return parse_indent_blank_line;
+        }
+
+        if (c != ' ')
+            return n_spaces;
+
+        next();
+        ++n_spaces;
+    }
+
+    return parse_indent_end_of_stream;
+}
+
+/**
+ * Consumes the rest of the current line and returns it as a view.
+ *
+ * The view starts at the cursor position on entry and stops before the first
+ * unquoted '#' or before the line feed, whichever comes first.  Single- and
+ * double-quoted sections are skipped as a unit, so a '#' inside quotes does
+ * not start a comment.  The comment (if any) and the line feed are consumed
+ * but not included in the returned view.
+ *
+ * @return view of the line content.
+ * @throws parse_error when an opening quote has no matching closing quote.
+ */
+std::string_view parser_base::parse_to_end_of_line()
+{
+    const char* const line_head = mp_char;
+    size_t n_chars = 0;
+
+    while (has_char())
+    {
+        const char c = cur_char();
+
+        if (c == '#')
+        {
+            skip_comment();
+            break;
+        }
+
+        if (c == '\n')
+        {
+            next();
+            break;
+        }
+
+        if (c == '\'' || c == '"')
+        {
+            const char* const quote_head = mp_char;
+
+            // Points to the character immediately after the closing quote.
+            const char* const past_close = (c == '\'')
+                ? parse_to_closing_single_quote(mp_char, remaining_size())
+                : parse_to_closing_double_quote(mp_char, remaining_size());
+
+            if (!past_close)
+            {
+                if (c == '\'')
+                    throw parse_error("parse_to_end_of_line: closing single quote was expected but not found.", offset());
+
+                throw parse_error("parse_to_end_of_line: closing double quote was expected but not found.", offset());
+            }
+
+            // Jump onto the closing quote; the common step below then moves
+            // past it.
+            const size_t skipped = past_close - quote_head - 1;
+            next(skipped);
+            n_chars += skipped;
+            assert(cur_char() == c);
+        }
+
+        next();
+        ++n_chars;
+    }
+
+    mp_impl->m_parsed_to_end_of_line = true;
+    return std::string_view(line_head, n_chars);
+}
+
+/**
+ * Consumes a comment starting at the current '#' through the end of the
+ * line, including the line feed if one is present.
+ *
+ * The recorded comment length is one plus the number of characters consumed
+ * before the line feed (or before the end of the stream).
+ */
+void parser_base::skip_comment()
+{
+    assert(cur_char() == '#');
+
+    size_t length = 1;
+
+    while (has_char())
+    {
+        const bool at_line_feed = cur_char() == '\n';
+        next();
+
+        if (at_line_feed)
+            break;
+
+        ++length;
+    }
+
+    mp_impl->m_comment_length = length;
+}
+
+/**
+ * Clears the per-line state: the end-of-line flag and the recorded comment
+ * length.
+ */
+void parser_base::reset_on_new_line()
+{
+    impl& state = *mp_impl;
+    state.m_parsed_to_end_of_line = false;
+    state.m_comment_length = 0;
+}
+
+/**
+ * @return the width of the innermost scope, or scope_empty when the scope
+ *         stack holds no entries.
+ */
+size_t parser_base::get_scope() const
+{
+    const auto& scopes = mp_impl->m_scopes;
+
+    if (scopes.empty())
+        return scope_empty;
+
+    return scopes.back().width;
+}
+
+/**
+ * Pushes a new innermost scope of the given width; the new scope's type
+ * starts as unset.
+ */
+void parser_base::push_scope(size_t scope_width)
+{
+    auto& scopes = mp_impl->m_scopes;
+    scopes.emplace_back(scope_width);
+}
+
+/** Discards every entry on the scope stack. */
+void parser_base::clear_scopes()
+{
+    auto& scopes = mp_impl->m_scopes;
+    scopes.clear();
+}
+
+/**
+ * @return the type of the innermost scope.  The scope stack must not be
+ *         empty.
+ */
+detail::scope_t parser_base::get_scope_type() const
+{
+    const auto& scopes = mp_impl->m_scopes;
+    assert(!scopes.empty());
+
+    const scope& innermost = scopes.back();
+    return innermost.type;
+}
+
+/**
+ * Sets the type of the innermost scope.  The scope stack must not be empty.
+ */
+void parser_base::set_scope_type(detail::scope_t type)
+{
+    auto& scopes = mp_impl->m_scopes;
+    assert(!scopes.empty());
+
+    scope& innermost = scopes.back();
+    innermost.type = type;
+}
+
+/**
+ * Removes the innermost scope.  The scope stack must not be empty.
+ *
+ * @return the width of the scope that is innermost after the removal, or
+ *         scope_empty when no scope is left.
+ */
+size_t parser_base::pop_scope()
+{
+    auto& scopes = mp_impl->m_scopes;
+    assert(!scopes.empty());
+
+    scopes.pop_back();
+
+    if (scopes.empty())
+        return scope_empty;
+
+    return scopes.back().width;
+}
+
+/**
+ * Appends a view of the n characters starting at p to the end of the line
+ * buffer.  Only the view is stored; the characters are not copied.
+ */
+void parser_base::push_line_back(const char* p, size_t n)
+{
+    auto& lines = mp_impl->m_line_buffer;
+    lines.push_back(std::string_view(p, n));
+}
+
+/**
+ * Removes the oldest line from the line buffer and returns it.  The line
+ * buffer must not be empty.
+ */
+std::string_view parser_base::pop_line_front()
+{
+    auto& lines = mp_impl->m_line_buffer;
+    assert(!lines.empty());
+
+    const std::string_view oldest = lines.front();
+    lines.pop_front();
+    return oldest;
+}
+
+/** @return true when at least one line is waiting in the line buffer. */
+bool parser_base::has_line_buffer() const
+{
+    const auto& lines = mp_impl->m_line_buffer;
+    return !lines.empty();
+}
+
+/** @return the number of lines currently held in the line buffer. */
+size_t parser_base::get_line_buffer_count() const
+{
+    const auto& lines = mp_impl->m_line_buffer;
+    return lines.size();
+}
+
+/**
+ * Joins all buffered lines into a single string and empties the line buffer.
+ *
+ * Lines are joined with '\n' when a literal block is active and with a
+ * single space otherwise.  The literal-block flag is cleared afterwards.
+ * The line buffer must not be empty.
+ *
+ * @return view into the internal cell buffer holding the merged text; the
+ *         cell buffer is reset at the start of every call.
+ */
+std::string_view parser_base::merge_line_buffer()
+{
+    impl& state = *mp_impl;
+    assert(!state.m_line_buffer.empty());
+
+    const char sep = state.m_in_literal_block ? '\n' : ' ';
+
+    cell_buffer& merged = state.m_buffer;
+    merged.reset();
+
+    bool is_first = true;
+    for (const std::string_view line : state.m_line_buffer)
+    {
+        // A separator goes between lines, never before the first one.
+        if (!is_first)
+            merged.append(&sep, 1);
+
+        merged.append(line.data(), line.size());
+        is_first = false;
+    }
+
+    state.m_line_buffer.clear();
+    state.m_in_literal_block = false;
+
+    return merged.str();
+}
+
+/**
+ * @return the pointer last stored by set_doc_hash(), or nullptr when none
+ *         has been stored yet.
+ */
+const char* parser_base::get_doc_hash() const
+{
+    const impl& state = *mp_impl;
+    return state.m_document;
+}
+
+/** Stores the given pointer so get_doc_hash() can return it later. */
+void parser_base::set_doc_hash(const char* hash)
+{
+    impl& state = *mp_impl;
+    state.m_document = hash;
+}
+
+namespace {
+
+namespace keyword {
+
+using kw_t = detail::keyword_t;
+using map_type = mdds::sorted_string_map<kw_t, mdds::string_view_map_entry>;
+
+// Spellings recognized as boolean or null keywords.  The keys are listed in
+// ascending byte order; keep that ordering when adding entries.
+constexpr map_type::entry table[] = {
+    { "FALSE", kw_t::boolean_false },
+    { "False", kw_t::boolean_false },
+    { "N",     kw_t::boolean_false },
+    { "NO",    kw_t::boolean_false },
+    { "NULL",  kw_t::null },
+    { "No",    kw_t::boolean_false },
+    { "Null",  kw_t::null },
+    { "OFF",   kw_t::boolean_false },
+    { "ON",    kw_t::boolean_true },
+    { "Off",   kw_t::boolean_false },
+    { "On",    kw_t::boolean_true },
+    { "TRUE",  kw_t::boolean_true },
+    { "True",  kw_t::boolean_true },
+    { "Y",     kw_t::boolean_true },
+    { "YES",   kw_t::boolean_true },
+    { "Yes",   kw_t::boolean_true },
+    { "false", kw_t::boolean_false },
+    { "n",     kw_t::boolean_false },
+    { "no",    kw_t::boolean_false },
+    { "null",  kw_t::null },
+    { "off",   kw_t::boolean_false },
+    { "on",    kw_t::boolean_true },
+    { "true",  kw_t::boolean_true },
+    { "y",     kw_t::boolean_true },
+    { "yes",   kw_t::boolean_true },
+    { "~",     kw_t::null },
+};
+
+/**
+ * @return the keyword lookup map, built from the table on first use.  The
+ *         map is constructed with keyword_t::unknown as its third argument.
+ */
+const map_type& get()
+{
+    static const map_type instance(table, std::size(table), kw_t::unknown);
+    return instance;
+}
+
+} // namespace keyword
+
+/**
+ * Builds a message of the form "<func_name>: failed to parse <reason>" from
+ * the error code stored in ret.length and throws it as a parse_error
+ * carrying the given offset.
+ *
+ * This function always throws, so it is marked [[noreturn]] to let the
+ * compiler and static analyzers know that control never continues past a
+ * call to it.
+ *
+ * @param func_name name of the calling function, used as the message prefix.
+ * @param ret state returned by a quoted-string parser; its length member is
+ *            compared against the known error codes.
+ * @param offset stream offset passed on to the parse_error.
+ * @throws parse_error always.
+ */
+[[noreturn]] void throw_quoted_string_parse_error(
+    const char* func_name, const parse_quoted_string_state& ret, std::ptrdiff_t offset)
+{
+    std::ostringstream os;
+    os << func_name << ": failed to parse ";
+
+    if (ret.length == parse_quoted_string_state::error_illegal_escape_char)
+        os << "due to the presence of illegal escape character.";
+    else if (ret.length == parse_quoted_string_state::error_no_closing_quote)
+        os << "because the closing quote was not found.";
+    else
+        os << "due to unknown reason.";
+
+    throw parse_error(os.str(), offset);
+}
+
+}
+
+/**
+ * Looks up the len characters starting at p in the keyword map.
+ *
+ * @return the result of the map lookup for that text.
+ */
+detail::keyword_t parser_base::parse_keyword(const char* p, size_t len)
+{
+    const std::string_view token(p, len);
+    return keyword::get().find(token);
+}
+
+/**
+ * Splits one line into a key and a value.
+ *
+ * The key ends at the first ':' that is directly followed by a space; the
+ * value is whatever follows, starting at the next non-space character.  A
+ * line that merely ends with ':' yields a key and no value.  The key is
+ * passed through trim() before it is stored.
+ *
+ * Preconditions, checked with assert: a scope exists, len is non-zero, and
+ * the line does not start with a space.
+ *
+ * @param p first character of the line.
+ * @param len number of characters in the line.
+ * @return the key/value pair; both members are left as default-constructed
+ *         when the line holds no key and the current scope is not a map.
+ * @throws parse_error when no key is found while the current scope is a map.
+ */
+parser_base::key_value parser_base::parse_key_value(const char* p, size_t len)
+{
+    assert(get_scope() != scope_empty);
+    assert(*p != ' ');
+    assert(len);
+
+    const char* const line_end = p + len;
+
+    key_value kv;
+
+    // Start of the token being scanned: the key at first, then the value
+    // once the key has been found.
+    const char* token_head = p;
+    bool have_key = false;
+    char prev_char = 0;
+
+    for (; p != line_end; prev_char = *p, ++p)
+    {
+        if (*p != ' ')
+        {
+            // First non-space character after the key starts the value.
+            if (!token_head)
+                token_head = p;
+
+            continue;
+        }
+
+        if (have_key || prev_char != ':')
+            continue;
+
+        // A ':' followed by a space ends the key; the ':' is excluded.
+        const std::size_t key_len = p - token_head - 1;
+        kv.key = trim(std::string_view(token_head, key_len));
+        have_key = true;
+        token_head = nullptr;
+    }
+
+    assert(token_head);
+
+    if (have_key)
+    {
+        // The value is everything from the first non-space character after
+        // the key to the end of the line.
+        kv.value = std::string_view(token_head, p - token_head);
+        return kv;
+    }
+
+    if (prev_char == ':')
+    {
+        // The line holds only a key terminated by ':'.
+        const std::size_t key_len = p - token_head - 1;
+        kv.key = trim(std::string_view(token_head, key_len));
+        return kv;
+    }
+
+    // No key on this line; that is an error only inside a map scope.
+    if (get_scope_type() == detail::scope_t::map)
+        throw parse_error("key was expected, but not found.", offset_last_char_of_line());
+
+    return kv;
+}
+
+/**
+ * Parses a single-quoted string through parse_single_quoted_string(), using
+ * the internal cell buffer as scratch space.
+ *
+ * @param p position to parse from; passed by reference to the parser.
+ * @param max_length maximum number of characters available from p.
+ * @return view of the parsed string.
+ * @throws parse_error when the parser reports a failure (null string
+ *         pointer in the returned state).
+ */
+std::string_view parser_base::parse_single_quoted_string_value(const char*& p, size_t max_length)
+{
+    const parse_quoted_string_state state =
+        parse_single_quoted_string(p, max_length, mp_impl->m_buffer);
+
+    const bool failed = !state.str;
+    if (failed)
+        throw_quoted_string_parse_error("parse_single_quoted_string_value", state, offset());
+
+    return std::string_view(state.str, state.length);
+}
+
+/**
+ * Parses a double-quoted string through parse_double_quoted_string(), using
+ * the internal cell buffer as scratch space.
+ *
+ * @param p position to parse from; passed by reference to the parser.
+ * @param max_length maximum number of characters available from p.
+ * @return view of the parsed string.
+ * @throws parse_error when the parser reports a failure (null string
+ *         pointer in the returned state).
+ */
+std::string_view parser_base::parse_double_quoted_string_value(const char*& p, size_t max_length)
+{
+    const parse_quoted_string_state state =
+        parse_double_quoted_string(p, max_length, mp_impl->m_buffer);
+
+    const bool failed = !state.str;
+    if (failed)
+        throw_quoted_string_parse_error("parse_double_quoted_string_value", state, offset());
+
+    return std::string_view(state.str, state.length);
+}
+
+/**
+ * Advances p past consecutive space characters, looking at no more than len
+ * characters from its starting position.
+ *
+ * @param p in/out position; on return it points at the first non-space
+ *          character or at the end of the range.
+ * @param len number of characters available from p.
+ */
+void parser_base::skip_blanks(const char*& p, size_t len)
+{
+    const char* const stop = p + len;
+
+    while (p != stop && *p == ' ')
+        ++p;
+}
+
+/**
+ * Turns on the literal-block flag; it stays set until merge_line_buffer()
+ * clears it.
+ */
+void parser_base::start_literal_block()
+{
+    impl& state = *mp_impl;
+    state.m_in_literal_block = true;
+}
+
+/** @return true while the literal-block flag is set. */
+bool parser_base::in_literal_block() const
+{
+    const impl& state = *mp_impl;
+    return state.m_in_literal_block;
+}
+
+/**
+ * Buffers one line of a literal block.
+ *
+ * For the first line (nothing buffered yet) a new scope of width indent is
+ * pushed and typed as a multi-line string.  For later lines the cursor is
+ * moved back with prev() by the difference between indent and the current
+ * scope width before the line is read (presumably so that indentation beyond
+ * the block's own indent ends up in the buffered text).
+ *
+ * @param indent indent width of the current line.
+ * @throws parse_error when the first line's indent equals the current scope
+ *         width.
+ */
+void parser_base::handle_line_in_literal(size_t indent)
+{
+    const size_t scope_width = get_scope();
+
+    if (has_line_buffer())
+    {
+        // Already inside the multi-line scope opened by the first line.
+        assert(get_scope_type() == yaml::detail::scope_t::multi_line_string);
+
+        const size_t extra_indent = indent - scope_width;
+        prev(extra_indent);
+    }
+    else
+    {
+        // First line of the block: open a new multi-line string scope.
+        if (indent == scope_width)
+            throw parse_error("parse: first line of a literal block must be indented.", offset());
+
+        push_scope(indent);
+        set_scope_type(yaml::detail::scope_t::multi_line_string);
+    }
+
+    const std::string_view text = parse_to_end_of_line();
+    push_line_back(text.data(), text.size());
+}
+
+/**
+ * Buffers one line of a multi-line string: marks the current scope as a
+ * multi-line string if it is not one already, reads the rest of the line,
+ * trims it, and appends it to the line buffer.  The trimmed line must not be
+ * empty.
+ */
+void parser_base::handle_line_in_multi_line_string()
+{
+    const auto wanted = yaml::detail::scope_t::multi_line_string;
+
+    if (get_scope_type() != wanted)
+        set_scope_type(wanted);
+
+    const std::string_view text = trim(parse_to_end_of_line());
+    assert(!text.empty());
+    push_line_back(text.data(), text.size());
+}
+
+}}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */