diff options
Diffstat (limited to 'include/orcus/yaml_parser.hpp')
-rw-r--r-- | include/orcus/yaml_parser.hpp | 691 |
1 files changed, 691 insertions, 0 deletions
diff --git a/include/orcus/yaml_parser.hpp b/include/orcus/yaml_parser.hpp new file mode 100644 index 0000000..836a902 --- /dev/null +++ b/include/orcus/yaml_parser.hpp @@ -0,0 +1,691 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_YAML_PARSER_HPP +#define INCLUDED_ORCUS_YAML_PARSER_HPP + +#include "orcus/yaml_parser_base.hpp" +#include "orcus/parser_global.hpp" + +namespace orcus { + +/** + * Blank handler class for yaml_parser. One can sub-class this and overwrite + * callback functions one needs to handle. + */ +class yaml_handler +{ +public: + /** + * Called when the parser starts parsing a content. + */ + void begin_parse() {} + + /** + * Called when the parser finishes parsing an entire content. + */ + void end_parse() {} + + /** + * Called when a new document is encountered. + */ + void begin_document() {} + + /** + * Called when the parser has finished parsing a document. + */ + void end_document() {} + + /** + * Called when a sequence begins. + */ + void begin_sequence() {} + + /** + * Called when a sequence ends. + */ + void end_sequence() {} + + /** + * Called when a map begins. + */ + void begin_map() {} + + /** + * Called when the parser starts parsing a map key. + */ + void begin_map_key() {} + + /** + * Called when the parser finishes parsing a map key. + */ + void end_map_key() {} + + /** + * Called when the parser finishes parsing an entire map. + */ + void end_map() {} + + /** + * Called when a string value is encountered. + * + * @param value string value. + */ + void string(std::string_view value) + { + (void)value; + } + + /** + * Called when a numeric value is encountered. + * + * @param val numeric value. + */ + void number(double val) + { + (void)val; + } + + /** + * Called when a boolean 'true' keyword is encountered. + */ + void boolean_true() {} + + /** + * Called when a boolean 'false' keyword is encountered. + */ + void boolean_false() {} + + /** + * Called when a 'null' keyword is encountered. + */ + void null() {} +}; + +/** + * Parser for YAML documents. + * + * @tparam HandlerT Hanlder type with member functions for event callbacks. + * Refer to yaml_handler. + * + * @warning This parser is still highly experimental. Use with caution. + */ +template<typename HandlerT> +class yaml_parser : public yaml::parser_base +{ +public: + typedef HandlerT handler_type; + + yaml_parser(std::string_view content, handler_type& hdl); + + void parse(); + +private: + size_t end_scope(); + void check_or_begin_document(); + void check_or_begin_map(); + void check_or_begin_sequence(); + void parse_value(const char* p, size_t len); + void push_value(const char* p, size_t len); + void parse_line(const char* p, size_t len); + void parse_map_key(const char* p, size_t len); + + void handler_begin_parse(); + void handler_end_parse(); + void handler_begin_document(); + void handler_end_document(); + void handler_begin_sequence(); + void handler_end_sequence(); + void handler_begin_map(); + void handler_end_map(); + void handler_begin_map_key(); + void handler_end_map_key(); + void handler_string(const char* p, size_t n); + void handler_number(double val); + void handler_boolean_true(); + void handler_boolean_false(); + void handler_null(); + +private: + handler_type& m_handler; +}; + +template<typename _Handler> +void yaml_parser<_Handler>::handler_begin_parse() +{ + push_parse_token(yaml::detail::parse_token_t::begin_parse); + m_handler.begin_parse(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_end_parse() +{ + push_parse_token(yaml::detail::parse_token_t::end_parse); + m_handler.end_parse(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_begin_document() +{ + push_parse_token(yaml::detail::parse_token_t::begin_document); + m_handler.begin_document(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_end_document() +{ + push_parse_token(yaml::detail::parse_token_t::end_document); + m_handler.end_document(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_begin_sequence() +{ + push_parse_token(yaml::detail::parse_token_t::begin_sequence); + m_handler.begin_sequence(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_end_sequence() +{ + push_parse_token(yaml::detail::parse_token_t::end_sequence); + m_handler.end_sequence(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_begin_map() +{ + push_parse_token(yaml::detail::parse_token_t::begin_map); + m_handler.begin_map(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_end_map() +{ + push_parse_token(yaml::detail::parse_token_t::end_map); + m_handler.end_map(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_begin_map_key() +{ + push_parse_token(yaml::detail::parse_token_t::begin_map_key); + m_handler.begin_map_key(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_end_map_key() +{ + push_parse_token(yaml::detail::parse_token_t::end_map_key); + m_handler.end_map_key(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_string(const char* p, size_t n) +{ + push_parse_token(yaml::detail::parse_token_t::string); + m_handler.string({p, n}); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_number(double val) +{ + push_parse_token(yaml::detail::parse_token_t::number); + m_handler.number(val); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_boolean_true() +{ + push_parse_token(yaml::detail::parse_token_t::boolean_true); + m_handler.boolean_true(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_boolean_false() +{ + push_parse_token(yaml::detail::parse_token_t::boolean_false); + m_handler.boolean_false(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_null() +{ + push_parse_token(yaml::detail::parse_token_t::null); + m_handler.null(); +} + +template<typename _Handler> +yaml_parser<_Handler>::yaml_parser(std::string_view content, handler_type& hdl) : + yaml::parser_base(content), m_handler(hdl) {} + +template<typename _Handler> +void yaml_parser<_Handler>::parse() +{ + handler_begin_parse(); + + while (has_char()) + { + reset_on_new_line(); + + size_t indent = parse_indent(); + if (indent == parse_indent_end_of_stream) + break; + + if (indent == parse_indent_blank_line) + continue; + + size_t cur_scope = get_scope(); + + if (cur_scope <= indent) + { + if (in_literal_block()) + { + handle_line_in_literal(indent); + continue; + } + + if (has_line_buffer()) + { + // This line is part of multi-line string. Push the line to the + // buffer as-is. + handle_line_in_multi_line_string(); + continue; + } + } + + if (cur_scope == scope_empty) + { + if (indent > 0) + throw parse_error( + "first node of the document should not be indented.", offset()); + + push_scope(indent); + } + else if (indent > cur_scope) + { + push_scope(indent); + } + else if (indent < cur_scope) + { + // Current indent is less than the current scope level. + do + { + cur_scope = end_scope(); + if (cur_scope < indent) + throw parse_error("parse: invalid indent level.", offset()); + } + while (indent < cur_scope); + } + + // Parse the rest of the line. + std::string_view line = parse_to_end_of_line(); + line = trim(line); + + assert(!line.empty()); + parse_line(line.data(), line.size()); + } + + // End all remaining scopes. + size_t cur_scope = get_scope(); + while (cur_scope != scope_empty) + cur_scope = end_scope(); + + if (get_doc_hash()) + handler_end_document(); + + handler_end_parse(); +} + +template<typename _Handler> +size_t yaml_parser<_Handler>::end_scope() +{ + switch (get_scope_type()) + { + case yaml::detail::scope_t::map: + { + if (get_last_parse_token() == yaml::detail::parse_token_t::end_map_key) + handler_null(); + + handler_end_map(); + break; + } + case yaml::detail::scope_t::sequence: + { + if (get_last_parse_token() == yaml::detail::parse_token_t::begin_sequence_element) + handler_null(); + + handler_end_sequence(); + break; + } + case yaml::detail::scope_t::multi_line_string: + { + std::string_view merged = merge_line_buffer(); + handler_string(merged.data(), merged.size()); + break; + } + default: + { + if (has_line_buffer()) + { + assert(get_line_buffer_count() == 1); + std::string_view line = pop_line_front(); + parse_value(line.data(), line.size()); + } + } + } + return pop_scope(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::check_or_begin_document() +{ + if (!get_doc_hash()) + { + set_doc_hash(mp_char); + handler_begin_document(); + } +} + +template<typename _Handler> +void yaml_parser<_Handler>::check_or_begin_map() +{ + switch (get_scope_type()) + { + case yaml::detail::scope_t::unset: + { + check_or_begin_document(); + set_scope_type(yaml::detail::scope_t::map); + handler_begin_map(); + break; + } + case yaml::detail::scope_t::map: + { + if (get_last_parse_token() == yaml::detail::parse_token_t::end_map_key) + handler_null(); + break; + } + default: + ; + } +} + +template<typename _Handler> +void yaml_parser<_Handler>::check_or_begin_sequence() +{ + switch (get_scope_type()) + { + case yaml::detail::scope_t::unset: + { + check_or_begin_document(); + set_scope_type(yaml::detail::scope_t::sequence); + handler_begin_sequence(); + break; + } + case yaml::detail::scope_t::sequence: + { + if (get_last_parse_token() == yaml::detail::parse_token_t::begin_sequence_element) + handler_null(); + break; + } + default: + ; + } + + push_parse_token(yaml::detail::parse_token_t::begin_sequence_element); +} + +template<typename _Handler> +void yaml_parser<_Handler>::parse_value(const char* p, size_t len) +{ + check_or_begin_document(); + + const char* p0 = p; + const char* p_end = p + len; + double val; + p = parse_numeric(p, p_end, val); + if (p == p_end) + { + handler_number(val); + return; + } + + yaml::detail::keyword_t kw = parse_keyword(p0, len); + + if (kw != yaml::detail::keyword_t::unknown) + { + switch (kw) + { + case yaml::detail::keyword_t::null: + handler_null(); + break; + case yaml::detail::keyword_t::boolean_true: + handler_boolean_true(); + break; + case yaml::detail::keyword_t::boolean_false: + handler_boolean_false(); + break; + default: + ; + } + + return; + } + + // Failed to parse it as a number or a keyword. It must be a string. + handler_string(p0, len); +} + +template<typename _Handler> +void yaml_parser<_Handler>::push_value(const char* p, size_t len) +{ + check_or_begin_document(); + + if (has_line_buffer() && get_scope_type() == yaml::detail::scope_t::unset) + set_scope_type(yaml::detail::scope_t::multi_line_string); + + push_line_back(p, len); +} + +template<typename _Handler> +void yaml_parser<_Handler>::parse_line(const char* p, size_t len) +{ + const char* p_end = p + len; + const char* p0 = p; // Save the original head position. + + if (*p == '-') + { + ++p; + if (p == p_end) + { + // List item start. + check_or_begin_sequence(); + return; + } + + switch (*p) + { + case '-': + { + // start of a document + ++p; + if (p == p_end) + throw parse_error("parse_line: line ended with '--'.", offset_last_char_of_line()); + + if (*p != '-') + parse_error::throw_with( + "parse_line: '-' expected but '", *p, "' found.", + offset_last_char_of_line() - std::ptrdiff_t(p_end-p)); + + ++p; // Skip the '-'. + set_doc_hash(p); + handler_begin_document(); + clear_scopes(); + + if (p != p_end) + { + skip_blanks(p, p_end-p); + + // Whatever comes after '---' is equivalent of first node. + assert(p != p_end); + push_scope(0); + parse_line(p, p_end-p); + } + return; + } + case ' ': + { + check_or_begin_sequence(); + + // list item start with inline first item content. + ++p; + if (p == p_end) + throw parse_error( + "parse_line: list item expected, but the line ended prematurely.", + offset_last_char_of_line() - std::ptrdiff_t(p_end-p)); + + skip_blanks(p, p_end-p); + + size_t scope_width = get_scope() + (p-p0); + push_scope(scope_width); + parse_line(p, p_end-p); + return; + } + default: + // It is none of the above. + p = p0; + } + + } + + if (get_scope_type() == yaml::detail::scope_t::sequence) + parse_error::throw_with( + "'-' was expected for a sequence element, but '", *p, "' was found.", + offset_last_char_of_line()-len+1); + + // If the line doesn't start with a "- ", it must be a dictionary key. + parse_map_key(p, len); +} + +template<typename _Handler> +void yaml_parser<_Handler>::parse_map_key(const char* p, size_t len) +{ + const char* p_end = p + len; + const char* p0 = p; // Save the original head position. + + switch (*p) + { + case '"': + { + std::string_view quoted_str = parse_double_quoted_string_value(p, len); + + if (p == p_end) + { + handler_string(quoted_str.data(), quoted_str.size()); + return; + } + + skip_blanks(p, p_end-p); + + if (*p != ':') + throw parse_error( + "parse_map_key: ':' is expected after the quoted string key.", + offset() - std::ptrdiff_t(p_end-p+1)); + + check_or_begin_map(); + handler_begin_map_key(); + handler_string(quoted_str.data(), quoted_str.size()); + handler_end_map_key(); + + ++p; // skip the ':'. + if (p == p_end) + return; + + // Skip all white spaces. + skip_blanks(p, p_end-p); + } + break; + case '\'': + { + std::string_view quoted_str = parse_single_quoted_string_value(p, len); + + if (p == p_end) + { + handler_string(quoted_str.data(), quoted_str.size()); + return; + } + + skip_blanks(p, p_end-p); + + if (*p != ':') + throw parse_error( + "parse_map_key: ':' is expected after the quoted string key.", + offset() - std::ptrdiff_t(p_end-p+1)); + + check_or_begin_map(); + handler_begin_map_key(); + handler_string(quoted_str.data(), quoted_str.size()); + handler_end_map_key(); + + ++p; // skip the ':'. + if (p == p_end) + return; + + skip_blanks(p, p_end-p); + } + break; + default: + { + key_value kv = parse_key_value(p, p_end-p); + + if (kv.key.empty()) + { + // No map key found. + if (*p == '|') + { + start_literal_block(); + return; + } + + push_value(p, len); + return; + } + + check_or_begin_map(); + handler_begin_map_key(); + parse_value(kv.key.data(), kv.key.size()); + handler_end_map_key(); + + if (kv.value.empty()) + return; + + p = kv.value.data(); + } + } + + if (*p == '|') + { + start_literal_block(); + return; + } + + // inline map item. + if (*p == '-') + throw parse_error( + "parse_map_key: sequence entry is not allowed as an inline map item.", + offset() - std::ptrdiff_t(p_end-p+1)); + + size_t scope_width = get_scope() + (p-p0); + push_scope(scope_width); + parse_line(p, p_end-p); +} + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |