summaryrefslogtreecommitdiffstats
path: root/src/data_parser.cc
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/data_parser.cc1071
1 files changed, 1071 insertions, 0 deletions
diff --git a/src/data_parser.cc b/src/data_parser.cc
new file mode 100644
index 0000000..a751b30
--- /dev/null
+++ b/src/data_parser.cc
@@ -0,0 +1,1071 @@
+/**
+ * Copyright (c) 2007-2012, Timothy Stack
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * * Neither the name of Timothy Stack nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <algorithm>
+
+#include "data_parser.hh"
+
+#include "config.h"
+#include "spookyhash/SpookyV2.h"
+
+data_format data_parser::FORMAT_SEMI("semi", DT_COMMA, DT_SEMI);
+data_format data_parser::FORMAT_COMMA("comma", DT_INVALID, DT_COMMA);
+data_format data_parser::FORMAT_PLAIN("plain", DT_INVALID, DT_INVALID);
+
+data_parser::data_parser(data_scanner* ds)
+ : dp_errors("dp_errors", __FILE__, __LINE__),
+ dp_pairs("dp_pairs", __FILE__, __LINE__), dp_msg_format(nullptr),
+ dp_msg_format_begin(ds->get_init_offset()), dp_scanner(ds)
+{
+ if (TRACE_FILE != nullptr) {
+ fprintf(TRACE_FILE, "input %s\n", ds->get_input().to_string().c_str());
+ }
+}
+
+void
+data_parser::pairup(data_parser::schema_id_t* schema,
+ data_parser::element_list_t& pairs_out,
+ data_parser::element_list_t& in_list,
+ int group_depth)
+{
+ element_list_t ELEMENT_LIST_T(el_stack), ELEMENT_LIST_T(free_row),
+ ELEMENT_LIST_T(key_comps), ELEMENT_LIST_T(value),
+ ELEMENT_LIST_T(prefix);
+ SpookyHash context;
+
+ require(in_list.el_format.df_name != nullptr);
+
+ POINT_TRACE("pairup_start");
+
+ FORMAT_TRACE(in_list);
+
+ for (auto iter = in_list.begin(); iter != in_list.end(); ++iter) {
+ if (iter->e_token == DNT_GROUP) {
+ element_list_t ELEMENT_LIST_T(group_pairs);
+
+ this->pairup(
+ nullptr, group_pairs, *iter->e_sub_elements, group_depth + 1);
+ if (!group_pairs.empty()) {
+ iter->assign_elements(group_pairs);
+ }
+ }
+
+ if (in_list.el_format.df_prefix_terminator != DT_INVALID) {
+ if (iter->e_token == in_list.el_format.df_prefix_terminator) {
+ in_list.el_format.df_prefix_terminator = DT_INVALID;
+ } else {
+ el_stack.PUSH_BACK(*iter);
+ }
+ } else if (iter->e_token == in_list.el_format.df_terminator) {
+ this->end_of_value(
+ el_stack, key_comps, value, in_list, group_depth);
+
+ key_comps.PUSH_BACK(*iter);
+ } else if (iter->e_token == in_list.el_format.df_qualifier) {
+ value.SPLICE(
+ value.end(), key_comps, key_comps.begin(), key_comps.end());
+ strip(value, element_is_space{});
+ if (!value.empty()) {
+ el_stack.PUSH_BACK(element(value, DNT_VALUE));
+ }
+ } else if (iter->e_token == in_list.el_format.df_separator) {
+ auto key_iter = key_comps.end();
+ bool found = false, key_is_values = true;
+
+ if (!key_comps.empty()) {
+ do {
+ --key_iter;
+ if (key_iter->e_token == in_list.el_format.df_appender) {
+ ++key_iter;
+ value.SPLICE(value.end(),
+ key_comps,
+ key_comps.begin(),
+ key_iter);
+ key_comps.POP_FRONT();
+ found = true;
+ } else if (key_iter->e_token
+ == in_list.el_format.df_terminator)
+ {
+ std::vector<element> key_copy;
+
+ value.SPLICE(value.end(),
+ key_comps,
+ key_comps.begin(),
+ key_iter);
+ key_comps.POP_FRONT();
+ strip(key_comps, element_is_space{});
+ if (key_comps.empty()) {
+ key_iter = key_comps.end();
+ } else {
+ key_iter = key_comps.begin();
+ }
+ found = true;
+ }
+ if (key_iter != key_comps.end()) {
+ switch (key_iter->e_token) {
+ case DT_WORD:
+ case DT_SYMBOL:
+ key_is_values = false;
+ break;
+ default:
+ break;
+ }
+ }
+ } while (key_iter != key_comps.begin() && !found);
+ }
+ if (!found && !el_stack.empty() && !key_comps.empty()) {
+ element_list_t::iterator value_iter;
+
+ if (el_stack.size() > 1
+ && in_list.el_format.df_appender != DT_INVALID
+ && in_list.el_format.df_terminator != DT_INVALID)
+ {
+ /* If we're expecting a terminator and haven't found it */
+ /* then this is part of the value. */
+ continue;
+ }
+
+ value.SPLICE(
+ value.end(), key_comps, key_comps.begin(), key_comps.end());
+ value_iter = value.end();
+ std::advance(value_iter, -1);
+ key_comps.SPLICE(
+ key_comps.begin(), value, value_iter, value.end());
+ key_comps.resize(1);
+ }
+
+ strip(value, element_is_space{});
+ value.remove_if(element_if(DT_COMMA));
+ if (!value.empty()) {
+ el_stack.PUSH_BACK(element(value, DNT_VALUE));
+ }
+ strip(key_comps, element_is_space{});
+ if (!key_comps.empty()) {
+ if (key_is_values) {
+ el_stack.PUSH_BACK(element(key_comps, DNT_VALUE));
+ } else {
+ el_stack.PUSH_BACK(element(key_comps, DNT_KEY, false));
+ }
+ }
+ key_comps.CLEAR();
+ value.CLEAR();
+ } else {
+ key_comps.PUSH_BACK(*iter);
+ }
+
+ POINT_TRACE("pairup_loop");
+ }
+
+ POINT_TRACE("pairup_eol");
+
+ CONSUMED_TRACE(in_list);
+
+ // Only perform the free-row logic at the top level, if we're in a group
+ // assume it is a list.
+ if (group_depth < 1 && el_stack.empty()) {
+ free_row.SPLICE(
+ free_row.begin(), key_comps, key_comps.begin(), key_comps.end());
+ } else {
+ this->end_of_value(el_stack, key_comps, value, in_list, group_depth);
+ }
+
+ POINT_TRACE("pairup_stack");
+
+ context.Init(0, 0);
+ while (!el_stack.empty()) {
+ auto kv_iter = el_stack.begin();
+ if (kv_iter->e_token == DNT_VALUE) {
+ if (pairs_out.empty()) {
+ free_row.PUSH_BACK(el_stack.front());
+ } else {
+ element_list_t ELEMENT_LIST_T(free_pair_subs);
+ struct element blank;
+
+ blank.e_capture.c_begin = blank.e_capture.c_end
+ = el_stack.front().e_capture.c_begin;
+ blank.e_token = DNT_KEY;
+ free_pair_subs.PUSH_BACK(blank);
+ free_pair_subs.PUSH_BACK(el_stack.front());
+ pairs_out.PUSH_BACK(element(free_pair_subs, DNT_PAIR));
+ }
+ }
+ if (kv_iter->e_token != DNT_KEY) {
+ el_stack.POP_FRONT();
+ continue;
+ }
+
+ ++kv_iter;
+ if (kv_iter == el_stack.end()) {
+ el_stack.POP_FRONT();
+ continue;
+ }
+
+ element_list_t ELEMENT_LIST_T(pair_subs);
+
+ if (schema != nullptr) {
+ size_t key_len;
+ const char* key_val
+ = this->get_element_string(el_stack.front(), key_len);
+ context.Update(key_val, key_len);
+ }
+
+ while (!free_row.empty()) {
+ element_list_t ELEMENT_LIST_T(free_pair_subs);
+ struct element blank;
+
+ blank.e_capture.c_begin = blank.e_capture.c_end
+ = free_row.front().e_capture.c_begin;
+ blank.e_token = DNT_KEY;
+ free_pair_subs.PUSH_BACK(blank);
+ free_pair_subs.PUSH_BACK(free_row.front());
+ pairs_out.PUSH_BACK(element(free_pair_subs, DNT_PAIR));
+ free_row.POP_FRONT();
+ }
+
+ bool has_value = false;
+
+ if (kv_iter->e_token == DNT_VALUE) {
+ ++kv_iter;
+ has_value = true;
+ }
+
+ pair_subs.SPLICE(
+ pair_subs.begin(), el_stack, el_stack.begin(), kv_iter);
+
+ if (!has_value) {
+ element_list_t ELEMENT_LIST_T(blank_value);
+ struct element blank;
+
+ blank.e_token = DT_QUOTED_STRING;
+ blank.e_capture.c_begin = blank.e_capture.c_end
+ = pair_subs.front().e_capture.c_end;
+ if (blank.e_capture.c_begin >= 0
+ && blank.e_capture.c_begin
+ < this->dp_scanner->get_input().sf_end)
+ {
+ switch (this->dp_scanner->to_string_fragment(blank.e_capture)
+ .front())
+ {
+ case '=':
+ case ':':
+ blank.e_capture.c_begin += 1;
+ blank.e_capture.c_end += 1;
+ break;
+ }
+ }
+ blank_value.PUSH_BACK(blank);
+ pair_subs.PUSH_BACK(element(blank_value, DNT_VALUE));
+ }
+
+ pairs_out.PUSH_BACK(element(pair_subs, DNT_PAIR));
+ }
+
+ if (pairs_out.size() == 1) {
+ element& pair = pairs_out.front();
+ element& evalue = pair.e_sub_elements->back();
+
+ if (evalue.e_token == DNT_VALUE && evalue.e_sub_elements != nullptr
+ && evalue.e_sub_elements->size() > 1)
+ {
+ element_list_t::iterator next_sub;
+
+ next_sub = pair.e_sub_elements->begin();
+ ++next_sub;
+ prefix.SPLICE(prefix.begin(),
+ *pair.e_sub_elements,
+ pair.e_sub_elements->begin(),
+ next_sub);
+ free_row.CLEAR();
+ free_row.SPLICE(free_row.begin(),
+ *evalue.e_sub_elements,
+ evalue.e_sub_elements->begin(),
+ evalue.e_sub_elements->end());
+ pairs_out.CLEAR();
+ context.Init(0, 0);
+ }
+ }
+
+ if (group_depth >= 1 && pairs_out.empty() && !free_row.empty()) {
+ pairs_out.SWAP(free_row);
+ }
+
+ if (pairs_out.empty() && !free_row.empty()) {
+ while (!free_row.empty()) {
+ switch (free_row.front().e_token) {
+ case DNT_GROUP:
+ case DNT_VALUE:
+ case DT_EMAIL:
+ case DT_CONSTANT:
+ case DT_NUMBER:
+ case DT_SYMBOL:
+ case DT_HEX_NUMBER:
+ case DT_OCTAL_NUMBER:
+ case DT_VERSION_NUMBER:
+ case DT_QUOTED_STRING:
+ case DT_IPV4_ADDRESS:
+ case DT_IPV6_ADDRESS:
+ case DT_MAC_ADDRESS:
+ case DT_HEX_DUMP:
+ case DT_XML_DECL_TAG:
+ case DT_XML_OPEN_TAG:
+ case DT_XML_CLOSE_TAG:
+ case DT_XML_EMPTY_TAG:
+ case DT_UUID:
+ case DT_URL:
+ case DT_PATH:
+ case DT_DATE:
+ case DT_TIME:
+ case DT_PERCENTAGE: {
+ element_list_t ELEMENT_LIST_T(pair_subs);
+ struct element blank;
+
+ blank.e_capture.c_begin = blank.e_capture.c_end
+ = free_row.front().e_capture.c_begin;
+ blank.e_token = DNT_KEY;
+ pair_subs.PUSH_BACK(blank);
+ pair_subs.PUSH_BACK(free_row.front());
+ pairs_out.PUSH_BACK(element(pair_subs, DNT_PAIR));
+
+ // Throw something into the hash so that the number of
+ // columns is significant. I don't think we want to
+ // use the token ID since some columns values might vary
+ // between rows.
+ context.Update(" ", 1);
+ } break;
+
+ default: {
+ size_t key_len;
+ const char* key_val
+ = this->get_element_string(free_row.front(), key_len);
+
+ context.Update(key_val, key_len);
+ } break;
+ }
+
+ free_row.POP_FRONT();
+ }
+ }
+
+ if (!prefix.empty()) {
+ element_list_t ELEMENT_LIST_T(pair_subs);
+ struct element blank;
+
+ blank.e_capture.c_begin = blank.e_capture.c_end
+ = prefix.front().e_capture.c_begin;
+ blank.e_token = DNT_KEY;
+ pair_subs.PUSH_BACK(blank);
+ pair_subs.PUSH_BACK(prefix.front());
+ pairs_out.PUSH_FRONT(element(pair_subs, DNT_PAIR));
+ }
+
+ if (schema != nullptr) {
+ context.Final(schema->out(0), schema->out(1));
+ }
+
+ if (schema != nullptr && this->dp_msg_format != nullptr) {
+ for (auto& fiter : pairs_out) {
+ *(this->dp_msg_format) += this->get_string_up_to_value(fiter);
+ this->dp_msg_format->append("#");
+ }
+ if ((size_t) this->dp_msg_format_begin
+ < this->dp_scanner->get_input().length())
+ {
+ auto last = this->dp_scanner->get_input().substr(
+ this->dp_msg_format_begin);
+
+ switch (last.front()) {
+ case '\'':
+ case '"':
+ last.sf_begin += 1;
+ break;
+ }
+ *(this->dp_msg_format) += last.to_string();
+ }
+ }
+
+ if (pairs_out.size() > 1000) {
+ pairs_out.resize(1000);
+ }
+}
+
+void
+data_parser::discover_format()
+{
+ std::stack<discover_format_state> state_stack;
+ this->dp_group_token.push_back(DT_INVALID);
+ this->dp_group_stack.resize(1);
+
+ state_stack.push(discover_format_state());
+ while (true) {
+ auto tok_res = this->dp_scanner->tokenize2();
+ if (!tok_res) {
+ break;
+ }
+
+ element elem;
+ elem.e_token = tok_res->tr_token;
+ elem.e_capture = tok_res->tr_inner_capture;
+
+ require(elem.e_capture.c_begin >= 0);
+ require(elem.e_capture.c_end >= 0);
+
+ state_stack.top().update_for_element(elem);
+ switch (elem.e_token) {
+ case DT_LPAREN:
+ case DT_LANGLE:
+ case DT_LCURLY:
+ case DT_LSQUARE:
+ this->dp_group_token.push_back(elem.e_token);
+ this->dp_group_stack.emplace_back("_anon_", __FILE__, __LINE__);
+ state_stack.push(discover_format_state());
+ break;
+
+ case DT_EMPTY_CONTAINER: {
+ auto& curr_group = this->dp_group_stack.back();
+ auto empty_list = element_list_t("_anon_", __FILE__, __LINE__);
+ discover_format_state dfs;
+
+ dfs.finalize();
+
+ empty_list.el_format = dfs.dfs_format;
+ curr_group.PUSH_BACK(element());
+
+ auto& empty = curr_group.back();
+ empty.e_capture.c_begin = elem.e_capture.c_begin + 1;
+ empty.e_capture.c_end = elem.e_capture.c_begin + 1;
+ empty.e_token = DNT_GROUP;
+ empty.assign_elements(empty_list);
+ break;
+ }
+
+ case DT_RPAREN:
+ case DT_RANGLE:
+ case DT_RCURLY:
+ case DT_RSQUARE:
+ if (this->dp_group_token.back() == (elem.e_token - 1)) {
+ this->dp_group_token.pop_back();
+
+ auto riter = this->dp_group_stack.rbegin();
+ ++riter;
+ state_stack.top().finalize();
+ this->dp_group_stack.back().el_format
+ = state_stack.top().dfs_format;
+ state_stack.pop();
+ if (!this->dp_group_stack.back().empty()) {
+ (*riter).PUSH_BACK(
+ element(this->dp_group_stack.back(), DNT_GROUP));
+ } else {
+ (*riter).PUSH_BACK(element());
+ riter->back().e_capture.c_begin
+ = elem.e_capture.c_begin;
+ riter->back().e_capture.c_end = elem.e_capture.c_begin;
+ riter->back().e_token = DNT_GROUP;
+ riter->back().assign_elements(
+ this->dp_group_stack.back());
+ }
+ this->dp_group_stack.pop_back();
+ } else {
+ this->dp_group_stack.back().PUSH_BACK(elem);
+ }
+ break;
+
+ default:
+ this->dp_group_stack.back().PUSH_BACK(elem);
+ break;
+ }
+ }
+
+ while (this->dp_group_stack.size() > 1) {
+ this->dp_group_token.pop_back();
+
+ auto riter = this->dp_group_stack.rbegin();
+ ++riter;
+ if (!this->dp_group_stack.back().empty()) {
+ state_stack.top().finalize();
+ this->dp_group_stack.back().el_format
+ = state_stack.top().dfs_format;
+ state_stack.pop();
+ (*riter).PUSH_BACK(element(this->dp_group_stack.back(), DNT_GROUP));
+ }
+ this->dp_group_stack.pop_back();
+ }
+
+ state_stack.top().finalize();
+ this->dp_group_stack.back().el_format = state_stack.top().dfs_format;
+}
+
+void
+data_parser::end_of_value(data_parser::element_list_t& el_stack,
+ data_parser::element_list_t& key_comps,
+ data_parser::element_list_t& value,
+ const data_parser::element_list_t& in_list,
+ int group_depth)
+{
+ key_comps.remove_if(element_if(in_list.el_format.df_terminator));
+ key_comps.remove_if(element_if(DT_COMMA));
+ value.remove_if(element_if(in_list.el_format.df_terminator));
+ value.remove_if(element_if(DT_COMMA));
+ strip(key_comps, element_is_space{});
+ strip(value, element_is_space{});
+ if ((el_stack.empty() || el_stack.back().e_token != DNT_KEY)
+ && value.empty() && key_comps.size() > 1
+ && (key_comps.front().e_token == DT_WORD
+ || key_comps.front().e_token == DT_SYMBOL))
+ {
+ element_list_t::iterator key_iter, key_end;
+ bool found_value = false;
+ int word_count = 0;
+ key_iter = key_comps.begin();
+ key_end = key_comps.begin();
+ for (; key_iter != key_comps.end(); ++key_iter) {
+ if (key_iter->e_token == DT_WORD || key_iter->e_token == DT_SYMBOL)
+ {
+ word_count += 1;
+ if (found_value) {
+ key_end = key_comps.begin();
+ }
+ } else if (key_iter->e_token == DT_WHITE
+ || key_iter->e_token == DT_CSI)
+ {
+ } else {
+ if (!found_value) {
+ key_end = key_iter;
+ }
+ found_value = true;
+ }
+ }
+ if (word_count != 1) {
+ key_end = key_comps.begin();
+ }
+ value.SPLICE(value.end(), key_comps, key_end, key_comps.end());
+ strip(key_comps, element_is_space{});
+ if (!key_comps.empty()) {
+ el_stack.PUSH_BACK(element(key_comps, DNT_KEY, false));
+ }
+ key_comps.CLEAR();
+ } else {
+ value.SPLICE(
+ value.end(), key_comps, key_comps.begin(), key_comps.end());
+ }
+ strip(value, element_is_space{});
+ strip(value, element_if(DT_COLON));
+ strip(value, element_is_space{});
+ if (!value.empty()) {
+ if (value.size() == 2 && value.back().e_token == DNT_GROUP) {
+ element_list_t ELEMENT_LIST_T(group_pair);
+
+ group_pair.PUSH_BACK(element(value, DNT_PAIR));
+ el_stack.PUSH_BACK(element(group_pair, DNT_VALUE));
+ } else {
+ el_stack.PUSH_BACK(element(value, DNT_VALUE));
+ }
+ }
+ value.CLEAR();
+}
+
+void
+data_parser::parse()
+{
+ this->discover_format();
+
+ this->pairup(
+ &this->dp_schema_id, this->dp_pairs, this->dp_group_stack.front());
+}
+
+std::string
+data_parser::get_element_string(const data_parser::element& elem) const
+{
+ return this->dp_scanner->to_string_fragment(elem.e_capture).to_string();
+}
+
+std::string
+data_parser::get_string_up_to_value(const data_parser::element& elem)
+{
+ const element& val_elem
+ = elem.e_token == DNT_PAIR ? elem.e_sub_elements->back() : elem;
+
+ if (this->dp_msg_format_begin <= val_elem.e_capture.c_begin) {
+ auto leading_and_key = data_scanner::capture_t(
+ this->dp_msg_format_begin, val_elem.e_capture.c_begin);
+ auto str = this->dp_scanner->get_input().data();
+ if (leading_and_key.length() >= 2) {
+ switch (str[leading_and_key.c_end - 1]) {
+ case '\'':
+ case '"':
+ leading_and_key.c_end -= 1;
+ switch (str[leading_and_key.c_end - 1]) {
+ case 'r':
+ case 'u':
+ leading_and_key.c_end -= 1;
+ break;
+ }
+ break;
+ }
+ switch (str[leading_and_key.c_begin]) {
+ case '\'':
+ case '"':
+ leading_and_key.c_begin += 1;
+ break;
+ }
+ }
+ this->dp_msg_format_begin = val_elem.e_capture.c_end;
+ return this->dp_scanner->to_string_fragment(leading_and_key)
+ .to_string();
+ } else {
+ this->dp_msg_format_begin = val_elem.e_capture.c_end;
+ }
+ return "";
+}
+
+const char*
+data_parser::get_element_string(const data_parser::element& elem,
+ size_t& len_out)
+{
+ len_out = elem.e_capture.length();
+ return this->dp_scanner->to_string_fragment(elem.e_capture).data();
+}
+
+void
+data_parser::print(FILE* out, data_parser::element_list_t& el)
+{
+ fprintf(out,
+ " %s\n",
+ this->dp_scanner->get_input().to_string().c_str());
+ for (auto& iter : el) {
+ iter.print(out, *this->dp_scanner);
+ }
+}
+
+FILE* data_parser::TRACE_FILE;
+
+data_format_state_t
+dfs_prefix_next(data_format_state_t state, data_token_t next_token)
+{
+ data_format_state_t retval = state;
+
+ switch (state) {
+ case DFS_INIT:
+ switch (next_token) {
+ case DT_PATH:
+ case DT_COLON:
+ case DT_EQUALS:
+ case DT_CONSTANT:
+ case DT_EMAIL:
+ case DT_WORD:
+ case DT_SYMBOL:
+ case DT_OCTAL_NUMBER:
+ case DT_HEX_NUMBER:
+ case DT_NUMBER:
+ case DT_WHITE:
+ case DT_CSI:
+ case DT_LSQUARE:
+ case DT_RSQUARE:
+ case DT_LANGLE:
+ case DT_RANGLE:
+ case DT_EMPTY_CONTAINER:
+ break;
+
+ default:
+ retval = DFS_ERROR;
+ break;
+ }
+ break;
+
+ case DFS_EXPECTING_SEP:
+ case DFS_ERROR:
+ retval = DFS_ERROR;
+ break;
+
+ default:
+ break;
+ }
+
+ return retval;
+}
+
+data_format_state_t
+dfs_semi_next(data_format_state_t state, data_token_t next_token)
+{
+ data_format_state_t retval = state;
+
+ switch (state) {
+ case DFS_INIT:
+ switch (next_token) {
+ case DT_COMMA:
+ case DT_SEMI:
+ retval = DFS_ERROR;
+ break;
+
+ default:
+ retval = DFS_KEY;
+ break;
+ }
+ break;
+
+ case DFS_KEY:
+ switch (next_token) {
+ case DT_COLON:
+ case DT_EQUALS:
+ retval = DFS_VALUE;
+ break;
+
+ case DT_SEMI:
+ retval = DFS_ERROR;
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ case DFS_VALUE:
+ switch (next_token) {
+ case DT_SEMI:
+ retval = DFS_INIT;
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ case DFS_EXPECTING_SEP:
+ case DFS_ERROR:
+ retval = DFS_ERROR;
+ break;
+ }
+
+ return retval;
+}
+
+data_format_state_t
+dfs_comma_next(data_format_state_t state, data_token_t next_token)
+{
+ data_format_state_t retval = state;
+
+ switch (state) {
+ case DFS_INIT:
+ switch (next_token) {
+ case DT_COMMA:
+ break;
+
+ case DT_SEMI:
+ retval = DFS_ERROR;
+ break;
+
+ default:
+ retval = DFS_KEY;
+ break;
+ }
+ break;
+
+ case DFS_KEY:
+ switch (next_token) {
+ case DT_COLON:
+ case DT_EQUALS:
+ retval = DFS_VALUE;
+ break;
+
+ case DT_COMMA:
+ retval = DFS_INIT;
+ break;
+
+ case DT_WORD:
+ retval = DFS_EXPECTING_SEP;
+ break;
+
+ case DT_SEMI:
+ retval = DFS_ERROR;
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ case DFS_EXPECTING_SEP:
+ switch (next_token) {
+ case DT_COLON:
+ case DT_EQUALS:
+ case DT_LPAREN:
+ case DT_LCURLY:
+ case DT_LSQUARE:
+ case DT_LANGLE:
+ retval = DFS_VALUE;
+ break;
+
+ case DT_EMPTY_CONTAINER:
+ retval = DFS_INIT;
+ break;
+
+ case DT_COMMA:
+ case DT_SEMI:
+ retval = DFS_ERROR;
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ case DFS_VALUE:
+ switch (next_token) {
+ case DT_COMMA:
+ retval = DFS_INIT;
+ break;
+
+ case DT_COLON:
+ case DT_EQUALS:
+ retval = DFS_ERROR;
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ case DFS_ERROR:
+ retval = DFS_ERROR;
+ break;
+ }
+
+ return retval;
+}
+
+data_parser::element::element()
+ : e_capture(-1, -1), e_token(DT_INVALID), e_sub_elements(nullptr)
+{
+}
+
+data_parser::element::element(data_parser::element_list_t& subs,
+ data_token_t token,
+ bool assign_subs_elements)
+ : e_capture(subs.front().e_capture.c_begin, subs.back().e_capture.c_end),
+ e_token(token), e_sub_elements(nullptr)
+{
+ if (assign_subs_elements) {
+ this->assign_elements(subs);
+ }
+}
+
+data_parser::element::element(const data_parser::element& other)
+{
+ /* require(other.e_sub_elements == nullptr); */
+
+ this->e_capture = other.e_capture;
+ this->e_token = other.e_token;
+ this->e_sub_elements = nullptr;
+ if (other.e_sub_elements != nullptr) {
+ this->assign_elements(*other.e_sub_elements);
+ }
+}
+
+data_parser::element::~element()
+{
+ delete this->e_sub_elements;
+ this->e_sub_elements = nullptr;
+}
+
+data_parser::element&
+data_parser::element::operator=(const data_parser::element& other)
+{
+ this->e_capture = other.e_capture;
+ this->e_token = other.e_token;
+ this->e_sub_elements = nullptr;
+ if (other.e_sub_elements != nullptr) {
+ this->assign_elements(*other.e_sub_elements);
+ }
+ return *this;
+}
+
+void
+data_parser::element::assign_elements(data_parser::element_list_t& subs)
+{
+ if (this->e_sub_elements == nullptr) {
+ this->e_sub_elements = new element_list_t("_sub_", __FILE__, __LINE__);
+ this->e_sub_elements->el_format = subs.el_format;
+ }
+ this->e_sub_elements->SWAP(subs);
+ this->update_capture();
+}
+
+void
+data_parser::element::update_capture()
+{
+ if (this->e_sub_elements != nullptr && !this->e_sub_elements->empty()) {
+ this->e_capture.c_begin
+ = this->e_sub_elements->front().e_capture.c_begin;
+ this->e_capture.c_end = this->e_sub_elements->back().e_capture.c_end;
+ }
+}
+
+const data_parser::element&
+data_parser::element::get_pair_value() const
+{
+ require(this->e_token == DNT_PAIR);
+
+ return this->e_sub_elements->back();
+}
+
+data_token_t
+data_parser::element::value_token() const
+{
+ data_token_t retval = DT_INVALID;
+
+ if (this->e_token == DNT_VALUE) {
+ if (this->e_sub_elements != nullptr
+ && this->e_sub_elements->size() == 1)
+ {
+ retval = this->e_sub_elements->front().e_token;
+ } else {
+ retval = DT_SYMBOL;
+ }
+ } else {
+ retval = this->e_token;
+ }
+ return retval;
+}
+
+const data_parser::element&
+data_parser::element::get_value_elem() const
+{
+ if (this->e_token == DNT_VALUE) {
+ if (this->e_sub_elements != nullptr
+ && this->e_sub_elements->size() == 1)
+ {
+ return this->e_sub_elements->front();
+ }
+ }
+ return *this;
+}
+
+const data_parser::element&
+data_parser::element::get_pair_elem() const
+{
+ if (this->e_token == DNT_VALUE) {
+ return this->e_sub_elements->front();
+ }
+ return *this;
+}
+
+void
+data_parser::element::print(FILE* out, data_scanner& ds, int offset) const
+{
+ int lpc;
+
+ if (this->e_sub_elements != nullptr) {
+ for (auto& e_sub_element : *this->e_sub_elements) {
+ e_sub_element.print(out, ds, offset + 1);
+ }
+ }
+
+ fprintf(out,
+ "%4s %3d:%-3d ",
+ data_scanner::token2name(this->e_token),
+ this->e_capture.c_begin,
+ this->e_capture.c_end);
+ for (lpc = 0; lpc < this->e_capture.c_end; lpc++) {
+ if (lpc == this->e_capture.c_begin) {
+ fputc('^', out);
+ } else if (lpc == (this->e_capture.c_end - 1)) {
+ fputc('^', out);
+ } else if (lpc > this->e_capture.c_begin) {
+ fputc('-', out);
+ } else {
+ fputc(' ', out);
+ }
+ }
+ for (; lpc < (int) ds.get_input().length(); lpc++) {
+ fputc(' ', out);
+ }
+
+ std::string sub = ds.to_string_fragment(this->e_capture).to_string();
+ fprintf(out, " %s\n", sub.c_str());
+}
+
+data_parser::discover_format_state::discover_format_state()
+ : dfs_prefix_state(DFS_INIT), dfs_semi_state(DFS_INIT),
+ dfs_comma_state(DFS_INIT)
+{
+ memset(this->dfs_hist, 0, sizeof(this->dfs_hist));
+}
+
+void
+data_parser::discover_format_state::update_for_element(
+ const data_parser::element& elem)
+{
+ this->dfs_prefix_state
+ = dfs_prefix_next(this->dfs_prefix_state, elem.e_token);
+ this->dfs_semi_state = dfs_semi_next(this->dfs_semi_state, elem.e_token);
+ this->dfs_comma_state = dfs_comma_next(this->dfs_comma_state, elem.e_token);
+ if (this->dfs_prefix_state != DFS_ERROR) {
+ if (this->dfs_semi_state == DFS_ERROR) {
+ this->dfs_semi_state = DFS_INIT;
+ }
+ if (this->dfs_comma_state == DFS_ERROR) {
+ this->dfs_comma_state = DFS_INIT;
+ }
+ }
+ this->dfs_hist[elem.e_token] += 1;
+}
+
+void
+data_parser::discover_format_state::finalize()
+{
+ data_token_t qualifier = this->dfs_format.df_qualifier;
+ data_token_t separator = this->dfs_format.df_separator;
+ data_token_t prefix_term = this->dfs_format.df_prefix_terminator;
+
+ this->dfs_format = FORMAT_PLAIN;
+ if (this->dfs_hist[DT_EQUALS]) {
+ qualifier = DT_COLON;
+ separator = DT_EQUALS;
+ }
+
+ if (this->dfs_semi_state != DFS_ERROR && this->dfs_hist[DT_SEMI]) {
+ this->dfs_format = FORMAT_SEMI;
+ } else if (this->dfs_comma_state != DFS_ERROR) {
+ this->dfs_format = FORMAT_COMMA;
+ if (separator == DT_COLON && this->dfs_hist[DT_COMMA] > 0) {
+ if (!((this->dfs_hist[DT_COLON] == this->dfs_hist[DT_COMMA])
+ || ((this->dfs_hist[DT_COLON] - 1)
+ == this->dfs_hist[DT_COMMA])))
+ {
+ separator = DT_INVALID;
+ if (this->dfs_hist[DT_COLON] == 1) {
+ prefix_term = DT_COLON;
+ }
+ }
+ }
+ }
+
+ this->dfs_format.df_qualifier = qualifier;
+ this->dfs_format.df_separator = separator;
+ this->dfs_format.df_prefix_terminator = prefix_term;
+}