summaryrefslogtreecommitdiffstats
path: root/src/data_parser.cc
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/data_parser.cc495
1 files changed, 425 insertions, 70 deletions
diff --git a/src/data_parser.cc b/src/data_parser.cc
index a751b30..aaec953 100644
--- a/src/data_parser.cc
+++ b/src/data_parser.cc
@@ -27,8 +27,6 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include <algorithm>
-
#include "data_parser.hh"
#include "config.h"
@@ -36,6 +34,7 @@
data_format data_parser::FORMAT_SEMI("semi", DT_COMMA, DT_SEMI);
data_format data_parser::FORMAT_COMMA("comma", DT_INVALID, DT_COMMA);
+data_format data_parser::FORMAT_EMDASH("emdash", DT_INVALID, DT_EMDASH);
data_format data_parser::FORMAT_PLAIN("plain", DT_INVALID, DT_INVALID);
data_parser::data_parser(data_scanner* ds)
@@ -79,12 +78,13 @@ data_parser::pairup(data_parser::schema_id_t* schema,
if (in_list.el_format.df_prefix_terminator != DT_INVALID) {
if (iter->e_token == in_list.el_format.df_prefix_terminator) {
in_list.el_format.df_prefix_terminator = DT_INVALID;
+ in_list.el_format.df_separator = DT_COLON;
} else {
el_stack.PUSH_BACK(*iter);
}
} else if (iter->e_token == in_list.el_format.df_terminator) {
this->end_of_value(
- el_stack, key_comps, value, in_list, group_depth);
+ el_stack, key_comps, value, in_list, group_depth, iter);
key_comps.PUSH_BACK(*iter);
} else if (iter->e_token == in_list.el_format.df_qualifier) {
@@ -94,9 +94,17 @@ data_parser::pairup(data_parser::schema_id_t* schema,
if (!value.empty()) {
el_stack.PUSH_BACK(element(value, DNT_VALUE));
}
- } else if (iter->e_token == in_list.el_format.df_separator) {
+ value.CLEAR();
+ } else if (iter->e_token == in_list.el_format.df_separator
+ || iter->e_token == DNT_GROUP)
+ {
auto key_iter = key_comps.end();
- bool found = false, key_is_values = true;
+ bool found = false, key_is_values = true, mixed_values = false;
+ auto last_is_key = !key_comps.empty()
+ && (key_comps.back().e_token == DT_WORD
+ || key_comps.back().e_token == DT_SYMBOL);
+ element_list_t ELEMENT_LIST_T(mixed_queue),
+ ELEMENT_LIST_T(mixed_tail);
if (!key_comps.empty()) {
do {
@@ -107,7 +115,9 @@ data_parser::pairup(data_parser::schema_id_t* schema,
key_comps,
key_comps.begin(),
key_iter);
- key_comps.POP_FRONT();
+ if (!key_comps.empty()) {
+ key_comps.POP_FRONT();
+ }
found = true;
} else if (key_iter->e_token
== in_list.el_format.df_terminator)
@@ -127,19 +137,133 @@ data_parser::pairup(data_parser::schema_id_t* schema,
}
found = true;
}
- if (key_iter != key_comps.end()) {
+ if (!found && key_iter != key_comps.end()) {
switch (key_iter->e_token) {
case DT_WORD:
case DT_SYMBOL:
key_is_values = false;
break;
+ case DT_WHITE:
+ break;
+ case DT_ID:
+ case DT_QUOTED_STRING:
+ case DT_URL:
+ case DT_PATH:
+ case DT_MAC_ADDRESS:
+ case DT_DATE:
+ case DT_TIME:
+ case DT_DATE_TIME:
+ case DT_IPV4_ADDRESS:
+ case DT_IPV6_ADDRESS:
+ case DT_HEX_DUMP:
+ case DT_UUID:
+ case DT_CREDIT_CARD_NUMBER:
+ case DT_VERSION_NUMBER:
+ case DT_OCTAL_NUMBER:
+ case DT_PERCENTAGE:
+ case DT_NUMBER:
+ case DT_HEX_NUMBER:
+ case DT_EMAIL:
+ case DT_CONSTANT:
+ case DNT_MEASUREMENT: {
+ if (((in_list.el_format.df_terminator
+ != DT_INVALID
+ && !el_stack.empty())
+ || (key_comps.size() == 1
+ && mixed_queue.empty()))
+ && key_iter->e_token == DT_ID)
+ {
+ key_is_values = false;
+ } else if (in_list.el_format.df_terminator
+ == DT_INVALID
+ || el_stack.empty())
+ {
+ element_list_t ELEMENT_LIST_T(mixed_key);
+ element_list_t ELEMENT_LIST_T(mixed_value);
+
+ mixed_values = true;
+ auto value_iter = key_iter;
+ if (last_is_key) {
+ if (mixed_tail.empty()) {
+ mixed_tail.SPLICE(
+ mixed_tail.end(),
+ key_comps,
+ std::next(value_iter),
+ key_comps.end());
+ }
+ } else {
+ while (std::prev(key_comps.end())
+ != value_iter)
+ {
+ key_comps.POP_BACK();
+ }
+ }
+ key_iter = std::next(value_iter);
+ mixed_value.SPLICE(mixed_value.end(),
+ key_comps,
+ value_iter,
+ key_iter);
+ if (!el_stack.empty()
+ && el_stack.back().e_token == DNT_KEY
+ && key_comps.empty())
+ {
+ el_stack.PUSH_BACK(
+ element(mixed_value, DNT_VALUE));
+ } else {
+ mixed_queue.PUSH_FRONT(
+ element(mixed_value, DNT_VALUE));
+ if (!key_comps.empty()) {
+ if (key_comps.back().e_token
+ == DT_WORD)
+ {
+ key_iter = std::prev(
+ key_comps.end());
+ mixed_key.SPLICE(
+ mixed_key.end(),
+ key_comps,
+ key_iter,
+ key_comps.end());
+ mixed_queue.PUSH_FRONT(element(
+ mixed_key, DNT_KEY));
+ }
+ }
+ }
+ while (!key_comps.empty()
+ && !key_comps.back().is_value())
+ {
+ key_comps.POP_BACK();
+ }
+ key_iter = key_comps.end();
+ }
+ break;
+ }
default:
break;
}
}
} while (key_iter != key_comps.begin() && !found);
}
- if (!found && !el_stack.empty() && !key_comps.empty()) {
+ if (!mixed_queue.empty()) {
+ if (el_stack.back().e_token == DNT_KEY
+ && mixed_queue.front().e_token == DNT_KEY)
+ {
+ el_stack.POP_BACK();
+ }
+ el_stack.SPLICE(el_stack.end(),
+ mixed_queue,
+ mixed_queue.begin(),
+ mixed_queue.end());
+ }
+ if (!mixed_tail.empty()) {
+ key_comps.CLEAR();
+ key_comps.SPLICE(key_comps.end(),
+ mixed_tail,
+ std::prev(mixed_tail.end()),
+ mixed_tail.end());
+ }
+ if (!found && !mixed_values && !el_stack.empty()
+ && !key_comps.empty())
+ {
element_list_t::iterator value_iter;
if (el_stack.size() > 1
@@ -167,15 +291,31 @@ data_parser::pairup(data_parser::schema_id_t* schema,
}
strip(key_comps, element_is_space{});
if (!key_comps.empty()) {
- if (key_is_values) {
- el_stack.PUSH_BACK(element(key_comps, DNT_VALUE));
- } else {
- el_stack.PUSH_BACK(element(key_comps, DNT_KEY, false));
+ if (mixed_values) {
+ key_is_values = false;
+ while (key_comps.size() > 1) {
+ key_comps.POP_FRONT();
+ }
+ }
+ if (!key_comps.empty()) {
+ if (key_is_values) {
+ el_stack.PUSH_BACK(element(key_comps, DNT_VALUE));
+ } else {
+ el_stack.PUSH_BACK(element(key_comps, DNT_KEY, false));
+ }
}
}
key_comps.CLEAR();
value.CLEAR();
- } else {
+
+ if (iter->e_token == DNT_GROUP) {
+ value.PUSH_BACK(*iter);
+ el_stack.PUSH_BACK(element(value, DNT_VALUE));
+ value.CLEAR();
+ }
+ } else if (iter->e_token != DT_WHITE && iter->e_token != DT_CSI
+ && iter->e_token != DT_LINE)
+ {
key_comps.PUSH_BACK(*iter);
}
@@ -192,7 +332,8 @@ data_parser::pairup(data_parser::schema_id_t* schema,
free_row.SPLICE(
free_row.begin(), key_comps, key_comps.begin(), key_comps.end());
} else {
- this->end_of_value(el_stack, key_comps, value, in_list, group_depth);
+ this->end_of_value(
+ el_stack, key_comps, value, in_list, group_depth, in_list.end());
}
POINT_TRACE("pairup_stack");
@@ -324,6 +465,7 @@ data_parser::pairup(data_parser::schema_id_t* schema,
case DT_CONSTANT:
case DT_NUMBER:
case DT_SYMBOL:
+ case DT_ID:
case DT_HEX_NUMBER:
case DT_OCTAL_NUMBER:
case DT_VERSION_NUMBER:
@@ -341,7 +483,8 @@ data_parser::pairup(data_parser::schema_id_t* schema,
case DT_PATH:
case DT_DATE:
case DT_TIME:
- case DT_PERCENTAGE: {
+ case DT_PERCENTAGE:
+ case DNT_MEASUREMENT: {
element_list_t ELEMENT_LIST_T(pair_subs);
struct element blank;
@@ -365,7 +508,8 @@ data_parser::pairup(data_parser::schema_id_t* schema,
= this->get_element_string(free_row.front(), key_len);
context.Update(key_val, key_len);
- } break;
+ break;
+ }
}
free_row.POP_FRONT();
@@ -384,10 +528,6 @@ data_parser::pairup(data_parser::schema_id_t* schema,
pairs_out.PUSH_FRONT(element(pair_subs, DNT_PAIR));
}
- if (schema != nullptr) {
- context.Final(schema->out(0), schema->out(1));
- }
-
if (schema != nullptr && this->dp_msg_format != nullptr) {
for (auto& fiter : pairs_out) {
*(this->dp_msg_format) += this->get_string_up_to_value(fiter);
@@ -407,6 +547,12 @@ data_parser::pairup(data_parser::schema_id_t* schema,
}
*(this->dp_msg_format) += last.to_string();
}
+ context.Update(this->dp_msg_format->c_str(),
+ this->dp_msg_format->length());
+ }
+
+ if (schema != nullptr) {
+ context.Final(schema->out(0), schema->out(1));
}
if (pairs_out.size() > 1000) {
@@ -495,6 +641,20 @@ data_parser::discover_format()
}
break;
+ case DT_UNIT: {
+ element_list_t measurement_list;
+
+ measurement_list.SPLICE(
+ measurement_list.end(),
+ this->dp_group_stack.back(),
+ std::prev(this->dp_group_stack.back().end()),
+ this->dp_group_stack.back().end());
+ measurement_list.PUSH_BACK(elem);
+ this->dp_group_stack.back().PUSH_BACK(
+ element(measurement_list, DNT_MEASUREMENT));
+ break;
+ }
+
default:
this->dp_group_stack.back().PUSH_BACK(elem);
break;
@@ -525,67 +685,212 @@ data_parser::end_of_value(data_parser::element_list_t& el_stack,
data_parser::element_list_t& key_comps,
data_parser::element_list_t& value,
const data_parser::element_list_t& in_list,
- int group_depth)
+ int group_depth,
+ element_list_t::iterator iter)
{
- key_comps.remove_if(element_if(in_list.el_format.df_terminator));
- key_comps.remove_if(element_if(DT_COMMA));
- value.remove_if(element_if(in_list.el_format.df_terminator));
- value.remove_if(element_if(DT_COMMA));
- strip(key_comps, element_is_space{});
- strip(value, element_is_space{});
- if ((el_stack.empty() || el_stack.back().e_token != DNT_KEY)
- && value.empty() && key_comps.size() > 1
- && (key_comps.front().e_token == DT_WORD
- || key_comps.front().e_token == DT_SYMBOL))
- {
- element_list_t::iterator key_iter, key_end;
- bool found_value = false;
- int word_count = 0;
- key_iter = key_comps.begin();
- key_end = key_comps.begin();
- for (; key_iter != key_comps.end(); ++key_iter) {
- if (key_iter->e_token == DT_WORD || key_iter->e_token == DT_SYMBOL)
- {
- word_count += 1;
- if (found_value) {
- key_end = key_comps.begin();
+ auto key_iter = key_comps.end();
+ bool found = false, key_is_values = true, mixed_values = false;
+ auto last_is_key = !key_comps.empty()
+ && (key_comps.back().e_token == DT_WORD
+ || key_comps.back().e_token == DT_SYMBOL);
+ element_list_t ELEMENT_LIST_T(mixed_queue), ELEMENT_LIST_T(mixed_tail);
+
+ if (!key_comps.empty()) {
+ do {
+ --key_iter;
+ if (key_iter->e_token == in_list.el_format.df_appender) {
+ ++key_iter;
+ value.SPLICE(
+ value.end(), key_comps, key_comps.begin(), key_iter);
+ if (!key_comps.empty()) {
+ key_comps.POP_FRONT();
}
- } else if (key_iter->e_token == DT_WHITE
- || key_iter->e_token == DT_CSI)
- {
- } else {
- if (!found_value) {
- key_end = key_iter;
+ found = true;
+ } else if (key_iter->e_token == in_list.el_format.df_terminator) {
+ value.SPLICE(
+ value.end(), key_comps, key_comps.begin(), key_iter);
+ key_comps.POP_FRONT();
+ strip(key_comps, element_is_space{});
+ if (key_comps.empty()) {
+ key_iter = key_comps.end();
+ } else {
+ key_iter = key_comps.begin();
}
- found_value = true;
+ found = true;
}
+ if (!found && key_iter != key_comps.end()) {
+ switch (key_iter->e_token) {
+ case DT_WORD:
+ case DT_SYMBOL:
+ key_is_values = false;
+ break;
+ case DT_WHITE:
+ break;
+ case DT_ID:
+ case DT_QUOTED_STRING:
+ case DT_URL:
+ case DT_PATH:
+ case DT_MAC_ADDRESS:
+ case DT_DATE:
+ case DT_TIME:
+ case DT_DATE_TIME:
+ case DT_IPV4_ADDRESS:
+ case DT_IPV6_ADDRESS:
+ case DT_HEX_DUMP:
+ case DT_UUID:
+ case DT_CREDIT_CARD_NUMBER:
+ case DT_VERSION_NUMBER:
+ case DT_OCTAL_NUMBER:
+ case DT_PERCENTAGE:
+ case DT_NUMBER:
+ case DT_HEX_NUMBER:
+ case DT_EMAIL:
+ case DT_CONSTANT:
+ case DNT_MEASUREMENT: {
+ if (((in_list.el_format.df_terminator != DT_INVALID
+ && !el_stack.empty())
+ || (key_comps.size() == 1 && mixed_queue.empty()))
+ && key_iter->e_token == DT_ID)
+ {
+ key_is_values = false;
+ } else if (in_list.el_format.df_terminator == DT_INVALID
+ || el_stack.empty())
+ {
+ element_list_t ELEMENT_LIST_T(mixed_key);
+ element_list_t ELEMENT_LIST_T(mixed_value);
+
+ mixed_values = true;
+ auto value_iter = key_iter;
+ if (last_is_key) {
+ if (mixed_tail.empty()) {
+ mixed_tail.SPLICE(mixed_tail.end(),
+ key_comps,
+ std::next(value_iter),
+ key_comps.end());
+ }
+ } else {
+ while (std::prev(key_comps.end()) != value_iter)
+ {
+ key_comps.POP_BACK();
+ }
+ }
+ key_iter = std::next(value_iter);
+ mixed_value.SPLICE(mixed_value.end(),
+ key_comps,
+ value_iter,
+ key_iter);
+ if (!el_stack.empty()
+ && el_stack.back().e_token == DNT_KEY
+ && key_comps.empty())
+ {
+ el_stack.PUSH_BACK(
+ element(mixed_value, DNT_VALUE));
+ } else {
+ mixed_queue.PUSH_FRONT(
+ element(mixed_value, DNT_VALUE));
+ if (!key_comps.empty()) {
+ if (key_comps.back().e_token == DT_WORD) {
+ key_iter = std::prev(key_comps.end());
+ mixed_key.SPLICE(mixed_key.end(),
+ key_comps,
+ key_iter,
+ key_comps.end());
+ mixed_queue.PUSH_FRONT(
+ element(mixed_key, DNT_KEY));
+ }
+ }
+ }
+ while (!key_comps.empty()
+ && !key_comps.back().is_value())
+ {
+ key_comps.POP_BACK();
+ }
+ key_iter = key_comps.end();
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ } while (key_iter != key_comps.begin() && !found);
+ }
+ if (!mixed_queue.empty()) {
+ if (el_stack.back().e_token == DNT_KEY
+ && mixed_queue.front().e_token == DNT_KEY)
+ {
+ el_stack.POP_BACK();
}
- if (word_count != 1) {
- key_end = key_comps.begin();
- }
- value.SPLICE(value.end(), key_comps, key_end, key_comps.end());
- strip(key_comps, element_is_space{});
- if (!key_comps.empty()) {
- el_stack.PUSH_BACK(element(key_comps, DNT_KEY, false));
- }
+ el_stack.SPLICE(el_stack.end(),
+ mixed_queue,
+ mixed_queue.begin(),
+ mixed_queue.end());
+ }
+ if (!mixed_tail.empty()) {
key_comps.CLEAR();
- } else {
+ key_comps.SPLICE(key_comps.end(),
+ mixed_tail,
+ std::prev(mixed_tail.end()),
+ mixed_tail.end());
+ }
+ if (!mixed_values && !el_stack.empty() && !key_comps.empty()) {
+ element_list_t::iterator value_iter;
+
+ if (el_stack.size() > 1 && in_list.el_format.df_appender != DT_INVALID
+ && in_list.el_format.df_terminator != DT_INVALID
+ && iter->e_token == in_list.el_format.df_separator)
+ {
+ /* If we're expecting a terminator and haven't found it */
+ /* then this is part of the value. */
+ return;
+ }
+
value.SPLICE(
value.end(), key_comps, key_comps.begin(), key_comps.end());
+
+ if (value.size() == 2
+ && (value.front().e_token == DT_WORD
+ || value.front().e_token == DT_SYMBOL
+ || value.front().e_token == DT_ID)
+ && el_stack.back().e_token != DNT_KEY)
+ {
+ element_list_t ELEMENT_LIST_T(mixed_key);
+
+ mixed_key.SPLICE(mixed_key.end(),
+ value,
+ value.begin(),
+ std::next(value.begin()));
+ el_stack.PUSH_BACK(element(mixed_key, DNT_KEY, false));
+ }
}
+
strip(value, element_is_space{});
- strip(value, element_if(DT_COLON));
- strip(value, element_is_space{});
+ value.remove_if(element_if(DT_COMMA));
if (!value.empty()) {
- if (value.size() == 2 && value.back().e_token == DNT_GROUP) {
- element_list_t ELEMENT_LIST_T(group_pair);
-
- group_pair.PUSH_BACK(element(value, DNT_PAIR));
- el_stack.PUSH_BACK(element(group_pair, DNT_VALUE));
- } else {
- el_stack.PUSH_BACK(element(value, DNT_VALUE));
+ el_stack.PUSH_BACK(element(value, DNT_VALUE));
+ }
+ strip(key_comps, element_is_space{});
+ if (!key_comps.empty()) {
+ if (mixed_values) {
+ key_is_values = false;
+ while (key_comps.size() > 1) {
+ key_comps.POP_FRONT();
+ }
+ }
+ if (!key_comps.empty()) {
+ if (iter == in_list.end()
+ || iter->e_token != in_list.el_format.df_separator)
+ {
+ key_is_values = true;
+ }
+ if (key_is_values) {
+ el_stack.PUSH_BACK(element(key_comps, DNT_VALUE));
+ } else {
+ el_stack.PUSH_BACK(element(key_comps, DNT_KEY, false));
+ }
}
}
+ key_comps.CLEAR();
value.CLEAR();
}
@@ -679,6 +984,7 @@ dfs_prefix_next(data_format_state_t state, data_token_t next_token)
case DT_EMAIL:
case DT_WORD:
case DT_SYMBOL:
+ case DT_ID:
case DT_OCTAL_NUMBER:
case DT_HEX_NUMBER:
case DT_NUMBER:
@@ -1009,6 +1315,37 @@ data_parser::element::print(FILE* out, data_scanner& ds, int offset) const
fprintf(out, " %s\n", sub.c_str());
}
+bool
+data_parser::element::is_value() const
+{
+ switch (this->e_token) {
+ case DNT_MEASUREMENT:
+ case DT_ID:
+ case DT_QUOTED_STRING:
+ case DT_URL:
+ case DT_PATH:
+ case DT_MAC_ADDRESS:
+ case DT_DATE:
+ case DT_TIME:
+ case DT_DATE_TIME:
+ case DT_IPV4_ADDRESS:
+ case DT_IPV6_ADDRESS:
+ case DT_HEX_DUMP:
+ case DT_UUID:
+ case DT_CREDIT_CARD_NUMBER:
+ case DT_VERSION_NUMBER:
+ case DT_OCTAL_NUMBER:
+ case DT_PERCENTAGE:
+ case DT_NUMBER:
+ case DT_HEX_NUMBER:
+ case DT_EMAIL:
+ case DT_CONSTANT:
+ return true;
+ default:
+ return false;
+ }
+}
+
data_parser::discover_format_state::discover_format_state()
: dfs_prefix_state(DFS_INIT), dfs_semi_state(DFS_INIT),
dfs_comma_state(DFS_INIT)
@@ -1051,14 +1388,18 @@ data_parser::discover_format_state::finalize()
if (this->dfs_semi_state != DFS_ERROR && this->dfs_hist[DT_SEMI]) {
this->dfs_format = FORMAT_SEMI;
} else if (this->dfs_comma_state != DFS_ERROR) {
- this->dfs_format = FORMAT_COMMA;
+ if (this->dfs_hist[DT_COMMA] > 0) {
+ this->dfs_format = FORMAT_COMMA;
+ } else if (this->dfs_hist[DT_EMDASH] > 0) {
+ this->dfs_format = FORMAT_EMDASH;
+ }
if (separator == DT_COLON && this->dfs_hist[DT_COMMA] > 0) {
if (!((this->dfs_hist[DT_COLON] == this->dfs_hist[DT_COMMA])
|| ((this->dfs_hist[DT_COLON] - 1)
== this->dfs_hist[DT_COMMA])))
{
separator = DT_INVALID;
- if (this->dfs_hist[DT_COLON] == 1) {
+ if (this->dfs_hist[DT_COLON] > 0) {
prefix_term = DT_COLON;
}
}
@@ -1069,3 +1410,17 @@ data_parser::discover_format_state::finalize()
this->dfs_format.df_separator = separator;
this->dfs_format.df_prefix_terminator = prefix_term;
}
+
+void
+data_parser::element_list_t::push_back(const data_parser::element& elem,
+ const char* fn,
+ int line)
+{
+ ELEMENT_TRACE;
+
+ require(elem.e_capture.c_end >= -1);
+ require(this->empty()
+ || (elem.e_capture.c_begin == -1 && elem.e_capture.c_end == -1)
+ || this->back().e_capture.c_end <= elem.e_capture.c_begin);
+ this->std::list<element>::push_back(elem);
+}