summary refs log tree commit diff stats
path: root/src/data_scanner.cc
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- src/data_scanner.cc 140
1 files changed, 140 insertions, 0 deletions
diff --git a/src/data_scanner.cc b/src/data_scanner.cc
index f270a13..3727407 100644
--- a/src/data_scanner.cc
+++ b/src/data_scanner.cc
@@ -48,6 +48,9 @@ static struct {
"quot",
},
{
+ "comm",
+ },
+ {
"url",
},
{
@@ -109,6 +112,9 @@ static struct {
{
"semi",
},
+ {
+ "emda",
+ },
{
"empt",
@@ -179,9 +185,15 @@ static struct {
"word",
},
{
+ "id",
+ },
+ {
"sym",
},
{
+ "unit",
+ },
+ {
"line",
},
{
@@ -200,6 +212,15 @@ static struct {
{
"gbg",
},
+ {
+ "zwsp",
+ },
+ {
+ "dffi",
+ },
+ {
+ "dfch",
+ },
};
const char* DNT_NAMES[DNT_MAX - DNT_KEY] = {
@@ -263,3 +284,122 @@ data_scanner::is_credit_card(string_fragment cc) const
return double_even_sum % 10 == 0;
}
+
+/**
+ * Trim the tail of the scanner input.
+ *
+ * Characters are removed from the end of `ds_input` for as long as the
+ * last one is a period, a space, a carriage return, or a newline.  The
+ * loop stops at the first other character or once the input is empty.
+ */
+void
+data_scanner::cleanup_end()
+{
+    while (!this->ds_input.empty()) {
+        const auto tail = this->ds_input.back();
+        const bool strippable
+            = tail == '.' || tail == ' ' || tail == '\r' || tail == '\n';
+
+        if (!strippable) {
+            return;
+        }
+        this->ds_input.pop_back();
+    }
+}
+
+/**
+ * Produce the next token via tokenize_int() and keep the stack of
+ * currently open brackets (`ds_matching_brackets`) up to date.
+ *
+ * An opening bracket token is pushed onto the stack.  A closing bracket
+ * whose opener (per to_opener()) is on top of the stack only raises
+ * `ds_last_bracket_matched`; the opener itself is popped during the
+ * following call, so it is still on the stack when this call returns.
+ *
+ * @param tf Text format forwarded to tokenize_int().
+ * @return The result of tokenize_int(), passed through unchanged.
+ */
+nonstd::optional<data_scanner::tokenize_result>
+data_scanner::tokenize2(text_format_t tf)
+{
+    auto retval = this->tokenize_int(tf);
+
+    // Retire the opener that was paired up by the previous call.
+    if (this->ds_last_bracket_matched) {
+        this->ds_last_bracket_matched = false;
+        this->ds_matching_brackets.pop_back();
+    }
+
+    if (!retval) {
+        return retval;
+    }
+
+    const auto token = retval->tr_token;
+    const bool is_opener
+        = token == DT_LSQUARE || token == DT_LCURLY || token == DT_LPAREN;
+    const bool is_closer
+        = token == DT_RSQUARE || token == DT_RCURLY || token == DT_RPAREN;
+
+    if (is_opener) {
+        // Remember the opener so a later closer can be paired with it.
+        this->ds_matching_brackets.emplace_back(*retval);
+    } else if (is_closer && !this->ds_matching_brackets.empty()
+               && this->ds_matching_brackets.back().tr_token
+                   == to_opener(token))
+    {
+        // Only flag the match here; the pop is deferred to the next call.
+        this->ds_last_bracket_matched = true;
+    }
+
+    return retval;
+}
+
+/**
+ * Build a DNT_GROUP result that spans a bracket token and its partner.
+ *
+ * - Opening bracket: tokens are pulled with tokenize2() until a closer is
+ *   flagged as matched while the bracket stack has the size it had on
+ *   entry.  The group runs from the start of `tr` to the end of that
+ *   closer.  Note that this consumes tokens through tokenize2().
+ * - Closing bracket: the bracket stack is searched from its top for an
+ *   entry whose token equals to_opener() of `tr`.  The group runs from
+ *   the start of that entry to the end of `tr`.
+ *
+ * Both capture slots of the result receive the same span and the data
+ * slot is copied from `tr`.
+ *
+ * @param tf Text format forwarded to tokenize2().
+ * @param tr The bracket token to pair up.
+ * @return The group, or nonstd::nullopt when `tr` is not a bracket token,
+ *         when tokenize2() runs out of tokens first, or when no suitable
+ *         opener is on the stack.
+ */
+nonstd::optional<data_scanner::tokenize_result>
+data_scanner::find_matching_bracket(text_format_t tf, tokenize_result tr)
+{
+    const auto kind = tr.tr_token;
+
+    if (kind == DT_LSQUARE || kind == DT_LCURLY || kind == DT_LPAREN) {
+        const auto entry_depth = this->ds_matching_brackets.size();
+
+        while (auto tok_res = this->tokenize2(tf)) {
+            const bool closed_at_entry_depth
+                = this->ds_matching_brackets.size() == entry_depth
+                && this->ds_last_bracket_matched;
+
+            if (!closed_at_entry_depth) {
+                continue;
+            }
+
+            const auto span_begin = tr.tr_capture.c_begin;
+            const auto span_end = tok_res->tr_capture.c_end;
+
+            return tokenize_result{
+                DNT_GROUP,
+                {span_begin, span_end},
+                {span_begin, span_end},
+                tr.tr_data,
+            };
+        }
+
+        return nonstd::nullopt;
+    }
+
+    if (kind == DT_RSQUARE || kind == DT_RCURLY || kind == DT_RPAREN) {
+        // Walk the stack from the most recently pushed opener downwards.
+        auto candidate = this->ds_matching_brackets.rbegin();
+
+        while (candidate != this->ds_matching_brackets.rend()) {
+            if (candidate->tr_token == to_opener(kind)) {
+                const auto span_begin = candidate->tr_capture.c_begin;
+                const auto span_end = tr.tr_capture.c_end;
+
+                return tokenize_result{
+                    DNT_GROUP,
+                    {span_begin, span_end},
+                    {span_begin, span_end},
+                    tr.tr_data,
+                };
+            }
+            ++candidate;
+        }
+    }
+
+    return nonstd::nullopt;
+}