diff options
Diffstat (limited to '')
-rw-r--r-- | src/data_scanner.hh | 211 |
1 files changed, 211 insertions, 0 deletions
diff --git a/src/data_scanner.hh b/src/data_scanner.hh new file mode 100644 index 0000000..3859ebb --- /dev/null +++ b/src/data_scanner.hh @@ -0,0 +1,211 @@ +/** + * Copyright (c) 2007-2012, Timothy Stack + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * * Neither the name of Timothy Stack nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef data_scanner_hh +#define data_scanner_hh + +#include <string> + +#include "pcrepp/pcre2pp.hh" +#include "shared_buffer.hh" + +enum data_token_t { + DT_INVALID = -1, + + DT_QUOTED_STRING = 0, + DT_URL, + DT_PATH, + DT_MAC_ADDRESS, + DT_DATE, + DT_TIME, + DT_DATE_TIME, + DT_IPV6_ADDRESS, + DT_HEX_DUMP, + DT_XML_DECL_TAG, + DT_XML_EMPTY_TAG, + DT_XML_OPEN_TAG, + DT_XML_CLOSE_TAG, + + DT_H1, + DT_H2, + DT_H3, + + /* DT_QUALIFIED_NAME, */ + + DT_COLON, + DT_EQUALS, + DT_COMMA, + DT_SEMI, + + DT_EMPTY_CONTAINER, + + DT_LCURLY, + DT_RCURLY, + + DT_LSQUARE, + DT_RSQUARE, + + DT_LPAREN, + DT_RPAREN, + + DT_LANGLE, + DT_RANGLE, + + DT_IPV4_ADDRESS, + DT_UUID, + + DT_CREDIT_CARD_NUMBER, + DT_VERSION_NUMBER, + DT_OCTAL_NUMBER, + DT_PERCENTAGE, + DT_NUMBER, + DT_HEX_NUMBER, + + DT_EMAIL, + DT_CONSTANT, + DT_WORD, + DT_SYMBOL, + DT_LINE, + DT_WHITE, + DT_DOT, + DT_ESCAPED_CHAR, + DT_CSI, + + DT_GARBAGE, + + DT_TERMINAL_MAX = DT_GARBAGE + 1, + + DNT_KEY = 50, + DNT_PAIR, + DNT_VALUE, + DNT_ROW, + DNT_UNITS, + DNT_MEASUREMENT, + DNT_VARIABLE_KEY, + DNT_ROWRANGE, + DNT_GROUP, + + DNT_MAX, + + DT_ANY = 100, +}; + +class data_scanner { +public: + static const char* token2name(data_token_t token); + + struct capture_t { + capture_t() + { /* We don't initialize anything since it's a perf hit. */ + } + + capture_t(int begin, int end) : c_begin(begin), c_end(end) + { + assert(begin <= end); + } + + int c_begin; + int c_end; + + void ltrim(const char* str); + + bool contains(int pos) const + { + return this->c_begin <= pos && pos < this->c_end; + } + + bool is_valid() const { return this->c_begin != -1; } + + int length() const { return this->c_end - this->c_begin; } + + bool empty() const { return this->c_begin == this->c_end; } + }; + + data_scanner(const std::string& line, size_t off = 0) + : ds_line(line), ds_input(this->ds_line), ds_init_offset(off), + ds_next_offset(off) + { + if (!line.empty() && line.back() == '.') { + this->ds_input.sf_end -= 1; + } + } + + explicit data_scanner(string_fragment sf) : ds_input(sf) + { + if (!sf.empty() && sf.back() == '.') { + this->ds_input.sf_end -= 1; + } + } + + explicit data_scanner(shared_buffer_ref& line, size_t off, size_t end) + : ds_sbr(line), ds_input(line.to_string_fragment().sub_range(0, end)), + ds_init_offset(off), ds_next_offset(off) + { + if (!this->ds_input.empty() && this->ds_input.back() == '.') { + this->ds_input.sf_end -= 1; + } + } + + struct tokenize_result { + data_token_t tr_token{DT_INVALID}; + capture_t tr_capture; + capture_t tr_inner_capture; + const char* tr_data{nullptr}; + + std::string to_string() const + { + return {&this->tr_data[this->tr_capture.c_begin], + (size_t) this->tr_capture.length()}; + } + }; + + nonstd::optional<tokenize_result> tokenize2(); + + void reset() { this->ds_next_offset = this->ds_init_offset; } + + int get_init_offset() const { return this->ds_init_offset; } + + string_fragment get_input() const { return this->ds_input; } + + string_fragment to_string_fragment(capture_t cap) const + { + return this->ds_input.sub_range(cap.c_begin, cap.c_end); + } + +private: + bool is_credit_card(string_fragment frag) const; + + std::string ds_line; + shared_buffer_ref ds_sbr; + string_fragment ds_input; + int ds_init_offset{0}; + int ds_next_offset{0}; +}; + +#endif |