/** * Copyright (c) 2007-2012, Timothy Stack * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of Timothy Stack nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef data_scanner_hh #define data_scanner_hh #include #include "pcrepp/pcre2pp.hh" #include "shared_buffer.hh" #include "text_format.hh" enum data_token_t { DT_INVALID = -1, DT_QUOTED_STRING = 0, DT_COMMENT, DT_URL, DT_PATH, DT_MAC_ADDRESS, DT_DATE, DT_TIME, DT_DATE_TIME, DT_IPV6_ADDRESS, DT_HEX_DUMP, DT_XML_DECL_TAG, DT_XML_EMPTY_TAG, DT_XML_OPEN_TAG, DT_XML_CLOSE_TAG, DT_H1, DT_H2, DT_H3, /* DT_QUALIFIED_NAME, */ DT_COLON, DT_EQUALS, DT_COMMA, DT_SEMI, DT_EMDASH, DT_EMPTY_CONTAINER, DT_LCURLY, DT_RCURLY, DT_LSQUARE, DT_RSQUARE, DT_LPAREN, DT_RPAREN, DT_LANGLE, DT_RANGLE, DT_IPV4_ADDRESS, DT_UUID, DT_CREDIT_CARD_NUMBER, DT_VERSION_NUMBER, DT_OCTAL_NUMBER, DT_PERCENTAGE, DT_NUMBER, DT_HEX_NUMBER, DT_EMAIL, DT_CONSTANT, DT_WORD, DT_ID, DT_SYMBOL, DT_UNIT, DT_LINE, DT_WHITE, DT_DOT, DT_ESCAPED_CHAR, DT_CSI, DT_GARBAGE, DT_ZERO_WIDTH_SPACE, DT_DIFF_FILE_HEADER, DT_DIFF_HUNK_HEADING, DT_TERMINAL_MAX = DT_DIFF_HUNK_HEADING + 1, DNT_KEY = 54, DNT_PAIR, DNT_VALUE, DNT_ROW, DNT_UNITS, DNT_MEASUREMENT, DNT_VARIABLE_KEY, DNT_ROWRANGE, DNT_GROUP, DNT_MAX, DT_ANY = 100, }; class data_scanner { public: static const char* token2name(data_token_t token); struct capture_t { capture_t() { /* We don't initialize anything since it's a perf hit. */ } capture_t(int begin, int end) : c_begin(begin), c_end(end) { assert(begin <= end); } int c_begin; int c_end; void ltrim(const char* str); bool contains(int pos) const { return this->c_begin <= pos && pos < this->c_end; } bool is_valid() const { return this->c_begin != -1; } int length() const { return this->c_end - this->c_begin; } bool empty() const { return this->c_begin == this->c_end; } }; data_scanner(const std::string& line, size_t off = 0) : ds_line(line), ds_input(this->ds_line), ds_init_offset(off), ds_next_offset(off) { this->cleanup_end(); } explicit data_scanner(string_fragment sf) : ds_input(sf) { this->cleanup_end(); } explicit data_scanner(const shared_buffer_ref& line, size_t off, size_t end) : ds_sbr(line.clone()), ds_input(line.to_string_fragment().sub_range(0, end)), ds_init_offset(off), ds_next_offset(off) { this->cleanup_end(); } struct tokenize_result { data_token_t tr_token{DT_INVALID}; capture_t tr_capture; capture_t tr_inner_capture; const char* tr_data{nullptr}; string_fragment to_string_fragment() const { return string_fragment::from_byte_range(this->tr_data, this->tr_capture.c_begin, this->tr_capture.c_end); } string_fragment inner_string_fragment() const { return string_fragment::from_byte_range( this->tr_data, this->tr_inner_capture.c_begin, this->tr_inner_capture.c_end); } std::string to_string() const { return {&this->tr_data[this->tr_capture.c_begin], (size_t) this->tr_capture.length()}; } }; nonstd::optional tokenize2(text_format_t tf = text_format_t::TF_UNKNOWN); nonstd::optional find_matching_bracket(text_format_t tf, tokenize_result tr); void reset() { this->ds_next_offset = this->ds_init_offset; } int get_init_offset() const { return this->ds_init_offset; } string_fragment get_input() const { return this->ds_input; } string_fragment to_string_fragment(capture_t cap) const { return this->ds_input.sub_range(cap.c_begin, cap.c_end); } private: void cleanup_end(); bool is_credit_card(string_fragment frag) const; nonstd::optional tokenize_int(text_format_t tf = text_format_t::TF_UNKNOWN); std::string ds_line; shared_buffer_ref ds_sbr; string_fragment ds_input; int ds_init_offset{0}; int ds_next_offset{0}; bool ds_bol{true}; bool ds_units{false}; std::vector ds_matching_brackets; bool ds_last_bracket_matched{false}; }; inline data_token_t to_opener(data_token_t dt) { switch (dt) { case DT_XML_CLOSE_TAG: return DT_XML_OPEN_TAG; case DT_RCURLY: return DT_LCURLY; case DT_RSQUARE: return DT_LSQUARE; case DT_RPAREN: return DT_LPAREN; default: ensure(0); } } inline data_token_t to_closer(data_token_t dt) { switch (dt) { case DT_XML_OPEN_TAG: return DT_XML_CLOSE_TAG; case DT_LCURLY: return DT_RCURLY; case DT_LSQUARE: return DT_RSQUARE; case DT_LPAREN: return DT_RPAREN; default: ensure(0); } } #endif