diff options
Diffstat (limited to '')
-rw-r--r-- | src/data_parser.hh | 431 |
1 files changed, 431 insertions, 0 deletions
diff --git a/src/data_parser.hh b/src/data_parser.hh new file mode 100644 index 0000000..ca54a58 --- /dev/null +++ b/src/data_parser.hh @@ -0,0 +1,431 @@ +/** + * Copyright (c) 2007-2012, Timothy Stack + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * * Neither the name of Timothy Stack nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef data_parser_hh +#define data_parser_hh + +#include <iterator> +#include <list> +#include <stack> +#include <vector> + +#include <stdio.h> + +#include "base/lnav_log.hh" +#include "byte_array.hh" +#include "data_scanner.hh" + +#define ELEMENT_LIST_T(var) var("" #var, __FILE__, __LINE__, group_depth) +#define PUSH_FRONT(elem) push_front(elem, __FILE__, __LINE__) +#define PUSH_BACK(elem) push_back(elem, __FILE__, __LINE__) +#define POP_FRONT(elem) pop_front(__FILE__, __LINE__) +#define POP_BACK(elem) pop_back(__FILE__, __LINE__) +#define CLEAR(elem) clear2(__FILE__, __LINE__) +#define SWAP(other) swap(other, __FILE__, __LINE__) +#define SPLICE(pos, other, first, last) \ + splice(pos, other, first, last, __FILE__, __LINE__) + +template<class Container, class UnaryPredicate> +void +strip(Container& container, UnaryPredicate p) +{ + while (!container.empty() && p(container.front())) { + container.POP_FRONT(); + } + while (!container.empty() && p(container.back())) { + container.POP_BACK(); + } +} + +enum data_format_state_t { + DFS_ERROR = -1, + DFS_INIT, + DFS_KEY, + DFS_EXPECTING_SEP, + DFS_VALUE, +}; + +struct data_format { + data_format(const char* name = nullptr, + data_token_t appender = DT_INVALID, + data_token_t terminator = DT_INVALID) noexcept + : df_name(name), df_appender(appender), df_terminator(terminator), + df_qualifier(DT_INVALID), df_separator(DT_COLON), + df_prefix_terminator(DT_INVALID) + { + } + + const char* df_name; + data_token_t df_appender; + data_token_t df_terminator; + data_token_t df_qualifier; + data_token_t df_separator; + data_token_t df_prefix_terminator; +}; + +data_format_state_t dfs_prefix_next(data_format_state_t state, + data_token_t next_token); +data_format_state_t dfs_semi_next(data_format_state_t state, + data_token_t next_token); +data_format_state_t dfs_comma_next(data_format_state_t state, + data_token_t next_token); + +#define LIST_INIT_TRACE \ + do { \ + if (TRACE_FILE != NULL) { \ + fprintf(TRACE_FILE, \ + "%p %s:%d %s %s %d\n", \ + this, \ + fn, \ + line, \ + __func__, \ + varname, \ + group_depth); \ + } \ + } while (false) + +#define LIST_DEINIT_TRACE \ + do { \ + if (TRACE_FILE != NULL) { \ + fprintf(TRACE_FILE, "%p %s:%d %s\n", this, fn, line, __func__); \ + } \ + } while (false) + +#define ELEMENT_TRACE \ + do { \ + if (TRACE_FILE != NULL) { \ + fprintf(TRACE_FILE, \ + "%p %s:%d %s %s %d:%d\n", \ + this, \ + fn, \ + line, \ + __func__, \ + data_scanner::token2name(elem.e_token), \ + elem.e_capture.c_begin, \ + elem.e_capture.c_end); \ + } \ + } while (false) + +#define LIST_TRACE \ + do { \ + if (TRACE_FILE != NULL) { \ + fprintf(TRACE_FILE, "%p %s:%d %s\n", this, fn, line, __func__); \ + } \ + } while (false) + +#define SPLICE_TRACE \ + do { \ + if (TRACE_FILE != NULL) { \ + fprintf(TRACE_FILE, \ + "%p %s:%d %s %d %p %d:%d\n", \ + this, \ + fn, \ + line, \ + __func__, \ + (int) std::distance(this->begin(), pos), \ + &other, \ + (int) std::distance(other.begin(), first), \ + (int) std::distance(last, other.end())); \ + } \ + } while (false); + +#define SWAP_TRACE(other) \ + do { \ + if (TRACE_FILE != NULL) { \ + fprintf(TRACE_FILE, \ + "%p %s:%d %s %p\n", \ + this, \ + fn, \ + line, \ + __func__, \ + &other); \ + } \ + } while (false); + +#define POINT_TRACE(name) \ + do { \ + if (TRACE_FILE) { \ + fprintf( \ + TRACE_FILE, "0x0 %s:%d point %s\n", __FILE__, __LINE__, name); \ + } \ + } while (false); + +#define FORMAT_TRACE(elist) \ + do { \ + if (TRACE_FILE) { \ + const data_format& df = elist.el_format; \ + fprintf(TRACE_FILE, \ + "%p %s:%d format %d %s %s %s %s %s\n", \ + &elist, \ + __FILE__, \ + __LINE__, \ + group_depth, \ + data_scanner::token2name(df.df_appender), \ + data_scanner::token2name(df.df_terminator), \ + data_scanner::token2name(df.df_qualifier), \ + data_scanner::token2name(df.df_separator), \ + data_scanner::token2name(df.df_prefix_terminator)); \ + } \ + } while (false); + +#define CONSUMED_TRACE(elist) \ + do { \ + if (TRACE_FILE) { \ + fprintf(TRACE_FILE, \ + "%p %s:%d consumed\n", \ + &elist, \ + __FILE__, \ + __LINE__); \ + } \ + } while (false); + +class data_parser { +public: + static data_format FORMAT_SEMI; + static data_format FORMAT_COMMA; + static data_format FORMAT_PLAIN; + + static FILE* TRACE_FILE; + + typedef byte_array<2, uint64_t> schema_id_t; + + struct element; + /* typedef std::list<element> element_list_t; */ + + class element_list_t : public std::list<element> { + public: + element_list_t(const char* varname, + const char* fn, + int line, + int group_depth = -1) + { + LIST_INIT_TRACE; + } + + element_list_t() + { + const char* varname = "_anon2_"; + const char* fn = __FILE__; + int line = __LINE__; + int group_depth = -1; + + LIST_INIT_TRACE; + } + + element_list_t(const element_list_t& other) : std::list<element>(other) + { + this->el_format = other.el_format; + } + + ~element_list_t() + { + const char* fn = __FILE__; + int line = __LINE__; + + LIST_DEINIT_TRACE; + } + + void push_front(const element& elem, const char* fn, int line) + { + ELEMENT_TRACE; + + require(elem.e_capture.c_end >= -1); + this->std::list<element>::push_front(elem); + } + + void push_back(const element& elem, const char* fn, int line) + { + ELEMENT_TRACE; + + require(elem.e_capture.c_end >= -1); + this->std::list<element>::push_back(elem); + } + + void pop_front(const char* fn, int line) + { + LIST_TRACE; + + this->std::list<element>::pop_front(); + } + + void pop_back(const char* fn, int line) + { + LIST_TRACE; + + this->std::list<element>::pop_back(); + } + + void clear2(const char* fn, int line) + { + LIST_TRACE; + + this->std::list<element>::clear(); + } + + void swap(element_list_t& other, const char* fn, int line) + { + SWAP_TRACE(other); + + this->std::list<element>::swap(other); + } + + void splice(iterator pos, + element_list_t& other, + iterator first, + iterator last, + const char* fn, + int line) + { + SPLICE_TRACE; + + this->std::list<element>::splice(pos, other, first, last); + } + + data_format el_format; + }; + + struct element { + element(); + + element(element_list_t& subs, + data_token_t token, + bool assign_subs_elements = true); + + element(const element& other); + + ~element(); + + element& operator=(const element& other); + + void assign_elements(element_list_t& subs); + + void update_capture(); + + const element& get_pair_value() const; + + data_token_t value_token() const; + + const element& get_value_elem() const; + + const element& get_pair_elem() const; + + void print(FILE* out, data_scanner&, int offset = 0) const; + + data_scanner::capture_t e_capture; + data_token_t e_token; + + element_list_t* e_sub_elements; + }; + + struct element_cmp { + bool operator()(data_token_t token, const element& elem) const + { + return token == elem.e_token || token == DT_ANY; + } + + bool operator()(const element& elem, data_token_t token) const + { + return (*this)(token, elem); + } + }; + + struct element_if { + element_if(data_token_t token) : ei_token(token) {} + + bool operator()(const element& a) const + { + return a.e_token == this->ei_token; + } + + private: + data_token_t ei_token; + }; + + struct element_is_space { + bool operator()(const element& el) const + { + return el.e_token == DT_WHITE || el.e_token == DT_CSI; + } + }; + + struct discover_format_state { + discover_format_state(); + + void update_for_element(const element& elem); + + void finalize(); + + data_format_state_t dfs_prefix_state; + data_format_state_t dfs_semi_state; + data_format_state_t dfs_comma_state; + int dfs_hist[DT_TERMINAL_MAX]; + + data_format dfs_format; + }; + + data_parser(data_scanner* ds); + + void pairup(schema_id_t* schema, + element_list_t& pairs_out, + element_list_t& in_list, + int group_depth = 0); + + void discover_format(); + + void end_of_value(element_list_t& el_stack, + element_list_t& key_comps, + element_list_t& value, + const element_list_t& in_list, + int group_depth); + + void parse(); + + std::string get_element_string(const element& elem) const; + + std::string get_string_up_to_value(const element& elem); + + const char* get_element_string(const element& elem, size_t& len_out); + + void print(FILE* out, element_list_t& el); + + std::vector<data_token_t> dp_group_token; + std::list<element_list_t> dp_group_stack; + + element_list_t dp_errors; + + element_list_t dp_pairs; + schema_id_t dp_schema_id; + std::string* dp_msg_format; + int dp_msg_format_begin; + +private: + data_scanner* dp_scanner; +}; + +#endif |