summaryrefslogtreecommitdiffstats
path: root/src/s3select/include/s3select_csv_parser.h
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
commit19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/s3select/include/s3select_csv_parser.h
parentInitial commit. (diff)
downloadceph-upstream.tar.xz
ceph-upstream.zip
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--src/s3select/include/s3select_csv_parser.h407
1 files changed, 407 insertions, 0 deletions
diff --git a/src/s3select/include/s3select_csv_parser.h b/src/s3select/include/s3select_csv_parser.h
new file mode 100644
index 000000000..5527da913
--- /dev/null
+++ b/src/s3select/include/s3select_csv_parser.h
@@ -0,0 +1,407 @@
+#include <iostream>
+
+#include <boost/mpl/vector/vector30.hpp>
+// back-end
+#include <boost/msm/back/state_machine.hpp>
+//front-end
+#include <boost/msm/front/state_machine_def.hpp>
+
+#include <vector>
+
+namespace msm = boost::msm;
+namespace mpl = boost::mpl;
+
+namespace s3selectEngine
+{
+// events
+struct event_column_sep {};
+struct event_eol {};
+struct event_end_of_stream {};
+struct event_not_column_sep {};//i.e any char
+struct event_quote {};
+struct event_escape {};
+struct event_empty {};
+
+
+// front-end: define the FSM structure
+struct csvStateMch_ : public msm::front::state_machine_def<csvStateMch_>
+{
+ char* input_stream;
+ std::vector<char*>* tokens;
+ std::vector<int> has_esc{128};
+ size_t token_idx;
+ size_t escape_idx;
+ char* input_cur_location;
+ char* start_token;
+ bool end_of_parse;
+
+ typedef csvStateMch_ csv_rules;
+
+ csvStateMch_():end_of_parse(false) {}
+
+ void set(const char* input, std::vector<char*>* tk)
+ {
+ input_cur_location = input_stream = const_cast<char*>(input);
+ token_idx = 0;
+ tokens = tk;
+ escape_idx = 0;
+ }
+
+ char get_char()
+ {
+ return *input_cur_location;
+ }
+
+ char get_next_char(const char * end_stream)
+ {
+ if (input_cur_location >= end_stream)
+ return 0;
+
+ input_cur_location++;
+ return *input_cur_location;
+ }
+
+ const char* currentLoc()
+ {
+ return input_cur_location;
+ }
+
+ void parse_escape(char* in, char esc_char='\\')
+ {
+ //assumption atleast one escape and single
+ char* dst, *src;
+
+ dst = src = in;
+
+ while (1)
+ {
+ while (*src && *src != esc_char)
+ {
+ src++; //search for escape
+ }
+
+ if (!*src) //reach end
+ {
+ char* p = src;
+ while (dst < src)
+ {
+ *dst++ = *p++; //full copy
+ }
+ return;
+ }
+ //found escape
+ dst = src; //override escape
+ //if(*(dst+1)=='n') {*dst=10;dst++;} //enables special character
+
+ while (*dst)
+ {
+ *dst = *(dst + 1);
+ dst++;
+ } //copy with shift
+ }
+ }
+
+ // The list of FSM states
+ struct Start_new_token_st : public msm::front::state<>
+ {};//0
+
+ struct In_new_token_st : public msm::front::state<>
+ {};//1
+
+ struct In_quote_st : public msm::front::state<>
+ {};//2
+
+ struct In_esc_in_token_st : public msm::front::state<>
+ {};//3
+
+ struct In_esc_quote_st : public msm::front::state<>
+ {};//4
+
+ struct In_esc_start_token_st : public msm::front::state<>
+ {};//5
+
+ struct End_of_line_st : public msm::front::state<>
+ {};//6
+
+ struct Empty_state : public msm::front::state<>
+ {};//7
+
+
+ // the initial state of the csvStateMch SM. Must be defined
+ typedef Start_new_token_st initial_state;
+
+ void start_new_token()//helper
+ {
+ start_token = input_cur_location;
+ (*tokens)[ token_idx ] = start_token;
+ token_idx++;
+ }
+
+ // transition actions
+ void start_new_token(event_column_sep const&)
+ {
+ *input_cur_location = 0;//remove column-delimiter
+ start_new_token();
+ }
+
+ void start_new_token(event_not_column_sep const&)
+ {
+ start_new_token();
+ }
+
+ //need to handle empty lines(no tokens);
+ void start_new_token(event_eol const&)
+ {
+ if(!token_idx)
+ {
+ return;
+ }
+ (*tokens)[ token_idx ] = start_token;
+ token_idx++;
+ }
+
+ void start_new_token(event_end_of_stream const&) {}
+
+ void in_new_token(event_not_column_sep const&)
+ {
+ if(!*start_token)
+ {
+ start_token = input_cur_location;
+ }
+ }
+
+ void in_new_token(event_eol const&)
+ {
+ *input_cur_location=0;
+ }
+
+ void in_new_token(event_end_of_stream const&) {}
+
+ void in_new_token(event_column_sep const&)
+ {
+ (*tokens)[ token_idx ] = input_cur_location+1;
+ *input_cur_location=0;
+ token_idx++;
+ }
+
+ void in_new_token(event_quote const&)
+ {
+ if(!*start_token)
+ {
+ start_token = input_cur_location;
+ }
+ }
+
+ void in_quote(event_quote const&) {}
+
+ void in_quote(event_column_sep const&) {}
+
+ void in_quote(event_not_column_sep const&) {}
+
+ void in_quote(event_eol const&)
+ {
+ *input_cur_location=0;
+ }
+
+ void in_quote(event_end_of_stream const&)
+ {
+ *input_cur_location=0;
+ }
+
+ void start_new_token(event_quote const&)
+ {
+ start_new_token();
+ }
+
+ void push_escape_pos()
+ {
+ if(escape_idx && has_esc[ escape_idx -1]== (int)(token_idx-1))
+ {
+ return;
+ }
+ has_esc[ escape_idx ] = token_idx-1;
+ escape_idx++;
+ }
+ void in_escape(event_escape const&)
+ {
+ push_escape_pos();
+ }
+ void in_escape_start_new_token(event_escape const&)
+ {
+ start_new_token();
+ push_escape_pos();
+ }
+
+ void in_escape(event_column_sep const&) {}
+ void in_escape(event_not_column_sep const&) {}
+ void in_escape(event_quote const&) {}
+ void in_escape(event_eol const&) {}
+ void in_escape(event_end_of_stream const&) {}
+
+ void empty_action(event_empty const&) {}
+
+ //TODO need a guard for tokens vector size (<MAX)
+ // Transition table for csvStateMch
+ struct transition_table : mpl::vector30<
+ // Start Event Next Action Guard
+ // +---------+-------------+---------+---------------------+----------------------+
+ a_row < Start_new_token_st, event_column_sep , Start_new_token_st , &csv_rules::start_new_token >,
+ a_row < Start_new_token_st, event_not_column_sep , In_new_token_st , &csv_rules::start_new_token >,
+ a_row < Start_new_token_st, event_eol , End_of_line_st , &csv_rules::start_new_token >,
+ a_row < Start_new_token_st, event_end_of_stream , End_of_line_st , &csv_rules::start_new_token >,
+ a_row < In_new_token_st , event_not_column_sep, In_new_token_st, &csv_rules::in_new_token >,
+ a_row < In_new_token_st , event_column_sep , In_new_token_st, &csv_rules::in_new_token >,
+ a_row < In_new_token_st , event_eol , End_of_line_st, &csv_rules::in_new_token >,
+ a_row < In_new_token_st , event_end_of_stream , End_of_line_st, &csv_rules::in_new_token >,
+
+ a_row < Start_new_token_st , event_quote , In_quote_st, &csv_rules::start_new_token >, //open quote
+ a_row < In_new_token_st , event_quote , In_quote_st, &csv_rules::in_quote >, //open quote
+ a_row < In_quote_st , event_quote , In_new_token_st, &csv_rules::in_quote >, //close quote
+ a_row < In_quote_st , event_column_sep , In_quote_st, &csv_rules::in_quote >, //stay in quote
+ a_row < In_quote_st , event_not_column_sep , In_quote_st, &csv_rules::in_quote >, //stay in quote
+ a_row < In_quote_st , event_eol , End_of_line_st, &csv_rules::in_quote >, //end of quote/line
+ a_row < In_quote_st , event_end_of_stream , End_of_line_st, &csv_rules::in_quote >, //end of quote/line
+
+
+ //TODO add transitions for escape just before eol , eos.
+ a_row < Start_new_token_st , event_escape , In_esc_start_token_st, &csv_rules::in_escape_start_new_token >,
+ a_row < In_esc_start_token_st, event_column_sep, In_new_token_st, &csv_rules::in_escape >, //escape column-sep
+ a_row < In_esc_start_token_st, event_not_column_sep, In_new_token_st, &csv_rules::in_escape >,
+ a_row < In_esc_start_token_st, event_escape, In_new_token_st, &csv_rules::in_escape >,
+ a_row < In_esc_start_token_st, event_quote, In_new_token_st, &csv_rules::in_escape >,
+
+ a_row < In_new_token_st, event_escape, In_esc_in_token_st, &csv_rules::in_escape >,
+ a_row < In_esc_in_token_st, event_column_sep, In_new_token_st, &csv_rules::in_escape >,
+ a_row < In_esc_in_token_st, event_not_column_sep, In_new_token_st, &csv_rules::in_escape >,
+ a_row < In_esc_in_token_st, event_escape, In_new_token_st, &csv_rules::in_escape >,
+ a_row < In_esc_in_token_st, event_quote, In_new_token_st, &csv_rules::in_escape >,
+
+ a_row < In_quote_st, event_escape, In_esc_quote_st, &csv_rules::in_escape >,
+ a_row < In_esc_quote_st, event_column_sep, In_quote_st, &csv_rules::in_escape >,
+ a_row < In_esc_quote_st, event_not_column_sep, In_quote_st, &csv_rules::in_escape >,
+ a_row < In_esc_quote_st, event_escape, In_quote_st, &csv_rules::in_escape >,
+ a_row < In_esc_quote_st, event_quote, In_quote_st, &csv_rules::in_escape >
+
+ // +---------+-------------+---------+---------------------+----------------------+
+ > {};
+
+ // Replaces the default no-transition response.
+ template <class FSM, class Event>
+ void no_transition(Event const& e, FSM&, int state)
+ {
+ std::cout << "no transition from state " << state
+ << " on event " << typeid(e).name() << std::endl;
+ }
+}; //// end-of-state-machine
+
+
+
+// Pick a back-end
+typedef msm::back::state_machine<csvStateMch_> csvStateMch;
+
+//
+// Testing utilities.
+//
+
+static char const* const state_names[] = {"Start_new_token_st", "In_new_token_st", "In_quote_st", "In_esc_in_token_st",
+ "In_esc_quote_st", "In_esc_start_token_st", "End_of_line_st", "Empty_state"
+ };
+void pstate(csvStateMch const& p)//debug
+{
+ std::cout << " -> " << state_names[p.current_state()[0]] << std::endl;
+}
+
+
+class csvParser
+{
+
+ csvStateMch p;
+
+ char m_row_delimeter;
+ char m_column_delimiter;
+ char m_quote_char;
+ char m_escape_char;
+
+public:
+
+ csvParser(char rd='\n', char cd=',', char quot='"', char ec='\\'):m_row_delimeter(rd), m_column_delimiter(cd), m_quote_char(quot), m_escape_char(ec) {};
+
+ void set(char row_delimiter, char column_delimiter, char quot_char, char escape_char)
+ {
+ m_row_delimeter = row_delimiter;
+ m_column_delimiter = column_delimiter;
+ m_quote_char = quot_char;
+ m_escape_char = escape_char;
+ }
+
+ int parse(char* input_stream, char* end_stream, std::vector<char*>* tk, size_t* num_of_tokens)
+ {
+ p.set(input_stream, tk);
+
+ // needed to start the highest-level SM. This will call on_entry and mark the start of the SM
+ p.start();
+
+ //TODO for better performance to use template specialization (\n \ , ")
+ do
+ {
+ if (p.get_char() == m_row_delimeter)
+ {
+ p.process_event(event_eol());
+ }
+ else if (p.get_char() == m_column_delimiter)
+ {
+ p.process_event(event_column_sep());
+ }
+ else if (p.get_char() == 0)
+ {
+ p.process_event(event_end_of_stream());
+ }
+ else if (p.get_char() == m_quote_char)
+ {
+ p.process_event(event_quote());
+ }
+ else if (p.get_char() == m_escape_char)
+ {
+ p.process_event(event_escape());
+ }
+ else
+ {
+ p.process_event(event_not_column_sep());
+ }
+
+ if (p.tokens->capacity() <= p.token_idx)
+ {
+ return -1;
+ }
+
+ if (p.currentLoc() >= end_stream)
+ {
+ break;
+ }
+ p.get_next_char(end_stream);
+ }
+ while (p.current_state()[0] != 6);
+
+ p.stop();
+
+ *num_of_tokens = p.token_idx;
+
+ //second pass for escape rules; only token with escape are processed, if any.
+ for(size_t i=0; i<p.escape_idx; i++)
+ {
+ p.parse_escape((*tk)[p.has_esc[i]], m_escape_char);
+ }
+
+ return 0;
+ }
+
+ const char* currentLoc()
+ {
+ return p.currentLoc();
+ }
+
+};//end csv-parser
+
+}//end-namespace
+
+