Adding upstream version 16.2.11+ds.upstream/16.2.11+ds upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
commit: 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree: 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/s3select/include/s3select_csv_parser.h
parent: Initial commit. (diff)
download: ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz
ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip
1 files changed, 407 insertions, 0 deletions
diff --git a/src/s3select/include/s3select_csv_parser.h b/src/s3select/include/s3select_csv_parser.h
new file mode 100644
index 000000000..5527da913
--- /dev/null
+++ b/src/s3select/include/s3select_csv_parser.h
@@ -0,0 +1,407 @@
+#include <iostream>
+
+#include <boost/mpl/vector/vector30.hpp>
+// back-end
+#include <boost/msm/back/state_machine.hpp>
+//front-end
+#include <boost/msm/front/state_machine_def.hpp>
+
+#include <vector>
+
+namespace msm = boost::msm;
+namespace mpl = boost::mpl;
+
+namespace s3selectEngine
+{
+// events
+struct event_column_sep {};
+struct event_eol {};
+struct event_end_of_stream {};
+struct event_not_column_sep {};//i.e any char
+struct event_quote {};
+struct event_escape {};
+struct event_empty {};
+
+
+// front-end: define the FSM structure
+struct csvStateMch_ : public msm::front::state_machine_def<csvStateMch_>
+{
+  char* input_stream;
+  std::vector<char*>* tokens;
+  std::vector<int> has_esc{128};
+  size_t token_idx;
+  size_t escape_idx;
+  char* input_cur_location;
+  char* start_token;
+  bool end_of_parse;
+
+  typedef csvStateMch_ csv_rules;
+
+  csvStateMch_():end_of_parse(false) {}
+
+  void set(const char* input, std::vector<char*>* tk)
+  {
+    input_cur_location = input_stream = const_cast<char*>(input);
+    token_idx = 0;
+    tokens = tk;
+    escape_idx = 0;
+  }
+
+  char get_char()
+  {
+    return *input_cur_location;
+  }
+
+  char get_next_char(const char * end_stream)
+  {
+    if (input_cur_location >= end_stream)
+      return 0;
+
+    input_cur_location++;
+    return *input_cur_location;
+  }
+
+  const char* currentLoc()
+  {
+    return input_cur_location;
+  }
+
+  void parse_escape(char* in, char esc_char='\\')
+  {
+    //assumption atleast one escape and single
+    char* dst, *src;
+
+    dst = src = in;
+
+    while (1)
+    {
+      while (*src && *src != esc_char)
+      {
+        src++;  //search for escape
+      }
+
+      if (!*src) //reach end
+      {
+        char* p = src;
+        while (dst < src)
+        {
+          *dst++ = *p++;  //full copy
+        }
+        return;
+      }
+      //found escape
+      dst = src; //override escape
+      //if(*(dst+1)=='n') {*dst=10;dst++;} //enables special character
+
+      while (*dst)
+      {
+        *dst = *(dst + 1);
+        dst++;
+      } //copy with shift
+    }
+  }
+
+  // The list of FSM states
+  struct Start_new_token_st : public msm::front::state<>
+  {};//0
+
+  struct In_new_token_st : public msm::front::state<>
+  {};//1
+
+  struct In_quote_st : public msm::front::state<>
+  {};//2
+
+  struct In_esc_in_token_st : public msm::front::state<>
+  {};//3
+
+  struct In_esc_quote_st : public msm::front::state<>
+  {};//4
+
+  struct In_esc_start_token_st : public msm::front::state<>
+  {};//5
+
+  struct End_of_line_st : public msm::front::state<>
+  {};//6
+
+  struct Empty_state : public msm::front::state<>
+  {};//7
+
+
+  // the initial state of the csvStateMch SM. Must be defined
+  typedef Start_new_token_st initial_state;
+
+  void start_new_token()//helper
+  {
+    start_token = input_cur_location;
+    (*tokens)[ token_idx ] = start_token;
+    token_idx++;
+  }
+
+  // transition actions
+  void start_new_token(event_column_sep const&)
+  {
+    *input_cur_location = 0;//remove column-delimiter
+    start_new_token();
+  }
+
+  void start_new_token(event_not_column_sep const&)
+  {
+    start_new_token();
+  }
+
+  //need to handle empty lines(no tokens);
+  void start_new_token(event_eol const&)
+  {
+    if(!token_idx)
+    {
+      return;
+    }
+    (*tokens)[ token_idx ] = start_token;
+    token_idx++;
+  }
+
+  void start_new_token(event_end_of_stream const&) {}
+
+  void in_new_token(event_not_column_sep const&)
+  {
+    if(!*start_token)
+    {
+      start_token = input_cur_location;
+    }
+  }
+
+  void in_new_token(event_eol const&)
+  {
+    *input_cur_location=0;
+  }
+
+  void in_new_token(event_end_of_stream const&) {}
+
+  void in_new_token(event_column_sep const&)
+  {
+    (*tokens)[ token_idx ] = input_cur_location+1;
+    *input_cur_location=0;
+    token_idx++;
+  }
+
+  void in_new_token(event_quote const&)
+  {
+    if(!*start_token)
+    {
+      start_token = input_cur_location;
+    }
+  }
+
+  void in_quote(event_quote const&) {}
+
+  void in_quote(event_column_sep const&) {}
+
+  void in_quote(event_not_column_sep const&) {}
+
+  void in_quote(event_eol const&)
+  {
+    *input_cur_location=0;
+  }
+
+  void in_quote(event_end_of_stream const&)
+  {
+    *input_cur_location=0;
+  }
+
+  void start_new_token(event_quote const&)
+  {
+    start_new_token();
+  }
+
+  void push_escape_pos()
+  {
+    if(escape_idx && has_esc[ escape_idx -1]== (int)(token_idx-1))
+    {
+      return;
+    }
+    has_esc[ escape_idx ] = token_idx-1;
+    escape_idx++;
+  }
+  void in_escape(event_escape const&)
+  {
+    push_escape_pos();
+  }
+  void in_escape_start_new_token(event_escape const&)
+  {
+    start_new_token();
+    push_escape_pos();
+  }
+
+  void in_escape(event_column_sep const&) {}
+  void in_escape(event_not_column_sep const&) {}
+  void in_escape(event_quote const&) {}
+  void in_escape(event_eol const&) {}
+  void in_escape(event_end_of_stream const&) {}
+
+  void empty_action(event_empty const&) {}
+
+  //TODO need a guard for tokens vector size (<MAX)
+  // Transition table for csvStateMch
+  struct transition_table : mpl::vector30<
+  //           Start     		Event         		      Next      Action		 Guard
+  //  +---------+-------------+---------+---------------------+----------------------+
+    a_row < Start_new_token_st, event_column_sep 	, Start_new_token_st 	, &csv_rules::start_new_token      >,
+    a_row < Start_new_token_st, event_not_column_sep 	, In_new_token_st 	, &csv_rules::start_new_token      >,
+    a_row < Start_new_token_st, event_eol 	, End_of_line_st 	, &csv_rules::start_new_token      >,
+    a_row < Start_new_token_st, event_end_of_stream 	, End_of_line_st 	, &csv_rules::start_new_token      >,
+    a_row < In_new_token_st 	, event_not_column_sep, In_new_token_st, &csv_rules::in_new_token    >,
+    a_row < In_new_token_st 	, event_column_sep  	, In_new_token_st, &csv_rules::in_new_token    >,
+    a_row < In_new_token_st 	, event_eol  		, End_of_line_st, &csv_rules::in_new_token    >,
+    a_row < In_new_token_st 	, event_end_of_stream  	, End_of_line_st, &csv_rules::in_new_token    >,
+
+    a_row < Start_new_token_st 	, event_quote  	, In_quote_st, &csv_rules::start_new_token    >,   //open quote
+    a_row < In_new_token_st 	, event_quote  	, In_quote_st, &csv_rules::in_quote    >,   //open quote
+    a_row < In_quote_st 	, event_quote  	, In_new_token_st, &csv_rules::in_quote    >,   //close quote
+    a_row < In_quote_st 	, event_column_sep  	, In_quote_st, &csv_rules::in_quote    >,   //stay in quote
+    a_row < In_quote_st 	, event_not_column_sep  	, In_quote_st, &csv_rules::in_quote    >,   //stay in quote
+    a_row < In_quote_st 	, event_eol  	, End_of_line_st, &csv_rules::in_quote    >,   //end of quote/line
+    a_row < In_quote_st 	, event_end_of_stream  	, End_of_line_st, &csv_rules::in_quote    >,   //end of quote/line
+
+
+  //TODO add transitions for escape just before eol , eos.
+    a_row < Start_new_token_st 	, event_escape  	, In_esc_start_token_st, &csv_rules::in_escape_start_new_token    >,
+    a_row < In_esc_start_token_st, event_column_sep, In_new_token_st, &csv_rules::in_escape    >,      //escape column-sep
+    a_row < In_esc_start_token_st, event_not_column_sep, In_new_token_st, &csv_rules::in_escape    >,
+    a_row < In_esc_start_token_st, event_escape, In_new_token_st, &csv_rules::in_escape    >,
+    a_row < In_esc_start_token_st, event_quote, In_new_token_st, &csv_rules::in_escape    >,
+
+    a_row < In_new_token_st, event_escape, In_esc_in_token_st, &csv_rules::in_escape    >,
+    a_row < In_esc_in_token_st, event_column_sep, In_new_token_st, &csv_rules::in_escape    >,
+    a_row < In_esc_in_token_st, event_not_column_sep, In_new_token_st, &csv_rules::in_escape    >,
+    a_row < In_esc_in_token_st, event_escape, In_new_token_st, &csv_rules::in_escape    >,
+    a_row < In_esc_in_token_st, event_quote, In_new_token_st, &csv_rules::in_escape    >,
+
+    a_row < In_quote_st, event_escape, In_esc_quote_st, &csv_rules::in_escape    >,
+    a_row < In_esc_quote_st, event_column_sep, In_quote_st, &csv_rules::in_escape    >,
+    a_row < In_esc_quote_st, event_not_column_sep, In_quote_st, &csv_rules::in_escape    >,
+    a_row < In_esc_quote_st, event_escape, In_quote_st, &csv_rules::in_escape    >,
+    a_row < In_esc_quote_st, event_quote, In_quote_st, &csv_rules::in_escape    >
+
+  //  +---------+-------------+---------+---------------------+----------------------+
+    > {};
+
+  // Replaces the default no-transition response.
+  template <class FSM, class Event>
+  void no_transition(Event const& e, FSM&, int state)
+  {
+    std::cout << "no transition from state " << state
+              << " on event " << typeid(e).name() << std::endl;
+  }
+}; //// end-of-state-machine
+
+
+
+// Pick a back-end
+typedef msm::back::state_machine<csvStateMch_> csvStateMch;
+
+//
+// Testing utilities.
+//
+
+static char const* const state_names[] = {"Start_new_token_st", "In_new_token_st", "In_quote_st", "In_esc_in_token_st",
+                                          "In_esc_quote_st", "In_esc_start_token_st", "End_of_line_st", "Empty_state"
+                                         };
+void pstate(csvStateMch const& p)//debug
+{
+  std::cout << " -> " << state_names[p.current_state()[0]] << std::endl;
+}
+
+
+class csvParser
+{
+
+  csvStateMch p;
+
+  char m_row_delimeter;
+  char m_column_delimiter;
+  char m_quote_char;
+  char m_escape_char;
+
+public:
+
+  csvParser(char rd='\n', char cd=',', char quot='"', char ec='\\'):m_row_delimeter(rd), m_column_delimiter(cd), m_quote_char(quot), m_escape_char(ec) {};
+
+  void set(char row_delimiter, char column_delimiter, char quot_char, char escape_char)
+  {
+    m_row_delimeter = row_delimiter;
+    m_column_delimiter = column_delimiter;
+    m_quote_char = quot_char;
+    m_escape_char = escape_char;
+  }
+
+  int parse(char* input_stream, char* end_stream, std::vector<char*>* tk, size_t* num_of_tokens)
+  {
+    p.set(input_stream, tk);
+
+    // needed to start the highest-level SM. This will call on_entry and mark the start of the SM
+    p.start();
+
+    //TODO for better performance to use template specialization (\n  \ , ")
+    do
+    {
+      if (p.get_char() == m_row_delimeter)
+      {
+        p.process_event(event_eol());
+      }
+      else if (p.get_char() == m_column_delimiter)
+      {
+        p.process_event(event_column_sep());
+      }
+      else if (p.get_char() == 0)
+      {
+        p.process_event(event_end_of_stream());
+      }
+      else if (p.get_char() == m_quote_char)
+      {
+        p.process_event(event_quote());
+      }
+      else if (p.get_char() == m_escape_char)
+      {
+        p.process_event(event_escape());
+      }
+      else
+      {
+        p.process_event(event_not_column_sep());
+      }
+
+      if (p.tokens->capacity() <= p.token_idx)
+      {
+        return -1;
+      }
+
+      if (p.currentLoc() >= end_stream)
+      {
+        break;
+      }
+      p.get_next_char(end_stream);
+    }
+    while (p.current_state()[0] != 6);
+
+    p.stop();
+
+    *num_of_tokens = p.token_idx;
+
+    //second pass for escape rules; only token with escape are processed, if any.
+    for(size_t i=0; i<p.escape_idx; i++)
+    {
+      p.parse_escape((*tk)[p.has_esc[i]], m_escape_char);
+    }
+
+    return 0;
+  }
+
+  const char* currentLoc()
+  {
+    return p.currentLoc();
+  }
+
+};//end csv-parser
+
+}//end-namespace
+
+
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
commit	19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree	42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/s3select/include/s3select_csv_parser.h
parent	Initial commit. (diff)
download	ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip