summaryrefslogtreecommitdiffstats
path: root/include/orcus/csv_parser.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'include/orcus/csv_parser.hpp')
-rw-r--r--include/orcus/csv_parser.hpp306
1 files changed, 306 insertions, 0 deletions
diff --git a/include/orcus/csv_parser.hpp b/include/orcus/csv_parser.hpp
new file mode 100644
index 0000000..5cb9598
--- /dev/null
+++ b/include/orcus/csv_parser.hpp
@@ -0,0 +1,306 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#ifndef ORCUS_CSV_PARSER_HPP
+#define ORCUS_CSV_PARSER_HPP
+
+#include "csv_parser_base.hpp"
+
+namespace orcus {
+
+class csv_handler
+{
+public:
+ /**
+ * Called when the parser starts parsing a stream.
+ */
+ void begin_parse() {}
+
+ /**
+ * Called when the parser finishes parsing a stream.
+ */
+ void end_parse() {}
+
+ /**
+ * Called at the beginning of every row.
+ */
+ void begin_row() {}
+
+ /**
+ * Called at the end of every row.
+ */
+ void end_row() {}
+
+ /**
+ * Called after every cell is parsed.
+ *
+ * @param value cell content.
+ * @param transient when true, the text content has been converted and is
+ * stored in a temporary buffer. In such case, there is
+ * no guarantee that the text content remain available
+ * after the end of the call. When this value is false,
+ * the text content is guaranteed to be valid so long as
+ * the original CSV stream content is valid.
+ */
+ void cell(std::string_view value, bool transient)
+ {
+ (void)value; (void)transient;
+ }
+};
+
+/**
+ * Parser for CSV documents.
+ *
+ * @tparam HandlerT Hanlder type with member functions for event callbacks.
+ * Refer to csv_handler.
+ */
+template<typename HandlerT>
+class csv_parser : public csv::parser_base
+{
+public:
+ typedef HandlerT handler_type;
+
+ csv_parser(std::string_view content, handler_type& hdl, const csv::parser_config& config);
+ void parse();
+
+private:
+
+ // handlers
+ void row();
+ void cell();
+ void quoted_cell();
+
+ void parse_cell_with_quote(const char* p0, size_t len0);
+
+ /**
+ * Push cell value to the handler.
+ */
+ void push_cell_value(const char* p, size_t n);
+
+private:
+ handler_type& m_handler;
+};
+
+template<typename _Handler>
+csv_parser<_Handler>::csv_parser(
+ std::string_view content, handler_type& hdl, const csv::parser_config& config) :
+ csv::parser_base(content, config), m_handler(hdl) {}
+
+template<typename _Handler>
+void csv_parser<_Handler>::parse()
+{
+#if ORCUS_DEBUG_CSV
+ for (const char* p = mp_begin; p < mp_end; ++p)
+ std::cout << *p;
+ std::cout << std::endl;
+#endif
+
+ m_handler.begin_parse();
+ while (has_char())
+ row();
+ m_handler.end_parse();
+}
+
+template<typename _Handler>
+void csv_parser<_Handler>::row()
+{
+ m_handler.begin_row();
+ while (true)
+ {
+ if (is_text_qualifier(cur_char()))
+ quoted_cell();
+ else
+ cell();
+
+ if (!has_char())
+ {
+ m_handler.end_row();
+ return;
+ }
+
+ char c = cur_char();
+ if (c == '\n')
+ {
+ next();
+#if ORCUS_DEBUG_CSV
+ cout << "(LF)" << endl;
+#endif
+ m_handler.end_row();
+ return;
+ }
+
+ if (!is_delim(c))
+ throw orcus::parse_error("expected a delimiter", offset());
+
+ next();
+
+ if (m_config.trim_cell_value)
+ skip_blanks();
+
+ if (!has_char())
+ {
+ m_handler.end_row();
+ return;
+ }
+ }
+}
+
+template<typename _Handler>
+void csv_parser<_Handler>::cell()
+{
+ const char* p = mp_char;
+ size_t len = 0;
+ char c = cur_char();
+ while (c != '\n' && !is_delim(c))
+ {
+ ++len;
+ next();
+ if (!has_char())
+ break;
+ c = cur_char();
+ }
+
+ if (!len)
+ p = nullptr;
+
+ push_cell_value(p, len);
+}
+
+template<typename _Handler>
+void csv_parser<_Handler>::quoted_cell()
+{
+#if ORCUS_DEBUG_CSV
+ cout << "--- quoted cell" << endl;
+#endif
+ char c = cur_char();
+ assert(is_text_qualifier(c));
+ next(); // Skip the opening quote.
+ if (!has_char())
+ return;
+
+ const char* p0 = mp_char;
+ size_t len = 1;
+ for (; has_char(); next(), ++len)
+ {
+ c = cur_char();
+#if ORCUS_DEBUG_CSV
+ cout << "'" << c << "'" << endl;
+#endif
+ if (!is_text_qualifier(c))
+ continue;
+
+ // current char is a quote. Check if the next char is also a text
+ // qualifier.
+
+ if (has_next() && is_text_qualifier(peek_char()))
+ {
+ next();
+ parse_cell_with_quote(p0, len);
+ return;
+ }
+
+ // Closing quote.
+ m_handler.cell({p0, len-1}, false);
+ next();
+ skip_blanks();
+ return;
+ }
+
+ // Stream ended prematurely. Handle it gracefully.
+ m_handler.cell({p0, len}, false);
+}
+
+template<typename _Handler>
+void csv_parser<_Handler>::parse_cell_with_quote(const char* p0, size_t len0)
+{
+#if ORCUS_DEBUG_CSV
+ using namespace std;
+ cout << "--- parse cell with quote" << endl;
+#endif
+ assert(is_text_qualifier(cur_char()));
+
+ // Push the preceding chars to the temp buffer.
+ m_cell_buf.reset();
+ m_cell_buf.append(p0, len0);
+
+ // Parse the rest, until the closing quote.
+ next();
+ const char* p_cur = mp_char;
+ size_t cur_len = 0;
+ for (; has_char(); next(), ++cur_len)
+ {
+ char c = cur_char();
+#if ORCUS_DEBUG_CSV
+ cout << "'" << c << "'" << endl;
+#endif
+ if (!is_text_qualifier(c))
+ continue;
+
+ if (has_next() && is_text_qualifier(peek_char()))
+ {
+ // double quotation. Copy the current segment to the cell buffer.
+ m_cell_buf.append(p_cur, cur_len);
+
+ next(); // to the 2nd quote.
+ p_cur = mp_char;
+ cur_len = 0;
+ continue;
+ }
+
+ // closing quote. Flush the current segment to the cell
+ // buffer, push the value to the handler, and exit normally.
+ m_cell_buf.append(p_cur, cur_len);
+
+ m_handler.cell(m_cell_buf.str(), true);
+ next();
+ skip_blanks();
+ return;
+ }
+
+ // Stream ended prematurely.
+ throw parse_error("stream ended prematurely while parsing quoted cell.", offset());
+}
+
+template<typename _Handler>
+void csv_parser<_Handler>::push_cell_value(const char* p, size_t n)
+{
+ size_t len = n;
+
+ if (m_config.trim_cell_value)
+ {
+ // Trim any leading blanks.
+ for (size_t i = 0; i < n; ++i, --len, ++p)
+ {
+ if (!is_blank(*p))
+ break;
+ }
+
+ // Trim any trailing blanks.
+ if (len)
+ {
+ const char* p_end = p + (len-1);
+ for (; p != p_end; --p_end, --len)
+ {
+ if (!is_blank(*p_end))
+ break;
+ }
+ }
+ }
+
+ m_handler.cell({p, len}, false);
+#if ORCUS_DEBUG_CSV
+ if (len)
+ cout << "(cell:'" << std::string(p, len) << "')" << endl;
+ else
+ cout << "(cell:'')" << endl;
+#endif
+}
+
+}
+
+#endif
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */