summaryrefslogtreecommitdiffstats
path: root/src/s3select/include/s3select_csv_parser.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/s3select/include/s3select_csv_parser.h')
-rw-r--r--src/s3select/include/s3select_csv_parser.h418
1 files changed, 418 insertions, 0 deletions
diff --git a/src/s3select/include/s3select_csv_parser.h b/src/s3select/include/s3select_csv_parser.h
new file mode 100644
index 000000000..dab2e4efa
--- /dev/null
+++ b/src/s3select/include/s3select_csv_parser.h
@@ -0,0 +1,418 @@
+#include "csvparser/csv.h"
+
+namespace io{
+
+ namespace error{
+ struct escaped_char_missing :
+ base,
+ with_file_name,
+ with_file_line{
+ void format_error_message()const override{
+ std::snprintf(error_message_buffer, sizeof(error_message_buffer),
+ "Escaped character missing in line %d in file \"%s\"."
+ , file_line, file_name);
+ }
+ };
+ }
+
+ namespace detail{
+ static void unescape(char*&col_begin, char*&col_end, char& quote, char& escape_char)
+ {
+ if(col_end - col_begin >= 2)
+ {
+ while(*col_begin == quote && *(col_begin + 1) == quote)
+ {
+ ++col_begin;
+ ++col_begin;
+ }
+ char*out = col_begin;
+ char* in = col_begin;
+ bool init = true;
+
+ while(in != col_end)
+ {
+ if(*in != quote && *in != escape_char)
+ {
+ if(init)
+ {
+ init = false;
+ }
+ else
+ {
+ *out = *in;
+ }
+ ++in;
+ ++out;
+ }
+ else
+ {
+ if(*in == escape_char)
+ {
+ ++in;
+ if(init)
+ {
+ ++col_begin;
+ ++out;
+ init = false;
+ }
+ else
+ {
+ *out = *in;
+ }
+ ++in;
+ ++out;
+ }
+ else
+ {
+ ++in;
+ while(*in != quote)
+ {
+ if(init)
+ {
+ ++col_begin;
+ ++out;
+ init = false;
+ }
+ else
+ {
+ *out = *in;
+ }
+ ++in;
+ ++out;
+ }
+ ++in;
+ }
+ }
+ }
+ *out = '\0';
+ col_end = out;
+ }
+ }
+
+ static void trim(char*&str_begin, char*&str_end, std::vector<char>& trim_chars)
+ {
+ while(str_begin != str_end && std::find(trim_chars.begin(), trim_chars.end(), *str_begin) != trim_chars.end())
+ ++str_begin;
+ while(str_begin != str_end && std::find(trim_chars.begin(), trim_chars.end(), *(str_end-1)) != trim_chars.end())
+ --str_end;
+ *str_end = '\0';
+ }
+
+ static const char*find_next_column_end(const char*col_begin, char& sep, char& quote, char& escape_char)
+ {
+ while(*col_begin != sep && *col_begin != '\0')
+ {
+ if(*col_begin != quote && *col_begin != escape_char)
+ ++col_begin;
+ else
+ {
+ if(*col_begin == escape_char)
+ {
+ if(*(col_begin+1) == '\0')
+ throw error::escaped_char_missing();
+ col_begin += 2;
+ }
+ else
+ {
+ do
+ {
+ ++col_begin;
+ while(*col_begin != quote)
+ {
+ if(*col_begin == '\0')
+ throw error::escaped_string_not_closed();
+ ++col_begin;
+ }
+ ++col_begin;
+ }while(*col_begin == quote);
+ }
+ }
+ }
+ return col_begin;
+ }
+
+ void chop_next_column(char*&line, char*&col_begin, char*&col_end, char& col_delimiter, char& quote, char& escape_char)
+ {
+ assert(line != nullptr);
+
+ col_begin = line;
+ // the col_begin + (... - col_begin) removes the constness
+ col_end = col_begin + (find_next_column_end(col_begin, col_delimiter, quote, escape_char) - col_begin);
+
+ if(*col_end == '\0')
+ {
+ line = nullptr;
+ }
+ else
+ {
+ *col_end = '\0';
+ line = col_end + 1;
+ }
+ }
+
+ void parse_line(char*line, std::vector<char*>& sorted_col, char& col_delimiter, char& quote, char& escape_char, std::vector<char>& trim_chars)
+ {
+ while (line != nullptr)
+ {
+ char*col_begin, *col_end;
+ chop_next_column(line, col_begin, col_end, col_delimiter, quote, escape_char);
+ if (!trim_chars.empty())
+ trim(col_begin, col_end, trim_chars);
+ if (!(quote == '\0' && escape_char == '\0'))
+ unescape(col_begin, col_end, quote, escape_char);
+ sorted_col.push_back(col_begin);
+ }
+ }
+
+
+ bool empty_comment_line(char* line)
+ {
+ if(*line == '\0')
+ return true;
+ while(*line == ' ' || *line == '\t')
+ {
+ ++line;
+ if(*line == '\0')
+ return true;
+ }
+ return false;
+ }
+
+ bool single_line_comment(char start_char, std::vector<char>& comment_chars)
+ {
+ if(std::find(comment_chars.begin(), comment_chars.end(), start_char) != comment_chars.end())
+ return true;
+ else
+ return false;
+ }
+
+ bool is_comment(char*&line, bool& comment_empty_line, std::vector<char>& comment_chars)
+ {
+ if(!comment_empty_line && comment_chars.empty())
+ return false;
+ else if(comment_empty_line && comment_chars.empty())
+ return empty_comment_line(line);
+ else if(!comment_empty_line && !comment_chars.empty())
+ return single_line_comment(*line, comment_chars);
+ else
+ return empty_comment_line(line) || single_line_comment(*line, comment_chars);
+ }
+
+ }
+}
+
+
+class CSVParser
+{
+ private:
+ char row_delimiter;
+ char col_delimiter;
+ char quote;
+ char escape_char;
+ bool comment_empty_line;
+ std::vector<char> comment_characters;
+ std::vector<char> trim_characters;
+
+ static const int block_len = 1<<20;
+ std::unique_ptr<char[]>buffer; // must be constructed before (and thus destructed after) the reader!
+ #ifdef CSV_IO_NO_THREAD
+ io::detail::SynchronousReader reader;
+ #else
+ io::detail::AsynchronousReader reader;
+ #endif
+ int data_begin;
+ int data_end;
+
+ char file_name[io::error::max_file_name_length+1];
+ unsigned file_line;
+
+ void init(std::unique_ptr<io::ByteSourceBase>byte_source)
+ {
+ file_line = 0;
+
+ buffer = std::unique_ptr<char[]>(new char[3*block_len]);
+ data_begin = 0;
+ data_end = byte_source->read(buffer.get(), 2*block_len);
+
+ // Ignore UTF-8 BOM
+ if(data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF')
+ data_begin = 3;
+
+ if(data_end == 2*block_len){
+ reader.init(std::move(byte_source));
+ reader.start_read(buffer.get() + 2*block_len, block_len);
+ }
+ }
+
+ public:
+ CSVParser() = delete;
+ CSVParser(const CSVParser&) = delete;
+ CSVParser&operator=(const CSVParser&);
+
+ CSVParser(const char*file_name, const char*data_begin, const char*data_end)
+ {
+ set_file_name(file_name);
+ init(std::unique_ptr<io::ByteSourceBase>(new io::detail::NonOwningStringByteSource(data_begin, data_end-data_begin)));
+ }
+
+ CSVParser(const std::string&file_name, const char*data_begin, const char*data_end)
+ {
+ set_file_name(file_name.c_str());
+ init(std::unique_ptr<io::ByteSourceBase>(new io::detail::NonOwningStringByteSource(data_begin, data_end-data_begin)));
+ }
+
+ void set_file_name(const std::string&file_name)
+ {
+ set_file_name(file_name.c_str());
+ }
+
+ void set_file_name(const char*file_name)
+ {
+ if(file_name != nullptr)
+ {
+ strncpy(this->file_name, file_name, sizeof(this->file_name));
+ this->file_name[sizeof(this->file_name)-1] = '\0';
+ }
+ else
+ {
+ this->file_name[0] = '\0';
+ }
+ }
+
+ const char*get_truncated_file_name()const
+ {
+ return file_name;
+ }
+
+ void set_file_line(unsigned file_line)
+ {
+ this->file_line = file_line;
+ }
+
+ unsigned get_file_line()const
+ {
+ return file_line;
+ }
+
+ void set_csv_def(char& row_delimit, char& col_delimit, char& quote_char, char& escp_char, bool& cmnt_empty_line, std::vector<char>& comment_chars , std::vector<char>& trim_chars)
+ {
+ row_delimiter = row_delimit;
+ col_delimiter = col_delimit;
+ quote = quote_char;
+ escape_char = escp_char;
+ comment_empty_line = cmnt_empty_line;
+ comment_characters.assign(comment_chars.begin(), comment_chars.end());
+ trim_characters.assign(trim_chars.begin(), trim_chars.end());
+ }
+
+ char*next_line()
+ {
+ if(data_begin == data_end)
+ return nullptr;
+
+ ++file_line;
+
+ assert(data_begin < data_end);
+ assert(data_end <= block_len*2);
+
+ if(data_begin >= block_len)
+ {
+ std::memcpy(buffer.get(), buffer.get()+block_len, block_len);
+ data_begin -= block_len;
+ data_end -= block_len;
+ if(reader.is_valid())
+ {
+ data_end += reader.finish_read();
+ std::memcpy(buffer.get()+block_len, buffer.get()+2*block_len, block_len);
+ reader.start_read(buffer.get() + 2*block_len, block_len);
+ }
+ }
+
+ int line_end = data_begin;
+ while(line_end != data_end && buffer[line_end] != row_delimiter)
+ {
+ if(buffer[line_end] == quote || buffer[line_end] == escape_char)
+ {
+ if(buffer[line_end] == escape_char)
+ {
+ ++line_end;
+ if(line_end == data_end)
+ {
+ throw io::error::escaped_char_missing();
+ }
+ else if(buffer[line_end] == '\r' && buffer[line_end + 1] == '\n') // handle windows \r\n-line breaks
+ {
+ ++line_end;
+ }
+ }
+ else
+ {
+ ++line_end;
+ while(buffer[line_end] != quote)
+ {
+ if(line_end == data_end)
+ throw io::error::escaped_string_not_closed();
+ ++line_end;
+ }
+ }
+ }
+ ++line_end;
+ }
+
+ if(line_end - data_begin + 1 > block_len)
+ {
+ io::error::line_length_limit_exceeded err;
+ err.set_file_name(file_name);
+ err.set_file_line(file_line);
+ throw err;
+ }
+
+ if(line_end != data_end && buffer[line_end] == row_delimiter)
+ {
+ buffer[line_end] = '\0';
+ }
+ else
+ {
+ // some files are missing the newline at the end of the
+ // last line
+ ++data_end;
+ buffer[line_end] = '\0';
+ }
+
+ // handle windows \r\n-line breaks
+ if(row_delimiter == '\n')
+ {
+ if(line_end != data_begin && buffer[line_end-1] == '\r')
+ buffer[line_end-1] = '\0';
+ }
+
+ char*ret = buffer.get() + data_begin;
+ data_begin = line_end+1;
+ return ret;
+ }
+
+ bool read_row(std::vector<char*>& cols)
+ {
+ try{
+ try{
+ char*line;
+ do{
+ line = next_line();
+ if(!line)
+ return false;
+ }while(io::detail::is_comment(line, comment_empty_line, comment_characters));
+
+ io::detail::parse_line(line, cols, col_delimiter, quote, escape_char, trim_characters);
+
+ }catch(io::error::with_file_name&err){
+ err.set_file_name(get_truncated_file_name());
+ throw;
+ }
+ }catch(io::error::with_file_line&err){
+ err.set_file_line(get_file_line());
+ throw;
+ }
+
+ return true;
+ }
+};