diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/s3select/include | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | src/s3select/include/csvparser/LICENSE | 28 | ||||
-rw-r--r-- | src/s3select/include/csvparser/README.md | 275 | ||||
-rw-r--r-- | src/s3select/include/csvparser/csv.h | 1273 | ||||
-rw-r--r-- | src/s3select/include/encryption_internal.h | 114 | ||||
-rw-r--r-- | src/s3select/include/internal_file_decryptor.h | 121 | ||||
-rw-r--r-- | src/s3select/include/s3select.h | 3153 | ||||
-rw-r--r-- | src/s3select/include/s3select_csv_parser.h | 418 | ||||
-rw-r--r-- | src/s3select/include/s3select_functions.h | 2703 | ||||
-rw-r--r-- | src/s3select/include/s3select_json_parser.h | 829 | ||||
-rw-r--r-- | src/s3select/include/s3select_oper.h | 3326 | ||||
-rw-r--r-- | src/s3select/include/s3select_parquet_intrf.h | 2079 |
11 files changed, 14319 insertions, 0 deletions
diff --git a/src/s3select/include/csvparser/LICENSE b/src/s3select/include/csvparser/LICENSE new file mode 100644 index 000000000..da603a96b --- /dev/null +++ b/src/s3select/include/csvparser/LICENSE @@ -0,0 +1,28 @@ +Copyright (c) 2015, ben-strasser +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of fast-cpp-csv-parser nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ diff --git a/src/s3select/include/csvparser/README.md b/src/s3select/include/csvparser/README.md new file mode 100644 index 000000000..0b1d2c83e --- /dev/null +++ b/src/s3select/include/csvparser/README.md @@ -0,0 +1,275 @@ +# Fast C++ CSV Parser + +This is a small, easy-to-use and fast header-only library for reading comma separated value (CSV) files. + +## Features + + * Automatically rearranges columns by parsing the header line. + * Disk I/O and CSV-parsing are overlapped using threads for efficiency. + * Parsing features such as escaped strings can be enabled and disabled at compile time using templates. You only pay in speed for the features you actually use. + * Can read multiple GB files in reasonable time. + * Support for custom columns separators (i.e. Tab separated value files are supported), quote escaped strings, automatic space trimming. + * Works with `*`nix and Windows newlines and automatically ignores UTF-8 BOMs. + * Exception classes with enough context to format useful error messages. what() returns error messages ready to be shown to a user. + +## Getting Started + +The following small example should contain most of the syntax you need to use the library. + +```cpp +# include "csv.h" + +int main(){ + io::CSVReader<3> in("ram.csv"); + in.read_header(io::ignore_extra_column, "vendor", "size", "speed"); + std::string vendor; int size; double speed; + while(in.read_row(vendor, size, speed)){ + // do stuff with the data + } +} +``` + +## Installation + +The library only needs a standard conformant C++11 compiler. It has no further dependencies. The library is completely contained inside a single header file and therefore it is sufficient to copy this file to some place on your include path. The library does not have to be explicitly build. + +Note however, that threads are used and some compiler (for example GCC) require you to link against additional libraries to make it work. 
With GCC it is important to add -lpthread as the last item when linking, i.e. the order in + +``` +g++ -std=c++0x a.o b.o -o prog -lpthread +``` + +is important. If you for some reason do not want to use threads you can define CSV_IO_NO_THREAD before including the header. + +Remember that the library makes use of C++11 features and therefore you have to enable support for it (e.g. add -std=c++0x or -std=gnu++0x). + +The library was developed and tested with GCC 4.6.1 + +Note that VS2013 is not C++11 compliant and will therefore not work out of the box. See [here](https://code.google.com/p/fast-cpp-csv-parser/issues/detail?id=6) for what needs to be adjusted to make the code work. + +## Documentation + +The library provides two classes: + + * `LineReader`: A class to efficiently read large files line by line. + * `CSVReader`: A class that efficiently reads large CSV files. + +Note that everything is contained in the `io` namespace. + +### `LineReader` + +```cpp +class LineReader{ +public: + // Constructors + LineReader(some_string_type file_name); + LineReader(some_string_type file_name, std::FILE*source); + LineReader(some_string_type file_name, std::istream&source); + LineReader(some_string_type file_name, std::unique_ptr<ByteSourceBase>source); + + // Reading + char*next_line(); + + // File Location + void set_file_line(unsigned); + unsigned get_file_line()const; + void set_file_name(some_string_type file_name); + const char*get_truncated_file_name()const; +}; +``` + +The constructor takes a file name and optionally a data source. If no data source is provided the function tries to open the file with the given name and throws an `error::can_not_open_file` exception on failure. If a data source is provided then the file name is only used to format error messages. In that case you can essentially put any string there. Using a string that describes the data source results in more informative error messages. + +`some_string_type` can be a `std::string` or a `char*`. 
If the data source is a `std::FILE*` then the library will take care of calling `std::fclose`. If it is a `std::istream` then the stream is not closed by the library. For best performance open the streams in binary mode. However using text mode also works. `ByteSourceBase` provides an interface that you can use to implement further data sources. + +```cpp +class ByteSourceBase{ +public: + virtual int read(char*buffer, int size)=0; + virtual ~ByteSourceBase(){} +}; +``` + +The read function should fill the provided buffer with at most `size` bytes from the data source. It should return the number of bytes actually written to the buffer. If the data source has run out of bytes (because for example an end of file was reached) then the function should return 0. If a fatal error occurs then you can throw an exception. Note that the function can be called both from the main and the worker thread. However, it is guaranteed that they do not call the function at the same time. + +Lines are read by calling the `next_line` function. It returns a pointer to a null terminated C-string that contains the line. If the end of file is reached a null pointer is returned. The newline character is not included in the string. You may modify the string as long as you do not write past the null terminator. The string stays valid until the destructor is called or until next_line is called again. Windows and `*`nix newlines are handled transparently. UTF-8 BOMs are automatically ignored and missing newlines at the end of the file are no problem. + +**Important:** There is a limit of 2^24-1 characters per line. If this limit is exceeded an `error::line_length_limit_exceeded` exception is thrown. + +Looping over all the lines in a file can be done in the following way. +```cpp +LineReader in(...); +while(char*line = in.next_line()){ + ... +} +``` + +The remaining functions are mainly used to format error messages. 
The file line indicates the current position in the file, i.e., after the first `next_line` call it is 1 and after the second 2. Before the first call it is 0. The file name is truncated as internally C-strings are used to avoid `std::bad_alloc` exceptions during error reporting. + +**Note:** It is not possible to exchange the line termination character. + +### `CSVReader` + +`CSVReader` uses policies. These are classes with only static members to allow core functionality to be exchanged in an efficient way. + +```cpp +template< + unsigned column_count, + class trim_policy = trim_chars<' ', '\t'>, + class quote_policy = no_quote_escape<','>, + class overflow_policy = throw_on_overflow, + class comment_policy = no_comment +> +class CSVReader{ +public: + // Constructors + // same as for LineReader + + // Parsing Header + void read_header(ignore_column ignore_policy, some_string_type col_name1, some_string_type col_name2, ...); + void set_header(some_string_type col_name1, some_string_type col_name2, ...); + bool has_column(some_string_type col_name)const; + + // Read + char*next_line(); + bool read_row(ColType1&col1, ColType2&col2, ...); + + // File Location + void set_file_line(unsigned); + unsigned get_file_line()const; + void set_file_name(some_string_type file_name); + const char*get_truncated_file_name()const; +}; +``` + +The `column_count` template parameter indicates how many columns you want to read from the CSV file. This must not necessarily coincide with the actual number of columns in the file. The three policies govern various aspects of the parsing. + +The trim policy indicates what characters should be ignored at the begin and the end of every column. The default ignores spaces and tabs. This makes sure that + +``` +a,b,c +1,2,3 +``` + +is interpreted in the same way as + +``` + a, b, c +1 , 2, 3 +``` + +The trim_chars can take any number of template parameters. For example `trim_chars<' ', '\t', '_'> `is also valid. 
If no character should be trimmed use `trim_chars<>`. + +The quote policy indicates how strings should be escaped. It also specifies the column separator. The predefined policies are: + + * `no_quote_escape<sep>` : Strings are not escaped. "`sep`" is used as column separator. + * `double_quote_escape<sep, quote>` : Strings are escaped using quotes. Quotes are escaped using two consecutive quotes. "`sep`" is used as column separator and "`quote`" as quoting character. + +**Important**: When combining trimming and quoting the rows are first trimmed and then unquoted. A consequence is that spaces inside the quotes will be conserved. If you want to get rid of spaces inside the quotes, you need to remove them yourself. + +**Important**: Quoting can be quite expensive. Disable it if you do not need it. + +**Important**: Quoted strings may not contain unescaped newlines. This is currently not supported. + +The overflow policy indicates what should be done if the integers in the input are too large to fit into the variables. The following policies are predefined: + + * `throw_on_overflow` : Throw an `error::integer_overflow` or `error::integer_underflow` exception. + * `ignore_overflow` : Do nothing and let the overflow happen. + * `set_to_max_on_overflow` : Set the value to `numeric_limits<...>::max()` (or to the corresponding minimum on underflow). + +The comment policy allows lines to be skipped based on some criteria. Valid predefined policies are: + + * `no_comment` : Do not ignore any line. + * `empty_line_comment` : Ignore all lines that are empty or only contain spaces and tabs. + * `single_line_comment<com1, com2, ...>` : Ignore all lines that start with com1 or com2 or ... as the first character. There may not be any space between the beginning of the line and the comment character. + * `single_and_empty_line_comment<com1, com2, ...>` : Ignore all empty lines and single line comments. 
+ +Examples: + + * `CSVReader<4, trim_chars<' '>, double_quote_escape<',','\"'> >` reads 4 columns from a normal CSV file with string escaping enabled. + * `CSVReader<3, trim_chars<' '>, no_quote_escape<'\t'>, throw_on_overflow, single_line_comment<'#'> >` reads 3 columns from a tab separated file with string escaping disabled. Lines starting with a # are ignored. + +The constructors and the file location functions are exactly the same as for `LineReader`. See its documentation for details. + +There are three methods that deal with headers. The `read_header` method reads a line from the file and rearranges the columns to match that order. It also checks whether all necessary columns are present. The `set_header` method does *not* read any input. Use it if the file does not have any header. Obviously it is impossible to rearrange columns or check for their availability when using it. The order in the file and in the program must match when using `set_header`. The `has_column` method checks whether a column is present in the file. The first argument of `read_header` is a bitfield that determines how the function should react to column mismatches. The default behavior is to throw an `error::extra_column_in_header` exception if the file contains more columns than expected and an `error::missing_column_in_header` when there are not enough. This behavior can be altered using the following flags. + + * `ignore_no_column`: The default behavior, no flags are set + * `ignore_extra_column`: If a column with a name is in the file but not in the argument list, then it is silently ignored. + * `ignore_missing_column`: If a column with a name is not in the file but is in the argument list, then `read_row` will not modify the corresponding variable. 
+ +When using `ignore_missing_column` it is a good idea to initialize the variables passed to `read_row` with a default value, for example: + +```cpp +// The file only contains column "a" +CSVReader<2>in(...); +in.read_header(ignore_missing_column, "a", "b"); +int a,b = 42; +while(in.read_row(a,b)){ + // a contains the value from the file + // b is left unchanged by read_row, i.e., it is 42 +} +``` + +If only some columns are optional or their default value depends on other columns you have to use `has_column`, for example: + +```cpp +// The file only contains the columns "a" and "b" +CSVReader<3>in(...); +in.read_header(ignore_missing_column, "a", "b", "sum"); +if(!in.has_column("a") || !in.has_column("b")) + throw my_neat_error_class(); +bool has_sum = in.has_column("sum"); +int a,b,sum; +while(in.read_row(a,b,sum)){ + if(!has_sum) + sum = a+b; +} +``` + +**Important**: Do not call `has_column` from within the read-loop. It would work correctly but significantly slow down processing. + +If two columns have the same name an `error::duplicated_column_in_header` exception is thrown. If `read_header` is called but the file is empty an `error::header_missing` exception is thrown. + +The `next_line` function reads a line without parsing it. It works analogously to `LineReader::next_line`. This can be used to skip broken lines in a CSV file. However, in nearly all applications you will want to use the `read_row` function. + +The `read_row` function reads a line, splits it into the columns and arranges them correctly. It trims the entries and unescapes them. If requested the content is interpreted as integer or as floating point. The variables passed to read_row may be of the following types. + + * builtin signed integer: These are `signed char`, `short`, `int`, `long` and `long long`. The input must be encoded as a base 10 ASCII number optionally preceded by a + or -. 
The function detects whether the integer would overflow (or underflow) and behaves as indicated by overflow_policy. + * builtin unsigned integer: Just as the signed counterparts except that a leading + or - is not allowed. + * builtin floating point: These are `float`, `double` and `long double`. The input may have a leading + or -. The number must be base 10 encoded. The decimal point may either be a dot or a comma. (Note that a comma will only work if it is not also used as column separator or the number is escaped.) A base 10 exponent may be specified using the "1e10" syntax. The "e" may be lower- or uppercase. Examples for valid floating points are "1", "-42.42" and "+123.456E789". The input is rounded to the next floating point or infinity if it is too large or small. + * `char`: The column content must be a single character. + * `std::string`: The column content is assigned to the string. The std::string is filled with the trimmed and unescaped version. + * `char*`: A pointer directly into the buffer. The string is trimmed and unescaped and null terminated. This pointer stays valid until read_row is called again or the CSVReader is destroyed. Use this for user defined types. + +Note that there is no inherent overhead to using `char*` and then interpreting it compared to using one of the parsers directly built into `CSVReader`. The builtin number parsers are pure convenience. If you need a slightly different syntax then use `char*` and do the parsing yourself. + +## FAQ + +Q: The library is throwing a std::system_error with code -1. How to get it to work? + +A: Your compiler's std::thread implementation is broken. Define CSV\_IO\_NO\_THREAD to disable threading support. + + +Q: My values are not just ints or strings. I want to parse my customized type. Is this possible? + +A: Read a `char*` and parse the string. At first this seems expensive but it is not as the pointer you get points directly into the memory buffer. 
In fact there is no inherent reason why a custom int-parser realized this way must be any slower than the int-parser built into the library. By reading a `char*` the library takes care of column reordering and quote escaping and leaves the actual parsing to you. Note that using a std::string is slower as it involves a memory copy. + + +Q: I get lots of compiler errors when compiling the header! Please fix it. :( + +A: Have you enabled the C++11 mode of your compiler? If you use GCC you have to add -std=c++0x to the commandline. If this does not resolve the problem, then please open a ticket. + + +Q: The library crashes when parsing large files! Please fix it. :( + +A: When using GCC have you linked against -lpthread? Read the installation section for details on how to do this. If this does not resolve the issue then please open a ticket. (The reason why it crashes only on large files is that the first chunk is read synchronously and if the whole file fits into this chunk then no asynchronous call is performed.) Alternatively you can define CSV\_IO\_NO\_THREAD. + + +Q: Does the library support UTF? + +A: The library has basic UTF-8 support, or to be more precise it does not break when passing UTF-8 strings through it. If you read a `char*` then you get a pointer to the UTF-8 string. You will have to decode the string on your own. The separator, quoting, and commenting characters used by the library can only be ASCII characters. + + +Q: Does the library support string fields that span multiple lines? + +A: No. This feature has been often requested in the past, however, it is difficult to make it work with the current design without breaking something else. 
diff --git a/src/s3select/include/csvparser/csv.h b/src/s3select/include/csvparser/csv.h new file mode 100644 index 000000000..c5cb5bcae --- /dev/null +++ b/src/s3select/include/csvparser/csv.h @@ -0,0 +1,1273 @@ +// Copyright: (2012-2015) Ben Strasser <code@ben-strasser.net> +// License: BSD-3 +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+ +#ifndef CSV_H +#define CSV_H + +#include <vector> +#include <string> +#include <cstring> +#include <algorithm> +#include <utility> +#include <cstdio> +#include <exception> +#ifndef CSV_IO_NO_THREAD +#include <mutex> +#include <thread> +#include <condition_variable> +#endif +#include <memory> +#include <cassert> +#include <cerrno> +#include <istream> +#include <limits> + +namespace io{ + //////////////////////////////////////////////////////////////////////////// + // LineReader // + //////////////////////////////////////////////////////////////////////////// + + namespace error{ + struct base : std::exception{ + virtual void format_error_message()const = 0; + + const char*what()const noexcept override{ + format_error_message(); + return error_message_buffer; + } + + mutable char error_message_buffer[512]; + }; + + const int max_file_name_length = 255; + + struct with_file_name{ + with_file_name(){ + std::memset(file_name, 0, sizeof(file_name)); + } + + void set_file_name(const char*file_name){ + if(file_name != nullptr){ + // This call to strncpy has parenthesis around it + // to silence the GCC -Wstringop-truncation warning + (strncpy(this->file_name, file_name, sizeof(this->file_name))); + this->file_name[sizeof(this->file_name)-1] = '\0'; + }else{ + this->file_name[0] = '\0'; + } + } + + char file_name[max_file_name_length+1]; + }; + + struct with_file_line{ + with_file_line(){ + file_line = -1; + } + + void set_file_line(int file_line){ + this->file_line = file_line; + } + + int file_line; + }; + + struct with_errno{ + with_errno(){ + errno_value = 0; + } + + void set_errno(int errno_value){ + this->errno_value = errno_value; + } + + int errno_value; + }; + + struct can_not_open_file : + base, + with_file_name, + with_errno{ + void format_error_message()const override{ + if(errno_value != 0) + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Can not open file \"%s\" because \"%s\"." 
+ , file_name, std::strerror(errno_value)); + else + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Can not open file \"%s\"." + , file_name); + } + }; + + struct line_length_limit_exceeded : + base, + with_file_name, + with_file_line{ + void format_error_message()const override{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Line number %d in file \"%s\" exceeds the maximum length of 2^24-1." + , file_line, file_name); + } + }; + } + + class ByteSourceBase{ + public: + virtual int read(char*buffer, int size)=0; + virtual ~ByteSourceBase(){} + }; + + namespace detail{ + + class OwningStdIOByteSourceBase : public ByteSourceBase{ + public: + explicit OwningStdIOByteSourceBase(FILE*file):file(file){ + // Tell the std library that we want to do the buffering ourself. + std::setvbuf(file, 0, _IONBF, 0); + } + + int read(char*buffer, int size){ + return std::fread(buffer, 1, size, file); + } + + ~OwningStdIOByteSourceBase(){ + std::fclose(file); + } + + private: + FILE*file; + }; + + class NonOwningIStreamByteSource : public ByteSourceBase{ + public: + explicit NonOwningIStreamByteSource(std::istream&in):in(in){} + + int read(char*buffer, int size){ + in.read(buffer, size); + return in.gcount(); + } + + ~NonOwningIStreamByteSource(){} + + private: + std::istream&in; + }; + + class NonOwningStringByteSource : public ByteSourceBase{ + public: + NonOwningStringByteSource(const char*str, long long size):str(str), remaining_byte_count(size){} + + int read(char*buffer, int desired_byte_count){ + int to_copy_byte_count = desired_byte_count; + if(remaining_byte_count < to_copy_byte_count) + to_copy_byte_count = remaining_byte_count; + std::memcpy(buffer, str, to_copy_byte_count); + remaining_byte_count -= to_copy_byte_count; + str += to_copy_byte_count; + return to_copy_byte_count; + } + + ~NonOwningStringByteSource(){} + + private: + const char*str; + long long remaining_byte_count; + }; + + #ifndef CSV_IO_NO_THREAD + class 
AsynchronousReader{ + public: + void init(std::unique_ptr<ByteSourceBase>arg_byte_source){ + std::unique_lock<std::mutex>guard(lock); + byte_source = std::move(arg_byte_source); + desired_byte_count = -1; + termination_requested = false; + worker = std::thread( + [&]{ + std::unique_lock<std::mutex>guard(lock); + try{ + for(;;){ + read_requested_condition.wait( + guard, + [&]{ + return desired_byte_count != -1 || termination_requested; + } + ); + if(termination_requested) + return; + + read_byte_count = byte_source->read(buffer, desired_byte_count); + desired_byte_count = -1; + if(read_byte_count == 0) + break; + read_finished_condition.notify_one(); + } + }catch(...){ + read_error = std::current_exception(); + } + read_finished_condition.notify_one(); + } + ); + } + + bool is_valid()const{ + return byte_source != nullptr; + } + + void start_read(char*arg_buffer, int arg_desired_byte_count){ + std::unique_lock<std::mutex>guard(lock); + buffer = arg_buffer; + desired_byte_count = arg_desired_byte_count; + read_byte_count = -1; + read_requested_condition.notify_one(); + } + + int finish_read(){ + std::unique_lock<std::mutex>guard(lock); + read_finished_condition.wait( + guard, + [&]{ + return read_byte_count != -1 || read_error; + } + ); + if(read_error) + std::rethrow_exception(read_error); + else + return read_byte_count; + } + + ~AsynchronousReader(){ + if(byte_source != nullptr){ + { + std::unique_lock<std::mutex>guard(lock); + termination_requested = true; + } + read_requested_condition.notify_one(); + worker.join(); + } + } + + private: + std::unique_ptr<ByteSourceBase>byte_source; + + std::thread worker; + + bool termination_requested; + std::exception_ptr read_error; + char*buffer; + int desired_byte_count; + int read_byte_count; + + std::mutex lock; + std::condition_variable read_finished_condition; + std::condition_variable read_requested_condition; + }; + #endif + + class SynchronousReader{ + public: + void 
init(std::unique_ptr<ByteSourceBase>arg_byte_source){ + byte_source = std::move(arg_byte_source); + } + + bool is_valid()const{ + return byte_source != nullptr; + } + + void start_read(char*arg_buffer, int arg_desired_byte_count){ + buffer = arg_buffer; + desired_byte_count = arg_desired_byte_count; + } + + int finish_read(){ + return byte_source->read(buffer, desired_byte_count); + } + private: + std::unique_ptr<ByteSourceBase>byte_source; + char*buffer; + int desired_byte_count; + }; + } + + class LineReader{ + private: + static const int block_len = 1<<20; + std::unique_ptr<char[]>buffer; // must be constructed before (and thus destructed after) the reader! + #ifdef CSV_IO_NO_THREAD + detail::SynchronousReader reader; + #else + detail::AsynchronousReader reader; + #endif + int data_begin; + int data_end; + + char file_name[error::max_file_name_length+1]; + unsigned file_line; + + static std::unique_ptr<ByteSourceBase> open_file(const char*file_name){ + // We open the file in binary mode as it makes no difference under *nix + // and under Windows we handle \r\n newlines ourself. + FILE*file = std::fopen(file_name, "rb"); + if(file == 0){ + int x = errno; // store errno as soon as possible, doing it after constructor call can fail. 
+ error::can_not_open_file err; + err.set_errno(x); + err.set_file_name(file_name); + throw err; + } + return std::unique_ptr<ByteSourceBase>(new detail::OwningStdIOByteSourceBase(file)); + } + + void init(std::unique_ptr<ByteSourceBase>byte_source){ + file_line = 0; + + buffer = std::unique_ptr<char[]>(new char[3*block_len]); + data_begin = 0; + data_end = byte_source->read(buffer.get(), 2*block_len); + + // Ignore UTF-8 BOM + if(data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') + data_begin = 3; + + if(data_end == 2*block_len){ + reader.init(std::move(byte_source)); + reader.start_read(buffer.get() + 2*block_len, block_len); + } + } + + public: + LineReader() = delete; + LineReader(const LineReader&) = delete; + LineReader&operator=(const LineReader&) = delete; + + explicit LineReader(const char*file_name){ + set_file_name(file_name); + init(open_file(file_name)); + } + + explicit LineReader(const std::string&file_name){ + set_file_name(file_name.c_str()); + init(open_file(file_name.c_str())); + } + + LineReader(const char*file_name, std::unique_ptr<ByteSourceBase>byte_source){ + set_file_name(file_name); + init(std::move(byte_source)); + } + + LineReader(const std::string&file_name, std::unique_ptr<ByteSourceBase>byte_source){ + set_file_name(file_name.c_str()); + init(std::move(byte_source)); + } + + LineReader(const char*file_name, const char*data_begin, const char*data_end){ + set_file_name(file_name); + init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningStringByteSource(data_begin, data_end-data_begin))); + } + + LineReader(const std::string&file_name, const char*data_begin, const char*data_end){ + set_file_name(file_name.c_str()); + init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningStringByteSource(data_begin, data_end-data_begin))); + } + + LineReader(const char*file_name, FILE*file){ + set_file_name(file_name); + init(std::unique_ptr<ByteSourceBase>(new detail::OwningStdIOByteSourceBase(file))); + } + + 
LineReader(const std::string&file_name, FILE*file){ + set_file_name(file_name.c_str()); + init(std::unique_ptr<ByteSourceBase>(new detail::OwningStdIOByteSourceBase(file))); + } + + LineReader(const char*file_name, std::istream&in){ + set_file_name(file_name); + init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningIStreamByteSource(in))); + } + + LineReader(const std::string&file_name, std::istream&in){ + set_file_name(file_name.c_str()); + init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningIStreamByteSource(in))); + } + + void set_file_name(const std::string&file_name){ + set_file_name(file_name.c_str()); + } + + void set_file_name(const char*file_name){ + if(file_name != nullptr){ + strncpy(this->file_name, file_name, sizeof(this->file_name)); + this->file_name[sizeof(this->file_name)-1] = '\0'; + }else{ + this->file_name[0] = '\0'; + } + } + + const char*get_truncated_file_name()const{ + return file_name; + } + + void set_file_line(unsigned file_line){ + this->file_line = file_line; + } + + unsigned get_file_line()const{ + return file_line; + } + + char*next_line(){ + if(data_begin == data_end) + return nullptr; + + ++file_line; + + assert(data_begin < data_end); + assert(data_end <= block_len*2); + + if(data_begin >= block_len){ + std::memcpy(buffer.get(), buffer.get()+block_len, block_len); + data_begin -= block_len; + data_end -= block_len; + if(reader.is_valid()) + { + data_end += reader.finish_read(); + std::memcpy(buffer.get()+block_len, buffer.get()+2*block_len, block_len); + reader.start_read(buffer.get() + 2*block_len, block_len); + } + } + + int line_end = data_begin; + while(line_end != data_end && buffer[line_end] != '\n'){ + ++line_end; + } + + if(line_end - data_begin + 1 > block_len){ + error::line_length_limit_exceeded err; + err.set_file_name(file_name); + err.set_file_line(file_line); + throw err; + } + + if(line_end != data_end && buffer[line_end] == '\n'){ + buffer[line_end] = '\0'; + }else{ + // some files are missing the newline 
at the end of the + // last line + ++data_end; + buffer[line_end] = '\0'; + } + + // handle windows \r\n-line breaks + if(line_end != data_begin && buffer[line_end-1] == '\r') + buffer[line_end-1] = '\0'; + + char*ret = buffer.get() + data_begin; + data_begin = line_end+1; + return ret; + } + }; + + + //////////////////////////////////////////////////////////////////////////// + // CSV // + //////////////////////////////////////////////////////////////////////////// + + namespace error{ + const int max_column_name_length = 63; + struct with_column_name{ + with_column_name(){ + std::memset(column_name, 0, max_column_name_length+1); + } + + void set_column_name(const char*column_name){ + if(column_name != nullptr){ + std::strncpy(this->column_name, column_name, max_column_name_length); + this->column_name[max_column_name_length] = '\0'; + }else{ + this->column_name[0] = '\0'; + } + } + + char column_name[max_column_name_length+1]; + }; + + + const int max_column_content_length = 63; + + struct with_column_content{ + with_column_content(){ + std::memset(column_content, 0, max_column_content_length+1); + } + + void set_column_content(const char*column_content){ + if(column_content != nullptr){ + std::strncpy(this->column_content, column_content, max_column_content_length); + this->column_content[max_column_content_length] = '\0'; + }else{ + this->column_content[0] = '\0'; + } + } + + char column_content[max_column_content_length+1]; + }; + + + struct extra_column_in_header : + base, + with_file_name, + with_column_name{ + void format_error_message()const override{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + R"(Extra column "%s" in header of file "%s".)" + , column_name, file_name); + } + }; + + struct missing_column_in_header : + base, + with_file_name, + with_column_name{ + void format_error_message()const override{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + R"(Missing column "%s" in header of file "%s".)" + , 
column_name, file_name); + } + }; + + struct duplicated_column_in_header : + base, + with_file_name, + with_column_name{ + void format_error_message()const override{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + R"(Duplicated column "%s" in header of file "%s".)" + , column_name, file_name); + } + }; + + struct header_missing : + base, + with_file_name{ + void format_error_message()const override{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Header missing in file \"%s\"." + , file_name); + } + }; + + struct too_few_columns : + base, + with_file_name, + with_file_line{ + void format_error_message()const override{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Too few columns in line %d in file \"%s\"." + , file_line, file_name); + } + }; + + struct too_many_columns : + base, + with_file_name, + with_file_line{ + void format_error_message()const override{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Too many columns in line %d in file \"%s\"." + , file_line, file_name); + } + }; + + struct escaped_string_not_closed : + base, + with_file_name, + with_file_line{ + void format_error_message()const override{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Escaped string was not closed in line %d in file \"%s\"." 
+ , file_line, file_name); + } + }; + + struct integer_must_be_positive : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const override{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + R"(The integer "%s" must be positive or 0 in column "%s" in file "%s" in line "%d".)" + , column_content, column_name, file_name, file_line); + } + }; + + struct no_digit : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const override{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + R"(The integer "%s" contains an invalid digit in column "%s" in file "%s" in line "%d".)" + , column_content, column_name, file_name, file_line); + } + }; + + struct integer_overflow : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const override{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + R"(The integer "%s" overflows in column "%s" in file "%s" in line "%d".)" + , column_content, column_name, file_name, file_line); + } + }; + + struct integer_underflow : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const override{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + R"(The integer "%s" underflows in column "%s" in file "%s" in line "%d".)" + , column_content, column_name, file_name, file_line); + } + }; + + struct invalid_single_character : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const override{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + R"(The content "%s" of column "%s" in file "%s" in line "%d" is not a single character.)" + , column_content, column_name, file_name, file_line); + } + }; + } + + using ignore_column = unsigned int; + static 
const ignore_column ignore_no_column = 0; + static const ignore_column ignore_extra_column = 1; + static const ignore_column ignore_missing_column = 2; + + template<char ... trim_char_list> + struct trim_chars{ + private: + constexpr static bool is_trim_char(char){ + return false; + } + + template<class ...OtherTrimChars> + constexpr static bool is_trim_char(char c, char trim_char, OtherTrimChars...other_trim_chars){ + return c == trim_char || is_trim_char(c, other_trim_chars...); + } + + public: + static void trim(char*&str_begin, char*&str_end){ + while(str_begin != str_end && is_trim_char(*str_begin, trim_char_list...)) + ++str_begin; + while(str_begin != str_end && is_trim_char(*(str_end-1), trim_char_list...)) + --str_end; + *str_end = '\0'; + } + }; + + + struct no_comment{ + static bool is_comment(const char*){ + return false; + } + }; + + template<char ... comment_start_char_list> + struct single_line_comment{ + private: + constexpr static bool is_comment_start_char(char){ + return false; + } + + template<class ...OtherCommentStartChars> + constexpr static bool is_comment_start_char(char c, char comment_start_char, OtherCommentStartChars...other_comment_start_chars){ + return c == comment_start_char || is_comment_start_char(c, other_comment_start_chars...); + } + + public: + + static bool is_comment(const char*line){ + return is_comment_start_char(*line, comment_start_char_list...); + } + }; + + struct empty_line_comment{ + static bool is_comment(const char*line){ + if(*line == '\0') + return true; + while(*line == ' ' || *line == '\t'){ + ++line; + if(*line == 0) + return true; + } + return false; + } + }; + + template<char ... 
comment_start_char_list> + struct single_and_empty_line_comment{ + static bool is_comment(const char*line){ + return single_line_comment<comment_start_char_list...>::is_comment(line) || empty_line_comment::is_comment(line); + } + }; + + template<char sep> + struct no_quote_escape{ + static const char*find_next_column_end(const char*col_begin){ + while(*col_begin != sep && *col_begin != '\0') + ++col_begin; + return col_begin; + } + + static void unescape(char*&, char*&){ + + } + }; + + template<char sep, char quote> + struct double_quote_escape{ + static const char*find_next_column_end(const char*col_begin){ + while(*col_begin != sep && *col_begin != '\0') + if(*col_begin != quote) + ++col_begin; + else{ + do{ + ++col_begin; + while(*col_begin != quote){ + if(*col_begin == '\0') + throw error::escaped_string_not_closed(); + ++col_begin; + } + ++col_begin; + }while(*col_begin == quote); + } + return col_begin; + } + + static void unescape(char*&col_begin, char*&col_end){ + if(col_end - col_begin >= 2){ + if(*col_begin == quote && *(col_end-1) == quote){ + ++col_begin; + --col_end; + char*out = col_begin; + for(char*in = col_begin; in!=col_end; ++in){ + if(*in == quote && (in+1) != col_end && *(in+1) == quote){ + ++in; + } + *out = *in; + ++out; + } + col_end = out; + *col_end = '\0'; + } + } + + } + }; + + struct throw_on_overflow{ + template<class T> + static void on_overflow(T&){ + throw error::integer_overflow(); + } + + template<class T> + static void on_underflow(T&){ + throw error::integer_underflow(); + } + }; + + struct ignore_overflow{ + template<class T> + static void on_overflow(T&){} + + template<class T> + static void on_underflow(T&){} + }; + + struct set_to_max_on_overflow{ + template<class T> + static void on_overflow(T&x){ + // using (std::numeric_limits<T>::max) instead of std::numeric_limits<T>::max + // to make code including windows.h with its max macro happy + x = (std::numeric_limits<T>::max)(); + } + + template<class T> + static void 
on_underflow(T&x){ + x = (std::numeric_limits<T>::min)(); + } + }; + + + namespace detail{ + template<class quote_policy> + void chop_next_column( + char*&line, char*&col_begin, char*&col_end + ){ + assert(line != nullptr); + + col_begin = line; + // the col_begin + (... - col_begin) removes the constness + col_end = col_begin + (quote_policy::find_next_column_end(col_begin) - col_begin); + + if(*col_end == '\0'){ + line = nullptr; + }else{ + *col_end = '\0'; + line = col_end + 1; + } + } + + template<class trim_policy, class quote_policy> + void parse_line( + char*line, + char**sorted_col, + const std::vector<int>&col_order + ){ + for (int i : col_order) { + if(line == nullptr) + throw ::io::error::too_few_columns(); + char*col_begin, *col_end; + chop_next_column<quote_policy>(line, col_begin, col_end); + + if (i != -1) { + trim_policy::trim(col_begin, col_end); + quote_policy::unescape(col_begin, col_end); + + sorted_col[i] = col_begin; + } + } + if(line != nullptr) + throw ::io::error::too_many_columns(); + } + + template<unsigned column_count, class trim_policy, class quote_policy> + void parse_header_line( + char*line, + std::vector<int>&col_order, + const std::string*col_name, + ignore_column ignore_policy + ){ + col_order.clear(); + + bool found[column_count]; + std::fill(found, found + column_count, false); + while(line){ + char*col_begin,*col_end; + chop_next_column<quote_policy>(line, col_begin, col_end); + + trim_policy::trim(col_begin, col_end); + quote_policy::unescape(col_begin, col_end); + + for(unsigned i=0; i<column_count; ++i) + if(col_begin == col_name[i]){ + if(found[i]){ + error::duplicated_column_in_header err; + err.set_column_name(col_begin); + throw err; + } + found[i] = true; + col_order.push_back(i); + col_begin = 0; + break; + } + if(col_begin){ + if(ignore_policy & ::io::ignore_extra_column) + col_order.push_back(-1); + else{ + error::extra_column_in_header err; + err.set_column_name(col_begin); + throw err; + } + } + } + 
if(!(ignore_policy & ::io::ignore_missing_column)){ + for(unsigned i=0; i<column_count; ++i){ + if(!found[i]){ + error::missing_column_in_header err; + err.set_column_name(col_name[i].c_str()); + throw err; + } + } + } + } + + template<class overflow_policy> + void parse(char*col, char &x){ + if(!*col) + throw error::invalid_single_character(); + x = *col; + ++col; + if(*col) + throw error::invalid_single_character(); + } + + template<class overflow_policy> + void parse(char*col, std::string&x){ + x = col; + } + + template<class overflow_policy> + void parse(char*col, const char*&x){ + x = col; + } + + template<class overflow_policy> + void parse(char*col, char*&x){ + x = col; + } + + template<class overflow_policy, class T> + void parse_unsigned_integer(const char*col, T&x){ + x = 0; + while(*col != '\0'){ + if('0' <= *col && *col <= '9'){ + T y = *col - '0'; + if(x > ((std::numeric_limits<T>::max)()-y)/10){ + overflow_policy::on_overflow(x); + return; + } + x = 10*x+y; + }else + throw error::no_digit(); + ++col; + } + } + + template<class overflow_policy>void parse(char*col, unsigned char &x) + {parse_unsigned_integer<overflow_policy>(col, x);} + template<class overflow_policy>void parse(char*col, unsigned short &x) + {parse_unsigned_integer<overflow_policy>(col, x);} + template<class overflow_policy>void parse(char*col, unsigned int &x) + {parse_unsigned_integer<overflow_policy>(col, x);} + template<class overflow_policy>void parse(char*col, unsigned long &x) + {parse_unsigned_integer<overflow_policy>(col, x);} + template<class overflow_policy>void parse(char*col, unsigned long long &x) + {parse_unsigned_integer<overflow_policy>(col, x);} + + template<class overflow_policy, class T> + void parse_signed_integer(const char*col, T&x){ + if(*col == '-'){ + ++col; + + x = 0; + while(*col != '\0'){ + if('0' <= *col && *col <= '9'){ + T y = *col - '0'; + if(x < ((std::numeric_limits<T>::min)()+y)/10){ + overflow_policy::on_underflow(x); + return; + } + x = 10*x-y; + 
}else + throw error::no_digit(); + ++col; + } + return; + }else if(*col == '+') + ++col; + parse_unsigned_integer<overflow_policy>(col, x); + } + + template<class overflow_policy>void parse(char*col, signed char &x) + {parse_signed_integer<overflow_policy>(col, x);} + template<class overflow_policy>void parse(char*col, signed short &x) + {parse_signed_integer<overflow_policy>(col, x);} + template<class overflow_policy>void parse(char*col, signed int &x) + {parse_signed_integer<overflow_policy>(col, x);} + template<class overflow_policy>void parse(char*col, signed long &x) + {parse_signed_integer<overflow_policy>(col, x);} + template<class overflow_policy>void parse(char*col, signed long long &x) + {parse_signed_integer<overflow_policy>(col, x);} + + template<class T> + void parse_float(const char*col, T&x){ + bool is_neg = false; + if(*col == '-'){ + is_neg = true; + ++col; + }else if(*col == '+') + ++col; + + x = 0; + while('0' <= *col && *col <= '9'){ + int y = *col - '0'; + x *= 10; + x += y; + ++col; + } + + if(*col == '.'|| *col == ','){ + ++col; + T pos = 1; + while('0' <= *col && *col <= '9'){ + pos /= 10; + int y = *col - '0'; + ++col; + x += y*pos; + } + } + + if(*col == 'e' || *col == 'E'){ + ++col; + int e; + + parse_signed_integer<set_to_max_on_overflow>(col, e); + + if(e != 0){ + T base; + if(e < 0){ + base = T(0.1); + e = -e; + }else{ + base = T(10); + } + + while(e != 1){ + if((e & 1) == 0){ + base = base*base; + e >>= 1; + }else{ + x *= base; + --e; + } + } + x *= base; + } + }else{ + if(*col != '\0') + throw error::no_digit(); + } + + if(is_neg) + x = -x; + } + + template<class overflow_policy> void parse(char*col, float&x) { parse_float(col, x); } + template<class overflow_policy> void parse(char*col, double&x) { parse_float(col, x); } + template<class overflow_policy> void parse(char*col, long double&x) { parse_float(col, x); } + + template<class overflow_policy, class T> + void parse(char*col, T&x){ + // Mute unused variable compiler warning + 
(void)col; + (void)x; + // GCC evalutes "false" when reading the template and + // "sizeof(T)!=sizeof(T)" only when instantiating it. This is why + // this strange construct is used. + static_assert(sizeof(T)!=sizeof(T), + "Can not parse this type. Only buildin integrals, floats, char, char*, const char* and std::string are supported"); + } + + } + + template<unsigned column_count, + class trim_policy = trim_chars<' ', '\t'>, + class quote_policy = no_quote_escape<','>, + class overflow_policy = throw_on_overflow, + class comment_policy = no_comment + > + class CSVReader{ + private: + LineReader in; + + char*row[column_count]; + std::string column_names[column_count]; + + std::vector<int>col_order; + + template<class ...ColNames> + void set_column_names(std::string s, ColNames...cols){ + column_names[column_count-sizeof...(ColNames)-1] = std::move(s); + set_column_names(std::forward<ColNames>(cols)...); + } + + void set_column_names(){} + + + public: + CSVReader() = delete; + CSVReader(const CSVReader&) = delete; + CSVReader&operator=(const CSVReader&); + + template<class ...Args> + explicit CSVReader(Args&&...args):in(std::forward<Args>(args)...){ + std::fill(row, row+column_count, nullptr); + col_order.resize(column_count); + for(unsigned i=0; i<column_count; ++i) + col_order[i] = i; + for(unsigned i=1; i<=column_count; ++i) + column_names[i-1] = "col"+std::to_string(i); + } + + char*next_line(){ + return in.next_line(); + } + + template<class ...ColNames> + void read_header(ignore_column ignore_policy, ColNames...cols){ + static_assert(sizeof...(ColNames)>=column_count, "not enough column names specified"); + static_assert(sizeof...(ColNames)<=column_count, "too many column names specified"); + try{ + set_column_names(std::forward<ColNames>(cols)...); + + char*line; + do{ + line = in.next_line(); + if(!line) + throw error::header_missing(); + }while(comment_policy::is_comment(line)); + + detail::parse_header_line + <column_count, trim_policy, quote_policy> + 
(line, col_order, column_names, ignore_policy); + }catch(error::with_file_name&err){ + err.set_file_name(in.get_truncated_file_name()); + throw; + } + } + + template<class ...ColNames> + void set_header(ColNames...cols){ + static_assert(sizeof...(ColNames)>=column_count, + "not enough column names specified"); + static_assert(sizeof...(ColNames)<=column_count, + "too many column names specified"); + set_column_names(std::forward<ColNames>(cols)...); + std::fill(row, row+column_count, nullptr); + col_order.resize(column_count); + for(unsigned i=0; i<column_count; ++i) + col_order[i] = i; + } + + bool has_column(const std::string&name) const { + return col_order.end() != std::find( + col_order.begin(), col_order.end(), + std::find(std::begin(column_names), std::end(column_names), name) + - std::begin(column_names)); + } + + void set_file_name(const std::string&file_name){ + in.set_file_name(file_name); + } + + void set_file_name(const char*file_name){ + in.set_file_name(file_name); + } + + const char*get_truncated_file_name()const{ + return in.get_truncated_file_name(); + } + + void set_file_line(unsigned file_line){ + in.set_file_line(file_line); + } + + unsigned get_file_line()const{ + return in.get_file_line(); + } + + private: + void parse_helper(std::size_t){} + + template<class T, class ...ColType> + void parse_helper(std::size_t r, T&t, ColType&...cols){ + if(row[r]){ + try{ + try{ + ::io::detail::parse<overflow_policy>(row[r], t); + }catch(error::with_column_content&err){ + err.set_column_content(row[r]); + throw; + } + }catch(error::with_column_name&err){ + err.set_column_name(column_names[r].c_str()); + throw; + } + } + parse_helper(r+1, cols...); + } + + + public: + template<class ...ColType> + bool read_row(ColType& ...cols){ + static_assert(sizeof...(ColType)>=column_count, + "not enough columns specified"); + static_assert(sizeof...(ColType)<=column_count, + "too many columns specified"); + try{ + try{ + + char*line; + do{ + line = in.next_line(); + 
if(!line) + return false; + }while(comment_policy::is_comment(line)); + + detail::parse_line<trim_policy, quote_policy> + (line, row, col_order); + + parse_helper(0, cols...); + }catch(error::with_file_name&err){ + err.set_file_name(in.get_truncated_file_name()); + throw; + } + }catch(error::with_file_line&err){ + err.set_file_line(in.get_file_line()); + throw; + } + + return true; + } + }; +} +#endif + diff --git a/src/s3select/include/encryption_internal.h b/src/s3select/include/encryption_internal.h new file mode 100644 index 000000000..c8deee492 --- /dev/null +++ b/src/s3select/include/encryption_internal.h @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include <memory> +#include <string> +#include <vector> + +#include "parquet/properties.h" +#include "parquet/types.h" + +using parquet::ParquetCipher; + +namespace parquet { +namespace encryption { + +constexpr int kGcmTagLength = 16; +constexpr int kNonceLength = 12; + +// Module types +constexpr int8_t kFooter = 0; +constexpr int8_t kColumnMetaData = 1; +constexpr int8_t kDataPage = 2; +constexpr int8_t kDictionaryPage = 3; +constexpr int8_t kDataPageHeader = 4; +constexpr int8_t kDictionaryPageHeader = 5; +constexpr int8_t kColumnIndex = 6; +constexpr int8_t kOffsetIndex = 7; + +/// Performs AES encryption operations with GCM or CTR ciphers. +class AesEncryptor { + public: + static AesEncryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::vector<AesEncryptor*>* all_encryptors); + + ~AesEncryptor(); + + /// Size difference between plaintext and ciphertext, for this cipher. + int CiphertextSizeDelta(); + + /// Encrypts plaintext with the key and aad. Key length is passed only for validation. + /// If different from value in constructor, exception will be thrown. + int Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, uint8_t* ciphertext); + + /// Encrypts plaintext footer, in order to compute footer signature (tag). + int SignedFooterEncrypt(const uint8_t* footer, int footer_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, + const uint8_t* nonce, uint8_t* encrypted_footer); + + void WipeOut(); + + private: + /// Can serve one key length only. Possible values: 16, 24, 32 bytes. + explicit AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata); + // PIMPL Idiom + class AesEncryptorImpl; + std::unique_ptr<AesEncryptorImpl> impl_; +}; + +/// Performs AES decryption operations with GCM or CTR ciphers. 
+class AesDecryptor { + public: + static AesDecryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::vector<AesDecryptor*>* all_decryptors); + + ~AesDecryptor(); + void WipeOut(); + + /// Size difference between plaintext and ciphertext, for this cipher. + int CiphertextSizeDelta(); + + /// Decrypts ciphertext with the key and aad. Key length is passed only for + /// validation. If different from value in constructor, exception will be thrown. + int Decrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, uint8_t* plaintext); + + private: + /// Can serve one key length only. Possible values: 16, 24, 32 bytes. + explicit AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata); + // PIMPL Idiom + class AesDecryptorImpl; + std::unique_ptr<AesDecryptorImpl> impl_; +}; + +std::string CreateModuleAad(const std::string& file_aad, int8_t module_type, + int16_t row_group_ordinal, int16_t column_ordinal, + int16_t page_ordinal); + +std::string CreateFooterAad(const std::string& aad_prefix_bytes); + +// Update last two bytes of page (or page header) module AAD +void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal); + +// Wraps OpenSSL RAND_bytes function +void RandBytes(unsigned char* buf, int num); + +} // namespace encryption +} // namespace parquet diff --git a/src/s3select/include/internal_file_decryptor.h b/src/s3select/include/internal_file_decryptor.h new file mode 100644 index 000000000..011c4acbe --- /dev/null +++ b/src/s3select/include/internal_file_decryptor.h @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
// The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <map>
#include <memory>
#include <string>
#include <vector>

#include "parquet/schema.h"

namespace parquet {

namespace encryption {
class AesDecryptor;
class AesEncryptor;
}  // namespace encryption

class FileDecryptionProperties;

// Bundles an AesDecryptor with the key and the AAD strings it is used with.
// NOTE(review): the AesDecryptor is held as a raw pointer; lifetime and
// ownership are not visible in this header - confirm in the implementation.
class PARQUET_EXPORT Decryptor {
 public:
  Decryptor(encryption::AesDecryptor* decryptor, const std::string& key,
            const std::string& file_aad, const std::string& aad,
            ::arrow::MemoryPool* pool);

  const std::string& file_aad() const { return file_aad_; }
  // Replaces the stored AAD.
  void UpdateAad(const std::string& aad) { aad_ = aad; }
  ::arrow::MemoryPool* pool() { return pool_; }

  int CiphertextSizeDelta();
  int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext);

 private:
  encryption::AesDecryptor* aes_decryptor_;
  std::string key_;
  std::string file_aad_;
  std::string aad_;
  ::arrow::MemoryPool* pool_;
};

// Hands out Decryptor objects for the footer and for individual columns of
// one file.
class InternalFileDecryptor {
 public:
  explicit InternalFileDecryptor(FileDecryptionProperties* properties,
                                 const std::string& file_aad,
                                 ParquetCipher::type algorithm,
                                 const std::string& footer_key_metadata,
                                 ::arrow::MemoryPool* pool);

  std::string& file_aad() { return file_aad_; }

  std::string GetFooterKey();

  ParquetCipher::type algorithm() { return algorithm_; }

  std::string& footer_key_metadata() { return footer_key_metadata_; }

  FileDecryptionProperties* properties() { return properties_; }

  void WipeOutDecryptionKeys();

  ::arrow::MemoryPool* pool() { return pool_; }

  // Footer decryptors.
  std::shared_ptr<Decryptor> GetFooterDecryptor();
  std::shared_ptr<Decryptor> GetFooterDecryptorForColumnMeta(const std::string& aad = "");
  std::shared_ptr<Decryptor> GetFooterDecryptorForColumnData(const std::string& aad = "");
  // Column decryptors, looked up by column path and key metadata.
  std::shared_ptr<Decryptor> GetColumnMetaDecryptor(
      const std::string& column_path, const std::string& column_key_metadata,
      const std::string& aad = "");
  std::shared_ptr<Decryptor> GetColumnDataDecryptor(
      const std::string& column_path, const std::string& column_key_metadata,
      const std::string& aad = "");

 private:
  FileDecryptionProperties* properties_;
  // Concatenation of aad_prefix (if exists) and aad_file_unique
  std::string file_aad_;
  // NOTE(review): presumably keyed by column path - confirm in the implementation.
  std::map<std::string, std::shared_ptr<Decryptor>> column_data_map_;
  std::map<std::string, std::shared_ptr<Decryptor>> column_metadata_map_;

  std::shared_ptr<Decryptor> footer_metadata_decryptor_;
  std::shared_ptr<Decryptor> footer_data_decryptor_;
  ParquetCipher::type algorithm_;
  std::string footer_key_metadata_;
  std::vector<encryption::AesDecryptor*> all_decryptors_;

  /// Key must be 16, 24 or 32 bytes in length. Thus there could be up to three
  // types of meta_decryptors and data_decryptors.
+ std::unique_ptr<encryption::AesDecryptor> meta_decryptor_[3]; + std::unique_ptr<encryption::AesDecryptor> data_decryptor_[3]; + + ::arrow::MemoryPool* pool_; + + std::shared_ptr<Decryptor> GetFooterDecryptor(const std::string& aad, bool metadata); + std::shared_ptr<Decryptor> GetColumnDecryptor(const std::string& column_path, + const std::string& column_key_metadata, + const std::string& aad, + bool metadata = false); + + encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size); + encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size); + + int MapKeyLenToDecryptorArrayIndex(int key_len); +}; + +} // namespace parquet diff --git a/src/s3select/include/s3select.h b/src/s3select/include/s3select.h new file mode 100644 index 000000000..3ac111351 --- /dev/null +++ b/src/s3select/include/s3select.h @@ -0,0 +1,3153 @@ +#ifndef __S3SELECT__ +#define __S3SELECT__ +#define BOOST_SPIRIT_THREADSAFE +#define CSV_IO_NO_THREAD + +#pragma once +#define BOOST_BIND_GLOBAL_PLACEHOLDERS +#include <boost/spirit/include/classic_core.hpp> +#include <boost/algorithm/string.hpp> +#include <iostream> +#include <string> +#include <list> +#include <deque> +#include "s3select_oper.h" +#include "s3select_functions.h" +#include "s3select_csv_parser.h" +#include "s3select_json_parser.h" +#include <boost/function.hpp> +#include <boost/bind.hpp> +#include <functional> + +#define _DEBUG_TERM {string token(a,b);std::cout << __FUNCTION__ << token << std::endl;} + +namespace s3selectEngine +{ + +/// AST builder + +class s3select_projections +{ + +private: + std::vector<base_statement*> m_projections; + +public: + + std::vector<base_statement*>* get() + { + return &m_projections; + } + +}; + +static s3select_reserved_word g_s3select_reserve_word;//read-only + +struct actionQ +{ +// upon parser is accepting a token (lets say some number), +// it push it into dedicated queue, later those tokens are poped out to build some "higher" contruct (lets say 1 + 2) +// those containers are used 
only for parsing phase and not for runtime. + + std::vector<mulldiv_operation::muldiv_t> muldivQ; + std::vector<addsub_operation::addsub_op_t> addsubQ; + std::vector<arithmetic_operand::cmp_t> arithmetic_compareQ; + std::vector<logical_operand::oplog_t> logical_compareQ; + std::vector<base_statement*> exprQ; + std::vector<base_statement*> funcQ; + std::vector<base_statement*> whenThenQ; + std::vector<base_statement*> inPredicateQ; + base_statement* inMainArg; + std::vector<std::string> dataTypeQ; + std::vector<std::string> trimTypeQ; + std::vector<std::string> datePartQ; + projection_alias alias_map; + std::string from_clause; + std::vector<std::string> json_from_clause; + bool limit_op; + unsigned long limit; + std::string column_prefix; + std::string table_alias; + s3select_projections projections; + + bool projection_or_predicate_state; //true->projection false->predicate(where-clause statement) + std::vector<base_statement*> predicate_columns; + std::vector<base_statement*> projections_columns; + base_statement* first_when_then_expr; + + std::string json_array_name; // _1.a[ ] json_array_name = "a"; upon parser is scanning a correct json-path; json_array_name will contain the array name. + std::string json_object_name; // _1.b json_object_name = "b" ; upon parser is scanning a correct json-path; json_object_name will contain the object name. + std::deque<size_t> json_array_index_number; // _1.a.c[ some integer number >=0 ]; upon parser is scanning a correct json-path; json_array_index_number will contain the array index. 
+ // or in the case of multidimensional contain seiries of index number + + json_variable_access json_var_md; + + std::vector<std::pair<json_variable_access*,size_t>> json_statement_variables_match_expression;//contains all statement variables and their search-expression for locating the correct values in input document + + actionQ(): inMainArg(0),from_clause("##"),limit_op(false),column_prefix("##"),table_alias("##"),projection_or_predicate_state(true),first_when_then_expr(nullptr){} + + std::map<const void*,std::vector<const char*> *> x_map; + + ~actionQ() + { + for(auto m : x_map) + delete m.second; + } + + bool is_already_scanned(const void *th,const char *a) + { + //purpose: caller get indication in the case a specific builder is scan more than once the same text(pointer) + auto t = x_map.find(th); + + if(t == x_map.end()) + { + auto v = new std::vector<const char*>; + x_map.insert(std::pair<const void*,std::vector<const char*> *>(th,v)); + v->push_back(a); + } + else + { + for(auto& c : *(t->second)) + { + if( strcmp(c,a) == 0) + return true; + } + t->second->push_back(a); + } + return false; + } + +}; + +class s3select; + +struct base_ast_builder +{ + void operator()(s3select* self, const char* a, const char* b) const; + + virtual void builder(s3select* self, const char* a, const char* b) const = 0; + + virtual ~base_ast_builder() = default; +}; + +struct push_from_clause : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_from_clause g_push_from_clause; + +struct push_json_from_clause : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_json_from_clause g_push_json_from_clause; + +struct push_limit_clause : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_limit_clause g_push_limit_clause; + +struct push_number : public base_ast_builder +{ + void builder(s3select* self, const 
char* a, const char* b) const; +}; +static push_number g_push_number; + +struct push_float_number : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_float_number g_push_float_number; + +struct push_string : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_string g_push_string; + +struct push_variable : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_variable g_push_variable; + +struct push_json_variable : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_json_variable g_push_json_variable; + +/////////////////////////arithmetic unit ///////////////// +struct push_addsub : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_addsub g_push_addsub; + +struct push_mulop : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_mulop g_push_mulop; + +struct push_addsub_binop : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_addsub_binop g_push_addsub_binop; + +struct push_mulldiv_binop : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_mulldiv_binop g_push_mulldiv_binop; + +struct push_function_arg : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_function_arg g_push_function_arg; + +struct push_function_name : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_function_name g_push_function_name; + +struct push_function_expr : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_function_expr 
g_push_function_expr; + +struct push_cast_expr : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_cast_expr g_push_cast_expr; + +struct push_cast_decimal_expr : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_cast_decimal_expr g_push_cast_decimal_expr; + +struct push_decimal_operator : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_decimal_operator g_push_decimal_operator; + +struct push_data_type : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_data_type g_push_data_type; + +////////////////////// logical unit //////////////////////// + +struct push_compare_operator : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; + +}; +static push_compare_operator g_push_compare_operator; + +struct push_logical_operator : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; + +}; +static push_logical_operator g_push_logical_operator; + +struct push_arithmetic_predicate : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; + +}; +static push_arithmetic_predicate g_push_arithmetic_predicate; + +struct push_logical_predicate : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_logical_predicate g_push_logical_predicate; + +struct push_negation : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_negation g_push_negation; + +struct push_column_pos : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_column_pos g_push_column_pos; + +struct push_projection : public base_ast_builder +{ + void builder(s3select* self, const 
char* a, const char* b) const; +}; +static push_projection g_push_projection; + +struct push_alias_projection : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_alias_projection g_push_alias_projection; + +struct push_between_filter : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_between_filter g_push_between_filter; + +struct push_not_between_filter : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_not_between_filter g_push_not_between_filter; + +struct push_in_predicate : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_in_predicate g_push_in_predicate; + +struct push_in_predicate_arguments : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_in_predicate_arguments g_push_in_predicate_arguments; + +struct push_in_predicate_first_arg : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_in_predicate_first_arg g_push_in_predicate_first_arg; + +struct push_like_predicate_escape : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_like_predicate_escape g_push_like_predicate_escape; + +struct push_like_predicate_no_escape : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_like_predicate_no_escape g_push_like_predicate_no_escape; + +struct push_is_null_predicate : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_is_null_predicate g_push_is_null_predicate; + +struct push_case_when_else : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_case_when_else 
g_push_case_when_else; + +struct push_when_condition_then : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_when_condition_then g_push_when_condition_then; + +struct push_when_value_then : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_when_value_then g_push_when_value_then; + +struct push_case_value_when_value_else : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_case_value_when_value_else g_push_case_value_when_value_else; + +struct push_substr_from : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_substr_from g_push_substr_from; + +struct push_substr_from_for : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_substr_from_for g_push_substr_from_for; + +struct push_trim_type : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_trim_type g_push_trim_type; + +struct push_trim_whitespace_both : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_trim_whitespace_both g_push_trim_whitespace_both; + +struct push_trim_expr_one_side_whitespace : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_trim_expr_one_side_whitespace g_push_trim_expr_one_side_whitespace; + +struct push_trim_expr_anychar_anyside : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_trim_expr_anychar_anyside g_push_trim_expr_anychar_anyside; + +struct push_datediff : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_datediff g_push_datediff; + +struct push_dateadd : public 
base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_dateadd g_push_dateadd; + +struct push_extract : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_extract g_push_extract; + +struct push_date_part : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_date_part g_push_date_part; + +struct push_time_to_string_constant : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_time_to_string_constant g_push_time_to_string_constant; + +struct push_time_to_string_dynamic : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_time_to_string_dynamic g_push_time_to_string_dynamic; + +struct push_string_to_time_constant : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_string_to_time_constant g_push_string_to_time_constant; + +struct push_array_number : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_array_number g_push_array_number; + +struct push_json_array_name : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_json_array_name g_push_json_array_name; + +struct push_json_object : public base_ast_builder +{ + void builder(s3select* self, const char* a, const char* b) const; +}; +static push_json_object g_push_json_object; + +struct s3select : public bsc::grammar<s3select> +{ +private: + + actionQ m_actionQ; + scratch_area m_sca; + s3select_functions m_s3select_functions; + std::string error_description; + s3select_allocator m_s3select_allocator; + bool aggr_flow = false; + bool m_json_query = false; + std::set<base_statement*> m_ast_nodes_to_delete; + base_function* m_to_timestamp_for_clean = 
nullptr; + +#define BOOST_BIND_ACTION( push_name ) boost::bind( &push_name::operator(), g_ ## push_name, const_cast<s3select*>(&self), _1, _2) + +public: + + std::set<base_statement*>& get_ast_nodes_to_delete() + { + return m_ast_nodes_to_delete; + } + + base_function* & get_to_timestamp_for_clean() + { + return m_to_timestamp_for_clean; + } + + actionQ* getAction() + { + return &m_actionQ; + } + + s3select_allocator* getAllocator() + { + return &m_s3select_allocator; + } + + s3select_functions* getS3F() + { + return &m_s3select_functions; + } + + int semantic() + { + for (const auto &e : get_projections_list()) + { + e->resolve_node(); + //upon validate there is no aggregation-function nested calls, it validates legit aggregation call. + if (e->is_nested_aggregate(aggr_flow)) + { + error_description = "nested aggregation function is illegal i.e. sum(...sum ...)"; + throw base_s3select_exception(error_description, base_s3select_exception::s3select_exp_en_t::FATAL); + } + + e->push_for_cleanup(m_ast_nodes_to_delete); + } + + if(get_filter()) + get_filter()->push_for_cleanup(m_ast_nodes_to_delete); + + if (aggr_flow == true) + {// atleast one projection column contain aggregation function + for (const auto &e : get_projections_list()) + { + auto aggregate_expr = e->get_aggregate(); + + if (aggregate_expr) + { + //per each column, subtree is mark to skip except for the aggregation function subtree. + //for an example: substring( ... , sum() , count() ) :: the substring is mark to skip execution, while sum and count not. + e->set_skip_non_aggregate(true); + e->mark_aggreagtion_subtree_to_execute(); + } + else + { + //in case projection column is not aggregate, the projection column must *not* contain reference to columns. 
+ if(e->is_column_reference()) + { + error_description = "illegal query; projection contains aggregation function is not allowed with projection contains column reference"; + throw base_s3select_exception(error_description, base_s3select_exception::s3select_exp_en_t::FATAL); + } + } + + } + } + + m_json_query = (m_actionQ.json_from_clause.size() != 0); + + return 0; + } + + int parse_query(const char* input_query) + { + if(get_projections_list().empty() == false) + { + return 0; //already parsed + } + + + error_description.clear(); + aggr_flow = false; + + try + { + bsc::parse_info<> info = bsc::parse(input_query, *this, bsc::space_p); + auto query_parse_position = info.stop; + + if (!info.full) + { + error_description = std::string("failure -->") + query_parse_position + std::string("<---"); + return -1; + } + + semantic(); + } + catch (base_s3select_exception& e) + { + error_description.assign(e.what()); + if (e.severity() == base_s3select_exception::s3select_exp_en_t::FATAL) //abort query execution + { + return -1; + } + } + + return 0; + } + + std::string get_error_description() + { + return error_description; + } + + s3select() + { + m_s3select_functions.setAllocator(&m_s3select_allocator); + m_s3select_functions.set_AST_nodes_for_cleanup(&m_ast_nodes_to_delete); + } + + bool is_semantic()//TBD traverse and validate semantics per all nodes + { + base_statement* cond = m_actionQ.exprQ.back(); + + return cond->semantic(); + } + + std::string get_from_clause() const + { + return m_actionQ.from_clause; + } + + bool is_limit() + { + return m_actionQ.limit_op; + } + + unsigned long get_limit() + { + return m_actionQ.limit; + } + + void load_schema(std::vector< std::string>& scm) + { + int i = 0; + for (auto& c : scm) + { + m_sca.set_column_pos(c.c_str(), i++); + } + } + + base_statement* get_filter() + { + if(m_actionQ.exprQ.empty()) + { + return nullptr; + } + + return m_actionQ.exprQ.back(); + } + + std::vector<base_statement*> get_projections_list() + { + return 
*m_actionQ.projections.get(); //TODO return COPY(?) or to return evalaution results (list of class value{}) / return reference(?) + } + + scratch_area* get_scratch_area() + { + return &m_sca; + } + + projection_alias* get_aliases() + { + return &m_actionQ.alias_map; + } + + std::vector<std::pair<json_variable_access*,size_t>>& get_json_variables_access() + { + return m_actionQ.json_statement_variables_match_expression; + } + + bool is_aggregate_query() const + { + return aggr_flow == true; + } + + bool is_json_query() + { + return m_json_query; + } + + ~s3select() + { + for(auto it : m_ast_nodes_to_delete) + { + if (it->is_function()) + {//upon its a function, call to the implementation destructor + if(dynamic_cast<__function*>(it)->impl()) + dynamic_cast<__function*>(it)->impl()->dtor(); + } + //calling to destrcutor of class-function itself, or non-function destructor + it->dtor(); + } + + for(auto x: m_actionQ.json_statement_variables_match_expression) + {//the json_variable_access object is allocated by S3SELECT_NEW. 
this object contains stl-vector that should be free + x.first->~json_variable_access(); + } + if(m_to_timestamp_for_clean) + { + m_to_timestamp_for_clean->dtor(); + } + } + +#define JSON_ROOT_OBJECT "s3object[*]" + +//the input is converted to lower case +#define S3SELECT_KW( reserve_word ) bsc::as_lower_d[ reserve_word ] + + template <typename ScannerT> + struct definition + { + explicit definition(s3select const& self) + { + ///// s3select syntax rules and actions for building AST + + select_expr = select_expr_base_ >> bsc::lexeme_d[ *(bsc::str_p(" ")|bsc::str_p(";")) ]; + + select_expr_base_ = select_expr_base >> S3SELECT_KW("limit") >> (limit_number)[BOOST_BIND_ACTION(push_limit_clause)] | select_expr_base; + + limit_number = (+bsc::digit_p); + + select_expr_base = S3SELECT_KW("select") >> projections >> S3SELECT_KW("from") >> (from_expression)[BOOST_BIND_ACTION(push_from_clause)] >> !where_clause ; + + projections = projection_expression >> *( ',' >> projection_expression) ; + + projection_expression = (arithmetic_expression >> S3SELECT_KW("as") >> alias_name)[BOOST_BIND_ACTION(push_alias_projection)] | + (arithmetic_expression)[BOOST_BIND_ACTION(push_projection)] | + (arithmetic_predicate >> S3SELECT_KW("as") >> alias_name)[BOOST_BIND_ACTION(push_alias_projection)] | + (arithmetic_predicate)[BOOST_BIND_ACTION(push_projection)] ; + + alias_name = bsc::lexeme_d[(+bsc::alpha_p >> *bsc::digit_p)] ; + + when_case_else_projection = (S3SELECT_KW("case") >> (+when_stmt) >> S3SELECT_KW("else") >> arithmetic_expression >> S3SELECT_KW("end")) [BOOST_BIND_ACTION(push_case_when_else)]; + + when_stmt = (S3SELECT_KW("when") >> condition_expression >> S3SELECT_KW("then") >> arithmetic_expression)[BOOST_BIND_ACTION(push_when_condition_then)]; + + when_case_value_when = (S3SELECT_KW("case") >> arithmetic_expression >> + (+when_value_then) >> S3SELECT_KW("else") >> arithmetic_expression >> S3SELECT_KW("end")) [BOOST_BIND_ACTION(push_case_value_when_value_else)]; + + 
when_value_then = (S3SELECT_KW("when") >> arithmetic_expression >> S3SELECT_KW("then") >> arithmetic_expression)[BOOST_BIND_ACTION(push_when_value_then)]; + + from_expression = (s3_object >> variable_name ) | s3_object; + + //the stdin and object_path are for debug purposes(not part of the specs) + s3_object = json_s3_object | S3SELECT_KW("stdin") | S3SELECT_KW("s3object") | object_path; + + json_s3_object = ((S3SELECT_KW(JSON_ROOT_OBJECT)) >> *(bsc::str_p(".") >> json_path_element))[BOOST_BIND_ACTION(push_json_from_clause)]; + + json_path_element = bsc::lexeme_d[+( bsc::alnum_p | bsc::str_p("_")) ]; + + object_path = "/" >> *( fs_type >> "/") >> fs_type; + + fs_type = bsc::lexeme_d[+( bsc::alnum_p | bsc::str_p(".") | bsc::str_p("_")) ]; + + where_clause = S3SELECT_KW("where") >> condition_expression; + + condition_expression = arithmetic_predicate; + + arithmetic_predicate = (S3SELECT_KW("not") >> logical_predicate)[BOOST_BIND_ACTION(push_negation)] | logical_predicate; + + logical_predicate = (logical_and) >> *(or_op[BOOST_BIND_ACTION(push_logical_operator)] >> (logical_and)[BOOST_BIND_ACTION(push_logical_predicate)]); + + logical_and = (cmp_operand) >> *(and_op[BOOST_BIND_ACTION(push_logical_operator)] >> (cmp_operand)[BOOST_BIND_ACTION(push_logical_predicate)]); + + cmp_operand = special_predicates | (factor) >> *(arith_cmp[BOOST_BIND_ACTION(push_compare_operator)] >> (factor)[BOOST_BIND_ACTION(push_arithmetic_predicate)]); + + special_predicates = (is_null) | (is_not_null) | (between_predicate) | (not_between) | (in_predicate) | (like_predicate); + + is_null = ((factor) >> S3SELECT_KW("is") >> S3SELECT_KW("null"))[BOOST_BIND_ACTION(push_is_null_predicate)]; + + is_not_null = ((factor) >> S3SELECT_KW("is") >> S3SELECT_KW("not") >> S3SELECT_KW("null"))[BOOST_BIND_ACTION(push_is_null_predicate)]; + + between_predicate = (arithmetic_expression >> S3SELECT_KW("between") >> arithmetic_expression >> S3SELECT_KW("and") >> 
arithmetic_expression)[BOOST_BIND_ACTION(push_between_filter)]; + + not_between = (arithmetic_expression >> S3SELECT_KW("not") >> S3SELECT_KW("between") >> arithmetic_expression >> S3SELECT_KW("and") >> arithmetic_expression)[BOOST_BIND_ACTION(push_not_between_filter)]; + + in_predicate = (arithmetic_expression >> S3SELECT_KW("in") >> '(' >> arithmetic_expression[BOOST_BIND_ACTION(push_in_predicate_first_arg)] >> *(',' >> arithmetic_expression[BOOST_BIND_ACTION(push_in_predicate_arguments)]) >> ')')[BOOST_BIND_ACTION(push_in_predicate)]; + + like_predicate = (like_predicate_escape) |(like_predicate_no_escape); + + like_predicate_no_escape = (arithmetic_expression >> S3SELECT_KW("like") >> arithmetic_expression)[BOOST_BIND_ACTION(push_like_predicate_no_escape)]; + + like_predicate_escape = (arithmetic_expression >> S3SELECT_KW("like") >> arithmetic_expression >> S3SELECT_KW("escape") >> arithmetic_expression)[BOOST_BIND_ACTION(push_like_predicate_escape)]; + + factor = arithmetic_expression | ( '(' >> arithmetic_predicate >> ')' ) ; + + arithmetic_expression = (addsub_operand >> *(addsubop_operator[BOOST_BIND_ACTION(push_addsub)] >> addsub_operand[BOOST_BIND_ACTION(push_addsub_binop)] )); + + addsub_operand = (mulldiv_operand >> *(muldiv_operator[BOOST_BIND_ACTION(push_mulop)] >> mulldiv_operand[BOOST_BIND_ACTION(push_mulldiv_binop)] ));// this non-terminal gives precedense to mull/div + + mulldiv_operand = arithmetic_argument | ('(' >> (arithmetic_expression) >> ')') ; + + list_of_function_arguments = (arithmetic_expression)[BOOST_BIND_ACTION(push_function_arg)] >> *(',' >> (arithmetic_expression)[BOOST_BIND_ACTION(push_function_arg)]); + + reserved_function_names = (S3SELECT_KW("when")|S3SELECT_KW("case")|S3SELECT_KW("then")|S3SELECT_KW("not")|S3SELECT_KW("limit")|S3SELECT_KW("where")|S3SELECT_KW("in")|S3SELECT_KW("between") | + S3SELECT_KW("like")|S3SELECT_KW("is") ); + + function = ( ((variable_name) >> '(' )[BOOST_BIND_ACTION(push_function_name)] >> 
!list_of_function_arguments >> ')')[BOOST_BIND_ACTION(push_function_expr)]; + + arithmetic_argument = (float_number)[BOOST_BIND_ACTION(push_float_number)] | (number)[BOOST_BIND_ACTION(push_number)] | (json_variable_name)[BOOST_BIND_ACTION(push_json_variable)] | + (column_pos)[BOOST_BIND_ACTION(push_column_pos)] | + (string)[BOOST_BIND_ACTION(push_string)] | (backtick_string) | (datediff) | (dateadd) | (extract) | (time_to_string_constant) | (time_to_string_dynamic) | + (cast) | (substr) | (trim) | (when_case_value_when) | (when_case_else_projection) | + (function) | (variable)[BOOST_BIND_ACTION(push_variable)]; //function is pushed by right-term + + cast = cast_as_data_type | cast_as_decimal_expr ; + + cast_as_data_type = (S3SELECT_KW("cast") >> '(' >> factor >> S3SELECT_KW("as") >> (data_type) >> ')') [BOOST_BIND_ACTION(push_cast_expr)]; + + cast_as_decimal_expr = (S3SELECT_KW("cast") >> '(' >> factor >> S3SELECT_KW("as") >> decimal_operator >> ')') [BOOST_BIND_ACTION(push_cast_decimal_expr)]; + + decimal_operator = (S3SELECT_KW("decimal") >> '(' >> (number)[BOOST_BIND_ACTION(push_number)] >> ',' >> (number)[BOOST_BIND_ACTION(push_number)] >> ')') + [BOOST_BIND_ACTION(push_decimal_operator)]; + + data_type = (S3SELECT_KW("int") | S3SELECT_KW("float") | S3SELECT_KW("string") | S3SELECT_KW("timestamp") | S3SELECT_KW("bool"))[BOOST_BIND_ACTION(push_data_type)]; + + substr = (substr_from) | (substr_from_for); + + substr_from = (S3SELECT_KW("substring") >> '(' >> (arithmetic_expression >> S3SELECT_KW("from") >> arithmetic_expression) >> ')') [BOOST_BIND_ACTION(push_substr_from)]; + + substr_from_for = (S3SELECT_KW("substring") >> '(' >> (arithmetic_expression >> S3SELECT_KW("from") >> arithmetic_expression >> S3SELECT_KW("for") >> arithmetic_expression) >> ')') [BOOST_BIND_ACTION(push_substr_from_for)]; + + trim = (trim_whitespace_both) | (trim_one_side_whitespace) | (trim_anychar_anyside); + + trim_one_side_whitespace = (S3SELECT_KW("trim") >> '(' >> 
(trim_type)[BOOST_BIND_ACTION(push_trim_type)] >> arithmetic_expression >> ')') [BOOST_BIND_ACTION(push_trim_expr_one_side_whitespace)]; + + trim_whitespace_both = (S3SELECT_KW("trim") >> '(' >> arithmetic_expression >> ')') [BOOST_BIND_ACTION(push_trim_whitespace_both)]; + + trim_anychar_anyside = (S3SELECT_KW("trim") >> '(' >> ((trim_remove_type)[BOOST_BIND_ACTION(push_trim_type)] >> arithmetic_expression >> S3SELECT_KW("from") >> arithmetic_expression) >> ')') [BOOST_BIND_ACTION(push_trim_expr_anychar_anyside)]; + + trim_type = ((S3SELECT_KW("leading") >> S3SELECT_KW("from")) | ( S3SELECT_KW("trailing") >> S3SELECT_KW("from")) | (S3SELECT_KW("both") >> S3SELECT_KW("from")) | S3SELECT_KW("from") ); + + trim_remove_type = (S3SELECT_KW("leading") | S3SELECT_KW("trailing") | S3SELECT_KW("both") ); + + datediff = (S3SELECT_KW("date_diff") >> '(' >> date_part >> ',' >> arithmetic_expression >> ',' >> arithmetic_expression >> ')') [BOOST_BIND_ACTION(push_datediff)]; + + dateadd = (S3SELECT_KW("date_add") >> '(' >> date_part >> ',' >> arithmetic_expression >> ',' >> arithmetic_expression >> ')') [BOOST_BIND_ACTION(push_dateadd)]; + + extract = (S3SELECT_KW("extract") >> '(' >> (date_part_extract)[BOOST_BIND_ACTION(push_date_part)] >> S3SELECT_KW("from") >> arithmetic_expression >> ')') [BOOST_BIND_ACTION(push_extract)]; + + date_part = (S3SELECT_KW("year") | S3SELECT_KW("month") | S3SELECT_KW("day") | S3SELECT_KW("hour") | S3SELECT_KW("minute") | S3SELECT_KW("second")) [BOOST_BIND_ACTION(push_date_part)]; + + date_part_extract = ((date_part) | S3SELECT_KW("week") | S3SELECT_KW("timezone_hour") | S3SELECT_KW("timezone_minute")); + + time_to_string_constant = (S3SELECT_KW("to_string") >> '(' >> arithmetic_expression >> ',' >> (string)[BOOST_BIND_ACTION(push_string)] >> ')') [BOOST_BIND_ACTION(push_time_to_string_constant)]; + + time_to_string_dynamic = (S3SELECT_KW("to_string") >> '(' >> arithmetic_expression >> ',' >> arithmetic_expression >> ')') 
[BOOST_BIND_ACTION(push_time_to_string_dynamic)]; + + number = bsc::int_p; + + float_number = bsc::real_p; + + string = (bsc::str_p("\"") >> *( bsc::anychar_p - bsc::str_p("\"") ) >> bsc::str_p("\"")) | (bsc::str_p("\'") >> *( bsc::anychar_p - bsc::str_p("\'") ) >> bsc::str_p("\'")); + + backtick_string = (bsc::str_p("`") >> *( bsc::anychar_p - bsc::str_p("`") ) >> bsc::str_p("`")) [BOOST_BIND_ACTION(push_string_to_time_constant)]; + + column_pos = (variable_name >> "." >> column_pos_name) | column_pos_name; //TODO what about space + + column_pos_name = ('_'>>+(bsc::digit_p) ) | '*' ; + + muldiv_operator = bsc::str_p("*") | bsc::str_p("/") | bsc::str_p("^") | bsc::str_p("%");// got precedense + + addsubop_operator = bsc::str_p("+") | bsc::str_p("-"); + + arith_cmp = bsc::str_p("<>") | bsc::str_p(">=") | bsc::str_p("<=") | bsc::str_p("=") | bsc::str_p("<") | bsc::str_p(">") | bsc::str_p("!="); + + and_op = S3SELECT_KW("and"); + + or_op = S3SELECT_KW("or"); + + variable_name = bsc::lexeme_d[(+bsc::alpha_p >> *( bsc::alpha_p | bsc::digit_p | '_') ) - reserved_function_names]; + + variable = (variable_name >> "." >> variable_name) | variable_name; + + json_variable_name = bsc::str_p("_1") >> +("." 
>> (json_array | json_object) ); + + json_object = (variable_name)[BOOST_BIND_ACTION(push_json_object)]; + + json_array = (variable_name >> +(bsc::str_p("[") >> number[BOOST_BIND_ACTION(push_array_number)] >> bsc::str_p("]")) )[BOOST_BIND_ACTION(push_json_array_name)]; + } + + + bsc::rule<ScannerT> cast, data_type, variable, json_variable_name, variable_name, select_expr, select_expr_base, select_expr_base_, s3_object, where_clause, limit_number; + bsc::rule<ScannerT> number, float_number, string, backtick_string, from_expression, cast_as_data_type, cast_as_decimal_expr, decimal_operator; + bsc::rule<ScannerT> cmp_operand, arith_cmp, condition_expression, arithmetic_predicate, logical_predicate, factor; + bsc::rule<ScannerT> trim, trim_whitespace_both, trim_one_side_whitespace, trim_anychar_anyside, trim_type, trim_remove_type, substr, substr_from, substr_from_for; + bsc::rule<ScannerT> datediff, dateadd, extract, date_part, date_part_extract, time_to_string_constant, time_to_string_dynamic; + bsc::rule<ScannerT> special_predicates, between_predicate, not_between, in_predicate, like_predicate, like_predicate_escape, like_predicate_no_escape, is_null, is_not_null; + bsc::rule<ScannerT> muldiv_operator, addsubop_operator, function, arithmetic_expression, addsub_operand, list_of_function_arguments, arithmetic_argument, mulldiv_operand, reserved_function_names; + bsc::rule<ScannerT> fs_type, object_path,json_s3_object,json_path_element,json_object,json_array; + bsc::rule<ScannerT> projections, projection_expression, alias_name, column_pos,column_pos_name; + bsc::rule<ScannerT> when_case_else_projection, when_case_value_when, when_stmt, when_value_then; + bsc::rule<ScannerT> logical_and,and_op,or_op; + bsc::rule<ScannerT> const& start() const + { + return select_expr ; + } + }; +}; + +void base_ast_builder::operator()(s3select *self, const char *a, const char *b) const +{ + //the purpose of the following procedure is to bypass boost::spirit rescan (calling to 
bind-action more than once per the same text) + //which cause wrong AST creation (and later false execution). + if (self->getAction()->is_already_scanned((void *)(this), const_cast<char *>(a))) + return; + + builder(self, a, b); +} + +void push_from_clause::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b),table_name,alias_name; + + //should search for generic space + if(token.find(' ') != std::string::npos) + { + size_t pos = token.find(' '); + table_name = token.substr(0,pos); + + pos = token.rfind(' '); + alias_name = token.substr(pos+1,token.size()); + + self->getAction()->table_alias = alias_name; + + if(self->getAction()->column_prefix != "##" && self->getAction()->table_alias != self->getAction()->column_prefix) + { + throw base_s3select_exception(std::string("query can not contain more then a single table-alias"), base_s3select_exception::s3select_exp_en_t::FATAL); + } + + token = table_name; + } + + self->getAction()->from_clause = token; + + self->getAction()->exprQ.clear(); +} + +void push_json_from_clause::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b),table_name,alias_name; + + //TODO handle the star-operation ('*') in from-clause. build the parameters for json-reader search-api's. 
+ std::vector<std::string> variable_key_path; + const char* delimiter = "."; + auto pos = token.find(delimiter); + + if(pos != std::string::npos) + { + token = token.substr(strlen(JSON_ROOT_OBJECT)+1,token.size()); + pos = token.find(delimiter); + do + { + variable_key_path.push_back(token.substr(0,pos)); + if(pos != std::string::npos) + token = token.substr(pos+1,token.size()); + else + token = ""; + pos = token.find(delimiter); + }while(token.size()); + } + else + { + variable_key_path.push_back(JSON_ROOT_OBJECT); + } + + self->getAction()->json_from_clause = variable_key_path; +} + +void push_limit_clause::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + self->getAction()->limit_op = true; + try + { + self->getAction()->limit = std::stoul(token); + } + catch(std::invalid_argument& e) + { + throw base_s3select_exception(std::string("Invalid argument "), base_s3select_exception::s3select_exp_en_t::FATAL); + } + catch(std::out_of_range& e) + { + throw base_s3select_exception(std::string("Out of range "), base_s3select_exception::s3select_exp_en_t::FATAL); + } +} + +void push_number::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + variable* v = S3SELECT_NEW(self, variable, atoi(token.c_str())); + + self->getAction()->exprQ.push_back(v); +} + +void push_float_number::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + //the parser for float(real_p) is accepting also integers, thus "blocking" integer acceptence and all are float. 
+ bsc::parse_info<> info = bsc::parse(token.c_str(), bsc::int_p, bsc::space_p); + + if (!info.full) + { + char* perr; + double d = strtod(token.c_str(), &perr); + variable* v = S3SELECT_NEW(self, variable, d); + + self->getAction()->exprQ.push_back(v); + } + else + { + variable* v = S3SELECT_NEW(self, variable, atoi(token.c_str())); + + self->getAction()->exprQ.push_back(v); + } +} + +void push_string::builder(s3select* self, const char* a, const char* b) const +{ + a++; + b--; // remove double quotes + std::string token(a, b); + + variable* v = S3SELECT_NEW(self, variable, token, variable::var_t::COLUMN_VALUE); + + self->getAction()->exprQ.push_back(v); +} + +void push_variable::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + variable* v = nullptr; + + if (g_s3select_reserve_word.is_reserved_word(token)) + { + if (g_s3select_reserve_word.get_reserved_word(token) == s3select_reserved_word::reserve_word_en_t::S3S_NULL) + { + v = S3SELECT_NEW(self, variable, s3select_reserved_word::reserve_word_en_t::S3S_NULL); + } + else if (g_s3select_reserve_word.get_reserved_word(token) == s3select_reserved_word::reserve_word_en_t::S3S_NAN) + { + v = S3SELECT_NEW(self, variable, s3select_reserved_word::reserve_word_en_t::S3S_NAN); + } + else if (g_s3select_reserve_word.get_reserved_word(token) == s3select_reserved_word::reserve_word_en_t::S3S_FALSE) + { + v = S3SELECT_NEW(self, variable, s3select_reserved_word::reserve_word_en_t::S3S_FALSE); + } + else if (g_s3select_reserve_word.get_reserved_word(token) == s3select_reserved_word::reserve_word_en_t::S3S_TRUE) + { + v = S3SELECT_NEW(self, variable, s3select_reserved_word::reserve_word_en_t::S3S_TRUE); + } + else + { + v = S3SELECT_NEW(self, variable, s3select_reserved_word::reserve_word_en_t::NA); + } + + } + else + { + size_t pos = token.find('.'); + std::string alias_name; + if(pos != std::string::npos) + { + alias_name = token.substr(0,pos); + pos ++; + token = 
token.substr(pos,token.size()); + + if(self->getAction()->column_prefix != "##" && alias_name != self->getAction()->column_prefix) + { + throw base_s3select_exception(std::string("query can not contain more then a single table-alias"), base_s3select_exception::s3select_exp_en_t::FATAL); + } + + self->getAction()->column_prefix = alias_name; + } + v = S3SELECT_NEW(self, variable, token); + } + + self->getAction()->exprQ.push_back(v); +} + +void push_json_variable::builder(s3select* self, const char* a, const char* b) const +{//purpose: handle the use case of json-variable structure (_1.a.b.c) + + std::string token(a, b); + std::vector<std::string> variable_key_path; + + //the following flow determine the index per json variable reside on statement. + //per each discovered json_variable, it search the json-variables-vector whether it already exists. + //in case it is exist, it uses its index (position in vector) + //in case it's not exist its pushes the variable into vector. + //the json-index is used upon updating the scratch area or searching for a specific json-variable value. 
+ + size_t json_index=self->getAction()->json_statement_variables_match_expression.size(); + variable* v = nullptr; + json_variable_access* ja = S3SELECT_NEW(self, json_variable_access); + *ja = self->getAction()->json_var_md; + self->getAction()->json_statement_variables_match_expression.push_back(std::pair<json_variable_access*,size_t>(ja,json_index)); + + v = S3SELECT_NEW(self, variable, token, variable::var_t::JSON_VARIABLE, json_index); + self->getAction()->exprQ.push_back(v); + + self->getAction()->json_var_md.clear(); +} + +void push_array_number::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + //DEBUG - TEMP std::cout << "push_array_number " << token << std::endl; + + self->getAction()->json_array_index_number.push_back(std::stoll(token.c_str())); +} + +void push_json_array_name::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + size_t found = token.find("["); + std::string array_name = token.substr(0,found); + + //DEBUG - TEMP std::cout << "push_json_array_name " << array_name << std::endl; + + //remove white-space + array_name.erase(std::remove_if(array_name.begin(), + array_name.end(), + [](unsigned char x){return std::isspace(x);}), + array_name.end()); + + std::vector<std::string> json_path; + std::vector<std::string> empty = {}; + json_path.push_back(array_name); + + self->getAction()->json_var_md.push_variable_state(json_path, -1);//pushing the array-name, {-1} means, search for object-name + + while(self->getAction()->json_array_index_number.size()) + { + self->getAction()->json_var_md.push_variable_state(empty, self->getAction()->json_array_index_number.front());//pushing empty and number>=0, means array-access + self->getAction()->json_array_index_number.pop_front(); + } +} + +void push_json_object::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + //DEBUG - TEMP std::cout << "push_json_object " << token << std::endl; + 
+ self->getAction()->json_object_name = token; + std::vector<std::string> json_path; + json_path.push_back(token); + + self->getAction()->json_var_md.push_variable_state(json_path, -1); +} + +void push_addsub::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + if (token == "+") + { + self->getAction()->addsubQ.push_back(addsub_operation::addsub_op_t::ADD); + } + else + { + self->getAction()->addsubQ.push_back(addsub_operation::addsub_op_t::SUB); + } +} + +void push_mulop::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + if (token == "*") + { + self->getAction()->muldivQ.push_back(mulldiv_operation::muldiv_t::MULL); + } + else if (token == "/") + { + self->getAction()->muldivQ.push_back(mulldiv_operation::muldiv_t::DIV); + } + else if(token == "^") + { + self->getAction()->muldivQ.push_back(mulldiv_operation::muldiv_t::POW); + } + else + { + self->getAction()->muldivQ.push_back(mulldiv_operation::muldiv_t::MOD); + } +} + +void push_addsub_binop::builder(s3select* self, [[maybe_unused]] const char* a,[[maybe_unused]] const char* b) const +{ + base_statement* l = nullptr, *r = nullptr; + + r = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + l = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + addsub_operation::addsub_op_t o = self->getAction()->addsubQ.back(); + self->getAction()->addsubQ.pop_back(); + addsub_operation* as = S3SELECT_NEW(self, addsub_operation, l, o, r); + self->getAction()->exprQ.push_back(as); +} + +void push_mulldiv_binop::builder(s3select* self, [[maybe_unused]] const char* a, [[maybe_unused]] const char* b) const +{ + base_statement* vl = nullptr, *vr = nullptr; + + vr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + vl = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + mulldiv_operation::muldiv_t o = self->getAction()->muldivQ.back(); + 
self->getAction()->muldivQ.pop_back(); + mulldiv_operation* f = S3SELECT_NEW(self, mulldiv_operation, vl, o, vr); + self->getAction()->exprQ.push_back(f); +} + +void push_function_arg::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + base_statement* be = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + base_statement* f = self->getAction()->funcQ.back(); + + if (dynamic_cast<__function*>(f)) + { + dynamic_cast<__function*>(f)->push_argument(be); + } +} + +void push_function_name::builder(s3select* self, const char* a, const char* b) const +{ + b--; + while (*b == '(' || *b == ' ') + { + b--; //point to function-name + } + + std::string fn; + fn.assign(a, b - a + 1); + + __function* func = S3SELECT_NEW(self, __function, fn.c_str(), self->getS3F()); + self->getAction()->funcQ.push_back(func); +} + +void push_function_expr::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + base_statement* func = self->getAction()->funcQ.back(); + self->getAction()->funcQ.pop_back(); + + self->getAction()->exprQ.push_back(func); +} + +void push_compare_operator::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + arithmetic_operand::cmp_t c = arithmetic_operand::cmp_t::NA; + + if (token == "=") + { + c = arithmetic_operand::cmp_t::EQ; + } + else if (token == "!=" || token == "<>") + { + c = arithmetic_operand::cmp_t::NE; + } + else if (token == ">=") + { + c = arithmetic_operand::cmp_t::GE; + } + else if (token == "<=") + { + c = arithmetic_operand::cmp_t::LE; + } + else if (token == ">") + { + c = arithmetic_operand::cmp_t::GT; + } + else if (token == "<") + { + c = arithmetic_operand::cmp_t::LT; + } + + self->getAction()->arithmetic_compareQ.push_back(c); +} + +void push_logical_operator::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + logical_operand::oplog_t l = 
logical_operand::oplog_t::NA; + + if (token == "and") + { + l = logical_operand::oplog_t::AND; + } + else if (token == "or") + { + l = logical_operand::oplog_t::OR; + } + + self->getAction()->logical_compareQ.push_back(l); +} + +void push_arithmetic_predicate::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + base_statement* vr, *vl; + arithmetic_operand::cmp_t c = self->getAction()->arithmetic_compareQ.back(); + self->getAction()->arithmetic_compareQ.pop_back(); + + if (!self->getAction()->exprQ.empty()) + { + vr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + } + else + { + throw base_s3select_exception(std::string("missing right operand for arithmetic-comparision expression"), base_s3select_exception::s3select_exp_en_t::FATAL); + } + + if (!self->getAction()->exprQ.empty()) + { + vl = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + } + else + { + throw base_s3select_exception(std::string("missing left operand for arithmetic-comparision expression"), base_s3select_exception::s3select_exp_en_t::FATAL); + } + + arithmetic_operand* t = S3SELECT_NEW(self, arithmetic_operand, vl, c, vr); + + self->getAction()->exprQ.push_back(t); +} + +void push_logical_predicate::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + base_statement* tl = nullptr, *tr = nullptr; + logical_operand::oplog_t oplog = self->getAction()->logical_compareQ.back(); + self->getAction()->logical_compareQ.pop_back(); + + if (self->getAction()->exprQ.empty() == false) + { + tr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + } + else + {//should reject by syntax parser + throw base_s3select_exception(std::string("missing right operand for logical expression"), base_s3select_exception::s3select_exp_en_t::FATAL); + } + + if (self->getAction()->exprQ.empty() == false) + { + tl = self->getAction()->exprQ.back(); + 
self->getAction()->exprQ.pop_back(); + } + else + {//should reject by syntax parser + throw base_s3select_exception(std::string("missing left operand for logical expression"), base_s3select_exception::s3select_exp_en_t::FATAL); + } + + logical_operand* f = S3SELECT_NEW(self, logical_operand, tl, oplog, tr); + + self->getAction()->exprQ.push_back(f); +} + +void push_negation::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + base_statement* pred = nullptr; + + if (self->getAction()->exprQ.empty() == false) + { + pred = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + } + else + { + throw base_s3select_exception(std::string("failed to create AST for NOT operator"), base_s3select_exception::s3select_exp_en_t::FATAL); + } + + //upon NOT operator, the logical and arithmetical operators are "tagged" to negate result. + if (dynamic_cast<logical_operand*>(pred)) + { + logical_operand* f = S3SELECT_NEW(self, logical_operand, pred); + self->getAction()->exprQ.push_back(f); + } + else if (dynamic_cast<__function*>(pred) || dynamic_cast<negate_function_operation*>(pred) || dynamic_cast<variable*>(pred)) + { + negate_function_operation* nf = S3SELECT_NEW(self, negate_function_operation, pred); + self->getAction()->exprQ.push_back(nf); + } + else if(dynamic_cast<arithmetic_operand*>(pred)) + { + arithmetic_operand* f = S3SELECT_NEW(self, arithmetic_operand, pred); + self->getAction()->exprQ.push_back(f); + } + else + { + throw base_s3select_exception(std::string("failed to create AST for NOT operator"), base_s3select_exception::s3select_exp_en_t::FATAL); + } +} + +void push_column_pos::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + std::string alias_name; + variable* v; + + if (token == "*" || token == "* ") //TODO space should skip in boost::spirit + { + v = S3SELECT_NEW(self, variable, token, variable::var_t::STAR_OPERATION); + + } + else + { + size_t pos = 
token.find('.'); + if(pos != std::string::npos) + { + alias_name = token.substr(0,pos); + + pos ++; + token = token.substr(pos,token.size()); + + if(self->getAction()->column_prefix != "##" && self->getAction()->column_prefix != alias_name) + { + throw base_s3select_exception(std::string("query can not contain more then a single table-alias"), base_s3select_exception::s3select_exp_en_t::FATAL); + } + + self->getAction()->column_prefix = alias_name; + } + v = S3SELECT_NEW(self, variable, token, variable::var_t::POS); + } + + self->getAction()->exprQ.push_back(v); +} + +void push_projection::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + self->getAction()->projections.get()->push_back(self->getAction()->exprQ.back()); + self->getAction()->exprQ.pop_back(); +} + +void push_alias_projection::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + //extract alias name + const char* p = b; + while (*(--p) != ' ') + ; + std::string alias_name(p + 1, b); + base_statement* bs = self->getAction()->exprQ.back(); + + //mapping alias name to base-statement + bool res = self->getAction()->alias_map.insert_new_entry(alias_name, bs); + if (res == false) + { + throw base_s3select_exception(std::string("alias <") + alias_name + std::string("> is already been used in query"), base_s3select_exception::s3select_exp_en_t::FATAL); + } + + self->getAction()->projections.get()->push_back(bs); + self->getAction()->exprQ.pop_back(); +} + +void push_between_filter::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + std::string between_function("#between#"); + + __function* func = S3SELECT_NEW(self, __function, between_function.c_str(), self->getS3F()); + + base_statement* second_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + func->push_argument(second_expr); + + base_statement* first_expr = self->getAction()->exprQ.back(); + 
self->getAction()->exprQ.pop_back(); + func->push_argument(first_expr); + + base_statement* main_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + func->push_argument(main_expr); + + self->getAction()->exprQ.push_back(func); +} + +void push_not_between_filter::builder(s3select* self, const char* a, const char* b) const +{ + + static constexpr const std::string_view not_between_function("#not_between#"); + + __function* func = S3SELECT_NEW(self, __function, not_between_function.data(), self->getS3F()); + + base_statement* second_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + func->push_argument(second_expr); + + base_statement* first_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + func->push_argument(first_expr); + + base_statement* main_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + func->push_argument(main_expr); + + self->getAction()->exprQ.push_back(func); +} + +void push_in_predicate_first_arg::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + if(self->getAction()->exprQ.empty()) + { + throw base_s3select_exception("failed to create AST for in predicate", base_s3select_exception::s3select_exp_en_t::FATAL); + } + + self->getAction()->inPredicateQ.push_back( self->getAction()->exprQ.back() ); + self->getAction()->exprQ.pop_back(); + + if(self->getAction()->exprQ.empty()) + { + throw base_s3select_exception("failed to create AST for in predicate", base_s3select_exception::s3select_exp_en_t::FATAL); + } + + self->getAction()->inMainArg = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + +} + +void push_in_predicate_arguments::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + if(self->getAction()->exprQ.empty()) + { + throw base_s3select_exception("failed to create AST for in predicate", 
base_s3select_exception::s3select_exp_en_t::FATAL); + } + + self->getAction()->inPredicateQ.push_back( self->getAction()->exprQ.back() ); + + self->getAction()->exprQ.pop_back(); + +} + +void push_in_predicate::builder(s3select* self, const char* a, const char* b) const +{ + // expr in (e1,e2,e3 ...) + std::string token(a, b); + + std::string in_function("#in_predicate#"); + + __function* func = S3SELECT_NEW(self, __function, in_function.c_str(), self->getS3F()); + + while(!self->getAction()->inPredicateQ.empty()) + { + base_statement* ei = self->getAction()->inPredicateQ.back(); + + self->getAction()->inPredicateQ.pop_back(); + + func->push_argument(ei); + + } + + func->push_argument( self->getAction()->inMainArg ); + + self->getAction()->exprQ.push_back(func); + + self->getAction()->inPredicateQ.clear(); + + self->getAction()->inMainArg = 0; +} + +void push_like_predicate_no_escape::builder(s3select* self, const char* a, const char* b) const +{ + + std::string token(a, b); + std::string in_function("#like_predicate#"); + + __function* func = S3SELECT_NEW(self, __function, in_function.c_str(), self->getS3F()); + + variable* v = S3SELECT_NEW(self, variable, "\\",variable::var_t::COLUMN_VALUE); + func->push_argument(v); + + // experimenting valgrind-issue happens only on teuthology + //self->getS3F()->push_for_cleanup(v); + + base_statement* like_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + func->push_argument(like_expr); + + base_statement* expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + func->push_argument(expr); + + self->getAction()->exprQ.push_back(func); +} + +void push_like_predicate_escape::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + std::string in_function("#like_predicate#"); + + __function* func = S3SELECT_NEW(self, __function, in_function.c_str(), self->getS3F()); + + base_statement* expr = self->getAction()->exprQ.back(); + 
self->getAction()->exprQ.pop_back(); + + func->push_argument(expr); + + base_statement* main_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + func->push_argument(main_expr); + + base_statement* escape_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + func->push_argument(escape_expr); + + self->getAction()->exprQ.push_back(func); +} + +void push_is_null_predicate::builder(s3select* self, const char* a, const char* b) const +{ + //expression is null, is not null + std::string token(a, b); + bool is_null = true; + + for(size_t i=0;i<token.size();i++) + {//TODO use other scan rules + bsc::parse_info<> info = bsc::parse(token.c_str()+i, (bsc::str_p("is") >> bsc::str_p("not") >> bsc::str_p("null")) , bsc::space_p); + if (info.full) + is_null = false; + } + + std::string in_function("#is_null#"); + + if (is_null == false) + { + in_function = "#is_not_null#"; + } + + __function* func = S3SELECT_NEW(self, __function, in_function.c_str(), self->getS3F()); + + if (!self->getAction()->exprQ.empty()) + { + base_statement* expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + func->push_argument(expr); + } + + self->getAction()->exprQ.push_back(func); +} + +void push_when_condition_then::builder(s3select* self, const char* a, const char* b) const +{ +//purpose: each new function node, provide execution for (if {condition} then {expresion} ) + std::string token(a, b); + + // _fn_when_then + __function* func = S3SELECT_NEW(self, __function, "#when-then#", self->getS3F()); + + base_statement* then_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + base_statement* when_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + func->push_argument(then_expr); + func->push_argument(when_expr); + + self->getAction()->exprQ.push_back(func); + + // the first_when_then_expr mark the first when-then expression, it is been used later upon complete 
the full statement (case when ... then ... else ... end) + if(self->getAction()->first_when_then_expr == nullptr) + { + self->getAction()->first_when_then_expr = func; + } +} + +void push_case_when_else::builder(s3select* self, const char* a, const char* b) const +{ +//purpose: provide the execution for complete statement, i.e. (case when {expression} then {expression} else {expression} end) + std::string token(a, b); + + base_statement* else_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + // _fn_case_when_else + __function* func = S3SELECT_NEW(self, __function, "#case-when-else#", self->getS3F()); + + func->push_argument(else_expr); + + base_statement* when_then_func = nullptr; + + // the loop ended upon reaching the first when-then + while(when_then_func != self->getAction()->first_when_then_expr) + { + // poping from whenThen-queue and pushing to function arguments list + when_then_func = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + func->push_argument(when_then_func); + } + + self->getAction()->first_when_then_expr = nullptr; + //func is the complete statement, implemented by _fn_case_when_else + self->getAction()->exprQ.push_back(func); +} + +void push_case_value_when_value_else::builder(s3select* self, const char* a, const char* b) const +{ +//purpose: provide execution for the complete statement. i.e. 
case-value-when-value-else-value-end + std::string token(a, b); + + base_statement* else_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + // _fn_case_when_else + __function* func = S3SELECT_NEW(self, __function, "#case-when-else#", self->getS3F()); + + // push the else expression + func->push_argument(else_expr); + + // poping the case-value + base_statement* case_value = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + base_statement* when_then_func = nullptr; + + //poping all when-value-then expression(_fn_when_value_then) and add the case-value per each + while(self->getAction()->whenThenQ.empty() == false) + { + when_then_func = self->getAction()->whenThenQ.back(); + if (dynamic_cast<__function*>(when_then_func)) + { + // adding the case-value as argument + dynamic_cast<__function*>(when_then_func)->push_argument(case_value); + } + else + throw base_s3select_exception("failed to create AST for case-value-when construct", base_s3select_exception::s3select_exp_en_t::FATAL); + + self->getAction()->whenThenQ.pop_back(); + + func->push_argument(when_then_func); + } + //pushing the execution function for the complete statement + self->getAction()->exprQ.push_back(func); +} + +void push_when_value_then::builder(s3select* self, const char* a, const char* b) const +{ + //provide execution of when-value-then-value :: _fn_when_value_then + std::string token(a, b); + + __function* func = S3SELECT_NEW(self, __function, "#when-value-then#", self->getS3F()); + + base_statement* then_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + base_statement* when_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + func->push_argument(then_expr); + func->push_argument(when_expr); + //each when-value-then-value pushed to dedicated queue + self->getAction()->whenThenQ.push_back(func); +} + +void push_decimal_operator::builder(s3select* self, const char* a, const 
char* b) const +{//decimal(integer,integer) + std::string token(a, b); + + base_statement* lhs = nullptr; + base_statement* rhs = nullptr; + + //right side (decimal operator) + if (self->getAction()->exprQ.empty() == false) + { + rhs = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + } + + //left side (decimal operator) + if (self->getAction()->exprQ.empty() == false) + { + lhs = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + } + + __function* func = S3SELECT_NEW(self, __function, "#decimal_operator#", self->getS3F()); + + func->push_argument(rhs); + func->push_argument(lhs); + + self->getAction()->exprQ.push_back(func); +} + +void push_cast_decimal_expr::builder(s3select* self, const char* a, const char* b) const +{ + //cast(expression as decimal(x,y)) + std::string token(a, b); + + base_statement* lhs = nullptr; + base_statement* rhs = nullptr; + + //right side (decimal operator) + if (self->getAction()->exprQ.empty() == false) + { + rhs = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + } + + //left side - expression + if (self->getAction()->exprQ.empty() == false) + { + lhs = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + } + + __function* func = S3SELECT_NEW(self, __function, "#cast_as_decimal#", self->getS3F()); + + func->push_argument(rhs); + func->push_argument(lhs); + + self->getAction()->exprQ.push_back(func); +} + +void push_cast_expr::builder(s3select* self, const char* a, const char* b) const +{ + //cast(expression as int/float/string/timestamp) --> new function "int/float/string/timestamp" ( args = expression ) + std::string token(a, b); + + std::string cast_function; + + cast_function = self->getAction()->dataTypeQ.back(); + self->getAction()->dataTypeQ.pop_back(); + + __function* func = S3SELECT_NEW(self, __function, cast_function.c_str(), self->getS3F()); + + base_statement* expr = self->getAction()->exprQ.back(); + 
self->getAction()->exprQ.pop_back(); + func->push_argument(expr); + + self->getAction()->exprQ.push_back(func); +} + +void push_data_type::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + auto cast_operator = [&](const char *s){return strncasecmp(a,s,strlen(s))==0;}; + + if(cast_operator("int")) + { + self->getAction()->dataTypeQ.push_back("int"); + }else if(cast_operator("float")) + { + self->getAction()->dataTypeQ.push_back("float"); + }else if(cast_operator("string")) + { + self->getAction()->dataTypeQ.push_back("string"); + }else if(cast_operator("timestamp")) + { + self->getAction()->dataTypeQ.push_back("to_timestamp"); + }else if(cast_operator("bool")) + { + self->getAction()->dataTypeQ.push_back("to_bool"); + } +} + +void push_trim_whitespace_both::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + __function* func = S3SELECT_NEW(self, __function, "#trim#", self->getS3F()); + + base_statement* expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + func->push_argument(expr); + + self->getAction()->exprQ.push_back(func); +} + +void push_trim_expr_one_side_whitespace::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + std::string trim_function; + + trim_function = self->getAction()->trimTypeQ.back(); + self->getAction()->trimTypeQ.pop_back(); + + __function* func = S3SELECT_NEW(self, __function, trim_function.c_str(), self->getS3F()); + + base_statement* inp_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + func->push_argument(inp_expr); + + self->getAction()->exprQ.push_back(func); +} + +void push_trim_expr_anychar_anyside::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + std::string trim_function; + + trim_function = self->getAction()->trimTypeQ.back(); + self->getAction()->trimTypeQ.pop_back(); + + __function* func = 
S3SELECT_NEW(self, __function, trim_function.c_str(), self->getS3F()); + + base_statement* expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + func->push_argument(expr); + + base_statement* inp_expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + func->push_argument(inp_expr); + + self->getAction()->exprQ.push_back(func); +} + +void push_trim_type::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + auto trim_option = [&](const char *s){return strncmp(a,s,strlen(s))==0;}; + + if(trim_option("leading")) + { + self->getAction()->trimTypeQ.push_back("#leading#"); + }else if(trim_option("trailing")) + { + self->getAction()->trimTypeQ.push_back("#trailing#"); + }else + { + self->getAction()->trimTypeQ.push_back("#trim#"); + } +} + +void push_substr_from::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + __function* func = S3SELECT_NEW(self, __function, "substring", self->getS3F()); + + base_statement* expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + base_statement* start_position = self->getAction()->exprQ.back(); + + self->getAction()->exprQ.pop_back(); + func->push_argument(start_position); + func->push_argument(expr); + + self->getAction()->exprQ.push_back(func); +} + +void push_substr_from_for::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + __function* func = S3SELECT_NEW(self, __function, "substring", self->getS3F()); + + base_statement* expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + base_statement* start_position = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + base_statement* end_position = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + func->push_argument(end_position); + func->push_argument(start_position); + func->push_argument(expr); + + 
self->getAction()->exprQ.push_back(func); +} + +void push_datediff::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + std::string date_op; + + date_op = self->getAction()->datePartQ.back(); + self->getAction()->datePartQ.pop_back(); + + std::string date_function = "#datediff_" + date_op + "#"; + + __function* func = S3SELECT_NEW(self, __function, date_function.c_str(), self->getS3F()); + + base_statement* expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + base_statement* start_position = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + func->push_argument(start_position); + func->push_argument(expr); + + self->getAction()->exprQ.push_back(func); +} + +void push_dateadd::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + std::string date_op; + + date_op = self->getAction()->datePartQ.back(); + self->getAction()->datePartQ.pop_back(); + + std::string date_function = "#dateadd_" + date_op + "#"; + + __function* func = S3SELECT_NEW(self, __function, date_function.c_str(), self->getS3F()); + + base_statement* expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + base_statement* start_position = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + func->push_argument(start_position); + func->push_argument(expr); + + self->getAction()->exprQ.push_back(func); +} + +void push_extract::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + std::string date_op; + + date_op = self->getAction()->datePartQ.back(); + self->getAction()->datePartQ.pop_back(); + + std::string date_function = "#extract_" + date_op + "#"; + + __function* func = S3SELECT_NEW(self, __function, date_function.c_str(), self->getS3F()); + + base_statement* expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + func->push_argument(expr); + + 
self->getAction()->exprQ.push_back(func); +} + +void push_date_part::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + self->getAction()->datePartQ.push_back(token); +} + +void push_time_to_string_constant::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + __function* func = S3SELECT_NEW(self, __function, "#to_string_constant#", self->getS3F()); + + base_statement* expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + base_statement* frmt = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + func->push_argument(frmt); + func->push_argument(expr); + + self->getAction()->exprQ.push_back(func); + +} + +void push_time_to_string_dynamic::builder(s3select* self, const char* a, const char* b) const +{ + std::string token(a, b); + + __function* func = S3SELECT_NEW(self, __function, "#to_string_dynamic#", self->getS3F()); + + base_statement* expr = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + base_statement* frmt = self->getAction()->exprQ.back(); + self->getAction()->exprQ.pop_back(); + + func->push_argument(frmt); + func->push_argument(expr); + + self->getAction()->exprQ.push_back(func); +} + +void push_string_to_time_constant::builder(s3select* self, const char* a, const char* b) const +{ + //token could be a string or a timestamp, we need to check it + //upon it is a timestamp format, we need to push the variable as timestamp or else, it as a string + //the purpose is to use backticks to convert the string to timestamp in parsing time instead of processing time(Trino uses this approach) + + a++; //remove the first quote + b--; + std::string token(a, b); + + _fn_to_timestamp* to_timestamp = S3SELECT_NEW(self, _fn_to_timestamp);//TODO the _fn_to_timestamp should release the memory (cleanup) + bs_stmt_vec_t args; + + variable* var_string = S3SELECT_NEW(self, variable, token, 
/// CSV dialect settings: the characters and flags used when parsing input rows and
/// when formatting result rows. Every field starts with the default shown below.
struct s3select_csv_definitions //TODO
{
  // input / output delimiters
  char row_delimiter = '\n';
  char column_delimiter = ',';
  char output_row_delimiter = '\n';
  char output_column_delimiter = ',';

  // escaping and quoting characters
  char escape_char = '\\';
  char output_escape_char = '\\';
  char output_quot_char = '"';
  char quot_char = '"';

  // header handling
  bool use_header_info = false;
  bool ignore_header_info = false;//skip first line

  // output formatting flags
  bool quote_fields_always = false;
  bool quote_fields_asneeded = false;
  bool redundant_column = false;

  // comment / trimming configuration
  bool comment_empty_lines = false;
  std::vector<char> comment_chars;
  std::vector<char> trim_chars;

  // kept user-provided (not "= default") so the type does not turn into an aggregate
  s3select_csv_definitions() {}

};
external system + +public: + s3select_csv_definitions m_csv_defintion;//TODO add method for modify + + enum class Status { + END_OF_STREAM, + INITIAL_STAT, + NORMAL_EXIT, + LIMIT_REACHED, + SQL_ERROR + }; + + Status m_sql_processing_status; + + Status get_sql_processing_status() + { + return m_sql_processing_status; + } + + bool is_sql_limit_reached() + { + return m_sql_processing_status == Status::LIMIT_REACHED; + } + + void set_base_defintions(s3select* m) + { + if(m_s3_select || !m) + {//not to define twice + //not to define with null + return; + } + + m_s3_select=m; + m_sa=m_s3_select->get_scratch_area(); + m_error_count=0; + m_projections = m_s3_select->get_projections_list(); + m_where_clause = m_s3_select->get_filter(); + + if (m_where_clause) + { + m_where_clause->traverse_and_apply(m_sa, m_s3_select->get_aliases(), m_s3_select->is_json_query()); + } + + for (auto& p : m_projections) + { + p->traverse_and_apply(m_sa, m_s3_select->get_aliases(), m_s3_select->is_json_query()); + } + m_is_to_aggregate = true;//TODO not correct. 
should be set upon end-of-stream + m_aggr_flow = m_s3_select->is_aggregate_query(); + + m_is_limit_on = m_s3_select->is_limit(); + if(m_is_limit_on) + { + m_limit = m_s3_select->get_limit(); + } + + m_processed_rows = 0; + } + + base_s3object():m_sa(nullptr),m_is_to_aggregate(false),m_where_clause(nullptr),m_s3_select(nullptr),m_error_count(0),m_returned_bytes_size(0),m_sql_processing_status(Status::INITIAL_STAT){} + + explicit base_s3object(s3select* m):base_s3object() + { + if(m) + { + set_base_defintions(m); + } + } + + virtual bool is_end_of_stream() {return false;} + virtual void row_fetch_data() {} + virtual void row_update_data() {} + virtual void columnar_fetch_where_clause_columns(){} + virtual void columnar_fetch_projection(){} + // for the case were the rows are not fetched, but "pushed" by the data-source parser (JSON) + virtual bool multiple_row_processing(){return true;} + + void set_external_debug_system(std::function<void(const char*)> fp_external) + { + fp_ext_debug_mesg = fp_external; + } + + size_t get_return_result_size() + { + return m_returned_bytes_size; + } + + void result_values_to_string(multi_values& projections_resuls, std::string& result) + { + size_t i = 0; + std::string output_delimiter(1,m_csv_defintion.output_column_delimiter); + std::string output_row_delimiter(1,m_csv_defintion.output_row_delimiter); + + for(auto& res : projections_resuls.values) + { + if(fp_ext_debug_mesg) + fp_ext_debug_mesg( res->to_string() ); + + if (m_csv_defintion.quote_fields_always) { + std::ostringstream quoted_result; + quoted_result << std::quoted(res->to_string(),m_csv_defintion.output_quot_char, m_csv_defintion.escape_char); + result.append(quoted_result.str()); + m_returned_bytes_size += quoted_result.str().size(); + }//TODO to add asneeded + else + { + result.append(res->to_string()); + m_returned_bytes_size += strlen(res->to_string()); + } + + if(!m_csv_defintion.redundant_column) { + if(++i < projections_resuls.values.size()) { + 
result.append(output_delimiter); + m_returned_bytes_size += output_delimiter.size(); + } + } + else { + result.append(output_delimiter); + m_returned_bytes_size += output_delimiter.size(); + } + } + if(!m_aggr_flow){ + result.append(output_row_delimiter); + m_returned_bytes_size += output_delimiter.size(); + } + } + + Status getMatchRow( std::string& result) + { + multi_values projections_resuls; + + if (m_is_limit_on && m_processed_rows == m_limit) + { + return m_sql_processing_status = Status::LIMIT_REACHED; + } + + if (m_aggr_flow == true) + { + do + { + row_fetch_data(); + columnar_fetch_where_clause_columns(); + if (is_end_of_stream()) + { + if (m_is_to_aggregate) + for (auto& i : m_projections) + { + i->set_last_call(); + i->set_skip_non_aggregate(false);//projection column is set to be runnable + + projections_resuls.push_value( &(i->eval()) ); + } + + result_values_to_string(projections_resuls,result); + return m_sql_processing_status = Status::END_OF_STREAM; + } + + m_processed_rows++; + if ((*m_projections.begin())->is_set_last_call()) + { + //should validate while query execution , no update upon nodes are marked with set_last_call + throw base_s3select_exception("on aggregation query , can not stream row data post do-aggregate call", base_s3select_exception::s3select_exp_en_t::FATAL); + } + + for (auto& a : *m_s3_select->get_aliases()->get()) + { + a.second->invalidate_cache_result(); + } + + row_update_data(); + if (!m_where_clause || m_where_clause->eval().is_true()) + { + columnar_fetch_projection(); + for (auto i : m_projections) + { + i->eval(); + } + } + + if(m_is_limit_on && m_processed_rows == m_limit) + { + for (auto& i : m_projections) + { + i->set_last_call(); + i->set_skip_non_aggregate(false);//projection column is set to be runnable + projections_resuls.push_value( &(i->eval()) ); + } + result_values_to_string(projections_resuls,result); + return m_sql_processing_status = Status::LIMIT_REACHED; + } + } + while (multiple_row_processing()); 
+ } + else + { + //save the where-clause evaluation result (performance perspective) + bool where_clause_result = false; + do + { + row_fetch_data(); + columnar_fetch_where_clause_columns(); + if(is_end_of_stream()) + { + return m_sql_processing_status = Status::END_OF_STREAM; + } + + m_processed_rows++; + row_update_data(); + for (auto& a : *m_s3_select->get_aliases()->get()) + { + a.second->invalidate_cache_result(); + } + } + while (multiple_row_processing() && m_where_clause && !(where_clause_result = m_where_clause->eval().is_true()) && !(m_is_limit_on && m_processed_rows == m_limit)); + + // in the of JSON it needs to evaluate the where-clause(for the first time) + if(!multiple_row_processing() && m_where_clause){ + where_clause_result = m_where_clause->eval().is_true(); + } + + if(m_where_clause && ! where_clause_result && m_is_limit_on && m_processed_rows == m_limit) + { + return m_sql_processing_status = Status::LIMIT_REACHED; + } + + bool found = multiple_row_processing(); + + if(!multiple_row_processing()) + { + found = !m_where_clause || where_clause_result; + } + + if(found) + { + columnar_fetch_projection(); + projections_resuls.clear(); + for (auto& i : m_projections) + { + projections_resuls.push_value( &(i->eval()) ); + } + result_values_to_string(projections_resuls,result); + } + + } + return is_end_of_stream() ? 
(m_sql_processing_status = Status::END_OF_STREAM) : (m_sql_processing_status = Status::NORMAL_EXIT); + + }//getMatchRow + + virtual ~base_s3object() = default; + +}; //base_s3object + +//TODO config / default-value +#define CSV_INPUT_TYPE_RESPONSE_SIZE_LIMIT (64 * 1024) +class csv_object : public base_s3object +{ + +public: + + class csv_defintions : public s3select_csv_definitions + {}; + + explicit csv_object(s3select* s3_query) : + base_s3object(s3_query), + m_skip_last_line(false), + m_extract_csv_header_info(false), + m_previous_line(false), + m_skip_first_line(false), + m_processed_bytes(0) {} + + csv_object(s3select* s3_query, csv_defintions csv) : + base_s3object(s3_query), + m_skip_last_line(false), + m_extract_csv_header_info(false), + m_previous_line(false), + m_skip_first_line(false), + m_processed_bytes(0) + { + m_csv_defintion = csv; + } + + csv_object(): + base_s3object(nullptr), + m_skip_last_line(false), + m_extract_csv_header_info(false), + m_previous_line(false), + m_skip_first_line(false), + m_processed_bytes(0) {} + + void set_csv_query(s3select* s3_query,csv_defintions csv) + { + if(m_s3_select != nullptr) + { + //return; + } + + set_base_defintions(s3_query); + m_csv_defintion = csv; + } + +private: + bool m_skip_last_line; + std::string m_error_description; + char* m_stream; + char* m_end_stream; + std::vector<char*> m_row_tokens; + CSVParser* csv_parser; + bool m_extract_csv_header_info; + std::vector<std::string> m_csv_schema{128}; + + //handling arbitrary chunks (rows cut in the middle) + bool m_previous_line; + bool m_skip_first_line; + std::string merge_line; + std::string m_last_line; + size_t m_processed_bytes; + int64_t m_number_of_tokens; + size_t m_skip_x_first_bytes=0; + + std::function<int(std::string&)> fp_s3select_result_format=nullptr; + std::function<int(std::string&)> fp_s3select_header_format=nullptr; +public: + void set_result_formatters( std::function<int(std::string&)>& result_format, + std::function<int(std::string&)>& 
header_format) + { + fp_s3select_result_format = result_format; + fp_s3select_header_format = header_format; + } +private: + int getNextRow() + { + size_t num_of_tokens=0; + m_row_tokens.clear(); + + if (csv_parser->read_row(m_row_tokens)) + { + num_of_tokens = m_row_tokens.size(); + } + else + { + return -1; + } + + return num_of_tokens; + } + +public: + + std::string get_error_description() + { + return m_error_description; + } + + virtual ~csv_object() = default; + +public: + virtual bool is_end_of_stream() + { + return m_number_of_tokens < 0; + } + + virtual void row_fetch_data() + { + m_number_of_tokens = getNextRow(); + } + + virtual void row_update_data() + { + m_sa->update(m_row_tokens, m_number_of_tokens); + } + + + int extract_csv_header_info() + { + + if (m_csv_defintion.ignore_header_info == true) + { + csv_parser->next_line(); + } + else if(m_csv_defintion.use_header_info == true) + { + size_t num_of_tokens = getNextRow();//TODO validate number of tokens + + for(size_t i=0; i<num_of_tokens; i++) + { + m_csv_schema[i].assign(m_row_tokens[i]); + } + + m_s3_select->load_schema(m_csv_schema); + } + + m_extract_csv_header_info = true; + + return 0; + } + + + int run_s3select_on_stream(std::string& result, const char* csv_stream, size_t stream_length, size_t obj_size) + { + int status=0; + try{ + status = run_s3select_on_stream_internal(result,csv_stream,stream_length,obj_size); + } + catch(base_s3select_exception& e) + { + m_error_description = e.what(); + m_error_count ++; + if (e.severity() == base_s3select_exception::s3select_exp_en_t::FATAL || m_error_count>100)//abort query execution + { + return -1; + } + } + catch(chunkalloc_out_of_mem) + { + m_error_description = "out of memory"; + return -1; + } + catch(io::error::escaped_char_missing& err) + { + m_error_description = "escaped_char_missing failure while csv parsing"; + return -1; + } + catch(io::error::escaped_string_not_closed& err) + { + m_error_description = "escaped_string_not_closed failure 
while csv parsing"; + return -1; + } + catch(io::error::line_length_limit_exceeded& err) + { + m_error_description = "line_length_limit_exceeded failure while csv parsing"; + return -1; + } + catch(io::error::with_file_name& err) + { + m_error_description = "with_file_name failure while csv parsing"; + return -1; + } + catch(io::error::with_file_line& err) + { + m_error_description = "with_file_line failure while csv parsing"; + return -1; + } + + return status; + } + +private: + int run_s3select_on_stream_internal(std::string& result, const char* csv_stream, size_t stream_length, size_t obj_size) + { + //purpose: the CSV data is "streaming", it may "cut" rows in the middle, in that case the "broken-line" is stores + //for later, upon next chunk of data is streaming, the stored-line is merge with current broken-line, and processed. + std::string tmp_buff; + + m_processed_bytes += stream_length; + + m_skip_first_line = false; + + if (m_previous_line) + { + //if previous broken line exist , merge it to current chunk + char* p_obj_chunk = (char*)csv_stream; + while (*p_obj_chunk != m_csv_defintion.row_delimiter && p_obj_chunk<(csv_stream+stream_length)) + { + p_obj_chunk++; + } + + tmp_buff.assign((char*)csv_stream, (char*)csv_stream + (p_obj_chunk - csv_stream)); + merge_line = m_last_line + tmp_buff + m_csv_defintion.row_delimiter; + m_previous_line = false; + m_skip_first_line = true; + m_skip_x_first_bytes = tmp_buff.size()+1; + + //processing the merged row (previous broken row) + run_s3select_on_object(result, merge_line.c_str(), merge_line.length(), false, false, false); + } + + if (stream_length && csv_stream[stream_length - 1] != m_csv_defintion.row_delimiter) + { + //in case of "broken" last line + char* p_obj_chunk = (char*)&(csv_stream[stream_length - 1]); + while (*p_obj_chunk != m_csv_defintion.row_delimiter && p_obj_chunk>csv_stream) + { + p_obj_chunk--; //scan until end-of previous line in chunk + } + + u_int32_t skip_last_bytes = 
(&(csv_stream[stream_length - 1]) - p_obj_chunk); + m_last_line.assign(p_obj_chunk + 1, p_obj_chunk + 1 + skip_last_bytes); //save it for next chunk + + m_previous_line = true;//it means to skip last line + + //cut out the broken line + stream_length -= (m_last_line.length()); + } + + return run_s3select_on_object(result, csv_stream, stream_length, m_skip_first_line, m_previous_line, (m_processed_bytes >= obj_size)); + } + +public: + int run_s3select_on_object(std::string& result, const char* csv_stream, size_t stream_length, bool skip_first_line, bool skip_last_line, bool do_aggregate) + { + m_stream = (char*)csv_stream; + m_end_stream = (char*)csv_stream + stream_length; + m_is_to_aggregate = do_aggregate; + m_skip_last_line = skip_last_line; + + if(skip_first_line) + { + //the stream may start in the middle of a row (maybe in the middle of a quote). + //at this point the stream should skip the first row(broken row). + //the csv_parser should be init with the fixed stream position. + m_stream += m_skip_x_first_bytes; + m_skip_x_first_bytes=0; + } + + CSVParser _csv_parser("csv", m_stream, m_end_stream); + csv_parser = &_csv_parser; + csv_parser->set_csv_def( m_csv_defintion.row_delimiter, + m_csv_defintion.column_delimiter, + m_csv_defintion.quot_char, + m_csv_defintion.escape_char, + m_csv_defintion.comment_empty_lines, + m_csv_defintion.comment_chars, + m_csv_defintion.trim_chars); + + + if(m_extract_csv_header_info == false) + { + extract_csv_header_info(); + } + do + { + m_sql_processing_status = Status::INITIAL_STAT; + try + { + getMatchRow(result); + } + catch (base_s3select_exception& e) + { + m_error_description = e.what(); + m_error_count ++; + if (e.severity() == base_s3select_exception::s3select_exp_en_t::FATAL || m_error_count>100 || (m_stream>=m_end_stream))//abort query execution + { + return -1; + } + } + + if(fp_s3select_result_format && fp_s3select_header_format) + { + if (result.size() > CSV_INPUT_TYPE_RESPONSE_SIZE_LIMIT) + {//there are systems 
that might resject the response due to its size. + fp_s3select_result_format(result); + fp_s3select_header_format(result); + } + } + + if (m_sql_processing_status == Status::END_OF_STREAM) + { + break; + } + else if (m_sql_processing_status == Status::LIMIT_REACHED) // limit reached + { + break;//user should request for sql_processing_status + } + + } while (true); + + if(fp_s3select_result_format && fp_s3select_header_format) + { //note: it may produce empty response(more the once) + //upon empty result, it should return *only* upon last call. + fp_s3select_result_format(result); + fp_s3select_header_format(result); + } + + return 0; + } +}; + +#ifdef _ARROW_EXIST +class parquet_object : public base_s3object +{ + +private: + std::string m_error_description; + parquet_file_parser* object_reader; + parquet_file_parser::column_pos_t m_where_clause_columns; + parquet_file_parser::column_pos_t m_projections_columns; + std::vector<parquet_file_parser::parquet_value_t> m_predicate_values; + std::vector<parquet_file_parser::parquet_value_t> m_projections_values; + bool not_to_increase_first_time; + +public: + + parquet_object(std::string parquet_file_name, s3select *s3_query,s3selectEngine::rgw_s3select_api* rgw) : base_s3object(s3_query),object_reader(nullptr) + { + try{ + + object_reader = new parquet_file_parser(parquet_file_name,rgw); //TODO uniq ptr + } catch(std::exception &e) + { + throw base_s3select_exception(std::string("failure while processing parquet meta-data ") + std::string(e.what()) ,base_s3select_exception::s3select_exp_en_t::FATAL); + } + + parquet_query_setting(nullptr); + } + + parquet_object() : base_s3object(nullptr),object_reader(nullptr) + {} + + void parquet_query_setting(s3select *s3_query) + { + if(s3_query) + { + set_base_defintions(s3_query); + } + load_meta_data_into_scratch_area(); + for(auto x : m_s3_select->get_projections_list()) + {//traverse the AST and extract all columns reside in projection statement. 
+ x->extract_columns(m_projections_columns,object_reader->get_num_of_columns()); + } + //traverse the AST and extract all columns reside in where clause. + if(m_s3_select->get_filter()) + m_s3_select->get_filter()->extract_columns(m_where_clause_columns,object_reader->get_num_of_columns()); + + not_to_increase_first_time = true; + } + + ~parquet_object() + { + if(object_reader != nullptr) + { + delete object_reader; + } + + } + + std::string get_error_description() + { + return m_error_description; + } + + bool is_set() + { + return m_s3_select != nullptr; + } + + void set_parquet_object(std::string parquet_file_name, s3select *s3_query,s3selectEngine::rgw_s3select_api* rgw) //TODO duplicate code + { + try{ + + object_reader = new parquet_file_parser(parquet_file_name,rgw); //TODO uniq ptr + } catch(std::exception &e) + { + throw base_s3select_exception(std::string("failure while processing parquet meta-data ") + std::string(e.what()) ,base_s3select_exception::s3select_exp_en_t::FATAL); + } + + parquet_query_setting(s3_query); + } + + + int run_s3select_on_object(std::string &result, + std::function<int(std::string&)> fp_s3select_result_format, + std::function<int(std::string&)> fp_s3select_header_format) + { + m_sql_processing_status = Status::INITIAL_STAT; + do + { + try + { + getMatchRow(result); + } + catch (base_s3select_exception &e) + { + m_error_description = e.what(); + m_error_count++; + if (e.severity() == base_s3select_exception::s3select_exp_en_t::FATAL || m_error_count > 100) //abort query execution + { + return -1; + } + } + catch (std::exception &e) + { + m_error_description = e.what(); + m_error_count++; + if (m_error_count > 100) //abort query execution + { + return -1; + } + } + +#define S3SELECT_RESPONSE_SIZE_LIMIT (4 * 1024 * 1024) + if (result.size() > S3SELECT_RESPONSE_SIZE_LIMIT) + {//AWS-cli limits response size the following callbacks send response upon some threshold + fp_s3select_result_format(result); + + if (!is_end_of_stream() && 
(get_sql_processing_status() != Status::LIMIT_REACHED)) + { + fp_s3select_header_format(result); + } + } + else + { + if (is_end_of_stream() || (get_sql_processing_status() == Status::LIMIT_REACHED)) + { + fp_s3select_result_format(result); + } + } + + //TODO is_end_of_stream() required? + if (get_sql_processing_status() == Status::END_OF_STREAM || is_end_of_stream() || get_sql_processing_status() == Status::LIMIT_REACHED) + { + break; + } + + } while (1); + + return 0; + } + + void load_meta_data_into_scratch_area() + { + int i=0; + for(auto x : object_reader->get_schema()) + { + m_s3_select->get_scratch_area()->set_column_pos(x.first.c_str(),i++); + } + } + + virtual bool is_end_of_stream() + { + return object_reader->end_of_stream(); + } + + virtual void columnar_fetch_where_clause_columns() + { + if(!not_to_increase_first_time)//for rownum=0 + object_reader->increase_rownum(); + else + not_to_increase_first_time = false; + + auto status = object_reader->get_column_values_by_positions(m_where_clause_columns, m_predicate_values); + if(status<0)//TODO exception? + return; + m_sa->update(m_predicate_values, m_where_clause_columns); + } + + virtual void columnar_fetch_projection() + { + auto status = object_reader->get_column_values_by_positions(m_projections_columns, m_projections_values); + if(status<0)//TODO exception? 
+ return; + m_sa->update(m_projections_values, m_projections_columns); + } + +}; +#endif //_ARROW_EXIST + +class json_object : public base_s3object +{ +private: + + JsonParserHandler JsonHandler; + size_t m_processed_bytes; + bool m_end_of_stream; + std::string* m_s3select_result = nullptr; + size_t m_row_count; + bool star_operation_ind; + std::string m_error_description; + bool m_init_json_processor_ind; + +public: + + void init_json_processor(s3select* query) + { + if(m_init_json_processor_ind) + return; + + m_init_json_processor_ind = true; + std::function<int(void)> f_sql = [this](void){auto res = sql_execution_on_row_cb();return res;}; + std::function<int(s3selectEngine::value&, int)> + f_push_to_scratch = [this](s3selectEngine::value& value,int json_var_idx){return push_into_scratch_area_cb(value,json_var_idx);}; + std::function <int(s3selectEngine::scratch_area::json_key_value_t&)> + f_push_key_value_into_scratch_area_per_star_operation = [this](s3selectEngine::scratch_area::json_key_value_t& key_value) + {return push_key_value_into_scratch_area_per_star_operation(key_value);}; + + //setting the container for all json-variables, to be extracted by the json reader + JsonHandler.set_statement_json_variables(query->get_json_variables_access()); + + + //calling to getMatchRow. processing a single row per each call. + JsonHandler.set_s3select_processing_callback(f_sql); + //upon excat match between input-json-key-path and sql-statement-variable-path the callback pushes to scratch area + JsonHandler.set_exact_match_callback(f_push_to_scratch); + //upon star-operation(in statemenet) the callback pushes the key-path and value into scratch-area + JsonHandler.set_push_per_star_operation_callback(f_push_key_value_into_scratch_area_per_star_operation); + + //the json-from-clause is unique and should exist. otherwise it's a failure. 
+ if(query->getAction()->json_from_clause.empty()) + { + JsonHandler.m_fatal_initialization_ind = true; + JsonHandler.m_fatal_initialization_description = "the SQL statement is not align with the correct syntax of JSON statement. from-clause is missing."; + return; + } + + //setting the from clause path + if(query->getAction()->json_from_clause[0] == JSON_ROOT_OBJECT) + { + query->getAction()->json_from_clause.pop_back(); + } + JsonHandler.set_prefix_match(query->getAction()->json_from_clause); + + for (auto& p : m_projections) + { + if(p->is_statement_contain_star_operation()) + { + star_operation_ind=true; + break; + } + } + + if(star_operation_ind) + { + JsonHandler.set_star_operation(); + //upon star-operation the key-path is extracted with the value, each key-value displayed in a seperate row. + //the return results end with a line contains the row-number. + m_csv_defintion.output_column_delimiter = m_csv_defintion.output_row_delimiter; + } + + m_sa->set_parquet_type();//TODO json type + } + + json_object(s3select* query):base_s3object(query),m_processed_bytes(0),m_end_of_stream(false),m_row_count(0),star_operation_ind(false),m_init_json_processor_ind(false) + { + init_json_processor(query); + } + + void set_sql_result(std::string& sql_result) + { + m_s3select_result = &sql_result; + } + + json_object(): base_s3object(nullptr), m_processed_bytes(0),m_end_of_stream(false),m_row_count(0),star_operation_ind(false),m_init_json_processor_ind(false) {} + +private: + + virtual bool is_end_of_stream() + { + return m_end_of_stream == true; + } + + virtual bool multiple_row_processing() + { + return false; + } + + int sql_execution_on_row_cb() + { + //execute statement on row + //create response (TODO callback) + + size_t result_len = m_s3select_result->size(); + int status=0; + try{ + getMatchRow(*m_s3select_result); + } + catch(s3selectEngine::base_s3select_exception& e) + { + sql_error_handling(e,*m_s3select_result); + status = -1; + } + + if(is_sql_limit_reached()) 
+ { + status = JSON_PROCESSING_LIMIT_REACHED;//returning number since sql_execution_on_row_cb is a callback; the caller can not access the object + } + + m_sa->clear_data(); + if(star_operation_ind && (m_s3select_result->size() != result_len)) + {//as explained above the star-operation is displayed differently + std::string end_of_row; + end_of_row = "#=== " + std::to_string(m_row_count++) + " ===#\n"; + m_s3select_result->append(end_of_row); + } + return status; + } + + int push_into_scratch_area_cb(s3selectEngine::value& key_value, int json_var_idx) + { + //upon exact-filter match push value to scratch area with json-idx , it should match variable + //push (key path , json-var-idx , value) json-var-idx should be attached per each exact filter + m_sa->update_json_varible(key_value,json_var_idx); + return 0; + } + + int push_key_value_into_scratch_area_per_star_operation(s3selectEngine::scratch_area::json_key_value_t& key_value) + { + m_sa->get_star_operation_cont()->push_back( key_value ); + return 0; + } + + void sql_error_handling(s3selectEngine::base_s3select_exception& e,std::string& result) + { + //the JsonHandler makes the call to SQL processing, upon a failure to procees the SQL statement, + //the error-handling takes care of the error flow. + m_error_description = e.what(); + m_error_count++; + m_s3select_result->append(std::to_string(m_error_count)); + *m_s3select_result += " : "; + m_s3select_result->append(m_error_description); + *m_s3select_result += m_csv_defintion.output_row_delimiter; + } + +public: + + int run_s3select_on_stream(std::string& result, const char* json_stream, size_t stream_length, size_t obj_size) + { + int status=0; + m_processed_bytes += stream_length; + set_sql_result(result); + + if(JsonHandler.is_fatal_initialization()) + { + throw base_s3select_exception(JsonHandler.m_fatal_initialization_description, base_s3select_exception::s3select_exp_en_t::FATAL); + } + + if(!stream_length || !json_stream)//TODO m_processed_bytes(?) 
+ {//last processing cycle + JsonHandler.process_json_buffer(0, 0, true);//TODO end-of-stream = end-of-row + m_end_of_stream = true; + sql_execution_on_row_cb(); + return 0; + } + + try{ + //the handler is processing any buffer size and return results per each buffer + status = JsonHandler.process_json_buffer((char*)json_stream, stream_length); + } + catch(std::exception &e) + { + std::string error_description = std::string("exception while processing :") + e.what(); + throw base_s3select_exception(error_description,base_s3select_exception::s3select_exp_en_t::FATAL); + } + + if(status<0) + { + std::string error_description = std::string("failure upon JSON processing"); + throw base_s3select_exception(error_description,base_s3select_exception::s3select_exp_en_t::FATAL); + return -1; + } + + return status; + } + + void set_json_query(s3select* s3_query) + { + set_base_defintions(s3_query); + init_json_processor(s3_query); + } + + std::string get_error_description() + { + return m_error_description; + } + + ~json_object() = default; +}; + +}; // namespace s3selectEngine + +#endif diff --git a/src/s3select/include/s3select_csv_parser.h b/src/s3select/include/s3select_csv_parser.h new file mode 100644 index 000000000..dab2e4efa --- /dev/null +++ b/src/s3select/include/s3select_csv_parser.h @@ -0,0 +1,418 @@ +#include "csvparser/csv.h" + +namespace io{ + + namespace error{ + struct escaped_char_missing : + base, + with_file_name, + with_file_line{ + void format_error_message()const override{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Escaped character missing in line %d in file \"%s\"." 
+ , file_line, file_name); + } + }; + } + + namespace detail{ + static void unescape(char*&col_begin, char*&col_end, char& quote, char& escape_char) + { + if(col_end - col_begin >= 2) + { + while(*col_begin == quote && *(col_begin + 1) == quote) + { + ++col_begin; + ++col_begin; + } + char*out = col_begin; + char* in = col_begin; + bool init = true; + + while(in != col_end) + { + if(*in != quote && *in != escape_char) + { + if(init) + { + init = false; + } + else + { + *out = *in; + } + ++in; + ++out; + } + else + { + if(*in == escape_char) + { + ++in; + if(init) + { + ++col_begin; + ++out; + init = false; + } + else + { + *out = *in; + } + ++in; + ++out; + } + else + { + ++in; + while(*in != quote) + { + if(init) + { + ++col_begin; + ++out; + init = false; + } + else + { + *out = *in; + } + ++in; + ++out; + } + ++in; + } + } + } + *out = '\0'; + col_end = out; + } + } + + static void trim(char*&str_begin, char*&str_end, std::vector<char>& trim_chars) + { + while(str_begin != str_end && std::find(trim_chars.begin(), trim_chars.end(), *str_begin) != trim_chars.end()) + ++str_begin; + while(str_begin != str_end && std::find(trim_chars.begin(), trim_chars.end(), *(str_end-1)) != trim_chars.end()) + --str_end; + *str_end = '\0'; + } + + static const char*find_next_column_end(const char*col_begin, char& sep, char& quote, char& escape_char) + { + while(*col_begin != sep && *col_begin != '\0') + { + if(*col_begin != quote && *col_begin != escape_char) + ++col_begin; + else + { + if(*col_begin == escape_char) + { + if(*(col_begin+1) == '\0') + throw error::escaped_char_missing(); + col_begin += 2; + } + else + { + do + { + ++col_begin; + while(*col_begin != quote) + { + if(*col_begin == '\0') + throw error::escaped_string_not_closed(); + ++col_begin; + } + ++col_begin; + }while(*col_begin == quote); + } + } + } + return col_begin; + } + + void chop_next_column(char*&line, char*&col_begin, char*&col_end, char& col_delimiter, char& quote, char& escape_char) + { + assert(line 
!= nullptr); + + col_begin = line; + // the col_begin + (... - col_begin) removes the constness + col_end = col_begin + (find_next_column_end(col_begin, col_delimiter, quote, escape_char) - col_begin); + + if(*col_end == '\0') + { + line = nullptr; + } + else + { + *col_end = '\0'; + line = col_end + 1; + } + } + + void parse_line(char*line, std::vector<char*>& sorted_col, char& col_delimiter, char& quote, char& escape_char, std::vector<char>& trim_chars) + { + while (line != nullptr) + { + char*col_begin, *col_end; + chop_next_column(line, col_begin, col_end, col_delimiter, quote, escape_char); + if (!trim_chars.empty()) + trim(col_begin, col_end, trim_chars); + if (!(quote == '\0' && escape_char == '\0')) + unescape(col_begin, col_end, quote, escape_char); + sorted_col.push_back(col_begin); + } + } + + + bool empty_comment_line(char* line) + { + if(*line == '\0') + return true; + while(*line == ' ' || *line == '\t') + { + ++line; + if(*line == '\0') + return true; + } + return false; + } + + bool single_line_comment(char start_char, std::vector<char>& comment_chars) + { + if(std::find(comment_chars.begin(), comment_chars.end(), start_char) != comment_chars.end()) + return true; + else + return false; + } + + bool is_comment(char*&line, bool& comment_empty_line, std::vector<char>& comment_chars) + { + if(!comment_empty_line && comment_chars.empty()) + return false; + else if(comment_empty_line && comment_chars.empty()) + return empty_comment_line(line); + else if(!comment_empty_line && !comment_chars.empty()) + return single_line_comment(*line, comment_chars); + else + return empty_comment_line(line) || single_line_comment(*line, comment_chars); + } + + } +} + + +class CSVParser +{ + private: + char row_delimiter; + char col_delimiter; + char quote; + char escape_char; + bool comment_empty_line; + std::vector<char> comment_characters; + std::vector<char> trim_characters; + + static const int block_len = 1<<20; + std::unique_ptr<char[]>buffer; // must be 
constructed before (and thus destructed after) the reader! + #ifdef CSV_IO_NO_THREAD + io::detail::SynchronousReader reader; + #else + io::detail::AsynchronousReader reader; + #endif + int data_begin; + int data_end; + + char file_name[io::error::max_file_name_length+1]; + unsigned file_line; + + void init(std::unique_ptr<io::ByteSourceBase>byte_source) + { + file_line = 0; + + buffer = std::unique_ptr<char[]>(new char[3*block_len]); + data_begin = 0; + data_end = byte_source->read(buffer.get(), 2*block_len); + + // Ignore UTF-8 BOM + if(data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') + data_begin = 3; + + if(data_end == 2*block_len){ + reader.init(std::move(byte_source)); + reader.start_read(buffer.get() + 2*block_len, block_len); + } + } + + public: + CSVParser() = delete; + CSVParser(const CSVParser&) = delete; + CSVParser&operator=(const CSVParser&); + + CSVParser(const char*file_name, const char*data_begin, const char*data_end) + { + set_file_name(file_name); + init(std::unique_ptr<io::ByteSourceBase>(new io::detail::NonOwningStringByteSource(data_begin, data_end-data_begin))); + } + + CSVParser(const std::string&file_name, const char*data_begin, const char*data_end) + { + set_file_name(file_name.c_str()); + init(std::unique_ptr<io::ByteSourceBase>(new io::detail::NonOwningStringByteSource(data_begin, data_end-data_begin))); + } + + void set_file_name(const std::string&file_name) + { + set_file_name(file_name.c_str()); + } + + void set_file_name(const char*file_name) + { + if(file_name != nullptr) + { + strncpy(this->file_name, file_name, sizeof(this->file_name)); + this->file_name[sizeof(this->file_name)-1] = '\0'; + } + else + { + this->file_name[0] = '\0'; + } + } + + const char*get_truncated_file_name()const + { + return file_name; + } + + void set_file_line(unsigned file_line) + { + this->file_line = file_line; + } + + unsigned get_file_line()const + { + return file_line; + } + + void set_csv_def(char& row_delimit, 
char& col_delimit, char& quote_char, char& escp_char, bool& cmnt_empty_line, std::vector<char>& comment_chars , std::vector<char>& trim_chars)
+    {
+        row_delimiter = row_delimit;
+        col_delimiter = col_delimit;
+        quote = quote_char;
+        escape_char = escp_char;
+        comment_empty_line = cmnt_empty_line;
+        comment_characters.assign(comment_chars.begin(), comment_chars.end());
+        trim_characters.assign(trim_chars.begin(), trim_chars.end());
+    }
+
+    // Returns a pointer to the next row inside `buffer`, terminated in place
+    // with '\0', or nullptr once data_begin has reached data_end.
+    // Throws io::error::escaped_char_missing when an escape character is the
+    // last byte of the data, io::error::escaped_string_not_closed when a
+    // quoted section has no closing quote before data_end, and
+    // io::error::line_length_limit_exceeded when the row does not fit in
+    // block_len bytes.
+    char*next_line()
+    {
+        if(data_begin == data_end)
+            return nullptr;
+
+        ++file_line;
+
+        assert(data_begin < data_end);
+        assert(data_end <= block_len*2);
+
+        // Once the first block is consumed, slide the remaining data down by
+        // one block and pull in whatever the reader has fetched meanwhile.
+        if(data_begin >= block_len)
+        {
+            std::memcpy(buffer.get(), buffer.get()+block_len, block_len);
+            data_begin -= block_len;
+            data_end -= block_len;
+            if(reader.is_valid())
+            {
+                data_end += reader.finish_read();
+                std::memcpy(buffer.get()+block_len, buffer.get()+2*block_len, block_len);
+                reader.start_read(buffer.get() + 2*block_len, block_len);
+            }
+        }
+
+        int line_end = data_begin;
+        while(line_end != data_end && buffer[line_end] != row_delimiter)
+        {
+            if(buffer[line_end] == quote || buffer[line_end] == escape_char)
+            {
+                if(buffer[line_end] == escape_char)
+                {
+                    ++line_end;
+                    if(line_end == data_end)
+                    {
+                        throw io::error::escaped_char_missing();
+                    }
+                    // Only look at the byte after '\r' when it is still part of
+                    // the data; bytes at data_end and beyond are leftovers.
+                    else if(buffer[line_end] == '\r' && line_end + 1 != data_end && buffer[line_end + 1] == '\n') // handle windows \r\n-line breaks
+                    {
+                        ++line_end;
+                    }
+                }
+                else
+                {
+                    ++line_end;
+                    // Compare against data_end before dereferencing: a leftover
+                    // quote byte at buffer[data_end] must not be taken for the
+                    // closing quote, otherwise the outer loop steps past
+                    // data_end and never terminates on the bound.
+                    while(line_end != data_end && buffer[line_end] != quote)
+                        ++line_end;
+                    if(line_end == data_end)
+                        throw io::error::escaped_string_not_closed();
+                }
+            }
+            ++line_end;
+        }
+
+        if(line_end - data_begin + 1 > block_len)
+        {
+            io::error::line_length_limit_exceeded err;
+            err.set_file_name(file_name);
+            err.set_file_line(file_line);
+            throw err;
+        }
+
+        if(line_end != data_end && buffer[line_end] == row_delimiter)
+        {
+            buffer[line_end] = '\0';
+        }
+        else
+        {
+            // some files are missing the newline at the end of the
+            // last line
+            ++data_end;
+            buffer[line_end] = '\0';
+        }
+
+        // handle
windows \r\n-line breaks + if(row_delimiter == '\n') + { + if(line_end != data_begin && buffer[line_end-1] == '\r') + buffer[line_end-1] = '\0'; + } + + char*ret = buffer.get() + data_begin; + data_begin = line_end+1; + return ret; + } + + bool read_row(std::vector<char*>& cols) + { + try{ + try{ + char*line; + do{ + line = next_line(); + if(!line) + return false; + }while(io::detail::is_comment(line, comment_empty_line, comment_characters)); + + io::detail::parse_line(line, cols, col_delimiter, quote, escape_char, trim_characters); + + }catch(io::error::with_file_name&err){ + err.set_file_name(get_truncated_file_name()); + throw; + } + }catch(io::error::with_file_line&err){ + err.set_file_line(get_file_line()); + throw; + } + + return true; + } +}; diff --git a/src/s3select/include/s3select_functions.h b/src/s3select/include/s3select_functions.h new file mode 100644 index 000000000..8c507fca1 --- /dev/null +++ b/src/s3select/include/s3select_functions.h @@ -0,0 +1,2703 @@ +#ifndef __S3SELECT_FUNCTIONS__ +#define __S3SELECT_FUNCTIONS__ + + +#include "s3select_oper.h" +#include <boost/algorithm/string.hpp> +#include <boost/algorithm/string/trim.hpp> +#include <boost/regex.hpp> +#include <algorithm> + +using namespace std::string_literals; + +#define BOOST_BIND_ACTION_PARAM( push_name ,param ) boost::bind( &push_name::operator(), g_ ## push_name , _1 ,_2, param) +namespace s3selectEngine +{ + +constexpr double sec_scale(int n) +{ + return pow(10, n); +} + +struct push_char +{ + void operator()(const char* a, const char* b, uint32_t* n) const + { + *n = *a; + } + +}; +static push_char g_push_char; + +struct push_2dig +{ + void operator()(const char* a, const char* b, uint32_t* n) const + { + *n = (static_cast<char>(*a) - 48) * 10 + (static_cast<char>(*(a+1)) - 48) ; + } + +}; +static push_2dig g_push_2dig; + +struct push_4dig +{ + void operator()(const char* a, const char* b, uint32_t* n) const + { + *n = (static_cast<char>(*a) - 48) * 1000 + 
(static_cast<char>(*(a+1)) - 48) * 100 + (static_cast<char>(*(a+2)) - 48) * 10 + (static_cast<char>(*(a+3)) - 48); + } + +}; +static push_4dig g_push_4dig; + +struct push_1fdig +{ + void operator()(const char* a, const char* b, uint32_t* n) const + { + #if BOOST_DATE_TIME_POSIX_TIME_STD_CONFIG + const double scale = sec_scale(9-1); //nano-sec + #else + const double scale = sec_scale(6-1); //micro-sec + #endif + + *n = ((static_cast<char>(*a) - 48)) * scale; + } + +}; +static push_1fdig g_push_1fdig; + +struct push_2fdig +{ + void operator()(const char* a, const char* b, uint32_t* n) const + { + #if BOOST_DATE_TIME_POSIX_TIME_STD_CONFIG + const double scale = sec_scale(9-2); //nano-sec + #else + const double scale = sec_scale(6-2); //micro-sec + #endif + + *n = ((static_cast<char>(*a) - 48) * 10 + (static_cast<char>(*(a+1)) - 48)) * scale; + } + +}; +static push_2fdig g_push_2fdig; + +struct push_3fdig +{ + void operator()(const char* a, const char* b, uint32_t* n) const + { + #if BOOST_DATE_TIME_POSIX_TIME_STD_CONFIG + const double scale = sec_scale(9-3); //nano-sec + #else + const double scale = sec_scale(6-3); //micro-sec + #endif + + *n = ((static_cast<char>(*a) - 48) * 100 + (static_cast<char>(*(a+1)) - 48) * 10 + (static_cast<char>(*(a+2)) - 48)) * scale; + } + +}; +static push_3fdig g_push_3fdig; + +struct push_4fdig +{ + void operator()(const char* a, const char* b, uint32_t* n) const + { + #if BOOST_DATE_TIME_POSIX_TIME_STD_CONFIG + const double scale = sec_scale(9-4); //nano-sec + #else + const double scale = sec_scale(6-4); //micro-sec + #endif + + *n = ((static_cast<char>(*a) - 48) * 1000 + (static_cast<char>(*(a+1)) - 48) * 100 + (static_cast<char>(*(a+2)) - 48) * 10 + (static_cast<char>(*(a+3)) - 48)) * scale; + } + +}; +static push_4fdig g_push_4fdig; + +struct push_5fdig +{ + void operator()(const char* a, const char* b, uint32_t* n) const + { + #if BOOST_DATE_TIME_POSIX_TIME_STD_CONFIG + const double scale = sec_scale(9-5); //nano-sec + #else + 
const double scale = sec_scale(6-5); //micro-sec + #endif + + *n = ((static_cast<char>(*a) - 48) * 10000 + (static_cast<char>(*(a+1)) - 48) * 1000 + (static_cast<char>(*(a+2)) - 48) * 100 + (static_cast<char>(*(a+3)) - 48) * 10 + (static_cast<char>(*(a+4)) - 48)) * scale; + } + +}; +static push_5fdig g_push_5fdig; + +struct push_6fdig +{ + void operator()(const char* a, const char* b, uint32_t* n) const + { + #if BOOST_DATE_TIME_POSIX_TIME_STD_CONFIG + const double scale = sec_scale(9-6); //nano-sec + #else + const double scale = sec_scale(6-6); //micro-sec + #endif + + *n = ((static_cast<char>(*a) - 48) * 100000 + (static_cast<char>(*(a+1)) - 48) * 10000 + (static_cast<char>(*(a+2)) - 48) * 1000 + (static_cast<char>(*(a+3)) - 48) * 100 + (static_cast<char>(*(a+4)) - 48) * 10 + (static_cast<char>(*(a+5)) - 48)) * scale; + } + +}; +static push_6fdig g_push_6fdig; + +#if BOOST_DATE_TIME_POSIX_TIME_STD_CONFIG + struct push_7fdig + { + void operator()(const char* a, const char* b, uint32_t* n) const + { + const double scale = sec_scale(9-7); //nano-sec + *n = ((static_cast<char>(*a) - 48) * 1000000 + (static_cast<char>(*(a+1)) - 48) * 100000 + (static_cast<char>(*(a+2)) - 48) * 10000 + (static_cast<char>(*(a+3)) - 48) * 1000 + (static_cast<char>(*(a+4)) - 48) * 100 + (static_cast<char>(*(a+5)) - 48) * 10 + (static_cast<char>(*(a+6)) - 48)) * scale; + } + + }; + static push_7fdig g_push_7fdig; + + struct push_8fdig + { + void operator()(const char* a, const char* b, uint32_t* n) const + { + const double scale = sec_scale(9-8); //nano-sec + *n = ((static_cast<char>(*a) - 48) * 10000000 + (static_cast<char>(*(a+1)) - 48) * 1000000 + (static_cast<char>(*(a+2)) - 48) * 100000 + (static_cast<char>(*(a+3)) - 48) * 10000 + (static_cast<char>(*(a+4)) - 48) * 1000 + (static_cast<char>(*(a+5)) - 48) * 100 + (static_cast<char>(*(a+6)) - 48) * 10 + (static_cast<char>(*(a+7)) - 48)) * scale; + } + + }; + static push_8fdig g_push_8fdig; + + struct push_9fdig + { + void 
operator()(const char* a, const char* b, uint32_t* n) const + { + const double scale = sec_scale(9-9); //nano-sec + *n = ((static_cast<char>(*a) - 48) * 100000000 + (static_cast<char>(*(a+1)) - 48) * 10000000 + (static_cast<char>(*(a+2)) - 48) * 1000000 + (static_cast<char>(*(a+3)) - 48) * 100000 + (static_cast<char>(*(a+4)) - 48) * 10000 + (static_cast<char>(*(a+5)) - 48) * 1000 + (static_cast<char>(*(a+6)) - 48) * 100 + (static_cast<char>(*(a+7)) - 48) * 10 + (static_cast<char>(*(a+8)) - 48)) * scale; + } + + }; + static push_9fdig g_push_9fdig; +#endif + +enum class s3select_func_En_t {ADD, + SUM, + AVG, + MIN, + MAX, + COUNT, + TO_INT, + TO_FLOAT, + TO_TIMESTAMP, + TO_STRING_CONSTANT, + TO_STRING_DYNAMIC, + TO_BOOL, + SUBSTR, + EXTRACT_YEAR, + EXTRACT_MONTH, + EXTRACT_DAY, + EXTRACT_HOUR, + EXTRACT_MINUTE, + EXTRACT_SECOND, + EXTRACT_WEEK, + EXTRACT_TIMEZONE_HOUR, + EXTRACT_TIMEZONE_MINUTE, + DATE_ADD_YEAR, + DATE_ADD_MONTH, + DATE_ADD_DAY, + DATE_ADD_HOUR, + DATE_ADD_MINUTE, + DATE_ADD_SECOND, + DATE_DIFF_YEAR, + DATE_DIFF_MONTH, + DATE_DIFF_DAY, + DATE_DIFF_HOUR, + DATE_DIFF_MINUTE, + DATE_DIFF_SECOND, + UTCNOW, + LENGTH, + LOWER, + UPPER, + NULLIF, + BETWEEN, + NOT_BETWEEN, + IS_NULL, + IS_NOT_NULL, + IN, + LIKE, + VERSION, + CASE_WHEN_ELSE, + WHEN_THEN, + WHEN_VALUE_THEN, + COALESCE, + STRING, + TRIM, + LEADING, + TRAILING, + DECIMAL_OPERATOR, + CAST_TO_DECIMAL, + ENGINE_VERSION + }; + + +class s3select_functions +{ + +private: + + using FunctionLibrary = std::map<std::string, s3select_func_En_t>; + s3select_allocator* m_s3select_allocator; + std::set<base_statement*>* m_ast_nodes_for_cleanup; + + const FunctionLibrary m_functions_library = + { + {"add", s3select_func_En_t::ADD}, + {"sum", s3select_func_En_t::SUM}, + {"avg", s3select_func_En_t::AVG}, + {"count", s3select_func_En_t::COUNT}, + {"min", s3select_func_En_t::MIN}, + {"max", s3select_func_En_t::MAX}, + {"int", s3select_func_En_t::TO_INT}, + {"float", s3select_func_En_t::TO_FLOAT}, + {"substring", 
s3select_func_En_t::SUBSTR}, + {"to_timestamp", s3select_func_En_t::TO_TIMESTAMP}, + {"#to_string_constant#",s3select_func_En_t::TO_STRING_CONSTANT}, + {"#to_string_dynamic#",s3select_func_En_t::TO_STRING_DYNAMIC}, + {"to_bool", s3select_func_En_t::TO_BOOL}, + {"#extract_year#", s3select_func_En_t::EXTRACT_YEAR}, + {"#extract_month#", s3select_func_En_t::EXTRACT_MONTH}, + {"#extract_day#", s3select_func_En_t::EXTRACT_DAY}, + {"#extract_hour#", s3select_func_En_t::EXTRACT_HOUR}, + {"#extract_minute#", s3select_func_En_t::EXTRACT_MINUTE}, + {"#extract_second#", s3select_func_En_t::EXTRACT_SECOND}, + {"#extract_week#", s3select_func_En_t::EXTRACT_WEEK}, + {"#extract_timezone_hour#", s3select_func_En_t::EXTRACT_TIMEZONE_HOUR}, + {"#extract_timezone_minute#", s3select_func_En_t::EXTRACT_TIMEZONE_MINUTE}, + {"#dateadd_year#", s3select_func_En_t::DATE_ADD_YEAR}, + {"#dateadd_month#", s3select_func_En_t::DATE_ADD_MONTH}, + {"#dateadd_day#", s3select_func_En_t::DATE_ADD_DAY}, + {"#dateadd_hour#", s3select_func_En_t::DATE_ADD_HOUR}, + {"#dateadd_minute#", s3select_func_En_t::DATE_ADD_MINUTE}, + {"#dateadd_second#", s3select_func_En_t::DATE_ADD_SECOND}, + {"#datediff_year#", s3select_func_En_t::DATE_DIFF_YEAR}, + {"#datediff_month#", s3select_func_En_t::DATE_DIFF_MONTH}, + {"#datediff_day#", s3select_func_En_t::DATE_DIFF_DAY}, + {"#datediff_hour#", s3select_func_En_t::DATE_DIFF_HOUR}, + {"#datediff_minute#", s3select_func_En_t::DATE_DIFF_MINUTE}, + {"#datediff_second#", s3select_func_En_t::DATE_DIFF_SECOND}, + {"utcnow", s3select_func_En_t::UTCNOW}, + {"character_length", s3select_func_En_t::LENGTH}, + {"char_length", s3select_func_En_t::LENGTH}, + {"lower", s3select_func_En_t::LOWER}, + {"upper", s3select_func_En_t::UPPER}, + {"nullif", s3select_func_En_t::NULLIF}, + {"#between#", s3select_func_En_t::BETWEEN}, + {"#not_between#", s3select_func_En_t::NOT_BETWEEN}, + {"#is_null#", s3select_func_En_t::IS_NULL}, + {"#is_not_null#", s3select_func_En_t::IS_NOT_NULL}, + 
{"#in_predicate#", s3select_func_En_t::IN}, + {"#like_predicate#", s3select_func_En_t::LIKE}, + {"version", s3select_func_En_t::VERSION}, + {"#when-then#", s3select_func_En_t::WHEN_THEN}, + {"#when-value-then#", s3select_func_En_t::WHEN_VALUE_THEN}, + {"#case-when-else#", s3select_func_En_t::CASE_WHEN_ELSE}, + {"coalesce", s3select_func_En_t::COALESCE}, + {"string", s3select_func_En_t::STRING}, + {"#trim#", s3select_func_En_t::TRIM}, + {"#leading#", s3select_func_En_t::LEADING}, + {"#trailing#", s3select_func_En_t::TRAILING}, + {"#decimal_operator#", s3select_func_En_t::DECIMAL_OPERATOR}, + {"#cast_as_decimal#", s3select_func_En_t::CAST_TO_DECIMAL}, + {"engine_version", s3select_func_En_t::ENGINE_VERSION} + + }; + +public: + + base_function* create(std::string_view fn_name,const bs_stmt_vec_t&); + + s3select_functions():m_s3select_allocator(nullptr),m_ast_nodes_for_cleanup(nullptr) + { + } + + + void setAllocator(s3select_allocator* alloc) + { + m_s3select_allocator = alloc; + } + + void set_AST_nodes_for_cleanup(std::set<base_statement*>* ast_for_cleanup) + { + m_ast_nodes_for_cleanup = ast_for_cleanup; + } + + s3select_allocator* getAllocator() + { + return m_s3select_allocator; + } + + void clean(); + +}; + +class __function : public base_statement +{ + +private: + bs_stmt_vec_t arguments; + std::basic_string<char,std::char_traits<char>,ChunkAllocator<char,256>> name; + base_function* m_func_impl; + s3select_functions* m_s3select_functions; + variable m_result; + bool m_is_aggregate_function; + + void _resolve_name() + { + if (m_func_impl) + { + return; + } + + auto string_to_lower = [](std::basic_string<char,std::char_traits<char>,ChunkAllocator<char,256>> s) + { + std::transform(s.begin(),s.end(),s.begin(),[](unsigned char c){ return std::tolower(c); }); + return s; + }; + + //the function name is converted into lowercase to enable case-insensitive + base_function* f = m_s3select_functions->create(string_to_lower(name),arguments); + if (!f) + { + throw 
base_s3select_exception("function not found", base_s3select_exception::s3select_exp_en_t::FATAL); //should abort query + } + m_func_impl = f; + m_is_aggregate_function= m_func_impl->is_aggregate(); + f->set_function_name(name.c_str()); + } + +public: + + base_function* impl() + { + return m_func_impl; + } + + void traverse_and_apply(scratch_area* sa, projection_alias* pa,bool json_statement) override + { + m_scratch = sa; + m_aliases = pa; + m_json_statement = json_statement; + for (base_statement* ba : arguments) + { + ba->traverse_and_apply(sa, pa, json_statement); + } + } + + void set_last_call() override + {//it cover the use-case where aggregation function is an argument in non-aggregate function. + is_last_call = true; + for (auto& ba : arguments) + { + ba->set_last_call(); + } + } + + void set_skip_non_aggregate(bool skip_non_aggregate_op) override + {//it cover the use-case where aggregation function is an argument in non-aggregate function. + m_skip_non_aggregate_op = skip_non_aggregate_op; + for (auto& ba : arguments) + { + ba->set_skip_non_aggregate(m_skip_non_aggregate_op); + } + } + + bool is_aggregate() const override + { + return m_is_aggregate_function; + } + + bool semantic() override + { + return true; + } + + __function(const char* fname, s3select_functions* s3f) : name(fname), m_func_impl(nullptr), m_s3select_functions(s3f),m_is_aggregate_function(false){set_operator_name(fname);} + + value& eval() override + { + return eval_internal(); + } + + value& eval_internal() override + { + + _resolve_name();//node is "resolved" (function is created) upon first call/first row. + + if (is_last_call == false) + {//all rows prior to last row + if(m_skip_non_aggregate_op == false || is_aggregate() == true) + { + (*m_func_impl)(&arguments, &m_result); + } + else if(m_skip_non_aggregate_op == true) + { + for(auto& p : arguments) + {//evaluating the arguments (not the function itself, which is a non-aggregate function) + //i.e. 
in the following use case substring( , sum(),count() ) ; only sum() and count() are evaluated.
+          p->eval();
+        }
+      }
+    }
+    else
+    {//on the last row, the aggregate function is finalized,
+      //and non-aggregate function is evaluated with the result of aggregate function.
+      if(is_aggregate())
+        (*m_func_impl).get_aggregate_result(&m_result);
+      else
+        (*m_func_impl)(&arguments, &m_result);
+    }
+
+    return m_result.get_value();
+  }
+
+  // Resolves this node's function implementation, then every argument node.
+  void resolve_node() override
+  {
+    _resolve_name();
+
+    for (auto& arg : arguments)
+    {
+      arg->resolve_node();
+    }
+  }
+
+  // Returns an empty string; `ident` is not used.
+  // The previous `std::string(0)` picked the `const char*` constructor with a
+  // null pointer, which is undefined behaviour.
+  std::string print(int ident) override
+  {
+    return std::string();
+  }
+
+  // Appends `arg` to this function's argument list.
+  void push_argument(base_statement* arg)
+  {
+    arguments.push_back(arg);
+  }
+
+
+  bs_stmt_vec_t& get_arguments()
+  {
+    return arguments;
+  }
+
+  virtual ~__function() = default;
+};
+
+/*
+ s3-select function definitions
+*/
+// add(x, y): evaluates both arguments and stores x + y into `result`.
+struct _fn_add : public base_function
+{
+
+  value var_result;
+
+  bool operator()(bs_stmt_vec_t* args, variable* result) override
+  {
+    check_args_size(args,2);
+
+    auto iter = args->begin();
+    base_statement* x = *iter;
+    iter++;
+    base_statement* y = *iter;
+
+    var_result = x->eval() + y->eval();
+
+    *result = var_result;
+
+    return true;
+  }
+};
+
+// sum(x): aggregate; starts as null and accumulates x per call.
+struct _fn_sum : public base_function
+{
+
+  value sum;
+
+  _fn_sum()
+  {
+    aggregate = true;
+    sum.setnull();
+  }
+
+  bool operator()(bs_stmt_vec_t* args, variable* result) override
+  {
+    check_args_size(args,1);
+
+    auto iter = args->begin();
+    base_statement* x = *iter;
+
+    try
+    {
+      if(sum.is_null())
+      {
+        sum = 0;
+      }
+      sum = sum + x->eval();
+    }
+    catch (base_s3select_exception& e)
+    {
+      // only FATAL errors propagate; any other severity is dropped here
+      if (e.severity() == base_s3select_exception::s3select_exp_en_t::FATAL)
+      {
+        throw;
+      }
+    }
+
+    return true;
+  }
+
+  void get_aggregate_result(variable* result) override
+  {
+    *result = sum ;
+  }
+};
+
+struct _fn_count : public base_function
+{
+
+  int64_t count;
+
+  _fn_count():count(0)
+  {
+    aggregate=true;
+  }
+
+  bool operator()(bs_stmt_vec_t* args, variable* result) override
+  {
+
if (args->size()) + {// in case argument exist, should count only non-null. + auto iter = args->begin(); + base_statement* x = *iter; + + if(!x->eval().is_null()) + { + count += 1; + } + } + else + {//in case of non-arguments // count() + count += 1; + } + + return true; + } + + void get_aggregate_result(variable* result) override + { + result->set_value(count); + } + +}; + +struct _fn_avg : public base_function +{ + + value sum; + value count{0.0}; + + _fn_avg() : sum(0) { aggregate = true; } + + bool operator()(bs_stmt_vec_t* args, variable *result) override + { + check_args_size(args,1); + + auto iter = args->begin(); + base_statement *x = *iter; + + try + { + sum = sum + x->eval(); + count++; + } + catch (base_s3select_exception &e) + { + throw base_s3select_exception(e.what()); + } + + return true; + } + + void get_aggregate_result(variable *result) override + { + if(count == static_cast<value>(0)) { + value v_null; + v_null.setnull(); + *result=v_null; + } else { + *result = sum/count ; + } + } +}; + +struct _fn_min : public base_function +{ + + value min; + + _fn_min() + { + aggregate=true; + min.setnull(); + } + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + auto iter = args->begin(); + base_statement* x = *iter; + + if(min.is_null() || min > x->eval()) + { + min=x->eval(); + } + + return true; + } + + void get_aggregate_result(variable* result) override + { + *result = min; + } + +}; + +struct _fn_max : public base_function +{ + + value max; + + _fn_max() + { + aggregate=true; + max.setnull(); + } + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + auto iter = args->begin(); + base_statement* x = *iter; + + if(max.is_null() || max < x->eval()) + { + max=x->eval(); + } + + return true; + } + + void get_aggregate_result(variable* result) override + { + *result = max; + } + +}; + +struct _fn_to_int : public base_function +{ + value var_result; + + bool 
operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + value v = (*args->begin())->eval(); + + switch (v.type) { + + case value::value_En_t::STRING: + { + char* pend; + errno = 0; + int64_t i= strtol(v.str(), &pend, 10); + if (errno == ERANGE) { + throw base_s3select_exception("converted value would fall out of the range of the result type!"); + } + if (pend == v.str()) { + // no number found + throw base_s3select_exception("text cannot be converted to a number"); + } + if (*pend) { + throw base_s3select_exception("extra characters after the number"); + } + + var_result = i; + } + break; + + case value::value_En_t::FLOAT: + var_result = static_cast<int64_t>(v.dbl()); + break; + + default: + var_result = v.i64(); + break; + } + + *result = var_result; + return true; + } + +}; + +struct _fn_to_float : public base_function +{ + value var_result; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + value v = (*args->begin())->eval(); + + switch (v.type) { + + case value::value_En_t::STRING: + { + char* pend; + double d = strtod(v.str(), &pend); + if (errno == ERANGE) { + throw base_s3select_exception("converted value would fall out of the range of the result type!"); + } + if (pend == v.str()) { + // no number found + throw base_s3select_exception("text cannot be converted to a number"); + } + if (*pend) { + throw base_s3select_exception("extra characters after the number"); + } + + var_result = d; + } + break; + + case value::value_En_t::FLOAT: + var_result = v.dbl(); + break; + + default: + var_result = v.i64(); + break; + } + + *result = var_result; + return true; + } + +}; + +struct _fn_to_timestamp : public base_function +{ + bsc::rule<> date_separator = bsc::ch_p("-"); + bsc::rule<> time_separator = bsc::ch_p(":"); + bsc::rule<> nano_sec_separator = bsc::ch_p("."); + bsc::rule<> delimiter = bsc::ch_p("T"); + bsc::rule<> zero_timezone = bsc::ch_p("Z"); + bsc::rule<> 
timezone_sign = bsc::ch_p("-") | bsc::ch_p("+"); + + uint32_t yr = 1700, mo = 1, dy = 1; + bsc::rule<> dig4 = bsc::lexeme_d[bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p]; + bsc::rule<> dig2 = bsc::lexeme_d[bsc::digit_p >> bsc::digit_p]; + + bsc::rule<> d_yyyy_dig = ((dig4[BOOST_BIND_ACTION_PARAM(push_4dig, &yr)]) >> *(delimiter)); + bsc::rule<> d_yyyymmdd_dig = ((dig4[BOOST_BIND_ACTION_PARAM(push_4dig, &yr)]) >> *(date_separator) + >> (dig2[BOOST_BIND_ACTION_PARAM(push_2dig, &mo)]) >> *(date_separator) + >> (dig2[BOOST_BIND_ACTION_PARAM(push_2dig, &dy)]) >> *(delimiter)); + + uint32_t hr = 0, mn = 0, sc = 0, frac_sec = 0, tz_hr = 0, tz_mn = 0, sign = 0, tm_zone = '0'; + + #if BOOST_DATE_TIME_POSIX_TIME_STD_CONFIG + bsc::rule<> fdig9 = bsc::lexeme_d[bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p]; + bsc::rule<> fdig8 = bsc::lexeme_d[bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p]; + bsc::rule<> fdig7 = bsc::lexeme_d[bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p]; + #endif + + bsc::rule<> fdig6 = bsc::lexeme_d[bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p]; + bsc::rule<> fdig5 = bsc::lexeme_d[bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p]; + bsc::rule<> fdig4 = bsc::lexeme_d[bsc::digit_p >> bsc::digit_p >> bsc::digit_p >> bsc::digit_p]; + bsc::rule<> fdig3 = bsc::lexeme_d[bsc::digit_p >> bsc::digit_p >> bsc::digit_p]; + bsc::rule<> fdig2 = bsc::lexeme_d[bsc::digit_p >> bsc::digit_p]; + bsc::rule<> fdig1 = bsc::lexeme_d[bsc::digit_p]; + + bsc::rule<> d_timezone_dig = ((timezone_sign[BOOST_BIND_ACTION_PARAM(push_char, &sign)]) >> (dig2[BOOST_BIND_ACTION_PARAM(push_2dig, &tz_hr)]) >> *(time_separator) + >> (dig2[BOOST_BIND_ACTION_PARAM(push_2dig, 
&tz_mn)])) | (zero_timezone[BOOST_BIND_ACTION_PARAM(push_char, &tm_zone)]); + + #if BOOST_DATE_TIME_POSIX_TIME_STD_CONFIG + bsc::rule<> fraction_sec = (fdig9[BOOST_BIND_ACTION_PARAM(push_9fdig, &frac_sec)]) | + (fdig8[BOOST_BIND_ACTION_PARAM(push_8fdig, &frac_sec)]) | + (fdig7[BOOST_BIND_ACTION_PARAM(push_7fdig, &frac_sec)]) | + (fdig6[BOOST_BIND_ACTION_PARAM(push_6fdig, &frac_sec)]) | + (fdig5[BOOST_BIND_ACTION_PARAM(push_5fdig, &frac_sec)]) | + (fdig4[BOOST_BIND_ACTION_PARAM(push_4fdig, &frac_sec)]) | + (fdig3[BOOST_BIND_ACTION_PARAM(push_3fdig, &frac_sec)]) | + (fdig2[BOOST_BIND_ACTION_PARAM(push_2fdig, &frac_sec)]) | + (fdig1[BOOST_BIND_ACTION_PARAM(push_1fdig, &frac_sec)]); + #else + bsc::rule<> fraction_sec = (fdig6[BOOST_BIND_ACTION_PARAM(push_6fdig, &frac_sec)]) | + (fdig5[BOOST_BIND_ACTION_PARAM(push_5fdig, &frac_sec)]) | + (fdig4[BOOST_BIND_ACTION_PARAM(push_4fdig, &frac_sec)]) | + (fdig3[BOOST_BIND_ACTION_PARAM(push_3fdig, &frac_sec)]) | + (fdig2[BOOST_BIND_ACTION_PARAM(push_2fdig, &frac_sec)]) | + (fdig1[BOOST_BIND_ACTION_PARAM(push_1fdig, &frac_sec)]); + #endif + + bsc::rule<> d_time_dig = ((dig2[BOOST_BIND_ACTION_PARAM(push_2dig, &hr)]) >> *(time_separator) + >> (dig2[BOOST_BIND_ACTION_PARAM(push_2dig, &mn)]) >> *(time_separator) + >> (dig2[BOOST_BIND_ACTION_PARAM(push_2dig, &sc)]) >> *(nano_sec_separator) + >> (fraction_sec) >> (d_timezone_dig)) | + ((dig2[BOOST_BIND_ACTION_PARAM(push_2dig, &hr)]) >> *(time_separator) + >> (dig2[BOOST_BIND_ACTION_PARAM(push_2dig, &mn)]) >> *(time_separator) + >> (dig2[BOOST_BIND_ACTION_PARAM(push_2dig, &sc)]) >> (d_timezone_dig)) | + ((dig2[BOOST_BIND_ACTION_PARAM(push_2dig, &hr)]) >> *(time_separator) + >> (dig2[BOOST_BIND_ACTION_PARAM(push_2dig, &mn)]) >> (d_timezone_dig)); + + bsc::rule<> d_date_time = ((d_yyyymmdd_dig) >> (d_time_dig)) | (d_yyyymmdd_dig) | (d_yyyy_dig); + + timestamp_t tmstmp; + value v_str; + int tz_hour, tz_min; + + bool datetime_validation() + { + if (yr >= 1400 && yr <= 9999 && mo >= 1 && mo 
<= 12 && dy >= 1 && hr < 24 && mn < 60 && sc < 60 && tz_hour <= 14 && tz_hour >= -12 && tz_mn < 60) + { + if ( (tz_hour == -12 || tz_hour == 14) && tz_mn > 0) + return false; + + switch (mo) + { + case 1: + case 3: + case 5: + case 7: + case 8: + case 10: + case 12: + if(dy <= 31) + { + return true; + } + break; + case 4: + case 6: + case 9: + case 11: + if(dy <= 30) + { + return true; + } + break; + case 2: + if(dy >= 28) + { + if(!(yr % 4) == 0 && dy > 28) + { + return false; + } + else if(!(yr % 100) == 0 && dy <= 29) + { + return true; + } + else if(!(yr % 400) == 0 && dy > 28) + { + return false; + } + else + { + return true; + } + } + else + { + return true; + } + break; + default: + return false; + break; + } + } + return false; + } + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + + hr = 0; + mn = 0; + sc = 0; + frac_sec = 0; + tz_hr = 0; + tz_mn = 0; + tm_zone = '0'; + + auto iter = args->begin(); + int args_size = args->size(); + + if (args_size != 1) + { + throw base_s3select_exception("to_timestamp should have one parameter"); + } + + base_statement* str = *iter; + + v_str = str->eval(); + + if (v_str.type != value::value_En_t::STRING) + { + throw base_s3select_exception("to_timestamp first argument must be string"); //can skip current row + } + + bsc::parse_info<> info_dig = bsc::parse(v_str.str(), d_date_time); + + tz_hour = tz_hr; + tz_min = tz_mn; + if ((char)sign == '-') + { + tz_hour *= -1; + tz_min *= -1; + } + + if(datetime_validation()==false or !info_dig.full) + { + throw base_s3select_exception("input date-time is illegal"); + } + + boost::posix_time::ptime new_ptime; + + #if BOOST_DATE_TIME_POSIX_TIME_STD_CONFIG + new_ptime = boost::posix_time::ptime(boost::gregorian::date(yr, mo, dy), + boost::posix_time::hours(hr) + + boost::posix_time::minutes(mn) + + boost::posix_time::seconds(sc) + + boost::posix_time::nanoseconds(frac_sec)); + #else + new_ptime = boost::posix_time::ptime(boost::gregorian::date(yr, mo, dy), + 
boost::posix_time::hours(hr) + + boost::posix_time::minutes(mn) + + boost::posix_time::seconds(sc) + + boost::posix_time::microseconds(frac_sec)); + #endif + + tmstmp = std::make_tuple(new_ptime, boost::posix_time::time_duration(tz_hour, tz_min, 0), (char)tm_zone == 'Z'); + + result->set_value(&tmstmp); + + return true; + } + +}; + +struct _fn_to_string_constant : public base_timestamp_to_string +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + if (!initialized) + { + prepare_to_string_vector(print_vector, para); + initialized = true; + } + + std::string result_ = execute_to_string(print_vector, para); + + result->set_value(result_.c_str()); + return true; + } +}; + +struct _fn_to_string_dynamic : public base_timestamp_to_string +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + print_vector.clear(); + para.clear(); + + prepare_to_string_vector(print_vector, para); + + std::string result_ = execute_to_string(print_vector, para); + + result->set_value(result_.c_str()); + return true; + } +}; + +struct _fn_extract_year_from_timestamp : public base_date_extract +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + result->set_value( (int64_t)new_ptime.date().year()); + return true; + } +}; + +struct _fn_extract_month_from_timestamp : public base_date_extract +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + result->set_value( (int64_t)new_ptime.date().month()); + return true; + } +}; + +struct _fn_extract_day_from_timestamp : public base_date_extract +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + result->set_value( (int64_t)new_ptime.date().day()); + return true; + } +}; + +struct _fn_extract_hour_from_timestamp : public base_date_extract +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { 
+ param_validation(args); + + result->set_value( (int64_t)new_ptime.time_of_day().hours()); + return true; + } +}; + +struct _fn_extract_minute_from_timestamp : public base_date_extract +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + result->set_value( (int64_t)new_ptime.time_of_day().minutes()); + return true; + } +}; + +struct _fn_extract_second_from_timestamp : public base_date_extract +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + result->set_value( (int64_t)new_ptime.time_of_day().seconds()); + return true; + } +}; + +struct _fn_extract_week_from_timestamp : public base_date_extract +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + result->set_value( (int64_t)new_ptime.date().week_number()); + return true; + } +}; + +struct _fn_extract_tz_hour_from_timestamp : public base_date_extract +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + result->set_value((int64_t)td.hours()); + return true; + } +}; + +struct _fn_extract_tz_minute_from_timestamp : public base_date_extract +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + result->set_value((int64_t)td.minutes()); + return true; + } +}; + +struct _fn_diff_year_timestamp : public base_date_diff +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + int year1 = ptime1.date().year(); + int year2 = ptime2.date().year(); + boost::posix_time::time_duration time1 = boost::posix_time::time_duration( + ptime1.time_of_day().hours(), ptime1.time_of_day().minutes(), + ptime1.time_of_day().seconds()); + boost::posix_time::time_duration time2 = boost::posix_time::time_duration( + ptime2.time_of_day().hours(), ptime2.time_of_day().minutes(), + ptime2.time_of_day().seconds()); + + if (year2 > year1 && 
((ptime2.date().day_of_year() < ptime1.date().day_of_year()) || + (ptime2.date().day_of_year() == ptime1.date().day_of_year() && time2 < time1))) + { + year2 -= 1; + } + else if (year2 < year1 && ((ptime2.date().day_of_year() > ptime1.date().day_of_year()) || + (ptime2.date().day_of_year() == ptime1.date().day_of_year() && time2 > time1))) + { + year2 += 1; + } + + int64_t yr = year2 - year1; + result->set_value( yr ); + return true; + } +}; + +struct _fn_diff_month_timestamp : public base_date_diff +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + int year1 = ptime1.date().year(); + int year2 = ptime2.date().year(); + int mon1 = ptime1.date().month(); + int mon2 = ptime2.date().month(); + boost::posix_time::time_duration time1 = boost::posix_time::time_duration( + ptime1.time_of_day().hours(), ptime1.time_of_day().minutes(), + ptime1.time_of_day().seconds()); + boost::posix_time::time_duration time2 = boost::posix_time::time_duration( + ptime2.time_of_day().hours(), ptime2.time_of_day().minutes(), + ptime2.time_of_day().seconds()); + + if (year2 > year1) + { + if (ptime2.date().day() < ptime1.date().day() || (ptime2.date().day() == ptime1.date().day() && time2 < time1)) + { + mon2 -= 1; + } + + if (ptime2.date().month() < ptime1.date().month()) + { + mon2 += 12; + year2 -= 1; + } + } + else if (year2 < year1) + { + if (ptime2.date().day() > ptime1.date().day() || (ptime2.date().day() == ptime1.date().day() && time2 > time1)) + { + mon1 -= 1; + } + + if (ptime2.date().month() > ptime1.date().month()) + { + mon1 += 12; + year1 -= 1; + } + } + + int64_t mon_diff = (year2 - year1) * 12 + mon2 - mon1; + + result->set_value(mon_diff); + return true; + } +}; + +struct _fn_diff_day_timestamp : public base_date_diff +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + boost::posix_time::time_duration td_res = ptime2 - ptime1; + int total_seconds = (((td_res.hours() * 60) 
+ td_res.minutes()) * 60) + td_res.seconds(); + int64_t days = total_seconds / (24 * 3600); + + result->set_value(days); + return true; + } +}; + +struct _fn_diff_hour_timestamp : public base_date_diff +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + boost::posix_time::time_duration td_res = ptime2 - ptime1; + result->set_value((int64_t)td_res.hours()); + return true; + } +}; + +struct _fn_diff_minute_timestamp : public base_date_diff +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + boost::posix_time::time_duration td_res = ptime2 - ptime1; + result->set_value((int64_t)((td_res.hours() * 60) + td_res.minutes())); + return true; + } +}; + +struct _fn_diff_second_timestamp : public base_date_diff +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + boost::posix_time::time_duration td_res = ptime2 - ptime1; + result->set_value((int64_t)((((td_res.hours() * 60) + td_res.minutes()) * 60) + td_res.seconds())); + return true; + } +}; + +struct _fn_add_year_to_timestamp : public base_date_add +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + new_ptime += boost::gregorian::years( val_quantity.i64() ); + new_tmstmp = std::make_tuple(new_ptime, td, flag); + result->set_value( &new_tmstmp ); + return true; + } +}; + +struct _fn_add_month_to_timestamp : public base_date_add +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + int yr, mn, dy, quant; + quant = val_quantity.i64(); + dy = new_ptime.date().day(); + + int temp = quant % 12; + mn = new_ptime.date().month() + temp; + temp = quant / 12; + yr = new_ptime.date().year() + temp; + + if (mn > 12) + { + yr += 1; + temp = mn % 12; + if (temp == 0) + { + temp = 12; + } + mn = temp; + } + else if (mn < 1) + { + yr -= 1; + if (mn == 0) + { + mn = 12; + } + else + { + mn = 
12 + mn; + } + } + + if ((mn == 4 || mn == 6 || mn == 9 || mn == 11) && dy > 30) + { + dy = 30; + } + else if (mn == 2 && dy > 28) + { + if (!(yr % 4) == 0 || ((yr % 100) == 0 && !(yr % 400) == 0)) + { + dy = 28; + } + else + { + dy = 29; + } + } + + #if BOOST_DATE_TIME_POSIX_TIME_STD_CONFIG + new_ptime = boost::posix_time::ptime(boost::gregorian::date(yr, mn, dy), + boost::posix_time::hours(new_ptime.time_of_day().hours()) + + boost::posix_time::minutes(new_ptime.time_of_day().minutes()) + + boost::posix_time::seconds(new_ptime.time_of_day().seconds()) + + boost::posix_time::nanoseconds(new_ptime.time_of_day().fractional_seconds())); + #else + new_ptime = boost::posix_time::ptime(boost::gregorian::date(yr, mn, dy), + boost::posix_time::hours(new_ptime.time_of_day().hours()) + + boost::posix_time::minutes(new_ptime.time_of_day().minutes()) + + boost::posix_time::seconds(new_ptime.time_of_day().seconds()) + + boost::posix_time::microseconds(new_ptime.time_of_day().fractional_seconds())); + #endif + + new_tmstmp = std::make_tuple(new_ptime, td, flag); + result->set_value( &new_tmstmp ); + return true; + } +}; + +struct _fn_add_day_to_timestamp : public base_date_add +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + new_ptime += boost::gregorian::days( val_quantity.i64() ); + new_tmstmp = std::make_tuple(new_ptime, td, flag); + result->set_value( &new_tmstmp ); + return true; + } +}; + +struct _fn_add_hour_to_timestamp : public base_date_add +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + new_ptime += boost::posix_time::hours( val_quantity.i64() ); + new_tmstmp = std::make_tuple(new_ptime, td, flag); + result->set_value( &new_tmstmp ); + return true; + } +}; + +struct _fn_add_minute_to_timestamp : public base_date_add +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + new_ptime += boost::posix_time::minutes( 
val_quantity.i64() ); + new_tmstmp = std::make_tuple(new_ptime, td, flag); + result->set_value( &new_tmstmp ); + return true; + } +}; + +struct _fn_add_second_to_timestamp : public base_date_add +{ + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + param_validation(args); + + new_ptime += boost::posix_time::seconds( val_quantity.i64() ); + new_tmstmp = std::make_tuple(new_ptime, td, flag); + result->set_value( &new_tmstmp ); + return true; + } +}; + +struct _fn_utcnow : public base_function +{ + timestamp_t now_timestamp; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + int args_size = args->size(); + + if (args_size != 0) + { + throw base_s3select_exception("utcnow does not expect any parameters"); + } + + boost::posix_time::ptime now_ptime = boost::posix_time::ptime( boost::posix_time::second_clock::universal_time()); + now_timestamp = std::make_tuple(now_ptime, boost::posix_time::time_duration(0, 0, 0), false); + result->set_value( &now_timestamp ); + + return true; + } +}; + +struct _fn_between : public base_function +{ + + value res; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + int args_size = args->size(); + + + if (args_size != 3) + { + throw base_s3select_exception("between operates on 3 expressions");//TODO FATAL + } + + auto iter = args->begin(); + + base_statement* second_expr = *iter; + iter++; + base_statement* first_expr = *iter; + iter++; + base_statement* main_expr = *iter; + + value second_expr_val = second_expr->eval(); + value first_expr_val = first_expr->eval(); + value main_expr_val = main_expr->eval(); + + if ((second_expr_val.type == first_expr_val.type && first_expr_val.type == main_expr_val.type) || (second_expr_val.is_number() && first_expr_val.is_number() && main_expr_val.is_number())) + { + if((main_expr_val >= first_expr_val) && (main_expr_val <= second_expr_val)) { + result->set_value(true); + } else { + result->set_value(false); + } + } + return true; + } +}; + 
+struct _fn_not_between : public base_function +{ + + value res; + _fn_between between_op; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + between_op(args,result); + + if (result->get_value().is_true() == 0) { + result->set_value(true); + } else { + result->set_value(false); + } + return true; + } +}; + +static char s3select_ver[10]="41.a"; + +struct _fn_version : public base_function +{ + value val; //TODO use git to generate sha1 + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + val = &s3select_ver[0]; + *result = val; + return true; + } +}; + +struct _fn_isnull : public base_function +{ + + value res; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + auto iter = args->begin(); + base_statement* expr = *iter; + value expr_val = expr->eval(); + if ( expr_val.is_null()) { + result->set_value(true); + } else { + result->set_value(false); + } + return true; + } +}; + +struct _fn_is_not_null : public base_function +{ + value res; + _fn_isnull isnull_op; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + + isnull_op(args,result); + + if (result->get_value().is_true() == 0) + result->set_value(true); + else + result->set_value(false); + + return true; + } +}; + +struct _fn_in : public base_function +{ + + value res; + + bool operator()(bs_stmt_vec_t *args, variable *result) override + { + check_args_size(args,1); + + int args_size = static_cast<int>(args->size()-1); + base_statement *main_expr = (*args)[args_size]; + value main_expr_val = main_expr->eval(); + args_size--; + while (args_size>=0) + { + base_statement *expr = (*args)[args_size]; + value expr_val = expr->eval(); + args_size--; + if ((expr_val.type == main_expr_val.type) || (expr_val.is_number() && main_expr_val.is_number())) + { + if (expr_val == main_expr_val) + { + result->set_value(true); + return true; + } + } + } + result->set_value(false); + return true; + } +}; + +struct 
_fn_like : public base_like +{ + explicit _fn_like(base_statement* esc, base_statement* like_expr) + { + auto is_constant = [&](base_statement* bs) { + if (dynamic_cast<variable*>(bs) && dynamic_cast<variable*>(bs)->m_var_type == variable::var_t::COLUMN_VALUE) { + return true; + } else { + return false; + } + }; + + if (is_constant(esc) && is_constant(like_expr)) { + constant_state = true; + } + + if(constant_state == true) + { + param_validation(esc, like_expr); + std::vector<char> like_as_regex = transform(like_expr_val.str(), *escape_expr_val.str()); + compile(like_as_regex); + } + } + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,3); + + auto iter = args->begin(); + + base_statement* escape_expr = *iter; + iter++; + base_statement* like_expr = *iter; + iter++; + base_statement* main_expr = *iter; + + if (constant_state == false) + { + param_validation(escape_expr, like_expr); + std::vector<char> like_as_regex = transform(like_expr_val.str(), *escape_expr_val.str()); + compile(like_as_regex); + } + + value main_expr_val = main_expr->eval(); + if (main_expr_val.type != value::value_En_t::STRING) + { + throw base_s3select_exception("main expression must be string"); + } + + match(main_expr_val, result); + return true; + } +}; + +struct _fn_substr : public base_function +{ + + char buff[4096];// this buffer is persist for the query life time, it use for the results per row(only for the specific function call) + //it prevent from intensive use of malloc/free (fragmentation). + //should validate result length. + //TODO may replace by std::string (dynamic) , or to replace with global allocator , in query scope. 
+ value v_str; + value v_from; + value v_to; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + auto iter = args->begin(); + int args_size = args->size(); + + + if (args_size<2) + { + throw base_s3select_exception("substr accept 2 arguments or 3"); + } + + base_statement* str = *iter; + iter++; + base_statement* from = *iter; + base_statement* to; + + if (args_size == 3) + { + iter++; + to = *iter; + v_to = to->eval(); + if (!v_to.is_number()) + { + throw base_s3select_exception("substr third argument must be number"); //can skip row + } + } + + v_str = str->eval(); + + if(v_str.type != value::value_En_t::STRING) + { + throw base_s3select_exception("substr first argument must be string"); //can skip current row + } + + int str_length = strlen(v_str.str()); + + v_from = from->eval(); + if(!v_from.is_number()) + { + throw base_s3select_exception("substr second argument must be number"); //can skip current row + } + + int64_t f; + int64_t t; + + if (v_from.type == value::value_En_t::FLOAT) + { + f=v_from.dbl(); + } + else + { + f=v_from.i64(); + } + + if (f <= 0 && args_size == 2) + { + f = 1; + } + + if (f>str_length) + { + result->set_value(""); + return true; + } + + if (str_length>(int)sizeof(buff)) + { + throw base_s3select_exception("string too long for internal buffer"); //can skip row + } + + if (args_size == 3) + { + if (v_to.type == value::value_En_t::FLOAT) + { + t = v_to.dbl(); + } + else + { + t = v_to.i64(); + } + + if (f <= 0) + { + t = t + f - 1; + f = 1; + } + + if (t<0) + { + t = 0; + } + + if (t > str_length) + { + t = str_length; + } + + if( (str_length-(f-1)-t) <0) + {//in case the requested length is too long, reduce it to exact length. 
+ t = str_length-(f-1); + } + + strncpy(buff, v_str.str()+f-1, t); + } + else + { + strcpy(buff, v_str.str()+f-1); + } + + result->set_value(buff); + + return true; + } +}; + +struct _fn_charlength : public base_function { + + value v_str; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + auto iter = args->begin(); + base_statement* str = *iter; + v_str = str->eval(); + if(v_str.type != value::value_En_t::STRING) { + throw base_s3select_exception("content is not string!"); + } else { + int64_t str_length = strlen(v_str.str()); + result->set_value(str_length); + return true; + } + } +}; + +struct _fn_lower : public base_function { + + std::string buff; + value v_str; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + auto iter = args->begin(); + base_statement* str = *iter; + v_str = str->eval(); + if(v_str.type != value::value_En_t::STRING) { + throw base_s3select_exception("content is not string"); + } else { + buff = v_str.str(); + boost::algorithm::to_lower(buff); + result->set_value(buff.c_str()); + return true; + } + } +}; + +struct _fn_upper : public base_function { + + std::string buff; + value v_str; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + auto iter = args->begin(); + base_statement* str = *iter; + v_str = str->eval(); + if(v_str.type != value::value_En_t::STRING) { + throw base_s3select_exception("content is not string"); + } else { + buff = v_str.str(); + boost::algorithm::to_upper(buff); + result->set_value(buff.c_str()); + return true; + } + } +}; + +struct _fn_nullif : public base_function { + + value x; + value y; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + auto iter = args->begin(); + + int args_size = args->size(); + if (args_size != 2) + { + throw base_s3select_exception("nullif accept only 2 arguments"); + } + base_statement *first = *iter; + x = 
first->eval(); + iter++; + base_statement *second = *iter; + y = second->eval(); + if (x.is_null() && y.is_null()) + { + result->set_null(); + return true; + } + if (x.is_null()) + { + result->set_null(); + return true; + } + if (!(x.is_number() && y.is_number())) { + if (x.type != y.type) { + *result = x; + return true; + } + } + if (x != y) { + *result = x; + } else { + result->set_null(); + } + return true; + } + }; + +struct _fn_when_then : public base_function { + + value when_value; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,2); + + auto iter = args->begin(); + + base_statement* then_expr = *iter; + iter ++; + + base_statement* when_expr = *iter; + + when_value = when_expr->eval(); + + if (when_value.is_true())//true + { + *result = then_expr->eval(); + return true; + } + + result->set_null(); + + return true; + } +}; + +struct _fn_when_value_then : public base_function { + + value when_value; + value case_value; + value then_value; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,3); + + auto iter = args->begin(); + + base_statement* then_expr = *iter; + iter++; + + base_statement* when_expr = *iter; + iter++; + + base_statement* case_expr = *iter; + + when_value = when_expr->eval(); + case_value = case_expr->eval(); + then_value = then_expr->eval(); + + if (case_value == when_value) + { + *result = then_value; + return true; + } + + result->set_null(); + return true; + } +}; + +struct _fn_case_when_else : public base_function { + + value when_then_value; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + base_statement* else_expr = *(args->begin()); + + size_t args_size = args->size() -1; + + for(int ivec=args_size;ivec>0;ivec--) + { + when_then_value = (*args)[ivec]->eval(); + + if(!when_then_value.is_null()) + { + *result = when_then_value; + return true; + } + + } + + *result = else_expr->eval(); + return 
true; + } +}; + +struct _fn_coalesce : public base_function +{ + + value res; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + auto iter_begin = args->begin(); + int args_size = args->size(); + while (args_size >= 1) + { + base_statement* expr = *iter_begin; + value expr_val = expr->eval(); + iter_begin++; + if ( !(expr_val.is_null())) { + *result = expr_val; + return true; + } + args_size--; + } + result->set_null(); + return true; + } +}; + +struct _fn_string : public base_function +{ + + value res; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + auto iter = args->begin(); + + base_statement* expr = *iter; + value expr_val = expr->eval(); + result->set_value((expr_val.to_string())); + return true; + } +}; + +struct _fn_to_bool : public base_function +{ + + value func_arg; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + int64_t i=0; + func_arg = (*args->begin())->eval(); + + if (func_arg.type == value::value_En_t::FLOAT) + { + i = func_arg.dbl(); + } + else if (func_arg.type == value::value_En_t::DECIMAL || func_arg.type == value::value_En_t::BOOL) + { + i = func_arg.i64(); + } + else + { + i = 0; + } + if (i == 0) + { + result->set_value(false); + } + else + { + result->set_value(true); + } + return true; + } +}; + +struct _fn_trim : public base_function { + + std::string input_string; + value v_remove; + value v_input; + + _fn_trim() + { + v_remove = " "; + } + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + auto iter = args->begin(); + int args_size = args->size(); + base_statement* str = *iter; + v_input = str->eval(); + if(v_input.type != value::value_En_t::STRING) { + throw base_s3select_exception("content is not string"); + } + input_string = v_input.str(); + if (args_size == 2) { + iter++; + base_statement* next = *iter; + v_remove = 
next->eval(); + } + boost::trim_right_if(input_string,boost::is_any_of(v_remove.str())); + boost::trim_left_if(input_string,boost::is_any_of(v_remove.str())); + result->set_value(input_string.c_str()); + return true; + } +}; + +struct _fn_leading : public base_function { + + std::string input_string; + value v_remove; + value v_input; + + _fn_leading() + { + v_remove = " "; + } + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + auto iter = args->begin(); + int args_size = args->size(); + base_statement* str = *iter; + v_input = str->eval(); + if(v_input.type != value::value_En_t::STRING) { + throw base_s3select_exception("content is not string"); + } + input_string = v_input.str(); + if (args_size == 2) { + iter++; + base_statement* next = *iter; + v_remove = next->eval(); + } + boost::trim_left_if(input_string,boost::is_any_of(v_remove.str())); + result->set_value(input_string.c_str()); + return true; + } +}; + +struct _fn_trailing : public base_function { + + std::string input_string; + value v_remove; + value v_input; + + _fn_trailing() + { + v_remove = " "; + } + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + check_args_size(args,1); + + auto iter = args->begin(); + int args_size = args->size(); + base_statement* str = *iter; + v_input = str->eval(); + if(v_input.type != value::value_En_t::STRING) { + throw base_s3select_exception("content is not string"); + } + input_string = v_input.str(); + if (args_size == 2) { + iter++; + base_statement* next = *iter; + v_remove = next->eval(); + } + boost::trim_right_if(input_string,boost::is_any_of(v_remove.str())); + result->set_value(input_string.c_str()); + return true; + } +}; + +struct _fn_cast_to_decimal : public base_function { + + int32_t precision=-1; + int32_t scale=-1; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + //cast(expr as decimal(x,y)) + check_args_size(args,2); + + base_statement* expr = (*args)[1]; 
+ //expr_val should be float or integer + //dynamic value for the decimal operator to get the precision and scale + + _fn_to_float to_float; + bs_stmt_vec_t args_vec; + args_vec.push_back(expr); + to_float(&args_vec,result); + + if (precision == -1 || scale == -1){ + base_statement* decimal_expr = (*args)[0]; + decimal_expr->eval().get_precision_scale(&precision,&scale); + } + + result->set_precision_scale(&precision,&scale); + + return true; + } +}; + +struct _fn_decimal_operator : public base_function { + + int32_t precision=-1; + int32_t scale=-1; + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + //decimal(x,y) operator + check_args_size(args,2); + + auto iter = args->begin(); + base_statement* expr_precision = *iter; + value expr_precision_val = expr_precision->eval(); + + iter++; + base_statement* expr_scale = *iter; + value expr_scale_val = expr_scale->eval(); + + precision = expr_precision_val.i64(); + scale = expr_scale_val.i64(); + + result->set_precision_scale(&precision,&scale); + + return true; + } +}; + +struct _fn_engine_version : public base_function { + + const char* version_description =R"(PR #137 : +the change handle the use cases where the JSON input starts with an anonymous array/object this may cause wrong search result per the user request(SQL statement) + +handle the use-case where the user requests a json-key-path that may point to a non-discrete value. i.e. array or an object. +editorial changes. + +fix for CSV flow, in the case of a "broken row" (upon processing stream of data) + +null results upon aggregation functions on an empty group (no match for where clause). 
+)"; + + + _fn_engine_version() + { + aggregate = true; + } + + bool operator()(bs_stmt_vec_t* args, variable* result) override + { + result->set_value(version_description); + return true; + } +}; + +base_function* s3select_functions::create(std::string_view fn_name,const bs_stmt_vec_t &arguments) +{ + const FunctionLibrary::const_iterator iter = m_functions_library.find(fn_name.data()); + + if (iter == m_functions_library.end()) + { + std::string msg; + msg = std::string{fn_name} + " " + " function not found"; + throw base_s3select_exception(msg, base_s3select_exception::s3select_exp_en_t::FATAL); + } + + switch (iter->second) + { + case s3select_func_En_t::ADD: + return S3SELECT_NEW(this,_fn_add); + break; + + case s3select_func_En_t::SUM: + return S3SELECT_NEW(this,_fn_sum); + break; + + case s3select_func_En_t::COUNT: + return S3SELECT_NEW(this,_fn_count); + break; + + case s3select_func_En_t::MIN: + return S3SELECT_NEW(this,_fn_min); + break; + + case s3select_func_En_t::MAX: + return S3SELECT_NEW(this,_fn_max); + break; + + case s3select_func_En_t::TO_INT: + return S3SELECT_NEW(this,_fn_to_int); + break; + + case s3select_func_En_t::TO_FLOAT: + return S3SELECT_NEW(this,_fn_to_float); + break; + + case s3select_func_En_t::SUBSTR: + return S3SELECT_NEW(this,_fn_substr); + break; + + case s3select_func_En_t::TO_TIMESTAMP: + return S3SELECT_NEW(this,_fn_to_timestamp); + break; + + case s3select_func_En_t::TO_STRING_CONSTANT: + return S3SELECT_NEW(this,_fn_to_string_constant); + break; + + case s3select_func_En_t::TO_STRING_DYNAMIC: + return S3SELECT_NEW(this,_fn_to_string_dynamic); + break; + + case s3select_func_En_t::TO_BOOL: + return S3SELECT_NEW(this,_fn_to_bool); + break; + + case s3select_func_En_t::EXTRACT_YEAR: + return S3SELECT_NEW(this,_fn_extract_year_from_timestamp); + break; + + case s3select_func_En_t::EXTRACT_MONTH: + return S3SELECT_NEW(this,_fn_extract_month_from_timestamp); + break; + + case s3select_func_En_t::EXTRACT_DAY: + return 
S3SELECT_NEW(this,_fn_extract_day_from_timestamp); + break; + + case s3select_func_En_t::EXTRACT_HOUR: + return S3SELECT_NEW(this,_fn_extract_hour_from_timestamp); + break; + + case s3select_func_En_t::EXTRACT_MINUTE: + return S3SELECT_NEW(this,_fn_extract_minute_from_timestamp); + break; + + case s3select_func_En_t::EXTRACT_SECOND: + return S3SELECT_NEW(this,_fn_extract_second_from_timestamp); + break; + + case s3select_func_En_t::EXTRACT_WEEK: + return S3SELECT_NEW(this,_fn_extract_week_from_timestamp); + break; + + case s3select_func_En_t::EXTRACT_TIMEZONE_HOUR: + return S3SELECT_NEW(this,_fn_extract_tz_hour_from_timestamp); + break; + + case s3select_func_En_t::EXTRACT_TIMEZONE_MINUTE: + return S3SELECT_NEW(this,_fn_extract_tz_minute_from_timestamp); + break; + + case s3select_func_En_t::DATE_ADD_YEAR: + return S3SELECT_NEW(this,_fn_add_year_to_timestamp); + break; + + case s3select_func_En_t::DATE_ADD_MONTH: + return S3SELECT_NEW(this,_fn_add_month_to_timestamp); + break; + + case s3select_func_En_t::DATE_ADD_DAY: + return S3SELECT_NEW(this,_fn_add_day_to_timestamp); + break; + + case s3select_func_En_t::DATE_ADD_HOUR: + return S3SELECT_NEW(this,_fn_add_hour_to_timestamp); + break; + + case s3select_func_En_t::DATE_ADD_MINUTE: + return S3SELECT_NEW(this,_fn_add_minute_to_timestamp); + break; + + case s3select_func_En_t::DATE_ADD_SECOND: + return S3SELECT_NEW(this,_fn_add_second_to_timestamp); + break; + + case s3select_func_En_t::DATE_DIFF_YEAR: + return S3SELECT_NEW(this,_fn_diff_year_timestamp); + break; + + case s3select_func_En_t::DATE_DIFF_MONTH: + return S3SELECT_NEW(this,_fn_diff_month_timestamp); + break; + + case s3select_func_En_t::DATE_DIFF_DAY: + return S3SELECT_NEW(this,_fn_diff_day_timestamp); + break; + + case s3select_func_En_t::DATE_DIFF_HOUR: + return S3SELECT_NEW(this,_fn_diff_hour_timestamp); + break; + + case s3select_func_En_t::DATE_DIFF_MINUTE: + return S3SELECT_NEW(this,_fn_diff_minute_timestamp); + break; + + case 
s3select_func_En_t::DATE_DIFF_SECOND: + return S3SELECT_NEW(this,_fn_diff_second_timestamp); + break; + + case s3select_func_En_t::UTCNOW: + return S3SELECT_NEW(this,_fn_utcnow); + break; + + case s3select_func_En_t::AVG: + return S3SELECT_NEW(this,_fn_avg); + break; + + case s3select_func_En_t::LOWER: + return S3SELECT_NEW(this,_fn_lower); + break; + + case s3select_func_En_t::UPPER: + return S3SELECT_NEW(this,_fn_upper); + break; + + case s3select_func_En_t::LENGTH: + return S3SELECT_NEW(this,_fn_charlength); + break; + + case s3select_func_En_t::BETWEEN: + return S3SELECT_NEW(this,_fn_between); + break; + + case s3select_func_En_t::NOT_BETWEEN: + return S3SELECT_NEW(this,_fn_not_between); + break; + + case s3select_func_En_t::IS_NULL: + return S3SELECT_NEW(this,_fn_isnull); + break; + + case s3select_func_En_t::IS_NOT_NULL: + return S3SELECT_NEW(this,_fn_is_not_null); + break; + + case s3select_func_En_t::IN: + return S3SELECT_NEW(this,_fn_in); + break; + + case s3select_func_En_t::VERSION: + return S3SELECT_NEW(this,_fn_version); + break; + + case s3select_func_En_t::NULLIF: + return S3SELECT_NEW(this,_fn_nullif); + break; + + case s3select_func_En_t::LIKE: + return S3SELECT_NEW(this,_fn_like,arguments[0],arguments[1]); + break; + + case s3select_func_En_t::COALESCE: + return S3SELECT_NEW(this,_fn_coalesce); + break; + + case s3select_func_En_t::WHEN_THEN: + return S3SELECT_NEW(this,_fn_when_then); + break; + + case s3select_func_En_t::WHEN_VALUE_THEN: + return S3SELECT_NEW(this,_fn_when_value_then); + break; + + case s3select_func_En_t::CASE_WHEN_ELSE: + return S3SELECT_NEW(this,_fn_case_when_else); + break; + + case s3select_func_En_t::STRING: + return S3SELECT_NEW(this,_fn_string); + break; + + case s3select_func_En_t::TRIM: + return S3SELECT_NEW(this,_fn_trim); + break; + + case s3select_func_En_t::LEADING: + return S3SELECT_NEW(this,_fn_leading); + break; + + case s3select_func_En_t::TRAILING: + return S3SELECT_NEW(this,_fn_trailing); + break; + + case 
s3select_func_En_t::DECIMAL_OPERATOR: + return S3SELECT_NEW(this,_fn_decimal_operator); + break; + + case s3select_func_En_t::CAST_TO_DECIMAL: + return S3SELECT_NEW(this,_fn_cast_to_decimal); + break; + + case s3select_func_En_t::ENGINE_VERSION: + return S3SELECT_NEW(this,_fn_engine_version); + break; + + default: + throw base_s3select_exception("internal error while resolving function-name"); + break; + } +} + +bool base_statement::is_function() const +{ + if (dynamic_cast<__function*>(const_cast<base_statement*>(this))) + { + return true; + } + else + { + return false; + } +} + +const base_statement* base_statement::get_aggregate() const +{ + //search for aggregation function in AST + const base_statement* res = 0; + + if (is_aggregate()) + { + return this; + } + + if (left() && (res=left()->get_aggregate())!=0) + { + return res; + } + + if (right() && (res=right()->get_aggregate())!=0) + { + return res; + } + + if (is_function()) + { + for (auto i : dynamic_cast<__function*>(const_cast<base_statement*>(this))->get_arguments()) + { + const base_statement* b=i->get_aggregate(); + if (b) + { + return b; + } + } + } + return 0; +} + +bool base_statement::is_column_reference() const +{ + if(is_column()) + return true; + + if(left()) + return left()->is_column_reference(); + + if(right()) + return right()->is_column_reference(); + + if(is_function()) + { + for(auto a : dynamic_cast<__function*>(const_cast<base_statement*>(this))->get_arguments()) + { + if(a->is_column_reference()) + return true; + } + } + + return false; +} + +bool base_statement::is_nested_aggregate(bool &aggr_flow) const +{ + if (is_aggregate()) + { + aggr_flow=true; + for (auto& i : dynamic_cast<__function*>(const_cast<base_statement*>(this))->get_arguments()) + { + if (i->get_aggregate() != nullptr) + { + return true; + } + } + } + + if(left() && left()->is_nested_aggregate(aggr_flow)) + return true; + + if(right() && right()->is_nested_aggregate(aggr_flow)) + return true; + + if (is_function()) + 
{ + for (auto& i : dynamic_cast<__function*>(const_cast<base_statement*>(this))->get_arguments()) + { + if (i->get_aggregate() != nullptr) + { + return i->is_nested_aggregate(aggr_flow); + } + } + } + + return false; +} + +bool base_statement::is_statement_contain_star_operation() const +{ + if(is_star_operation()) + return true; + + if(left()) + return left()->is_statement_contain_star_operation(); + + if(right()) + return right()->is_statement_contain_star_operation(); + + if(is_function()) + { + for(auto a : dynamic_cast<__function*>(const_cast<base_statement*>(this))->get_arguments()) + { + if(a->is_star_operation()) + return true; + } + } + + return false; +} + +bool base_statement::mark_aggreagtion_subtree_to_execute() +{//purpase:: set aggregation subtree as runnable. + //the function search for aggregation function, and mark its subtree {skip = false} + if (is_aggregate()) + set_skip_non_aggregate(false); + + if (left()) + left()->mark_aggreagtion_subtree_to_execute(); + + if(right()) + right()->mark_aggreagtion_subtree_to_execute(); + + if (is_function()) + { + for (auto& i : dynamic_cast<__function*>(this)->get_arguments()) + { + i->mark_aggreagtion_subtree_to_execute(); + } + } + + return true; +} + +void base_statement::push_for_cleanup(std::set<base_statement*>& ast_nodes_to_delete)//semantic loop on each projection +{ +//placement new is releasing the main-buffer in which all AST nodes +//allocating from it. meaning no calls to destructors. +//the purpose of this routine is to traverse the AST in map all nodes for cleanup. +//the cleanup method will trigger all destructors. 
#ifdef _ARROW_EXIST
// Purpose: collect into `cols` the id of every column the statement refers to.
// A column may be referenced by name (schema column or alias), by position,
// or through the star operation (all columns in [0, max_columns)).
// Throws a FATAL base_s3select_exception for an unknown column name or for a
// position that is not below max_columns.
void base_statement::extract_columns(parquet_file_parser::column_pos_t &cols,const uint16_t max_columns)
{
  if (is_column())
  {//column reference or column position
    variable* var = dynamic_cast<variable*>(this);

    if (dynamic_cast<variable*>(this)->m_var_type == variable::var_t::VARIABLE_NAME)
    {//referenced by name
      if (var->getScratchArea()->get_column_pos(var->get_name().c_str()) >= 0)
      {//the name is part of the schema
        cols.insert(var->getScratchArea()->get_column_pos(var->get_name().c_str()));
      }
      else if (var->getAlias()->search_alias(var->get_name()))
      {//the name is an alias --> extract the columns the alias expression uses
        //TODO cyclic alias to resolve
        var->getAlias()->search_alias(var->get_name())->extract_columns(cols, max_columns);
      }
      else
      {//neither a schema column nor an alias
        std::stringstream msg;
        msg << "column " + var->get_name() + " is not part of schema nor an alias";
        throw base_s3select_exception(msg.str(), base_s3select_exception::s3select_exp_en_t::FATAL);
      }
    }
    else if (var->m_var_type == variable::var_t::STAR_OPERATION)
    {//star: every column is required
      for (uint16_t col = 0; col < max_columns; col++)
      {
        cols.insert(col);
      }
    }
    else
    {//referenced by position
      if (var->get_column_pos() >= max_columns)
      {
        std::stringstream msg;
        msg << "column " + std::to_string(var->get_column_pos() + 1) + " exceed max number of columns";
        throw base_s3select_exception(msg.str(), base_s3select_exception::s3select_exp_en_t::FATAL);
      }
      cols.insert(var->get_column_pos());
    }
  }
  else if (is_function())
  {//descend into each function argument
    __function* func = (dynamic_cast<__function*>(this));
    bs_stmt_vec_t arguments = func->get_arguments();
    for (auto argument : arguments)
    {
      argument->extract_columns(cols, max_columns);
    }
  }

  //keep traversing down the AST
  if (left())
  {
    left()->extract_columns(cols, max_columns);
  }

  if (right())
  {
    right()->extract_columns(cols, max_columns);
  }
}
#endif //_ARROW_EXIST
+ {//traverse function args + prm->extract_columns(cols,max_columns); + } + + } + + //keep traversing down the AST + if(left()) + left()->extract_columns(cols,max_columns); + + if(right()) + right()->extract_columns(cols,max_columns); +} +#endif //_ARROW_EXIST + +} //namespace s3selectEngine + +#endif diff --git a/src/s3select/include/s3select_json_parser.h b/src/s3select/include/s3select_json_parser.h new file mode 100644 index 000000000..aa06163f5 --- /dev/null +++ b/src/s3select/include/s3select_json_parser.h @@ -0,0 +1,829 @@ +#ifndef S3SELECT_JSON_PARSER_H +#define S3SELECT_JSON_PARSER_H + +//TODO add __FILE__ __LINE__ message +#define RAPIDJSON_ASSERT(x) s3select_json_parse_error(x) +bool s3select_json_parse_error(bool b); +bool s3select_json_parse_error(const char* error); + +#include "rapidjson/reader.h" +#include "rapidjson/writer.h" +#include "rapidjson/filereadstream.h" +#include "rapidjson/filewritestream.h" +#include "rapidjson/error/en.h" +#include "rapidjson/document.h" +#include <cassert> +#include <sstream> +#include <vector> +#include <iostream> +#include <functional> +#include <boost/spirit/include/classic_core.hpp> +#include <boost/algorithm/string/predicate.hpp> +#include "s3select_oper.h"//class value +#include <boost/algorithm/string/predicate.hpp> + +#define JSON_PROCESSING_LIMIT_REACHED 2 + +//TODO missing s3selectEngine namespace + +bool s3select_json_parse_error(bool b) +{ + if(!b) + { + const char* error_str = "failure while processing JSON document"; + throw s3selectEngine::base_s3select_exception(error_str, s3selectEngine::base_s3select_exception::s3select_exp_en_t::FATAL); + } + return false; +} + +bool s3select_json_parse_error(const char* error) +{ + if(!error) + { + const char* error_str = "failure while processing JSON document"; + throw s3selectEngine::base_s3select_exception(error_str, s3selectEngine::base_s3select_exception::s3select_exp_en_t::FATAL); + } + return false; +} + +static auto iequal_predicate = [](std::string& it1, 
std::string& it2) + { + return boost::iequals(it1,it2); + }; + + +class ChunksStreamer : public rapidjson::MemoryStream { + + //purpose: adding a method `resetBuffer` that enables to parse chunk after chunk + //per each new chunk it reset internal data members + public: + + std::string internal_buffer; + const Ch* next_src_; + size_t next_size_; + + ChunksStreamer():rapidjson::MemoryStream(0,0){next_src_=0;next_size_=0;} + + ChunksStreamer(const Ch *src, size_t size) : rapidjson::MemoryStream(src,size){next_src_=0;next_size_=0;} + + //override Peek methode + Ch Peek() //const + { + if(RAPIDJSON_UNLIKELY(src_ == end_)) + { + if(next_src_)//next chunk exist + {//upon reaching to end of current buffer, to switch with next one + src_ = next_src_; + begin_ = src_; + size_ =next_size_; + end_ = src_ + size_; + + next_src_ = 0; + next_size_ = 0; + return *src_; + } + else return 0; + } + return *src_; + } + + //override Take method + Ch Take() + { + if(RAPIDJSON_UNLIKELY(src_ == end_)) + { + if(next_src_)//next chunk exist + {//upon reaching to end of current buffer, to switch with next one + src_ = next_src_; + begin_ = src_; + size_ = next_size_; + end_ = src_ + size_; + + next_src_ = 0; + next_size_ = 0; + return *src_; + } + else return 0; + } + return *src_++; + } + + void resetBuffer(char* buff, size_t size) + { + if(!src_) + {//first time calling + begin_ = buff; + src_ = buff; + size_ = size; + end_= src_ + size_; + return; + } + + if(!next_src_) + {//save the next-chunk that will be used upon parser reaches end of current buffer + next_src_ = buff; + next_size_ = size; + } + else + {// should not happen + std::cout << "can not replace pointers!!!" << std::endl;//TODO exception + return; + } + } + + void saveRemainingBytes() + {//this routine called per each new chunk + //save the remaining bytes, before its overriden by the next-chunk. 
// Kind of JSON container that opened the current row (NA: no row is open).
enum class row_state
{
  NA,
  OBJECT_START_ROW,
  ARRAY_START_ROW
};

class json_variable_access {
//purpose: a state-machine for a single JSON variable.
//the syntax-parser creates one object per variable it accepts (projection / where-clause).
//the object receives events (key, start-array, ... etc) while the JSON reader scans the input;
//these events advance the states until the last one is reached, at which point the value
//is handed to the exact-match callback.

private:

// the following pointers are set by init(); they refer to objects owned by the JSON reader.
std::vector<std::string>* from_clause;
std::vector<std::string>* key_path;
//m_current_depth : trace the depth of the reader, including "anonymous"(meaning JSON may begin with array that has no name attached to it)
int* m_current_depth;
//m_current_depth_non_anonymous : trace the depth of the reader, NOT including "anonymous" array/object.
//upon user request the following _1.a[12].b, the key-name{a} may reside on some array with no-name,
//the state machine that search for a[12].b, does NOT contain states for that "anonymous" array,
//thus, the state-machine will fail to trace the user request for that specific key.path
int* m_current_depth_non_anonymous;
std::function <int(s3selectEngine::value&,int)>* m_exact_match_cb;
// a state number : (_1).a.b.c[ 17 ].d.e (a.b)=1 (c[)=2 (17)=3 (.d.e)=4
size_t current_state;//contain the current state of the state machine for searching-expression (each JSON variable in SQL statement has a searching expression)
int nested_array_level;//in the case of array within array it contain the nesting level
int m_json_index;//index of this variable, passed as second argument to the exact-match callback
s3selectEngine::value v_null;//null value pushed when the state machine runs past its last state
size_t m_from_clause_size;//cached from_clause->size(), set by init()

struct variable_state_md {
    std::vector<std::string> required_path;//set by the syntax-parser. in the case of array its empty
    int required_array_entry_no;//set by the syntax-parser, in the case of object-key its -1.
    int actual_array_entry_no;//upon scanning the JSON input, this value increased by 1 each new element
    int required_depth_size;// depth of state, is aggregated (include the previous). it's the summary of key-elements and array-operator's.
    int required_key_depth_size;// same as the above, not including the array-operators.
    int last_array_start;//it actually mark the nested-array-level (array within array)
};

std::vector<struct variable_state_md> variable_states;//vector is populated upon syntax phase.

public:

// note: current_state is a size_t initialised with -1 (i.e. the maximal value) until init() sets it to 0.
json_variable_access():from_clause(nullptr),key_path(nullptr),m_current_depth(nullptr),m_current_depth_non_anonymous(nullptr),m_exact_match_cb(nullptr),current_state(-1),nested_array_level(0),m_json_index(-1),v_null(nullptr),m_from_clause_size(0)
{}

//this routine should be called before scanning the JSON input.
//it stores the reader's pointers (no copies are made) and resets the state machine to state 0.
void init(
	  std::vector<std::string>* reader_from_clause,
	  std::vector<std::string>* reader_key_path,
	  int* reader_current_depth,
	  int* reader_m_current_depth_non_anonymous,
	  std::function <int(s3selectEngine::value&,int)>* excat_match_cb,
	  int json_index)
{
  from_clause = reader_from_clause;
  key_path = reader_key_path;
  m_exact_match_cb = excat_match_cb;
  //m_current_depth and m_current_depth_non_anonymous points to the JSON reader variables.
  m_current_depth = reader_current_depth;
  m_current_depth_non_anonymous = reader_m_current_depth_non_anonymous;
  current_state = 0;
  m_json_index = json_index;
  m_from_clause_size = from_clause->size();

  //loop on variable_states compute required_depth_size
}

//removes all states.
void clear()
{
  variable_states.clear();
}

//prints the reader depth and the current state to std::cout (debugging aid).
void debug_info()
{
  auto f = [](std::vector<std::string> x){std::string res;for(auto i : x){res.append(i);res.append(".");};return res;};

  std::cout << "m_current_depth=" << *m_current_depth << " required_depth_size= " << reader_position_state().required_depth_size << " ";
  std::cout << "variable_states[ current_state ].last_array_start=" << reader_position_state().last_array_start;
  std::cout << " current_state=" << current_state << " key_path=" << f(*key_path) << std::endl;
}
//DBG is defined as a tracing statement and immediately redefined as empty: tracing is disabled.
#define DBG {std::cout << "event=" <<  __FUNCTION__ << std::endl; debug_info();}
#undef DBG
#define DBG

//recomputes the aggregated depth fields of every state.
//a key state adds required_path.size() to the depth; an array state adds 1.
//required_key_depth_size of a key state is the number of key elements of the states preceding it.
void compile_state_machine()
{
  size_t aggregated_required_depth_size = 0;
  size_t aggregated_required_key_depth_size = 0;
  for(auto& v : variable_states)
  {
    if(v.required_path.size())
    {
      v.required_depth_size = aggregated_required_depth_size + v.required_path.size();//depth size in general, including array
      v.required_key_depth_size = aggregated_required_key_depth_size;//depth include ONLY key parts
      aggregated_required_key_depth_size += v.required_path.size();
    }
    else
    {
      v.required_depth_size = aggregated_required_depth_size + 1;
    }
    aggregated_required_depth_size = v.required_depth_size;
  }
}

//appends a new state (a key path, or an array entry number) and recompiles the state machine.
void push_variable_state(std::vector<std::string>& required_path,int required_array_entry_no)
{
  struct variable_state_md new_state={required_path,required_array_entry_no,-1,0,0,-1};
  variable_states.push_back(new_state);
  //TODO required_path.size() > 0 or required_path,required_array_entry_no>=0 : not both
  compile_state_machine();
}

//returns the state the machine is currently at.
//side effect: when current_state is past the last state, a null value is pushed through the
//exact-match callback and the machine goes back one state before the lookup.
//NOTE(review): with an empty variable_states the index below is out of range - confirm callers
//never reach this with no states.
struct variable_state_md& reader_position_state()
{
  if (current_state>=variable_states.size())
  {//in case the state-machine reached a "dead-end", should push a null for that JSON variable
    //going back one state.
    (*m_exact_match_cb)(v_null,m_json_index);
    decrease_current_state();
  }

  return variable_states[ current_state ];
}

//true when the current state describes an array entry (required_array_entry_no >= 0).
bool is_array_state()
{
  return (reader_position_state().required_array_entry_no >= 0);
}

bool is_reader_located_on_required_depth()
{
  //upon user request `select _1.a.b from s3object[*].c.d;` the c.d should "cut off" from m_current_depth_non_anonymous
  //to get the correct depth of the state-machine
  return ((*m_current_depth_non_anonymous - static_cast<int>(m_from_clause_size)) == reader_position_state().required_depth_size);
}

//true when every state has been matched (current_state is one past the last state).
bool is_on_final_state()
{
  return (current_state == (variable_states.size()));
	  //&& *m_current_depth == variable_states[ current_state -1 ].required_depth_size);

	  // NOTE: by ignoring the current-depth, the matcher gives precedence to key-path match, while not ignoring accessing using array
	  // meaning, upon requesting a.b[12] , the [12] is not ignored, the a<-->b distance should be calculated as key distance, i.e. not counting array/object with *no keys*.
	  // user may request 'select _1.phonearray.num'; the reader will traverse `num` exist in `phonearray`
}

bool is_reader_passed_required_array_entry()
{
  return (reader_position_state().actual_array_entry_no > reader_position_state().required_array_entry_no);
}

bool is_reader_located_on_array_according_to_current_state()
{
  return (nested_array_level == reader_position_state().last_array_start);
}

bool is_reader_position_depth_lower_than_required()
{
  //upon user request `select _1.a.b from s3object[*].c.d;` the c.d should "cut off" from m_current_depth_non_anonymous
  //to have the correct depth of the state-machine
  return ((*m_current_depth_non_anonymous - static_cast<int>(m_from_clause_size)) < reader_position_state().required_depth_size);
}

bool is_reader_located_on_array_entry_according_to_current_state()
{
  return (reader_position_state().actual_array_entry_no == reader_position_state().required_array_entry_no);
}

//advances one state; never goes beyond variable_states.size() (the final state).
void increase_current_state()
{
  DBG

  if(current_state >= variable_states.size()) return;
  current_state ++;
}

//goes back one state; never goes below state 0.
void decrease_current_state()
{
  DBG

  if(current_state == 0) return;
  current_state --;
}

//event: the reader pushed a key onto key_path.
//when the current state is a key state, the tail of key_path (after the from-clause part and
//the keys matched by previous states) is compared, case-insensitively, with the required path.
void key()
{
  DBG

  if(reader_position_state().required_path.size())//current state is a key
  {
    std::vector<std::string>* filter = &reader_position_state().required_path;
    auto required_key_depth_size = reader_position_state().required_key_depth_size;
    if(std::equal((*key_path).begin()+(*from_clause).size() + required_key_depth_size, //key-path-start-point + from-clause-depth-size + key-depth
		  (*key_path).end(),
		  (*filter).begin(),
		  (*filter).end(), iequal_predicate))
    {
      increase_current_state();//key match according to user request, advancing to the next state
    }
  }
}

//counts one more element of the array the current (array) state refers to.
void increase_array_index()
{
  if(is_reader_located_on_required_depth() && is_array_state())//TODO && is_array_state(). is it necessary? what about nesting level
  {
    DBG
    reader_position_state().actual_array_entry_no++;
  }
}

//event: the reader finished a key/element.
void dec_key()
{
  DBG

  if(is_reader_position_depth_lower_than_required())
  {//actual key-path is shorter than required
    decrease_current_state();
    return;
  }

  if(is_reader_located_on_required_depth() && is_array_state())//TODO && is_array_state(). is it necessary?; json_element_state.back() != ARRAY_STATE)
  {//key-path-depth matches, and it an array
    if(is_reader_located_on_array_entry_according_to_current_state())
    {//we reached the required array entry
      increase_current_state();
    }
    else if(is_reader_passed_required_array_entry())
    {//had passed the array entry
      decrease_current_state();
    }
  }
}

//event: the reader produced a scalar value.
void new_value(s3selectEngine::value& v)
{
  DBG

  if(is_on_final_state())
  {
    (*m_exact_match_cb)(v, m_json_index);
    decrease_current_state();//the state-machine reached its final destination, "going back" one state, upon another match condition the matched value will override the last one
  }
  increase_array_index();//next-value in array
}

//event: the reader closed an object (an object may be an array element).
void end_object()
{
  increase_array_index();
}

//event: the reader closed an array.
void end_array()
{
  //init the correct array index
  DBG

  if(is_reader_located_on_array_according_to_current_state())
  {//it reached end of required array
    reader_position_state().actual_array_entry_no = 0;
    decrease_current_state();
  }
  nested_array_level --;

  // option 1. move out of one array, and enter a new one; option-2. enter an object
  increase_array_index();//increase only upon correct array //TODO move it into dec_key()?
  dec_key();
}

//event: the reader opened an array.
void start_array()
{
  DBG

  nested_array_level++;
  if(is_reader_located_on_required_depth())
  {//reader entered an array required by JSON variable
    reader_position_state().actual_array_entry_no = 0;
    reader_position_state().last_array_start = nested_array_level;

    if(is_reader_located_on_array_entry_according_to_current_state())
    {//we reached the required array entry -> next state
      increase_current_state();
    }
  }
}

}; //json_variable_access
+ dec_key(); +} + +void start_array() +{ + DBG + + nested_array_level++; + if(is_reader_located_on_required_depth()) + {//reader entered an array required by JSON variable + reader_position_state().actual_array_entry_no = 0; + reader_position_state().last_array_start = nested_array_level; + + if(is_reader_located_on_array_entry_according_to_current_state()) + {//we reached the required array entry -> next state + increase_current_state(); + } + } +} + +}; //json_variable_access + +class json_variables_operations { + + public: + + std::vector<std::pair<json_variable_access*,size_t>> json_statement_variables{}; + + void init(std::vector<std::pair<json_variable_access*,size_t>>& jsv, //TODO init upon construction? + std::vector <std::string>* from_clause, + std::vector<std::string>* key_path, + int* current_depth, + int* current_depth_non_anonymous, + std::function <int(s3selectEngine::value&,int)>* exact_match_cb) + { + json_statement_variables = jsv; + int i=0;//the index per JSON variable + for(auto& var : json_statement_variables) + { + var.first->init(from_clause, + key_path, + current_depth, + current_depth_non_anonymous, + exact_match_cb,i++); + } + } + + void start_array() + { + for(auto& j : json_statement_variables) + { + j.first->start_array(); + } + } + void end_array() + { + for(auto& j : json_statement_variables) + { + j.first->end_array(); + } + } + void dec_key() + { + for(auto& j : json_statement_variables) + { + j.first->dec_key(); + } + } + void end_object() + { + for(auto& j : json_statement_variables) + { + j.first->end_object(); + } + } + void key() + { + for(auto& j : json_statement_variables) + { + j.first->key(); + } + } + void new_value(s3selectEngine::value& v) + { + for(auto& j : json_statement_variables) + { + j.first->new_value(v); + } + } +};//json_variables_operations + +class JsonParserHandler : public rapidjson::BaseReaderHandler<rapidjson::UTF8<>, JsonParserHandler> { + + public: + + typedef enum {OBJECT_STATE,ARRAY_STATE} 
class JsonParserHandler : public rapidjson::BaseReaderHandler<rapidjson::UTF8<>, JsonParserHandler> {
  //purpose: SAX handler for the rapidjson reader.
  //it tracks the key path and the nesting depth while the input is scanned, detects the rows
  //selected by the from-clause, forwards the events to the JSON variables of the statement,
  //and invokes the s3select processing callback per row.

  public:

    typedef enum {OBJECT_STATE,ARRAY_STATE} en_json_elm_state_t;
    typedef std::pair<std::vector<std::string>, s3selectEngine::value> json_key_value_t;

    row_state state = row_state::NA;
    std::function <int(s3selectEngine::value&,int)> m_exact_match_cb;
    std::function <int(s3selectEngine::scratch_area::json_key_value_t&)> m_star_operation_cb;

    json_variables_operations variable_match_operations;
    int row_count{};
    std::vector <std::string> from_clause{};
    bool prefix_match{};//true while the reader is located under the from-clause path
    s3selectEngine::value var_value;//holds the last scalar reported by the reader
    ChunksStreamer stream_buffer;
    bool init_buffer_stream;//true once reader.IterativeParseInit() has been called
    rapidjson::Reader reader;
    std::vector<en_json_elm_state_t> json_element_state;//stack of the currently open containers
    std::vector<std::string> key_path;//stack of the keys leading to the reader position
    std::function<int(void)> m_s3select_processing;
    int m_start_row_depth;//value of m_current_depth when the current row started
    int m_current_depth;//number of currently open containers
    int m_current_depth_non_anonymous;//depth counter that is advanced only when a key exists (see StartObject/StartArray/Key)
    bool m_star_operation;
    int m_sql_processing_status;//last value returned by m_s3select_processing
    bool m_fatal_initialization_ind = false;
    std::string m_fatal_initialization_description;

    JsonParserHandler() : prefix_match(false),init_buffer_stream(false),m_start_row_depth(-1),m_current_depth(0),m_current_depth_non_anonymous(0),m_star_operation(false),m_sql_processing_status(0)
    {
    }

    //returns key_path as a string in which every key is followed by '/'.
    std::string get_key_path()
    {//for debug
      std::string res;
      for(const auto & i: key_path)
      {
	res.append(i);
	res.append(std::string("/"));
      }
      return res;
    }

    //called when a value (scalar, object or array) has ended.
    //pops the last key unless the innermost open container is an array, notifies the JSON
    //variables, and - for an array row - runs the s3select processing per array element.
    void dec_key_path()
    {
      if (json_element_state.size())  {
	if(json_element_state.back() != ARRAY_STATE)  {
	  if(key_path.size() != 0) {
	    key_path.pop_back();
	  }
	}
      }

      variable_match_operations.dec_key();

      //TODO m_current_depth-- should done here
      if(m_start_row_depth > m_current_depth)
      {
	prefix_match = false;
      } else
      if (prefix_match) {
	if (state == row_state::ARRAY_START_ROW && m_start_row_depth == m_current_depth) {
	  m_sql_processing_status = m_s3select_processing(); //per each element in array
	  ++row_count;
	}
      }
    }

    //handles a scalar value: while under the from-clause prefix it is pushed to the star-operation
    //callback (when star operation is enabled) and to the JSON variables; then the key is released.
    void push_new_key_value(s3selectEngine::value& v)
    {
      if (m_star_operation && prefix_match)
      {
	json_key_value_t key_value(key_path,v);
	m_star_operation_cb(key_value);
      }
      if (prefix_match)
	variable_match_operations.new_value(v);

      dec_key_path();
    }

    //--- rapidjson scalar events: store the scalar in var_value and push it; always return true ---

    bool Null() {
      var_value.setnull();
      push_new_key_value(var_value);
      return true; }

    bool Bool(bool b) {
      var_value = b;
      push_new_key_value(var_value);
      return true; }

    bool Int(int i) {
      var_value = i;
      push_new_key_value(var_value);
      return true; }

    bool Uint(unsigned u) {
      var_value = u;
      push_new_key_value(var_value);
      return true; }

    bool Int64(int64_t i) {
      var_value = i;
      push_new_key_value(var_value);
      return true; }

    bool Uint64(uint64_t u) {
      var_value = u;
      push_new_key_value(var_value);
      return true; }

    bool Double(double d) {
      var_value = d;
      push_new_key_value(var_value);
      return true; }

    //note: `length` and `copy` are not used; the string is taken up to its terminating NUL.
    bool String(const char* str, rapidjson::SizeType length, bool copy) {
      //TODO use copy
      var_value = str;
      push_new_key_value(var_value);
      return true;
    }

    //rapidjson key event: pushes the key onto key_path, turns prefix_match on when key_path
    //equals the from-clause (or the from-clause is empty), and notifies the JSON variables.
    //note: `length` and `copy` are not used; the key is taken up to its terminating NUL.
    bool Key(const char* str, rapidjson::SizeType length, bool copy) {
      key_path.push_back(std::string(str));

      if(!m_current_depth_non_anonymous){
	//important: upon a key and m_current_depth_non_anonymous is ZERO
	//it should advance by 1. to get the correct current depth(for non anonymous counter).
	m_current_depth_non_anonymous++;
      }

      if(from_clause.size() == 0 || std::equal(key_path.begin(), key_path.end(), from_clause.begin(), from_clause.end(), iequal_predicate)) {
	prefix_match = true;
      }

      variable_match_operations.key();

      return true;
    }

    //true when a row (object or array) is currently open.
    bool is_already_row_started()
    {
      if(state == row_state::OBJECT_START_ROW || state == row_state::ARRAY_START_ROW)
	return true;
      else
	return false;
    }

    //rapidjson event: an object was opened. Starts a new row when under the from-clause prefix
    //and no row is open yet.
    bool StartObject() {
      json_element_state.push_back(OBJECT_STATE);
      m_current_depth++;
      if(key_path.size()){
	//advancing the counter only upon there is a key.
	m_current_depth_non_anonymous++;
      }

      if (prefix_match && !is_already_row_started()) {
	state = row_state::OBJECT_START_ROW;
	m_start_row_depth = m_current_depth;
	++row_count;
      }

      return true;
    }

    //rapidjson event: an object was closed. When this closes an object row, the s3select
    //processing callback is invoked and the row state is reset.
    bool EndObject(rapidjson::SizeType memberCount) {
      json_element_state.pop_back();
      m_current_depth --;
      m_current_depth_non_anonymous --;

      variable_match_operations.end_object();

      dec_key_path();
      if (state == row_state::OBJECT_START_ROW && (m_start_row_depth > m_current_depth)) {
	m_sql_processing_status = m_s3select_processing();
	state = row_state::NA;
      }
      return true;
    }

    //rapidjson event: an array was opened. Starts an array row when under the from-clause prefix
    //and no row is open yet.
    bool StartArray() {
      json_element_state.push_back(ARRAY_STATE);
      m_current_depth++;
      if(key_path.size()){
	//advancing the counter only upon there is a key.
	m_current_depth_non_anonymous++;
      }

      if (prefix_match && !is_already_row_started()) {
	state = row_state::ARRAY_START_ROW;
	m_start_row_depth = m_current_depth;
      }

      variable_match_operations.start_array();

      return true;
    }

    //rapidjson event: an array was closed. Resets the row state when this closes an array row.
    bool EndArray(rapidjson::SizeType elementCount) {
      json_element_state.pop_back();
      m_current_depth--;
      m_current_depth_non_anonymous--;

      dec_key_path();

      if (state == row_state::ARRAY_START_ROW && (m_start_row_depth > m_current_depth)) {
	state = row_state::NA;
      }

      variable_match_operations.end_array();

      return true;
    }

    void set_prefix_match(std::vector<std::string>& requested_prefix_match)
    {//purpose: set the filter according to SQL statement(from clause)
      from_clause = requested_prefix_match;
      if(from_clause.size() ==0)
      {//an empty from-clause matches from the beginning of the document
	prefix_match = true;
	m_start_row_depth = m_current_depth;
      }
    }

    void set_statement_json_variables(std::vector<std::pair<json_variable_access*,size_t>>& statement_variables)
    {//purpose: set the json variables extracted from the SQL statement(projection columns, predicates columns)
      //the variables keep pointers to members of this handler.
      variable_match_operations.init(
				    statement_variables,
				    &from_clause,
				    &key_path,
				    &m_current_depth,
				    &m_current_depth_non_anonymous,
				    &m_exact_match_cb);
    }

    void set_exact_match_callback(std::function<int(s3selectEngine::value&, int)> f)
    {//purpose: upon key is matching one of the exact filters, the callback is called.
      m_exact_match_cb = f;
    }

    void set_s3select_processing_callback(std::function<int(void)>& f)
    {//purpose: execute s3select statement on matching row (according to filters)
      m_s3select_processing = f;
    }

    void set_push_per_star_operation_callback( std::function <int(s3selectEngine::scratch_area::json_key_value_t&)> cb)
    {
      m_star_operation_cb = cb;
    }

    void set_star_operation()
    {
      m_star_operation = true;
    }

    bool is_fatal_initialization()
    {
      return m_fatal_initialization_ind;
    }

    //parses the next chunk of the JSON document.
    //returns 0 when the chunk was consumed (or the parse completed) and
    //JSON_PROCESSING_LIMIT_REACHED when the processing callback reported that status.
    //throws a FATAL base_s3select_exception on a parse error or on any std::exception raised
    //while processing (the `return -1` statements that follow a throw are never reached).
    int process_json_buffer(char* json_buffer,size_t json_buffer_sz, bool end_of_stream=false)
    {//user keeps calling with buffers, the method is not aware of the object size.


      try{
	if(!init_buffer_stream)
	{
	  //set the memoryStreamer
	  reader.IterativeParseInit();
	  init_buffer_stream = true;

	}

	//the non-processed bytes plus the next chunk are copy into main processing buffer
	if(!end_of_stream)
	  stream_buffer.resetBuffer(json_buffer, json_buffer_sz);

	while (!reader.IterativeParseComplete()) {
	  reader.IterativeParseNext<rapidjson::kParseDefaultFlags>(stream_buffer, *this);

	  //once all key-values move into s3select(for further filtering and processing), it should be cleared

	  //TODO in the case the chunk is too small or some value in input is too big, the parsing will fail.
	  if (!end_of_stream && stream_buffer.next_src_==0 && stream_buffer.getBytesLeft() < 2048)
	  {//the non processed bytes will be processed on next fetched chunk
	    //TODO save remaining-bytes to internal buffer (or caller will use 2 sets of buffer)
	    stream_buffer.saveRemainingBytes();
	    return 0;
	  }
	  if(m_sql_processing_status == JSON_PROCESSING_LIMIT_REACHED)//return status(int) from callback
	  {
	    return JSON_PROCESSING_LIMIT_REACHED;
	  }

	  // error message
	  if(reader.HasParseError())  {
	    rapidjson::ParseErrorCode c = reader.GetParseErrorCode();
	    size_t ofs = reader.GetErrorOffset();
	    std::stringstream error_str;
	    error_str << "parsing error. code:" << c << " position: " << ofs << std::endl;
	    throw s3selectEngine::base_s3select_exception(error_str.str(), s3selectEngine::base_s3select_exception::s3select_exp_en_t::FATAL);
	    return -1;
	  }
	}//while reader.IterativeParseComplete
      }
      catch(std::exception &e){
	std::stringstream error_str;
	error_str << "failed to process JSON : " << e.what() << std::endl;
	throw s3selectEngine::base_s3select_exception(error_str.str(), s3selectEngine::base_s3select_exception::s3select_exp_en_t::FATAL);
	return -1;
      }
      return 0;
    }
};
//=== stl allocator definition
//this allocator is fit for placement new (no calls to heap)

// Thrown by ChunkAllocator when a request does not fit into its fixed pool.
class chunkalloc_out_of_mem
{
};

// STL-style allocator that serves every request from one fixed pool of
// pool_sz bytes embedded in the allocator object itself.
// - memory is handed out sequentially and is never reused: deallocate() is a no-op.
// - every copy of the allocator starts with its own, empty pool.
// - the pool offset is kept aligned to sizeof(char*); types that need a stricter
//   alignment are not supported.
template <typename T, size_t pool_sz>
class ChunkAllocator : public std::allocator<T>
{
public:
  typedef size_t size_type;
  typedef T* pointer;
  size_t buffer_capacity; // number of pool bytes already handed out
  char* buffer_ptr;       // start of the pool (always &buffer[0])

  //only ONE pool,not allocated dynamically; main assumption, caller knows in advance its memory limitations.
  char buffer[pool_sz];

  template <typename _Tp1>
  struct rebind
  {
    typedef ChunkAllocator<_Tp1, pool_sz> other;
  };

  //==================================
  // Reserves storage for num_of_element objects of type T.
  // Throws chunkalloc_out_of_mem when the pool cannot hold the request.
  // Fix: the pool offset is committed only after the bounds check succeeded, so a
  // failed request no longer leaves the allocator permanently "full"; the element
  // count is validated first so the byte-size multiplication cannot overflow.
  inline T* _Allocate(size_t num_of_element, T*)
  {
    if (num_of_element > (sizeof(buffer) / sizeof(T)))
    {
      throw chunkalloc_out_of_mem();
    }

    size_t new_capacity = buffer_capacity + sizeof(T) * num_of_element;

    // keep the next free offset aligned to the size of a pointer
    const size_t addr_alignment = (new_capacity % sizeof(char*));
    if (addr_alignment != 0)
    {
      new_capacity += sizeof(char*) - addr_alignment;
    }

    if (new_capacity > sizeof(buffer))
    {
      throw chunkalloc_out_of_mem();
    }

    pointer res = reinterpret_cast<pointer>(buffer_ptr + buffer_capacity);
    buffer_capacity = new_capacity;

    return res;
  }

  //==================================
  inline pointer allocate(size_type n, [[maybe_unused]] const void* hint = 0)
  {
    return (_Allocate(n, static_cast<pointer>(nullptr)));
  }

  //==================================
  // no-op: pool memory is released together with the allocator object.
  inline void deallocate(pointer p, size_type n)
  {
  }

  //==================================
  ChunkAllocator() noexcept : std::allocator<T>()
  {
    // alloc from main-buffer
    buffer_capacity = 0;
    memset( &buffer[0], 0, sizeof(buffer));
    buffer_ptr = &buffer[0];
  }

  //==================================
  // copy constructor: the copy gets a fresh, empty pool of its own; nothing is
  // copied from `other`.
  ChunkAllocator(const ChunkAllocator& other) noexcept : std::allocator<T>(other)
  {
    buffer_capacity = 0;
    buffer_ptr = &buffer[0];
  }

  //==================================
  ~ChunkAllocator() noexcept
  {
    //do nothing
  }
};

class base_statement;
//typedef std::vector<base_statement *> bs_stmt_vec_t; //without specific allocator

//ChunkAllocator, prevent allocation from heap.
typedef std::vector<base_statement*, ChunkAllocator<base_statement*, 4096> > bs_stmt_vec_t;
// Exception type used by the s3select engine; carries a message and a severity.
class base_s3select_exception : public std::exception
{

public:
  enum class s3select_exp_en_t
  {
    NONE,
    ERROR,
    FATAL
  } ;

private:
  s3select_exp_en_t m_severity;

public:
  std::string _msg;

  // A null message pointer is treated as an empty message
  // (std::string::assign(nullptr) is undefined behaviour).
  explicit base_s3select_exception(const char* n) : m_severity(s3select_exp_en_t::NONE)
  {
    _msg.assign(n ? n : "");
  }
  base_s3select_exception(const char* n, s3select_exp_en_t severity) : m_severity(severity)
  {
    _msg.assign(n ? n : "");
  }
  base_s3select_exception(std::string n, s3select_exp_en_t severity) : m_severity(severity)
  {
    _msg.swap(n); // take over the by-value argument instead of copying it again
  }

  const char* what() const noexcept override
  {
    return _msg.c_str();
  }

  // const, so the severity can be queried on an exception caught by const reference.
  s3select_exp_en_t severity() const
  {
    return m_severity;
  }

  virtual ~base_s3select_exception() = default;
};



// Arena allocator owned by the s3select object.
// Memory is handed out sequentially from malloc'ed buffers of
// __S3_ALLOCATION_BUFF__ bytes each and is released only in the destructor.
// NOTE(review): the class owns raw buffers but is still copyable; copying an
// instance would free the same buffers twice - confirm no caller copies it.
class s3select_allocator //s3select is the "owner"
{
private:

  std::vector<char*> list_of_buff; // every buffer allocated so far; back() is the active one
  std::vector<char*> list_of_ptr;  // objects registered through push_for_delete()
  size_t m_idx;                    // next free offset inside the active buffer

#define __S3_ALLOCATION_BUFF__ (24*1024)

  // Appends a fresh buffer and makes it the active one.
  // Throws a FATAL exception when the system is out of memory (the malloc
  // result used to be stored unchecked and dereferenced later).
  void add_buffer()
  {
    char* buff = static_cast<char*>(malloc(__S3_ALLOCATION_BUFF__));
    if (!buff)
    {
      throw base_s3select_exception("out of memory", base_s3select_exception::s3select_exp_en_t::FATAL);
    }
    list_of_buff.push_back(buff);
    m_idx = 0;
  }

  // Makes sure the active buffer can serve a request of sz bytes; switches to
  // a new buffer when it cannot. Requests bigger than a whole buffer are rejected.
  void check_capacity(size_t sz)
  {
    if (sz>__S3_ALLOCATION_BUFF__)
    {
      throw base_s3select_exception("requested size too big", base_s3select_exception::s3select_exp_en_t::FATAL);
    }

    if ((m_idx + sz) >= __S3_ALLOCATION_BUFF__)
    {
      add_buffer();
    }
  }

  // Advances the free offset past sz bytes and to the next pointer-size boundary.
  void inc(size_t sz)
  {
    m_idx += sz;
    m_idx += sizeof(char*) - (m_idx % sizeof(char*)); //alignment
  }

public:
  s3select_allocator():m_idx(0)
  {
    add_buffer();
  }

  // Returns sz bytes of uninitialized storage that stay valid until the
  // allocator is destroyed. Throws a FATAL base_s3select_exception when sz
  // exceeds the buffer size or no memory is available.
  void *alloc(size_t sz)
  {
    check_capacity(sz);

    char* buff = list_of_buff.back();

    const size_t idx = m_idx;

    inc(sz);

    return &buff[ idx ];
  }

  void push_for_delete(void *p)
  {//in case of using S3SELECT_NO_PLACEMENT_NEW
    list_of_ptr.push_back(static_cast<char*>(p));
  }

  virtual ~s3select_allocator()
  {
    for(auto b : list_of_buff)
    {
      free(b);
    }

    for(auto b : list_of_ptr)
    {//in case of using S3SELECT_NO_PLACEMENT_NEW
      // NOTE(review): the registered objects are deleted through char*, which
      // does not run their destructors - confirm this is acceptable.
      delete(b);
    }
  }
};
// placement new for allocation of all s3select objects on single(or few) buffers, deallocation of those objects is by releasing the buffer.
#define S3SELECT_NEW(self, type , ... ) [=]() \
        {   \
            auto res=new (self->getAllocator()->alloc(sizeof(type))) type(__VA_ARGS__); \
            return res; \
        }();

// no placement new; actually, it is an ordinary new with additional functionality for deleting the AST nodes.
// (this change is for verifying the valgrind report on leaks)
#define S3SELECT_NO_PLACEMENT_NEW(self, type , ... ) [=]() \
        {   \
            auto res=new type(__VA_ARGS__); \
            self->getAllocator()->push_for_delete(res); \
            return res; \
        }();

// Lookup table for the reserved words of the query language.
// Matching is exact: only the spellings listed in the table are recognised.
class s3select_reserved_word
{
  public:

  enum class reserve_word_en_t
  {
    NA,
    S3S_NULL,//TODO check AWS definitions for reserve words, its a long list , what about functions-names?
    S3S_NAN,
    S3S_TRUE,
    S3S_FALSE
  } ;

  using reserved_words = std::map<std::string,reserve_word_en_t>;

  const reserved_words m_reserved_words=
  {
    {"null",reserve_word_en_t::S3S_NULL},{"NULL",reserve_word_en_t::S3S_NULL},
    {"nan",reserve_word_en_t::S3S_NAN},{"NaN",reserve_word_en_t::S3S_NAN},
    {"true",reserve_word_en_t::S3S_TRUE},{"TRUE",reserve_word_en_t::S3S_TRUE},
    {"false",reserve_word_en_t::S3S_FALSE},{"FALSE",reserve_word_en_t::S3S_FALSE}
  };

  // Returns true when token is one of the reserved words.
  // Takes a const reference and is const itself: the lookup modifies neither
  // the token nor the table (callers passing a non-const lvalue still work).
  bool is_reserved_word(const std::string & token) const
  {
    return m_reserved_words.find(token) != m_reserved_words.end() ;
  }

  // Returns the reserved word matching token, or NA when there is none.
  // The table is searched once (it used to be searched twice).
  reserve_word_en_t get_reserved_word(const std::string & token) const
  {
    const auto entry = m_reserved_words.find(token);
    if (entry == m_reserved_words.end())
    {
      return reserve_word_en_t::NA;
    }
    return entry->second;
  }

};

class base_statement;
class projection_alias
{
//purpose: mapping between alias-name to base_statement*
//those routines are *NOT* intensive, works once per query parse time.

private:
  std::vector< std::pair<std::string, base_statement*> > alias_map;

public:
  std::vector< std::pair<std::string, base_statement*> >* get()
  {
    return &alias_map;
  }

  // Registers alias_name for bs.
  // Returns false (and changes nothing) when the alias name already exists.
  bool insert_new_entry(const std::string& alias_name, base_statement* bs)
  {
    //purpose: only unique alias names.

    for(const auto& alias: alias_map) // by reference: no per-entry string copy
    {
      if(alias.first.compare(alias_name) == 0)
      {
        return false; //alias name already exist
      }
    }

    alias_map.emplace_back(alias_name, bs);

    return true;
  }

  // Returns the statement registered for alias_name, or a null pointer when
  // the alias is unknown.
  base_statement* search_alias(const std::string& alias_name)
  {
    for(const auto& alias: alias_map)
    {
      if(alias.first.compare(alias_name) == 0)
      {
        return alias.second; //reference to execution node
      }
    }
    return nullptr;
  }
};
+ +private: + std::vector< std::pair<std::string, base_statement*> > alias_map; + +public: + std::vector< std::pair<std::string, base_statement*> >* get() + { + return &alias_map; + } + + bool insert_new_entry(std::string alias_name, base_statement* bs) + { + //purpose: only unique alias names. + + for(auto alias: alias_map) + { + if(alias.first.compare(alias_name) == 0) + { + return false; //alias name already exist + } + + } + std::pair<std::string, base_statement*> new_alias(alias_name, bs); + alias_map.push_back(new_alias); + + return true; + } + + base_statement* search_alias(std::string alias_name) + { + for(auto alias: alias_map) + { + if(alias.first.compare(alias_name) == 0) + { + return alias.second; //refernce to execution node + } + } + return 0; + } +}; + +struct binop_plus +{ + double operator()(double a, double b) + { + return a + b; + } +}; + +struct binop_minus +{ + double operator()(double a, double b) + { + return a - b; + } +}; + +struct binop_mult +{ + double operator()(double a, double b) + { + return a * b; + } +}; + +struct binop_div +{ + double operator()(double a, double b) + { + if (b == 0) { + if( std::isnan(a)) { + return a; + } else { + throw base_s3select_exception("division by zero is not allowed"); + } + } else { + return a / b; + } + } +}; + +struct binop_pow +{ + double operator()(double a, double b) + { + return pow(a, b); + } +}; + +struct binop_modulo +{ + int64_t operator()(int64_t a, int64_t b) + { + if (b == 0) + { + throw base_s3select_exception("Mod zero is not allowed"); + } else { + return a % b; + } + } +}; + +typedef std::tuple<boost::posix_time::ptime, boost::posix_time::time_duration, bool> timestamp_t; + +class value; +class multi_values +{ + public: + std::vector<value*> values; + + public: + void push_value(value* v); + + void clear() + { + values.clear(); + } + +}; + +class value +{ + +public: + typedef union + { + int64_t num; + char* str;//TODO consider string_view(save copy) + double dbl; + timestamp_t* 
timestamp; + bool b; + } value_t; + + multi_values multiple_values; + +private: + value_t __val; + //JSON query has a unique structure, the variable-name reside on input. there are cases were it should be extracted. + std::vector<std::string> m_json_key; + std::string m_to_string; + //std::basic_string<char,std::char_traits<char>,ChunkAllocator<char,256>> m_to_string; + std::string m_str_value; + //std::basic_string<char,std::char_traits<char>,ChunkAllocator<char,256>> m_str_value; + + int32_t m_precision=-1; + int32_t m_scale=-1; + +public: + enum class value_En_t + { + DECIMAL, + FLOAT, + STRING, + TIMESTAMP, + S3NULL, + S3NAN, + BOOL, + MULTIPLE_VALUES, + NA + } ; + value_En_t type; + + explicit value(int64_t n) : type(value_En_t::DECIMAL) + { + __val.num = n; + } + explicit value(int n) : type(value_En_t::DECIMAL) + { + __val.num = n; + } + explicit value(bool b) : type(value_En_t::BOOL) + { + __val.num = (int64_t)b; + } + explicit value(double d) : type(value_En_t::FLOAT) + { + __val.dbl = d; + } + explicit value(timestamp_t* timestamp) : type(value_En_t::TIMESTAMP) + { + __val.timestamp = timestamp; + } + + explicit value(const char* s) : type(value_En_t::STRING) + { + m_str_value.assign(s); + __val.str = m_str_value.data(); + } + + explicit value(std::nullptr_t) : type(value_En_t::S3NULL) + {} + + ~value() + {//TODO should be a part of the cleanup routine(__function::push_for_cleanup) + multiple_values.values.clear(); + } + + value():type(value_En_t::NA) + { + __val.num=0; + } + + bool is_number() const + { + if ((type == value_En_t::DECIMAL || type == value_En_t::FLOAT)) + { + return true; + } + + return false; + } + + bool is_string() const + { + return type == value_En_t::STRING; + } + bool is_timestamp() const + { + return type == value_En_t::TIMESTAMP; + } + + bool is_bool() const + { + return type == value_En_t::BOOL; + } + + bool is_null() const + { + return type == value_En_t::S3NULL; + } + + bool is_nan() const + { + if (type == value_En_t::FLOAT) { 
+ return std::isnan(this->__val.dbl); + } + return type == value_En_t::S3NAN; + } + + bool is_true() + { + return (i64()!=0 && !is_null()); + } + + void set_nan() + { + __val.dbl = NAN; + type = value_En_t::FLOAT; + } + + void set_true() + { + __val.num = 1; + type = value_En_t::BOOL; + } + + void set_false() + { + __val.num = 0; + type = value_En_t::BOOL; + } + + void setnull() + { + type = value_En_t::S3NULL; + } + + void set_precision_scale(int32_t* precision, int32_t* scale) + { + m_precision = *precision; + m_scale = *scale; + } + + void get_precision_scale(int32_t* precision, int32_t* scale) + { + *precision = m_precision; + *scale = m_scale; + } + + void set_string_nocopy(char* str) + {//purpose: value does not own the string + __val.str = str; + type = value_En_t::STRING; + } + + value_En_t _type() const { return type; } + + void set_json_key_path(std::vector<std::string>& key_path) + { + m_json_key = key_path; + } + + const char* to_string() //TODO very intensive , must improve this + { + + if (type != value_En_t::STRING) + { + if (type == value_En_t::DECIMAL) + { + m_to_string.assign( boost::lexical_cast<std::string>(__val.num) ); + } + if (type == value_En_t::BOOL) + { + if(__val.num == 0) + { + m_to_string.assign("false"); + } + else + { + m_to_string.assign("true"); + } + } + else if(type == value_En_t::FLOAT) + { + if(m_precision != -1 && m_scale != -1) + { + std::stringstream ss; + ss << std::fixed << std::setprecision(m_scale) << __val.dbl; + m_to_string = ss.str(); + } + else + { + m_to_string.assign( boost::lexical_cast<std::string>(__val.dbl) ); + } + } + else if (type == value_En_t::TIMESTAMP) + { + boost::posix_time::ptime new_ptime; + boost::posix_time::time_duration td; + bool flag; + + std::tie(new_ptime, td, flag) = *__val.timestamp; + + if (flag) + { + m_to_string = to_iso_extended_string(new_ptime) + "Z"; + } + else + { + std::string tz_hour = std::to_string(std::abs(td.hours())); + std::string tz_mint = 
std::to_string(std::abs(td.minutes())); + std::string sign; + if (td.is_negative()) + sign = "-"; + else + sign = "+"; + + m_to_string = to_iso_extended_string(new_ptime) + sign + + std::string(2 - tz_hour.length(), '0') + tz_hour + ":" + + std::string(2 - tz_mint.length(), '0') + tz_mint; + } + } + else if (type == value_En_t::S3NULL) + { + m_to_string.assign("null"); + } + } + else + { + m_to_string.assign( __val.str ); + } + + if(m_json_key.size()) + { + std::string key_path; + for(auto& p : m_json_key) + {//TODO upon star-operation key-path assignment is very intensive + key_path.append(p); + key_path.append("."); + } + + key_path.append(" : "); + key_path.append(m_to_string); + m_to_string = key_path; + } + + return m_to_string.c_str(); + } + + value(const value& o) + { + if(o.type == value_En_t::STRING) + { + if(o.m_str_value.size()) + { + m_str_value = o.m_str_value; + __val.str = m_str_value.data(); + } + else if(o.__val.str) + { + __val.str = o.__val.str; + } + } + else + { + this->__val = o.__val; + } + + this->m_json_key = o.m_json_key; + + this->type = o.type; + } + + value& operator=(value& o) + { + if(o.type == value_En_t::STRING) + { + if(o.m_str_value.size()) + { + m_str_value = o.m_str_value; + __val.str = m_str_value.data(); + } + else if(o.__val.str) + { + __val.str = o.__val.str; + } + } + else + { + this->__val = o.__val; + } + + this->type = o.type; + + this->m_json_key = o.m_json_key; + + return *this; + } + + value& operator=(const char* s) + { + m_str_value.assign(s); + this->__val.str = m_str_value.data(); + this->type = value_En_t::STRING; + + return *this; + } + + value& operator=(int64_t i) + { + this->__val.num = i; + this->type = value_En_t::DECIMAL; + + return *this; + } + + value& operator=(int i) + { + this->__val.num = i; + this->type = value_En_t::DECIMAL; + + return *this; + } + + value& operator=(unsigned i) + { + this->__val.num = i; + this->type = value_En_t::DECIMAL; + + return *this; + } + + value& operator=(uint64_t i) + { 
+ this->__val.num = i; + this->type = value_En_t::DECIMAL; + + return *this; + } + + value& operator=(double d) + { + this->__val.dbl = d; + this->type = value_En_t::FLOAT; + + return *this; + } + + value& operator=(bool b) + { + this->__val.num = (int64_t)b; + this->type = value_En_t::BOOL; + + return *this; + } + + value& operator=(timestamp_t* p) + { + this->__val.timestamp = p; + this->type = value_En_t::TIMESTAMP; + + return *this; + } + + int64_t i64() + { + return __val.num; + } + + const char* str() + { + return __val.str; + } + + double dbl() + { + return __val.dbl; + } + + bool bl() + { + return __val.b; + } + + timestamp_t* timestamp() const + { + return __val.timestamp; + } + + bool operator<(const value& v)//basic compare operator , most itensive runtime operation + { + //TODO NA possible? + if (is_string() && v.is_string()) + { + return strcmp(__val.str, v.__val.str) < 0; + } + + if (is_number() && v.is_number()) + { + + if(type != v.type) //conversion //TODO find better way + { + if (type == value_En_t::DECIMAL) + { + return (double)__val.num < v.__val.dbl; + } + else + { + return __val.dbl < (double)v.__val.num; + } + } + else //no conversion + { + if(type == value_En_t::DECIMAL) + { + return __val.num < v.__val.num; + } + else + { + return __val.dbl < v.__val.dbl; + } + + } + } + + if(is_timestamp() && v.is_timestamp()) + { + return *timestamp() < *(v.timestamp()); + } + + if(is_nan() || v.is_nan()) + { + return false; + } + + throw base_s3select_exception("operands not of the same type(numeric , string), while comparision"); + } + + bool operator>(const value& v) //basic compare operator , most itensive runtime operation + { + //TODO NA possible? 
+ if (is_string() && v.is_string()) + { + return strcmp(__val.str, v.__val.str) > 0; + } + + if (is_number() && v.is_number()) + { + + if(type != v.type) //conversion //TODO find better way + { + if (type == value_En_t::DECIMAL) + { + return (double)__val.num > v.__val.dbl; + } + else + { + return __val.dbl > (double)v.__val.num; + } + } + else //no conversion + { + if(type == value_En_t::DECIMAL) + { + return __val.num > v.__val.num; + } + else + { + return __val.dbl > v.__val.dbl; + } + + } + } + + if(is_timestamp() && v.is_timestamp()) + { + return *timestamp() > *(v.timestamp()); + } + + if(is_nan() || v.is_nan()) + { + return false; + } + + throw base_s3select_exception("operands not of the same type(numeric , string), while comparision"); + } + + friend bool operator==(const value& lhs, const value& rhs) //basic compare operator , most itensive runtime operation + { + //TODO NA possible? + if (lhs.is_string() && rhs.is_string()) + { + return strcmp(lhs.__val.str, rhs.__val.str) == 0; + } + + + if (lhs.is_number() && rhs.is_number()) + { + + if(lhs.type != rhs.type) //conversion //TODO find better way + { + if (lhs.type == value_En_t::DECIMAL) + { + return (double)lhs.__val.num == rhs.__val.dbl; + } + else + { + return lhs.__val.dbl == (double)rhs.__val.num; + } + } + else //no conversion + { + if(lhs.type == value_En_t::DECIMAL) + { + return lhs.__val.num == rhs.__val.num; + } + else + { + return lhs.__val.dbl == rhs.__val.dbl; + } + + } + } + + if(lhs.is_timestamp() && rhs.is_timestamp()) + { + return *(lhs.timestamp()) == *(rhs.timestamp()); + } + + if( + (lhs.is_bool() && rhs.is_bool()) + || + (lhs.is_number() && rhs.is_bool()) + || + (lhs.is_bool() && rhs.is_number()) + ) + { + return lhs.__val.num == rhs.__val.num; + } + + if (lhs.is_nan() || rhs.is_nan()) + { + return false; + } + +// in the case of NULL on right-side or NULL on left-side, the result is false. 
+ if(lhs.is_null() || rhs.is_null()) + { + return false; + } + + throw base_s3select_exception("operands not of the same type(numeric , string), while comparision"); + } + bool operator<=(const value& v) + { + if (is_nan() || v.is_nan()) { + return false; + } else { + return !(*this>v); + } + } + + bool operator>=(const value& v) + { + if (is_nan() || v.is_nan()) { + return false; + } else { + return !(*this<v); + } + } + + bool operator!=(const value& v) + { + if (is_nan() || v.is_nan()) { + return true; + } else { + return !(*this == v); + } + } + + template<typename binop> //conversion rules for arithmetical binary operations + value& compute(value& l, const value& r) //left should be this, it contain the result + { + binop __op; + + if (l.is_string() || r.is_string()) + { + throw base_s3select_exception("illegal binary operation with string"); + } + if (l.is_bool() || r.is_bool()) + { + throw base_s3select_exception("illegal binary operation with bool type"); + } + + if (l.is_number() && r.is_number()) + { + if (l.type != r.type) + { + //conversion + + if (l.type == value_En_t::DECIMAL) + { + l.__val.dbl = __op((double)l.__val.num, r.__val.dbl); + l.type = value_En_t::FLOAT; + } + else + { + l.__val.dbl = __op(l.__val.dbl, (double)r.__val.num); + l.type = value_En_t::FLOAT; + } + } + else + { + //no conversion + + if (l.type == value_En_t::DECIMAL) + { + l.__val.num = __op(l.__val.num, r.__val.num ); + l.type = value_En_t::DECIMAL; + } + else + { + l.__val.dbl = __op(l.__val.dbl, r.__val.dbl ); + l.type = value_En_t::FLOAT; + } + } + } + + if (l.is_null() || r.is_null()) + { + l.setnull(); + } else if(l.is_nan() || r.is_nan()) { + l.set_nan(); + } + + return l; + } + + value& operator+(const value& v) + { + return compute<binop_plus>(*this, v); + } + + value operator++(int) + { + *this = *this + static_cast<value>(1); + return *this; + } + + value& operator-(const value& v) + { + return compute<binop_minus>(*this, v); + } + + value& operator*(const value& v) + 
{ + return compute<binop_mult>(*this, v); + } + + value& operator/(value& v) + { + if (v.is_null() || this->is_null()) { + v.setnull(); + return v; + } else { + return compute<binop_div>(*this, v); + } + } + + value& operator^(const value& v) + { + return compute<binop_pow>(*this, v); + } + + value & operator%(const value &v) + { + if(v.type == value_En_t::DECIMAL) { + return compute<binop_modulo>(*this,v); + } else { + throw base_s3select_exception("wrong use of modulo operation!"); + } + } +}; + +void multi_values::push_value(value *v) +{ + //v could be single or multiple values + if (v->type == value::value_En_t::MULTIPLE_VALUES) + { + for (auto sv : v->multiple_values.values) + { + values.push_back(sv); + } + } + else + { + values.push_back(v); + } +} + + +class scratch_area +{ + +private: + std::vector<value> *m_schema_values; //values got a type + int m_upper_bound; + + std::vector<std::pair<std::string, int >> m_column_name_pos; + bool parquet_type; + char str_buff[4096]; + uint16_t buff_loc; + int max_json_idx; + timestamp_t tmstmp; + +public: + + typedef std::pair<std::vector<std::string>,value> json_key_value_t; + typedef std::vector< json_key_value_t > json_star_op_cont_t; + json_star_op_cont_t m_json_star_operation; + + scratch_area():m_upper_bound(-1),parquet_type(false),buff_loc(0),max_json_idx(-1) + { + m_schema_values = new std::vector<value>(128,value(nullptr)); + } + + ~scratch_area() + { + delete m_schema_values; + } + + json_star_op_cont_t* get_star_operation_cont() + { + return &m_json_star_operation; + } + + void clear_data() + { + m_json_star_operation.clear(); + for(int i=0;i<=max_json_idx;i++) + { + (*m_schema_values)[i].setnull(); + } + } + + void set_column_pos(const char* n, int pos)//TODO use std::string + { + m_column_name_pos.push_back( std::pair<const char*, int>(n, pos)); + } + + void update(std::vector<char*>& tokens, size_t num_of_tokens) + { + size_t i=0; + //increase the Vector::m_schema_values capacity(it should happen few 
times) + if ((*m_schema_values).capacity() < tokens.size()) + { + (*m_schema_values).resize( tokens.size() * 2 ); + } + + for(auto s : tokens) + { + if (i>=num_of_tokens) + { + break; + } + //not copy the string content. + (*m_schema_values)[i++].set_string_nocopy(s); + } + m_upper_bound = i; + + } + + int get_column_pos(const char* n) + { + //done only upon building the AST, not on "runtime" + + for( auto iter : m_column_name_pos) + { + if (!strcmp(iter.first.c_str(), n)) + { + return iter.second; + } + } + + return -1; + } + + void set_parquet_type() + { + parquet_type = true; + } + + void get_column_value(uint16_t column_pos, value &v) + { + if (column_pos > ((*m_schema_values).size()-1)) + { + throw base_s3select_exception("accessing scratch buffer beyond its size"); + } + + v = (*m_schema_values)[ column_pos ]; + } + + value* get_column_value(uint16_t column_pos) + { + if (column_pos > ((*m_schema_values).size()-1)) + { + throw base_s3select_exception("accessing scratch buffer beyond its size"); + } + + return &(*m_schema_values)[ column_pos ]; + } + + int get_num_of_columns() + { + return m_upper_bound; + } + + int update_json_varible(value v,int json_idx) + { + if(json_idx>max_json_idx) + { + max_json_idx = json_idx; + } + + //increase the Vector::m_schema_values capacity(it should happen few times) + if ((*m_schema_values).capacity() < static_cast<unsigned long long>(max_json_idx)) + { + (*m_schema_values).resize(max_json_idx * 2); + } + + (*m_schema_values)[ json_idx ] = v; + + if(json_idx>m_upper_bound) + { + m_upper_bound = json_idx; + } + return 0; + } + +#ifdef _ARROW_EXIST + +#define S3SELECT_MICROSEC (1000*1000) +#define S3SELECT_MILLISEX (1000) + + int update(std::vector<parquet_file_parser::parquet_value_t> &parquet_row_value, parquet_file_parser::column_pos_t &column_positions) + { + //TODO no need for copy , possible to save referece (its save last row for calculation) + + parquet_file_parser::column_pos_t::iterator column_pos_iter = 
column_positions.begin(); + m_upper_bound =0; + buff_loc=0; + + //increase the Vector::m_schema_values capacity(it should happen few times) + if ((*m_schema_values).capacity() < parquet_row_value.size()) + { + (*m_schema_values).resize(parquet_row_value.size() * 2); + } + + if (*column_pos_iter > ((*m_schema_values).size()-1)) + { + throw base_s3select_exception("accessing scratch buffer beyond its size"); + } + + for(auto v : parquet_row_value) + { + //TODO (parquet_value_t) --> (value) , or better get it as value (i.e. parquet reader know class-value) + //TODO temporary + switch( v.type ) + { + case parquet_file_parser::parquet_type::INT32: + (*m_schema_values)[ *column_pos_iter ] = v.num; + break; + + case parquet_file_parser::parquet_type::INT64: + (*m_schema_values)[ *column_pos_iter ] = v.num; + break; + + case parquet_file_parser::parquet_type::DOUBLE: + (*m_schema_values)[ *column_pos_iter ] = v.dbl; + break; + + case parquet_file_parser::parquet_type::STRING: + //TODO waste of CPU + //TODO value need to present string with char* and length + + memcpy(str_buff+buff_loc, v.str, v.str_len); + str_buff[buff_loc+v.str_len] = 0; + (*m_schema_values)[ *column_pos_iter ] = str_buff+buff_loc; + buff_loc += v.str_len+1; + break; + + case parquet_file_parser::parquet_type::PARQUET_NULL: + + (*m_schema_values)[ *column_pos_iter ].setnull(); + break; + + case parquet_file_parser::parquet_type::TIMESTAMP: //TODO milli-sec, micro-sec, nano-sec + { + auto tm_sec = v.num/S3SELECT_MICROSEC; //TODO should use the correct unit + boost::posix_time::ptime new_ptime = boost::posix_time::from_time_t( tm_sec ); + boost::posix_time::time_duration td_zero((tm_sec/3600)%24,(tm_sec/60)%24,tm_sec%60); + tmstmp = std::make_tuple(new_ptime, td_zero, (char)'Z'); + (*m_schema_values)[ *column_pos_iter ] = &tmstmp; + } + break; + + default: + throw base_s3select_exception("wrong parquet type for conversion."); + + //return -1;//TODO exception + } + m_upper_bound = *column_pos_iter+1; + 
column_pos_iter ++; + } + return 0; + } +#endif // _ARROW_EXIST + +}; + +class base_statement +{ + +protected: + + scratch_area* m_scratch; + projection_alias* m_aliases; + bool is_last_call; //valid only for aggregation functions + bool m_is_cache_result; + value m_alias_result; + base_statement* m_projection_alias; + int m_eval_stack_depth; + bool m_skip_non_aggregate_op; + value value_na; + //JSON queries has different syntax from other data-sources(Parquet,CSV) + bool m_json_statement; + uint64_t number_of_calls = 0; + std::string operator_name; + +public: + base_statement():m_scratch(nullptr), is_last_call(false), m_is_cache_result(false), + m_projection_alias(nullptr), m_eval_stack_depth(0), m_skip_non_aggregate_op(false),m_json_statement(false) {} + + void set_operator_name(const char* op) + { +#ifdef S3SELECT_PROF + operator_name = op; +#endif + } + + virtual value& eval() + { +#ifdef S3SELECT_PROF + number_of_calls++; +#endif + //purpose: on aggregation flow to run only the correct subtree(aggregation subtree) + + if (m_skip_non_aggregate_op == false) + return eval_internal();//not skipping this node. + else + { + //skipping this node. + //in case execution should skip a node, it will traverse (left and right) + //and search for subtree to execute. + //example: sum( ... ) - sum( ... ) ; the minus operand is skipped while sum() operand is not. + if(left()) + left()->eval_internal(); + + if(right()) + right()->eval_internal(); + + } + + return value_na; + } + + virtual value& eval_internal() = 0; + +public: + virtual base_statement* left() const + { + return 0; + } + virtual base_statement* right() const + { + return 0; + } + virtual std::string print(int ident) =0;//TODO complete it, one option to use level parametr in interface , + virtual bool semantic() =0;//done once , post syntax , traverse all nodes and validate semantics. 
+ + virtual void traverse_and_apply(scratch_area* sa, projection_alias* pa,bool json_statement) + { + m_scratch = sa; + m_aliases = pa; + m_json_statement = json_statement; + + if (left()) + { + left()->traverse_and_apply(m_scratch, m_aliases, json_statement); + } + if (right()) + { + right()->traverse_and_apply(m_scratch, m_aliases, json_statement); + } + } + + virtual void set_skip_non_aggregate(bool skip_non_aggregate_op) + { + m_skip_non_aggregate_op = skip_non_aggregate_op; + + if (left()) + { + left()->set_skip_non_aggregate(m_skip_non_aggregate_op); + } + if (right()) + { + right()->set_skip_non_aggregate(m_skip_non_aggregate_op); + } + } + + virtual bool is_aggregate() const + { + return false; + } + + virtual bool is_column() const + { + return false; + } + + virtual bool is_star_operation() const + { + return false; + } + + virtual void resolve_node() + {//part of semantic analysis(TODO maybe semantic method should handle this) + if (left()) + { + left()->resolve_node(); + } + if (right()) + { + right()->resolve_node(); + } + } + + bool is_json_statement() + { + return m_json_statement; + } + + bool is_function() const; + const base_statement* get_aggregate() const; + bool is_nested_aggregate(bool&) const; + bool is_column_reference() const; + bool mark_aggreagtion_subtree_to_execute(); + bool is_statement_contain_star_operation() const; + void push_for_cleanup(std::set<base_statement*>&); + +#ifdef _ARROW_EXIST + void extract_columns(parquet_file_parser::column_pos_t &cols,const uint16_t max_columns); +#endif + + virtual void set_last_call() + { + is_last_call = true; + if(left()) + { + left()->set_last_call(); + } + if(right()) + { + right()->set_last_call(); + } + } + + bool is_set_last_call() + { + return is_last_call; + } + + void invalidate_cache_result() + { + m_is_cache_result = false; + } + + bool is_result_cached() + { + return m_is_cache_result == true; + } + + void set_result_cache(value& eval_result) + { + m_alias_result = eval_result; + 
m_is_cache_result = true; + } + + void dec_call_stack_depth() + { + m_eval_stack_depth --; + } + + value& get_result_cache() + { + return m_alias_result; + } + + int& get_eval_call_depth() + { + m_eval_stack_depth++; + return m_eval_stack_depth; + } + + virtual ~base_statement() +{ +#ifdef S3SELECT_PROF +std::cout<< operator_name << ":" << number_of_calls <<std::endl; +#endif +} + + void dtor() + { + this->~base_statement(); + } + + scratch_area* getScratchArea() + { + return m_scratch; + } + + projection_alias* getAlias() + { + return m_aliases; + } + +}; + +class variable : public base_statement +{ + +public: + + enum class var_t + { + NA, + VARIABLE_NAME,//schema column (i.e. age , price , ...) + COLUMN_VALUE, //concrete value (string,number,boolean) + JSON_VARIABLE,//a key-path reference + POS, // CSV column number (i.e. _1 , _2 ... ) + STAR_OPERATION, //'*' + } ; + var_t m_var_type; + +private: + + std::string _name; + int column_pos; + value var_value; + int json_variable_idx; + + const int undefined_column_pos = -1; + const int column_alias = -2; + const char* this_operator_name = "variable"; + +public: + variable():m_var_type(var_t::NA), _name(""), column_pos(-1), json_variable_idx(-1){set_operator_name(this_operator_name);} + + explicit variable(int64_t i) : m_var_type(var_t::COLUMN_VALUE), column_pos(-1), var_value(i), json_variable_idx(-1){set_operator_name(this_operator_name);} + + explicit variable(double d) : m_var_type(var_t::COLUMN_VALUE), _name("#"), column_pos(-1), var_value(d), json_variable_idx(-1){set_operator_name(this_operator_name);} + + explicit variable(int i) : m_var_type(var_t::COLUMN_VALUE), column_pos(-1), var_value(i), json_variable_idx(-1){set_operator_name(this_operator_name);} + + explicit variable(const std::string& n) : m_var_type(var_t::VARIABLE_NAME), _name(n), column_pos(-1), json_variable_idx(-1){set_operator_name(this_operator_name);} + + explicit variable(const std::string& n, var_t tp, size_t json_idx) : 
m_var_type(var_t::NA) + {//only upon JSON use case + set_operator_name(this_operator_name); + if(tp == variable::var_t::JSON_VARIABLE) + { + m_var_type = variable::var_t::JSON_VARIABLE; + json_variable_idx = static_cast<int>(json_idx); + _name = n;//"#"; debug + } + } + + variable(const std::string& n, var_t tp) : m_var_type(var_t::NA) + { + set_operator_name(this_operator_name); + if(tp == variable::var_t::POS) + { + _name = n; + m_var_type = tp; + int pos = atoi( n.c_str() + 1 ); //TODO >0 < (schema definition , semantic analysis) + column_pos = pos -1;// _1 is the first column ( zero position ) + } + else if (tp == variable::var_t::COLUMN_VALUE) + { + _name = "#"; + m_var_type = tp; + column_pos = -1; + var_value = n.c_str(); + } + else if (tp ==variable::var_t::STAR_OPERATION) + { + _name = "#"; + m_var_type = tp; + column_pos = -1; + } + } + + explicit variable(s3select_reserved_word::reserve_word_en_t reserve_word) + { + set_operator_name(this_operator_name); + if (reserve_word == s3select_reserved_word::reserve_word_en_t::S3S_NULL) + { + m_var_type = variable::var_t::COLUMN_VALUE; + column_pos = undefined_column_pos; + var_value.type = value::value_En_t::S3NULL;//TODO use set_null + } + else if (reserve_word == s3select_reserved_word::reserve_word_en_t::S3S_NAN) + { + m_var_type = variable::var_t::COLUMN_VALUE; + column_pos = undefined_column_pos; + var_value.set_nan(); + } + else if (reserve_word == s3select_reserved_word::reserve_word_en_t::S3S_TRUE) + { + m_var_type = variable::var_t::COLUMN_VALUE; + column_pos = -1; + var_value.set_true(); + } + else if (reserve_word == s3select_reserved_word::reserve_word_en_t::S3S_FALSE) + { + m_var_type = variable::var_t::COLUMN_VALUE; + column_pos = -1; + var_value.set_false(); + } + else + { + _name = "#"; + m_var_type = var_t::NA; + column_pos = undefined_column_pos; + } + } + + void operator=(value& v) + { + var_value = v; + } + + void set_value(const char* s) + { + var_value = s; + } + + void set_value(double d) 
+ { + var_value = d; + } + + void set_value(int64_t i) + { + var_value = i; + } + + void set_value(timestamp_t* p) + { + var_value = p; + } + + void set_value(bool b) + { + var_value = b; + var_value.type = value::value_En_t::BOOL; + } + + void set_null() + { + var_value.setnull(); + } + + void set_precision_scale(int32_t* p, int32_t* s) + { + var_value.set_precision_scale(p, s); + } + + virtual ~variable() {} + + virtual bool is_column() const //is reference to column. + { + if(m_var_type == var_t::VARIABLE_NAME || m_var_type == var_t::POS || m_var_type == var_t::STAR_OPERATION) + { + return true; + } + return false; + } + + virtual bool is_star_operation() const + { + if(m_var_type == var_t::STAR_OPERATION) + { + return true; + } + return false; + } + + value& get_value() + { + return var_value; //TODO is it correct + } + + std::string get_name() + { + return _name; + } + + int get_column_pos() + { + return column_pos; + } + + virtual value::value_En_t get_value_type() + { + return var_value.type; + } + + value& star_operation() + {//purpose return content of all columns in a input stream + if(is_json_statement()) + return json_star_operation(); + + var_value.multiple_values.clear(); + for(int i=0; i<m_scratch->get_num_of_columns(); i++) + { + var_value.multiple_values.push_value( m_scratch->get_column_value(i) ); + } + var_value.type = value::value_En_t::MULTIPLE_VALUES; + return var_value; + } + + value& json_star_operation() + {//purpose: per JSON star-operation it needs to get column-name(full-path) and its value + + var_value.multiple_values.clear(); + for(auto& key_value : *m_scratch->get_star_operation_cont()) + { + key_value.second.set_json_key_path(key_value.first); + var_value.multiple_values.push_value(&key_value.second); + } + + var_value.type = value::value_En_t::MULTIPLE_VALUES; + + return var_value; + } + + virtual value& eval_internal() + { + if (m_var_type == var_t::COLUMN_VALUE) + { + return var_value; // a literal,could be deciml / float / 
string + } + else if(m_var_type == var_t::STAR_OPERATION) + { + return star_operation(); + } + else if(m_var_type == var_t::JSON_VARIABLE && json_variable_idx >= 0) + { + column_pos = json_variable_idx; //TODO handle column alias + } + else if (column_pos == undefined_column_pos) + { + //done once , for the first time + column_pos = m_scratch->get_column_pos(_name.c_str()); + + if(column_pos>=0 && m_aliases->search_alias(_name.c_str())) + { + throw base_s3select_exception(std::string("multiple definition of column {") + _name + "} as schema-column and alias", base_s3select_exception::s3select_exp_en_t::FATAL); + } + + + if (column_pos == undefined_column_pos) + { + //not belong to schema , should exist in aliases + m_projection_alias = m_aliases->search_alias(_name.c_str()); + + //not enter this scope again + column_pos = column_alias; + if(m_projection_alias == 0) + { + throw base_s3select_exception(std::string("alias {")+_name+std::string("} or column not exist in schema"), base_s3select_exception::s3select_exp_en_t::FATAL); + } + } + + } + + if (m_projection_alias) + { + if (m_projection_alias->get_eval_call_depth()>2) + { + throw base_s3select_exception("number of calls exceed maximum size, probably a cyclic reference to alias", base_s3select_exception::s3select_exp_en_t::FATAL); + } + + if (m_projection_alias->is_result_cached() == false) + { + var_value = m_projection_alias->eval(); + m_projection_alias->set_result_cache(var_value); + } + else + { + var_value = m_projection_alias->get_result_cache(); + } + + m_projection_alias->dec_call_stack_depth(); + } + else + { + m_scratch->get_column_value(column_pos,var_value); + //in the case of successive column-delimiter {1,some_data,,3}=> third column is NULL + if (var_value.is_string() && (var_value.str()== 0 || (var_value.str() && *var_value.str()==0))){ + var_value.setnull();//TODO is it correct for Parquet + } + } + + return var_value; + } + + virtual std::string print(int ident) + { + //std::string out = 
std::string(ident,' ') + std::string("var:") + std::to_string(var_value.__val.num); + //return out; + return std::string("#");//TBD + } + + virtual bool semantic() + { + return false; + } + +}; + +class arithmetic_operand : public base_statement +{ + +public: + + enum class cmp_t {NA, EQ, LE, LT, GT, GE, NE} ; + +private: + base_statement* l; + base_statement* r; + + cmp_t _cmp; + value var_value; + bool negation_result;//false: dont negate ; upon NOT operator(unary) its true + +public: + + virtual bool semantic() + { + return true; + } + + base_statement* left() const override + { + return l; + } + base_statement* right() const override + { + return r; + } + + virtual std::string print(int ident) + { + //std::string out = std::string(ident,' ') + "compare:" += std::to_string(_cmp) + "\n" + l->print(ident-5) +r->print(ident+5); + //return out; + return std::string("#");//TBD + } + + virtual value& eval_internal() + { + value l_val = l->eval(); + value r_val; + if (l_val.is_null()) { + var_value.setnull(); + return var_value; + } else {r_val = r->eval();} + if(r_val.is_null()) { + var_value.setnull(); + return var_value; + } + + switch (_cmp) + { + case cmp_t::EQ: + return var_value = bool( (l_val == r_val) ^ negation_result ); + break; + + case cmp_t::LE: + return var_value = bool( (l_val <= r_val) ^ negation_result ); + break; + + case cmp_t::GE: + return var_value = bool( (l_val >= r_val) ^ negation_result ); + break; + + case cmp_t::NE: + return var_value = bool( (l_val != r_val) ^ negation_result ); + break; + + case cmp_t::GT: + return var_value = bool( (l_val > r_val) ^ negation_result ); + break; + + case cmp_t::LT: + return var_value = bool( (l_val < r_val) ^ negation_result ); + break; + + default: + throw base_s3select_exception("internal error"); + break; + } + } + + arithmetic_operand(base_statement* _l, cmp_t c, base_statement* _r):l(_l), r(_r), _cmp(c),negation_result(false){set_operator_name("arithmetic_operand");} + + explicit 
arithmetic_operand(base_statement* p)//NOT operator + { + l = dynamic_cast<arithmetic_operand*>(p)->l; + r = dynamic_cast<arithmetic_operand*>(p)->r; + _cmp = dynamic_cast<arithmetic_operand*>(p)->_cmp; + // not( not ( logical expression )) == ( logical expression ); there is no limitation for number of NOT. + negation_result = ! dynamic_cast<arithmetic_operand*>(p)->negation_result; + } + + virtual ~arithmetic_operand() {} +}; + +class logical_operand : public base_statement +{ + +public: + + enum class oplog_t {AND, OR, NA}; + +private: + base_statement* l; + base_statement* r; + + oplog_t _oplog; + value var_value; + bool negation_result;//false: dont negate ; upon NOT operator(unary) its true + +public: + + base_statement* left() const override + { + return l; + } + base_statement* right() const override + { + return r; + } + + virtual bool semantic() + { + return true; + } + + logical_operand(base_statement* _l, oplog_t _o, base_statement* _r):l(_l), r(_r), _oplog(_o),negation_result(false){set_operator_name("logical_operand");} + + explicit logical_operand(base_statement * p)//NOT operator + { + l = dynamic_cast<logical_operand*>(p)->l; + r = dynamic_cast<logical_operand*>(p)->r; + _oplog = dynamic_cast<logical_operand*>(p)->_oplog; + // not( not ( logical expression )) == ( logical expression ); there is no limitation for number of NOT. + negation_result = ! 
dynamic_cast<logical_operand*>(p)->negation_result; + } + + virtual ~logical_operand() {} + + virtual std::string print(int ident) + { + //std::string out = std::string(ident, ' ') + "logical_operand:" += std::to_string(_oplog) + "\n" + l->print(ident - 5) + r->print(ident + 5); + //return out; + return std::string("#");//TBD + } + virtual value& eval_internal() + { + if (!l || !r) + { + throw base_s3select_exception("missing operand for logical ", base_s3select_exception::s3select_exp_en_t::FATAL); + } + value a = l->eval(); + if (_oplog == oplog_t::AND) + { + if (!a.is_null() && a.i64() == false) { + bool res = false ^ negation_result; + return var_value = res; + } + value b = r->eval(); + if(!b.is_null() && b.i64() == false) { + bool res = false ^ negation_result; + return var_value = res; + } else { + if (a.is_null() || b.is_null()) { + var_value.setnull(); + return var_value; + } else { + bool res = true ^ negation_result ; + return var_value =res; + } + } + } + else + { + if (a.is_true()) { + bool res = true ^ negation_result; + return var_value = res; + } + value b = r->eval(); + if(b.is_true() == true) { + bool res = true ^ negation_result; + return var_value = res; + } else { + if (a.is_null() || b.is_null()) { + var_value.setnull(); + return var_value; + } else { + bool res = false ^ negation_result ; + return var_value =res; + } + } + } + } +}; + +class mulldiv_operation : public base_statement +{ + +public: + + enum class muldiv_t {NA, MULL, DIV, POW, MOD} ; + +private: + base_statement* l; + base_statement* r; + + muldiv_t _mulldiv; + value var_value; + value tmp_value; + +public: + + base_statement* left() const override + { + return l; + } + base_statement* right() const override + { + return r; + } + + virtual bool semantic() + { + return true; + } + + virtual std::string print(int ident) + { + //std::string out = std::string(ident, ' ') + "mulldiv_operation:" += std::to_string(_mulldiv) + "\n" + l->print(ident - 5) + r->print(ident + 5); + //return 
out; + return std::string("#");//TBD + } + + virtual value& eval_internal() + { + switch (_mulldiv) + { + case muldiv_t::MULL: + tmp_value = l->eval(); + return var_value = tmp_value * r->eval(); + break; + + case muldiv_t::DIV: + tmp_value = l->eval(); + return var_value = tmp_value / r->eval(); + break; + + case muldiv_t::POW: + tmp_value = l->eval(); + return var_value = tmp_value ^ r->eval(); + break; + + case muldiv_t::MOD: + tmp_value = l->eval(); + return var_value = tmp_value % r->eval(); + break; + + default: + throw base_s3select_exception("internal error"); + break; + } + } + + mulldiv_operation(base_statement* _l, muldiv_t c, base_statement* _r):l(_l), r(_r), _mulldiv(c){set_operator_name("mulldiv_operation");} + + virtual ~mulldiv_operation() {} +}; + +class addsub_operation : public base_statement +{ + +public: + + enum class addsub_op_t {ADD, SUB, NA}; + +private: + base_statement* l; + base_statement* r; + + addsub_op_t _op; + value var_value; + value tmp_value; + +public: + + base_statement* left() const override + { + return l; + } + base_statement* right() const override + { + return r; + } + + virtual bool semantic() + { + return true; + } + + addsub_operation(base_statement* _l, addsub_op_t _o, base_statement* _r):l(_l), r(_r), _op(_o) {} + + virtual ~addsub_operation() {} + + virtual std::string print(int ident) + { + //std::string out = std::string(ident, ' ') + "addsub_operation:" += std::to_string(_op) + "\n" + l->print(ident - 5) + r->print(ident + 5); + return std::string("#");//TBD + } + + virtual value& eval_internal() + { + if (_op == addsub_op_t::NA) // -num , +num , unary-operation on number + { + if (l) + { + return var_value = l->eval(); + } + else if (r) + { + return var_value = r->eval(); + } + } + else if (_op == addsub_op_t::ADD) + {tmp_value=l->eval(); + return var_value = (tmp_value + r->eval()); + } + else + {tmp_value=l->eval(); + return var_value = (tmp_value - r->eval()); + } + + return var_value; + } +}; + +class 
negate_function_operation : public base_statement +{ + //purpose: some functions (between,like,in) are participating in where-clause as predicates; thus NOT unary-operator may operate on them. + + private: + + base_statement* function_to_negate; + value res; + + public: + + explicit negate_function_operation(base_statement *f):function_to_negate(f){set_operator_name("negate_function_operation");} + + virtual std::string print(int ident) + { + return std::string("#");//TBD + } + + virtual bool semantic() + { + return true; + } + + base_statement* left() const override + { + return function_to_negate; + } + + virtual value& eval_internal() + { + res = function_to_negate->eval(); + + if (res.is_number() || res.is_bool())//TODO is integer type + { + if (res.is_true()) + { + res = (bool)0; + } + else + { + res = (bool)1; + } + } + + return res; + } + +}; + +class base_function +{ + +protected: + bool aggregate; + +public: + //TODO add semantic to base-function , it operate once on function creation + // validate semantic on creation instead on run-time + virtual bool operator()(bs_stmt_vec_t* args, variable* result) = 0; + std::string m_function_name; + base_function() : aggregate(false) {} + bool is_aggregate() const + { + return aggregate == true; + } + virtual void get_aggregate_result(variable*) {} + + virtual ~base_function() = default; + + virtual void dtor() + {//release function-body implementation + this->~base_function(); + } + + void check_args_size(bs_stmt_vec_t* args, uint16_t required, const char* error_msg) + {//verify for atleast required parameters + if(args->size() < required) + { + throw base_s3select_exception(error_msg,base_s3select_exception::s3select_exp_en_t::FATAL); + } + } + + void check_args_size(bs_stmt_vec_t* args,uint16_t required) + { + if(args->size() < required) + { + std::string error_msg = m_function_name + " requires for " + std::to_string(required) + " arguments"; + throw 
base_s3select_exception(error_msg,base_s3select_exception::s3select_exp_en_t::FATAL); + } + } + + void set_function_name(const char* name) + { + m_function_name.assign(name); + } +}; + +class base_date_extract : public base_function +{ + protected: + value val_timestamp; + boost::posix_time::ptime new_ptime; + boost::posix_time::time_duration td; + bool flag; + + public: + void param_validation(bs_stmt_vec_t*& args) + { + auto iter = args->begin(); + int args_size = args->size(); + + if (args_size < 1) + { + throw base_s3select_exception("to_timestamp should have 2 parameters"); + } + + base_statement* ts = *iter; + val_timestamp = ts->eval(); + if(val_timestamp.is_timestamp()== false) + { + throw base_s3select_exception("second parameter is not timestamp"); + } + + std::tie(new_ptime, td, flag) = *val_timestamp.timestamp(); + } + +}; + +class base_date_diff : public base_function +{ + protected: + boost::posix_time::ptime ptime1; + boost::posix_time::ptime ptime2; + + public: + void param_validation(bs_stmt_vec_t*& args) + { + auto iter = args->begin(); + int args_size = args->size(); + + if (args_size < 2) + { + throw base_s3select_exception("datediff need 3 parameters"); + } + + base_statement* dt1_param = *iter; + value val_ts1 = dt1_param->eval(); + + if (val_ts1.is_timestamp() == false) + { + throw base_s3select_exception("second parameter should be timestamp"); + } + + iter++; + base_statement* dt2_param = *iter; + value val_ts2 = dt2_param->eval(); + + if (val_ts2.is_timestamp() == false) + { + throw base_s3select_exception("third parameter should be timestamp"); + } + + boost::posix_time::ptime ts1_ptime; + boost::posix_time::time_duration ts1_td; + boost::posix_time::ptime ts2_ptime; + boost::posix_time::time_duration ts2_td; + + std::tie(ts1_ptime, ts1_td, std::ignore) = *val_ts1.timestamp(); + std::tie(ts2_ptime, ts2_td, std::ignore) = *val_ts2.timestamp(); + + ptime1 = ts1_ptime + boost::posix_time::hours(ts1_td.hours() * -1); + ptime1 += 
boost::posix_time::minutes(ts1_td.minutes() * -1); + ptime2 = ts2_ptime + boost::posix_time::hours(ts2_td.hours() * -1); + ptime2 += boost::posix_time::minutes(ts2_td.minutes() * -1); + } + +}; + +class base_date_add : public base_function +{ + protected: + value val_quantity; + boost::posix_time::ptime new_ptime; + boost::posix_time::time_duration td; + bool flag; + timestamp_t new_tmstmp; + + public: + void param_validation(bs_stmt_vec_t*& args) + { + auto iter = args->begin(); + int args_size = args->size(); + + if (args_size < 2) + { + throw base_s3select_exception("add_to_timestamp should have 3 parameters"); + } + + base_statement* quan = *iter; + val_quantity = quan->eval(); + + if (val_quantity.is_number() == false) + { + throw base_s3select_exception("second parameter should be number"); //TODO what about double? + } + + iter++; + base_statement* ts = *iter; + value val_ts = ts->eval(); + + if(val_ts.is_timestamp() == false) + { + throw base_s3select_exception("third parameter should be time-stamp"); + } + + std::tie(new_ptime, td, flag) = *val_ts.timestamp(); + } + +}; + +class base_time_to_string +{ + protected: + std::vector<std::string> months = { "January", "February", "March","April", + "May", "June", "July", "August", "September", + "October", "November", "December"}; + public: + virtual std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) = 0; + virtual ~base_time_to_string() = default; +}; + +class derive_yyyy : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int64_t yr = new_ptime.date().year(); + return std::string(param - 4, '0') + std::to_string(yr); + } +} yyyy_to_string; + +class derive_yy : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int64_t yr = 
new_ptime.date().year(); + return std::string(2 - std::to_string(yr%100).length(), '0') + std::to_string(yr%100); + } +} yy_to_string; + +class derive_y : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int64_t yr = new_ptime.date().year(); + return std::to_string(yr); + } +} y_to_string; + +class derive_mmmmm_month : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int64_t mnth = new_ptime.date().month(); + return (months[mnth - 1]).substr(0, 1); + } +} mmmmm_month_to_string; + +class derive_mmmm_month : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int64_t mnth = new_ptime.date().month(); + return months[mnth - 1]; + } +} mmmm_month_to_string; + +class derive_mmm_month : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int64_t mnth = new_ptime.date().month(); + return (months[mnth - 1]).substr(0, 3); + } +} mmm_month_to_string; + +class derive_mm_month : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int64_t mnth = new_ptime.date().month(); + std::string mnth_str = std::to_string(mnth); + return std::string(2 - mnth_str.length(), '0') + mnth_str; + } +} mm_month_to_string; + +class derive_m_month : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int64_t mnth = new_ptime.date().month(); + return std::to_string(mnth); + } +} m_month_to_string; + +class derive_dd : public base_time_to_string +{ 
+ public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + std::string day = std::to_string(new_ptime.date().day()); + return std::string(2 - day.length(), '0') + day; + } +} dd_to_string; + +class derive_d : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + std::string day = std::to_string(new_ptime.date().day()); + return day; + } +} d_to_string; + +class derive_a : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int64_t hr = new_ptime.time_of_day().hours(); + std::string meridiem = (hr < 12 ? "AM" : "PM"); + return meridiem; + } +} a_to_string; + +class derive_hh : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int64_t hr = new_ptime.time_of_day().hours(); + std::string hr_12 = std::to_string(hr%12 == 0 ? 12 : hr%12); + return std::string(2 - hr_12.length(), '0') + hr_12; + } +} hh_to_string; + +class derive_h : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int64_t hr = new_ptime.time_of_day().hours(); + std::string hr_12 = std::to_string(hr%12 == 0 ? 
12 : hr%12); + return hr_12; + } +} h_to_string; + +class derive_h2 : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int64_t hr = new_ptime.time_of_day().hours(); + std::string hr_24 = std::to_string(hr); + return std::string(2 - hr_24.length(), '0') + hr_24; + } +} h2_to_string; + +class derive_h1 : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int64_t hr = new_ptime.time_of_day().hours(); + return std::to_string(hr); + } +} h1_to_string; + +class derive_mm : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + std::string mint = std::to_string(new_ptime.time_of_day().minutes()); + return std::string(2 - mint.length(), '0') + mint; + } +} mm_to_string; + +class derive_m : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + std::string mint = std::to_string(new_ptime.time_of_day().minutes()); + return mint; + } +} m_to_string; + +class derive_ss : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + std::string sec = std::to_string(new_ptime.time_of_day().seconds()); + return std::string(2 - sec.length(), '0') + sec; + } +} ss_to_string; + +class derive_s : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + std::string sec = std::to_string(new_ptime.time_of_day().seconds()); + return sec; + } +} s_to_string; + +class derive_frac_sec : public base_time_to_string +{ + public: + std::string 
print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + std::string frac_seconds = std::to_string(new_ptime.time_of_day().fractional_seconds()); + #if BOOST_DATE_TIME_POSIX_TIME_STD_CONFIG + frac_seconds = std::string(9 - frac_seconds.length(), '0') + frac_seconds; + #else + frac_seconds = std::string(6 - frac_seconds.length(), '0') + frac_seconds; + #endif + if (param >= frac_seconds.length()) + { + return frac_seconds + std::string(param - frac_seconds.length(), '0'); + } + else + { + return frac_seconds.substr(0, param); + } + } +} frac_sec_to_string; + +class derive_n : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int frac_seconds = new_ptime.time_of_day().fractional_seconds(); + + if(frac_seconds == 0) + return std::to_string(frac_seconds); + else + { + #if BOOST_DATE_TIME_POSIX_TIME_STD_CONFIG + return std::to_string(frac_seconds); + #else + return std::to_string(frac_seconds) + std::string(3, '0'); + #endif + } + } +} n_to_string; + +class derive_x1 : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int tz_hour = td.hours(); + int tz_minute = td.minutes(); + if (tz_hour == 0 && tz_minute == 0) + { + return "Z"; + } + else if (tz_minute == 0) + { + std::string tz_hr = std::to_string(std::abs(tz_hour)); + return (td.is_negative() ? "-" : "+") + std::string(2 - tz_hr.length(), '0') + tz_hr; + } + else + { + std::string tz_hr = std::to_string(std::abs(tz_hour)); + std::string tz_mn = std::to_string(std::abs(tz_minute)); + return (td.is_negative() ? 
"-" : "+") + std::string(2 - tz_hr.length(), '0') + tz_hr + std::string(2 - tz_mn.length(), '0') + tz_mn; + } + } +} x1_to_string; + +class derive_x2 : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int tz_hour = td.hours(); + int tz_minute = td.minutes(); + if (tz_hour == 0 && tz_minute == 0) + { + return "Z"; + } + else + { + std::string tz_hr = std::to_string(std::abs(tz_hour)); + std::string tz_mn = std::to_string(std::abs(tz_minute)); + return (td.is_negative() ? "-" : "+") + std::string(2 - tz_hr.length(), '0') + tz_hr + std::string(2 - tz_mn.length(), '0') + tz_mn; + } + } +} x2_to_string; + +class derive_x3 : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int tz_hour = td.hours(); + int tz_minute = td.minutes(); + if (tz_hour == 0 && tz_minute == 0) + { + return "Z"; + } + else + { + std::string tz_hr = std::to_string(std::abs(tz_hour)); + std::string tz_mn = std::to_string(std::abs(tz_minute)); + return (td.is_negative() ? "-" : "+") + std::string(2 - tz_hr.length(), '0') + tz_hr + ":" + std::string(2 - tz_mn.length(), '0') + tz_mn; + } + } +} x3_to_string; + +class derive_x : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + int tz_minute = td.minutes(); + std::string tz_hr = std::to_string(std::abs(td.hours())); + if (tz_minute == 0) + { + return (td.is_negative() ? "-" : "+") + std::string(2 - tz_hr.length(), '0') + tz_hr; + } + else + { + std::string tz_mn = std::to_string(std::abs(tz_minute)); + return (td.is_negative() ? 
"-" : "+") + std::string(2 - tz_hr.length(), '0') + tz_hr + std::string(2 - tz_mn.length(), '0') + tz_mn; + } + } +} x_to_string; + +class derive_xx : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + std::string tz_hr = std::to_string(std::abs(td.hours())); + std::string tz_mn = std::to_string(std::abs(td.minutes())); + return (td.is_negative() ? "-" : "+") + std::string(2 - tz_hr.length(), '0') + tz_hr + std::string(2 - tz_mn.length(), '0') + tz_mn; + } +} xx_to_string; + +class derive_xxx : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + std::string tz_hr = std::to_string(std::abs(td.hours())); + std::string tz_mn = std::to_string(std::abs(td.minutes())); + return (td.is_negative() ? "-" : "+") + std::string(2 - tz_hr.length(), '0') + tz_hr + ":" + std::string(2 - tz_mn.length(), '0') + tz_mn; + } +} xxx_to_string; + +class derive_delimiter : public base_time_to_string +{ + public: + std::string print_time(boost::posix_time::ptime& new_ptime, boost::posix_time::time_duration& td, uint32_t param) + { + char ch = param; + return std::string(1, ch); + } +} delimiter_to_string; + +class base_timestamp_to_string : public base_function +{ + protected: + boost::posix_time::ptime new_ptime; + boost::posix_time::time_duration td; + bool flag; + std::string format; + std::vector<char> m_metachar {'y', 'M', 'd', 'a', 'h', 'H', 'm', 's', 'S', 'n', 'X', 'x'}; + std::vector<std::string> m_metaword_vec {"yyy", "yy", "y", "MMMMM", "MMMM", "MMM", "MM", "M", + "dd", "d", "a", "hh", "h", "HH", "H", "mm", "m", "ss", "s", "n", + "XXXXX", "XXXX", "XXX", "XX", "X", "xxxxx", "xxxx", "xxx", "xx", + "x"}; + std::vector<base_time_to_string*> print_vector; + std::vector<uint32_t> para; + bool initialized = false; + + using to_string_lib_t = 
std::map<std::string,base_time_to_string* >; + + const to_string_lib_t time_to_string_functions = + { + {"yyyy+", &yyyy_to_string}, + {"yyy", &y_to_string}, + {"yy", &yy_to_string}, + {"y", &y_to_string}, + {"MMMMM", &mmmmm_month_to_string}, + {"MMMM", &mmmm_month_to_string}, + {"MMM", &mmm_month_to_string}, + {"MM", &mm_month_to_string}, + {"M", &m_month_to_string}, + {"dd", &dd_to_string }, + {"d", &d_to_string }, + {"a", &a_to_string }, + {"hh", &hh_to_string}, + {"h", &h_to_string}, + {"HH", &h2_to_string}, + {"H", &h1_to_string}, + {"mm", &mm_to_string}, + {"m", &m_to_string}, + {"ss", &ss_to_string}, + {"s", &s_to_string}, + {"S+", &frac_sec_to_string}, + {"n", &n_to_string}, + {"XXXXX", &x3_to_string}, + {"XXXX", &x2_to_string}, + {"XXX", &x3_to_string}, + {"XX", &x2_to_string}, + {"X", &x1_to_string}, + {"xxxxx", &xxx_to_string}, + {"xxxx", &xx_to_string}, + {"xxx", &xxx_to_string}, + {"xx", &xx_to_string}, + {"x", &x_to_string}, + {"delimiter", &delimiter_to_string} + }; + + public: + void param_validation(bs_stmt_vec_t*& args) + { + auto iter = args->begin(); + int args_size = args->size(); + + if (args_size < 2) + { + throw base_s3select_exception("to_string need 2 parameters"); + } + + base_statement* dt1_param = *iter; + value val_timestamp = dt1_param->eval(); + + if (val_timestamp.is_timestamp() == false) + { + throw base_s3select_exception("first parameter should be timestamp"); + } + + iter++; + base_statement* frmt = *iter; + value val_format = frmt->eval(); + + if (val_format.is_string() == false) + { + throw base_s3select_exception("second parameter should be string"); + } + + std::tie(new_ptime, td, flag) = *val_timestamp.timestamp(); + format = val_format.to_string(); + } + + uint32_t length_same_char_str(std::string str, char ch) + { + int i = 0; + while(str[i] == ch) + i++; + return i; + } + + void prepare_to_string_vector(std::vector<base_time_to_string*>& print_vector, std::vector<uint32_t>& para) + { + for (uint32_t i = 0; i < 
format.length(); i++) + { + if (std::find(m_metachar.begin(), m_metachar.end() , format[i]) != m_metachar.end()) + { + if (format.substr(i, 4).compare("yyyy") == 0) + { + uint32_t len = length_same_char_str(format.substr(i), 'y'); + auto it = time_to_string_functions.find("yyyy+"); + print_vector.push_back( it->second); + para.push_back(len); + i += len - 1; + continue; + } + else if (format[i] == 'S') + { + uint32_t len = length_same_char_str(format.substr(i), 'S'); + auto it = time_to_string_functions.find("S+"); + print_vector.push_back( it->second); + para.push_back(len); + i += len - 1; + continue; + } + + for (auto word : m_metaword_vec) + { + if (format.substr(i, word.length()).compare(word) == 0) + { + auto it = time_to_string_functions.find(word.c_str()); + print_vector.push_back( it->second); + para.push_back('\0'); + i += word.length() - 1; + break; + } + } + } + else + { + auto it = time_to_string_functions.find("delimiter"); + print_vector.push_back( it->second ); + para.push_back(format[i]); + } + } + } + + std::string execute_to_string(std::vector<base_time_to_string*>& print_vector, std::vector<uint32_t>& para) + { + std::string res; + int temp = 0; + for(auto p : print_vector) + { + res += p->print_time(new_ptime, td, para.at(temp)); + temp++; + } + return res; + } + +}; + + +class base_like : public base_function +{ + protected: + value like_expr_val; + value escape_expr_val; + bool constant_state = false; + #if REGEX_HS + hs_database_t* compiled_regex; + hs_scratch_t *scratch = NULL; + bool res; + #elif REGEX_RE2 + std::unique_ptr<RE2> compiled_regex; + #else + std::regex compiled_regex; + #endif + + public: + void param_validation(base_statement* escape_expr, base_statement* like_expr) + { + escape_expr_val = escape_expr->eval(); + if (escape_expr_val.type != value::value_En_t::STRING) + { + throw base_s3select_exception("esacpe expression must be string"); + } + + like_expr_val = like_expr->eval(); + if (like_expr_val.type != 
value::value_En_t::STRING) + { + throw base_s3select_exception("like expression must be string"); + } + } + + std::vector<char> transform(const char* s, char escape) + { + enum state_expr_t {START, ESCAPE, START_STAR_CHAR, START_METACHAR, START_ANYCHAR, METACHAR, + STAR_CHAR, ANYCHAR, END }; + state_expr_t st{START}; + + const char *p = s; + size_t size = strlen(s); + size_t i = 0; + std::vector<char> v; + + while(*p) + { + switch (st) + { + case START: + if (*p == escape) + { + st = ESCAPE; + v.push_back('^'); + } + else if (*p == '%') + { + v.push_back('^'); + v.push_back('.'); + v.push_back('*'); + st = START_STAR_CHAR; + } + else if (*p == '_') + { + v.push_back('^'); + v.push_back('.'); + st=START_METACHAR; + } + else + { + v.push_back('^'); + v.push_back(*p); + st=START_ANYCHAR; + } + break; + + case START_STAR_CHAR: + if (*p == escape) + { + st = ESCAPE; + } + else if (*p == '%') + { + st = START_STAR_CHAR; + } + else if (*p == '_') + { + v.push_back('.'); + st = METACHAR; + } + else + { + v.push_back(*p); + st = ANYCHAR; + } + break; + + case START_METACHAR: + if (*p == escape) + { + st = ESCAPE; + } + else if(*p == '_') + { + v.push_back('.'); + st = METACHAR; + } + else if(*p == '%') + { + v.push_back('.'); + v.push_back('*'); + st = STAR_CHAR; + } + else + { + v.push_back(*p); + st = ANYCHAR; + } + break; + + case START_ANYCHAR: + if (*p == escape) + { + st = ESCAPE; + } + else if (*p == '_' && i == size-1) + { + v.push_back('.'); + v.push_back('$'); + st = END; + } + else if (*p == '_') + { + v.push_back('.'); + st = METACHAR; + } + else if (*p == '%' && i == size-1) + { + v.push_back('.'); + v.push_back('*'); + v.push_back('$'); + st = END; + } + else if (*p == '%') + { + v.push_back('.'); + v.push_back('*'); + st = STAR_CHAR; + } + else if (i == size-1) + { + v.push_back(*p); + v.push_back('$'); + st = END; + } + else + { + v.push_back(*p); + st = ANYCHAR; + } + break; + + case METACHAR: + if (*p == escape) + { + st = ESCAPE; + } + else if (*p == '_' 
&& i == size-1) + { + v.push_back('.'); + v.push_back('$'); + st = END; + } + else if (*p == '_') + { + v.push_back('.'); + st = METACHAR; + } + else if (*p == '%' && i == size-1) + { + v.push_back('.'); + v.push_back('*'); + v.push_back('$'); + st = END; + } + else if (*p == '%') + { + v.push_back('.'); + v.push_back('*'); + st = STAR_CHAR; + } + else if (i == size-1) + { + v.push_back(*p); + v.push_back('$'); + st = END; + } + else + { + v.push_back(*p); + st = ANYCHAR; + } + break; + + case ANYCHAR: + if (*p == escape) + { + st = ESCAPE; + } + else if (*p == '_' && i == size-1) + { + v.push_back('.'); + v.push_back('$'); + st = END; + } + else if (*p == '_') + { + v.push_back('.'); + st = METACHAR; + } + else if (*p == '%' && i == size-1) + { + v.push_back('.'); + v.push_back('*'); + v.push_back('$'); + st = END; + } + else if (*p == '%') + { + v.push_back('.'); + v.push_back('*'); + st = STAR_CHAR; + } + else if (i == size-1) + { + v.push_back(*p); + v.push_back('$'); + st = END; + } + else + { + v.push_back(*p); + st = ANYCHAR; + } + break; + + case STAR_CHAR: + if (*p == escape) + { + st = ESCAPE; + } + else if (*p == '%' && i == size-1) + { + v.push_back('$'); + st = END; + } + else if (*p == '%') + { + st = STAR_CHAR; + } + else if (*p == '_' && i == size-1) + { + v.push_back('.'); + v.push_back('$'); + st = END; + } + else if (*p == '_') + { + v.push_back('.'); + st = METACHAR; + } + else if (i == size-1) + { + v.push_back(*p); + v.push_back('$'); + st = END; + } + else + { + v.push_back(*p); + st = ANYCHAR; + } + break; + + case ESCAPE: + if (i == size-1) + { + v.push_back(*p); + v.push_back('$'); + st = END; + } + else + { + v.push_back(*p); + st = ANYCHAR; + } + break; + + case END: + return v; + + default: + throw base_s3select_exception("missing state!"); + break; + } + p++; + i++; + } + return v; + } + + void compile(std::vector<char>& like_regex) + { + std::string like_as_regex_str(like_regex.begin(), like_regex.end()); + + #if REGEX_HS + 
std::string temp = "^" + like_as_regex_str + "\\z"; //for anchoring start and end + char* c_regex = &temp[0]; + hs_compile_error_t *compile_err; + if (hs_compile(c_regex, HS_FLAG_DOTALL, HS_MODE_BLOCK, NULL, &compiled_regex, + &compile_err) != HS_SUCCESS) + { + throw base_s3select_exception("ERROR: Unable to compile pattern."); + } + + if (hs_alloc_scratch(compiled_regex, &scratch) != HS_SUCCESS) + { + throw base_s3select_exception("ERROR: Unable to allocate scratch space."); + } + #elif REGEX_RE2 + compiled_regex = std::make_unique<RE2>(like_as_regex_str); + #else + compiled_regex = std::regex(like_as_regex_str); + #endif + } + + void match(value& main_expr_val, variable* result) + { + std::string content_str = main_expr_val.to_string(); + #if REGEX_HS + const char* content = content_str.c_str(); + res = false; + + if (hs_scan(compiled_regex, content, strlen(content), 0, scratch, eventHandler, &res) != + HS_SUCCESS) + { + throw base_s3select_exception("ERROR: Unable to scan input buffer. Exiting."); + } + + result->set_value(res); + #elif REGEX_RE2 + re2::StringPiece res[1]; + + if (compiled_regex->Match(content_str, 0, content_str.size(), RE2::ANCHOR_BOTH, res, 1)) + { + result->set_value(true); + } + else + { + result->set_value(false); + } + #else + if (std::regex_match(content_str, compiled_regex)) + { + result->set_value(true); + } + else + { + result->set_value(false); + } + #endif + } + + static int eventHandler(unsigned int id, unsigned long long from, unsigned long long to, + unsigned int flags, void* ctx) + { + *((bool*)ctx) = true; + return 0; + } + +}; + +};//namespace + +#endif diff --git a/src/s3select/include/s3select_parquet_intrf.h b/src/s3select/include/s3select_parquet_intrf.h new file mode 100644 index 000000000..df04e1618 --- /dev/null +++ b/src/s3select/include/s3select_parquet_intrf.h @@ -0,0 +1,2079 @@ + +#pragma once + +#if ! __has_include (<arrow/api.h>) || ! 
__has_include (<arrow/io/api.h>) || !__has_include (<parquet/arrow/reader.h>) +# undef _ARROW_EXIST +#endif + +#ifdef _ARROW_EXIST + +#include <iostream> +#include <arrow/api.h> +#include <arrow/io/api.h> +#include <parquet/arrow/reader.h> +#include <parquet/arrow/writer.h> +#include <parquet/exception.h> +#include <set> +#include <parquet/column_reader.h> +#include <arrow/util/io_util.h> + +#include <arrow/io/interfaces.h> +#include <utility> + +#include <mutex> +#include <functional> + +#include "internal_file_decryptor.h" +#include "encryption_internal.h" + +#if ARROW_VERSION_MAJOR < 9 +#define _ARROW_FD fd_ +#define _ARROW_FD_TYPE int +#else +#define _ARROW_FD fd_.fd() +#define _ARROW_FD_TYPE arrow::internal::FileDescriptor +#endif + +/******************************************/ +/******************************************/ +class optional_yield; +namespace s3selectEngine { +class rgw_s3select_api { + + // global object for setting interface between RGW and parquet-reader + private: + + public: + + std::function<int(int64_t,int64_t,void*,optional_yield*)> range_req_fptr; + std::function<size_t(void)> get_size_fptr; + optional_yield *m_y; + + void set_range_req_api(std::function<int(int64_t,int64_t,void*,optional_yield*)> fp) + { + range_req_fptr = fp; + } + + void set_get_size_api(std::function<size_t(void)> fp) + { + get_size_fptr = fp; + } +}; +} + +/******************************************/ +/******************************************/ +/******************************************/ + +static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'}; +static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; +constexpr int kGcmTagLength = 16; + +namespace arrow { +namespace io { +namespace internal { + +ARROW_EXPORT void CloseFromDestructor(FileInterface* file); + +// Validate a (offset, size) region (as given to ReadAt) against +// the file size. Return the actual read size. 
+ARROW_EXPORT Result<int64_t> ValidateReadRange(int64_t offset, int64_t size, + int64_t file_size); +// Validate a (offset, size) region (as given to WriteAt) against +// the file size. Short writes are not allowed. +ARROW_EXPORT Status ValidateWriteRange(int64_t offset, int64_t size, int64_t file_size); + +// Validate a (offset, size) region (as given to ReadAt or WriteAt), without +// knowing the file size. +ARROW_EXPORT Status ValidateRange(int64_t offset, int64_t size); + +ARROW_EXPORT +std::vector<ReadRange> CoalesceReadRanges(std::vector<ReadRange> ranges, + int64_t hole_size_limit, + int64_t range_size_limit); + +ARROW_EXPORT +::arrow::internal::ThreadPool* GetIOThreadPool(); + +} // namespace internal +} // namespace io +} + + +// RGWimpl and OSFile implements the access to storage objects, OSFile(filesystem files) RGWimpl( ceph S3 ) +// ObjectInterface(temporary) is "empty base class" enables injections of access function to storage-objects +// ReadableFileImpl an implementation layer to ObjectInterface objects +// ReadableFile a layer which call to ReadableFileImpl, enable runtime switching between implementations +// ParquetFileReader is the main interface (underline implementation is transparent to this layer) +// + + +namespace arrow { +class Buffer; +namespace io { + +class ObjectInterface { + +#define NOT_IMPLEMENTED {std::cout << "not implemented" << std::endl;} + +//purpose: to implement the range-request from single object +public: + ObjectInterface() : fd_(-1), is_open_(false), size_(-1), need_seeking_(false) {} + + virtual ~ObjectInterface(){} + + // Note: only one of the Open* methods below may be called on a given instance + + virtual Status OpenWritable(const std::string& path, bool truncate, bool append, bool write_only){return Status::OK();} + + // This is different from OpenWritable(string, ...) 
in that it doesn't + // truncate nor mandate a seekable file + virtual Status OpenWritable(int fd){return Status::OK();} + + virtual Status OpenReadable(const std::string& path){return Status::OK();} + + virtual Status OpenReadable(int fd){return Status::OK();} + + virtual Status CheckClosed() const {return Status::OK();} + + virtual Status Close(){return Status::OK();} + + virtual Result<int64_t> Read(int64_t nbytes, void* out){return Result<int64_t>(-1);} + + virtual Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out){return Result<int64_t>(-1);} + + virtual Status Seek(int64_t pos){return Status::OK();} + + virtual Result<int64_t> Tell() const {return Result<int64_t>(-1);} + + virtual Status Write(const void* data, int64_t length){return Status::OK();} + + virtual int fd() const{return -1;} + + virtual bool is_open() const{return false;} + + virtual int64_t size() const{return -1;} + + virtual FileMode::type mode() const{return FileMode::READ;} + + #if 0 + std::mutex& lock(){} + #endif + + protected: + virtual Status SetFileName(const std::string& file_name){return Status::OK();} + + virtual Status SetFileName(int fd){return Status::OK();} + + virtual Status CheckPositioned(){return Status::OK();} + + ::arrow::internal::PlatformFilename file_name_; + + std::mutex lock_; + + // File descriptor + _ARROW_FD_TYPE fd_; + + FileMode::type mode_; + + bool is_open_; + int64_t size_; + // Whether ReadAt made the file position non-deterministic. 
+ std::atomic<bool> need_seeking_; + +}; //ObjectInterface + +} //namespace io +} //namespace arrow + +namespace arrow { + +using internal::IOErrorFromErrno; + +namespace io { + +class OSFile : public ObjectInterface { + public: + OSFile() : fd_(-1), is_open_(false), size_(-1), need_seeking_(false) {} + + ~OSFile() {} + + // Note: only one of the Open* methods below may be called on a given instance + + Status OpenWritable(const std::string& path, bool truncate, bool append, + bool write_only) override { + RETURN_NOT_OK(SetFileName(path)); + + ARROW_ASSIGN_OR_RAISE(fd_, ::arrow::internal::FileOpenWritable(file_name_, write_only, + truncate, append)); + is_open_ = true; + mode_ = write_only ? FileMode::WRITE : FileMode::READWRITE; + + if (!truncate) { + ARROW_ASSIGN_OR_RAISE(size_, ::arrow::internal::FileGetSize(_ARROW_FD)); + } else { + size_ = 0; + } + return Status::OK(); + } + + // This is different from OpenWritable(string, ...) in that it doesn't + // truncate nor mandate a seekable file + Status OpenWritable(int fd) override { + auto result = ::arrow::internal::FileGetSize(fd); + if (result.ok()) { + size_ = *result; + } else { + // Non-seekable file + size_ = -1; + } + RETURN_NOT_OK(SetFileName(fd)); + is_open_ = true; + mode_ = FileMode::WRITE; + #if ARROW_VERSION_MAJOR < 9 + fd_ = fd; + #else + fd_ = arrow::internal::FileDescriptor{fd}; + #endif + return Status::OK(); + } + + Status OpenReadable(const std::string& path) override { + RETURN_NOT_OK(SetFileName(path)); + + ARROW_ASSIGN_OR_RAISE(fd_, ::arrow::internal::FileOpenReadable(file_name_)); + ARROW_ASSIGN_OR_RAISE(size_, ::arrow::internal::FileGetSize(_ARROW_FD)); + + is_open_ = true; + mode_ = FileMode::READ; + return Status::OK(); + } + + Status OpenReadable(int fd) override { + ARROW_ASSIGN_OR_RAISE(size_, ::arrow::internal::FileGetSize(fd)); + RETURN_NOT_OK(SetFileName(fd)); + is_open_ = true; + mode_ = FileMode::READ; + #if ARROW_VERSION_MAJOR < 9 + fd_ = fd; + #else + fd_ = 
arrow::internal::FileDescriptor{fd}; + #endif + return Status::OK(); + } + + Status CheckClosed() const override { + if (!is_open_) { + return Status::Invalid("Invalid operation on closed file"); + } + return Status::OK(); + } + + Status Close() override { + if (is_open_) { + // Even if closing fails, the fd will likely be closed (perhaps it's + // already closed). + is_open_ = false; + #if ARROW_VERSION_MAJOR < 9 + int fd = fd_; + fd_ = -1; + RETURN_NOT_OK(::arrow::internal::FileClose(fd)); + #else + RETURN_NOT_OK(fd_.Close()); + #endif + } + return Status::OK(); + } + + Result<int64_t> Read(int64_t nbytes, void* out) override { + RETURN_NOT_OK(CheckClosed()); + RETURN_NOT_OK(CheckPositioned()); + return ::arrow::internal::FileRead(_ARROW_FD, reinterpret_cast<uint8_t*>(out), nbytes); + } + + Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override { + RETURN_NOT_OK(CheckClosed()); + RETURN_NOT_OK(internal::ValidateRange(position, nbytes)); + // ReadAt() leaves the file position undefined, so require that we seek + // before calling Read() or Write(). 
+ need_seeking_.store(true); + return ::arrow::internal::FileReadAt(_ARROW_FD, reinterpret_cast<uint8_t*>(out), position, + nbytes); + } + + Status Seek(int64_t pos) override { + RETURN_NOT_OK(CheckClosed()); + if (pos < 0) { + return Status::Invalid("Invalid position"); + } + Status st = ::arrow::internal::FileSeek(_ARROW_FD, pos); + if (st.ok()) { + need_seeking_.store(false); + } + return st; + } + + Result<int64_t> Tell() const override { + RETURN_NOT_OK(CheckClosed()); + return ::arrow::internal::FileTell(_ARROW_FD); + } + + Status Write(const void* data, int64_t length) override { + RETURN_NOT_OK(CheckClosed()); + + std::lock_guard<std::mutex> guard(lock_); + RETURN_NOT_OK(CheckPositioned()); + if (length < 0) { + return Status::IOError("Length must be non-negative"); + } + return ::arrow::internal::FileWrite(_ARROW_FD, reinterpret_cast<const uint8_t*>(data), + length); + } + + int fd() const override { return _ARROW_FD; } + + bool is_open() const override { return is_open_; } + + int64_t size() const override { return size_; } + + FileMode::type mode() const override { return mode_; } + + std::mutex& lock() { return lock_; } + + protected: + Status SetFileName(const std::string& file_name) override { + return ::arrow::internal::PlatformFilename::FromString(file_name).Value(&file_name_); + } + + Status SetFileName(int fd) override { + std::stringstream ss; + ss << "<fd " << fd << ">"; + return SetFileName(ss.str()); + } + + Status CheckPositioned() override { + if (need_seeking_.load()) { + return Status::Invalid( + "Need seeking after ReadAt() before " + "calling implicitly-positioned operation"); + } + return Status::OK(); + } + + ::arrow::internal::PlatformFilename file_name_; + + std::mutex lock_; + + // File descriptor + _ARROW_FD_TYPE fd_; + + FileMode::type mode_; + + bool is_open_; + int64_t size_; + // Whether ReadAt made the file position non-deterministic. 
+ std::atomic<bool> need_seeking_; +}; +} // namespace io +} // namespace arrow + +namespace arrow { +class Buffer; +namespace io { + +class RGWimpl : public ObjectInterface { + +//purpose: to implement the range-request from single object +public: + RGWimpl(s3selectEngine::rgw_s3select_api* rgw) : fd_(-1), is_open_(false), size_(-1), need_seeking_(false),m_rgw_impl(rgw) {} + + ~RGWimpl(){} + +#define NOT_IMPLEMENT { \ + std::stringstream ss; \ + ss << " method " << __FUNCTION__ << " is not implemented;"; \ + throw parquet::ParquetException(ss.str()); \ + } + + // Note: only one of the Open* methods below may be called on a given instance + + Status OpenWritable(const std::string& path, bool truncate, bool append, bool write_only) { NOT_IMPLEMENT;return Status::OK(); } + + // This is different from OpenWritable(string, ...) in that it doesn't + // truncate nor mandate a seekable file + Status OpenWritable(int fd) {NOT_IMPLEMENT;return Status::OK(); } + + Status OpenReadable(const std::string& path) { + //RGW-implement + + RETURN_NOT_OK(SetFileName(path));//TODO can skip that + size_ = m_rgw_impl->get_size_fptr(); + + is_open_ = true; + mode_ = FileMode::READ; + return Status::OK(); + } + + Status OpenReadable(int fd) {NOT_IMPLEMENT;return Status::OK(); } + + Status CheckClosed() const { + //RGW-implement + if (!is_open_) { + return Status::Invalid("Invalid operation on closed file"); + } + return Status::OK(); + } + + Status Close() { + //RGW-implement + if (is_open_) { + // Even if closing fails, the fd will likely be closed (perhaps it's + // already closed). 
+ is_open_ = false; + //int fd = fd_; + #if ARROW_VERSION_MAJOR < 9 + fd_ = -1; + #else + fd_.Close(); + #endif + //RETURN_NOT_OK(::arrow::internal::FileClose(fd)); + } + return Status::OK(); + } + + Result<int64_t> Read(int64_t nbytes, void* out) { + NOT_IMPLEMENT; + RETURN_NOT_OK(CheckClosed()); + RETURN_NOT_OK(CheckPositioned()); + return ::arrow::internal::FileRead(_ARROW_FD, reinterpret_cast<uint8_t*>(out), nbytes); + } + + Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) { + + Result<int64_t> status = m_rgw_impl->range_req_fptr(position,nbytes,out,m_rgw_impl->m_y); + + return status; + } + + Status Seek(int64_t pos) {NOT_IMPLEMENT;return Status::OK(); } + + Result<int64_t> Tell() const { + NOT_IMPLEMENT; + return Result<int64_t>(0); + } + + Status Write(const void* data, int64_t length) { + NOT_IMPLEMENT; + return Status::OK(); + } + + int fd() const { return _ARROW_FD; } + + bool is_open() const { return is_open_; } + + int64_t size() const { return size_; } + + FileMode::type mode() const { return mode_; } + + std::mutex& lock() { return lock_; } //TODO skip + + protected: + Status SetFileName(const std::string& file_name) override { + return ::arrow::internal::PlatformFilename::FromString(file_name).Value(&file_name_); + } + + Status SetFileName(int fd) {NOT_IMPLEMENT; return Status::OK(); } + + Status CheckPositioned() {NOT_IMPLEMENT; return Status::OK(); } + + ::arrow::internal::PlatformFilename file_name_; + + std::mutex lock_; + + // File descriptor + _ARROW_FD_TYPE fd_; + + FileMode::type mode_; + + bool is_open_; + int64_t size_; + // Whether ReadAt made the file position non-deterministic. + std::atomic<bool> need_seeking_; + +private: + + s3selectEngine::rgw_s3select_api* m_rgw_impl; +}; + +} //namespace io +} //namespace arrow + +namespace arrow { + +class Buffer; +class MemoryPool; +class Status; + +namespace io { +namespace ceph { + +/// \brief An operating system file open in read-only mode. 
///
/// Reads through this implementation are unbuffered.  If many small reads
/// need to be issued, it is recommended to use a buffering layer for good
/// performance.
///
/// In this port every operation is forwarded to a private ReadableFileImpl
/// (impl_), which the constructor creates from the RGW callback bundle.
class ARROW_EXPORT ReadableFile
    : public internal::RandomAccessFileConcurrencyWrapper<ReadableFile> {
 public:
  ~ReadableFile() override;

  /// \brief Open an object for reading through the RGW callbacks
  /// \param[in] path name handed to the implementation's OpenReadable()
  /// \param[in] rgw callback bundle used for size and range requests; stored
  ///            as a raw pointer by the implementation
  /// \param[in] pool a MemoryPool for memory allocations
  /// \return ReadableFile instance, or the error Status of the open
  static Result<std::shared_ptr<ReadableFile>> Open(
      const std::string& path,s3selectEngine::rgw_s3select_api* rgw,MemoryPool* pool = default_memory_pool());

  /// \brief Open a file for reading from a file descriptor
  /// \param[in] fd file descriptor
  /// \param[in] pool a MemoryPool for memory allocations
  /// \return ReadableFile instance
  ///
  /// NOTE: the definition in this header starts with NOT_IMPLEMENT, so the
  /// call throws parquet::ParquetException instead of opening anything.
  static Result<std::shared_ptr<ReadableFile>> Open(
      int fd, MemoryPool* pool = default_memory_pool());

  /// \brief Whether the file is closed (negation of the impl's is_open())
  bool closed() const override;

  /// \brief Descriptor reported by the underlying implementation's fd()
  int file_descriptor() const;

  /// \brief Forward a read-ahead hint for `ranges` to the implementation
  Status WillNeed(const std::vector<ReadRange>& ranges) override;

 private:
  // The concurrency wrapper calls the Do*() methods below.
  friend RandomAccessFileConcurrencyWrapper<ReadableFile>;

  explicit ReadableFile(MemoryPool* pool,s3selectEngine::rgw_s3select_api* rgw);

  Status DoClose();
  Result<int64_t> DoTell() const;
  /// \brief Read up to nbytes from the current position into `buffer`
  Result<int64_t> DoRead(int64_t nbytes, void* buffer);
  /// \brief Read up to nbytes from the current position into a new Buffer
  Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);

  /// \brief Thread-safe implementation of ReadAt (caller-provided memory)
  Result<int64_t> DoReadAt(int64_t position, int64_t nbytes, void* out);

  /// \brief Thread-safe implementation of ReadAt (newly allocated Buffer)
  Result<std::shared_ptr<Buffer>> DoReadAt(int64_t position, int64_t nbytes);

  Result<int64_t> DoGetSize();
  Status DoSeek(int64_t position);

  // Implementation object; defined after this class in the same header.
  class ARROW_NO_EXPORT ReadableFileImpl;
  std::unique_ptr<ReadableFileImpl> impl_;
};


} // namespace ceph
} // namespace io
} // namespace arrow

+// ---------------------------------------------------------------------- +// ReadableFileImpl implementation + +namespace arrow { +namespace io { +namespace ceph { + +class ReadableFile::ReadableFileImpl : public ObjectInterface { + public: + + ~ReadableFileImpl() + { + if(IMPL != nullptr) + { + delete IMPL; + } + } + +#ifdef CEPH_USE_FS + explicit ReadableFileImpl(MemoryPool* pool) : pool_(pool) {IMPL=new OSFile();} +#endif + explicit ReadableFileImpl(MemoryPool* pool,s3selectEngine::rgw_s3select_api* rgw) : pool_(pool) {IMPL=new RGWimpl(rgw);} + + Status Open(const std::string& path) { return IMPL->OpenReadable(path); } + + Status Open(int fd) { return IMPL->OpenReadable(fd); } + + Result<std::shared_ptr<Buffer>> ReadBuffer(int64_t nbytes) { + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes, pool_)); + + ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, IMPL->Read(nbytes, buffer->mutable_data())); + if (bytes_read < nbytes) { + RETURN_NOT_OK(buffer->Resize(bytes_read)); + buffer->ZeroPadding(); + } + return buffer; + } + + Result<std::shared_ptr<Buffer>> ReadBufferAt(int64_t position, int64_t nbytes) { + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes, pool_)); + + ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, + IMPL->ReadAt(position, nbytes, buffer->mutable_data())); + if (bytes_read < nbytes) { + RETURN_NOT_OK(buffer->Resize(bytes_read)); + buffer->ZeroPadding(); + } + return buffer; + } + + Status WillNeed(const std::vector<ReadRange>& ranges) { + RETURN_NOT_OK(CheckClosed()); + for (const auto& range : ranges) { + RETURN_NOT_OK(internal::ValidateRange(range.offset, range.length)); +#if defined(POSIX_FADV_WILLNEED) + if (posix_fadvise(_ARROW_FD, range.offset, range.length, POSIX_FADV_WILLNEED)) { + return IOErrorFromErrno(errno, "posix_fadvise failed"); + } +#elif defined(F_RDADVISE) // macOS, BSD? 
+ struct { + off_t ra_offset; + int ra_count; + } radvisory{range.offset, static_cast<int>(range.length)}; + if (radvisory.ra_count > 0 && fcntl(_ARROW_FD, F_RDADVISE, &radvisory) == -1) { + return IOErrorFromErrno(errno, "fcntl(fd, F_RDADVISE, ...) failed"); + } +#endif + } + return Status::OK(); + } + + ObjectInterface *IMPL;//TODO to declare in ObjectInterface + + private: + + MemoryPool* pool_; + +}; + +// ReadableFile implemmetation +ReadableFile::ReadableFile(MemoryPool* pool,s3selectEngine::rgw_s3select_api* rgw) { impl_.reset(new ReadableFileImpl(pool,rgw)); } + +ReadableFile::~ReadableFile() { internal::CloseFromDestructor(this); } + +Result<std::shared_ptr<ReadableFile>> ReadableFile::Open(const std::string& path, + s3selectEngine::rgw_s3select_api* rgw, + MemoryPool* pool + ) { + auto file = std::shared_ptr<ReadableFile>(new ReadableFile(pool,rgw)); + RETURN_NOT_OK(file->impl_->Open(path)); + return file; +} + +Result<std::shared_ptr<ReadableFile>> ReadableFile::Open(int fd, MemoryPool* pool) { + NOT_IMPLEMENT; + auto file = std::shared_ptr<ReadableFile>(new ReadableFile(pool,0)); + RETURN_NOT_OK(file->impl_->Open(fd)); + return file; +} + +Status ReadableFile::DoClose() { return impl_->Close(); } + +bool ReadableFile::closed() const { return !impl_->is_open(); } + +Status ReadableFile::WillNeed(const std::vector<ReadRange>& ranges) { + return impl_->WillNeed(ranges); +} + +Result<int64_t> ReadableFile::DoTell() const { return impl_->Tell(); } + +Result<int64_t> ReadableFile::DoRead(int64_t nbytes, void* out) { + return impl_->IMPL->Read(nbytes, out); +} + +Result<int64_t> ReadableFile::DoReadAt(int64_t position, int64_t nbytes, void* out) { + return impl_->IMPL->ReadAt(position, nbytes, out); +} + +Result<std::shared_ptr<Buffer>> ReadableFile::DoReadAt(int64_t position, int64_t nbytes) { + return impl_->ReadBufferAt(position, nbytes); +} + +Result<std::shared_ptr<Buffer>> ReadableFile::DoRead(int64_t nbytes) { + return impl_->ReadBuffer(nbytes); +} + 
// Size, seek and descriptor are answered directly by the owned ObjectInterface
// implementation (impl_->IMPL).
Result<int64_t> ReadableFile::DoGetSize() { return impl_->IMPL->size(); }

// NOTE: with the RGW implementation Seek() is NOT_IMPLEMENT and therefore throws.
Status ReadableFile::DoSeek(int64_t pos) { return impl_->IMPL->Seek(pos); }

int ReadableFile::file_descriptor() const { return impl_->IMPL->fd(); }

} // namespace ceph
} // namespace io
} // namespace arrow


namespace parquet {

class ColumnReader;
class FileMetaData;
class PageReader;
class RandomAccessSource;
class RowGroupMetaData;

namespace ceph {
/// \brief Reader for one row group; all work is delegated to a Contents
/// implementation supplied at construction.
class PARQUET_EXPORT RowGroupReader {
 public:
  // Forward declare a virtual class 'Contents' to aid dependency injection and more
  // easily create test fixtures
  // In this header the implementation of Contents is SerializedRowGroup,
  // defined further below.
  struct Contents {
    virtual ~Contents() {}
    virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
    virtual const RowGroupMetaData* metadata() const = 0;
    virtual const ReaderProperties* properties() const = 0;
  };

  explicit RowGroupReader(std::unique_ptr<Contents> contents);

  // Returns the rowgroup metadata
  const RowGroupMetaData* metadata() const;

  // Construct a ColumnReader for the indicated row group-relative
  // column. Ownership is shared with the RowGroupReader.
  // Throws ParquetException when i is not below the row group's column count.
  std::shared_ptr<ColumnReader> Column(int i);

  // Page reader for the indicated row group-relative column; the same index
  // check as in Column() applies.
  std::unique_ptr<PageReader> GetColumnPageReader(int i);

 private:
  // Holds a pointer to an instance of Contents implementation
  std::unique_ptr<Contents> contents_;
};

/// \brief Entry point for reading a Parquet file; delegates to a Contents
/// implementation.
class PARQUET_EXPORT ParquetFileReader {
 public:
  // Declare a virtual class 'Contents' to aid dependency injection and more
  // easily create test fixtures
  // In this header the implementation of Contents is SerializedFile,
  // defined further below.
  struct PARQUET_EXPORT Contents {
    static std::unique_ptr<Contents> Open(
        std::shared_ptr<::arrow::io::RandomAccessFile> source,
        const ReaderProperties& props = default_reader_properties(),
        std::shared_ptr<FileMetaData> metadata = NULLPTR);

    virtual ~Contents() = default;
    // Perform any cleanup associated with the file contents
    virtual void Close() = 0;
    virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
    virtual std::shared_ptr<FileMetaData> metadata() const = 0;
  };

  ParquetFileReader();
  ~ParquetFileReader();

  // Create a reader from some implementation of parquet-cpp's generic file
  // input interface
  //
  // If you cannot provide exclusive access to your file resource, create a
  // subclass of RandomAccessSource that wraps the shared resource
  ARROW_DEPRECATED("Use arrow::io::RandomAccessFile version")
  static std::unique_ptr<ParquetFileReader> Open(
      std::unique_ptr<RandomAccessSource> source,
      const ReaderProperties& props = default_reader_properties(),
      std::shared_ptr<FileMetaData> metadata = NULLPTR);

  // Create a file reader instance from an Arrow file object. Thread-safety is
  // the responsibility of the file implementation
  static std::unique_ptr<ParquetFileReader> Open(
      std::shared_ptr<::arrow::io::RandomAccessFile> source,
      const ReaderProperties& props = default_reader_properties(),
      std::shared_ptr<FileMetaData> metadata = NULLPTR);

  // API convenience to open a serialized Parquet file, using Arrow IO
  // interfaces.
  // NOTE(review): the definition is outside this part of the file; `rgw` is
  // presumably passed on to arrow::io::ceph::ReadableFile::Open -- confirm,
  // and confirm whether `memory_map` has any effect in this port.
  static std::unique_ptr<ParquetFileReader> OpenFile(
      const std::string& path,s3selectEngine::rgw_s3select_api* rgw, bool memory_map = true,
      const ReaderProperties& props = default_reader_properties(),
      std::shared_ptr<FileMetaData> metadata = NULLPTR
      );

  void Open(std::unique_ptr<Contents> contents);
  void Close();

  // The RowGroupReader is owned by the FileReader
  std::shared_ptr<RowGroupReader> RowGroup(int i);

  // Returns the file metadata. Only one instance is ever created
  std::shared_ptr<FileMetaData> metadata() const;

  /// Pre-buffer the specified column indices in all row groups.
  ///
  /// Readers can optionally call this to cache the necessary slices
  /// of the file in-memory before deserialization. Arrow readers can
  /// automatically do this via an option. This is intended to
  /// increase performance when reading from high-latency filesystems
  /// (e.g. Amazon S3).
  ///
  /// After calling this, creating readers for row groups/column
  /// indices that were not buffered may fail. Creating multiple
  /// readers for a subset of the buffered regions is
  /// acceptable. This may be called again to buffer a different set
  /// of row groups/columns.
  ///
  /// If memory usage is a concern, note that data will remain
  /// buffered in memory until either \a PreBuffer() is called again,
  /// or the reader itself is destructed. Reading - and buffering -
  /// only one row group at a time may be useful.
  void PreBuffer(const std::vector<int>& row_groups,
                 const std::vector<int>& column_indices,
                 const ::arrow::io::IOContext& ctx,
                 const ::arrow::io::CacheOptions& options);

 private:
  // Holds a pointer to an instance of Contents implementation
  std::unique_ptr<Contents> contents_;
};

// Read only Parquet file metadata
std::shared_ptr<FileMetaData> PARQUET_EXPORT
ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);

/// \brief Scan all values in file.
Useful for performance testing +/// \param[in] columns the column numbers to scan. If empty scans all +/// \param[in] column_batch_size number of values to read at a time when scanning column +/// \param[in] reader a ParquetFileReader instance +/// \return number of semantic rows in file +PARQUET_EXPORT +int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size, + ParquetFileReader* reader); + +}//namespace ceph +}//namespace parquet + + +namespace parquet { + +namespace ceph { + +// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file +static constexpr int64_t kDefaultFooterReadSize = 64 * 1024; +static constexpr uint32_t kFooterSize = 8; + +// For PARQUET-816 +static constexpr int64_t kMaxDictHeaderSize = 100; + +// ---------------------------------------------------------------------- +// RowGroupReader public API + +RowGroupReader::RowGroupReader(std::unique_ptr<Contents> contents) + : contents_(std::move(contents)) {} + +std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) { + if (i >= metadata()->num_columns()) { + std::stringstream ss; + ss << "Trying to read column index " << i << " but row group metadata has only " + << metadata()->num_columns() << " columns"; + throw ParquetException(ss.str()); + } + const ColumnDescriptor* descr = metadata()->schema()->Column(i); + + std::unique_ptr<PageReader> page_reader = contents_->GetColumnPageReader(i); + return ColumnReader::Make( + descr, std::move(page_reader), + const_cast<ReaderProperties*>(contents_->properties())->memory_pool()); +} + +std::unique_ptr<PageReader> RowGroupReader::GetColumnPageReader(int i) { + if (i >= metadata()->num_columns()) { + std::stringstream ss; + ss << "Trying to read column index " << i << " but row group metadata has only " + << metadata()->num_columns() << " columns"; + throw ParquetException(ss.str()); + } + return contents_->GetColumnPageReader(i); +} + +// Returns the rowgroup metadata +const RowGroupMetaData* 
RowGroupReader::metadata() const { return contents_->metadata(); } + +/// Compute the section of the file that should be read for the given +/// row group and column chunk. +::arrow::io::ReadRange ComputeColumnChunkRange(FileMetaData* file_metadata, + int64_t source_size, int row_group_index, + int column_index) { + auto row_group_metadata = file_metadata->RowGroup(row_group_index); + auto column_metadata = row_group_metadata->ColumnChunk(column_index); + + int64_t col_start = column_metadata->data_page_offset(); + if (column_metadata->has_dictionary_page() && + column_metadata->dictionary_page_offset() > 0 && + col_start > column_metadata->dictionary_page_offset()) { + col_start = column_metadata->dictionary_page_offset(); + } + + int64_t col_length = column_metadata->total_compressed_size(); + // PARQUET-816 workaround for old files created by older parquet-mr + const ApplicationVersion& version = file_metadata->writer_version(); + if (version.VersionLt(ApplicationVersion::PARQUET_816_FIXED_VERSION())) { + // The Parquet MR writer had a bug in 1.2.8 and below where it didn't include the + // dictionary page header size in total_compressed_size and total_uncompressed_size + // (see IMPALA-694). We add padding to compensate. 
+ int64_t bytes_remaining = source_size - (col_start + col_length); + int64_t padding = std::min<int64_t>(kMaxDictHeaderSize, bytes_remaining); + col_length += padding; + } + + return {col_start, col_length}; +} + +// RowGroupReader::Contents implementation for the Parquet file specification +class SerializedRowGroup : public RowGroupReader::Contents { + public: + SerializedRowGroup(std::shared_ptr<ArrowInputFile> source, + std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source, + int64_t source_size, FileMetaData* file_metadata, + int row_group_number, const ReaderProperties& props, + std::shared_ptr<parquet::InternalFileDecryptor> file_decryptor = nullptr) + : source_(std::move(source)), + cached_source_(std::move(cached_source)), + source_size_(source_size), + file_metadata_(file_metadata), + properties_(props), + row_group_ordinal_(row_group_number), + file_decryptor_(file_decryptor) { + row_group_metadata_ = file_metadata->RowGroup(row_group_number); + } + + const RowGroupMetaData* metadata() const override { return row_group_metadata_.get(); } + + const ReaderProperties* properties() const override { return &properties_; } + + std::unique_ptr<PageReader> GetColumnPageReader(int i) override { + // Read column chunk from the file + auto col = row_group_metadata_->ColumnChunk(i); + + ::arrow::io::ReadRange col_range = + ComputeColumnChunkRange(file_metadata_, source_size_, row_group_ordinal_, i); + std::shared_ptr<ArrowInputStream> stream; + if (cached_source_) { + // PARQUET-1698: if read coalescing is enabled, read from pre-buffered + // segments. + PARQUET_ASSIGN_OR_THROW(auto buffer, cached_source_->Read(col_range)); + stream = std::make_shared<::arrow::io::BufferReader>(buffer); + } else { + stream = properties_.GetStream(source_, col_range.offset, col_range.length); + } + + std::unique_ptr<ColumnCryptoMetaData> crypto_metadata = col->crypto_metadata(); + + // Column is encrypted only if crypto_metadata exists. 
+ if (!crypto_metadata) { + return PageReader::Open(stream, col->num_values(), col->compression(), + properties_.memory_pool()); + } + + if (file_decryptor_ == nullptr) { + throw ParquetException("RowGroup is noted as encrypted but no file decryptor"); + } + + constexpr auto kEncryptedRowGroupsLimit = 32767; + if (i > kEncryptedRowGroupsLimit) { + throw ParquetException("Encrypted files cannot contain more than 32767 row groups"); + } + + // The column is encrypted + std::shared_ptr<::parquet::Decryptor> meta_decryptor; + std::shared_ptr<Decryptor> data_decryptor; + // The column is encrypted with footer key + if (crypto_metadata->encrypted_with_footer_key()) { + meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta(); + data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); + + CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_, + static_cast<int16_t>(i), meta_decryptor, data_decryptor); + return PageReader::Open(stream, col->num_values(), col->compression(), + #if ARROW_VERSION_MAJOR > 8 + false, + #endif + properties_.memory_pool(), &ctx); + } + + // The column is encrypted with its own key + std::string column_key_metadata = crypto_metadata->key_metadata(); + const std::string column_path = crypto_metadata->path_in_schema()->ToDotString(); + + meta_decryptor = + file_decryptor_->GetColumnMetaDecryptor(column_path, column_key_metadata); + data_decryptor = + file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata); + + CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_, + static_cast<int16_t>(i), meta_decryptor, data_decryptor); + return PageReader::Open(stream, col->num_values(), col->compression(), + #if ARROW_VERSION_MAJOR > 8 + false, + #endif + properties_.memory_pool(), &ctx); + } + + private: + std::shared_ptr<ArrowInputFile> source_; + // Will be nullptr if PreBuffer() is not called. 
+ std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source_; + int64_t source_size_; + FileMetaData* file_metadata_; + std::unique_ptr<RowGroupMetaData> row_group_metadata_; + ReaderProperties properties_; + int row_group_ordinal_; + std::shared_ptr<InternalFileDecryptor> file_decryptor_; +}; + +// ---------------------------------------------------------------------- +// SerializedFile: An implementation of ParquetFileReader::Contents that deals +// with the Parquet file structure, Thrift deserialization, and other internal +// matters + +// This class takes ownership of the provided data source +class SerializedFile : public ParquetFileReader::Contents { + public: + SerializedFile(std::shared_ptr<ArrowInputFile> source, + const ReaderProperties& props = default_reader_properties()) + : source_(std::move(source)), properties_(props) { + PARQUET_ASSIGN_OR_THROW(source_size_, source_->GetSize()); + } + + ~SerializedFile() override { + try { + Close(); + } catch (...) { + } + } + + void Close() override { + if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys(); + } + + std::shared_ptr<RowGroupReader> GetRowGroup(int i) override { + std::unique_ptr<SerializedRowGroup> contents( + new SerializedRowGroup(source_, cached_source_, source_size_, + file_metadata_.get(), i, properties_, file_decryptor_)); + return std::make_shared<RowGroupReader>(std::move(contents)); + } + + std::shared_ptr<FileMetaData> metadata() const override { return file_metadata_; } + + void set_metadata(std::shared_ptr<FileMetaData> metadata) { + file_metadata_ = std::move(metadata); + } + + void PreBuffer(const std::vector<int>& row_groups, + const std::vector<int>& column_indices, + const ::arrow::io::IOContext& ctx, + const ::arrow::io::CacheOptions& options) { + cached_source_ = + std::make_shared<::arrow::io::internal::ReadRangeCache>(source_, ctx, options); + //std::vector<arrow::io::ReadRange> ranges; + std::vector<::arrow::io::ReadRange> ranges; + for (int row : 
row_groups) { + for (int col : column_indices) { + ranges.push_back( + ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col)); + } + } + PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges)); + } + + void ParseMetaData() { + if (source_size_ == 0) { + throw ParquetInvalidOrCorruptedFileException("Parquet file size is 0 bytes"); + } else if (source_size_ < kFooterSize) { + throw ParquetInvalidOrCorruptedFileException( + "Parquet file size is ", source_size_, + " bytes, smaller than the minimum file footer (", kFooterSize, " bytes)"); + } + + int64_t footer_read_size = std::min(source_size_, kDefaultFooterReadSize); + PARQUET_ASSIGN_OR_THROW( + auto footer_buffer, + source_->ReadAt(source_size_ - footer_read_size, footer_read_size)); + + // Check if all bytes are read. Check if last 4 bytes read have the magic bits + if (footer_buffer->size() != footer_read_size || + (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 && + memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { + throw ParquetInvalidOrCorruptedFileException( + "Parquet magic bytes not found in footer. Either the file is corrupted or this " + "is not a parquet file."); + } + + if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0) { + // Encrypted file with Encrypted footer. + ParseMetaDataOfEncryptedFileWithEncryptedFooter(footer_buffer, footer_read_size); + return; + } + + // No encryption or encryption with plaintext footer mode. + std::shared_ptr<Buffer> metadata_buffer; + uint32_t metadata_len, read_metadata_len; + ParseUnencryptedFileMetadata(footer_buffer, footer_read_size, &metadata_buffer, + &metadata_len, &read_metadata_len); + + auto file_decryption_properties = properties_.file_decryption_properties().get(); + if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file. 
+ if (file_decryption_properties != nullptr) { + if (!file_decryption_properties->plaintext_files_allowed()) { + throw ParquetException("Applying decryption properties on plaintext file"); + } + } + } else { + // Encrypted file with plaintext footer mode. + ParseMetaDataOfEncryptedFileWithPlaintextFooter( + file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len); + } + } + + private: + std::shared_ptr<ArrowInputFile> source_; + std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source_; + int64_t source_size_; + std::shared_ptr<FileMetaData> file_metadata_; + ReaderProperties properties_; + + std::shared_ptr<::parquet::InternalFileDecryptor> file_decryptor_; + + void ParseUnencryptedFileMetadata(const std::shared_ptr<Buffer>& footer_buffer, + int64_t footer_read_size, + std::shared_ptr<Buffer>* metadata_buffer, + uint32_t* metadata_len, uint32_t* read_metadata_len); + + std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties, + EncryptionAlgorithm& algo); + + void ParseMetaDataOfEncryptedFileWithPlaintextFooter( + FileDecryptionProperties* file_decryption_properties, + const std::shared_ptr<Buffer>& metadata_buffer, uint32_t metadata_len, + uint32_t read_metadata_len); + + void ParseMetaDataOfEncryptedFileWithEncryptedFooter( + const std::shared_ptr<Buffer>& footer_buffer, int64_t footer_read_size); +}; + +void SerializedFile::ParseUnencryptedFileMetadata( + const std::shared_ptr<Buffer>& footer_buffer, int64_t footer_read_size, + std::shared_ptr<Buffer>* metadata_buffer, uint32_t* metadata_len, + uint32_t* read_metadata_len) { + *metadata_len = ::arrow::util::SafeLoadAs<uint32_t>( + reinterpret_cast<const uint8_t*>(footer_buffer->data()) + footer_read_size - + kFooterSize); + int64_t metadata_start = source_size_ - kFooterSize - *metadata_len; + if (*metadata_len > source_size_ - kFooterSize) { + throw ParquetInvalidOrCorruptedFileException( + "Parquet file size is ", source_size_, + " bytes, smaller than 
the size reported by metadata (", metadata_len, "bytes)"); + } + + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (*metadata_len + kFooterSize)) { + *metadata_buffer = SliceBuffer( + footer_buffer, footer_read_size - *metadata_len - kFooterSize, *metadata_len); + } else { + PARQUET_ASSIGN_OR_THROW(*metadata_buffer, + source_->ReadAt(metadata_start, *metadata_len)); + if ((*metadata_buffer)->size() != *metadata_len) { + throw ParquetException("Failed reading metadata buffer (requested " + + std::to_string(*metadata_len) + " bytes but got " + + std::to_string((*metadata_buffer)->size()) + " bytes)"); + } + } + + *read_metadata_len = *metadata_len; + file_metadata_ = FileMetaData::Make((*metadata_buffer)->data(), read_metadata_len); +} + +void SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( + const std::shared_ptr<Buffer>& footer_buffer, int64_t footer_read_size) { + // encryption with encrypted footer + // both metadata & crypto metadata length + uint32_t footer_len = ::arrow::util::SafeLoadAs<uint32_t>( + reinterpret_cast<const uint8_t*>(footer_buffer->data()) + footer_read_size - + kFooterSize); + int64_t crypto_metadata_start = source_size_ - kFooterSize - footer_len; + if (kFooterSize + footer_len > source_size_) { + throw ParquetInvalidOrCorruptedFileException( + "Parquet file size is ", source_size_, + " bytes, smaller than the size reported by footer's (", footer_len, "bytes)"); + } + std::shared_ptr<Buffer> crypto_metadata_buffer; + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (footer_len + kFooterSize)) { + crypto_metadata_buffer = SliceBuffer( + footer_buffer, footer_read_size - footer_len - kFooterSize, footer_len); + } else { + PARQUET_ASSIGN_OR_THROW(crypto_metadata_buffer, + source_->ReadAt(crypto_metadata_start, footer_len)); + if (crypto_metadata_buffer->size() != footer_len) { + throw ParquetException("Failed reading encrypted metadata buffer (requested " + 
+ std::to_string(footer_len) + " bytes but got " + + std::to_string(crypto_metadata_buffer->size()) + " bytes)"); + } + } + auto file_decryption_properties = properties_.file_decryption_properties().get(); + if (file_decryption_properties == nullptr) { + throw ParquetException( + "Could not read encrypted metadata, no decryption found in reader's properties"); + } + uint32_t crypto_metadata_len = footer_len; + std::shared_ptr<FileCryptoMetaData> file_crypto_metadata = + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); + // Handle AAD prefix + EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); + std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); + file_decryptor_ = std::make_shared<::parquet::InternalFileDecryptor>( + file_decryption_properties, file_aad, algo.algorithm, + file_crypto_metadata->key_metadata(), properties_.memory_pool()); + + int64_t metadata_offset = source_size_ - kFooterSize - footer_len + crypto_metadata_len; + uint32_t metadata_len = footer_len - crypto_metadata_len; + PARQUET_ASSIGN_OR_THROW(auto metadata_buffer, + source_->ReadAt(metadata_offset, metadata_len)); + if (metadata_buffer->size() != metadata_len) { + throw ParquetException("Failed reading metadata buffer (requested " + + std::to_string(metadata_len) + " bytes but got " + + std::to_string(metadata_buffer->size()) + " bytes)"); + } + + file_metadata_ = + FileMetaData::Make(metadata_buffer->data(), &metadata_len, file_decryptor_); + //FileMetaData::Make(metadata_buffer->data(), &metadata_len, default_reader_properties(), file_decryptor_); //version>9 +} + +void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter( + FileDecryptionProperties* file_decryption_properties, + const std::shared_ptr<Buffer>& metadata_buffer, uint32_t metadata_len, + uint32_t read_metadata_len) { + // Providing decryption properties in plaintext footer mode is not mandatory, for + // example when reading by legacy reader. 
+ if (file_decryption_properties != nullptr) { + EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); + // Handle AAD prefix + std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); + file_decryptor_ = std::make_shared<::parquet::InternalFileDecryptor>( + file_decryption_properties, file_aad, algo.algorithm, + file_metadata_->footer_signing_key_metadata(), properties_.memory_pool()); + // set the InternalFileDecryptor in the metadata as well, as it's used + // for signature verification and for ColumnChunkMetaData creation. +#if GAL_set_file_decryptor_declare_private + file_metadata_->set_file_decryptor(file_decryptor_); +#endif + if (file_decryption_properties->check_plaintext_footer_integrity()) { + if (metadata_len - read_metadata_len != + (parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength)) { + throw ParquetInvalidOrCorruptedFileException( + "Failed reading metadata for encryption signature (requested ", + parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength, + " bytes but have ", metadata_len - read_metadata_len, " bytes)"); + } + + if (!file_metadata_->VerifySignature(metadata_buffer->data() + read_metadata_len)) { + throw ParquetInvalidOrCorruptedFileException( + "Parquet crypto signature verification failed"); + } + } + } +} + +std::string SerializedFile::HandleAadPrefix( + FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo) { + std::string aad_prefix_in_properties = file_decryption_properties->aad_prefix(); + std::string aad_prefix = aad_prefix_in_properties; + bool file_has_aad_prefix = algo.aad.aad_prefix.empty() ? 
false : true; + std::string aad_prefix_in_file = algo.aad.aad_prefix; + + if (algo.aad.supply_aad_prefix && aad_prefix_in_properties.empty()) { + throw ParquetException( + "AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); + } + + if (file_has_aad_prefix) { + if (!aad_prefix_in_properties.empty()) { + if (aad_prefix_in_properties.compare(aad_prefix_in_file) != 0) { + throw ParquetException( + "AAD Prefix in file and in properties " + "is not the same"); + } + } + aad_prefix = aad_prefix_in_file; + std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier = + file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != nullptr) aad_prefix_verifier->Verify(aad_prefix); + } else { + if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properties.empty()) { + throw ParquetException( + "AAD Prefix set in decryption properties, but was not used " + "for file encryption"); + } + std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier = + file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != nullptr) { + throw ParquetException( + "AAD Prefix Verifier is set, but AAD Prefix not found in file"); + } + } + return aad_prefix + algo.aad.aad_file_unique; +} + +// ---------------------------------------------------------------------- +// ParquetFileReader public API + +ParquetFileReader::ParquetFileReader() {} + +ParquetFileReader::~ParquetFileReader() { + try { + Close(); + } catch (...) { + } +} + +// Open the file. 
If no metadata is passed, it is parsed from the footer of +// the file +std::unique_ptr<ParquetFileReader::Contents> ParquetFileReader::Contents::Open( + std::shared_ptr<ArrowInputFile> source, const ReaderProperties& props, + std::shared_ptr<FileMetaData> metadata) { + std::unique_ptr<ParquetFileReader::Contents> result( + new SerializedFile(std::move(source), props)); + + // Access private methods here, but otherwise unavailable + SerializedFile* file = static_cast<SerializedFile*>(result.get()); + + if (metadata == nullptr) { + // Validates magic bytes, parses metadata, and initializes the SchemaDescriptor + file->ParseMetaData(); + } else { + file->set_metadata(std::move(metadata)); + } + + return result; +} + +std::unique_ptr<ParquetFileReader> ParquetFileReader::Open( + std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props, + std::shared_ptr<FileMetaData> metadata) { + auto contents = SerializedFile::Open(std::move(source), props, std::move(metadata)); + std::unique_ptr<ParquetFileReader> result(new ParquetFileReader()); + result->Open(std::move(contents)); + return result; +} + +#if GAL_NOT_IMPLEMENTED +std::unique_ptr<ParquetFileReader> ParquetFileReader::Open( + std::unique_ptr<RandomAccessSource> source, const ReaderProperties& props, + std::shared_ptr<FileMetaData> metadata) { + auto wrapper = std::make_shared<ParquetInputWrapper>(std::move(source)); + return Open(std::move(wrapper), props, std::move(metadata)); +} +#endif + +std::unique_ptr<ParquetFileReader> ParquetFileReader::OpenFile( + const std::string& path, s3selectEngine::rgw_s3select_api* rgw, bool memory_map, const ReaderProperties& props, + std::shared_ptr<FileMetaData> metadata) { + std::shared_ptr<::arrow::io::RandomAccessFile> source; + if (memory_map) { + PARQUET_ASSIGN_OR_THROW( + source, ::arrow::io::MemoryMappedFile::Open(path, ::arrow::io::FileMode::READ));//GAL change that also, or to remove? 
+ } else { + PARQUET_ASSIGN_OR_THROW(source, + ::arrow::io::ceph::ReadableFile::Open(path, rgw, props.memory_pool())); + } + + return Open(std::move(source), props, std::move(metadata)); +} + +void ParquetFileReader::Open(std::unique_ptr<ParquetFileReader::Contents> contents) { + contents_ = std::move(contents); +} + +void ParquetFileReader::Close() { + if (contents_) { + contents_->Close(); + } +} + +std::shared_ptr<FileMetaData> ParquetFileReader::metadata() const { + return contents_->metadata(); +} + +std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) { + if (i >= metadata()->num_row_groups()) { + std::stringstream ss; + ss << "Trying to read row group " << i << " but file only has " + << metadata()->num_row_groups() << " row groups"; + throw ParquetException(ss.str()); + } + return contents_->GetRowGroup(i); +} + +void ParquetFileReader::PreBuffer(const std::vector<int>& row_groups, + const std::vector<int>& column_indices, + const ::arrow::io::IOContext& ctx, + const ::arrow::io::CacheOptions& options) { + // Access private methods here + SerializedFile* file = + ::arrow::internal::checked_cast<SerializedFile*>(contents_.get()); + file->PreBuffer(row_groups, column_indices, ctx, options); +} + +// ---------------------------------------------------------------------- +// File metadata helpers + +std::shared_ptr<FileMetaData> ReadMetaData( + const std::shared_ptr<::arrow::io::RandomAccessFile>& source) { + return ParquetFileReader::Open(source)->metadata(); +} + +// ---------------------------------------------------------------------- +// File scanner for performance testing +#if GAL_ScanAllValues_is_no_declare +int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size, + ParquetFileReader* reader) { + std::vector<int16_t> rep_levels(column_batch_size); + std::vector<int16_t> def_levels(column_batch_size); + + int num_columns = static_cast<int>(columns.size()); + + // columns are not specified explicitly. 
Add all columns + if (columns.size() == 0) { + num_columns = reader->metadata()->num_columns(); + columns.resize(num_columns); + for (int i = 0; i < num_columns; i++) { + columns[i] = i; + } + } + + std::vector<int64_t> total_rows(num_columns, 0); + + for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) { + auto group_reader = reader->RowGroup(r); + int col = 0; + for (auto i : columns) { + std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i); + size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type()); + std::vector<uint8_t> values(column_batch_size * value_byte_size); + + int64_t values_read = 0; + while (col_reader->HasNext()) { + int64_t levels_read = + ScanAllValues(column_batch_size, def_levels.data(), rep_levels.data(), + values.data(), &values_read, col_reader.get()); + if (col_reader->descr()->max_repetition_level() > 0) { + for (int64_t i = 0; i < levels_read; i++) { + if (rep_levels[i] == 0) { + total_rows[col]++; + } + } + } else { + total_rows[col] += levels_read; + } + } + col++; + } + } + + for (int i = 1; i < num_columns; ++i) { + if (total_rows[0] != total_rows[i]) { + throw ParquetException("Parquet error: Total rows among columns do not match"); + } + } + + return total_rows[0]; +} +#endif + +} //namespace ceph +} //namespace parquet + +/******************************************/ +/******************************************/ +/******************************************/ +class column_reader_wrap +{ + +private: + + int64_t m_rownum; + parquet::Type::type m_type; + std::shared_ptr<parquet::ceph::RowGroupReader> m_row_group_reader; + int m_row_grouop_id; + uint16_t m_col_id; + parquet::ceph::ParquetFileReader* m_parquet_reader; + std::shared_ptr<parquet::ColumnReader> m_ColumnReader; + bool m_end_of_stream; + bool m_read_last_value; + + +public: + + enum class parquet_type + { + NA_TYPE, + STRING, + INT32, + INT64, + FLOAT, + DOUBLE, + TIMESTAMP, + PARQUET_NULL + }; + + struct parquet_value + { + int64_t 
num; + char *str; //str is pointing to offset in string which is NOT null terminated. + uint16_t str_len; + double dbl; + parquet_type type; + + parquet_value():type(parquet_type::NA_TYPE){} + }; + + typedef struct parquet_value parquet_value_t; + + enum class parquet_column_read_state {PARQUET_OUT_OF_RANGE,PARQUET_READ_OK}; + + private: + parquet_value_t m_last_value; + + public: + column_reader_wrap(std::unique_ptr<parquet::ceph::ParquetFileReader> & parquet_reader,uint16_t col_id); + + parquet::Type::type get_type(); + + bool HasNext();//TODO template + + int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, + parquet_value_t* values, int64_t* values_read); + + int64_t Skip(int64_t rows_to_skip); + + parquet_column_read_state Read(uint64_t rownum,parquet_value_t & value); + +}; + +class parquet_file_parser +{ + +public: + + typedef std::vector<std::pair<std::string, column_reader_wrap::parquet_type>> schema_t; + typedef std::set<uint16_t> column_pos_t; + typedef std::vector<column_reader_wrap::parquet_value_t> row_values_t; + + typedef column_reader_wrap::parquet_value_t parquet_value_t; + typedef column_reader_wrap::parquet_type parquet_type; + +private: + + std::string m_parquet_file_name; + uint32_t m_num_of_columms; + uint64_t m_num_of_rows; + uint64_t m_rownum; + schema_t m_schm; + int m_num_row_groups; + std::shared_ptr<parquet::FileMetaData> m_file_metadata; + std::unique_ptr<parquet::ceph::ParquetFileReader> m_parquet_reader; + std::vector<column_reader_wrap*> m_column_readers; + s3selectEngine::rgw_s3select_api* m_rgw_s3select_api; + + public: + + parquet_file_parser(std::string parquet_file_name,s3selectEngine::rgw_s3select_api* rgw_api) : + m_parquet_file_name(parquet_file_name), + m_num_of_columms(0), + m_num_of_rows(0), + m_rownum(0), + m_num_row_groups(0), + m_rgw_s3select_api(rgw_api) + + + { + load_meta_data(); + } + + ~parquet_file_parser() + { + for(auto r : m_column_readers) + { + delete r; + } + } + + int 
load_meta_data() + { + m_parquet_reader = parquet::ceph::ParquetFileReader::OpenFile(m_parquet_file_name,m_rgw_s3select_api,false); + m_file_metadata = m_parquet_reader->metadata(); + m_num_of_columms = m_parquet_reader->metadata()->num_columns(); + m_num_row_groups = m_file_metadata->num_row_groups(); + m_num_of_rows = m_file_metadata->num_rows(); + + for (uint32_t i = 0; i < m_num_of_columms; i++) + { + parquet::Type::type tp = m_file_metadata->schema()->Column(i)->physical_type(); + std::pair<std::string, column_reader_wrap::parquet_type> elm; + + switch (tp) + { + case parquet::Type::type::INT32: + elm = std::pair<std::string, column_reader_wrap::parquet_type>(m_file_metadata->schema()->Column(i)->name(), column_reader_wrap::parquet_type::INT32); + m_schm.push_back(elm); + break; + + case parquet::Type::type::INT64: + elm = std::pair<std::string, column_reader_wrap::parquet_type>(m_file_metadata->schema()->Column(i)->name(), column_reader_wrap::parquet_type::INT64); + m_schm.push_back(elm); + break; + + case parquet::Type::type::FLOAT: + elm = std::pair<std::string, column_reader_wrap::parquet_type>(m_file_metadata->schema()->Column(i)->name(), column_reader_wrap::parquet_type::FLOAT); + m_schm.push_back(elm); + break; + + case parquet::Type::type::DOUBLE: + elm = std::pair<std::string, column_reader_wrap::parquet_type>(m_file_metadata->schema()->Column(i)->name(), column_reader_wrap::parquet_type::DOUBLE); + m_schm.push_back(elm); + break; + + case parquet::Type::type::BYTE_ARRAY: + elm = std::pair<std::string, column_reader_wrap::parquet_type>(m_file_metadata->schema()->Column(i)->name(), column_reader_wrap::parquet_type::STRING); + m_schm.push_back(elm); + break; + + default: + { + std::stringstream err; + err << "some parquet type not supported"; + throw std::runtime_error(err.str()); + } + } + + m_column_readers.push_back(new column_reader_wrap(m_parquet_reader,i)); + } + + return 0; + } + + bool end_of_stream() + { + + if (m_rownum > (m_num_of_rows-1)) + 
return true; + return false; + } + + uint64_t get_number_of_rows() + { + return m_num_of_rows; + } + + uint64_t rownum() + { + return m_rownum; + } + + bool increase_rownum() + { + if (end_of_stream()) + return false; + + m_rownum++; + return true; + } + + uint64_t get_rownum() + { + return m_rownum; + } + + uint32_t get_num_of_columns() + { + return m_num_of_columms; + } + + int get_column_values_by_positions(column_pos_t positions, row_values_t &row_values) + { + column_reader_wrap::parquet_value_t column_value; + row_values.clear(); + + for(auto col : positions) + { + if((col)>=m_num_of_columms) + {//TODO should verified upon syntax phase + //TODO throw exception + return -1; + } + auto status = m_column_readers[col]->Read(m_rownum,column_value); + if(status == column_reader_wrap::parquet_column_read_state::PARQUET_OUT_OF_RANGE) return -1; + row_values.push_back(column_value);//TODO intensive (should move) + } + return 0; + } + + schema_t get_schema() + { + return m_schm; + } +}; + +/******************************************/ + + + column_reader_wrap::column_reader_wrap(std::unique_ptr<parquet::ceph::ParquetFileReader> & parquet_reader,uint16_t col_id): + m_rownum(-1), + m_type(parquet::Type::type::UNDEFINED), + m_row_grouop_id(0), + m_col_id(col_id), + m_end_of_stream(false), + m_read_last_value(false) + { + m_parquet_reader = parquet_reader.get(); + m_row_group_reader = m_parquet_reader->RowGroup(m_row_grouop_id); + m_ColumnReader = m_row_group_reader->Column(m_col_id); + } + + parquet::Type::type column_reader_wrap::get_type() + {//TODO if UNDEFINED + return m_parquet_reader->metadata()->schema()->Column(m_col_id)->physical_type(); + } + + bool column_reader_wrap::HasNext()//TODO template + { + parquet::Int32Reader* int32_reader; + parquet::Int64Reader* int64_reader; + parquet::FloatReader* float_reader; + parquet::DoubleReader* double_reader; + parquet::ByteArrayReader* byte_array_reader; + + switch (get_type()) + { + case parquet::Type::type::INT32: + 
int32_reader = static_cast<parquet::Int32Reader *>(m_ColumnReader.get()); + return int32_reader->HasNext(); + break; + + case parquet::Type::type::INT64: + int64_reader = static_cast<parquet::Int64Reader *>(m_ColumnReader.get()); + return int64_reader->HasNext(); + break; + + case parquet::Type::type::FLOAT: + float_reader = static_cast<parquet::FloatReader *>(m_ColumnReader.get()); + return float_reader->HasNext(); + break; + + case parquet::Type::type::DOUBLE: + double_reader = static_cast<parquet::DoubleReader *>(m_ColumnReader.get()); + return double_reader->HasNext(); + break; + + case parquet::Type::type::BYTE_ARRAY: + byte_array_reader = static_cast<parquet::ByteArrayReader *>(m_ColumnReader.get()); + return byte_array_reader->HasNext(); + break; + + default: + + std::stringstream err; + err << "HasNext():" << "wrong type or type not exist" << std::endl; + throw std::runtime_error(err.str()); + + return false; + //TODO throw exception + } + + return false; + } + + int64_t column_reader_wrap::ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, + parquet_value_t* values, int64_t* values_read) + { + parquet::Int32Reader* int32_reader; + parquet::Int64Reader* int64_reader; + parquet::FloatReader* float_reader; + parquet::DoubleReader* double_reader; + parquet::ByteArrayReader* byte_array_reader; + + parquet::ByteArray str_value; + int64_t rows_read; + int32_t i32_val; + + auto error_msg = [&](std::exception &e) + { + std::stringstream err; + err << "what() :" << e.what() << std::endl; + err << "failed to parse column id:" << this->m_col_id << " name:" <<this->m_parquet_reader->metadata()->schema()->Column(m_col_id)->name(); + return err; + }; + int16_t defintion_level; + int16_t repeat_level; + + switch (get_type()) + { + case parquet::Type::type::INT32: + int32_reader = static_cast<parquet::Int32Reader *>(m_ColumnReader.get()); + try { + rows_read = int32_reader->ReadBatch(1, &defintion_level, &repeat_level, &i32_val , values_read); + 
if(defintion_level == 0) + { + values->type = column_reader_wrap::parquet_type::PARQUET_NULL; + } else + { + values->num = i32_val; + values->type = column_reader_wrap::parquet_type::INT32; + } + } + catch(std::exception &e) + { + throw std::runtime_error(error_msg(e).str()); + } + + break; + + case parquet::Type::type::INT64: + int64_reader = static_cast<parquet::Int64Reader *>(m_ColumnReader.get()); + try{ + rows_read = int64_reader->ReadBatch(1, &defintion_level, &repeat_level, (int64_t *)&(values->num), values_read); + if(defintion_level == 0) + { + values->type = column_reader_wrap::parquet_type::PARQUET_NULL; + } else + { + auto logical_type = m_parquet_reader->metadata()->schema()->Column(m_col_id)->logical_type(); + + if (logical_type.get()->type() == parquet::LogicalType::Type::type::TIMESTAMP) //TODO missing sub-type (milli,micro) + values->type = column_reader_wrap::parquet_type::TIMESTAMP; + else + values->type = column_reader_wrap::parquet_type::INT64; + } + } + catch(std::exception &e) + { + throw std::runtime_error(error_msg(e).str()); + } + break; + + case parquet::Type::type::FLOAT: + float_reader = static_cast<parquet::FloatReader *>(m_ColumnReader.get()); + try{ + float data_source_float = 0; + rows_read = float_reader->ReadBatch(1, &defintion_level, &repeat_level, &data_source_float , values_read);//TODO proper cast + if(defintion_level == 0) + { + values->type = column_reader_wrap::parquet_type::PARQUET_NULL; + } else + { + values->type = column_reader_wrap::parquet_type::DOUBLE; + values->dbl = data_source_float; + + } + } + catch(std::exception &e) + { + throw std::runtime_error(error_msg(e).str()); + } + break; + + case parquet::Type::type::DOUBLE: + double_reader = static_cast<parquet::DoubleReader *>(m_ColumnReader.get()); + try{ + rows_read = double_reader->ReadBatch(1, &defintion_level, &repeat_level, (double *)&(values->dbl), values_read); + if(defintion_level == 0) + { + values->type = column_reader_wrap::parquet_type::PARQUET_NULL; + 
} else + { + values->type = column_reader_wrap::parquet_type::DOUBLE; + } + } + catch(std::exception &e) + { + throw std::runtime_error(error_msg(e).str()); + } + break; + + case parquet::Type::type::BYTE_ARRAY: + byte_array_reader = static_cast<parquet::ByteArrayReader *>(m_ColumnReader.get()); + try{ + rows_read = byte_array_reader->ReadBatch(1, &defintion_level, &repeat_level, &str_value , values_read); + if(defintion_level == 0) + { + values->type = column_reader_wrap::parquet_type::PARQUET_NULL; + } else + { + values->type = column_reader_wrap::parquet_type::STRING; + values->str = (char*)str_value.ptr; + values->str_len = str_value.len; + } + } + catch(std::exception &e) + { + throw std::runtime_error(error_msg(e).str()); + } + break; + + default: + { + std::stringstream err; + err << "wrong type" << std::endl; + throw std::runtime_error(err.str()); + } + + } + + return rows_read; + } + + int64_t column_reader_wrap::Skip(int64_t rows_to_skip) + { + parquet::Int32Reader* int32_reader; + parquet::Int64Reader* int64_reader; + parquet::DoubleReader* double_reader; + parquet::FloatReader* float_reader; + parquet::ByteArrayReader* byte_array_reader; + + parquet::ByteArray str_value; + int64_t rows_read; + + auto error_msg = [&](std::exception &e) + { + std::stringstream err; + err << "what() :" << e.what() << std::endl; + err << "failed to parse column id:" << this->m_col_id << " name:" <<this->m_parquet_reader->metadata()->schema()->Column(m_col_id)->name(); + return err; + }; + + switch (get_type()) + { + case parquet::Type::type::INT32: + int32_reader = static_cast<parquet::Int32Reader *>(m_ColumnReader.get()); + try{ + rows_read = int32_reader->Skip(rows_to_skip); + } + catch(std::exception &e) + { + throw std::runtime_error(error_msg(e).str()); + } + break; + + case parquet::Type::type::INT64: + int64_reader = static_cast<parquet::Int64Reader *>(m_ColumnReader.get()); + try{ + rows_read = int64_reader->Skip(rows_to_skip); + } + catch(std::exception &e) + { + 
throw std::runtime_error(error_msg(e).str()); + } + break; + + case parquet::Type::type::FLOAT: + float_reader = static_cast<parquet::FloatReader *>(m_ColumnReader.get()); + try { + rows_read = float_reader->Skip(rows_to_skip); + } + catch(std::exception &e) + { + throw std::runtime_error(error_msg(e).str()); + } + break; + + case parquet::Type::type::DOUBLE: + double_reader = static_cast<parquet::DoubleReader *>(m_ColumnReader.get()); + try { + rows_read = double_reader->Skip(rows_to_skip); + } + catch(std::exception &e) + { + throw std::runtime_error(error_msg(e).str()); + } + break; + + case parquet::Type::type::BYTE_ARRAY: + byte_array_reader = static_cast<parquet::ByteArrayReader *>(m_ColumnReader.get()); + try{ + rows_read = byte_array_reader->Skip(rows_to_skip); + } + catch(std::exception &e) + { + throw std::runtime_error(error_msg(e).str()); + } + break; + + default: + { + std::stringstream err; + err << "wrong type" << std::endl; + throw std::runtime_error(err.str()); + } + } + + return rows_read; + } + + + column_reader_wrap::parquet_column_read_state column_reader_wrap::Read(const uint64_t rownum,parquet_value_t & value) + { + int64_t values_read = 0; + + if (m_rownum < (int64_t)rownum) + { //should skip + m_read_last_value = false; + + //TODO what about Skip(0) + uint64_t skipped_rows = Skip(rownum - m_rownum -1); + m_rownum += skipped_rows; + + while (((m_rownum+1) < (int64_t)rownum) || HasNext() == false) + { + uint64_t skipped_rows = Skip(rownum - m_rownum -1); + m_rownum += skipped_rows; + + if (HasNext() == false) + { + if ((m_row_grouop_id + 1) >= m_parquet_reader->metadata()->num_row_groups()) + { + m_end_of_stream = true; + return column_reader_wrap::parquet_column_read_state::PARQUET_OUT_OF_RANGE;//end-of-stream + } + else + { + m_row_grouop_id++; + m_row_group_reader = m_parquet_reader->RowGroup(m_row_grouop_id); + m_ColumnReader = m_row_group_reader->Column(m_col_id); + } + } + } //end-while + + ReadBatch(1, nullptr, nullptr, &m_last_value, 
&values_read); + m_read_last_value = true; + m_rownum++; + value = m_last_value; + } + else + { + if (m_read_last_value == false) + { + ReadBatch(1, nullptr, nullptr, &m_last_value, &values_read); + m_read_last_value = true; + m_rownum++; + } + + value = m_last_value; + } + + return column_reader_wrap::parquet_column_read_state::PARQUET_READ_OK; + } + +#endif + |