summaryrefslogtreecommitdiffstats
path: root/src/arrow/cpp/src/gandiva/date_utils.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/cpp/src/gandiva/date_utils.cc')
-rw-r--r--src/arrow/cpp/src/gandiva/date_utils.cc232
1 files changed, 232 insertions, 0 deletions
diff --git a/src/arrow/cpp/src/gandiva/date_utils.cc b/src/arrow/cpp/src/gandiva/date_utils.cc
new file mode 100644
index 000000000..f0a80d3c9
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/date_utils.cc
@@ -0,0 +1,232 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <vector>
+
+#include "gandiva/date_utils.h"
+
+namespace gandiva {
+
+/// Collects the keys of sql_date_format_to_boost_map_ that contain `pattern`.
+///
+/// The comparison is case-insensitive with respect to `pattern`: it is
+/// lower-cased before being searched for in the keys.
+///
+/// \param pattern    text to look for; taken by value so it can be lower-cased
+/// \param exactMatch when true, only keys whose length equals that of `pattern`
+///                   (i.e. keys equal to the lower-cased pattern) are returned;
+///                   when false, every key containing `pattern` as a substring
+///                   is returned
+/// \return the matching keys, in the iteration order of the map
+std::vector<std::string> DateUtils::GetMatches(std::string pattern, bool exactMatch) {
+  // Lower-case through unsigned char: handing a negative char value to
+  // tolower() is undefined behaviour, which plain `char` can produce for
+  // non-ASCII bytes.
+  std::transform(pattern.begin(), pattern.end(), pattern.begin(),
+                 [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+
+  std::vector<std::string> matches;
+  for (const auto& entry : sql_date_format_to_boost_map_) {
+    const auto& key = entry.first;
+    if (key.find(pattern) == std::string::npos) {
+      continue;
+    }
+    // A key that contains the pattern and has the same length is the pattern.
+    if (exactMatch && key.length() != pattern.length()) {
+      continue;
+    }
+    matches.push_back(key);
+  }
+
+  return matches;
+}
+
+/// Returns every key that GetMatches() reports for `pattern` when exact
+/// matching is switched off.
+std::vector<std::string> DateUtils::GetPotentialMatches(const std::string& pattern) {
+  const bool exact_only = false;
+  return GetMatches(pattern, exact_only);
+}
+
+/// Returns the keys that GetMatches() reports for `pattern` when exact
+/// matching is switched on.
+std::vector<std::string> DateUtils::GetExactMatches(const std::string& pattern) {
+  const bool exact_only = true;
+  return GetMatches(pattern, exact_only);
+}
+
+/**
+ * Validates a date format string and converts it to the internal
+ * (strptime-style) pattern.
+ *
+ * The format is scanned left to right:
+ *  - text enclosed in double quotes is copied to the output wrapped in single
+ *    quotes, without being interpreted;
+ *  - the characters "*-/,.;: " are copied through verbatim when they appear
+ *    between tokens;
+ *  - everything else must form tokens that are keys of
+ *    sql_date_format_to_boost_map_ (matched case-insensitively); each token is
+ *    replaced by its mapped value.
+ *
+ * @param format           the format string to convert
+ * @param internal_format  output; on success it is reset to a newly allocated
+ *                         string holding the converted pattern. It is
+ *                         dereferenced unconditionally, so it must not be null.
+ * @return Status::OK() on success; Status::Invalid if the format contains an
+ *         unknown token, a quote that opens in the middle of a token, an
+ *         unterminated quoted section, or a trailing partial token.
+ */
+Status DateUtils::ToInternalFormat(const std::string& format,
+                                   std::shared_ptr<std::string>* internal_format) {
+  // Characters passed through untouched when no token is being accumulated.
+  const std::string special_characters = "*-/,.;: ";
+
+  // Lower-cases a copy of the text. The cast to unsigned char avoids the
+  // undefined behaviour of calling tolower() with a negative char value.
+  const auto to_lower = [](std::string text) {
+    std::transform(text.begin(), text.end(), text.begin(),
+                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+    return text;
+  };
+
+  std::string builder;  // converted pattern built so far
+  std::string buffer;   // token (or quoted text) currently being accumulated
+  bool is_in_quoted_text = false;
+
+  for (size_t i = 0; i < format.size(); i++) {
+    const char current_char = format[i];
+
+    if (current_char == '"') {
+      if (is_in_quoted_text) {
+        // End of a quoted block: emit it using ' as the quoting character.
+        is_in_quoted_text = false;
+        builder += '\'';
+        builder += buffer;
+        builder += '\'';
+        buffer.clear();
+      } else {
+        // A quote may not start in the middle of an unfinished token.
+        ARROW_RETURN_IF(!buffer.empty(),
+                        Status::Invalid("Invalid date format string '", format, "'"));
+        is_in_quoted_text = true;
+      }
+      continue;
+    }
+
+    // Inside quotes every character is collected verbatim.
+    if (is_in_quoted_text) {
+      buffer += current_char;
+      continue;
+    }
+
+    // Separators are passed through, but only between tokens.
+    if (buffer.empty() && special_characters.find(current_char) != std::string::npos) {
+      builder += current_char;
+      continue;
+    }
+
+    buffer += current_char;
+
+    const std::vector<std::string> potentials = GetPotentialMatches(buffer);
+    if (potentials.empty()) {
+      return Status::Invalid("Invalid date format string '", format, "'");
+    }
+
+    // Exactly one candidate and it is as long as the buffer: unambiguous match.
+    if (potentials.size() == 1 && potentials[0].length() == buffer.length()) {
+      builder += sql_date_format_to_boost_map_[potentials[0]];
+      buffer.clear();
+      continue;
+    }
+
+    // Some patterns (like MON, MONTH) are ambiguous: "MON" has two candidates
+    // but "MON:" has none. Look one character ahead; if extending the buffer
+    // would leave no candidate and the buffer itself is a known token, accept
+    // the buffer now. Without a next character the decision is deferred to the
+    // check after the loop.
+    if (i + 1 >= format.size()) {
+      continue;
+    }
+
+    const std::string look_ahead = to_lower(buffer + format[i + 1]);
+    const bool look_ahead_matched =
+        std::any_of(potentials.begin(), potentials.end(),
+                    [&look_ahead](const std::string& candidate) {
+                      return candidate.find(look_ahead) != std::string::npos;
+                    });
+    if (look_ahead_matched) {
+      continue;
+    }
+
+    // Only accept the buffer if it is a complete token; a partial one such as
+    // "MO" in "MO:" must not match. Every candidate contains the lower-cased
+    // buffer, so a candidate of equal length is equal to it.
+    const std::string token = to_lower(buffer);
+    const bool is_complete_token =
+        std::find(potentials.begin(), potentials.end(), token) != potentials.end();
+    if (is_complete_token) {
+      builder += sql_date_format_to_boost_map_[token];
+      buffer.clear();
+    }
+  }
+
+  // A quoted section that is never closed would otherwise leave its text in
+  // the buffer and have it re-interpreted as a format token below.
+  if (is_in_quoted_text) {
+    return Status::Invalid("Invalid date format string '", format,
+                           "': unterminated quoted text");
+  }
+
+  if (!buffer.empty()) {
+    // Ambiguous patterns (like MON vs MONTH) can reach the end of the input
+    // with a valid token still pending, so check for an exact match here.
+    const std::vector<std::string> exact_matches = GetExactMatches(buffer);
+    if (exact_matches.size() == 1 && exact_matches[0].length() == buffer.length()) {
+      builder += sql_date_format_to_boost_map_[exact_matches[0]];
+    } else {
+      // Format only partially parsed; report where the leftover text starts.
+      const int64_t pos = static_cast<int64_t>(format.length() - buffer.length());
+      return Status::Invalid("Invalid date format string '", format, "' at position ",
+                             pos);
+    }
+  }
+
+  *internal_format = std::make_shared<std::string>(builder);
+  return Status::OK();
+}
+
+// Lookup table shared by all DateUtils functions; initialized from InitMap().
+DateUtils::date_format_converter DateUtils::sql_date_format_to_boost_map_ = InitMap();
+
+/// Builds the table that maps each supported (lower-case) format token to the
+/// text that replaces it in the converted pattern.
+DateUtils::date_format_converter DateUtils::InitMap() {
+  // {token, replacement} rows, grouped by the date/time field they describe.
+  const char* const token_table[][2] = {
+      // Era
+      {"ad", "%EC"},
+      {"bc", "%EC"},
+      // Meridian
+      {"am", "%p"},
+      {"pm", "%p"},
+      // Century
+      {"cc", "%C"},
+      // Week of year
+      {"ww", "%W"},
+      // Day of week
+      {"d", "%u"},
+      // Day name of week
+      {"dy", "%a"},
+      {"day", "%a"},
+      // Year
+      {"yyyy", "%Y"},
+      {"yy", "%y"},
+      // Day of year
+      {"ddd", "%j"},
+      // Month
+      {"mm", "%m"},
+      {"mon", "%b"},
+      {"month", "%b"},
+      // Day of month
+      {"dd", "%d"},
+      // Hour of day
+      {"hh", "%I"},
+      {"hh12", "%I"},
+      {"hh24", "%H"},
+      // Minutes
+      {"mi", "%M"},
+      // Seconds
+      {"ss", "%S"},
+      // Milliseconds
+      {"f", "S"},
+      {"ff", "SS"},
+      {"fff", "SSS"},
+      /*
+      // Timezone not tested/supported yet fully.
+      {"tzd", "%Z"},
+      {"tzo", "%z"},
+      {"tzh:tzm", "%z"},
+      */
+  };
+
+  date_format_converter converter;
+  for (const auto& row : token_table) {
+    converter[row[0]] = row[1];
+  }
+
+  return converter;
+}
+
+} // namespace gandiva