diff options
Diffstat (limited to 'src/arrow/cpp/src/gandiva/date_utils.cc')
-rw-r--r-- | src/arrow/cpp/src/gandiva/date_utils.cc | 232 |
1 files changed, 232 insertions, 0 deletions
diff --git a/src/arrow/cpp/src/gandiva/date_utils.cc b/src/arrow/cpp/src/gandiva/date_utils.cc new file mode 100644 index 000000000..f0a80d3c9 --- /dev/null +++ b/src/arrow/cpp/src/gandiva/date_utils.cc @@ -0,0 +1,232 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <algorithm> +#include <cstdint> +#include <memory> +#include <sstream> +#include <vector> + +#include "gandiva/date_utils.h" + +namespace gandiva { + +std::vector<std::string> DateUtils::GetMatches(std::string pattern, bool exactMatch) { + // we are case insensitive + std::transform(pattern.begin(), pattern.end(), pattern.begin(), ::tolower); + std::vector<std::string> matches; + + for (const auto& it : sql_date_format_to_boost_map_) { + if (it.first.find(pattern) != std::string::npos && + (!exactMatch || (it.first.length() == pattern.length()))) { + matches.push_back(it.first); + } + } + + return matches; +} + +std::vector<std::string> DateUtils::GetPotentialMatches(const std::string& pattern) { + return GetMatches(pattern, false); +} + +std::vector<std::string> DateUtils::GetExactMatches(const std::string& pattern) { + return GetMatches(pattern, true); +} + +/** + * Validates and converts format to the strptime equivalent + * + */ +Status DateUtils::ToInternalFormat(const std::string& format, + std::shared_ptr<std::string>* internal_format) { + std::stringstream builder; + std::stringstream buffer; + bool is_in_quoted_text = false; + + for (size_t i = 0; i < format.size(); i++) { + char currentChar = format[i]; + + // logic before we append to the buffer + if (currentChar == '"') { + if (is_in_quoted_text) { + // we are done with a quoted block + is_in_quoted_text = false; + + // use ' for quoting + builder << '\''; + builder << buffer.str(); + builder << '\''; + + // clear buffer + buffer.str(""); + continue; + } else { + ARROW_RETURN_IF(buffer.str().length() > 0, + Status::Invalid("Invalid date format string '", format, "'")); + + is_in_quoted_text = true; + continue; + } + } + + // handle special characters we want to simply pass through, but only if not in quoted + // and the buffer is empty + std::string special_characters = "*-/,.;: "; + if (!is_in_quoted_text && buffer.str().length() == 0 && + (special_characters.find_first_of(currentChar) != std::string::npos)) { + builder << currentChar; + continue; + } + + // append to the buffer + buffer << currentChar; + + // nothing else to do if we are in quoted text + if (is_in_quoted_text) { + continue; + } + + // check how many matches we have for our buffer + std::vector<std::string> potentialList = GetPotentialMatches(buffer.str()); + int64_t potentialCount = potentialList.size(); + + if (potentialCount >= 1) { + // one potential and the length match + if (potentialCount == 1 && potentialList[0].length() == buffer.str().length()) { + // we have a match! + builder << sql_date_format_to_boost_map_[potentialList[0]]; + buffer.str(""); + } else { + // Some patterns (like MON, MONTH) can cause ambiguity, such as "MON:". "MON" + // will have two potential matches, but "MON:" will match nothing, so we want to + // look ahead when we match "MON" and check if adding the next char leads to 0 + // potentials. If it does, we go ahead and treat the buffer as matched (if a + // potential match exists that matches the buffer) + if (format.length() - 1 > i) { + std::string lookAheadPattern = (buffer.str() + format.at(i + 1)); + std::transform(lookAheadPattern.begin(), lookAheadPattern.end(), + lookAheadPattern.begin(), ::tolower); + bool lookAheadMatched = false; + + // we can query potentialList to see if it has anything that matches the + // lookahead pattern + for (std::string potential : potentialList) { + if (potential.find(lookAheadPattern) != std::string::npos) { + lookAheadMatched = true; + break; + } + } + + if (!lookAheadMatched) { + // check if any of the potential matches are the same length as our buffer, we + // do not want to match "MO:" + bool matched = false; + for (std::string potential : potentialList) { + if (potential.length() == buffer.str().length()) { + matched = true; + break; + } + } + + if (matched) { + std::string match = buffer.str(); + std::transform(match.begin(), match.end(), match.begin(), ::tolower); + builder << sql_date_format_to_boost_map_[match]; + buffer.str(""); + continue; + } + } + } + } + } else { + return Status::Invalid("Invalid date format string '", format, "'"); + } + } + + if (buffer.str().length() > 0) { + // Some patterns (like MON, MONTH) can cause us to reach this point with a valid + // buffer value as MON has 2 valid potential matches, so double check here + std::vector<std::string> exactMatches = GetExactMatches(buffer.str()); + if (exactMatches.size() == 1 && exactMatches[0].length() == buffer.str().length()) { + builder << sql_date_format_to_boost_map_[exactMatches[0]]; + } else { + // Format partially parsed + int64_t pos = format.length() - buffer.str().length(); + return Status::Invalid("Invalid date format string '", format, "' at position ", + pos); + } + } + std::string final_pattern = builder.str(); + internal_format->reset(new std::string(final_pattern)); + return Status::OK(); +} + +DateUtils::date_format_converter DateUtils::sql_date_format_to_boost_map_ = InitMap(); + +DateUtils::date_format_converter DateUtils::InitMap() { + date_format_converter map; + + // Era + map["ad"] = "%EC"; + map["bc"] = "%EC"; + // Meridian + map["am"] = "%p"; + map["pm"] = "%p"; + // Century + map["cc"] = "%C"; + // Week of year + map["ww"] = "%W"; + // Day of week + map["d"] = "%u"; + // Day name of week + map["dy"] = "%a"; + map["day"] = "%a"; + // Year + map["yyyy"] = "%Y"; + map["yy"] = "%y"; + // Day of year + map["ddd"] = "%j"; + // Month + map["mm"] = "%m"; + map["mon"] = "%b"; + map["month"] = "%b"; + // Day of month + map["dd"] = "%d"; + // Hour of day + map["hh"] = "%I"; + map["hh12"] = "%I"; + map["hh24"] = "%H"; + // Minutes + map["mi"] = "%M"; + // Seconds + map["ss"] = "%S"; + // Milliseconds + map["f"] = "S"; + map["ff"] = "SS"; + map["fff"] = "SSS"; + /* + // Timezone not tested/supported yet fully. + map["tzd"] = "%Z"; + map["tzo"] = "%z"; + map["tzh:tzm"] = "%z"; + */ + + return map; +} + +} // namespace gandiva |