summaryrefslogtreecommitdiffstats
path: root/src/arrow/cpp/src/gandiva/regex_util.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/cpp/src/gandiva/regex_util.cc')
-rw-r--r--src/arrow/cpp/src/gandiva/regex_util.cc63
1 files changed, 63 insertions, 0 deletions
diff --git a/src/arrow/cpp/src/gandiva/regex_util.cc b/src/arrow/cpp/src/gandiva/regex_util.cc
new file mode 100644
index 000000000..abdd579d1
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/regex_util.cc
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/regex_util.h"
+
+namespace gandiva {
+
+const std::set<char> RegexUtil::pcre_regex_specials_ = {
+ '[', ']', '(', ')', '|', '^', '-', '+', '*', '?', '{', '}', '$', '\\', '.'};
+
+Status RegexUtil::SqlLikePatternToPcre(const std::string& sql_pattern, char escape_char,
+ std::string& pcre_pattern) {
+ /// Characters that are considered special by pcre regex. These needs to be
+ /// escaped with '\\'.
+ pcre_pattern.clear();
+ for (size_t idx = 0; idx < sql_pattern.size(); ++idx) {
+ auto cur = sql_pattern.at(idx);
+
+ // Escape any char that is special for pcre regex
+ if (pcre_regex_specials_.find(cur) != pcre_regex_specials_.end()) {
+ pcre_pattern += "\\";
+ }
+
+ if (cur == escape_char) {
+ // escape char must be followed by '_', '%' or the escape char itself.
+ ++idx;
+ ARROW_RETURN_IF(
+ idx == sql_pattern.size(),
+ Status::Invalid("Unexpected escape char at the end of pattern ", sql_pattern));
+
+ cur = sql_pattern.at(idx);
+ if (cur == '_' || cur == '%' || cur == escape_char) {
+ pcre_pattern += cur;
+ } else {
+ return Status::Invalid("Invalid escape sequence in pattern ", sql_pattern,
+ " at offset ", idx);
+ }
+ } else if (cur == '_') {
+ pcre_pattern += '.';
+ } else if (cur == '%') {
+ pcre_pattern += ".*";
+ } else {
+ pcre_pattern += cur;
+ }
+ }
+ return Status::OK();
+}
+
+} // namespace gandiva