summaryrefslogtreecommitdiffstats
path: root/src/arrow/cpp/src/gandiva
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/cpp/src/gandiva')
-rw-r--r--src/arrow/cpp/src/gandiva/CMakeLists.txt253
-rw-r--r--src/arrow/cpp/src/gandiva/GandivaConfig.cmake.in36
-rw-r--r--src/arrow/cpp/src/gandiva/annotator.cc118
-rw-r--r--src/arrow/cpp/src/gandiva/annotator.h81
-rw-r--r--src/arrow/cpp/src/gandiva/annotator_test.cc102
-rw-r--r--src/arrow/cpp/src/gandiva/arrow.h57
-rw-r--r--src/arrow/cpp/src/gandiva/basic_decimal_scalar.h65
-rw-r--r--src/arrow/cpp/src/gandiva/bitmap_accumulator.cc75
-rw-r--r--src/arrow/cpp/src/gandiva/bitmap_accumulator.h79
-rw-r--r--src/arrow/cpp/src/gandiva/bitmap_accumulator_test.cc112
-rw-r--r--src/arrow/cpp/src/gandiva/cache.cc45
-rw-r--r--src/arrow/cpp/src/gandiva/cache.h60
-rw-r--r--src/arrow/cpp/src/gandiva/cast_time.cc85
-rw-r--r--src/arrow/cpp/src/gandiva/compiled_expr.h71
-rw-r--r--src/arrow/cpp/src/gandiva/condition.h37
-rw-r--r--src/arrow/cpp/src/gandiva/configuration.cc43
-rw-r--r--src/arrow/cpp/src/gandiva/configuration.h84
-rw-r--r--src/arrow/cpp/src/gandiva/context_helper.cc76
-rw-r--r--src/arrow/cpp/src/gandiva/date_utils.cc232
-rw-r--r--src/arrow/cpp/src/gandiva/date_utils.h52
-rw-r--r--src/arrow/cpp/src/gandiva/decimal_ir.cc559
-rw-r--r--src/arrow/cpp/src/gandiva/decimal_ir.h188
-rw-r--r--src/arrow/cpp/src/gandiva/decimal_scalar.h76
-rw-r--r--src/arrow/cpp/src/gandiva/decimal_type_util.cc75
-rw-r--r--src/arrow/cpp/src/gandiva/decimal_type_util.h83
-rw-r--r--src/arrow/cpp/src/gandiva/decimal_type_util_test.cc58
-rw-r--r--src/arrow/cpp/src/gandiva/decimal_xlarge.cc284
-rw-r--r--src/arrow/cpp/src/gandiva/decimal_xlarge.h41
-rw-r--r--src/arrow/cpp/src/gandiva/dex.h396
-rw-r--r--src/arrow/cpp/src/gandiva/dex_visitor.h97
-rw-r--r--src/arrow/cpp/src/gandiva/engine.cc338
-rw-r--r--src/arrow/cpp/src/gandiva/engine.h104
-rw-r--r--src/arrow/cpp/src/gandiva/engine_llvm_test.cc131
-rw-r--r--src/arrow/cpp/src/gandiva/eval_batch.h107
-rw-r--r--src/arrow/cpp/src/gandiva/execution_context.h54
-rw-r--r--src/arrow/cpp/src/gandiva/exported_funcs.h59
-rw-r--r--src/arrow/cpp/src/gandiva/exported_funcs_registry.cc30
-rw-r--r--src/arrow/cpp/src/gandiva/exported_funcs_registry.h54
-rw-r--r--src/arrow/cpp/src/gandiva/expr_decomposer.cc310
-rw-r--r--src/arrow/cpp/src/gandiva/expr_decomposer.h128
-rw-r--r--src/arrow/cpp/src/gandiva/expr_decomposer_test.cc409
-rw-r--r--src/arrow/cpp/src/gandiva/expr_validator.cc193
-rw-r--r--src/arrow/cpp/src/gandiva/expr_validator.h80
-rw-r--r--src/arrow/cpp/src/gandiva/expression.cc25
-rw-r--r--src/arrow/cpp/src/gandiva/expression.h46
-rw-r--r--src/arrow/cpp/src/gandiva/expression_registry.cc187
-rw-r--r--src/arrow/cpp/src/gandiva/expression_registry.h71
-rw-r--r--src/arrow/cpp/src/gandiva/expression_registry_test.cc68
-rw-r--r--src/arrow/cpp/src/gandiva/field_descriptor.h69
-rw-r--r--src/arrow/cpp/src/gandiva/filter.cc171
-rw-r--r--src/arrow/cpp/src/gandiva/filter.h112
-rw-r--r--src/arrow/cpp/src/gandiva/formatting_utils.h69
-rw-r--r--src/arrow/cpp/src/gandiva/func_descriptor.h50
-rw-r--r--src/arrow/cpp/src/gandiva/function_holder.h34
-rw-r--r--src/arrow/cpp/src/gandiva/function_holder_registry.h76
-rw-r--r--src/arrow/cpp/src/gandiva/function_ir_builder.cc81
-rw-r--r--src/arrow/cpp/src/gandiva/function_ir_builder.h61
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry.cc83
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry.h47
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry_arithmetic.cc125
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry_arithmetic.h27
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry_common.h268
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry_datetime.cc132
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry_datetime.h27
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry_hash.cc63
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry_hash.h27
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry_math_ops.cc106
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry_math_ops.h27
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry_string.cc422
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry_string.h27
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry_test.cc96
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry_timestamp_arithmetic.cc89
-rw-r--r--src/arrow/cpp/src/gandiva/function_registry_timestamp_arithmetic.h27
-rw-r--r--src/arrow/cpp/src/gandiva/function_signature.cc113
-rw-r--r--src/arrow/cpp/src/gandiva/function_signature.h55
-rw-r--r--src/arrow/cpp/src/gandiva/function_signature_test.cc113
-rw-r--r--src/arrow/cpp/src/gandiva/gandiva.pc.in27
-rw-r--r--src/arrow/cpp/src/gandiva/gandiva_aliases.h62
-rw-r--r--src/arrow/cpp/src/gandiva/gdv_function_stubs.cc1603
-rw-r--r--src/arrow/cpp/src/gandiva/gdv_function_stubs.h173
-rw-r--r--src/arrow/cpp/src/gandiva/gdv_function_stubs_test.cc769
-rw-r--r--src/arrow/cpp/src/gandiva/greedy_dual_size_cache.h154
-rw-r--r--src/arrow/cpp/src/gandiva/greedy_dual_size_cache_test.cc88
-rw-r--r--src/arrow/cpp/src/gandiva/hash_utils.cc134
-rw-r--r--src/arrow/cpp/src/gandiva/hash_utils.h44
-rw-r--r--src/arrow/cpp/src/gandiva/hash_utils_test.cc164
-rw-r--r--src/arrow/cpp/src/gandiva/in_holder.h91
-rw-r--r--src/arrow/cpp/src/gandiva/jni/CMakeLists.txt107
-rw-r--r--src/arrow/cpp/src/gandiva/jni/config_builder.cc53
-rw-r--r--src/arrow/cpp/src/gandiva/jni/config_holder.cc30
-rw-r--r--src/arrow/cpp/src/gandiva/jni/config_holder.h68
-rw-r--r--src/arrow/cpp/src/gandiva/jni/env_helper.h23
-rw-r--r--src/arrow/cpp/src/gandiva/jni/expression_registry_helper.cc190
-rw-r--r--src/arrow/cpp/src/gandiva/jni/id_to_module_map.h66
-rw-r--r--src/arrow/cpp/src/gandiva/jni/jni_common.cc1055
-rw-r--r--src/arrow/cpp/src/gandiva/jni/module_holder.h59
-rw-r--r--src/arrow/cpp/src/gandiva/jni/symbols.map20
-rw-r--r--src/arrow/cpp/src/gandiva/like_holder.cc156
-rw-r--r--src/arrow/cpp/src/gandiva/like_holder.h68
-rw-r--r--src/arrow/cpp/src/gandiva/like_holder_test.cc281
-rw-r--r--src/arrow/cpp/src/gandiva/literal_holder.cc45
-rw-r--r--src/arrow/cpp/src/gandiva/literal_holder.h36
-rw-r--r--src/arrow/cpp/src/gandiva/llvm_generator.cc1400
-rw-r--r--src/arrow/cpp/src/gandiva/llvm_generator.h253
-rw-r--r--src/arrow/cpp/src/gandiva/llvm_generator_test.cc116
-rw-r--r--src/arrow/cpp/src/gandiva/llvm_includes.h56
-rw-r--r--src/arrow/cpp/src/gandiva/llvm_types.cc48
-rw-r--r--src/arrow/cpp/src/gandiva/llvm_types.h130
-rw-r--r--src/arrow/cpp/src/gandiva/llvm_types_test.cc61
-rw-r--r--src/arrow/cpp/src/gandiva/local_bitmaps_holder.h85
-rw-r--r--src/arrow/cpp/src/gandiva/lvalue.h77
-rw-r--r--src/arrow/cpp/src/gandiva/make_precompiled_bitcode.py49
-rw-r--r--src/arrow/cpp/src/gandiva/native_function.h81
-rw-r--r--src/arrow/cpp/src/gandiva/node.h299
-rw-r--r--src/arrow/cpp/src/gandiva/node_visitor.h56
-rw-r--r--src/arrow/cpp/src/gandiva/pch.h24
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/CMakeLists.txt142
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/arithmetic_ops.cc274
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/arithmetic_ops_test.cc180
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/bitmap.cc60
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/bitmap_test.cc62
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/decimal_ops.cc723
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/decimal_ops.h90
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/decimal_ops_test.cc1095
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/decimal_wrapper.cc433
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/epoch_time_point.h118
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/epoch_time_point_test.cc103
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/extended_math_ops.cc410
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/extended_math_ops_test.cc349
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/hash.cc407
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/hash_test.cc122
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/print.cc28
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/string_ops.cc2198
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/string_ops_test.cc1758
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/testing.h43
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/time.cc894
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/time_constants.h30
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/time_fields.h35
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/time_test.cc953
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/timestamp_arithmetic.cc283
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled/types.h592
-rw-r--r--src/arrow/cpp/src/gandiva/precompiled_bitcode.cc.in26
-rw-r--r--src/arrow/cpp/src/gandiva/projector.cc369
-rw-r--r--src/arrow/cpp/src/gandiva/projector.h143
-rw-r--r--src/arrow/cpp/src/gandiva/proto/Types.proto255
-rw-r--r--src/arrow/cpp/src/gandiva/random_generator_holder.cc45
-rw-r--r--src/arrow/cpp/src/gandiva/random_generator_holder.h57
-rw-r--r--src/arrow/cpp/src/gandiva/random_generator_holder_test.cc103
-rw-r--r--src/arrow/cpp/src/gandiva/regex_util.cc63
-rw-r--r--src/arrow/cpp/src/gandiva/regex_util.h45
-rw-r--r--src/arrow/cpp/src/gandiva/replace_holder.cc65
-rw-r--r--src/arrow/cpp/src/gandiva/replace_holder.h97
-rw-r--r--src/arrow/cpp/src/gandiva/replace_holder_test.cc129
-rw-r--r--src/arrow/cpp/src/gandiva/selection_vector.cc179
-rw-r--r--src/arrow/cpp/src/gandiva/selection_vector.h151
-rw-r--r--src/arrow/cpp/src/gandiva/selection_vector_impl.h108
-rw-r--r--src/arrow/cpp/src/gandiva/selection_vector_test.cc270
-rw-r--r--src/arrow/cpp/src/gandiva/simple_arena.h160
-rw-r--r--src/arrow/cpp/src/gandiva/simple_arena_test.cc102
-rw-r--r--src/arrow/cpp/src/gandiva/symbols.map35
-rw-r--r--src/arrow/cpp/src/gandiva/tests/CMakeLists.txt42
-rw-r--r--src/arrow/cpp/src/gandiva/tests/binary_test.cc136
-rw-r--r--src/arrow/cpp/src/gandiva/tests/boolean_expr_test.cc388
-rw-r--r--src/arrow/cpp/src/gandiva/tests/date_time_test.cc602
-rw-r--r--src/arrow/cpp/src/gandiva/tests/decimal_single_test.cc305
-rw-r--r--src/arrow/cpp/src/gandiva/tests/decimal_test.cc1194
-rw-r--r--src/arrow/cpp/src/gandiva/tests/filter_project_test.cc276
-rw-r--r--src/arrow/cpp/src/gandiva/tests/filter_test.cc340
-rw-r--r--src/arrow/cpp/src/gandiva/tests/generate_data.h152
-rw-r--r--src/arrow/cpp/src/gandiva/tests/hash_test.cc615
-rw-r--r--src/arrow/cpp/src/gandiva/tests/huge_table_test.cc157
-rw-r--r--src/arrow/cpp/src/gandiva/tests/if_expr_test.cc378
-rw-r--r--src/arrow/cpp/src/gandiva/tests/in_expr_test.cc278
-rw-r--r--src/arrow/cpp/src/gandiva/tests/literal_test.cc232
-rw-r--r--src/arrow/cpp/src/gandiva/tests/micro_benchmarks.cc456
-rw-r--r--src/arrow/cpp/src/gandiva/tests/null_validity_test.cc175
-rw-r--r--src/arrow/cpp/src/gandiva/tests/projector_build_validation_test.cc287
-rw-r--r--src/arrow/cpp/src/gandiva/tests/projector_test.cc1609
-rw-r--r--src/arrow/cpp/src/gandiva/tests/test_util.h103
-rw-r--r--src/arrow/cpp/src/gandiva/tests/timed_evaluate.h136
-rw-r--r--src/arrow/cpp/src/gandiva/tests/to_string_test.cc88
-rw-r--r--src/arrow/cpp/src/gandiva/tests/utf8_test.cc751
-rw-r--r--src/arrow/cpp/src/gandiva/to_date_holder.cc116
-rw-r--r--src/arrow/cpp/src/gandiva/to_date_holder.h58
-rw-r--r--src/arrow/cpp/src/gandiva/to_date_holder_test.cc152
-rw-r--r--src/arrow/cpp/src/gandiva/tree_expr_builder.cc223
-rw-r--r--src/arrow/cpp/src/gandiva/tree_expr_builder.h139
-rw-r--r--src/arrow/cpp/src/gandiva/tree_expr_test.cc159
-rw-r--r--src/arrow/cpp/src/gandiva/value_validity_pair.h48
-rw-r--r--src/arrow/cpp/src/gandiva/visibility.h48
190 files changed, 40339 insertions, 0 deletions
diff --git a/src/arrow/cpp/src/gandiva/CMakeLists.txt b/src/arrow/cpp/src/gandiva/CMakeLists.txt
new file mode 100644
index 000000000..654a4a40b
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/CMakeLists.txt
@@ -0,0 +1,253 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set(GANDIVA_VERSION "${ARROW_VERSION}")
+
+# For "make gandiva" to build everything Gandiva-related
+add_custom_target(gandiva-all)
+add_custom_target(gandiva)
+add_custom_target(gandiva-tests)
+add_custom_target(gandiva-benchmarks)
+
+add_dependencies(gandiva-all gandiva gandiva-tests gandiva-benchmarks)
+
+find_package(LLVMAlt REQUIRED)
+
+if(LLVM_VERSION_MAJOR LESS "10")
+ set(GANDIVA_CXX_STANDARD ${CMAKE_CXX_STANDARD})
+else()
+ # LLVM 10 or later requires C++ 14
+ if(CMAKE_CXX_STANDARD LESS 14)
+ set(GANDIVA_CXX_STANDARD 14)
+ else()
+ set(GANDIVA_CXX_STANDARD ${CMAKE_CXX_STANDARD})
+ endif()
+endif()
+
+add_definitions(-DGANDIVA_LLVM_VERSION=${LLVM_VERSION_MAJOR})
+
+find_package(OpenSSLAlt REQUIRED)
+
+# Set the path where the bitcode file is generated; see precompiled/CMakeLists.txt
+set(GANDIVA_PRECOMPILED_BC_PATH "${CMAKE_CURRENT_BINARY_DIR}/irhelpers.bc")
+set(GANDIVA_PRECOMPILED_CC_PATH "${CMAKE_CURRENT_BINARY_DIR}/precompiled_bitcode.cc")
+set(GANDIVA_PRECOMPILED_CC_IN_PATH
+ "${CMAKE_CURRENT_SOURCE_DIR}/precompiled_bitcode.cc.in")
+
+# add_arrow_lib will look for this not yet existing file, so flag as generated
+set_source_files_properties(${GANDIVA_PRECOMPILED_CC_PATH} PROPERTIES GENERATED TRUE)
+
+set(SRC_FILES
+ annotator.cc
+ bitmap_accumulator.cc
+ cache.cc
+ cast_time.cc
+ configuration.cc
+ context_helper.cc
+ decimal_ir.cc
+ decimal_type_util.cc
+ decimal_xlarge.cc
+ engine.cc
+ date_utils.cc
+ expr_decomposer.cc
+ expr_validator.cc
+ expression.cc
+ expression_registry.cc
+ exported_funcs_registry.cc
+ filter.cc
+ function_ir_builder.cc
+ function_registry.cc
+ function_registry_arithmetic.cc
+ function_registry_datetime.cc
+ function_registry_hash.cc
+ function_registry_math_ops.cc
+ function_registry_string.cc
+ function_registry_timestamp_arithmetic.cc
+ function_signature.cc
+ gdv_function_stubs.cc
+ hash_utils.cc
+ llvm_generator.cc
+ llvm_types.cc
+ like_holder.cc
+ literal_holder.cc
+ projector.cc
+ regex_util.cc
+ replace_holder.cc
+ selection_vector.cc
+ tree_expr_builder.cc
+ to_date_holder.cc
+ random_generator_holder.cc
+ ${GANDIVA_PRECOMPILED_CC_PATH})
+
+set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared LLVM::LLVM_INTERFACE
+ ${GANDIVA_OPENSSL_LIBS})
+
+set(GANDIVA_STATIC_LINK_LIBS arrow_static LLVM::LLVM_INTERFACE ${GANDIVA_OPENSSL_LIBS})
+
+if(ARROW_GANDIVA_STATIC_LIBSTDCPP AND (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX
+ ))
+ set(GANDIVA_STATIC_LINK_LIBS ${GANDIVA_STATIC_LINK_LIBS} -static-libstdc++
+ -static-libgcc)
+endif()
+
+# if (MSVC)
+# # Symbols that need to be made public in gandiva.dll for LLVM IR
+# # compilation
+# set(MSVC_SYMBOL_EXPORTS _Init_thread_header)
+# foreach(SYMBOL ${MSVC_SYMBOL_EXPORTS})
+# set(GANDIVA_SHARED_LINK_FLAGS "${GANDIVA_SHARED_LINK_FLAGS} /EXPORT:${SYMBOL}")
+# endforeach()
+# endif()
+if(CXX_LINKER_SUPPORTS_VERSION_SCRIPT)
+ set(GANDIVA_VERSION_SCRIPT_FLAGS
+ "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/symbols.map")
+ set(GANDIVA_SHARED_LINK_FLAGS
+ "${GANDIVA_SHARED_LINK_FLAGS} ${GANDIVA_VERSION_SCRIPT_FLAGS}")
+endif()
+
+add_arrow_lib(gandiva
+ CMAKE_PACKAGE_NAME
+ Gandiva
+ PKG_CONFIG_NAME
+ gandiva
+ SOURCES
+ ${SRC_FILES}
+ PRECOMPILED_HEADERS
+ "$<$<COMPILE_LANGUAGE:CXX>:gandiva/pch.h>"
+ OUTPUTS
+ GANDIVA_LIBRARIES
+ DEPENDENCIES
+ arrow_dependencies
+ precompiled
+ EXTRA_INCLUDES
+ $<TARGET_PROPERTY:LLVM::LLVM_INTERFACE,INTERFACE_INCLUDE_DIRECTORIES>
+ ${GANDIVA_OPENSSL_INCLUDE_DIR}
+ ${UTF8PROC_INCLUDE_DIR}
+ SHARED_LINK_FLAGS
+ ${GANDIVA_SHARED_LINK_FLAGS}
+ SHARED_LINK_LIBS
+ arrow_shared
+ SHARED_PRIVATE_LINK_LIBS
+ ${GANDIVA_SHARED_PRIVATE_LINK_LIBS}
+ STATIC_LINK_LIBS
+ ${GANDIVA_STATIC_LINK_LIBS})
+
+foreach(LIB_TARGET ${GANDIVA_LIBRARIES})
+ target_compile_definitions(${LIB_TARGET} PRIVATE GANDIVA_EXPORTING)
+ set_target_properties(${LIB_TARGET} PROPERTIES CXX_STANDARD ${GANDIVA_CXX_STANDARD})
+endforeach()
+
+if(ARROW_BUILD_STATIC AND WIN32)
+ target_compile_definitions(gandiva_static PUBLIC GANDIVA_STATIC)
+endif()
+
+add_dependencies(gandiva ${GANDIVA_LIBRARIES})
+
+arrow_install_all_headers("gandiva")
+
+set(GANDIVA_STATIC_TEST_LINK_LIBS gandiva_static ${ARROW_TEST_LINK_LIBS})
+
+set(GANDIVA_SHARED_TEST_LINK_LIBS gandiva_shared ${ARROW_TEST_LINK_LIBS})
+
+function(ADD_GANDIVA_TEST REL_TEST_NAME) # wraps add_test_case() with the "gandiva" prefix, label and link libs
+ set(options USE_STATIC_LINKING)
+ set(one_value_args)
+ set(multi_value_args)
+ cmake_parse_arguments(ARG
+ "${options}"
+ "${one_value_args}"
+ "${multi_value_args}"
+ ${ARGN})
+
+ if(NO_TESTS)
+ return()
+ endif()
+
+ set(TEST_ARGUMENTS
+ ENABLED
+ PREFIX
+ "gandiva"
+ LABELS
+ "gandiva-tests"
+ ${ARG_UNPARSED_ARGUMENTS})
+
+ # Linking the test against the shared library uses less disk space, but in some
+ # cases we need to force static linking (see rationale below).
+ if(ARG_USE_STATIC_LINKING OR ARROW_TEST_LINKAGE STREQUAL "static")
+ add_test_case(${REL_TEST_NAME}
+ ${TEST_ARGUMENTS}
+ STATIC_LINK_LIBS
+ ${GANDIVA_STATIC_TEST_LINK_LIBS}
+ ${ARG_UNPARSED_ARGUMENTS})
+ else()
+ add_test_case(${REL_TEST_NAME}
+ ${TEST_ARGUMENTS}
+ STATIC_LINK_LIBS # NOTE(review): same keyword on the shared path; presumably add_test_case's link-lib override - confirm
+ ${GANDIVA_SHARED_TEST_LINK_LIBS}
+ ${ARG_UNPARSED_ARGUMENTS})
+ endif()
+
+ set(TEST_NAME gandiva-${REL_TEST_NAME}) # target name: "gandiva-" prefix, then "_" replaced by "-" on the next line
+ string(REPLACE "_" "-" TEST_NAME ${TEST_NAME})
+ set_target_properties(${TEST_NAME} PROPERTIES CXX_STANDARD ${GANDIVA_CXX_STANDARD})
+endfunction()
+
+set(GANDIVA_INTERNALS_TEST_ARGUMENTS) # extra keyword arguments for internals-test; only populated on WIN32
+if(WIN32)
+ list(APPEND
+ GANDIVA_INTERNALS_TEST_ARGUMENTS
+ EXTRA_LINK_LIBS
+ LLVM::LLVM_INTERFACE
+ ${GANDIVA_OPENSSL_LIBS})
+endif()
+add_gandiva_test(internals-test
+ SOURCES
+ bitmap_accumulator_test.cc
+ engine_llvm_test.cc
+ function_registry_test.cc
+ function_signature_test.cc
+ llvm_types_test.cc
+ llvm_generator_test.cc
+ annotator_test.cc
+ tree_expr_test.cc
+ expr_decomposer_test.cc
+ expression_registry_test.cc
+ selection_vector_test.cc
+ greedy_dual_size_cache_test.cc
+ to_date_holder_test.cc
+ simple_arena_test.cc
+ like_holder_test.cc
+ replace_holder_test.cc
+ decimal_type_util_test.cc
+ random_generator_holder_test.cc
+ hash_utils_test.cc
+ gdv_function_stubs_test.cc
+ EXTRA_DEPENDENCIES
+ LLVM::LLVM_INTERFACE
+ ${GANDIVA_OPENSSL_LIBS}
+ EXTRA_INCLUDES
+ $<TARGET_PROPERTY:LLVM::LLVM_INTERFACE,INTERFACE_INCLUDE_DIRECTORIES>
+ ${GANDIVA_OPENSSL_INCLUDE_DIR}
+ ${UTF8PROC_INCLUDE_DIR}
+ ${GANDIVA_INTERNALS_TEST_ARGUMENTS}) # keep last: on WIN32 this starts an EXTRA_LINK_LIBS group that would swallow any values after it
+
+if(ARROW_GANDIVA_JAVA)
+ add_subdirectory(jni)
+endif()
+
+add_subdirectory(precompiled)
+add_subdirectory(tests)
diff --git a/src/arrow/cpp/src/gandiva/GandivaConfig.cmake.in b/src/arrow/cpp/src/gandiva/GandivaConfig.cmake.in
new file mode 100644
index 000000000..09bc33901
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/GandivaConfig.cmake.in
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# This config sets the following variables in your project::
+#
+# Gandiva_FOUND - true if Gandiva found on the system
+#
+# This config sets the following targets in your project::
+#
+# gandiva_shared - for linking as a shared library, if the shared library is built
+# gandiva_static - for linking as a static library, if the static library is built
+
+@PACKAGE_INIT@
+
+include(CMakeFindDependencyMacro)
+find_dependency(Arrow)
+
+# Load targets only once. If we load targets multiple times, CMake reports
+# an "already existing target" error.
+if(NOT (TARGET gandiva_shared OR TARGET gandiva_static))
+ include("${CMAKE_CURRENT_LIST_DIR}/GandivaTargets.cmake")
+endif()
diff --git a/src/arrow/cpp/src/gandiva/annotator.cc b/src/arrow/cpp/src/gandiva/annotator.cc
new file mode 100644
index 000000000..f6acaff18
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/annotator.cc
@@ -0,0 +1,118 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/annotator.h"
+
+#include <memory>
+#include <string>
+
+#include "gandiva/field_descriptor.h"
+
+namespace gandiva {
+
+FieldDescriptorPtr Annotator::CheckAndAddInputFieldDescriptor(FieldPtr field) {
+ // Input descriptors are keyed by field name only; reuse the entry if one exists.
+ auto found = in_name_to_desc_.find(field->name());
+ if (found != in_name_to_desc_.end()) {
+ return found->second;
+ }
+ // First time this name is seen: allocate buffer slots and remember the descriptor.
+ auto desc = MakeDesc(field, false /*is_output*/);
+ in_name_to_desc_[field->name()] = desc;
+ return desc;
+}
+
+FieldDescriptorPtr Annotator::AddOutputFieldDescriptor(FieldPtr field) {
+ auto desc = MakeDesc(field, true /*is_output*/);  // outputs are never de-duplicated by name
+ out_descs_.push_back(desc);  // order matters: PrepareEvalBatch pairs out_descs_[i] with out_vector[i]
+ return desc;
+}
+
+FieldDescriptorPtr Annotator::MakeDesc(FieldPtr field, bool is_output) {
+ int data_idx = buffer_count_++;  // every field takes a data slot and a validity slot, in that order
+ int validity_idx = buffer_count_++;
+ int offsets_idx = FieldDescriptor::kInvalidIdx;  // stays invalid unless the type is binary-like
+ if (arrow::is_binary_like(field->type()->id())) {
+ offsets_idx = buffer_count_++;
+ }
+ int data_buffer_ptr_idx = FieldDescriptor::kInvalidIdx;  // stays invalid for input fields
+ if (is_output) {
+ data_buffer_ptr_idx = buffer_count_++;  // extra slot that later holds the Buffer object pointer
+ }
+ return std::make_shared<FieldDescriptor>(field, data_idx, validity_idx, offsets_idx,
+ data_buffer_ptr_idx);
+}
+
+void Annotator::PrepareBuffersForField(const FieldDescriptor& desc,
+ const arrow::ArrayData& array_data,
+ EvalBatch* eval_batch, bool is_output) {
+ int buffer_idx = 0;  // walks array_data.buffers in order: validity, [offsets], data
+
+ // The validity buffer is optional. Use nullptr if it does not have one.
+ if (array_data.buffers[buffer_idx]) {
+ uint8_t* validity_buf = const_cast<uint8_t*>(array_data.buffers[buffer_idx]->data());
+ eval_batch->SetBuffer(desc.validity_idx(), validity_buf, array_data.offset);
+ } else {
+ eval_batch->SetBuffer(desc.validity_idx(), nullptr, array_data.offset);
+ }
+ ++buffer_idx;
+
+ if (desc.HasOffsetsIdx()) {  // only descriptors that were given an offsets slot consume an offsets buffer
+ uint8_t* offsets_buf = const_cast<uint8_t*>(array_data.buffers[buffer_idx]->data());
+ eval_batch->SetBuffer(desc.offsets_idx(), offsets_buf, array_data.offset);
+ ++buffer_idx;
+ }
+
+ uint8_t* data_buf = const_cast<uint8_t*>(array_data.buffers[buffer_idx]->data());  // NOTE(review): offsets/data buffers are dereferenced without a null check
+ eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.offset);
+ if (is_output) {
+ // Pass in the Buffer object itself (not its data) for output data buffers. Can be used for resizing.
+ uint8_t* data_buf_ptr =
+ reinterpret_cast<uint8_t*>(array_data.buffers[buffer_idx].get());
+ eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.offset);
+ }
+}
+
+EvalBatchPtr Annotator::PrepareEvalBatch(const arrow::RecordBatch& record_batch,
+ const ArrayDataVector& out_vector) {
+ EvalBatchPtr eval_batch = std::make_shared<EvalBatch>(
+ record_batch.num_rows(), buffer_count_, local_bitmap_count_);
+
+ // Fill in the entries for the input fields, matching columns to descriptors by name.
+ for (int i = 0; i < record_batch.num_columns(); ++i) {
+ const std::string& name = record_batch.column_name(i);
+ auto found = in_name_to_desc_.find(name);
+ if (found == in_name_to_desc_.end()) {
+ // skip columns not involved in the expression.
+ continue;
+ }
+
+ PrepareBuffersForField(*(found->second), *(record_batch.column(i))->data(),
+ eval_batch.get(), false /*is_output*/);
+ }
+
+ // Fill in the entries for the output fields; out_vector[i] pairs with the i-th output descriptor.
+ int idx = 0;
+ for (auto& arraydata : out_vector) {
+ const FieldDescriptorPtr& desc = out_descs_.at(idx);  // at() throws if out_vector has more entries than out_descs_
+ PrepareBuffersForField(*desc, *arraydata, eval_batch.get(), true /*is_output*/);
+ ++idx;
+ }
+ return eval_batch;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/annotator.h b/src/arrow/cpp/src/gandiva/annotator.h
new file mode 100644
index 000000000..5f185d183
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/annotator.h
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <list>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/util/logging.h"
+#include "gandiva/arrow.h"
+#include "gandiva/eval_batch.h"
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// \brief annotate the arrow fields in an expression, and use that
+/// to convert the incoming arrow-format row batch to an EvalBatch.
+class GANDIVA_EXPORT Annotator {
+ public:
+ Annotator() : buffer_count_(0), local_bitmap_count_(0) {}
+
+ /// Add an annotated field descriptor for a field in an input schema.
+ /// If a field with the same name is already annotated, returns that instead.
+ FieldDescriptorPtr CheckAndAddInputFieldDescriptor(FieldPtr field);
+
+ /// Add an annotated field descriptor for an output field.
+ FieldDescriptorPtr AddOutputFieldDescriptor(FieldPtr field);
+
+ /// Add a local bitmap (for saving validity bits of an intermediate node).
+ /// Returns the index of the bitmap in the list of local bitmaps.
+ int AddLocalBitMap() { return local_bitmap_count_++; }
+
+ /// Prepare an eval batch for the incoming record batch.
+ EvalBatchPtr PrepareEvalBatch(const arrow::RecordBatch& record_batch,
+ const ArrayDataVector& out_vector);
+
+ int buffer_count() { return buffer_count_; }
+
+ private:
+ /// Annotate a field and return the descriptor.
+ FieldDescriptorPtr MakeDesc(FieldPtr field, bool is_output);
+
+ /// Populate eval_batch by extracting the raw buffers from the arrow array, whose
+ /// contents are represented by the annotated descriptor 'desc'.
+ void PrepareBuffersForField(const FieldDescriptor& desc,
+ const arrow::ArrayData& array_data, EvalBatch* eval_batch,
+ bool is_output);
+
+ /// The number of input/output buffer slots handed out so far (covers bitmap
+ /// buffers, value buffers and offset buffers).
+ int buffer_count_;
+
+ /// The number of local bitmaps. These are used to save the validity bits for
+ /// intermediate nodes in the expression tree.
+ int local_bitmap_count_;
+
+ /// map between field name and annotated input field descriptor.
+ std::unordered_map<std::string, FieldDescriptorPtr> in_name_to_desc_;
+
+ /// vector of annotated output field descriptors, in the order they were added.
+ std::vector<FieldDescriptorPtr> out_descs_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/annotator_test.cc b/src/arrow/cpp/src/gandiva/annotator_test.cc
new file mode 100644
index 000000000..e537943d9
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/annotator_test.cc
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/annotator.h"
+
+#include <memory>
+#include <utility>
+
+#include <arrow/memory_pool.h>
+#include <gtest/gtest.h>
+#include "gandiva/field_descriptor.h"
+
+namespace gandiva {
+
+class TestAnnotator : public ::testing::Test {
+ protected:
+ ArrayPtr MakeInt32Array(int length);  // builds an int32 array with freshly allocated validity and value buffers
+};
+
+ArrayPtr TestAnnotator::MakeInt32Array(int length) {
+ arrow::Status status;  // NOTE(review): unused in this function
+
+ auto validity = *arrow::AllocateBuffer((length + 63) / 8);  // NOTE(review): more than the minimal (length + 7) / 8 bytes; presumably deliberate padding - confirm
+
+ auto values = *arrow::AllocateBuffer(length * sizeof(int32_t));
+
+ auto array_data = arrow::ArrayData::Make(arrow::int32(), length,
+ {std::move(validity), std::move(values)});  // buffer order: validity first, then values
+ return arrow::MakeArray(array_data);
+}
+
+TEST_F(TestAnnotator, TestAdd) {
+ Annotator annotator;
+
+ auto field_a = arrow::field("a", arrow::int32());
+ auto field_b = arrow::field("b", arrow::int32());
+ auto in_schema = arrow::schema({field_a, field_b});
+ auto field_sum = arrow::field("sum", arrow::int32());
+
+ FieldDescriptorPtr desc_a = annotator.CheckAndAddInputFieldDescriptor(field_a);
+ EXPECT_EQ(desc_a->field(), field_a);
+ EXPECT_EQ(desc_a->data_idx(), 0);
+ EXPECT_EQ(desc_a->validity_idx(), 1);
+
+ // duplicate add shouldn't cause a new descriptor.
+ FieldDescriptorPtr dup = annotator.CheckAndAddInputFieldDescriptor(field_a);
+ EXPECT_EQ(dup, desc_a);
+ EXPECT_EQ(dup->validity_idx(), desc_a->validity_idx());
+
+ FieldDescriptorPtr desc_b = annotator.CheckAndAddInputFieldDescriptor(field_b);
+ EXPECT_EQ(desc_b->field(), field_b);
+ EXPECT_EQ(desc_b->data_idx(), 2);
+ EXPECT_EQ(desc_b->validity_idx(), 3);
+
+ FieldDescriptorPtr desc_sum = annotator.AddOutputFieldDescriptor(field_sum);
+ EXPECT_EQ(desc_sum->field(), field_sum);
+ EXPECT_EQ(desc_sum->data_idx(), 4);
+ EXPECT_EQ(desc_sum->validity_idx(), 5);
+ EXPECT_EQ(desc_sum->data_buffer_ptr_idx(), 6);
+
+ // prepare the input column arrays
+ int num_records = 100;
+ auto arrow_v0 = MakeInt32Array(num_records);
+ auto arrow_v1 = MakeInt32Array(num_records);
+
+ // prepare input record batch
+ auto record_batch =
+ arrow::RecordBatch::Make(in_schema, num_records, {arrow_v0, arrow_v1});
+
+ auto arrow_sum = MakeInt32Array(num_records);
+ EvalBatchPtr batch = annotator.PrepareEvalBatch(*record_batch, {arrow_sum->data()});
+ EXPECT_EQ(batch->GetNumBuffers(), 7);  // 2 slots each for a and b, plus 3 for sum (data, validity, buffer ptr)
+
+ auto buffers = batch->GetBufferArray();
+ EXPECT_EQ(buffers[desc_a->validity_idx()], arrow_v0->data()->buffers.at(0)->data());
+ EXPECT_EQ(buffers[desc_a->data_idx()], arrow_v0->data()->buffers.at(1)->data());
+ EXPECT_EQ(buffers[desc_b->validity_idx()], arrow_v1->data()->buffers.at(0)->data());
+ EXPECT_EQ(buffers[desc_b->data_idx()], arrow_v1->data()->buffers.at(1)->data());
+ EXPECT_EQ(buffers[desc_sum->validity_idx()], arrow_sum->data()->buffers.at(0)->data());
+ EXPECT_EQ(buffers[desc_sum->data_idx()], arrow_sum->data()->buffers.at(1)->data());
+ EXPECT_EQ(buffers[desc_sum->data_buffer_ptr_idx()],
+ reinterpret_cast<uint8_t*>(arrow_sum->data()->buffers.at(1).get()));
+
+ auto bitmaps = batch->GetLocalBitMapArray();  // no local bitmaps were requested via AddLocalBitMap()
+ EXPECT_EQ(bitmaps, nullptr);
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/arrow.h b/src/arrow/cpp/src/gandiva/arrow.h
new file mode 100644
index 000000000..e6d40cb18
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/arrow.h
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "arrow/array.h" // IWYU pragma: export
+#include "arrow/builder.h" // IWYU pragma: export
+#include "arrow/pretty_print.h" // IWYU pragma: export
+#include "arrow/record_batch.h" // IWYU pragma: export
+#include "arrow/status.h" // IWYU pragma: export
+#include "arrow/type.h" // IWYU pragma: export
+
+namespace gandiva {
+
+// Bring frequently used arrow names into the gandiva namespace so they can be
+// used unqualified by code that includes this header.
+using arrow::ArrayDataVector;
+using arrow::DataTypeVector;
+using arrow::FieldVector;
+using arrow::Result;
+using arrow::Status;
+using arrow::StatusCode;
+
+// Shared-pointer aliases for the core arrow types.
+using ArrayPtr = std::shared_ptr<arrow::Array>;
+using ArrayDataPtr = std::shared_ptr<arrow::ArrayData>;
+using DataTypePtr = std::shared_ptr<arrow::DataType>;
+using FieldPtr = std::shared_ptr<arrow::Field>;
+using RecordBatchPtr = std::shared_ptr<arrow::RecordBatch>;
+using SchemaPtr = std::shared_ptr<arrow::Schema>;
+
+// Aliases for 128-bit decimal types (single pointer and vector of pointers).
+using Decimal128TypePtr = std::shared_ptr<arrow::Decimal128Type>;
+using Decimal128TypeVector = std::vector<Decimal128TypePtr>;
+
+/// \brief Check whether a data type is a decimal whose values occupy 16 bytes.
+///
+/// \param[in] type the data type to inspect; it is dereferenced, so it must not be null
+/// \return true when the type id is arrow::Type::DECIMAL and its byte width is 16,
+/// false otherwise
+static inline bool is_decimal_128(const DataTypePtr& type) {
+  // The shared_ptr is taken by const reference: the function only reads through
+  // it, so there is no need to pay for a reference-count increment/decrement.
+  if (type->id() != arrow::Type::DECIMAL) {
+    return false;
+  }
+  auto decimal_type = arrow::internal::checked_cast<arrow::DecimalType*>(type.get());
+  return decimal_type->byte_width() == 16;
+}
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/basic_decimal_scalar.h b/src/arrow/cpp/src/gandiva/basic_decimal_scalar.h
new file mode 100644
index 000000000..b2f0da506
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/basic_decimal_scalar.h
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/util/basic_decimal.h"
+#include "arrow/util/decimal.h"
+
+namespace gandiva {
+
+using arrow::BasicDecimal128;
+
+/// Represents a 128-bit decimal value along with its precision and scale.
+///
+/// The class is a plain value holder: it stores the three fields given at
+/// construction and exposes them through read-only accessors.
+class BasicDecimalScalar128 {
+ public:
+ /// Build the scalar from the high and low 64-bit halves of the decimal value.
+ constexpr BasicDecimalScalar128(int64_t high_bits, uint64_t low_bits, int32_t precision,
+ int32_t scale)
+ : value_(high_bits, low_bits), precision_(precision), scale_(scale) {}
+
+ /// Build the scalar from an existing BasicDecimal128 value.
+ constexpr BasicDecimalScalar128(const BasicDecimal128& value, int32_t precision,
+ int32_t scale)
+ : value_(value), precision_(precision), scale_(scale) {}
+
+ /// Build the scalar with only precision and scale; value_ is default-constructed.
+ constexpr BasicDecimalScalar128(int32_t precision, int32_t scale)
+ : precision_(precision), scale_(scale) {}
+
+ /// \return the scale supplied at construction
+ int32_t scale() const { return scale_; }
+
+ /// \return the precision supplied at construction
+ int32_t precision() const { return precision_; }
+
+ /// \return a reference to the stored 128-bit decimal value
+ const BasicDecimal128& value() const { return value_; }
+
+ private:
+ BasicDecimal128 value_;
+ int32_t precision_;
+ int32_t scale_;
+};
+
+/// Two decimal scalars compare equal when their value, precision and scale all
+/// match. The fields are compared in that order and the first mismatch decides.
+inline bool operator==(const BasicDecimalScalar128& left,
+                       const BasicDecimalScalar128& right) {
+  if (!(left.value() == right.value())) {
+    return false;
+  }
+  if (left.precision() != right.precision()) {
+    return false;
+  }
+  return left.scale() == right.scale();
+}
+
+/// Unary minus: negates the stored value and carries precision and scale over
+/// unchanged.
+inline BasicDecimalScalar128 operator-(const BasicDecimalScalar128& operand) {
+  const BasicDecimal128 negated = -operand.value();
+  return BasicDecimalScalar128{negated, operand.precision(), operand.scale()};
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/bitmap_accumulator.cc b/src/arrow/cpp/src/gandiva/bitmap_accumulator.cc
new file mode 100644
index 000000000..8fc66b389
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/bitmap_accumulator.cc
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/bitmap_accumulator.h"
+
+#include <vector>
+
+#include "arrow/util/bitmap_ops.h"
+
+namespace gandiva {
+
+/// Write the final validity bitmap for the batch into dst_bitmap: either all
+/// zeros (when a FalseDex was visited) or the intersection of the accumulated
+/// source bitmaps.
+void BitMapAccumulator::ComputeResult(uint8_t* dst_bitmap) {
+  const int64_t record_count = eval_batch_.num_records();
+
+  if (!all_invalid_) {
+    IntersectBitMaps(dst_bitmap, src_maps_, src_map_offsets_, record_count);
+    return;
+  }
+
+  // set all bits to 0.
+  memset(dst_bitmap, 0, arrow::BitUtil::BytesForBits(record_count));
+}
+
+/// Compute the intersection (bitwise AND) of multiple bitmaps.
+///
+/// \param[out] dst_map destination bitmap, written starting at bit offset 0
+/// \param[in] src_maps source bitmaps to intersect (may be empty)
+/// \param[in] src_map_offsets bit offset of the first record in each source bitmap
+/// \param[in] num_records number of bits to process
+void BitMapAccumulator::IntersectBitMaps(uint8_t* dst_map,
+                                         const std::vector<uint8_t*>& src_maps,
+                                         const std::vector<int64_t>& src_map_offsets,
+                                         int64_t num_records) {
+  const int64_t map_count = src_maps.size();
+
+  if (map_count == 0) {
+    // No source bitmap: simply set all bits. The byte count is rounded up to a
+    // multiple of 8 bytes, so dst_map must have room for that many bytes.
+    const int64_t word_count = (num_records + 63) / 64;
+    const int64_t byte_count = word_count * 8;
+    memset(dst_map, 0xff, byte_count);
+    return;
+  }
+
+  if (map_count == 1) {
+    // A single source bitmap: copy it to dst_map, dropping the source offset.
+    arrow::internal::CopyBitmap(src_maps[0], src_map_offsets[0], num_records, dst_map,
+                                0);
+    return;
+  }
+
+  // Two or more source bitmaps: AND the first pair into dst_map, then fold each
+  // remaining bitmap into the running result.
+  arrow::internal::BitmapAnd(src_maps[0], src_map_offsets[0], src_maps[1],
+                             src_map_offsets[1], num_records, /*offset=*/0, dst_map);
+  for (int64_t idx = 2; idx < map_count; ++idx) {
+    arrow::internal::BitmapAnd(dst_map, 0, src_maps[idx], src_map_offsets[idx],
+                               num_records,
+                               /*offset=*/0, dst_map);
+  }
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/bitmap_accumulator.h b/src/arrow/cpp/src/gandiva/bitmap_accumulator.h
new file mode 100644
index 000000000..0b297a98f
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/bitmap_accumulator.h
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/util/macros.h"
+#include "gandiva/dex.h"
+#include "gandiva/dex_visitor.h"
+#include "gandiva/eval_batch.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// \brief Extract bitmap buffer from either the input/buffer vectors or the
+/// local validity bitmap, and accumulates them to do the final computation.
+///
+/// Each Visit() overload records what the visited validity dex contributes;
+/// ComputeResult() then produces the combined bitmap.
+class GANDIVA_EXPORT BitMapAccumulator : public DexDefaultVisitor {
+ public:
+  explicit BitMapAccumulator(const EvalBatch& eval_batch)
+      : eval_batch_(eval_batch), all_invalid_(false) {}
+
+  /// Record the validity buffer of an input/output vector, with its bit offset.
+  void Visit(const VectorReadValidityDex& dex) {
+    const int buffer_idx = dex.ValidityIdx();
+    auto validity = eval_batch_.GetBuffer(buffer_idx);
+    // The bitmap could be null. Ignore it in this case.
+    if (validity == NULLPTR) {
+      return;
+    }
+    src_maps_.push_back(validity);
+    src_map_offsets_.push_back(eval_batch_.GetBufferOffset(buffer_idx));
+  }
+
+  /// Record a local bitmap of the batch; local bitmaps always start at bit 0.
+  void Visit(const LocalBitMapValidityDex& dex) {
+    const int bitmap_idx = dex.local_bitmap_idx();
+    src_maps_.push_back(eval_batch_.GetLocalBitMap(bitmap_idx));
+    src_map_offsets_.push_back(0);  // local bitmap has offset 0
+  }
+
+  /// A constant "true" validity contributes nothing to the intersection.
+  void Visit(const TrueDex& dex) {
+    // bitwise-and with 1 is always 1. so, ignore.
+  }
+
+  /// A constant "false" validity forces the whole result to zero.
+  void Visit(const FalseDex& dex) {
+    // The final result is "all 0s".
+    all_invalid_ = true;
+  }
+
+  /// Compute the dst_bmap based on the contents and type of the accumulated bitmap dex.
+  void ComputeResult(uint8_t* dst_bitmap);
+
+  /// Compute the intersection of the accumulated bitmaps (with offsets) and save the
+  /// result in dst_bmap.
+  static void IntersectBitMaps(uint8_t* dst_map, const std::vector<uint8_t*>& src_maps,
+                               const std::vector<int64_t>& src_maps_offsets,
+                               int64_t num_records);
+
+ private:
+  // Batch whose buffers and local bitmaps are read by the Visit() overloads.
+  const EvalBatch& eval_batch_;
+  // Accumulated source bitmaps and, index-aligned, their starting bit offsets.
+  std::vector<uint8_t*> src_maps_;
+  std::vector<int64_t> src_map_offsets_;
+  // Set once a FalseDex has been visited.
+  bool all_invalid_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/bitmap_accumulator_test.cc b/src/arrow/cpp/src/gandiva/bitmap_accumulator_test.cc
new file mode 100644
index 000000000..ccffab3e9
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/bitmap_accumulator_test.cc
@@ -0,0 +1,112 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/bitmap_accumulator.h"
+
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/util.h"
+#include "arrow/util/bitmap_ops.h"
+
+#include "gandiva/dex.h"
+
+namespace gandiva {
+
+/// Test fixture providing helpers to build input bitmaps and to compute the
+/// expected intersection independently of BitMapAccumulator.
+class TestBitMapAccumulator : public ::testing::Test {
+ protected:
+  /// Fill nbytes bytes starting at bmap using ::arrow::random_bytes with the
+  /// given seed. The last parameter is a byte count (not a record count), which
+  /// matches the definition and every call site.
+  void FillBitMap(uint8_t* bmap, uint32_t seed, int nbytes);
+
+  /// Reference intersection of srcs (each read from its bit offset in
+  /// srcOffsets) over nrecords bits, written to dst at bit offset 0.
+  void ByteWiseIntersectBitMaps(uint8_t* dst, const std::vector<uint8_t*>& srcs,
+                                const std::vector<int64_t>& srcOffsets, int nrecords);
+};
+
+// Fills nbytes bytes at bmap by delegating to ::arrow::random_bytes with the
+// supplied seed.
+void TestBitMapAccumulator::FillBitMap(uint8_t* bmap, uint32_t seed, int nbytes) {
+ ::arrow::random_bytes(nbytes, seed, bmap);
+}
+
+// Reference implementation used to build the expected bitmap: with no sources
+// every bit is set, otherwise the first source is copied and each further
+// source is ANDed into the result.
+void TestBitMapAccumulator::ByteWiseIntersectBitMaps(
+    uint8_t* dst, const std::vector<uint8_t*>& srcs,
+    const std::vector<int64_t>& srcOffsets, int nrecords) {
+  if (!srcs.empty()) {
+    arrow::internal::CopyBitmap(srcs[0], srcOffsets[0], nrecords, dst, 0);
+    for (size_t src_idx = 1; src_idx < srcs.size(); ++src_idx) {
+      arrow::internal::BitmapAnd(dst, 0, srcs[src_idx], srcOffsets[src_idx], nrecords, 0,
+                                 dst);
+    }
+    return;
+  }
+
+  arrow::BitUtil::SetBitsTo(dst, 0, nrecords, true);
+}
+
+// Checks IntersectBitMaps against the reference helper for 0, 1, 2 and 3
+// source bitmaps, all read from bit offset 0.
+TEST_F(TestBitMapAccumulator, TestIntersectBitMaps) {
+ const int length = 128;
+ const int nrecords = length * 8;
+ uint8_t src_bitmaps[4][length];
+ uint8_t dst_bitmap[length];
+ uint8_t expected_bitmap[length];
+
+ // Each source bitmap is filled using its own index as the seed.
+ for (int i = 0; i < 4; i++) {
+ FillBitMap(src_bitmaps[i], i, length);
+ }
+
+ // Iteration i intersects the first i source bitmaps.
+ for (int i = 0; i < 4; i++) {
+ std::vector<uint8_t*> src_bitmap_ptrs;
+ std::vector<int64_t> src_bitmap_offsets(i, 0);
+ for (int j = 0; j < i; ++j) {
+ src_bitmap_ptrs.push_back(src_bitmaps[j]);
+ }
+
+ BitMapAccumulator::IntersectBitMaps(dst_bitmap, src_bitmap_ptrs, src_bitmap_offsets,
+ nrecords);
+ ByteWiseIntersectBitMaps(expected_bitmap, src_bitmap_ptrs, src_bitmap_offsets,
+ nrecords);
+ // nrecords covers every bit of the buffers, so a whole-buffer compare is valid.
+ EXPECT_EQ(memcmp(dst_bitmap, expected_bitmap, length), 0);
+ }
+}
+
+// Same as TestIntersectBitMaps, but source bitmap j is read starting at bit
+// offset j.
+TEST_F(TestBitMapAccumulator, TestIntersectBitMapsWithOffset) {
+ const int length = 128;
+ uint8_t src_bitmaps[4][length];
+ uint8_t dst_bitmap[length];
+ uint8_t expected_bitmap[length];
+
+ // Each source bitmap is filled using its own index as the seed.
+ for (int i = 0; i < 4; i++) {
+ FillBitMap(src_bitmaps[i], i, length);
+ }
+
+ // Iteration i intersects the first i source bitmaps.
+ for (int i = 0; i < 4; i++) {
+ std::vector<uint8_t*> src_bitmap_ptrs;
+ std::vector<int64_t> src_bitmap_offsets;
+ for (int j = 0; j < i; ++j) {
+ src_bitmap_ptrs.push_back(src_bitmaps[j]);
+ src_bitmap_offsets.push_back(j); // offset j
+ }
+ // The largest offset used is i - 1, so the record count is reduced by that
+ // amount to keep offset + nrecords within the length * 8 bits of each source.
+ const int nrecords = (i == 0) ? length * 8 : length * 8 - i + 1;
+
+ BitMapAccumulator::IntersectBitMaps(dst_bitmap, src_bitmap_ptrs, src_bitmap_offsets,
+ nrecords);
+ ByteWiseIntersectBitMaps(expected_bitmap, src_bitmap_ptrs, src_bitmap_offsets,
+ nrecords);
+ // Only the first nrecords bits are compared.
+ EXPECT_TRUE(
+ arrow::internal::BitmapEquals(dst_bitmap, 0, expected_bitmap, 0, nrecords));
+ }
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/cache.cc b/src/arrow/cpp/src/gandiva/cache.cc
new file mode 100644
index 000000000..d823a676b
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/cache.cc
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/cache.h"
+#include "arrow/util/logging.h"
+
+namespace gandiva {
+
+static const int DEFAULT_CACHE_SIZE = 500;
+
+/// \brief Determine the capacity to use for the gandiva cache.
+///
+/// Reads the GANDIVA_CACHE_SIZE environment variable. When it is unset,
+/// DEFAULT_CACHE_SIZE is returned. When it is set but does not parse to a
+/// positive value that fits in an int, a warning is logged and
+/// DEFAULT_CACHE_SIZE is returned.
+///
+/// \return the cache capacity, always greater than zero
+int GetCapacity() {
+  const char* env_cache_size = std::getenv("GANDIVA_CACHE_SIZE");
+  if (env_cache_size == nullptr) {
+    return DEFAULT_CACHE_SIZE;
+  }
+
+  // std::atoi has undefined behaviour when the converted value is out of range
+  // for int. std::strtoll saturates instead, and the round trip through int
+  // below rejects anything that does not fit.
+  const long long parsed = std::strtoll(env_cache_size, nullptr, 10);
+  const int capacity = static_cast<int>(parsed);
+  if (parsed <= 0 || static_cast<long long>(capacity) != parsed) {
+    ARROW_LOG(WARNING) << "Invalid cache size provided. Using default cache size: "
+                       << DEFAULT_CACHE_SIZE;
+    return DEFAULT_CACHE_SIZE;
+  }
+  return capacity;
+}
+
+// Emits an INFO-level log line reporting the capacity the cache is created with.
+void LogCacheSize(size_t capacity) {
+ ARROW_LOG(INFO) << "Creating gandiva cache with capacity: " << capacity;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/cache.h b/src/arrow/cpp/src/gandiva/cache.h
new file mode 100644
index 000000000..8d0f75ce3
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/cache.h
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdlib>
+#include <memory>
+#include <mutex>
+
+#include "gandiva/greedy_dual_size_cache.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// \return the capacity used by a default-constructed Cache
+GANDIVA_EXPORT
+int GetCapacity();
+
+/// Log the capacity a cache is being created with; called from the Cache
+/// constructor.
+GANDIVA_EXPORT
+void LogCacheSize(size_t capacity);
+
+/// \brief Mutex-guarded wrapper around GreedyDualSizeCache.
+///
+/// Every access to the underlying cache happens while mtx_ is held.
+template <class KeyType, typename ValueType>
+class Cache {
+ public:
+  /// Create a cache with the given capacity and log that capacity.
+  explicit Cache(size_t capacity) : cache_(capacity) { LogCacheSize(capacity); }
+
+  /// Create a cache whose capacity is obtained from GetCapacity().
+  Cache() : Cache(GetCapacity()) {}
+
+  /// Look up cache_key.
+  ///
+  /// \return the `module` member of the cached object, or nullptr when the key
+  /// is not present
+  ValueType GetModule(KeyType cache_key) {
+    arrow::util::optional<ValueCacheObject<ValueType>> result;
+    {
+      // Scoped lock: the mutex is released even if the lookup throws, and it
+      // is not held while the result is unpacked below.
+      std::lock_guard<std::mutex> lock(mtx_);
+      result = cache_.get(cache_key);
+    }
+    return result != arrow::util::nullopt ? (*result).module : nullptr;
+  }
+
+  /// Insert valueCacheObject under cache_key.
+  void PutModule(KeyType cache_key, ValueCacheObject<ValueType> valueCacheObject) {
+    // Scoped lock: the mutex is released even if the insertion throws.
+    std::lock_guard<std::mutex> lock(mtx_);
+    cache_.insert(cache_key, valueCacheObject);
+  }
+
+ private:
+  GreedyDualSizeCache<KeyType, ValueType> cache_;
+  std::mutex mtx_;
+};
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/cast_time.cc b/src/arrow/cpp/src/gandiva/cast_time.cc
new file mode 100644
index 000000000..843ce01f8
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/cast_time.cc
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+
+#include "arrow/vendored/datetime.h"
+
+#include "gandiva/precompiled/time_fields.h"
+
+#ifndef GANDIVA_UNIT_TEST
+#include "gandiva/exported_funcs.h"
+#include "gandiva/gdv_function_stubs.h"
+
+#include "gandiva/engine.h"
+
+namespace gandiva {
+
+// Registers the C function gdv_fn_time_with_zone with the engine, under the
+// same name, together with the LLVM return and argument types listed below.
+void ExportedTimeFunctions::AddMappings(Engine* engine) const {
+ std::vector<llvm::Type*> args;
+ auto types = engine->types();
+
+ // gdv_fn_time_with_zone
+ // NOTE(review): the last argument is registered as a plain i64 although the C
+ // function takes an int64_t* (see the "timestamp *ret_time" remark) - confirm
+ // whether a pointer type was intended here.
+ args = {types->ptr_type(types->i32_type()), // time fields
+ types->i8_ptr_type(), // const char* zone
+ types->i32_type(), // int data_len
+ types->i64_type()}; // timestamp *ret_time
+
+ engine->AddGlobalMappingForFunc("gdv_fn_time_with_zone",
+ types->i32_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_time_with_zone));
+}
+
+} // namespace gandiva
+#endif // !GANDIVA_UNIT_TEST
+
+extern "C" {
+
+// Interprets time_fields as a local date/time in the time zone named by
+// zone[0, zone_len) and stores the corresponding system-clock tick count in
+// *ret_time. A std::chrono::milliseconds term is part of the sum, so the stored
+// count has millisecond resolution.
+//
+// time_fields is indexed with the gandiva::TimeFields constants (kYear, kMonth,
+// kDay, kHours, kMinutes, kSeconds, kSubSeconds).
+//
+// Returns 0 on success. Returns EINVAL when an argument is unusable (null
+// pointer, negative zone_len) or when the conversion throws, e.g. because the
+// zone cannot be located; *ret_time is left untouched in that case.
+int gdv_fn_time_with_zone(int* time_fields, const char* zone, int zone_len,
+                          int64_t* ret_time) {
+  using arrow_vendored::date::day;
+  using arrow_vendored::date::local_days;
+  using arrow_vendored::date::locate_zone;
+  using arrow_vendored::date::month;
+  using arrow_vendored::date::time_zone;
+  using arrow_vendored::date::year;
+  using std::chrono::hours;
+  using std::chrono::milliseconds;
+  using std::chrono::minutes;
+  using std::chrono::seconds;
+
+  using gandiva::TimeFields;
+
+  // Validate the inputs up front instead of dereferencing null pointers or
+  // building a string from a negative length.
+  if (time_fields == nullptr || zone == nullptr || ret_time == nullptr ||
+      zone_len < 0) {
+    return EINVAL;
+  }
+
+  try {
+    const time_zone* tz = locate_zone(std::string(zone, zone_len));
+    *ret_time = tz->to_sys(local_days(year(time_fields[TimeFields::kYear]) /
+                                      month(time_fields[TimeFields::kMonth]) /
+                                      day(time_fields[TimeFields::kDay])) +
+                           hours(time_fields[TimeFields::kHours]) +
+                           minutes(time_fields[TimeFields::kMinutes]) +
+                           seconds(time_fields[TimeFields::kSeconds]) +
+                           milliseconds(time_fields[TimeFields::kSubSeconds]))
+                    .time_since_epoch()
+                    .count();
+  } catch (...) {
+    // Any failure in the zone lookup or the conversion is reported as EINVAL.
+    return EINVAL;
+  }
+
+  return 0;
+}
+
+} // extern "C"
diff --git a/src/arrow/cpp/src/gandiva/compiled_expr.h b/src/arrow/cpp/src/gandiva/compiled_expr.h
new file mode 100644
index 000000000..ba0ca3437
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/compiled_expr.h
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+#include "gandiva/llvm_includes.h"
+#include "gandiva/selection_vector.h"
+#include "gandiva/value_validity_pair.h"
+
+namespace gandiva {
+
+/// Signature of the functions stored by CompiledExpr::SetJITFunction: takes the
+/// buffer and offset arrays, the local bitmaps, a selection buffer, the
+/// execution context (passed as an integer) and the record count, and returns
+/// an int.
+using EvalFunc = int (*)(uint8_t** buffers, int64_t* offsets, uint8_t** local_bitmaps,
+ const uint8_t* selection_buffer, int64_t execution_ctx_ptr,
+ int64_t record_count);
+
+/// \brief Tracks the compiled state for one expression.
+///
+/// Holds the value/validity pair of the expression root, the output field, and
+/// one IR function plus one JIT function per selection-vector mode. Function
+/// slots start out null, so a getter returns nullptr for a mode whose function
+/// has not been set yet.
+class CompiledExpr {
+ public:
+  CompiledExpr(ValueValidityPairPtr value_validity, FieldDescriptorPtr output)
+      : value_validity_(value_validity), output_(output) {}
+
+  /// \return the value/validity pair for the root of the expression tree
+  ValueValidityPairPtr value_validity() const { return value_validity_; }
+
+  /// \return the descriptor of the output field
+  FieldDescriptorPtr output() const { return output_; }
+
+  /// Store the IR function generated for the given selection-vector mode.
+  void SetIRFunction(SelectionVector::Mode mode, llvm::Function* ir_function) {
+    ir_functions_[static_cast<int>(mode)] = ir_function;
+  }
+
+  /// \return the IR function for the given mode, or nullptr if none was set
+  llvm::Function* GetIRFunction(SelectionVector::Mode mode) const {
+    return ir_functions_[static_cast<int>(mode)];
+  }
+
+  /// Store the JIT-compiled function for the given selection-vector mode.
+  void SetJITFunction(SelectionVector::Mode mode, EvalFunc jit_function) {
+    jit_functions_[static_cast<int>(mode)] = jit_function;
+  }
+
+  /// \return the JIT function for the given mode, or nullptr if none was set
+  EvalFunc GetJITFunction(SelectionVector::Mode mode) const {
+    return jit_functions_[static_cast<int>(mode)];
+  }
+
+ private:
+  // value & validities for the expression tree (root)
+  ValueValidityPairPtr value_validity_;
+
+  // output field
+  FieldDescriptorPtr output_;
+
+  // IR functions for various modes in the generated code. Value-initialized so
+  // that reading a slot before it is set yields nullptr rather than an
+  // indeterminate pointer.
+  std::array<llvm::Function*, SelectionVector::kNumModes> ir_functions_{};
+
+  // JIT functions in the generated code (set after the module is optimised and
+  // finalized). Value-initialized for the same reason as ir_functions_.
+  std::array<EvalFunc, SelectionVector::kNumModes> jit_functions_{};
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/condition.h b/src/arrow/cpp/src/gandiva/condition.h
new file mode 100644
index 000000000..a3e8f9d1f
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/condition.h
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "gandiva/arrow.h"
+#include "gandiva/expression.h"
+#include "gandiva/gandiva_aliases.h"
+
+namespace gandiva {
+
+/// \brief A condition expression.
+///
+/// An Expression whose result field is fixed to a boolean field named "cond".
+class Condition : public Expression {
+ public:
+ /// \param[in] root root node of the condition's expression tree
+ explicit Condition(const NodePtr root)
+ : Expression(root, std::make_shared<arrow::Field>("cond", arrow::boolean())) {}
+
+ virtual ~Condition() = default;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/configuration.cc b/src/arrow/cpp/src/gandiva/configuration.cc
new file mode 100644
index 000000000..1e26c5c70
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/configuration.cc
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/configuration.h"
+
+#include "arrow/util/hash_util.h"
+
+namespace gandiva {
+
+// Definition of the shared default configuration returned by
+// ConfigurationBuilder::DefaultConfiguration(); it is initialised by calling
+// InitDefaultConfig().
+const std::shared_ptr<Configuration> ConfigurationBuilder::default_configuration_ =
+ InitDefaultConfig();
+
+// Combines both configuration flags into a single hash value, starting from a
+// seed of zero and mixing in optimize_ first, then target_host_cpu_.
+std::size_t Configuration::Hash() const {
+  size_t seed = 0;
+  const size_t optimize_bit = static_cast<size_t>(optimize_);
+  const size_t host_cpu_bit = static_cast<size_t>(target_host_cpu_);
+  arrow::internal::hash_combine(seed, optimize_bit);
+  arrow::internal::hash_combine(seed, host_cpu_bit);
+  return seed;
+}
+
+// Two configurations are equal when both flags match.
+bool Configuration::operator==(const Configuration& other) const {
+  if (optimize_ != other.optimize_) {
+    return false;
+  }
+  return target_host_cpu_ == other.target_host_cpu_;
+}
+
+// Defined as the negation of operator==.
+bool Configuration::operator!=(const Configuration& other) const {
+  const bool same = operator==(other);
+  return !same;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/configuration.h b/src/arrow/cpp/src/gandiva/configuration.h
new file mode 100644
index 000000000..9cd301524
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/configuration.h
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/status.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+class ConfigurationBuilder;
+/// \brief runtime config for gandiva
+///
+/// It contains elements to customize gandiva execution
+/// at run time: whether the generated IR is optimised and whether compilation
+/// targets the host cpu.
+class GANDIVA_EXPORT Configuration {
+ public:
+  friend class ConfigurationBuilder;
+
+  /// Default configuration: optimisation on, host cpu targeting on.
+  Configuration() : Configuration(true) {}
+
+  /// Configuration with the given optimisation setting; host cpu targeting on.
+  explicit Configuration(bool optimize) : optimize_(optimize), target_host_cpu_(true) {}
+
+  std::size_t Hash() const;
+  bool operator==(const Configuration& other) const;
+  bool operator!=(const Configuration& other) const;
+
+  /// \return whether the generated llvm IR is optimised
+  bool optimize() const { return optimize_; }
+
+  /// \return whether the mcpu flag is set to the host cpu
+  bool target_host_cpu() const { return target_host_cpu_; }
+
+  /// Setter for the optimize flag.
+  void set_optimize(bool optimize) { optimize_ = optimize; }
+
+  /// Setter for the target_host_cpu flag (overload of the getter's name).
+  void target_host_cpu(bool target_host_cpu) { target_host_cpu_ = target_host_cpu; }
+
+ private:
+  bool optimize_;        /* optimise the generated llvm IR */
+  bool target_host_cpu_; /* set the mcpu flag to host cpu while compiling llvm ir */
+};
+
+/// \brief configuration builder for gandiva
+///
+/// Provides a default configuration and convenience methods
+/// to override specific values and build a custom instance
+class GANDIVA_EXPORT ConfigurationBuilder {
+ public:
+  /// \return a new, default-constructed Configuration
+  std::shared_ptr<Configuration> build() { return std::make_shared<Configuration>(); }
+
+  /// \return a new Configuration with the given optimisation setting
+  std::shared_ptr<Configuration> build(bool optimize) {
+    return std::make_shared<Configuration>(optimize);
+  }
+
+  /// \return the shared default configuration instance
+  static std::shared_ptr<Configuration> DefaultConfiguration() {
+    return default_configuration_;
+  }
+
+ private:
+  // Creates the instance stored in default_configuration_. std::make_shared is
+  // used instead of a raw `new` so object and control block share one allocation.
+  static std::shared_ptr<Configuration> InitDefaultConfig() {
+    return std::make_shared<Configuration>();
+  }
+
+  static const std::shared_ptr<Configuration> default_configuration_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/context_helper.cc b/src/arrow/cpp/src/gandiva/context_helper.cc
new file mode 100644
index 000000000..224bfd8f5
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/context_helper.cc
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This file is also used in the pre-compiled unit tests, which do not include
+// llvm/engine/..
+#ifndef GANDIVA_UNIT_TEST
+#include "gandiva/exported_funcs.h"
+#include "gandiva/gdv_function_stubs.h"
+
+#include "gandiva/engine.h"
+
+namespace gandiva {
+
+// Registers the three gdv_fn_context_* C functions defined at the end of this
+// file with the engine, each under its own name and with the LLVM return and
+// argument types listed next to it.
+void ExportedContextFunctions::AddMappings(Engine* engine) const {
+ std::vector<llvm::Type*> args;
+ auto types = engine->types();
+
+ // gdv_fn_context_set_error_msg
+ args = {types->i64_type(), // int64_t context_ptr
+ types->i8_ptr_type()}; // char const* err_msg
+
+ engine->AddGlobalMappingForFunc("gdv_fn_context_set_error_msg", types->void_type(),
+ args,
+ reinterpret_cast<void*>(gdv_fn_context_set_error_msg));
+
+ // gdv_fn_context_arena_malloc
+ args = {types->i64_type(), // int64_t context_ptr
+ types->i32_type()}; // int32_t size
+
+ engine->AddGlobalMappingForFunc("gdv_fn_context_arena_malloc", types->i8_ptr_type(),
+ args,
+ reinterpret_cast<void*>(gdv_fn_context_arena_malloc));
+
+ // gdv_fn_context_arena_reset
+ args = {types->i64_type()}; // int64_t context_ptr
+
+ engine->AddGlobalMappingForFunc("gdv_fn_context_arena_reset", types->void_type(), args,
+ reinterpret_cast<void*>(gdv_fn_context_arena_reset));
+}
+
+} // namespace gandiva
+#endif // !GANDIVA_UNIT_TEST
+
+#include "gandiva/execution_context.h"
+
+extern "C" {
+
+// Treats context_ptr as the address of a gandiva::ExecutionContext and records
+// err_msg on it.
+void gdv_fn_context_set_error_msg(int64_t context_ptr, char const* err_msg) {
+  gandiva::ExecutionContext* exec_ctx =
+      reinterpret_cast<gandiva::ExecutionContext*>(context_ptr);
+  exec_ctx->set_error_msg(err_msg);
+}
+
+// Treats context_ptr as the address of a gandiva::ExecutionContext and returns
+// the result of asking that context's arena to allocate `size`.
+uint8_t* gdv_fn_context_arena_malloc(int64_t context_ptr, int32_t size) {
+  gandiva::ExecutionContext* exec_ctx =
+      reinterpret_cast<gandiva::ExecutionContext*>(context_ptr);
+  return exec_ctx->arena()->Allocate(size);
+}
+
+// Treats context_ptr as the address of a gandiva::ExecutionContext and resets
+// that context's arena.
+void gdv_fn_context_arena_reset(int64_t context_ptr) {
+  gandiva::ExecutionContext* exec_ctx =
+      reinterpret_cast<gandiva::ExecutionContext*>(context_ptr);
+  exec_ctx->arena()->Reset();
+}
+}
diff --git a/src/arrow/cpp/src/gandiva/date_utils.cc b/src/arrow/cpp/src/gandiva/date_utils.cc
new file mode 100644
index 000000000..f0a80d3c9
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/date_utils.cc
@@ -0,0 +1,232 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <vector>
+
+#include "gandiva/date_utils.h"
+
+namespace gandiva {
+
+/// Returns every key of sql_date_format_to_boost_map_ that contains `pattern`
+/// (compared case-insensitively) as a substring.
+///
+/// \param pattern token text to look for; lower-cased locally before matching
+/// \param exactMatch when true, only keys whose length equals the pattern's
+///        length are returned (i.e. keys equal to the lower-cased pattern)
+/// \return the matching keys, in the map's iteration order
+std::vector<std::string> DateUtils::GetMatches(std::string pattern, bool exactMatch) {
+  // we are case insensitive. tolower() must be given a value representable as
+  // unsigned char: passing a negative plain char (any non-ASCII byte) is undefined
+  // behaviour, so convert through unsigned char first.
+  std::transform(pattern.begin(), pattern.end(), pattern.begin(),
+                 [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+
+  std::vector<std::string> matches;
+  for (const auto& entry : sql_date_format_to_boost_map_) {
+    const std::string& key = entry.first;
+    if (exactMatch && key.length() != pattern.length()) {
+      continue;
+    }
+    if (key.find(pattern) != std::string::npos) {
+      matches.push_back(key);
+    }
+  }
+
+  return matches;
+}
+
+/// Returns all known tokens that contain `pattern`, regardless of their length.
+std::vector<std::string> DateUtils::GetPotentialMatches(const std::string& pattern) {
+  const bool exact_only = false;
+  return GetMatches(pattern, exact_only);
+}
+
+/// Returns the known tokens that contain `pattern` and have the same length as it.
+std::vector<std::string> DateUtils::GetExactMatches(const std::string& pattern) {
+  const bool exact_only = true;
+  return GetMatches(pattern, exact_only);
+}
+
+/**
+ * Validates a SQL-style date format string and converts it to the internal
+ * pattern built from the entries of sql_date_format_to_boost_map_.
+ *
+ * The format is scanned left to right:
+ *  - text between double quotes is copied through wrapped in single quotes;
+ *  - the characters "*-/,.;: " are copied verbatim when they appear between tokens;
+ *  - everything else is accumulated until it identifies a known token (matched
+ *    case-insensitively), which is then replaced by its mapped pattern.
+ *
+ * \param[in] format the SQL-style date format
+ * \param[out] internal_format receives the converted pattern on success
+ * \return Status::OK() on success; Status::Invalid if the format contains an
+ *         unknown token, a quote opened in the middle of a token, or a quoted
+ *         block that is never closed
+ */
+Status DateUtils::ToInternalFormat(const std::string& format,
+                                   std::shared_ptr<std::string>* internal_format) {
+  // Characters passed through unchanged (only outside quotes, and only when no
+  // token is being accumulated). Built once instead of on every iteration.
+  static const std::string kPassThroughChars = "*-/,.;: ";
+
+  // Lower-cases a copy of the text. tolower() requires a value representable as
+  // unsigned char; feeding it a negative plain char is undefined behaviour.
+  auto to_lower = [](std::string text) {
+    std::transform(text.begin(), text.end(), text.begin(),
+                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+    return text;
+  };
+
+  std::stringstream builder;
+  std::string buffer;
+  bool is_in_quoted_text = false;
+
+  for (size_t i = 0; i < format.size(); i++) {
+    const char current_char = format[i];
+
+    // logic before we append to the buffer
+    if (current_char == '"') {
+      if (is_in_quoted_text) {
+        // we are done with a quoted block; use ' for quoting in the output
+        is_in_quoted_text = false;
+        builder << '\'' << buffer << '\'';
+        buffer.clear();
+      } else {
+        // a quote may not start in the middle of a partially read token
+        ARROW_RETURN_IF(!buffer.empty(),
+                        Status::Invalid("Invalid date format string '", format, "'"));
+        is_in_quoted_text = true;
+      }
+      continue;
+    }
+
+    // handle special characters we want to simply pass through, but only if not in
+    // quoted text and the buffer is empty
+    if (!is_in_quoted_text && buffer.empty() &&
+        kPassThroughChars.find(current_char) != std::string::npos) {
+      builder << current_char;
+      continue;
+    }
+
+    // append to the buffer
+    buffer.push_back(current_char);
+
+    // nothing else to do if we are in quoted text
+    if (is_in_quoted_text) {
+      continue;
+    }
+
+    // check how many matches we have for our buffer
+    const std::vector<std::string> potentials = GetPotentialMatches(buffer);
+    if (potentials.empty()) {
+      return Status::Invalid("Invalid date format string '", format, "'");
+    }
+
+    // one potential and the length matches: we have a match!
+    if (potentials.size() == 1 && potentials[0].length() == buffer.length()) {
+      builder << sql_date_format_to_boost_map_.at(potentials[0]);
+      buffer.clear();
+      continue;
+    }
+
+    // Some patterns (like MON, MONTH) can cause ambiguity, such as "MON:". "MON"
+    // will have two potential matches, but "MON:" will match nothing, so we want to
+    // look ahead when we match "MON" and check if adding the next char leads to 0
+    // potentials. If it does, we go ahead and treat the buffer as matched (if a
+    // potential match exists that matches the buffer).
+    if (i + 1 >= format.size()) {
+      // no next character to look at; the check after the loop handles the buffer
+      continue;
+    }
+
+    const std::string look_ahead = to_lower(buffer + format[i + 1]);
+    const bool look_ahead_matched =
+        std::any_of(potentials.begin(), potentials.end(),
+                    [&look_ahead](const std::string& potential) {
+                      return potential.find(look_ahead) != std::string::npos;
+                    });
+    if (look_ahead_matched) {
+      continue;
+    }
+
+    // check if any of the potential matches are the same length as our buffer, we
+    // do not want to match "MO:". A potential contains the lower-cased buffer, so
+    // one of equal length is exactly the lower-cased buffer.
+    const auto same_length =
+        std::find_if(potentials.begin(), potentials.end(),
+                     [&buffer](const std::string& potential) {
+                       return potential.length() == buffer.length();
+                     });
+    if (same_length != potentials.end()) {
+      builder << sql_date_format_to_boost_map_.at(*same_length);
+      buffer.clear();
+    }
+  }
+
+  // A quoted block that was opened but never closed used to fall through to the
+  // token check below, where its contents were either dropped or re-interpreted as
+  // a date token. Reject it explicitly instead.
+  if (is_in_quoted_text) {
+    return Status::Invalid("Invalid date format string '", format,
+                           "': unterminated quoted text");
+  }
+
+  if (!buffer.empty()) {
+    // Some patterns (like MON, MONTH) can cause us to reach this point with a valid
+    // buffer value as MON has 2 valid potential matches, so double check here
+    const std::vector<std::string> exact_matches = GetExactMatches(buffer);
+    if (exact_matches.size() == 1 && exact_matches[0].length() == buffer.length()) {
+      builder << sql_date_format_to_boost_map_.at(exact_matches[0]);
+    } else {
+      // Format partially parsed
+      const int64_t pos = static_cast<int64_t>(format.length() - buffer.length());
+      return Status::Invalid("Invalid date format string '", format, "' at position ",
+                             pos);
+    }
+  }
+
+  internal_format->reset(new std::string(builder.str()));
+  return Status::OK();
+}
+
+// Token table shared by all DateUtils lookups; filled once by InitMap().
+DateUtils::date_format_converter DateUtils::sql_date_format_to_boost_map_ = InitMap();
+
+/// Builds the table that maps lower-case SQL date-format tokens to the pattern
+/// text emitted for them by ToInternalFormat().
+DateUtils::date_format_converter DateUtils::InitMap() {
+  // {sql token, internal pattern}, inserted in the order listed.
+  const char* const kTokenTable[][2] = {
+      // Era
+      {"ad", "%EC"},
+      {"bc", "%EC"},
+      // Meridian
+      {"am", "%p"},
+      {"pm", "%p"},
+      // Century
+      {"cc", "%C"},
+      // Week of year
+      {"ww", "%W"},
+      // Day of week
+      {"d", "%u"},
+      // Day name of week
+      {"dy", "%a"},
+      {"day", "%a"},
+      // Year
+      {"yyyy", "%Y"},
+      {"yy", "%y"},
+      // Day of year
+      {"ddd", "%j"},
+      // Month
+      {"mm", "%m"},
+      {"mon", "%b"},
+      {"month", "%b"},
+      // Day of month
+      {"dd", "%d"},
+      // Hour of day
+      {"hh", "%I"},
+      {"hh12", "%I"},
+      {"hh24", "%H"},
+      // Minutes
+      {"mi", "%M"},
+      // Seconds
+      {"ss", "%S"},
+      // Milliseconds
+      {"f", "S"},
+      {"ff", "SS"},
+      {"fff", "SSS"},
+      // Timezone is not tested/supported yet fully, so these stay unmapped:
+      //   "tzd" -> "%Z", "tzo" -> "%z", "tzh:tzm" -> "%z"
+  };
+
+  date_format_converter map;
+  for (const auto& row : kTokenTable) {
+    map[row[0]] = row[1];
+  }
+  return map;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/date_utils.h b/src/arrow/cpp/src/gandiva/date_utils.h
new file mode 100644
index 000000000..0d39a5f29
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/date_utils.h
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/util/macros.h"
+
+#include "gandiva/arrow.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// \brief Utility class for converting sql date patterns to internal date patterns.
+class GANDIVA_EXPORT DateUtils {
+ public:
+  /// \brief Validate `format` and translate it to the internal pattern.
+  ///
+  /// \param[in] format the sql date pattern
+  /// \param[out] internal_format set to the translated pattern on success
+  /// \return a non-OK Status when the pattern cannot be translated
+  static Status ToInternalFormat(const std::string& format,
+                                 std::shared_ptr<std::string>* internal_format);
+
+ private:
+  // Lookup table type: sql date token -> internal pattern text.
+  using date_format_converter = std::unordered_map<std::string, std::string>;
+
+  // Builds the contents of sql_date_format_to_boost_map_.
+  static date_format_converter InitMap();
+
+  // Returns the known tokens containing `pattern` (case-insensitive); with
+  // exactMatch set, only tokens of the same length as `pattern`.
+  static std::vector<std::string> GetMatches(std::string pattern, bool exactMatch);
+
+  // GetMatches() without the length restriction.
+  static std::vector<std::string> GetPotentialMatches(const std::string& pattern);
+
+  // GetMatches() restricted to tokens of the same length as `pattern`.
+  static std::vector<std::string> GetExactMatches(const std::string& pattern);
+
+  // The token table itself, initialised from InitMap().
+  static date_format_converter sql_date_format_to_boost_map_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/decimal_ir.cc b/src/arrow/cpp/src/gandiva/decimal_ir.cc
new file mode 100644
index 000000000..5d5d30b4a
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/decimal_ir.cc
@@ -0,0 +1,559 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <sstream>
+#include <unordered_set>
+#include <utility>
+
+#include "arrow/status.h"
+#include "gandiva/decimal_ir.h"
+#include "gandiva/decimal_type_util.h"
+
+// Algorithms adapted from Apache Impala
+
+namespace gandiva {
+
+#define ADD_TRACE_32(msg, value) \
+ if (enable_ir_traces_) { \
+ AddTrace32(msg, value); \
+ }
+#define ADD_TRACE_128(msg, value) \
+ if (enable_ir_traces_) { \
+ AddTrace128(msg, value); \
+ }
+
+// These are the functions defined in this file. The rest are in the precompiled
+// folder, and the i128 needs to be disassembled for those.
+// The pointers themselves are const so the names cannot be re-pointed by accident.
+static const char* const kAddFunction = "add_decimal128_decimal128";
+static const char* const kSubtractFunction = "subtract_decimal128_decimal128";
+static const char* const kEQFunction = "equal_decimal128_decimal128";
+static const char* const kNEFunction = "not_equal_decimal128_decimal128";
+static const char* const kLTFunction = "less_than_decimal128_decimal128";
+static const char* const kLEFunction = "less_than_or_equal_to_decimal128_decimal128";
+static const char* const kGTFunction = "greater_than_decimal128_decimal128";
+static const char* const kGEFunction = "greater_than_or_equal_to_decimal128_decimal128";
+
+// Names of the functions emitted with the IR builder; CallDecimalFunction uses this
+// set to decide whether i128 arguments can be passed through unchanged.
+static const std::unordered_set<std::string> kDecimalIRBuilderFunctions{
+    kAddFunction, kSubtractFunction, kEQFunction, kNEFunction,
+    kLTFunction,  kLEFunction,       kGTFunction, kGEFunction};
+
+// Name of the module-level constant array of scale multipliers that AddGlobals()
+// creates and GetScaleMultiplier() looks up.
+const char* DecimalIR::kScaleMultipliersName = "gandivaScaleMultipliers";
+
+/// Populate globals required by decimal IR: a constant i128 array holding the
+/// powers of ten [1, 10, 100, ...] with kMaxPrecision + 1 entries, registered in the
+/// engine's module under kScaleMultipliersName.
+/// TODO: can this be done just once ?
+void DecimalIR::AddGlobals(Engine* engine) {
+  auto types = engine->types();
+  const int num_multipliers = DecimalTypeUtil::kMaxPrecision + 1;
+
+  // Each power of ten is parsed from its decimal text form, growing by one '0'
+  // per step.
+  std::vector<llvm::Constant*> powers_of_ten;
+  std::string digits = "1";
+  for (int exponent = 0; exponent < num_multipliers; ++exponent) {
+    powers_of_ten.push_back(
+        llvm::ConstantInt::get(llvm::Type::getInt128Ty(*engine->context()), digits, 10));
+    digits += '0';
+  }
+
+  auto table_type = llvm::ArrayType::get(types->i128_type(), num_multipliers);
+  auto table_init = llvm::ConstantArray::get(
+      table_type, llvm::ArrayRef<llvm::Constant*>(powers_of_ten));
+
+  // The module takes the global at construction; only the alignment is set here.
+  auto table_global = new llvm::GlobalVariable(
+      *engine->module(), table_type, true /*constant*/,
+      llvm::GlobalValue::LinkOnceAnyLinkage, table_init, kScaleMultipliersName);
+  table_global->setAlignment(LLVM_ALIGN(16));
+}
+
+// Look up the i128 overloads of the signed add/multiply-with-overflow intrinsics
+// and remember the struct type they return.
+void DecimalIR::InitializeIntrinsics() {
+  auto declare_for_i128 = [this](llvm::Intrinsic::ID id) {
+    return llvm::Intrinsic::getDeclaration(module(), id, types()->i128_type());
+  };
+
+  sadd_with_overflow_fn_ = declare_for_i128(llvm::Intrinsic::sadd_with_overflow);
+  DCHECK_NE(sadd_with_overflow_fn_, nullptr);
+
+  smul_with_overflow_fn_ = declare_for_i128(llvm::Intrinsic::smul_with_overflow);
+  DCHECK_NE(smul_with_overflow_fn_, nullptr);
+
+  // The struct type is taken from the add intrinsic's return type.
+  auto sadd_signature = sadd_with_overflow_fn_->getFunctionType();
+  i128_with_overflow_struct_type_ = sadd_signature->getReturnType();
+}
+
+// CPP: return kScaleMultipliers[scale]
+// Emits a GEP into the global multiplier array followed by a load.
+llvm::Value* DecimalIR::GetScaleMultiplier(llvm::Value* scale) {
+  auto multipliers = module()->getGlobalVariable(kScaleMultipliersName);
+  auto first_index = types()->i32_constant(0);
+  auto element_ptr = CreateGEP(ir_builder(), multipliers, {first_index, scale});
+  return CreateLoad(ir_builder(), element_ptr);
+}
+
+// CPP: x <= y ? y : x
+// Emits a signed compare and a select that picks the larger of the two scales.
+llvm::Value* DecimalIR::GetHigherScale(llvm::Value* x_scale, llvm::Value* y_scale) {
+  auto builder = ir_builder();
+  llvm::Value* x_le_y = builder->CreateICmpSLE(x_scale, y_scale);
+  return builder->CreateSelect(x_le_y, y_scale, x_scale);
+}
+
+// CPP: return (increase_scale_by <= 0) ?
+//          in_value : in_value * GetScaleMultiplier(increase_scale_by)
+//
+// No overflow check is made on the multiplication.
+llvm::Value* DecimalIR::IncreaseScale(llvm::Value* in_value,
+                                      llvm::Value* increase_scale_by) {
+  llvm::Value* nothing_to_do =
+      ir_builder()->CreateICmpSLE(increase_scale_by, types()->i32_constant(0));
+
+  // increase_scale_by <= 0 : value is passed through untouched
+  auto pass_through = [&] { return in_value; };
+
+  // increase_scale_by > 0 : multiply by the matching power of ten
+  auto scale_up = [&] {
+    llvm::Value* power_of_ten = GetScaleMultiplier(increase_scale_by);
+    return ir_builder()->CreateMul(in_value, power_of_ten);
+  };
+
+  return BuildIfElse(nothing_to_do, types()->i128_type(), pass_through, scale_up);
+}
+
+// CPP: return (increase_scale_by <= 0) ?
+//          {in_value, false} :
+//          {in_value * GetScaleMultiplier(increase_scale_by), <multiply overflowed>}
+//
+// The returned pair carries the scaled value together with the overflow bit
+// reported by the smul-with-overflow intrinsic (false when no scaling was needed).
+DecimalIR::ValueWithOverflow DecimalIR::IncreaseScaleWithOverflowCheck(
+    llvm::Value* in_value, llvm::Value* increase_scale_by) {
+  llvm::Value* nothing_to_do =
+      ir_builder()->CreateICmpSLE(increase_scale_by, types()->i32_constant(0));
+
+  // increase_scale_by <= 0 : {in_value, false} packed as the intrinsic's struct type
+  auto pass_through = [&] {
+    ValueWithOverflow unchanged{in_value, types()->false_constant()};
+    return unchanged.AsStruct(this);
+  };
+
+  // increase_scale_by > 0 : overflow-checked multiply by the power of ten
+  auto scale_up = [&] {
+    llvm::Value* power_of_ten = GetScaleMultiplier(increase_scale_by);
+    return ir_builder()->CreateCall(smul_with_overflow_fn_, {in_value, power_of_ten});
+  };
+
+  auto packed = BuildIfElse(nothing_to_do, i128_with_overflow_struct_type_,
+                            pass_through, scale_up);
+  return ValueWithOverflow::MakeFromStruct(this, packed);
+}
+
+// CPP: return (reduce_scale_by <= 0) ?
+//          in_value : in_value / GetScaleMultiplier(reduce_scale_by)
+//
+// ReduceScale cannot cause an overflow.
+llvm::Value* DecimalIR::ReduceScale(llvm::Value* in_value, llvm::Value* reduce_scale_by) {
+  auto nothing_to_do =
+      ir_builder()->CreateICmpSLE(reduce_scale_by, types()->i32_constant(0));
+
+  // reduce_scale_by <= 0 : value is passed through untouched
+  auto pass_through = [&] { return in_value; };
+
+  // reduce_scale_by > 0 : signed division by the matching power of ten
+  auto scale_down = [&] {
+    // TODO : handle rounding.
+    llvm::Value* power_of_ten = GetScaleMultiplier(reduce_scale_by);
+    return ir_builder()->CreateSDiv(in_value, power_of_ten);
+  };
+
+  return BuildIfElse(nothing_to_do, types()->i128_type(), pass_through, scale_down);
+}
+
+/// @brief Fast-path for add
+/// Bring x and y to the larger of their two scales and add them, with no overflow
+/// checks on either the rescaling or the addition.
+llvm::Value* DecimalIR::AddFastPath(const ValueFull& x, const ValueFull& y) {
+  auto common_scale = GetHigherScale(x.scale(), y.scale());
+  ADD_TRACE_32("AddFastPath : higher_scale", common_scale);
+
+  // CPP : x_scaled = IncreaseScale(x_value, higher_scale - x_scale)
+  auto x_scale_gap = ir_builder()->CreateSub(common_scale, x.scale());
+  auto x_rescaled = IncreaseScale(x.value(), x_scale_gap);
+  ADD_TRACE_128("AddFastPath : x_scaled", x_rescaled);
+
+  // CPP : y_scaled = IncreaseScale(y_value, higher_scale - y_scale)
+  auto y_scale_gap = ir_builder()->CreateSub(common_scale, y.scale());
+  auto y_rescaled = IncreaseScale(y.value(), y_scale_gap);
+  ADD_TRACE_128("AddFastPath : y_scaled", y_rescaled);
+
+  // CPP : return x_scaled + y_scaled
+  auto total = ir_builder()->CreateAdd(x_rescaled, y_rescaled);
+  ADD_TRACE_128("AddFastPath : sum", total);
+  return total;
+}
+
+/// @brief Add with overflow check.
+/// Bring x and y to the larger of their two scales, add them, and reduce the sum to
+/// the output scale. The returned overflow bit is the OR of the bits from both
+/// rescalings and from the addition; when it is set, the value is 0.
+DecimalIR::ValueWithOverflow DecimalIR::AddWithOverflowCheck(const ValueFull& x,
+                                                             const ValueFull& y,
+                                                             const ValueFull& out) {
+  auto common_scale = GetHigherScale(x.scale(), y.scale());
+  ADD_TRACE_32("AddWithOverflowCheck : higher_scale", common_scale);
+
+  // CPP : x_scaled = IncreaseScale(x_value, higher_scale - x.scale())
+  auto x_scale_gap = ir_builder()->CreateSub(common_scale, x.scale());
+  auto x_rescaled = IncreaseScaleWithOverflowCheck(x.value(), x_scale_gap);
+  ADD_TRACE_128("AddWithOverflowCheck : x_scaled", x_rescaled.value());
+
+  // CPP : y_scaled = IncreaseScale(y_value, higher_scale - y_scale)
+  auto y_scale_gap = ir_builder()->CreateSub(common_scale, y.scale());
+  auto y_rescaled = IncreaseScaleWithOverflowCheck(y.value(), y_scale_gap);
+  ADD_TRACE_128("AddWithOverflowCheck : y_scaled", y_rescaled.value());
+
+  // CPP : sum = x_scaled + y_scaled
+  auto packed_sum = ir_builder()->CreateCall(sadd_with_overflow_fn_,
+                                             {x_rescaled.value(), y_rescaled.value()});
+  auto total = ValueWithOverflow::MakeFromStruct(this, packed_sum);
+  ADD_TRACE_128("AddWithOverflowCheck : sum", total.value());
+
+  // CPP : overflow ? 0 : sum / GetScaleMultiplier(max_scale - out_scale)
+  auto any_overflow = GetCombinedOverflow({x_rescaled, y_rescaled, total});
+  ADD_TRACE_32("AddWithOverflowCheck : overflow", any_overflow);
+
+  // if there is an overflow, the value returned won't be used. so, save the division.
+  auto on_overflow = [&] { return types()->i128_constant(0); };
+  auto on_success = [&] {
+    auto scale_excess = ir_builder()->CreateSub(common_scale, out.scale());
+    return ReduceScale(total.value(), scale_excess);
+  };
+  auto descaled_sum =
+      BuildIfElse(any_overflow, types()->i128_type(), on_overflow, on_success);
+  return ValueWithOverflow(descaled_sum, any_overflow);
+}
+
+// This is pretty complex, so use CPP fns.
+// Splits both operands into (high, low) i64 halves, calls
+// add_large_decimal128_decimal128 with two i64 out-slots, and re-joins the halves
+// it wrote into a single i128.
+llvm::Value* DecimalIR::AddLarge(const ValueFull& x, const ValueFull& y,
+                                 const ValueFull& out) {
+  // Out-slots for the result halves, allocated in the current insert block.
+  auto current_block = ir_builder()->GetInsertBlock();
+  auto result_high_slot =
+      new llvm::AllocaInst(types()->i64_type(), 0, "out_hi", current_block);
+  auto result_low_slot =
+      new llvm::AllocaInst(types()->i64_type(), 0, "out_low", current_block);
+
+  auto x_halves = ValueSplit::MakeFromInt128(this, x.value());
+  auto y_halves = ValueSplit::MakeFromInt128(this, y.value());
+
+  std::vector<llvm::Value*> call_args = {
+      x_halves.high(), x_halves.low(), x.precision(),    x.scale(),
+      y_halves.high(), y_halves.low(), y.precision(),    y.scale(),
+      out.precision(), out.scale(),    result_high_slot, result_low_slot,
+  };
+  auto add_large_fn = module()->getFunction("add_large_decimal128_decimal128");
+  ir_builder()->CreateCall(add_large_fn, call_args);
+
+  auto result_high = CreateLoad(ir_builder(), result_high_slot);
+  auto result_low = CreateLoad(ir_builder(), result_low_slot);
+  auto total = ValueSplit(result_high, result_low).AsInt128(this);
+  ADD_TRACE_128("AddLarge : sum", total);
+  return total;
+}
+
+/// Emits the IR function add_decimal128_decimal128.
+///
+/// The output scale/precision cannot be arbitrary values. The algo here depends on
+/// them to be the same as computed in DecimalTypeSql.
+/// TODO: enforce this.
+Status DecimalIR::BuildAdd() {
+  // Function prototype :
+  // int128_t
+  // add_decimal128_decimal128(int128_t x_value, int32_t x_precision, int32_t x_scale,
+  //                           int128_t y_value, int32_t y_precision, int32_t y_scale
+  //                           int32_t out_precision, int32_t out_scale)
+  auto i128 = types()->i128_type();
+  auto i32 = types()->i32_type();
+  auto add_fn = BuildFunction(kAddFunction, i128,
+                              {
+                                  {"x_value", i128},
+                                  {"x_precision", i32},
+                                  {"x_scale", i32},
+                                  {"y_value", i128},
+                                  {"y_precision", i32},
+                                  {"y_scale", i32},
+                                  {"out_precision", i32},
+                                  {"out_scale", i32},
+                              });
+
+  // Wrap the incoming arguments; `out` has no value, only precision and scale.
+  auto fn_args = add_fn->arg_begin();
+  ValueFull x(&fn_args[0], &fn_args[1], &fn_args[2]);
+  ValueFull y(&fn_args[3], &fn_args[4], &fn_args[5]);
+  ValueFull out(nullptr, &fn_args[6], &fn_args[7]);
+
+  auto entry_block = llvm::BasicBlock::Create(*context(), "entry", add_fn);
+  ir_builder()->SetInsertPoint(entry_block);
+
+  // CPP :
+  // if (out_precision < kMaxPrecision) {
+  //   return AddFastPath(x, y)
+  // } else {
+  //   ret = AddWithOverflowCheck(x, y)
+  //   if (ret.overflow)
+  //     return AddLarge(x, y)
+  //   else
+  //     return ret.value;
+  // }
+  llvm::Value* below_max_precision = ir_builder()->CreateICmpSLT(
+      out.precision(), types()->i32_constant(DecimalTypeUtil::kMaxPrecision));
+
+  auto fast_path = [&] { return AddFastPath(x, y); };
+
+  auto slow_path = [&] {
+    if (kUseOverflowIntrinsics) {
+      // do the add and check if there was overflow
+      auto checked = AddWithOverflowCheck(x, y, out);
+
+      // if there is an overflow, switch to the AddLarge codepath.
+      return BuildIfElse(
+          checked.overflow(), types()->i128_type(), [&] { return AddLarge(x, y, out); },
+          [&] { return checked.value(); });
+    } else {
+      return AddLarge(x, y, out);
+    }
+  };
+
+  auto total =
+      BuildIfElse(below_max_precision, types()->i128_type(), fast_path, slow_path);
+
+  // the sum is the function's return value
+  ir_builder()->CreateRet(total);
+  return Status::OK();
+}
+
+/// Emits the IR function subtract_decimal128_decimal128, implemented as a call to
+/// the add function with y_value negated.
+Status DecimalIR::BuildSubtract() {
+  // Function prototype :
+  // int128_t
+  // subtract_decimal128_decimal128(int128_t x_value, int32_t x_precision,
+  //                                int32_t x_scale,
+  //                                int128_t y_value, int32_t y_precision,
+  //                                int32_t y_scale,
+  //                                int32_t out_precision, int32_t out_scale)
+  auto i128 = types()->i128_type();
+  auto i32 = types()->i32_type();
+  auto subtract_fn = BuildFunction(kSubtractFunction, i128,
+                                   {
+                                       {"x_value", i128},
+                                       {"x_precision", i32},
+                                       {"x_scale", i32},
+                                       {"y_value", i128},
+                                       {"y_precision", i32},
+                                       {"y_scale", i32},
+                                       {"out_precision", i32},
+                                       {"out_scale", i32},
+                                   });
+
+  auto entry_block = llvm::BasicBlock::Create(*context(), "entry", subtract_fn);
+  ir_builder()->SetInsertPoint(entry_block);
+
+  // reuse add function after negating y_value. i.e
+  //   add(x_value, x_precision, x_scale, -y_value, y_precision, y_scale,
+  //       out_precision, out_scale)
+  const int kYValuePosition = 3;
+  std::vector<llvm::Value*> forwarded_args;
+  int position = 0;
+  for (auto& incoming : subtract_fn->args()) {
+    llvm::Value* forwarded = &incoming;
+    if (position == kYValuePosition) {
+      forwarded = ir_builder()->CreateNeg(&incoming);
+    }
+    forwarded_args.push_back(forwarded);
+    ++position;
+  }
+  auto difference =
+      ir_builder()->CreateCall(module()->getFunction(kAddFunction), forwarded_args);
+
+  // the difference is the function's return value
+  ir_builder()->CreateRet(difference);
+  return Status::OK();
+}
+
+/// Emits an IR comparison function named `function_name`. It calls
+/// compare_decimal128_decimal128_internal and applies `cmp_instruction` to that
+/// call's i32 result against 0.
+Status DecimalIR::BuildCompare(const std::string& function_name,
+                               llvm::ICmpInst::Predicate cmp_instruction) {
+  // Function prototype :
+  // bool
+  // function_name(int128_t x_value, int32_t x_precision, int32_t x_scale,
+  //               int128_t y_value, int32_t y_precision, int32_t y_scale)
+  auto i128 = types()->i128_type();
+  auto i32 = types()->i32_type();
+  auto compare_fn = BuildFunction(function_name, types()->i1_type(),
+                                  {
+                                      {"x_value", i128},
+                                      {"x_precision", i32},
+                                      {"x_scale", i32},
+                                      {"y_value", i128},
+                                      {"y_precision", i32},
+                                      {"y_scale", i32},
+                                  });
+
+  auto fn_args = compare_fn->arg_begin();
+  ValueFull x(&fn_args[0], &fn_args[1], &fn_args[2]);
+  ValueFull y(&fn_args[3], &fn_args[4], &fn_args[5]);
+
+  auto entry_block = llvm::BasicBlock::Create(*context(), "entry", compare_fn);
+  ir_builder()->SetInsertPoint(entry_block);
+
+  // The pre-compiled IR function takes each i128 as a (high, low) pair of i64s.
+  auto x_halves = ValueSplit::MakeFromInt128(this, x.value());
+  auto y_halves = ValueSplit::MakeFromInt128(this, y.value());
+
+  std::vector<llvm::Value*> call_args = {
+      x_halves.high(), x_halves.low(), x.precision(), x.scale(),
+      y_halves.high(), y_halves.low(), y.precision(), y.scale(),
+  };
+  auto three_way = ir_builder()->CreateCall(
+      module()->getFunction("compare_decimal128_decimal128_internal"), call_args);
+
+  // Compare the call's result against zero with the requested predicate.
+  auto verdict =
+      ir_builder()->CreateICmp(cmp_instruction, three_way, types()->i32_constant(0));
+  ir_builder()->CreateRet(verdict);
+  return Status::OK();
+}
+
+/// Emits a call to the decimal function `function_name` and returns its result.
+///
+/// Functions built in this file with the IR builder are called with `params`
+/// unchanged. For every other (pre-compiled) function, each i128 argument is split
+/// into two i64s, and an i128 return value is received through two i64 out-slots
+/// and re-assembled.
+llvm::Value* DecimalIR::CallDecimalFunction(const std::string& function_name,
+                                            llvm::Type* return_type,
+                                            const std::vector<llvm::Value*>& params) {
+  const bool built_with_irbuilder = kDecimalIRBuilderFunctions.count(function_name) != 0;
+  if (built_with_irbuilder) {
+    // this is fn built with the irbuilder.
+    return ir_builder()->CreateCall(module()->getFunction(function_name), params);
+  }
+
+  // pre-compiled fn : disassemble i128 to two i64s and re-assemble.
+  auto i128 = types()->i128_type();
+  auto i64 = types()->i64_type();
+
+  std::vector<llvm::Value*> flattened_args;
+  for (auto& param : params) {
+    if (param->getType() != i128) {
+      flattened_args.push_back(param);
+      continue;
+    }
+    // split i128 arg into two int64s.
+    auto halves = ValueSplit::MakeFromInt128(this, param);
+    flattened_args.push_back(halves.high());
+    flattened_args.push_back(halves.low());
+  }
+
+  if (return_type != i128) {
+    DCHECK_NE(return_type, types()->void_type());
+
+    // Make call to pre-compiled IR function; its result is used directly.
+    return ir_builder()->CreateCall(module()->getFunction(function_name),
+                                    flattened_args);
+  }
+
+  // for i128 ret, replace with two int64* args, and join them.
+  auto current_block = ir_builder()->GetInsertBlock();
+  auto result_high_slot = new llvm::AllocaInst(i64, 0, "out_hi", current_block);
+  auto result_low_slot = new llvm::AllocaInst(i64, 0, "out_low", current_block);
+  flattened_args.push_back(result_high_slot);
+  flattened_args.push_back(result_low_slot);
+
+  // Make call to pre-compiled IR function.
+  ir_builder()->CreateCall(module()->getFunction(function_name), flattened_args);
+
+  auto result_high = CreateLoad(ir_builder(), result_high_slot);
+  auto result_low = CreateLoad(ir_builder(), result_low_slot);
+  return ValueSplit(result_high, result_low).AsInt128(this);
+}
+
+/// Builds every decimal IR function defined in this file (add, subtract and the six
+/// comparisons) and adds them to the engine's module. Stops at the first failure.
+Status DecimalIR::AddFunctions(Engine* engine) {
+  auto ir = std::make_shared<DecimalIR>(engine);
+
+  // Populate global variables used by decimal operations.
+  ir->AddGlobals(engine);
+
+  // Lookup intrinsic functions
+  ir->InitializeIntrinsics();
+
+  // Subtract is emitted after add because it calls the add function.
+  ARROW_RETURN_NOT_OK(ir->BuildAdd());
+  ARROW_RETURN_NOT_OK(ir->BuildSubtract());
+
+  // One comparison function per predicate, all applied to the result of the shared
+  // internal compare routine.
+  struct CompareSpec {
+    const char* name;
+    llvm::ICmpInst::Predicate predicate;
+  };
+  const CompareSpec compare_specs[] = {
+      {kEQFunction, llvm::ICmpInst::ICMP_EQ},  {kNEFunction, llvm::ICmpInst::ICMP_NE},
+      {kLTFunction, llvm::ICmpInst::ICMP_SLT}, {kLEFunction, llvm::ICmpInst::ICMP_SLE},
+      {kGTFunction, llvm::ICmpInst::ICMP_SGT}, {kGEFunction, llvm::ICmpInst::ICMP_SGE},
+  };
+  for (const auto& spec : compare_specs) {
+    ARROW_RETURN_NOT_OK(ir->BuildCompare(spec.name, spec.predicate));
+  }
+  return Status::OK();
+}
+
+// Do a bitwise-or of all the overflow bits, starting from a false constant.
+llvm::Value* DecimalIR::GetCombinedOverflow(
+    std::vector<DecimalIR::ValueWithOverflow> vec) {
+  llvm::Value* combined = types()->false_constant();
+  for (auto it = vec.begin(); it != vec.end(); ++it) {
+    combined = ir_builder()->CreateOr(combined, it->overflow());
+  }
+  return combined;
+}
+
+/// Split an i128 IR value into its upper and lower 64-bit halves.
+DecimalIR::ValueSplit DecimalIR::ValueSplit::MakeFromInt128(DecimalIR* decimal_ir,
+                                                            llvm::Value* in) {
+  auto ir = decimal_ir->ir_builder();
+  auto llvm_types = decimal_ir->types();
+
+  // high half: logical shift right by 64, then truncate to i64
+  auto shifted = ir->CreateLShr(in, llvm_types->i128_constant(64));
+  auto high_half = ir->CreateTrunc(shifted, llvm_types->i64_type());
+
+  // low half: plain truncation to i64
+  auto low_half = ir->CreateTrunc(in, llvm_types->i64_type());
+  return ValueSplit(high_half, low_half);
+}
+
+/// Convert IR struct {%i64, %i64} to cpp class ValueSplit
+/// (element 0 is the high half, element 1 the low half).
+DecimalIR::ValueSplit DecimalIR::ValueSplit::MakeFromStruct(DecimalIR* decimal_ir,
+                                                            llvm::Value* dstruct) {
+  auto ir = decimal_ir->ir_builder();
+  auto high_half = ir->CreateExtractValue(dstruct, 0);
+  auto low_half = ir->CreateExtractValue(dstruct, 1);
+  return DecimalIR::ValueSplit(high_half, low_half);
+}
+
+/// Re-join the two i64 halves into one i128 : (sext(high) << 64) + zext(low)
+llvm::Value* DecimalIR::ValueSplit::AsInt128(DecimalIR* decimal_ir) const {
+  auto ir = decimal_ir->ir_builder();
+  auto llvm_types = decimal_ir->types();
+
+  // The high half is sign-extended; the low half is zero-extended.
+  auto high_wide = ir->CreateSExt(high_, llvm_types->i128_type());
+  auto high_shifted = ir->CreateShl(high_wide, llvm_types->i128_constant(64));
+  auto low_wide = ir->CreateZExt(low_, llvm_types->i128_type());
+  return ir->CreateAdd(high_shifted, low_wide);
+}
+
+/// Convert IR struct {%i128, %i1} to cpp class ValueWithOverflow
+/// (element 0 is the value, element 1 the overflow bit).
+DecimalIR::ValueWithOverflow DecimalIR::ValueWithOverflow::MakeFromStruct(
+    DecimalIR* decimal_ir, llvm::Value* dstruct) {
+  auto ir = decimal_ir->ir_builder();
+  auto result_value = ir->CreateExtractValue(dstruct, 0);
+  auto overflow_bit = ir->CreateExtractValue(dstruct, 1);
+  return DecimalIR::ValueWithOverflow(result_value, overflow_bit);
+}
+
+/// Convert to IR struct {%i128, %i1}, using the struct type recorded from the
+/// add-with-overflow intrinsic.
+llvm::Value* DecimalIR::ValueWithOverflow::AsStruct(DecimalIR* decimal_ir) const {
+  auto ir = decimal_ir->ir_builder();
+
+  // Start from an undef struct and fill in both fields.
+  auto empty_struct = llvm::UndefValue::get(decimal_ir->i128_with_overflow_struct_type_);
+  auto with_value = ir->CreateInsertValue(empty_struct, value(), 0);
+  return ir->CreateInsertValue(with_value, overflow(), 1);
+}
+
+/// debug traces
+/// Emits a call to the module's "printf" function with `fmt` as the format string
+/// followed by `args`. Must only be called when traces are enabled.
+void DecimalIR::AddTrace(const std::string& fmt, std::vector<llvm::Value*> args) {
+  DCHECK(enable_ir_traces_);
+
+  llvm::Value* fmt_ptr = ir_builder()->CreateGlobalStringPtr(fmt);
+
+  // printf argument list: format string first, then the traced values.
+  std::vector<llvm::Value*> printf_args;
+  printf_args.reserve(args.size() + 1);
+  printf_args.push_back(fmt_ptr);
+  printf_args.insert(printf_args.end(), args.begin(), args.end());
+
+  ir_builder()->CreateCall(module()->getFunction("printf"), printf_args, "trace");
+}
+
+/// Trace a single value printed with %d, prefixed by `msg`.
+void DecimalIR::AddTrace32(const std::string& msg, llvm::Value* value) {
+  const std::string trace_fmt = "DECIMAL_IR_TRACE:: " + msg + " %d\n";
+  AddTrace(trace_fmt, {value});
+}
+
+/// Trace an i128 value, prefixed by `msg`. The value is printed as its two 64-bit
+/// halves, first in hex and then in decimal.
+void DecimalIR::AddTrace128(const std::string& msg, llvm::Value* value) {
+  // convert i128 into two i64s for printing
+  auto halves = ValueSplit::MakeFromInt128(this, value);
+  const std::string trace_fmt =
+      "DECIMAL_IR_TRACE:: " + msg + " %llx:%llx (%lld:%llu)\n";
+  AddTrace(trace_fmt, {halves.high(), halves.low(), halves.high(), halves.low()});
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/decimal_ir.h b/src/arrow/cpp/src/gandiva/decimal_ir.h
new file mode 100644
index 000000000..b11730f1e
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/decimal_ir.h
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gandiva/function_ir_builder.h"
+
+namespace gandiva {
+
+/// @brief Decimal IR functions
+class DecimalIR : public FunctionIRBuilder {
+ public:
+ explicit DecimalIR(Engine* engine)
+ : FunctionIRBuilder(engine), enable_ir_traces_(false) {}
+
+ /// Build decimal IR functions and add them to the engine.
+ static Status AddFunctions(Engine* engine);
+
+ void EnableTraces() { enable_ir_traces_ = true; }
+
+ llvm::Value* CallDecimalFunction(const std::string& function_name,
+ llvm::Type* return_type,
+ const std::vector<llvm::Value*>& args);
+
+ private:
+ /// The intrinsic fn for divide with small divisors is about 10x slower, so not
+ /// using these.
+ static const bool kUseOverflowIntrinsics = false;
+
+ // Holder for an i128 value, along with its scale and precision.
+ class ValueFull {
+ public:
+ ValueFull(llvm::Value* value, llvm::Value* precision, llvm::Value* scale)
+ : value_(value), precision_(precision), scale_(scale) {}
+
+ llvm::Value* value() const { return value_; }
+ llvm::Value* precision() const { return precision_; }
+ llvm::Value* scale() const { return scale_; }
+
+ private:
+ llvm::Value* value_;
+ llvm::Value* precision_;
+ llvm::Value* scale_;
+ };
+
+ // Holder for an i128 value, and a boolean indicating overflow.
+ class ValueWithOverflow {
+ public:
+ ValueWithOverflow(llvm::Value* value, llvm::Value* overflow)
+ : value_(value), overflow_(overflow) {}
+
+ // Make from IR struct
+ static ValueWithOverflow MakeFromStruct(DecimalIR* decimal_ir, llvm::Value* dstruct);
+
+ // Build a corresponding IR struct
+ llvm::Value* AsStruct(DecimalIR* decimal_ir) const;
+
+ llvm::Value* value() const { return value_; }
+ llvm::Value* overflow() const { return overflow_; }
+
+ private:
+ llvm::Value* value_;
+ llvm::Value* overflow_;
+ };
+
+ // Holder for an i128 value that is split into two i64s
+ class ValueSplit {
+ public:
+ ValueSplit(llvm::Value* high, llvm::Value* low) : high_(high), low_(low) {}
+
+ // Make from i128 value
+ static ValueSplit MakeFromInt128(DecimalIR* decimal_ir, llvm::Value* in);
+
+ // Make from IR struct
+ static ValueSplit MakeFromStruct(DecimalIR* decimal_ir, llvm::Value* dstruct);
+
+ // Combine the two parts into an i128
+ llvm::Value* AsInt128(DecimalIR* decimal_ir) const;
+
+ llvm::Value* high() const { return high_; }
+ llvm::Value* low() const { return low_; }
+
+ private:
+ llvm::Value* high_;
+ llvm::Value* low_;
+ };
+
+ // Add global variables to the module.
+ static void AddGlobals(Engine* engine);
+
+ // Initialize intrinsic functions that are used by decimal operations.
+ void InitializeIntrinsics();
+
+ // Create IR builder for decimal add function.
+ static Status MakeAdd(Engine* engine, std::shared_ptr<FunctionIRBuilder>* out);
+
+ // Get the multiplier for specified scale (i.e 10^scale)
+ llvm::Value* GetScaleMultiplier(llvm::Value* scale);
+
+ // Get the higher of the two scales
+ llvm::Value* GetHigherScale(llvm::Value* x_scale, llvm::Value* y_scale);
+
+ // Increase scale of 'in_value' by 'increase_scale_by'.
+ // - If 'increase_scale_by' is <= 0, does nothing.
+ llvm::Value* IncreaseScale(llvm::Value* in_value, llvm::Value* increase_scale_by);
+
+ // Similar to IncreaseScale, but also checks whether there is an overflow.
+ ValueWithOverflow IncreaseScaleWithOverflowCheck(llvm::Value* in_value,
+ llvm::Value* increase_scale_by);
+
+ // Reduce scale of 'in_value' by 'reduce_scale_by'.
+ // - If 'reduce_scale_by' is <= 0, does nothing.
+ llvm::Value* ReduceScale(llvm::Value* in_value, llvm::Value* reduce_scale_by);
+
+ // Fast path of add: guaranteed no overflow
+ llvm::Value* AddFastPath(const ValueFull& x, const ValueFull& y);
+
+ // Similar to AddFastPath, but check if there's an overflow.
+ ValueWithOverflow AddWithOverflowCheck(const ValueFull& x, const ValueFull& y,
+ const ValueFull& out);
+
+ // Do addition of large integers (both positive and negative).
+ llvm::Value* AddLarge(const ValueFull& x, const ValueFull& y, const ValueFull& out);
+
+ // Get the combined overflow (logical or).
+ llvm::Value* GetCombinedOverflow(std::vector<ValueWithOverflow> values);
+
+ // Build the function for adding decimals.
+ Status BuildAdd();
+
+ // Build the function for decimal subtraction.
+ Status BuildSubtract();
+
+ // Build the function for decimal multiplication.
+ Status BuildMultiply();
+
+ // Build the function for decimal division/mod.
+ Status BuildDivideOrMod(const std::string& function_name,
+ const std::string& internal_name);
+
+ Status BuildCompare(const std::string& function_name,
+ llvm::ICmpInst::Predicate cmp_instruction);
+
+ Status BuildDecimalFunction(const std::string& function_name, llvm::Type* return_type,
+ std::vector<NamedArg> in_types);
+
+ // Add a trace in IR code.
+ void AddTrace(const std::string& fmt, std::vector<llvm::Value*> args);
+
+ // Add a trace msg along with a 32-bit integer.
+ void AddTrace32(const std::string& msg, llvm::Value* value);
+
+ // Add a trace msg along with a 128-bit integer.
+ void AddTrace128(const std::string& msg, llvm::Value* value);
+
+ // name of the global variable having the array of scale multipliers.
+ static const char* kScaleMultipliersName;
+
+ // Intrinsic functions
+ llvm::Function* sadd_with_overflow_fn_;
+ llvm::Function* smul_with_overflow_fn_;
+
+ // struct { i128: value, i1: overflow}
+ llvm::Type* i128_with_overflow_struct_type_;
+
+ // if set to true, ir traces are enabled. Useful for debugging.
+ bool enable_ir_traces_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/decimal_scalar.h b/src/arrow/cpp/src/gandiva/decimal_scalar.h
new file mode 100644
index 000000000..a03807b35
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/decimal_scalar.h
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include "arrow/util/decimal.h"
+#include "arrow/util/hash_util.h"
+#include "gandiva/basic_decimal_scalar.h"
+
+namespace gandiva {
+
+using Decimal128 = arrow::Decimal128;
+
+/// Represents a 128-bit decimal value along with its precision and scale.
+///
+/// BasicDecimalScalar128 can be safely compiled to IR without references to libstdc++.
+/// This class has additional functionality on top of BasicDecimalScalar128 to deal with
+/// strings and streams.
+class DecimalScalar128 : public BasicDecimalScalar128 {
+ public:
+ using BasicDecimalScalar128::BasicDecimalScalar128;
+
+ DecimalScalar128(const std::string& value, int32_t precision, int32_t scale)
+ : BasicDecimalScalar128(Decimal128(value), precision, scale) {}
+
+ /// \brief constructor creates a DecimalScalar128 from a BasicDecimalScalar128.
+ constexpr DecimalScalar128(const BasicDecimalScalar128& scalar) noexcept
+ : BasicDecimalScalar128(scalar) {}
+
+ inline std::string ToString() const {
+ Decimal128 dvalue(value());
+ return dvalue.ToString(0) + "," + std::to_string(precision()) + "," +
+ std::to_string(scale());
+ }
+
+ friend std::ostream& operator<<(std::ostream& os, const DecimalScalar128& dec) {
+ os << dec.ToString();
+ return os;
+ }
+};
+
+} // namespace gandiva
+
+namespace std {
+template <>
+struct hash<gandiva::DecimalScalar128> {
+ std::size_t operator()(gandiva::DecimalScalar128 const& s) const noexcept {
+ arrow::BasicDecimal128 dvalue(s.value());
+
+ static const int kSeedValue = 4;
+ size_t result = kSeedValue;
+
+ arrow::internal::hash_combine(result, dvalue.high_bits());
+ arrow::internal::hash_combine(result, dvalue.low_bits());
+ arrow::internal::hash_combine(result, s.precision());
+ arrow::internal::hash_combine(result, s.scale());
+ return result;
+ }
+};
+} // namespace std
diff --git a/src/arrow/cpp/src/gandiva/decimal_type_util.cc b/src/arrow/cpp/src/gandiva/decimal_type_util.cc
new file mode 100644
index 000000000..2abc5a21e
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/decimal_type_util.cc
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/decimal_type_util.h"
+#include "arrow/util/logging.h"
+
+namespace gandiva {
+
+constexpr int32_t DecimalTypeUtil::kMinAdjustedScale;
+
+#define DCHECK_TYPE(type) \
+ { \
+ DCHECK_GE(type->scale(), 0); \
+ DCHECK_LE(type->precision(), kMaxPrecision); \
+ }
+
+// Implementation of decimal rules.
+Status DecimalTypeUtil::GetResultType(Op op, const Decimal128TypeVector& in_types,
+ Decimal128TypePtr* out_type) {
+ DCHECK_EQ(in_types.size(), 2);
+
+ *out_type = nullptr;
+ auto t1 = in_types[0];
+ auto t2 = in_types[1];
+ DCHECK_TYPE(t1);
+ DCHECK_TYPE(t2);
+
+ int32_t s1 = t1->scale();
+ int32_t s2 = t2->scale();
+ int32_t p1 = t1->precision();
+ int32_t p2 = t2->precision();
+ int32_t result_scale = 0;
+ int32_t result_precision = 0;
+
+ switch (op) {
+ case kOpAdd:
+ case kOpSubtract:
+ result_scale = std::max(s1, s2);
+ result_precision = std::max(p1 - s1, p2 - s2) + result_scale + 1;
+ break;
+
+ case kOpMultiply:
+ result_scale = s1 + s2;
+ result_precision = p1 + p2 + 1;
+ break;
+
+ case kOpDivide:
+ result_scale = std::max(kMinAdjustedScale, s1 + p2 + 1);
+ result_precision = p1 - s1 + s2 + result_scale;
+ break;
+
+ case kOpMod:
+ result_scale = std::max(s1, s2);
+ result_precision = std::min(p1 - s1, p2 - s2) + result_scale;
+ break;
+ }
+ *out_type = MakeAdjustedType(result_precision, result_scale);
+ return Status::OK();
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/decimal_type_util.h b/src/arrow/cpp/src/gandiva/decimal_type_util.h
new file mode 100644
index 000000000..2b496f6cb
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/decimal_type_util.h
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Adapted from Apache Impala
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+
+#include "gandiva/arrow.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// @brief Handles conversion of scale/precision for operations on decimal types.
+/// TODO : do validations for all of these.
+class GANDIVA_EXPORT DecimalTypeUtil {
+ public:
+ enum Op {
+ kOpAdd,
+ kOpSubtract,
+ kOpMultiply,
+ kOpDivide,
+ kOpMod,
+ };
+
+ /// The maximum precision representable by a 4-byte decimal
+ static constexpr int32_t kMaxDecimal32Precision = 9;
+
+ /// The maximum precision representable by a 8-byte decimal
+ static constexpr int32_t kMaxDecimal64Precision = 18;
+
+ /// The maximum precision representable by a 16-byte decimal
+ static constexpr int32_t kMaxPrecision = 38;
+
+ // The maximum scale representable.
+ static constexpr int32_t kMaxScale = kMaxPrecision;
+
+ // When operating on decimal inputs, the integer part of the output can exceed the
+ // max precision. In such cases, the scale can be reduced, up to a minimum of
+ // kMinAdjustedScale.
+ // * There is no strong reason for 6, but both SQLServer and Impala use 6 too.
+ static constexpr int32_t kMinAdjustedScale = 6;
+
+ // For specified operation and input scale/precision, determine the output
+ // scale/precision.
+ static Status GetResultType(Op op, const Decimal128TypeVector& in_types,
+ Decimal128TypePtr* out_type);
+
+ static Decimal128TypePtr MakeType(int32_t precision, int32_t scale) {
+ return std::dynamic_pointer_cast<arrow::Decimal128Type>(
+ arrow::decimal(precision, scale));
+ }
+
+ private:
+ // Reduce the scale if possible so that precision stays <= kMaxPrecision
+ static Decimal128TypePtr MakeAdjustedType(int32_t precision, int32_t scale) {
+ if (precision > kMaxPrecision) {
+ int32_t min_scale = std::min(scale, kMinAdjustedScale);
+ int32_t delta = precision - kMaxPrecision;
+ precision = kMaxPrecision;
+ scale = std::max(scale - delta, min_scale);
+ }
+ return MakeType(precision, scale);
+ }
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/decimal_type_util_test.cc b/src/arrow/cpp/src/gandiva/decimal_type_util_test.cc
new file mode 100644
index 000000000..98ea0bb16
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/decimal_type_util_test.cc
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Adapted from Apache Impala
+
+#include <gtest/gtest.h>
+
+#include "gandiva/decimal_type_util.h"
+#include "tests/test_util.h"
+
+namespace gandiva {
+
+#define DECIMAL_TYPE(p, s) DecimalTypeUtil::MakeType(p, s)
+
+Decimal128TypePtr DoOp(DecimalTypeUtil::Op op, Decimal128TypePtr d1,
+ Decimal128TypePtr d2) {
+ Decimal128TypePtr ret_type;
+ ARROW_EXPECT_OK(DecimalTypeUtil::GetResultType(op, {d1, d2}, &ret_type));
+ return ret_type;
+}
+
+TEST(DecimalResultTypes, Basic) {
+ EXPECT_ARROW_TYPE_EQUALS(
+ DECIMAL_TYPE(31, 10),
+ DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(30, 10), DECIMAL_TYPE(30, 10)));
+
+ EXPECT_ARROW_TYPE_EQUALS(
+ DECIMAL_TYPE(32, 6),
+ DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(30, 6), DECIMAL_TYPE(30, 5)));
+
+ EXPECT_ARROW_TYPE_EQUALS(
+ DECIMAL_TYPE(38, 9),
+ DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(30, 10), DECIMAL_TYPE(38, 10)));
+
+ EXPECT_ARROW_TYPE_EQUALS(
+ DECIMAL_TYPE(38, 9),
+ DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(38, 10), DECIMAL_TYPE(38, 38)));
+
+ EXPECT_ARROW_TYPE_EQUALS(
+ DECIMAL_TYPE(38, 6),
+ DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(38, 10), DECIMAL_TYPE(38, 2)));
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/decimal_xlarge.cc b/src/arrow/cpp/src/gandiva/decimal_xlarge.cc
new file mode 100644
index 000000000..caebd8b09
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/decimal_xlarge.cc
@@ -0,0 +1,284 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Operations that can deal with very large values (256-bit).
+//
+// The intermediate results with decimal can be larger than what can fit into 128-bit,
+// but the final results can fit in 128-bit after scaling down. These functions deal
+// with operations on the intermediate values.
+//
+
+#include "gandiva/decimal_xlarge.h"
+
+#include <boost/multiprecision/cpp_int.hpp>
+#include <limits>
+#include <vector>
+
+#include "arrow/util/basic_decimal.h"
+#include "arrow/util/logging.h"
+#include "gandiva/decimal_type_util.h"
+
+#ifndef GANDIVA_UNIT_TEST
+#include "gandiva/engine.h"
+#include "gandiva/exported_funcs.h"
+
+namespace gandiva {
+
+void ExportedDecimalFunctions::AddMappings(Engine* engine) const {
+ std::vector<llvm::Type*> args;
+ auto types = engine->types();
+
+ // gdv_xlarge_multiply_and_scale_down
+ args = {types->i64_type(), // int64_t x_high
+ types->i64_type(), // uint64_t x_low
+ types->i64_type(), // int64_t y_high
+ types->i64_type(), // uint64_t y_low
+ types->i32_type(), // int32_t reduce_scale_by
+ types->i64_ptr_type(), // int64_t* out_high
+ types->i64_ptr_type(), // uint64_t* out_low
+ types->i8_ptr_type()}; // bool* overflow
+
+ engine->AddGlobalMappingForFunc(
+ "gdv_xlarge_multiply_and_scale_down", types->void_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_xlarge_multiply_and_scale_down));
+
+ // gdv_xlarge_scale_up_and_divide
+ args = {types->i64_type(), // int64_t x_high
+ types->i64_type(), // uint64_t x_low
+ types->i64_type(), // int64_t y_high
+ types->i64_type(), // uint64_t y_low
+ types->i32_type(), // int32_t increase_scale_by
+ types->i64_ptr_type(), // int64_t* out_high
+ types->i64_ptr_type(), // uint64_t* out_low
+ types->i8_ptr_type()}; // bool* overflow
+
+ engine->AddGlobalMappingForFunc(
+ "gdv_xlarge_scale_up_and_divide", types->void_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_xlarge_scale_up_and_divide));
+
+ // gdv_xlarge_mod
+ args = {types->i64_type(), // int64_t x_high
+ types->i64_type(), // uint64_t x_low
+ types->i32_type(), // int32_t x_scale
+ types->i64_type(), // int64_t y_high
+ types->i64_type(), // uint64_t y_low
+ types->i32_type(), // int32_t y_scale
+ types->i64_ptr_type(), // int64_t* out_high
+ types->i64_ptr_type()}; // uint64_t* out_low
+
+ engine->AddGlobalMappingForFunc("gdv_xlarge_mod", types->void_type() /*return_type*/,
+ args, reinterpret_cast<void*>(gdv_xlarge_mod));
+
+ // gdv_xlarge_compare
+ args = {types->i64_type(), // int64_t x_high
+ types->i64_type(), // uint64_t x_low
+ types->i32_type(), // int32_t x_scale
+ types->i64_type(), // int64_t y_high
+ types->i64_type(), // uint64_t y_low
+ types->i32_type()}; // int32_t y_scale
+
+ engine->AddGlobalMappingForFunc("gdv_xlarge_compare", types->i32_type() /*return_type*/,
+ args, reinterpret_cast<void*>(gdv_xlarge_compare));
+}
+
+} // namespace gandiva
+
+#endif // !GANDIVA_UNIT_TEST
+
+using arrow::BasicDecimal128;
+using boost::multiprecision::int256_t;
+
+namespace gandiva {
+namespace internal {
+
+// Convert to 256-bit integer from 128-bit decimal.
+static int256_t ConvertToInt256(BasicDecimal128 in) {
+ int256_t v = in.high_bits();
+ v <<= 64;
+ v |= in.low_bits();
+ return v;
+}
+
+// Convert to 128-bit decimal from 256-bit integer. On overflow, the output is undefined.
+// NOTE: '*overflow' is only ever set to true here; the caller must initialize it.
+static BasicDecimal128 ConvertToDecimal128(int256_t in, bool* overflow) {
+ BasicDecimal128 result;
+ constexpr int256_t UINT64_MASK = std::numeric_limits<uint64_t>::max();
+
+ int256_t in_abs = abs(in);
+ bool is_negative = in < 0;
+
+ uint64_t low = (in_abs & UINT64_MASK).convert_to<uint64_t>();
+ in_abs >>= 64;
+ uint64_t high = (in_abs & UINT64_MASK).convert_to<uint64_t>();
+ in_abs >>= 64;
+
+ if (in_abs > 0) {
+ // we've shifted in by 128-bit, so nothing should be left.
+ *overflow = true;
+ } else if (high > INT64_MAX) {
+ // the high-bit must not be set (signed 128-bit).
+ *overflow = true;
+ } else {
+ result = BasicDecimal128(static_cast<int64_t>(high), low);
+ if (result > BasicDecimal128::GetMaxValue()) {
+ *overflow = true;
+ }
+ }
+ return is_negative ? -result : result;
+}
+
+static constexpr int32_t kMaxLargeScale = 2 * DecimalTypeUtil::kMaxPrecision;
+
+// Compute the scale multipliers (10^0 .. 10^kMaxLargeScale) once; read-only afterwards.
+static const std::array<int256_t, kMaxLargeScale + 1> kLargeScaleMultipliers =
+ ([]() -> std::array<int256_t, kMaxLargeScale + 1> {
+ std::array<int256_t, kMaxLargeScale + 1> values;
+ values[0] = 1;
+ for (int32_t idx = 1; idx <= kMaxLargeScale; idx++) {
+ values[idx] = values[idx - 1] * 10;
+ }
+ return values;
+ })();
+
+static int256_t GetScaleMultiplier(int scale) {
+ DCHECK_GE(scale, 0);
+ DCHECK_LE(scale, kMaxLargeScale);
+
+ return kLargeScaleMultipliers[scale];
+}
+
+// divide input by 10^reduce_by, rounding half away from zero.
+static int256_t ReduceScaleBy(int256_t in, int32_t reduce_by) {
+ if (reduce_by == 0) {
+ // nothing to do.
+ return in;
+ }
+
+ int256_t divisor = GetScaleMultiplier(reduce_by);
+ DCHECK_GT(divisor, 0);
+ DCHECK_EQ(divisor % 2, 0); // multiple of 10.
+ auto result = in / divisor;
+ auto remainder = in % divisor;
+ // round half away from zero (same as BasicDecimal128::ReduceScaleBy)
+ if (abs(remainder) >= (divisor >> 1)) {
+ result += (in > 0 ? 1 : -1);
+ }
+ return result;
+}
+
+// multiply input by 10^increase_by (0 <= increase_by <= kMaxLargeScale).
+static int256_t IncreaseScaleBy(int256_t in, int32_t increase_by) {
+ DCHECK_GE(increase_by, 0);
+ DCHECK_LE(increase_by, kMaxLargeScale);
+
+ return in * GetScaleMultiplier(increase_by);
+}
+
+} // namespace internal
+} // namespace gandiva
+
+extern "C" {
+
+void gdv_xlarge_multiply_and_scale_down(int64_t x_high, uint64_t x_low, int64_t y_high,
+ uint64_t y_low, int32_t reduce_scale_by,
+ int64_t* out_high, uint64_t* out_low,
+ bool* overflow) {
+ BasicDecimal128 x{x_high, x_low};
+ BasicDecimal128 y{y_high, y_low};
+ auto intermediate_result =
+ gandiva::internal::ConvertToInt256(x) * gandiva::internal::ConvertToInt256(y);
+ intermediate_result =
+ gandiva::internal::ReduceScaleBy(intermediate_result, reduce_scale_by);
+ auto result = gandiva::internal::ConvertToDecimal128(intermediate_result, overflow);
+ *out_high = result.high_bits();
+ *out_low = result.low_bits();
+}
+
+void gdv_xlarge_scale_up_and_divide(int64_t x_high, uint64_t x_low, int64_t y_high,
+ uint64_t y_low, int32_t increase_scale_by,
+ int64_t* out_high, uint64_t* out_low,
+ bool* overflow) {
+ BasicDecimal128 x{x_high, x_low};
+ BasicDecimal128 y{y_high, y_low};
+
+ int256_t x_large = gandiva::internal::ConvertToInt256(x);
+ int256_t x_large_scaled_up =
+ gandiva::internal::IncreaseScaleBy(x_large, increase_scale_by);
+ int256_t y_large = gandiva::internal::ConvertToInt256(y);
+ int256_t result_large = x_large_scaled_up / y_large;
+ int256_t remainder_large = x_large_scaled_up % y_large;
+
+ // Since we are scaling up and then dividing, round the result (+1 for +ve,
+ // -1 for -ve) if 2 * |remainder| >= |divisor|.
+ if (abs(2 * remainder_large) >= abs(y_large)) {
+ // x +ve and y +ve, result is +ve => (1 ^ 1) + 1 = 0 + 1 = +1
+ // x -ve and y +ve, result is -ve => (-1 ^ 1) + 1 = -2 + 1 = -1
+ // x +ve and y -ve, result is -ve => (1 ^ -1) + 1 = -2 + 1 = -1
+ // x -ve and y -ve, result is +ve => (-1 ^ -1) + 1 = 0 + 1 = +1
+ result_large += (x.Sign() ^ y.Sign()) + 1;
+ }
+ auto result = gandiva::internal::ConvertToDecimal128(result_large, overflow);
+ *out_high = result.high_bits();
+ *out_low = result.low_bits();
+}
+
+void gdv_xlarge_mod(int64_t x_high, uint64_t x_low, int32_t x_scale, int64_t y_high,
+ uint64_t y_low, int32_t y_scale, int64_t* out_high,
+ uint64_t* out_low) {
+ BasicDecimal128 x{x_high, x_low};
+ BasicDecimal128 y{y_high, y_low};
+
+ int256_t x_large = gandiva::internal::ConvertToInt256(x);
+ int256_t y_large = gandiva::internal::ConvertToInt256(y);
+ if (x_scale < y_scale) {
+ x_large = gandiva::internal::IncreaseScaleBy(x_large, y_scale - x_scale);
+ } else {
+ y_large = gandiva::internal::IncreaseScaleBy(y_large, x_scale - y_scale);
+ }
+ auto intermediate_result = x_large % y_large;
+ bool overflow = false;
+ auto result = gandiva::internal::ConvertToDecimal128(intermediate_result, &overflow);
+ DCHECK_EQ(overflow, false);
+
+ *out_high = result.high_bits();
+ *out_low = result.low_bits();
+}
+
+int32_t gdv_xlarge_compare(int64_t x_high, uint64_t x_low, int32_t x_scale,
+ int64_t y_high, uint64_t y_low, int32_t y_scale) {
+ BasicDecimal128 x{x_high, x_low};
+ BasicDecimal128 y{y_high, y_low};
+
+ int256_t x_large = gandiva::internal::ConvertToInt256(x);
+ int256_t y_large = gandiva::internal::ConvertToInt256(y);
+ if (x_scale < y_scale) {
+ x_large = gandiva::internal::IncreaseScaleBy(x_large, y_scale - x_scale);
+ } else {
+ y_large = gandiva::internal::IncreaseScaleBy(y_large, x_scale - y_scale);
+ }
+
+ if (x_large == y_large) {
+ return 0;
+ } else if (x_large < y_large) {
+ return -1;
+ } else {
+ return 1;
+ }
+}
+
+} // extern "C"
diff --git a/src/arrow/cpp/src/gandiva/decimal_xlarge.h b/src/arrow/cpp/src/gandiva/decimal_xlarge.h
new file mode 100644
index 000000000..264329775
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/decimal_xlarge.h
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+/// Stub functions to deal with extra large decimals that can be accessed from LLVM-IR
+/// code.
+extern "C" {
+
+void gdv_xlarge_multiply_and_scale_down(int64_t x_high, uint64_t x_low, int64_t y_high,
+ uint64_t y_low, int32_t reduce_scale_by,
+ int64_t* out_high, uint64_t* out_low,
+ bool* overflow);
+
+void gdv_xlarge_scale_up_and_divide(int64_t x_high, uint64_t x_low, int64_t y_high,
+ uint64_t y_low, int32_t increase_scale_by,
+ int64_t* out_high, uint64_t* out_low, bool* overflow);
+
+void gdv_xlarge_mod(int64_t x_high, uint64_t x_low, int32_t x_scale, int64_t y_high,
+ uint64_t y_low, int32_t y_scale, int64_t* out_high,
+ uint64_t* out_low);
+
+int32_t gdv_xlarge_compare(int64_t x_high, uint64_t x_low, int32_t x_scale,
+ int64_t y_high, uint64_t y_low, int32_t y_scale);
+}
diff --git a/src/arrow/cpp/src/gandiva/dex.h b/src/arrow/cpp/src/gandiva/dex.h
new file mode 100644
index 000000000..d1115c051
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/dex.h
@@ -0,0 +1,396 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "gandiva/dex_visitor.h"
+#include "gandiva/field_descriptor.h"
+#include "gandiva/func_descriptor.h"
+#include "gandiva/function_holder.h"
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/in_holder.h"
+#include "gandiva/literal_holder.h"
+#include "gandiva/native_function.h"
+#include "gandiva/value_validity_pair.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// \brief Decomposed expression : the validity and value are separated.
+///
+/// Abstract root of the Dex node hierarchy. Nodes are consumed through a DexVisitor:
+/// each concrete node implements Accept() by calling the visitor's Visit() on itself.
+class GANDIVA_EXPORT Dex {
+ public:
+ /// Derived classes should simply invoke the Visit api of the visitor.
+ virtual void Accept(DexVisitor& visitor) = 0;
+ /// Virtual so that concrete nodes can be destroyed through a Dex pointer.
+ virtual ~Dex() = default;
+};
+
+/// Common base of the Dex nodes that read from an input vector. It keeps the
+/// descriptor of the field being read and forwards name/type/field queries to it.
+class GANDIVA_EXPORT VectorReadBaseDex : public Dex {
+ public:
+  explicit VectorReadBaseDex(FieldDescriptorPtr field_desc)
+      : field_desc_(field_desc) {}
+
+  /// Name reported by the field descriptor.
+  const std::string& FieldName() const {
+    return field_desc_->Name();
+  }
+
+  /// Data type reported by the field descriptor.
+  DataTypePtr FieldType() const {
+    return field_desc_->Type();
+  }
+
+  /// Field object reported by the field descriptor.
+  FieldPtr Field() const {
+    return field_desc_->field();
+  }
+
+ protected:
+  // Descriptor of the field this node reads; accessible to the derived read nodes.
+  FieldDescriptorPtr field_desc_;
+};
+
+/// Dex node for the validity component of a ValueVector.
+class GANDIVA_EXPORT VectorReadValidityDex : public VectorReadBaseDex {
+ public:
+  explicit VectorReadValidityDex(FieldDescriptorPtr field_desc)
+      : VectorReadBaseDex(field_desc) {}
+
+  /// Validity index recorded in the field descriptor.
+  int ValidityIdx() const {
+    return field_desc_->validity_idx();
+  }
+
+  /// Dispatches to the visitor overload for this node type.
+  void Accept(DexVisitor& visitor) override {
+    visitor.Visit(*this);
+  }
+};
+
+/// Dex node for the value component of a fixed-length ValueVector.
+class GANDIVA_EXPORT VectorReadFixedLenValueDex : public VectorReadBaseDex {
+ public:
+  explicit VectorReadFixedLenValueDex(FieldDescriptorPtr field_desc)
+      : VectorReadBaseDex(field_desc) {}
+
+  /// Data index recorded in the field descriptor.
+  int DataIdx() const {
+    return field_desc_->data_idx();
+  }
+
+  /// Dispatches to the visitor overload for this node type.
+  void Accept(DexVisitor& visitor) override {
+    visitor.Visit(*this);
+  }
+};
+
+/// Dex node for the value component of a variable-length ValueVector. In addition
+/// to the data index it exposes the offsets index of the field.
+class GANDIVA_EXPORT VectorReadVarLenValueDex : public VectorReadBaseDex {
+ public:
+  explicit VectorReadVarLenValueDex(FieldDescriptorPtr field_desc)
+      : VectorReadBaseDex(field_desc) {}
+
+  /// Data index recorded in the field descriptor.
+  int DataIdx() const {
+    return field_desc_->data_idx();
+  }
+
+  /// Offsets index recorded in the field descriptor.
+  int OffsetsIdx() const {
+    return field_desc_->offsets_idx();
+  }
+
+  /// Dispatches to the visitor overload for this node type.
+  void Accept(DexVisitor& visitor) override {
+    visitor.Visit(*this);
+  }
+};
+
+/// Validity node backed by a local bitmap, identified by its index.
+class GANDIVA_EXPORT LocalBitMapValidityDex : public Dex {
+ public:
+  explicit LocalBitMapValidityDex(int local_bitmap_idx)
+      : local_bitmap_idx_(local_bitmap_idx) {}
+
+  /// Index of the local bitmap this node refers to.
+  int local_bitmap_idx() const {
+    return local_bitmap_idx_;
+  }
+
+  /// Dispatches to the visitor overload for this node type.
+  void Accept(DexVisitor& visitor) override {
+    visitor.Visit(*this);
+  }
+
+ private:
+  int local_bitmap_idx_;
+};
+
+/// Base of the function-call Dex nodes. Stores the function descriptor, the native
+/// function entry, the function holder and the decomposed arguments. It does not
+/// implement Accept(); the concrete function nodes do.
+class GANDIVA_EXPORT FuncDex : public Dex {
+ public:
+  FuncDex(FuncDescriptorPtr func_descriptor, const NativeFunction* native_function,
+          FunctionHolderPtr function_holder, const ValueValidityPairVector& args)
+      : func_descriptor_(func_descriptor),
+        native_function_(native_function),
+        function_holder_(function_holder),
+        args_(args) {}
+
+  /// Descriptor of the function being called.
+  FuncDescriptorPtr func_descriptor() const {
+    return func_descriptor_;
+  }
+
+  /// Native function entry; the pointer is stored as given and is not owned here.
+  const NativeFunction* native_function() const {
+    return native_function_;
+  }
+
+  /// Holder object passed at construction.
+  FunctionHolderPtr function_holder() const {
+    return function_holder_;
+  }
+
+  /// Decomposed (value, validity) arguments of the call.
+  const ValueValidityPairVector& args() const {
+    return args_;
+  }
+
+ private:
+  FuncDescriptorPtr func_descriptor_;
+  const NativeFunction* native_function_;
+  FunctionHolderPtr function_holder_;
+  ValueValidityPairVector args_;
+};
+
+/// Function node for functions that only see non-null inputs and produce non-null
+/// outputs.
+class GANDIVA_EXPORT NonNullableFuncDex : public FuncDex {
+ public:
+  NonNullableFuncDex(FuncDescriptorPtr func_descriptor,
+                     const NativeFunction* native_function,
+                     FunctionHolderPtr function_holder,
+                     const ValueValidityPairVector& args)
+      : FuncDex(func_descriptor, native_function, function_holder, args) {}
+
+  /// Dispatches to the visitor overload for this node type.
+  void Accept(DexVisitor& visitor) override {
+    visitor.Visit(*this);
+  }
+};
+
+/// Function node for functions that accept nullable inputs but always produce
+/// non-null outputs.
+class GANDIVA_EXPORT NullableNeverFuncDex : public FuncDex {
+ public:
+  NullableNeverFuncDex(FuncDescriptorPtr func_descriptor,
+                       const NativeFunction* native_function,
+                       FunctionHolderPtr function_holder,
+                       const ValueValidityPairVector& args)
+      : FuncDex(func_descriptor, native_function, function_holder, args) {}
+
+  /// Dispatches to the visitor overload for this node type.
+  void Accept(DexVisitor& visitor) override {
+    visitor.Visit(*this);
+  }
+};
+
+/// Function node for functions that accept nullable inputs and may produce nullable
+/// outputs; the result validity is tracked in a local bitmap.
+class GANDIVA_EXPORT NullableInternalFuncDex : public FuncDex {
+ public:
+  NullableInternalFuncDex(FuncDescriptorPtr func_descriptor,
+                          const NativeFunction* native_function,
+                          FunctionHolderPtr function_holder,
+                          const ValueValidityPairVector& args, int local_bitmap_idx)
+      : FuncDex(func_descriptor, native_function, function_holder, args),
+        local_bitmap_idx_(local_bitmap_idx) {}
+
+  /// Dispatches to the visitor overload for this node type.
+  void Accept(DexVisitor& visitor) override {
+    visitor.Visit(*this);
+  }
+
+  /// The validity of the function result is saved in this bitmap.
+  int local_bitmap_idx() const {
+    return local_bitmap_idx_;
+  }
+
+ private:
+  int local_bitmap_idx_;
+};
+
+/// special validity type that always returns true.
+class GANDIVA_EXPORT TrueDex : public Dex {
+ public:
+  // Accept() is public, as in every other Dex node; without the access specifier it
+  // was implicitly private and only callable through a Dex reference or pointer.
+  void Accept(DexVisitor& visitor) override { visitor.Visit(*this); }
+};
+
+/// special validity type that always returns false.
+class GANDIVA_EXPORT FalseDex : public Dex {
+ public:
+  // Accept() is public, as in every other Dex node; without the access specifier it
+  // was implicitly private and only callable through a Dex reference or pointer.
+  void Accept(DexVisitor& visitor) override { visitor.Visit(*this); }
+};
+
+/// Dex node for a literal: carries the literal's data type and a copy of its holder.
+class GANDIVA_EXPORT LiteralDex : public Dex {
+ public:
+  LiteralDex(DataTypePtr type, const LiteralHolder& holder)
+      : type_(type), holder_(holder) {}
+
+  /// Data type of the literal.
+  const DataTypePtr& type() const {
+    return type_;
+  }
+
+  /// Holder containing the literal value.
+  const LiteralHolder& holder() const {
+    return holder_;
+  }
+
+  /// Dispatches to the visitor overload for this node type.
+  void Accept(DexVisitor& visitor) override {
+    visitor.Visit(*this);
+  }
+
+ private:
+  DataTypePtr type_;
+  LiteralHolder holder_;
+};
+
+/// Dex node for a decomposed if-else expression. Holds the (value, validity) pairs
+/// of the condition and of both branches, plus the result type.
+class GANDIVA_EXPORT IfDex : public Dex {
+ public:
+  IfDex(ValueValidityPairPtr condition_vv, ValueValidityPairPtr then_vv,
+        ValueValidityPairPtr else_vv, DataTypePtr result_type, int local_bitmap_idx,
+        bool is_terminal_else)
+      : condition_vv_(condition_vv),
+        then_vv_(then_vv),
+        else_vv_(else_vv),
+        result_type_(result_type),
+        local_bitmap_idx_(local_bitmap_idx),
+        is_terminal_else_(is_terminal_else) {}
+
+  /// Dispatches to the visitor overload for this node type.
+  void Accept(DexVisitor& visitor) override {
+    visitor.Visit(*this);
+  }
+
+  // The three accessors below dereference the stored pointers without a null check.
+  const ValueValidityPair& condition_vv() const {
+    return *condition_vv_;
+  }
+  const ValueValidityPair& then_vv() const {
+    return *then_vv_;
+  }
+  const ValueValidityPair& else_vv() const {
+    return *else_vv_;
+  }
+
+  /// The validity of the result is saved in this bitmap.
+  int local_bitmap_idx() const {
+    return local_bitmap_idx_;
+  }
+
+  /// is this a terminal else ? i.e no nested if-else underneath.
+  bool is_terminal_else() const {
+    return is_terminal_else_;
+  }
+
+  /// Data type of the if-else result.
+  const DataTypePtr& result_type() const {
+    return result_type_;
+  }
+
+ private:
+  ValueValidityPairPtr condition_vv_;
+  ValueValidityPairPtr then_vv_;
+  ValueValidityPairPtr else_vv_;
+  DataTypePtr result_type_;
+  int local_bitmap_idx_;
+  bool is_terminal_else_;
+};
+
+/// Base of the decomposed boolean (AND / OR) expressions. Stores the argument pairs
+/// and the local bitmap index; Accept() is implemented by the concrete subclasses.
+class GANDIVA_EXPORT BooleanDex : public Dex {
+ public:
+  BooleanDex(const ValueValidityPairVector& args, int local_bitmap_idx)
+      : args_(args), local_bitmap_idx_(local_bitmap_idx) {}
+
+  /// Decomposed (value, validity) operands of the boolean expression.
+  const ValueValidityPairVector& args() const {
+    return args_;
+  }
+
+  /// The validity of the result is saved in this bitmap.
+  int local_bitmap_idx() const {
+    return local_bitmap_idx_;
+  }
+
+ private:
+  ValueValidityPairVector args_;
+  int local_bitmap_idx_;
+};
+
+/// Boolean-AND expression node.
+class GANDIVA_EXPORT BooleanAndDex : public BooleanDex {
+ public:
+  BooleanAndDex(const ValueValidityPairVector& args, int local_bitmap_idx)
+      : BooleanDex(args, local_bitmap_idx) {}
+
+  /// Dispatches to the visitor overload for this node type.
+  void Accept(DexVisitor& visitor) override {
+    visitor.Visit(*this);
+  }
+};
+
+/// Boolean-OR expression node.
+class GANDIVA_EXPORT BooleanOrDex : public BooleanDex {
+ public:
+  BooleanOrDex(const ValueValidityPairVector& args, int local_bitmap_idx)
+      : BooleanDex(args, local_bitmap_idx) {}
+
+  /// Dispatches to the visitor overload for this node type.
+  void Accept(DexVisitor& visitor) override {
+    visitor.Visit(*this);
+  }
+};
+
+// decomposed in expression.
+template <typename Type>
+class InExprDex;
+
+/// Generic base of the decomposed IN expression. It owns an InHolder built from the
+/// set of candidate values, the decomposed arguments, and the name of the runtime
+/// lookup function (left empty here; the InExprDex specializations assign it).
+template <typename Type>
+class InExprDexBase : public Dex {
+ public:
+  InExprDexBase(const ValueValidityPairVector& args,
+                const std::unordered_set<Type>& values)
+      : args_(args), in_holder_(new InHolder<Type>(values)) {}
+
+  /// Decomposed (value, validity) arguments of the IN expression.
+  const ValueValidityPairVector& args() const {
+    return args_;
+  }
+
+  /// Dispatches to the visitor overload taking InExprDexBase<Type>.
+  void Accept(DexVisitor& visitor) override {
+    visitor.Visit(*this);
+  }
+
+  /// Name of the runtime lookup function for this value type.
+  const std::string& runtime_function() const {
+    return runtime_function_;
+  }
+
+  /// Holder created from the value set passed to the constructor.
+  const std::shared_ptr<InHolder<Type>>& in_holder() const {
+    return in_holder_;
+  }
+
+ protected:
+  ValueValidityPairVector args_;
+  std::string runtime_function_;
+  std::shared_ptr<InHolder<Type>> in_holder_;
+};
+
+/// Specialization of the IN-expression base for decimals. In addition to what the
+/// generic base stores, it keeps the precision and scale given at construction.
+template <>
+class InExprDexBase<gandiva::DecimalScalar128> : public Dex {
+ public:
+  InExprDexBase(const ValueValidityPairVector& args,
+                const std::unordered_set<gandiva::DecimalScalar128>& values,
+                int32_t precision, int32_t scale)
+      : args_(args),
+        in_holder_(new InHolder<gandiva::DecimalScalar128>(values)),
+        precision_(precision),
+        scale_(scale) {}
+
+  /// Precision passed to the constructor.
+  int32_t get_precision() const {
+    return precision_;
+  }
+
+  /// Scale passed to the constructor.
+  int32_t get_scale() const {
+    return scale_;
+  }
+
+  /// Decomposed (value, validity) arguments of the IN expression.
+  const ValueValidityPairVector& args() const {
+    return args_;
+  }
+
+  /// Dispatches to the visitor overload taking InExprDexBase<DecimalScalar128>.
+  void Accept(DexVisitor& visitor) override {
+    visitor.Visit(*this);
+  }
+
+  /// Name of the runtime lookup function for decimals.
+  const std::string& runtime_function() const {
+    return runtime_function_;
+  }
+
+  /// Holder created from the value set passed to the constructor.
+  const std::shared_ptr<InHolder<gandiva::DecimalScalar128>>& in_holder() const {
+    return in_holder_;
+  }
+
+ protected:
+  ValueValidityPairVector args_;
+  std::string runtime_function_;
+  std::shared_ptr<InHolder<gandiva::DecimalScalar128>> in_holder_;
+  int32_t precision_, scale_;
+};
+
+/// IN expression over int32 values; selects the int32 runtime lookup function.
+template <>
+class InExprDex<int32_t> : public InExprDexBase<int32_t> {
+ public:
+  InExprDex(const ValueValidityPairVector& args,
+            const std::unordered_set<int32_t>& values)
+      : InExprDexBase<int32_t>(args, values) {
+    this->runtime_function_ = "gdv_fn_in_expr_lookup_int32";
+  }
+};
+
+/// IN expression over int64 values; selects the int64 runtime lookup function.
+template <>
+class InExprDex<int64_t> : public InExprDexBase<int64_t> {
+ public:
+  InExprDex(const ValueValidityPairVector& args,
+            const std::unordered_set<int64_t>& values)
+      : InExprDexBase<int64_t>(args, values) {
+    this->runtime_function_ = "gdv_fn_in_expr_lookup_int64";
+  }
+};
+
+/// IN expression over float values; selects the float runtime lookup function.
+template <>
+class InExprDex<float> : public InExprDexBase<float> {
+ public:
+  InExprDex(const ValueValidityPairVector& args,
+            const std::unordered_set<float>& values)
+      : InExprDexBase<float>(args, values) {
+    this->runtime_function_ = "gdv_fn_in_expr_lookup_float";
+  }
+};
+
+/// IN expression over double values; selects the double runtime lookup function.
+template <>
+class InExprDex<double> : public InExprDexBase<double> {
+ public:
+  InExprDex(const ValueValidityPairVector& args,
+            const std::unordered_set<double>& values)
+      : InExprDexBase<double>(args, values) {
+    this->runtime_function_ = "gdv_fn_in_expr_lookup_double";
+  }
+};
+
+/// IN expression over decimal values; forwards precision and scale to the base and
+/// selects the decimal runtime lookup function.
+template <>
+class InExprDex<gandiva::DecimalScalar128>
+    : public InExprDexBase<gandiva::DecimalScalar128> {
+ public:
+  InExprDex(const ValueValidityPairVector& args,
+            const std::unordered_set<gandiva::DecimalScalar128>& values,
+            int32_t precision, int32_t scale)
+      : InExprDexBase<gandiva::DecimalScalar128>(args, values, precision, scale) {
+    this->runtime_function_ = "gdv_fn_in_expr_lookup_decimal";
+  }
+};
+
+/// IN expression over string values; selects the utf8 runtime lookup function.
+template <>
+class InExprDex<std::string> : public InExprDexBase<std::string> {
+ public:
+  InExprDex(const ValueValidityPairVector& args,
+            const std::unordered_set<std::string>& values)
+      : InExprDexBase<std::string>(args, values) {
+    this->runtime_function_ = "gdv_fn_in_expr_lookup_utf8";
+  }
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/dex_visitor.h b/src/arrow/cpp/src/gandiva/dex_visitor.h
new file mode 100644
index 000000000..5d160bb22
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/dex_visitor.h
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cmath>
+#include <string>
+
+#include "arrow/util/logging.h"
+#include "gandiva/decimal_scalar.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+class VectorReadValidityDex;
+class VectorReadFixedLenValueDex;
+class VectorReadVarLenValueDex;
+class LocalBitMapValidityDex;
+class LiteralDex;
+class TrueDex;
+class FalseDex;
+class NonNullableFuncDex;
+class NullableNeverFuncDex;
+class NullableInternalFuncDex;
+class IfDex;
+class BooleanAndDex;
+class BooleanOrDex;
+template <typename Type>
+class InExprDexBase;
+
+/// \brief Visitor for decomposed expression.
+///
+/// Pure interface with one Visit() overload per concrete Dex node type. The IN
+/// expression nodes are visited through their InExprDexBase<T> base, one overload
+/// per supported value type.
+class GANDIVA_EXPORT DexVisitor {
+ public:
+ virtual ~DexVisitor() = default;
+
+ // Vector reads and validity nodes.
+ virtual void Visit(const VectorReadValidityDex& dex) = 0;
+ virtual void Visit(const VectorReadFixedLenValueDex& dex) = 0;
+ virtual void Visit(const VectorReadVarLenValueDex& dex) = 0;
+ virtual void Visit(const LocalBitMapValidityDex& dex) = 0;
+ virtual void Visit(const TrueDex& dex) = 0;
+ virtual void Visit(const FalseDex& dex) = 0;
+ // Literals, function calls, if-else and boolean expressions.
+ virtual void Visit(const LiteralDex& dex) = 0;
+ virtual void Visit(const NonNullableFuncDex& dex) = 0;
+ virtual void Visit(const NullableNeverFuncDex& dex) = 0;
+ virtual void Visit(const NullableInternalFuncDex& dex) = 0;
+ virtual void Visit(const IfDex& dex) = 0;
+ virtual void Visit(const BooleanAndDex& dex) = 0;
+ virtual void Visit(const BooleanOrDex& dex) = 0;
+ // IN expressions, one overload per value type.
+ virtual void Visit(const InExprDexBase<int32_t>& dex) = 0;
+ virtual void Visit(const InExprDexBase<int64_t>& dex) = 0;
+ virtual void Visit(const InExprDexBase<float>& dex) = 0;
+ virtual void Visit(const InExprDexBase<double>& dex) = 0;
+ virtual void Visit(const InExprDexBase<gandiva::DecimalScalar128>& dex) = 0;
+ virtual void Visit(const InExprDexBase<std::string>& dex) = 0;
+};
+
+/// Default implementation with only DCHECK().
+/// Expands to a Visit() override for DEX_CLASS whose body is DCHECK(0). The macro is
+/// not #undef'd after use, so it stays visible to files that include this header.
+#define VISIT_DCHECK(DEX_CLASS) \
+ void Visit(const DEX_CLASS& dex) override { DCHECK(0); }
+
+/// Visitor that overrides every Visit() overload of DexVisitor with a DCHECK(0) body.
+/// The overrides are declared without an access specifier, so they are private in
+/// this class.
+/// NOTE(review): presumably meant as a base for visitors that handle only a subset
+/// of node types - confirm against the subclasses.
+class GANDIVA_EXPORT DexDefaultVisitor : public DexVisitor {
+ VISIT_DCHECK(VectorReadValidityDex)
+ VISIT_DCHECK(VectorReadFixedLenValueDex)
+ VISIT_DCHECK(VectorReadVarLenValueDex)
+ VISIT_DCHECK(LocalBitMapValidityDex)
+ VISIT_DCHECK(TrueDex)
+ VISIT_DCHECK(FalseDex)
+ VISIT_DCHECK(LiteralDex)
+ VISIT_DCHECK(NonNullableFuncDex)
+ VISIT_DCHECK(NullableNeverFuncDex)
+ VISIT_DCHECK(NullableInternalFuncDex)
+ VISIT_DCHECK(IfDex)
+ VISIT_DCHECK(BooleanAndDex)
+ VISIT_DCHECK(BooleanOrDex)
+ VISIT_DCHECK(InExprDexBase<int32_t>)
+ VISIT_DCHECK(InExprDexBase<int64_t>)
+ VISIT_DCHECK(InExprDexBase<float>)
+ VISIT_DCHECK(InExprDexBase<double>)
+ VISIT_DCHECK(InExprDexBase<gandiva::DecimalScalar128>)
+ VISIT_DCHECK(InExprDexBase<std::string>)
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/engine.cc b/src/arrow/cpp/src/gandiva/engine.cc
new file mode 100644
index 000000000..f0b768f5f
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/engine.cc
@@ -0,0 +1,338 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// TODO(wesm): LLVM 7 produces pesky C4244 that disable pragmas around the LLVM
+// includes seem to not fix as with LLVM 6
+#if defined(_MSC_VER)
+#pragma warning(disable : 4244)
+#endif
+
+#include "gandiva/engine.h"
+
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <unordered_set>
+#include <utility>
+
+#include "arrow/util/logging.h"
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4141)
+#pragma warning(disable : 4146)
+#pragma warning(disable : 4244)
+#pragma warning(disable : 4267)
+#pragma warning(disable : 4624)
+#endif
+
+#include <llvm/Analysis/Passes.h>
+#include <llvm/Analysis/TargetTransformInfo.h>
+#include <llvm/Bitcode/BitcodeReader.h>
+#include <llvm/ExecutionEngine/ExecutionEngine.h>
+#include <llvm/ExecutionEngine/MCJIT.h>
+#include <llvm/IR/DataLayout.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/LegacyPassManager.h>
+#include <llvm/IR/Verifier.h>
+#include <llvm/Linker/Linker.h>
+#include <llvm/MC/SubtargetFeature.h>
+#include <llvm/Support/DynamicLibrary.h>
+#include <llvm/Support/Host.h>
+#include <llvm/Support/TargetRegistry.h>
+#include <llvm/Support/TargetSelect.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/Transforms/IPO/PassManagerBuilder.h>
+#include <llvm/Transforms/InstCombine/InstCombine.h>
+#include <llvm/Transforms/Scalar.h>
+#include <llvm/Transforms/Scalar/GVN.h>
+#include <llvm/Transforms/Utils.h>
+#include <llvm/Transforms/Vectorize.h>
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+#include "arrow/util/make_unique.h"
+#include "gandiva/configuration.h"
+#include "gandiva/decimal_ir.h"
+#include "gandiva/exported_funcs_registry.h"
+
+namespace gandiva {
+
+// Precompiled bitcode blob and its size in bytes; both are only declared here and
+// are read by Engine::LoadPreCompiledIR().
+extern const unsigned char kPrecompiledBitcode[];
+extern const size_t kPrecompiledBitcodeSize;
+
+// Passed to std::call_once in Engine::Make() so that Engine::InitOnce() runs once.
+std::once_flag llvm_init_once_flag;
+// Set to true at the end of Engine::InitOnce(); checked by its leading DCHECK.
+static bool llvm_init = false;
+// Host CPU name and feature flags captured by Engine::InitOnce(); Engine::Make()
+// hands them to the engine builder when the configuration targets the host CPU.
+static llvm::StringRef cpu_name;
+static llvm::SmallVector<std::string, 10> cpu_attrs;
+
+// One-time LLVM setup: initialises the native target components, makes the symbols
+// of the current process visible to LLVM, and records the host CPU name and feature
+// flags in the file-level cpu_name / cpu_attrs variables.
+void Engine::InitOnce() {
+  DCHECK_EQ(llvm_init, false);
+
+  llvm::InitializeNativeTarget();
+  llvm::InitializeNativeTargetAsmPrinter();
+  llvm::InitializeNativeTargetAsmParser();
+  llvm::InitializeNativeTargetDisassembler();
+  llvm::sys::DynamicLibrary::LoadLibraryPermanently(nullptr);
+
+  cpu_name = llvm::sys::getHostCPUName();
+
+  // Each detected feature becomes "+name" when enabled and "-name" when disabled.
+  std::string feature_summary;
+  llvm::StringMap<bool> detected_features;
+  if (llvm::sys::getHostCPUFeatures(detected_features)) {
+    for (auto& entry : detected_features) {
+      const char* sign = entry.second ? "+" : "-";
+      std::string flag = std::string(sign) + entry.first().str();
+      cpu_attrs.push_back(flag);
+      feature_summary += " " + flag;
+    }
+  }
+
+  ARROW_LOG(INFO) << "Detected CPU Name : " << cpu_name.str();
+  ARROW_LOG(INFO) << "Detected CPU Features:" << feature_summary;
+  llvm_init = true;
+}
+
+// Takes ownership of the LLVM context and execution engine. `module` is stored as a
+// raw pointer; Engine::Make() passes the module that was moved into the engine
+// builder. The IR builder and LLVMTypes are created from the context, so context_
+// must be initialised before them.
+// `conf` is dereferenced without a null check.
+Engine::Engine(const std::shared_ptr<Configuration>& conf,
+ std::unique_ptr<llvm::LLVMContext> ctx,
+ std::unique_ptr<llvm::ExecutionEngine> engine, llvm::Module* module)
+ : context_(std::move(ctx)),
+ execution_engine_(std::move(engine)),
+ ir_builder_(arrow::internal::make_unique<llvm::IRBuilder<>>(*context_)),
+ module_(module),
+ types_(*context_),
+ optimize_(conf->optimize()) {}
+
+// Post-construction initialisation, called from Engine::Make(). Registers the global
+// function mappings, then loads the precompiled IR, then adds the decimal IR
+// functions. Returns the first non-OK status, or OK when every step succeeds.
+Status Engine::Init() {
+ // Add mappings for functions that can be accessed from LLVM/IR module.
+ AddGlobalMappings();
+
+ ARROW_RETURN_NOT_OK(LoadPreCompiledIR());
+ ARROW_RETURN_NOT_OK(DecimalIR::AddFunctions(this));
+
+ return Status::OK();
+}
+
+/// Factory: runs the one-time LLVM initialisation, creates a context, a "codegen"
+/// module and an execution engine, then builds and initialises the Engine.
+/// On success `*out` receives the engine; on failure a non-OK status is returned
+/// and `*out` is left untouched.
+Status Engine::Make(const std::shared_ptr<Configuration>& conf,
+                    std::unique_ptr<Engine>* out) {
+  std::call_once(llvm_init_once_flag, InitOnce);
+
+  auto llvm_ctx = arrow::internal::make_unique<llvm::LLVMContext>();
+  auto owned_module = arrow::internal::make_unique<llvm::Module>("codegen", *llvm_ctx);
+
+  // The module is handed over to the builder below and the ExecutionEngine offers
+  // no way to get it back, so remember the raw pointer first.
+  llvm::Module* raw_module = owned_module.get();
+
+  auto opt_level =
+      conf->optimize() ? llvm::CodeGenOpt::Aggressive : llvm::CodeGenOpt::None;
+
+  // The builder only references this string while it is alive (it is not retained
+  // by the ExecutionEngine), so it must be declared before the builder. Found by
+  // inspecting LLVM sources.
+  std::string builder_error;
+
+  llvm::EngineBuilder engine_builder(std::move(owned_module));
+  engine_builder.setEngineKind(llvm::EngineKind::JIT);
+  engine_builder.setOptLevel(opt_level);
+  engine_builder.setErrorStr(&builder_error);
+
+  // Optionally tune code generation for the CPU detected in InitOnce().
+  if (conf->target_host_cpu()) {
+    engine_builder.setMCPU(cpu_name);
+    engine_builder.setMAttrs(cpu_attrs);
+  }
+
+  std::unique_ptr<llvm::ExecutionEngine> exec_engine{engine_builder.create()};
+  if (!exec_engine) {
+    return Status::CodeGenError("Could not instantiate llvm::ExecutionEngine: ",
+                                builder_error);
+  }
+
+  std::unique_ptr<Engine> result{
+      new Engine(conf, std::move(llvm_ctx), std::move(exec_engine), raw_module)};
+  ARROW_RETURN_NOT_OK(result->Init());
+  *out = std::move(result);
+  return Status::OK();
+}
+
+// This method was modified from its original version for a part of MLIR
+// Original source from
+// https://github.com/llvm/llvm-project/blob/9f2ce5b915a505a5488a5cf91bb0a8efa9ddfff7/mlir/lib/ExecutionEngine/ExecutionEngine.cpp
+// The original copyright notice follows.
+
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Sets the data layout of `module` to that of a target machine created for the
+// default target triple, the host CPU and the host CPU features.
+// Best effort: if the target cannot be looked up or no target machine can be
+// created, the module is left unchanged.
+static void SetDataLayout(llvm::Module* module) {
+  auto target_triple = llvm::sys::getDefaultTargetTriple();
+  std::string error_message;
+  auto target = llvm::TargetRegistry::lookupTarget(target_triple, error_message);
+  if (!target) {
+    return;
+  }
+
+  std::string cpu(llvm::sys::getHostCPUName());
+  llvm::SubtargetFeatures features;
+  llvm::StringMap<bool> host_features;
+
+  if (llvm::sys::getHostCPUFeatures(host_features)) {
+    for (auto& f : host_features) {
+      features.AddFeature(f.first(), f.second);
+    }
+  }
+
+  std::unique_ptr<llvm::TargetMachine> machine(
+      target->createTargetMachine(target_triple, cpu, features.getString(), {}, {}));
+  // createTargetMachine() returns null when the target has no machine constructor
+  // registered; bail out like the lookup failure above instead of dereferencing it.
+  if (!machine) {
+    return;
+  }
+
+  module->setDataLayout(machine->createDataLayout());
+}
+// end of the modified method from MLIR
+
+// Handling for pre-compiled IR libraries.
+//
+// Wraps the embedded precompiled bitcode in a memory buffer, parses it lazily into
+// a module in this engine's context, sets its data layout, verifies it and links it
+// into the main module. Returns a CodeGenError status if any step fails.
+Status Engine::LoadPreCompiledIR() {
+  auto bitcode = llvm::StringRef(reinterpret_cast<const char*>(kPrecompiledBitcode),
+                                 kPrecompiledBitcodeSize);
+
+  /// Wrap the embedded bitcode in a memory buffer.
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> buffer_or_error =
+      llvm::MemoryBuffer::getMemBuffer(bitcode, "precompiled", false);
+
+  ARROW_RETURN_IF(!buffer_or_error,
+                  Status::CodeGenError("Could not load module from IR: ",
+                                       buffer_or_error.getError().message()));
+
+  // std::move is spelled out: the unqualified call only resolved through
+  // argument-dependent lookup.
+  std::unique_ptr<llvm::MemoryBuffer> buffer = std::move(buffer_or_error.get());
+
+  /// Parse the IR module.
+  llvm::Expected<std::unique_ptr<llvm::Module>> module_or_error =
+      llvm::getOwningLazyBitcodeModule(std::move(buffer), *context());
+  if (!module_or_error) {
+    // NOTE: llvm::handleAllErrors() fails linking with RTTI-disabled LLVM builds
+    // (ARROW-5148)
+    std::string str;
+    llvm::raw_string_ostream stream(str);
+    stream << module_or_error.takeError();
+    return Status::CodeGenError(stream.str());
+  }
+  std::unique_ptr<llvm::Module> ir_module = std::move(module_or_error.get());
+
+  // set dataLayout
+  SetDataLayout(ir_module.get());
+
+  // verifyModule() returns true when the module is broken.
+  ARROW_RETURN_IF(llvm::verifyModule(*ir_module, &llvm::errs()),
+                  Status::CodeGenError("verify of IR Module failed"));
+  // linkModules() returns true on error.
+  ARROW_RETURN_IF(llvm::Linker::linkModules(*module_, std::move(ir_module)),
+                  Status::CodeGenError("failed to link IR Modules"));
+
+  return Status::OK();
+}
+
+// Drop every function that does not have to be compiled, which shortens the overall
+// compilation time. The pass is cheap and always runs because gandiva carries a very
+// large number of functions. (Adapted from Apache Impala)
+//
+// Mechanism: an internalize pass keeps only the functions registered through
+// AddFunctionToCompile() externally visible, and a global dead-code-elimination pass
+// then removes whatever became unreferenced.
+Status Engine::RemoveUnusedFunctions() {
+  std::unique_ptr<llvm::legacy::PassManager> passes(new llvm::legacy::PassManager());
+
+  std::unordered_set<std::string> keep(functions_to_compile_.begin(),
+                                       functions_to_compile_.end());
+
+  // The predicate answers "must this global be preserved?".
+  auto must_preserve = [&keep](const llvm::GlobalValue& func) {
+    return keep.count(func.getName().str()) != 0;
+  };
+  passes->add(llvm::createInternalizePass(must_preserve));
+  passes->add(llvm::createGlobalDCEPass());
+  passes->run(*module_);
+  return Status::OK();
+}
+
+// Optimise and compile the module.
+//
+// Steps: remove unused functions, run the optimisation passes when optimisation is
+// enabled, verify the module, then finalize the execution engine's object code and
+// mark the module as finalized. Returns a CodeGenError status if verification fails.
+Status Engine::FinalizeModule() {
+ ARROW_RETURN_NOT_OK(RemoveUnusedFunctions());
+
+ if (optimize_) {
+ // misc passes to allow for inlining, vectorization, ..
+ std::unique_ptr<llvm::legacy::PassManager> pass_manager(
+ new llvm::legacy::PassManager());
+
+ // Passes run in the order in which they are added below.
+ llvm::TargetIRAnalysis target_analysis =
+ execution_engine_->getTargetMachine()->getTargetIRAnalysis();
+ pass_manager->add(llvm::createTargetTransformInfoWrapperPass(target_analysis));
+ pass_manager->add(llvm::createFunctionInliningPass());
+ pass_manager->add(llvm::createInstructionCombiningPass());
+ pass_manager->add(llvm::createPromoteMemoryToRegisterPass());
+ pass_manager->add(llvm::createGVNPass());
+ pass_manager->add(llvm::createNewGVNPass());
+ pass_manager->add(llvm::createCFGSimplificationPass());
+ pass_manager->add(llvm::createLoopVectorizePass());
+ pass_manager->add(llvm::createSLPVectorizerPass());
+ pass_manager->add(llvm::createGlobalOptimizerPass());
+
+ // run the optimiser
+ // The level-3 module pipeline is appended after the passes added above.
+ llvm::PassManagerBuilder pass_builder;
+ pass_builder.OptLevel = 3;
+ pass_builder.populateModulePassManager(*pass_manager);
+ pass_manager->run(*module_);
+ }
+
+ ARROW_RETURN_IF(llvm::verifyModule(*module_, &llvm::errs()),
+ Status::CodeGenError("Module verification failed after optimizer"));
+
+ // do the compilation
+ execution_engine_->finalizeObject();
+ module_finalized_ = true;
+
+ return Status::OK();
+}
+
+// Returns the address of the compiled code for `irFunction`, as reported by the
+// execution engine. Debug builds assert that FinalizeModule() has already run.
+void* Engine::CompiledFunction(llvm::Function* irFunction) {
+  DCHECK(module_finalized_);
+  void* compiled_address = execution_engine_->getPointerToFunction(irFunction);
+  return compiled_address;
+}
+
+// Declares a non-variadic, externally linked function `name` with the given return
+// and argument types in the module, and maps that declaration to `function_ptr` in
+// the execution engine.
+void Engine::AddGlobalMappingForFunc(const std::string& name, llvm::Type* ret_type,
+                                     const std::vector<llvm::Type*>& args,
+                                     void* function_ptr) {
+  llvm::FunctionType* signature =
+      llvm::FunctionType::get(ret_type, args, false /*isVarArg*/);
+  llvm::Function* declaration = llvm::Function::Create(
+      signature, llvm::GlobalValue::ExternalLinkage, name, module());
+  execution_engine_->addGlobalMapping(declaration, function_ptr);
+}
+
+// Delegates to the exported-functions registry, passing this engine.
+void Engine::AddGlobalMappings() {
+  ExportedFuncsRegistry::AddMappings(this);
+}
+
+// Returns the textual IR of the module.
+std::string Engine::DumpIR() {
+  std::string ir;
+  llvm::raw_string_ostream stream(ir);
+  module_->print(stream, nullptr);
+  // raw_string_ostream may buffer its output; flush explicitly so that `ir` is
+  // complete before it is returned, rather than relying on the stream destructor
+  // running against the returned object.
+  stream.flush();
+  return ir;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/engine.h b/src/arrow/cpp/src/gandiva/engine.h
new file mode 100644
index 000000000..d26b8aa0e
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/engine.h
@@ -0,0 +1,104 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "arrow/util/macros.h"
+
+#include "arrow/util/logging.h"
+#include "gandiva/configuration.h"
+#include "gandiva/llvm_includes.h"
+#include "gandiva/llvm_types.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// \brief LLVM Execution engine wrapper.
+///
+/// Owns the LLVM context, the execution engine and an IR builder, and keeps a raw
+/// pointer to the module being generated. Instances are created through Make(); the
+/// constructor is private.
+class GANDIVA_EXPORT Engine {
+ public:
+ // Non-owning accessors to the LLVM objects held by this engine.
+ llvm::LLVMContext* context() { return context_.get(); }
+ llvm::IRBuilder<>* ir_builder() { return ir_builder_.get(); }
+ LLVMTypes* types() { return &types_; }
+ llvm::Module* module() { return module_; }
+
+ /// Factory method to create and initialize the engine object.
+ ///
+ /// \param[in] config the engine configuration
+ /// \param[out] engine the created engine
+ static Status Make(const std::shared_ptr<Configuration>& config,
+ std::unique_ptr<Engine>* engine);
+
+ /// Add the function to the list of IR functions that need to be compiled.
+ /// Compiling only the functions that are used by the module saves time.
+ /// Debug builds assert that the module has not been finalized yet.
+ void AddFunctionToCompile(const std::string& fname) {
+ DCHECK(!module_finalized_);
+ functions_to_compile_.push_back(fname);
+ }
+
+ /// Optimise and compile the module.
+ Status FinalizeModule();
+
+ /// Get the compiled function corresponding to the irfunction.
+ void* CompiledFunction(llvm::Function* irFunction);
+
+ // Create and add a mapping for the cpp function to make it accessible from LLVM.
+ void AddGlobalMappingForFunc(const std::string& name, llvm::Type* ret_type,
+ const std::vector<llvm::Type*>& args, void* func);
+
+ /// Return the generated IR for the module.
+ std::string DumpIR();
+
+ private:
+ Engine(const std::shared_ptr<Configuration>& conf,
+ std::unique_ptr<llvm::LLVMContext> ctx,
+ std::unique_ptr<llvm::ExecutionEngine> engine, llvm::Module* module);
+
+ // Post construction init. This _must_ be called after the constructor.
+ Status Init();
+
+ // One-time LLVM initialisation; invoked through std::call_once from Make().
+ static void InitOnce();
+
+ llvm::ExecutionEngine& execution_engine() { return *execution_engine_; }
+
+ /// load pre-compiled IR modules from precompiled_bitcode.cc and merge them into
+ /// the main module.
+ Status LoadPreCompiledIR();
+
+ // Create and add mappings for cpp functions that can be accessed from LLVM.
+ void AddGlobalMappings();
+
+ // Remove unused functions to reduce compile time.
+ Status RemoveUnusedFunctions();
+
+ // Declaration order matters: the constructor builds ir_builder_ and types_ from
+ // *context_, so context_ has to be declared (and thus initialised) first.
+ std::unique_ptr<llvm::LLVMContext> context_;
+ std::unique_ptr<llvm::ExecutionEngine> execution_engine_;
+ std::unique_ptr<llvm::IRBuilder<>> ir_builder_;
+ // Raw pointer to the module that Make() moved into the engine builder.
+ llvm::Module* module_;
+ LLVMTypes types_;
+
+ // Names registered through AddFunctionToCompile(); duplicates are not filtered.
+ std::vector<std::string> functions_to_compile_;
+
+ bool optimize_ = true;
+ bool module_finalized_ = false;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/engine_llvm_test.cc b/src/arrow/cpp/src/gandiva/engine_llvm_test.cc
new file mode 100644
index 000000000..ef2275b34
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/engine_llvm_test.cc
@@ -0,0 +1,131 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/engine.h"
+
+#include <gtest/gtest.h>
+#include <functional>
+#include "gandiva/llvm_types.h"
+#include "gandiva/tests/test_util.h"
+
+namespace gandiva {
+
+typedef int64_t (*add_vector_func_t)(int64_t* data, int n);
+
+class TestEngine : public ::testing::Test {
+ protected:
+ llvm::Function* BuildVecAdd(Engine* engine) {
+ auto types = engine->types();
+ llvm::IRBuilder<>* builder = engine->ir_builder();
+ llvm::LLVMContext* context = engine->context();
+
+ // Create fn prototype :
+ // int64_t add_longs(int64_t *elements, int32_t nelements)
+ std::vector<llvm::Type*> arguments;
+ arguments.push_back(types->i64_ptr_type());
+ arguments.push_back(types->i32_type());
+ llvm::FunctionType* prototype =
+ llvm::FunctionType::get(types->i64_type(), arguments, false /*isVarArg*/);
+
+ // Create fn
+ std::string func_name = "add_longs";
+ engine->AddFunctionToCompile(func_name);
+ llvm::Function* fn = llvm::Function::Create(
+ prototype, llvm::GlobalValue::ExternalLinkage, func_name, engine->module());
+ assert(fn != nullptr);
+
+ // Name the arguments
+ llvm::Function::arg_iterator args = fn->arg_begin();
+ llvm::Value* arg_elements = &*args;
+ arg_elements->setName("elements");
+ ++args;
+ llvm::Value* arg_nelements = &*args;
+ arg_nelements->setName("nelements");
+ ++args;
+
+ llvm::BasicBlock* loop_entry = llvm::BasicBlock::Create(*context, "entry", fn);
+ llvm::BasicBlock* loop_body = llvm::BasicBlock::Create(*context, "loop", fn);
+ llvm::BasicBlock* loop_exit = llvm::BasicBlock::Create(*context, "exit", fn);
+
+ // Loop entry
+ builder->SetInsertPoint(loop_entry);
+ builder->CreateBr(loop_body);
+
+ // Loop body
+ builder->SetInsertPoint(loop_body);
+
+ llvm::PHINode* loop_var = builder->CreatePHI(types->i32_type(), 2, "loop_var");
+ llvm::PHINode* sum = builder->CreatePHI(types->i64_type(), 2, "sum");
+
+ loop_var->addIncoming(types->i32_constant(0), loop_entry);
+ sum->addIncoming(types->i64_constant(0), loop_entry);
+
+ // setup loop PHI
+ llvm::Value* loop_update =
+ builder->CreateAdd(loop_var, types->i32_constant(1), "loop_var+1");
+ loop_var->addIncoming(loop_update, loop_body);
+
+ // get the current value
+ llvm::Value* offset = CreateGEP(builder, arg_elements, loop_var, "offset");
+ llvm::Value* current_value = CreateLoad(builder, offset, "value");
+
+ // setup sum PHI
+ llvm::Value* sum_update = builder->CreateAdd(sum, current_value, "sum+ith");
+ sum->addIncoming(sum_update, loop_body);
+
+ // check loop_var
+ llvm::Value* loop_var_check =
+ builder->CreateICmpSLT(loop_update, arg_nelements, "loop_var < nrec");
+ builder->CreateCondBr(loop_var_check, loop_body, loop_exit);
+
+ // Loop exit
+ builder->SetInsertPoint(loop_exit);
+ builder->CreateRet(sum_update);
+ return fn;
+ }
+
+ void BuildEngine() { ASSERT_OK(Engine::Make(configuration, &engine)); }  // fixture config: the one tests call set_optimize() on
+
+ std::unique_ptr<Engine> engine;
+ std::shared_ptr<Configuration> configuration = TestConfiguration();
+};
+
+TEST_F(TestEngine, TestAddUnoptimised) {
+ configuration->set_optimize(false);
+ BuildEngine();
+
+ llvm::Function* ir_func = BuildVecAdd(engine.get());
+ ASSERT_OK(engine->FinalizeModule());
+ auto add_func = reinterpret_cast<add_vector_func_t>(engine->CompiledFunction(ir_func));
+
+ int64_t my_array[] = {1, 3, -5, 8, 10};
+ EXPECT_EQ(add_func(my_array, 5), 17);
+}
+
+TEST_F(TestEngine, TestAddOptimised) {
+ configuration->set_optimize(true);
+ BuildEngine();
+
+ llvm::Function* ir_func = BuildVecAdd(engine.get());
+ ASSERT_OK(engine->FinalizeModule());
+ auto add_func = reinterpret_cast<add_vector_func_t>(engine->CompiledFunction(ir_func));
+
+ int64_t my_array[] = {1, 3, -5, 8, 10};
+ EXPECT_EQ(add_func(my_array, 5), 17);
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/eval_batch.h b/src/arrow/cpp/src/gandiva/eval_batch.h
new file mode 100644
index 000000000..25d9ab1d9
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/eval_batch.h
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/util/logging.h"
+
+#include "gandiva/arrow.h"
+#include "gandiva/execution_context.h"
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/local_bitmaps_holder.h"
+
+namespace gandiva {
+
+/// \brief The buffers corresponding to one batch of records, used for
+/// expression evaluation.
+class EvalBatch {
+ public:
+ explicit EvalBatch(int64_t num_records, int num_buffers, int num_local_bitmaps)
+ : num_records_(num_records), num_buffers_(num_buffers) {
+ if (num_buffers > 0) {
+ buffers_array_.reset(new uint8_t*[num_buffers]);
+ buffer_offsets_array_.reset(new int64_t[num_buffers]);
+ }
+ local_bitmaps_holder_.reset(new LocalBitMapsHolder(num_records, num_local_bitmaps));
+ execution_context_.reset(new ExecutionContext());
+ }
+
+ int64_t num_records() const { return num_records_; }
+
+ uint8_t** GetBufferArray() const { return buffers_array_.get(); }
+
+ int64_t* GetBufferOffsetArray() const { return buffer_offsets_array_.get(); }
+
+ int GetNumBuffers() const { return num_buffers_; }
+
+ uint8_t* GetBuffer(int idx) const {
+ DCHECK(idx >= 0 && idx < num_buffers_);  // valid slots are [0, num_buffers_)
+ return (buffers_array_.get())[idx];
+ }
+
+ int64_t GetBufferOffset(int idx) const {
+ DCHECK(idx >= 0 && idx < num_buffers_);  // valid slots are [0, num_buffers_)
+ return (buffer_offsets_array_.get())[idx];
+ }
+
+ void SetBuffer(int idx, uint8_t* buffer, int64_t offset) {
+ DCHECK(idx >= 0 && idx < num_buffers_);  // valid slots are [0, num_buffers_)
+ (buffers_array_.get())[idx] = buffer;
+ (buffer_offsets_array_.get())[idx] = offset;
+ }
+
+ int GetNumLocalBitMaps() const { return local_bitmaps_holder_->GetNumLocalBitMaps(); }
+
+ int64_t GetLocalBitmapSize() const {
+ return local_bitmaps_holder_->GetLocalBitMapSize();
+ }
+
+ uint8_t* GetLocalBitMap(int idx) const {
+ DCHECK(idx >= 0 && idx < GetNumLocalBitMaps());  // 0-based: must be below the count
+ return local_bitmaps_holder_->GetLocalBitMap(idx);
+ }
+
+ uint8_t** GetLocalBitMapArray() const {
+ return local_bitmaps_holder_->GetLocalBitMapArray();
+ }
+
+ ExecutionContext* GetExecutionContext() const { return execution_context_.get(); }
+
+ private:
+ /// number of records in the current batch.
+ int64_t num_records_;
+
+ // number of buffers.
+ int num_buffers_;
+
+ /// An array of 'num_buffers_', each containing a buffer. The buffer
+ /// sizes depends on the data type, but all of them have the same
+ /// number of slots (equal to num_records_).
+ std::unique_ptr<uint8_t*[]> buffers_array_;
+
+ /// An array of 'num_buffers_', each containing the offset for
+ /// corresponding buffer.
+ std::unique_ptr<int64_t[]> buffer_offsets_array_;
+
+ std::unique_ptr<LocalBitMapsHolder> local_bitmaps_holder_;
+
+ std::unique_ptr<ExecutionContext> execution_context_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/execution_context.h b/src/arrow/cpp/src/gandiva/execution_context.h
new file mode 100644
index 000000000..efa546874
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/execution_context.h
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include "gandiva/simple_arena.h"
+
+namespace gandiva {
+
+/// Execution context during llvm evaluation
+class ExecutionContext {
+ public:
+ explicit ExecutionContext(arrow::MemoryPool* pool = arrow::default_memory_pool())
+ : arena_(pool) {}
+ std::string get_error() const { return error_msg_; }
+
+ void set_error_msg(const char* error_msg) {
+ // Remember the first error only; a null message is ignored (std::string(nullptr) is UB).
+ if (error_msg != nullptr && error_msg_.empty()) {
+ error_msg_ = std::string(error_msg);
+ }
+ }
+
+ bool has_error() const { return !error_msg_.empty(); }
+
+ SimpleArena* arena() { return &arena_; }
+
+ void Reset() {
+ error_msg_.clear();
+ arena_.Reset();
+ }
+
+ private:
+ std::string error_msg_;
+ SimpleArena arena_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/exported_funcs.h b/src/arrow/cpp/src/gandiva/exported_funcs.h
new file mode 100644
index 000000000..582052660
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/exported_funcs.h
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <gandiva/exported_funcs_registry.h>
+#include <vector>
+
+namespace gandiva {
+
+class Engine;
+
+// Base-class type for exporting functions that can be accessed from LLVM/IR.
+class ExportedFuncsBase {
+ public:
+ virtual ~ExportedFuncsBase() = default;
+
+ virtual void AddMappings(Engine* engine) const = 0;
+};
+
+// Class for exporting Stub functions
+class ExportedStubFunctions : public ExportedFuncsBase {
+ void AddMappings(Engine* engine) const override;
+};
+REGISTER_EXPORTED_FUNCS(ExportedStubFunctions);
+
+// Class for exporting Context functions
+class ExportedContextFunctions : public ExportedFuncsBase {
+ void AddMappings(Engine* engine) const override;
+};
+REGISTER_EXPORTED_FUNCS(ExportedContextFunctions);
+
+// Class for exporting Time functions
+class ExportedTimeFunctions : public ExportedFuncsBase {
+ void AddMappings(Engine* engine) const override;
+};
+REGISTER_EXPORTED_FUNCS(ExportedTimeFunctions);
+
+// Class for exporting Decimal functions
+class ExportedDecimalFunctions : public ExportedFuncsBase {
+ void AddMappings(Engine* engine) const override;
+};
+REGISTER_EXPORTED_FUNCS(ExportedDecimalFunctions);
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/exported_funcs_registry.cc b/src/arrow/cpp/src/gandiva/exported_funcs_registry.cc
new file mode 100644
index 000000000..4c87c4d40
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/exported_funcs_registry.cc
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/exported_funcs_registry.h"
+
+#include "gandiva/exported_funcs.h"
+
+namespace gandiva {
+
+void ExportedFuncsRegistry::AddMappings(Engine* engine) {
+ for (const auto& entry : registered()) {  // by reference: no shared_ptr copy per entry
+ entry->AddMappings(engine);
+ }
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/exported_funcs_registry.h b/src/arrow/cpp/src/gandiva/exported_funcs_registry.h
new file mode 100644
index 000000000..1504f2130
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/exported_funcs_registry.h
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include <gandiva/engine.h>
+
+namespace gandiva {
+
+class ExportedFuncsBase;
+
+/// Registry for classes that export functions which can be accessed by
+/// LLVM/IR code.
+class ExportedFuncsRegistry {
+ public:
+ using list_type = std::vector<std::shared_ptr<ExportedFuncsBase>>;
+
+ // Add functions from all the registered classes to the engine.
+ static void AddMappings(Engine* engine);
+
+ static bool Register(std::shared_ptr<ExportedFuncsBase> entry) {
+ registered().push_back(entry);
+ return true;
+ }
+
+ private:
+ static list_type& registered() {
+ static list_type registered_list;
+ return registered_list;
+ }
+};
+
+#define REGISTER_EXPORTED_FUNCS(classname) \
+ static bool _registered_##classname = \
+ ExportedFuncsRegistry::Register(std::make_shared<classname>())
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/expr_decomposer.cc b/src/arrow/cpp/src/gandiva/expr_decomposer.cc
new file mode 100644
index 000000000..1c09d28f5
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/expr_decomposer.cc
@@ -0,0 +1,310 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/expr_decomposer.h"
+
+#include <memory>
+#include <stack>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "gandiva/annotator.h"
+#include "gandiva/dex.h"
+#include "gandiva/function_holder_registry.h"
+#include "gandiva/function_registry.h"
+#include "gandiva/function_signature.h"
+#include "gandiva/in_holder.h"
+#include "gandiva/node.h"
+
+namespace gandiva {
+
+// Decompose a field node - simply separate out validity & value arrays.
+Status ExprDecomposer::Visit(const FieldNode& node) {
+ auto desc = annotator_.CheckAndAddInputFieldDescriptor(node.field());
+
+ DexPtr validity_dex = std::make_shared<VectorReadValidityDex>(desc);
+ DexPtr value_dex;
+ if (desc->HasOffsetsIdx()) {
+ value_dex = std::make_shared<VectorReadVarLenValueDex>(desc);
+ } else {
+ value_dex = std::make_shared<VectorReadFixedLenValueDex>(desc);
+ }
+ result_ = std::make_shared<ValueValidityPair>(validity_dex, value_dex);
+ return Status::OK();
+}
+
+// Try and optimize a function node, by substituting with cheaper alternatives.
+// eg. replacing 'like' with 'starts_with' can save function calls at evaluation
+// time.
+const FunctionNode ExprDecomposer::TryOptimize(const FunctionNode& node) {
+ if (node.descriptor()->name() == "like") {
+ return LikeHolder::TryOptimize(node);
+ } else {
+ return node;
+ }
+}
+
+// Decompose a function node - wherever possible, merge the validity vectors of the
+// child nodes.
+Status ExprDecomposer::Visit(const FunctionNode& in_node) {
+ auto node = TryOptimize(in_node);
+ auto desc = node.descriptor();
+ FunctionSignature signature(desc->name(), desc->params(), desc->return_type());
+ const NativeFunction* native_function = registry_.LookupSignature(signature);
+ DCHECK(native_function) << "Missing Signature " << signature.ToString();
+
+ // decompose the children.
+ std::vector<ValueValidityPairPtr> args;
+ for (auto& child : node.children()) {
+ auto status = child->Accept(*this);
+ ARROW_RETURN_NOT_OK(status);
+
+ args.push_back(result());
+ }
+
+ // Make a function holder, if required.
+ std::shared_ptr<FunctionHolder> holder;
+ if (native_function->NeedsFunctionHolder()) {
+ auto status = FunctionHolderRegistry::Make(desc->name(), node, &holder);
+ ARROW_RETURN_NOT_OK(status);
+ }
+
+ if (native_function->result_nullable_type() == kResultNullIfNull) {
+ // These functions are decomposable, merge the validity bits of the children.
+
+ std::vector<DexPtr> merged_validity;
+ for (auto& decomposed : args) {
+ // Merge the validity_expressions of the children to build a combined validity
+ // expression.
+ merged_validity.insert(merged_validity.end(), decomposed->validity_exprs().begin(),
+ decomposed->validity_exprs().end());
+ }
+
+ auto value_dex =
+ std::make_shared<NonNullableFuncDex>(desc, native_function, holder, args);
+ result_ = std::make_shared<ValueValidityPair>(merged_validity, value_dex);
+ } else if (native_function->result_nullable_type() == kResultNullNever) {
+ // These functions always output valid results. So, no validity dex.
+ auto value_dex =
+ std::make_shared<NullableNeverFuncDex>(desc, native_function, holder, args);
+ result_ = std::make_shared<ValueValidityPair>(value_dex);
+ } else {
+ DCHECK(native_function->result_nullable_type() == kResultNullInternal);
+
+ // Add a local bitmap to track the output validity.
+ int local_bitmap_idx = annotator_.AddLocalBitMap();
+ auto validity_dex = std::make_shared<LocalBitMapValidityDex>(local_bitmap_idx);
+
+ auto value_dex = std::make_shared<NullableInternalFuncDex>(
+ desc, native_function, holder, args, local_bitmap_idx);
+ result_ = std::make_shared<ValueValidityPair>(validity_dex, value_dex);
+ }
+ return Status::OK();
+}
+
+// Decompose an IfNode
+Status ExprDecomposer::Visit(const IfNode& node) {
+ // nested_if_else_ might get overwritten when visiting the condition-node, so
+ // saving the value to a local variable and resetting nested_if_else_ to false
+ bool svd_nested_if_else = nested_if_else_;
+ nested_if_else_ = false;
+
+ PushConditionEntry(node);
+ auto status = node.condition()->Accept(*this);
+ ARROW_RETURN_NOT_OK(status);
+ auto condition_vv = result();
+ PopConditionEntry(node);
+
+ // Add a local bitmap to track the output validity.
+ int local_bitmap_idx = PushThenEntry(node, svd_nested_if_else);
+ status = node.then_node()->Accept(*this);
+ ARROW_RETURN_NOT_OK(status);
+ auto then_vv = result();
+ PopThenEntry(node);
+
+ PushElseEntry(node, local_bitmap_idx);
+ nested_if_else_ = (dynamic_cast<IfNode*>(node.else_node().get()) != nullptr);
+
+ status = node.else_node()->Accept(*this);
+ ARROW_RETURN_NOT_OK(status);
+ auto else_vv = result();
+ bool is_terminal_else = PopElseEntry(node);
+
+ auto validity_dex = std::make_shared<LocalBitMapValidityDex>(local_bitmap_idx);
+ auto value_dex =
+ std::make_shared<IfDex>(condition_vv, then_vv, else_vv, node.return_type(),
+ local_bitmap_idx, is_terminal_else);
+
+ result_ = std::make_shared<ValueValidityPair>(validity_dex, value_dex);
+ return Status::OK();
+}
+
+// Decompose a BooleanNode
+Status ExprDecomposer::Visit(const BooleanNode& node) {
+ // decompose the children.
+ std::vector<ValueValidityPairPtr> args;
+ for (auto& child : node.children()) {
+ auto status = child->Accept(*this);
+ ARROW_RETURN_NOT_OK(status);
+
+ args.push_back(result());
+ }
+
+ // Add a local bitmap to track the output validity.
+ int local_bitmap_idx = annotator_.AddLocalBitMap();
+ auto validity_dex = std::make_shared<LocalBitMapValidityDex>(local_bitmap_idx);
+
+ std::shared_ptr<BooleanDex> value_dex;
+ switch (node.expr_type()) {
+ case BooleanNode::AND:
+ value_dex = std::make_shared<BooleanAndDex>(args, local_bitmap_idx);
+ break;
+ case BooleanNode::OR:
+ value_dex = std::make_shared<BooleanOrDex>(args, local_bitmap_idx);
+ break;
+ }
+ result_ = std::make_shared<ValueValidityPair>(validity_dex, value_dex);
+ return Status::OK();
+}
+Status ExprDecomposer::Visit(const InExpressionNode<gandiva::DecimalScalar128>& node) {
+ /* decompose the children. */
+ std::vector<ValueValidityPairPtr> args;
+ auto status = node.eval_expr()->Accept(*this);
+ ARROW_RETURN_NOT_OK(status);
+ args.push_back(result());
+ /* In always outputs valid results, so no validity dex */
+ auto value_dex = std::make_shared<InExprDex<gandiva::DecimalScalar128>>(
+ args, node.values(), node.get_precision(), node.get_scale());
+ result_ = std::make_shared<ValueValidityPair>(value_dex);
+ return Status::OK();
+}
+
+#define MAKE_VISIT_IN(ctype) \
+ Status ExprDecomposer::Visit(const InExpressionNode<ctype>& node) { \
+ /* decompose the children. */ \
+ std::vector<ValueValidityPairPtr> args; \
+ auto status = node.eval_expr()->Accept(*this); \
+ ARROW_RETURN_NOT_OK(status); \
+ args.push_back(result()); \
+ /* In always outputs valid results, so no validity dex */ \
+ auto value_dex = std::make_shared<InExprDex<ctype>>(args, node.values()); \
+ result_ = std::make_shared<ValueValidityPair>(value_dex); \
+ return Status::OK(); \
+ }
+
+MAKE_VISIT_IN(int32_t);
+MAKE_VISIT_IN(int64_t);
+MAKE_VISIT_IN(float);
+MAKE_VISIT_IN(double);
+MAKE_VISIT_IN(std::string);
+
+Status ExprDecomposer::Visit(const LiteralNode& node) {
+ auto value_dex = std::make_shared<LiteralDex>(node.return_type(), node.holder());
+ DexPtr validity_dex;
+ if (node.is_null()) {
+ validity_dex = std::make_shared<FalseDex>();
+ } else {
+ validity_dex = std::make_shared<TrueDex>();
+ }
+ result_ = std::make_shared<ValueValidityPair>(validity_dex, value_dex);
+ return Status::OK();
+}
+
+// The below functions use a stack to detect :
+// a. nested if-else expressions.
+// In such cases, the local bitmap can be re-used.
+// b. detect terminal else expressions
+// The non-terminal else expressions do not need to track validity (the if statement
+// that has a match will do it).
+// Both of the above optimisations save CPU cycles during expression evaluation.
+
+int ExprDecomposer::PushThenEntry(const IfNode& node, bool reuse_bitmap) {
+ int local_bitmap_idx;
+
+ if (reuse_bitmap) {
+ // we also need stack in addition to reuse_bitmap flag since we
+ // can also enter other if-else nodes when we visit the condition-node
+ // (which themselves might be nested) before we visit then-node
+ DCHECK_EQ(if_entries_stack_.empty(), false) << "PushThenEntry: stack is empty";
+ DCHECK_EQ(if_entries_stack_.top()->entry_type_, kStackEntryElse)
+ << "PushThenEntry: top of stack is not of type entry_else";
+ auto top = if_entries_stack_.top().get();
+
+ // inside a nested else statement (i.e if-else-if). use the parent's bitmap.
+ local_bitmap_idx = top->local_bitmap_idx_;
+
+ // clear the is_terminal bit in the current top entry (else).
+ top->is_terminal_else_ = false;
+ } else {
+ // alloc a new bitmap.
+ local_bitmap_idx = annotator_.AddLocalBitMap();
+ }
+
+ // push new entry to the stack.
+ std::unique_ptr<IfStackEntry> entry(new IfStackEntry(
+ node, kStackEntryThen, false /*is_terminal_else*/, local_bitmap_idx));
+ if_entries_stack_.emplace(std::move(entry));
+ return local_bitmap_idx;
+}
+
+void ExprDecomposer::PopThenEntry(const IfNode& node) {
+ DCHECK_EQ(if_entries_stack_.empty(), false) << "PopThenEntry: found empty stack";
+
+ auto top = if_entries_stack_.top().get();
+ DCHECK_EQ(top->entry_type_, kStackEntryThen)
+ << "PopThenEntry: found " << top->entry_type_ << " expected then";
+ DCHECK_EQ(&top->if_node_, &node) << "PopThenEntry: found mismatched node";
+
+ if_entries_stack_.pop();
+}
+
+void ExprDecomposer::PushElseEntry(const IfNode& node, int local_bitmap_idx) {
+ std::unique_ptr<IfStackEntry> entry(new IfStackEntry(
+ node, kStackEntryElse, true /*is_terminal_else*/, local_bitmap_idx));
+ if_entries_stack_.emplace(std::move(entry));
+}
+
+bool ExprDecomposer::PopElseEntry(const IfNode& node) {
+ DCHECK_EQ(if_entries_stack_.empty(), false) << "PopElseEntry: found empty stack";
+
+ auto top = if_entries_stack_.top().get();
+ DCHECK_EQ(top->entry_type_, kStackEntryElse)
+ << "PopElseEntry: found " << top->entry_type_ << " expected else";
+ DCHECK_EQ(&top->if_node_, &node) << "PopElseEntry: found mismatched node";
+ bool is_terminal_else = top->is_terminal_else_;
+
+ if_entries_stack_.pop();
+ return is_terminal_else;
+}
+
+void ExprDecomposer::PushConditionEntry(const IfNode& node) {
+ std::unique_ptr<IfStackEntry> entry(new IfStackEntry(node, kStackEntryCondition));
+ if_entries_stack_.emplace(std::move(entry));
+}
+
+void ExprDecomposer::PopConditionEntry(const IfNode& node) {
+ DCHECK_EQ(if_entries_stack_.empty(), false) << "PopConditionEntry: found empty stack";
+
+ auto top = if_entries_stack_.top().get();
+ DCHECK_EQ(top->entry_type_, kStackEntryCondition)
+ << "PopConditionEntry: found " << top->entry_type_ << " expected condition";
+ DCHECK_EQ(&top->if_node_, &node) << "PopConditionEntry: found mismatched node";
+ if_entries_stack_.pop();
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/expr_decomposer.h b/src/arrow/cpp/src/gandiva/expr_decomposer.h
new file mode 100644
index 000000000..f68b8a8fc
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/expr_decomposer.h
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cmath>
+#include <memory>
+#include <stack>
+#include <string>
+#include <utility>
+
+#include "gandiva/arrow.h"
+#include "gandiva/expression.h"
+#include "gandiva/node.h"
+#include "gandiva/node_visitor.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+class FunctionRegistry;
+class Annotator;
+
+/// \brief Decomposes an expression tree to separate out the validity and
+/// value expressions.
+class GANDIVA_EXPORT ExprDecomposer : public NodeVisitor {
+ public:
+ explicit ExprDecomposer(const FunctionRegistry& registry, Annotator& annotator)
+ : registry_(registry), annotator_(annotator), nested_if_else_(false) {}
+
+ Status Decompose(const Node& root, ValueValidityPairPtr* out) {
+ auto status = root.Accept(*this);
+ if (status.ok()) {
+ *out = std::move(result_);
+ }
+ return status;
+ }
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(ExprDecomposer);
+
+ FRIEND_TEST(TestExprDecomposer, TestStackSimple);
+ FRIEND_TEST(TestExprDecomposer, TestNested);
+ FRIEND_TEST(TestExprDecomposer, TestInternalIf);
+ FRIEND_TEST(TestExprDecomposer, TestParallelIf);
+ FRIEND_TEST(TestExprDecomposer, TestIfInCondition);
+ FRIEND_TEST(TestExprDecomposer, TestFunctionBetweenNestedIf);
+ FRIEND_TEST(TestExprDecomposer, TestComplexIfCondition);
+
+ Status Visit(const FieldNode& node) override;
+ Status Visit(const FunctionNode& node) override;
+ Status Visit(const IfNode& node) override;
+ Status Visit(const LiteralNode& node) override;
+ Status Visit(const BooleanNode& node) override;
+ Status Visit(const InExpressionNode<int32_t>& node) override;
+ Status Visit(const InExpressionNode<int64_t>& node) override;
+ Status Visit(const InExpressionNode<float>& node) override;
+ Status Visit(const InExpressionNode<double>& node) override;
+ Status Visit(const InExpressionNode<gandiva::DecimalScalar128>& node) override;
+ Status Visit(const InExpressionNode<std::string>& node) override;
+
+ // Optimize a function node, if possible.
+ const FunctionNode TryOptimize(const FunctionNode& node);
+
+ enum StackEntryType { kStackEntryCondition, kStackEntryThen, kStackEntryElse };
+
+ // stack of if nodes.
+ class IfStackEntry {
+ public:
+ IfStackEntry(const IfNode& if_node, StackEntryType entry_type,
+ bool is_terminal_else = false, int local_bitmap_idx = 0)
+ : if_node_(if_node),
+ entry_type_(entry_type),
+ is_terminal_else_(is_terminal_else),
+ local_bitmap_idx_(local_bitmap_idx) {}
+
+ const IfNode& if_node_;
+ StackEntryType entry_type_;
+ bool is_terminal_else_;
+ int local_bitmap_idx_;
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(IfStackEntry);
+ };
+
+ // push 'condition entry' into stack.
+ void PushConditionEntry(const IfNode& node);
+
+ // pop 'condition entry' from stack.
+ void PopConditionEntry(const IfNode& node);
+
+ // push 'then entry' to stack. returns either a new local bitmap or the parent's
+ // bitmap (in case of nested if-else).
+ int PushThenEntry(const IfNode& node, bool reuse_bitmap);
+
+ // pop 'then entry' from stack.
+ void PopThenEntry(const IfNode& node);
+
+ // push 'else entry' into stack.
+ void PushElseEntry(const IfNode& node, int local_bitmap_idx);
+
+ // pop 'else entry' from stack. returns 'true' if this is a terminal else condition
+ // i.e no nested if condition below this node.
+ bool PopElseEntry(const IfNode& node);
+
+ ValueValidityPairPtr result() { return std::move(result_); }
+
+ const FunctionRegistry& registry_;
+ Annotator& annotator_;
+ std::stack<std::unique_ptr<IfStackEntry>> if_entries_stack_;
+ ValueValidityPairPtr result_;
+ bool nested_if_else_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/expr_decomposer_test.cc b/src/arrow/cpp/src/gandiva/expr_decomposer_test.cc
new file mode 100644
index 000000000..638ceebcb
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/expr_decomposer_test.cc
@@ -0,0 +1,409 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/expr_decomposer.h"
+
+#include <gtest/gtest.h>
+
+#include "gandiva/annotator.h"
+#include "gandiva/dex.h"
+#include "gandiva/function_registry.h"
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/node.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::int32;
+
+// Test fixture: owns the FunctionRegistry that each test passes to the
+// ExprDecomposer it constructs.
+class TestExprDecomposer : public ::testing::Test {
+ protected:
+ FunctionRegistry registry_;
+};
+
+// A single if/else with nothing nested: the then-entry is expected to get
+// bitmap 0 and the else-entry is expected to be terminal.
+TEST_F(TestExprDecomposer, TestStackSimple) {
+  Annotator annotator;
+  ExprDecomposer decomposer(registry_, annotator);
+
+  // Shape under test:
+  //   if (a) _
+  //   else _
+  IfNode if_a(nullptr, nullptr, nullptr, int32());
+
+  // condition of a
+  decomposer.PushConditionEntry(if_a);
+  decomposer.PopConditionEntry(if_a);
+
+  // then of a
+  const int bitmap_a = decomposer.PushThenEntry(if_a, false);
+  EXPECT_EQ(bitmap_a, 0);
+  decomposer.PopThenEntry(if_a);
+
+  // else of a
+  decomposer.PushElseEntry(if_a, bitmap_a);
+  const bool a_is_terminal = decomposer.PopElseEntry(if_a);
+  EXPECT_TRUE(a_is_terminal);
+
+  EXPECT_TRUE(decomposer.if_entries_stack_.empty());
+}
+
+// An if whose else branch is itself an if: the inner if is expected to reuse
+// the outer bitmap, and the outer else is expected to be non-terminal.
+TEST_F(TestExprDecomposer, TestNested) {
+  Annotator annotator;
+  ExprDecomposer decomposer(registry_, annotator);
+
+  // Shape under test:
+  //   if (a) _
+  //   else _
+  //     if (b) _
+  //     else _
+  IfNode if_a(nullptr, nullptr, nullptr, int32());
+  IfNode if_b(nullptr, nullptr, nullptr, int32());
+
+  // condition of a
+  decomposer.PushConditionEntry(if_a);
+  decomposer.PopConditionEntry(if_a);
+
+  // then of a
+  const int bitmap_a = decomposer.PushThenEntry(if_a, false);
+  EXPECT_EQ(bitmap_a, 0);
+  decomposer.PopThenEntry(if_a);
+
+  // else of a, which contains b
+  decomposer.PushElseEntry(if_a, bitmap_a);
+
+  {  // begin b
+    decomposer.PushConditionEntry(if_b);
+    decomposer.PopConditionEntry(if_b);
+
+    const int bitmap_b = decomposer.PushThenEntry(if_b, true);
+    EXPECT_EQ(bitmap_b, 0);  // the bitmap must be reused.
+    decomposer.PopThenEntry(if_b);
+
+    decomposer.PushElseEntry(if_b, bitmap_b);
+    const bool b_is_terminal = decomposer.PopElseEntry(if_b);
+    EXPECT_TRUE(b_is_terminal);
+  }  // end b
+
+  const bool a_is_terminal = decomposer.PopElseEntry(if_a);
+  EXPECT_FALSE(a_is_terminal);  // an if was nested in the else.
+
+  EXPECT_TRUE(decomposer.if_entries_stack_.empty());
+}
+
+// An if nested inside the then branch: the inner if is expected to get its own
+// bitmap, and the outer else is expected to stay terminal.
+TEST_F(TestExprDecomposer, TestInternalIf) {
+  Annotator annotator;
+  ExprDecomposer decomposer(registry_, annotator);
+
+  // Shape under test:
+  //   if (a) _
+  //     if (b) _
+  //     else _
+  //   else _
+  IfNode if_a(nullptr, nullptr, nullptr, int32());
+  IfNode if_b(nullptr, nullptr, nullptr, int32());
+
+  // condition of a
+  decomposer.PushConditionEntry(if_a);
+  decomposer.PopConditionEntry(if_a);
+
+  // then of a, which contains b
+  const int bitmap_a = decomposer.PushThenEntry(if_a, false);
+  EXPECT_EQ(bitmap_a, 0);
+
+  {  // begin b
+    decomposer.PushConditionEntry(if_b);
+    decomposer.PopConditionEntry(if_b);
+
+    const int bitmap_b = decomposer.PushThenEntry(if_b, false);
+    EXPECT_EQ(bitmap_b, 1);  // the bitmap must not be reused.
+    decomposer.PopThenEntry(if_b);
+
+    decomposer.PushElseEntry(if_b, bitmap_b);
+    const bool b_is_terminal = decomposer.PopElseEntry(if_b);
+    EXPECT_TRUE(b_is_terminal);
+  }  // end b
+
+  decomposer.PopThenEntry(if_a);
+
+  // else of a
+  decomposer.PushElseEntry(if_a, bitmap_a);
+  const bool a_is_terminal = decomposer.PopElseEntry(if_a);
+  EXPECT_TRUE(a_is_terminal);  // no if was nested in the else.
+
+  EXPECT_TRUE(decomposer.if_entries_stack_.empty());
+}
+
+// Two sibling if/else nodes processed one after the other: each is expected to
+// get a distinct bitmap and each else is expected to be terminal.
+TEST_F(TestExprDecomposer, TestParallelIf) {
+  Annotator annotator;
+  ExprDecomposer decomposer(registry_, annotator);
+
+  // Shape under test:
+  //   if (a) _
+  //   else _
+  //   if (b) _
+  //   else _
+  IfNode if_a(nullptr, nullptr, nullptr, int32());
+  IfNode if_b(nullptr, nullptr, nullptr, int32());
+
+  // ---- a ----
+  decomposer.PushConditionEntry(if_a);
+  decomposer.PopConditionEntry(if_a);
+
+  const int bitmap_a = decomposer.PushThenEntry(if_a, false);
+  EXPECT_EQ(bitmap_a, 0);
+  decomposer.PopThenEntry(if_a);
+
+  decomposer.PushElseEntry(if_a, bitmap_a);
+  const bool a_is_terminal = decomposer.PopElseEntry(if_a);
+  EXPECT_TRUE(a_is_terminal);  // no if was nested in the else.
+
+  // ---- b ----
+  decomposer.PushConditionEntry(if_b);
+  decomposer.PopConditionEntry(if_b);
+
+  const int bitmap_b = decomposer.PushThenEntry(if_b, false);
+  EXPECT_EQ(bitmap_b, 1);  // the bitmap must not be reused.
+  decomposer.PopThenEntry(if_b);
+
+  decomposer.PushElseEntry(if_b, bitmap_b);
+  const bool b_is_terminal = decomposer.PopElseEntry(if_b);
+  EXPECT_TRUE(b_is_terminal);
+
+  EXPECT_TRUE(decomposer.if_entries_stack_.empty());
+}
+
+// If-nodes appearing inside the condition of other if-nodes: the ifs inside
+// conditions are expected to get fresh bitmaps, while the if nested in the
+// outer else is expected to reuse the outer bitmap.
+TEST_F(TestExprDecomposer, TestIfInCondition) {
+  Annotator annotator;
+  ExprDecomposer decomposer(registry_, annotator);
+
+  // Shape under test:
+  //   if (if _ else _) : a
+  //     -
+  //   else
+  //     if (if _ else _) : b
+  //       -
+  //     else
+  //       -
+  IfNode if_a(nullptr, nullptr, nullptr, int32());
+  IfNode if_b(nullptr, nullptr, nullptr, int32());
+  IfNode if_in_cond_a(nullptr, nullptr, nullptr, int32());
+  IfNode if_in_cond_b(nullptr, nullptr, nullptr, int32());
+
+  // begin a
+  decomposer.PushConditionEntry(if_a);
+  {
+    // the if that lives inside a's condition
+    decomposer.PushConditionEntry(if_in_cond_a);
+    decomposer.PopConditionEntry(if_in_cond_a);
+
+    const int bitmap_cond_a = decomposer.PushThenEntry(if_in_cond_a, false);
+    EXPECT_EQ(bitmap_cond_a, 0);
+    decomposer.PopThenEntry(if_in_cond_a);
+
+    decomposer.PushElseEntry(if_in_cond_a, bitmap_cond_a);
+    const bool cond_a_is_terminal = decomposer.PopElseEntry(if_in_cond_a);
+    EXPECT_TRUE(cond_a_is_terminal);  // no if was nested in the else.
+  }
+  decomposer.PopConditionEntry(if_a);
+
+  const int bitmap_a = decomposer.PushThenEntry(if_a, false);
+  EXPECT_EQ(bitmap_a, 1);  // no reuse
+  decomposer.PopThenEntry(if_a);
+
+  decomposer.PushElseEntry(if_a, bitmap_a);
+
+  {  // begin b
+    decomposer.PushConditionEntry(if_b);
+    {
+      // the if that lives inside b's condition
+      decomposer.PushConditionEntry(if_in_cond_b);
+      decomposer.PopConditionEntry(if_in_cond_b);
+
+      const int bitmap_cond_b = decomposer.PushThenEntry(if_in_cond_b, false);
+      EXPECT_EQ(bitmap_cond_b, 2);  // no reuse
+      decomposer.PopThenEntry(if_in_cond_b);
+
+      decomposer.PushElseEntry(if_in_cond_b, bitmap_cond_b);
+      const bool cond_b_is_terminal = decomposer.PopElseEntry(if_in_cond_b);
+      EXPECT_TRUE(cond_b_is_terminal);  // no if was nested in the else.
+    }
+    decomposer.PopConditionEntry(if_b);
+
+    const int bitmap_b = decomposer.PushThenEntry(if_b, true);
+    EXPECT_EQ(bitmap_b, 1);  // the bitmap must be reused.
+    decomposer.PopThenEntry(if_b);
+
+    decomposer.PushElseEntry(if_b, bitmap_b);
+    const bool b_is_terminal = decomposer.PopElseEntry(if_b);
+    EXPECT_TRUE(b_is_terminal);
+  }  // end b
+
+  const bool a_is_terminal = decomposer.PopElseEntry(if_a);
+  EXPECT_FALSE(a_is_terminal);  // an if was nested in the else.
+
+  EXPECT_TRUE(decomposer.if_entries_stack_.empty());
+}
+
+// An if that sits under a function call inside the outer else: it is pushed
+// without bitmap reuse, is expected to get a new bitmap, and the outer else is
+// expected to remain terminal.
+TEST_F(TestExprDecomposer, TestFunctionBetweenNestedIf) {
+  Annotator annotator;
+  ExprDecomposer decomposer(registry_, annotator);
+
+  // Shape under test:
+  //   if (a) _
+  //   else
+  //     function(
+  //       if (b) _
+  //       else _
+  //     )
+  IfNode if_a(nullptr, nullptr, nullptr, int32());
+  IfNode if_b(nullptr, nullptr, nullptr, int32());
+
+  // begin outer if
+  decomposer.PushConditionEntry(if_a);
+  decomposer.PopConditionEntry(if_a);
+
+  const int bitmap_a = decomposer.PushThenEntry(if_a, false);
+  EXPECT_EQ(bitmap_a, 0);
+  decomposer.PopThenEntry(if_a);
+
+  decomposer.PushElseEntry(if_a, bitmap_a);
+  {  // begin b
+    decomposer.PushConditionEntry(if_b);
+    decomposer.PopConditionEntry(if_b);
+
+    // b is not the direct else node of the parent if.
+    const int bitmap_b = decomposer.PushThenEntry(if_b, false);
+    EXPECT_EQ(bitmap_b, 1);  // the bitmap can't be reused.
+    decomposer.PopThenEntry(if_b);
+
+    decomposer.PushElseEntry(if_b, bitmap_b);
+    EXPECT_TRUE(decomposer.PopElseEntry(if_b));
+  }
+  EXPECT_TRUE(decomposer.PopElseEntry(if_a));  // a's else is also terminal
+
+  EXPECT_TRUE(decomposer.if_entries_stack_.empty());
+}
+
+// An outer if whose condition, then branch and else branch each contain an
+// if/else-if chain. Checks the bitmap index expected for every then-entry and
+// the terminal flag expected for every else-entry.
+TEST_F(TestExprDecomposer, TestComplexIfCondition) {
+  Annotator annotator;
+  ExprDecomposer decomposer(registry_, annotator);
+
+  // Shape under test:
+  //   if (if _
+  //       else
+  //         if _
+  //         else _
+  //      )
+  //   then
+  //     if _
+  //     else
+  //       if _
+  //       else _
+  //
+  //   else
+  //     if _
+  //     else
+  //       if _
+  //       else _
+
+  IfNode outer(nullptr, nullptr, nullptr, int32());
+
+  IfNode cond_if(nullptr, nullptr, nullptr, int32());
+  IfNode cond_inner_if(nullptr, nullptr, nullptr, int32());
+
+  IfNode then_if(nullptr, nullptr, nullptr, int32());
+  IfNode then_inner_if(nullptr, nullptr, nullptr, int32());
+
+  IfNode else_if(nullptr, nullptr, nullptr, int32());
+  IfNode else_inner_if(nullptr, nullptr, nullptr, int32());
+
+  // begin outer if
+  decomposer.PushConditionEntry(outer);
+  {
+    // the if chain inside the condition of the outer if
+    decomposer.PushConditionEntry(cond_if);
+    decomposer.PopConditionEntry(cond_if);
+
+    const int bitmap_cond = decomposer.PushThenEntry(cond_if, false);
+    EXPECT_EQ(bitmap_cond, 0);
+    decomposer.PopThenEntry(cond_if);
+
+    decomposer.PushElseEntry(cond_if, bitmap_cond);
+    {
+      decomposer.PushConditionEntry(cond_inner_if);
+      decomposer.PopConditionEntry(cond_inner_if);
+
+      const int bitmap_cond_inner = decomposer.PushThenEntry(cond_inner_if, true);
+      // expect the bitmap to be reused since this is a nested if-else
+      EXPECT_EQ(bitmap_cond_inner, 0);
+      decomposer.PopThenEntry(cond_inner_if);
+
+      decomposer.PushElseEntry(cond_inner_if, bitmap_cond_inner);
+      const bool inner_is_terminal = decomposer.PopElseEntry(cond_inner_if);
+      EXPECT_TRUE(inner_is_terminal);
+    }
+    const bool cond_is_terminal = decomposer.PopElseEntry(cond_if);
+    EXPECT_FALSE(cond_is_terminal);
+  }
+  decomposer.PopConditionEntry(outer);
+
+  const int bitmap_outer = decomposer.PushThenEntry(outer, false);
+  EXPECT_EQ(bitmap_outer, 1);
+
+  {
+    // the if chain inside the then node of the outer if
+    decomposer.PushConditionEntry(then_if);
+    decomposer.PopConditionEntry(then_if);
+
+    const int bitmap_then = decomposer.PushThenEntry(then_if, false);
+    EXPECT_EQ(bitmap_then, 2);
+    decomposer.PopThenEntry(then_if);
+
+    decomposer.PushElseEntry(then_if, bitmap_then);
+    {
+      decomposer.PushConditionEntry(then_inner_if);
+      decomposer.PopConditionEntry(then_inner_if);
+
+      const int bitmap_then_inner = decomposer.PushThenEntry(then_inner_if, true);
+      // expect the bitmap to be reused since this is a nested if-else
+      EXPECT_EQ(bitmap_then_inner, 2);
+      decomposer.PopThenEntry(then_inner_if);
+
+      decomposer.PushElseEntry(then_inner_if, bitmap_then_inner);
+      const bool inner_is_terminal = decomposer.PopElseEntry(then_inner_if);
+      EXPECT_TRUE(inner_is_terminal);
+    }
+    const bool then_is_terminal = decomposer.PopElseEntry(then_if);
+    EXPECT_FALSE(then_is_terminal);
+  }
+  decomposer.PopThenEntry(outer);
+
+  decomposer.PushElseEntry(outer, bitmap_outer);
+  {
+    // the if chain inside the else node of the outer if
+    decomposer.PushConditionEntry(else_if);
+    decomposer.PopConditionEntry(else_if);
+
+    // the else node is another if-node, so it reuses the outer if's bitmap
+    const int bitmap_else = decomposer.PushThenEntry(else_if, true);
+    EXPECT_EQ(bitmap_else, 1);
+    decomposer.PopThenEntry(else_if);
+
+    decomposer.PushElseEntry(else_if, bitmap_else);
+    {
+      decomposer.PushConditionEntry(else_inner_if);
+      decomposer.PopConditionEntry(else_inner_if);
+
+      const int bitmap_else_inner = decomposer.PushThenEntry(else_inner_if, true);
+      // expect the bitmap to be reused since this is a nested if-else
+      EXPECT_EQ(bitmap_else_inner, 1);
+      decomposer.PopThenEntry(else_inner_if);
+
+      decomposer.PushElseEntry(else_inner_if, bitmap_else_inner);
+      const bool inner_is_terminal = decomposer.PopElseEntry(else_inner_if);
+      EXPECT_TRUE(inner_is_terminal);
+    }
+    const bool else_is_terminal = decomposer.PopElseEntry(else_if);
+    EXPECT_FALSE(else_is_terminal);
+  }
+  const bool outer_is_terminal = decomposer.PopElseEntry(outer);
+  EXPECT_FALSE(outer_is_terminal);
+  EXPECT_TRUE(decomposer.if_entries_stack_.empty());
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/expr_validator.cc b/src/arrow/cpp/src/gandiva/expr_validator.cc
new file mode 100644
index 000000000..c3c784c95
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/expr_validator.cc
@@ -0,0 +1,193 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "gandiva/expr_validator.h"
+
+namespace gandiva {
+
+// Validates a whole expression: rejects a null expression, runs this visitor
+// over the tree rooted at expr->root(), then checks that the root's return type
+// equals the type of the expression's result field.
+Status ExprValidator::Validate(const ExpressionPtr& expr) {
+  ARROW_RETURN_IF(expr == nullptr,
+                  Status::ExpressionValidationError("Expression cannot be null"));
+
+  Node& root_node = *expr->root();
+  ARROW_RETURN_NOT_OK(root_node.Accept(*this));
+
+  // Only the types are compared here; whether the root type is supported was
+  // already established while visiting the tree.
+  const DataTypePtr root_type = root_node.return_type();
+  const DataTypePtr result_type = expr->result()->type();
+  ARROW_RETURN_IF(
+      !root_type->Equals(*result_type),
+      Status::ExpressionValidationError("Return type of root node ",
+                                        root_type->ToString(),
+                                        " does not match that of expression ",
+                                        result_type->ToString()));
+
+  return Status::OK();
+}
+
+// Validates a field reference: its data type must map to an IR type, a field of
+// the same name must exist in the schema, and that schema field must equal the
+// field used in the expression.
+Status ExprValidator::Visit(const FieldNode& node) {
+  const auto expr_field = node.field();
+
+  ARROW_RETURN_IF(types_->IRType(node.return_type()->id()) == nullptr,
+                  Status::ExpressionValidationError("Field ", expr_field->name(),
+                                                    " has unsupported data type ",
+                                                    node.return_type()->name()));
+
+  // The field has to be present in the schema.
+  const auto schema_entry = field_map_.find(expr_field->name());
+  ARROW_RETURN_IF(schema_entry == field_map_.end(),
+                  Status::ExpressionValidationError("Field ", expr_field->name(),
+                                                    " not in schema."));
+
+  // The field found in the schema has to match the one in the expression.
+  const FieldPtr schema_field = schema_entry->second;
+  ARROW_RETURN_IF(!schema_field->Equals(expr_field),
+                  Status::ExpressionValidationError(
+                      "Field definition in schema ", schema_field->ToString(),
+                      " different from field in expression ", expr_field->ToString()));
+
+  return Status::OK();
+}
+
+// Validates a function call: the signature built from the node's descriptor
+// must be known to the registry, and then every child node must validate.
+Status ExprValidator::Visit(const FunctionNode& node) {
+  auto descriptor = node.descriptor();
+  FunctionSignature wanted(descriptor->name(), descriptor->params(),
+                           descriptor->return_type());
+
+  const NativeFunction* match = registry_.LookupSignature(wanted);
+  ARROW_RETURN_IF(match == nullptr,
+                  Status::ExpressionValidationError("Function ", wanted.ToString(),
+                                                    " not supported yet. "));
+
+  // Arguments are validated only after the function itself was found.
+  for (auto& argument : node.children()) {
+    ARROW_RETURN_NOT_OK(argument->Accept(*this));
+  }
+
+  return Status::OK();
+}
+
+// Validates an if-node: condition, then and else subtrees are validated first;
+// afterwards the condition must be boolean and both branches must return the
+// same type as the if-node itself.
+Status ExprValidator::Visit(const IfNode& node) {
+  const auto& condition = node.condition();
+  const auto& then_branch = node.then_node();
+  const auto& else_branch = node.else_node();
+
+  ARROW_RETURN_NOT_OK(condition->Accept(*this));
+  ARROW_RETURN_NOT_OK(then_branch->Accept(*this));
+  ARROW_RETURN_NOT_OK(else_branch->Accept(*this));
+
+  auto if_type = node.return_type();
+  auto then_type = then_branch->return_type();
+  auto else_type = else_branch->return_type();
+
+  // The condition has to be boolean.
+  ARROW_RETURN_IF(
+      !condition->return_type()->Equals(arrow::boolean()),
+      Status::ExpressionValidationError("condition must be of boolean type, found type ",
+                                        condition->return_type()->ToString()));
+
+  // The then-branch has to return the if-node's type.
+  ARROW_RETURN_IF(!if_type->Equals(*then_type),
+                  Status::ExpressionValidationError(
+                      "Return type of if ", if_type->ToString(), " and then ",
+                      then_type->ToString(), " not matching."));
+
+  // The else-branch has to return the if-node's type.
+  ARROW_RETURN_IF(!if_type->Equals(*else_type),
+                  Status::ExpressionValidationError(
+                      "Return type of if ", if_type->ToString(), " and else ",
+                      else_type->ToString(), " not matching."));
+
+  return Status::OK();
+}
+
+// Validates a literal: its data type must map to an IR type.
+Status ExprValidator::Visit(const LiteralNode& node) {
+  const auto literal_type = node.return_type();
+  ARROW_RETURN_IF(types_->IRType(literal_type->id()) == nullptr,
+                  Status::ExpressionValidationError("Value ", ToString(node.holder()),
+                                                    " has unsupported data type ",
+                                                    literal_type->name()));
+
+  return Status::OK();
+}
+
+// Validates a boolean expression: it needs at least two children, and each
+// child must return boolean and must itself validate. Children are handled in
+// order; a child's type is checked before the child is visited.
+Status ExprValidator::Visit(const BooleanNode& node) {
+  const auto& operands = node.children();
+  ARROW_RETURN_IF(
+      operands.size() < 2,
+      Status::ExpressionValidationError("Boolean expression has ", operands.size(),
+                                        " children, expected at least two"));
+
+  const auto boolean_type = arrow::boolean();
+  for (auto& operand : operands) {
+    const auto operand_type = operand->return_type();
+
+    ARROW_RETURN_IF(!operand_type->Equals(boolean_type),
+                    Status::ExpressionValidationError(
+                        "Boolean expression has a child with return type ",
+                        operand_type->ToString(), ", expected return type boolean"));
+
+    ARROW_RETURN_NOT_OK(operand->Accept(*this));
+  }
+
+  return Status::OK();
+}
+
+/*
+ * IN-expression visitors, one overload per supported value type.
+ *
+ * Each overload forwards to ValidateInExpression, which checks that
+ *
+ * 1. the list of constants to search in is not empty, and
+ * 2. the evaluated expression returns the same type as the constants.
+ */
+Status ExprValidator::Visit(const InExpressionNode<int32_t>& node) {
+  const auto value_type = arrow::int32();
+  return ValidateInExpression(node.values().size(), node.eval_expr()->return_type(),
+                              value_type);
+}
+
+Status ExprValidator::Visit(const InExpressionNode<int64_t>& node) {
+  const auto value_type = arrow::int64();
+  return ValidateInExpression(node.values().size(), node.eval_expr()->return_type(),
+                              value_type);
+}
+
+Status ExprValidator::Visit(const InExpressionNode<float>& node) {
+  const auto value_type = arrow::float32();
+  return ValidateInExpression(node.values().size(), node.eval_expr()->return_type(),
+                              value_type);
+}
+
+Status ExprValidator::Visit(const InExpressionNode<double>& node) {
+  const auto value_type = arrow::float64();
+  return ValidateInExpression(node.values().size(), node.eval_expr()->return_type(),
+                              value_type);
+}
+
+// The decimal type is built from the precision and scale stored on the node.
+Status ExprValidator::Visit(const InExpressionNode<gandiva::DecimalScalar128>& node) {
+  const auto value_type = arrow::decimal(node.get_precision(), node.get_scale());
+  return ValidateInExpression(node.values().size(), node.eval_expr()->return_type(),
+                              value_type);
+}
+
+Status ExprValidator::Visit(const InExpressionNode<std::string>& node) {
+  const auto value_type = arrow::utf8();
+  return ValidateInExpression(node.values().size(), node.eval_expr()->return_type(),
+                              value_type);
+}
+
+// Shared checks for every IN expression.
+//
+// number_of_values:    how many constants the IN list holds; must be non-zero.
+// in_expr_return_type: return type of the expression being evaluated.
+// type_of_values:      type of the constants in the IN list.
+//
+// Returns an ExpressionValidationError when the list is empty or when the two
+// types differ, Status::OK() otherwise.
+Status ExprValidator::ValidateInExpression(size_t number_of_values,
+                                           DataTypePtr in_expr_return_type,
+                                           DataTypePtr type_of_values) {
+  ARROW_RETURN_IF(number_of_values == 0,
+                  Status::ExpressionValidationError(
+                      "IN Expression needs a non-empty constant list to match."));
+  // The trailing space after "type" keeps the type name from being glued to
+  // the preceding word in the rendered message.
+  ARROW_RETURN_IF(
+      !in_expr_return_type->Equals(type_of_values),
+      Status::ExpressionValidationError(
+          "Evaluation expression for IN clause returns ", in_expr_return_type->ToString(),
+          " values are of type ", type_of_values->ToString()));
+
+  return Status::OK();
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/expr_validator.h b/src/arrow/cpp/src/gandiva/expr_validator.h
new file mode 100644
index 000000000..daaf50897
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/expr_validator.h
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "arrow/status.h"
+
+#include "gandiva/arrow.h"
+#include "gandiva/expression.h"
+#include "gandiva/function_registry.h"
+#include "gandiva/llvm_types.h"
+#include "gandiva/node.h"
+#include "gandiva/node_visitor.h"
+
+namespace gandiva {
+
+class FunctionRegistry;
+
+/// \brief Validates the entire expression tree including
+/// data types, signatures and return types
+class ExprValidator : public NodeVisitor {
+ public:
+  /// \brief Builds a validator for expressions over `schema`.
+  ///
+  /// The schema's fields are indexed by name up front. If several fields share
+  /// a name, the one that comes last in the schema is the one kept in the index.
+  explicit ExprValidator(LLVMTypes* types, SchemaPtr schema)
+      : types_(types), schema_(schema) {
+    for (auto& schema_field : schema_->fields()) {
+      field_map_[schema_field->name()] = schema_field;
+    }
+  }
+
+  /// \brief Validates the root node
+  /// of an expression.
+  /// 1. Data type of fields and literals.
+  /// 2. Function signature is supported.
+  /// 3. For if nodes that return types match
+  ///    for if, then and else nodes.
+  Status Validate(const ExpressionPtr& expr);
+
+ private:
+  using FieldMap = std::unordered_map<std::string, FieldPtr>;
+
+  // NodeVisitor overrides, one per node kind.
+  Status Visit(const FieldNode& node) override;
+  Status Visit(const FunctionNode& node) override;
+  Status Visit(const IfNode& node) override;
+  Status Visit(const LiteralNode& node) override;
+  Status Visit(const BooleanNode& node) override;
+  Status Visit(const InExpressionNode<int32_t>& node) override;
+  Status Visit(const InExpressionNode<int64_t>& node) override;
+  Status Visit(const InExpressionNode<float>& node) override;
+  Status Visit(const InExpressionNode<double>& node) override;
+  Status Visit(const InExpressionNode<gandiva::DecimalScalar128>& node) override;
+  Status Visit(const InExpressionNode<std::string>& node) override;
+
+  // Common checks shared by all IN-expression overloads.
+  Status ValidateInExpression(size_t number_of_values, DataTypePtr in_expr_return_type,
+                              DataTypePtr type_of_values);
+
+  // Registry consulted when looking up function signatures.
+  FunctionRegistry registry_;
+
+  // Used to map arrow type ids to IR types; not owned.
+  LLVMTypes* types_;
+
+  // Schema the expressions are validated against.
+  SchemaPtr schema_;
+
+  // Field name -> schema field, filled by the constructor.
+  FieldMap field_map_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/expression.cc b/src/arrow/cpp/src/gandiva/expression.cc
new file mode 100644
index 000000000..06aada27b
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/expression.cc
@@ -0,0 +1,25 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/expression.h"
+#include "gandiva/node.h"
+
+namespace gandiva {
+
+// Returns the string form of the expression, which is the string form of its
+// root node.
+std::string Expression::ToString() { return root()->ToString(); }
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/expression.h b/src/arrow/cpp/src/gandiva/expression.h
new file mode 100644
index 000000000..cdda2512b
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/expression.h
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include "gandiva/arrow.h"
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// \brief An expression tree with a root node, and a result field.
+class GANDIVA_EXPORT Expression {
+ public:
+  /// \brief Stores the given root node and result field.
+  Expression(const NodePtr root, const FieldPtr result)
+      : root_(root),
+        result_(result) {}
+
+  virtual ~Expression() = default;
+
+  /// \brief Root node of the expression tree.
+  const NodePtr& root() const {
+    return root_;
+  }
+
+  /// \brief Field describing the result of the expression.
+  const FieldPtr& result() const {
+    return result_;
+  }
+
+  /// \brief String form of the expression.
+  std::string ToString();
+
+ private:
+  const NodePtr root_;
+  const FieldPtr result_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/expression_registry.cc b/src/arrow/cpp/src/gandiva/expression_registry.cc
new file mode 100644
index 000000000..c3a08fd3a
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/expression_registry.cc
@@ -0,0 +1,187 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/expression_registry.h"
+
+#include "gandiva/function_registry.h"
+#include "gandiva/llvm_types.h"
+
+namespace gandiva {
+
+// Creates the FunctionRegistry instance held in function_registry_.
+ExpressionRegistry::ExpressionRegistry() {
+ function_registry_.reset(new FunctionRegistry());
+}
+
+// Empty destructor defined in this translation unit.
+// NOTE(review): presumably kept out of line because the header only forward
+// declares FunctionRegistry - confirm against expression_registry.h.
+ExpressionRegistry::~ExpressionRegistry() {}
+
+// to be used only to create function_signature_start
+//
+// Positions the iterator on the first signature of the native function at
+// nf_it and remembers nf_it_end as the point where iteration over native
+// functions stops. nf_it is dereferenced here, and front() is called on its
+// signatures.
+ExpressionRegistry::FunctionSignatureIterator::FunctionSignatureIterator(
+ native_func_iterator_type nf_it, native_func_iterator_type nf_it_end)
+ : native_func_it_{nf_it},
+ native_func_it_end_{nf_it_end},
+ func_sig_it_{&(nf_it->signatures().front())} {}
+
+// to be used only to create function_signature_end
+//
+// Builds an iterator that carries only a signature pointer; both
+// native-function pointers are set to null.
+ExpressionRegistry::FunctionSignatureIterator::FunctionSignatureIterator(
+ func_sig_iterator_type fs_it)
+ : native_func_it_{nullptr}, native_func_it_end_{nullptr}, func_sig_it_{fs_it} {}
+
+// Returns an iterator spanning the registry's native functions from begin() to
+// end(), positioned on the first signature of the first native function.
+const ExpressionRegistry::FunctionSignatureIterator
+ExpressionRegistry::function_signature_begin() {
+ return FunctionSignatureIterator(function_registry_->begin(),
+ function_registry_->end());
+}
+
+// Returns the end sentinel: an iterator whose signature pointer is one past the
+// last signature of the registry's last native function.
+//
+// The address is computed as data() + size() rather than by dereferencing the
+// container's end() iterator, which is undefined behaviour.
+// NOTE(review): relies on signatures() returning a contiguous container that
+// offers data() (e.g. std::vector) - confirm against NativeFunction.
+const ExpressionRegistry::FunctionSignatureIterator
+ExpressionRegistry::function_signature_end() const {
+  const auto& last_signatures = function_registry_->back()->signatures();
+  return FunctionSignatureIterator(last_signatures.data() + last_signatures.size());
+}
+
+// Two iterators are unequal when their current signature pointers differ; the
+// native-function pointers take no part in the comparison.
+bool ExpressionRegistry::FunctionSignatureIterator::operator!=(
+ const FunctionSignatureIterator& func_sign_it) {
+ return func_sign_it.func_sig_it_ != this->func_sig_it_;
+}
+
+// Returns a copy (by value) of the signature the iterator currently points at.
+FunctionSignature ExpressionRegistry::FunctionSignatureIterator::operator*() {
+ return *func_sig_it_;
+}
+
+// Advances to the next signature and returns the new position. Although this
+// is the postfix form, the value returned is the position after advancing.
+//
+// When the current native function has no more signatures, iteration moves on
+// to the first signature of the next native function. After the last native
+// function the pointer is left one past its final signature.
+//
+// The one-past-the-end address is computed as data() + size() rather than by
+// dereferencing the container's end() iterator, which is undefined behaviour.
+// NOTE(review): relies on signatures() returning a contiguous container that
+// offers data() (e.g. std::vector) - confirm against NativeFunction.
+ExpressionRegistry::func_sig_iterator_type ExpressionRegistry::FunctionSignatureIterator::
+operator++(int) {
+  ++func_sig_it_;
+
+  const auto& current_signatures = native_func_it_->signatures();
+  if (func_sig_it_ == current_signatures.data() + current_signatures.size()) {
+    ++native_func_it_;
+    // Past the last native function there is nothing to move on to.
+    if (native_func_it_ != native_func_it_end_) {
+      func_sig_it_ = &(native_func_it_->signatures().front());
+    }
+  }
+  return func_sig_it_;
+}
+
+static void AddArrowTypesToVector(arrow::Type::type type, DataTypeVector& vector);
+
+// Builds the list of supported arrow data types: asks a temporary LLVMTypes
+// for the arrow type ids it supports and expands each id into concrete data
+// types through AddArrowTypesToVector.
+static DataTypeVector InitSupportedTypes() {
+  llvm::LLVMContext llvm_context;
+  LLVMTypes llvm_types(llvm_context);
+
+  DataTypeVector supported;
+  for (auto& arrow_type_id : llvm_types.GetSupportedArrowTypes()) {
+    AddArrowTypesToVector(arrow_type_id, supported);
+  }
+  return supported;
+}
+
+DataTypeVector ExpressionRegistry::supported_types_ = InitSupportedTypes();
+
+// Appends to `vector` the concrete arrow data type(s) for the type id `type`.
+//
+// Most ids append exactly one data type. TIMESTAMP, TIME32 and TIME64 append
+// one entry per time unit listed below, and DECIMAL appends decimal(38, 0).
+// Entries are appended in the order written here. Any other id hits the
+// DCHECK in the default branch.
+static void AddArrowTypesToVector(arrow::Type::type type, DataTypeVector& vector) {
+ switch (type) {
+ case arrow::Type::type::BOOL:
+ vector.push_back(arrow::boolean());
+ break;
+ case arrow::Type::type::UINT8:
+ vector.push_back(arrow::uint8());
+ break;
+ case arrow::Type::type::INT8:
+ vector.push_back(arrow::int8());
+ break;
+ case arrow::Type::type::UINT16:
+ vector.push_back(arrow::uint16());
+ break;
+ case arrow::Type::type::INT16:
+ vector.push_back(arrow::int16());
+ break;
+ case arrow::Type::type::UINT32:
+ vector.push_back(arrow::uint32());
+ break;
+ case arrow::Type::type::INT32:
+ vector.push_back(arrow::int32());
+ break;
+ case arrow::Type::type::UINT64:
+ vector.push_back(arrow::uint64());
+ break;
+ case arrow::Type::type::INT64:
+ vector.push_back(arrow::int64());
+ break;
+ case arrow::Type::type::HALF_FLOAT:
+ vector.push_back(arrow::float16());
+ break;
+ case arrow::Type::type::FLOAT:
+ vector.push_back(arrow::float32());
+ break;
+ case arrow::Type::type::DOUBLE:
+ vector.push_back(arrow::float64());
+ break;
+ case arrow::Type::type::STRING:
+ vector.push_back(arrow::utf8());
+ break;
+ case arrow::Type::type::BINARY:
+ vector.push_back(arrow::binary());
+ break;
+ case arrow::Type::type::DATE32:
+ vector.push_back(arrow::date32());
+ break;
+ case arrow::Type::type::DATE64:
+ vector.push_back(arrow::date64());
+ break;
+ case arrow::Type::type::TIMESTAMP:
+ // One entry per time unit: second, milli, nano, micro (in that order).
+ vector.push_back(arrow::timestamp(arrow::TimeUnit::SECOND));
+ vector.push_back(arrow::timestamp(arrow::TimeUnit::MILLI));
+ vector.push_back(arrow::timestamp(arrow::TimeUnit::NANO));
+ vector.push_back(arrow::timestamp(arrow::TimeUnit::MICRO));
+ break;
+ case arrow::Type::type::TIME32:
+ // time32 is added with second and milli units.
+ vector.push_back(arrow::time32(arrow::TimeUnit::SECOND));
+ vector.push_back(arrow::time32(arrow::TimeUnit::MILLI));
+ break;
+ case arrow::Type::type::TIME64:
+ // time64 is added with micro and nano units.
+ vector.push_back(arrow::time64(arrow::TimeUnit::MICRO));
+ vector.push_back(arrow::time64(arrow::TimeUnit::NANO));
+ break;
+ case arrow::Type::type::NA:
+ vector.push_back(arrow::null());
+ break;
+ case arrow::Type::type::DECIMAL:
+ // A single decimal type with precision 38 and scale 0 is added.
+ vector.push_back(arrow::decimal(38, 0));
+ break;
+ case arrow::Type::type::INTERVAL_MONTHS:
+ vector.push_back(arrow::month_interval());
+ break;
+ case arrow::Type::type::INTERVAL_DAY_TIME:
+ vector.push_back(arrow::day_time_interval());
+ break;
+ default:
+ // Unsupported types. test ensures that
+ // when one of these are added build breaks.
+ DCHECK(false);
+ }
+}
+
+// Collects every function signature exposed by a freshly constructed
+// ExpressionRegistry. Each returned element is a newly allocated
+// FunctionSignature built from the base name, parameter types and return type
+// of the registered one.
+std::vector<std::shared_ptr<FunctionSignature>> GetRegisteredFunctionSignatures() {
+  ExpressionRegistry registry;
+  std::vector<std::shared_ptr<FunctionSignature>> signatures;
+
+  // The end sentinel does not change during the loop, so build it once.
+  const auto end = registry.function_signature_end();
+  for (auto iter = registry.function_signature_begin(); iter != end; iter++) {
+    // operator* returns by value; dereference once instead of three times.
+    const FunctionSignature signature = *iter;
+    signatures.push_back(std::make_shared<FunctionSignature>(
+        signature.base_name(), signature.param_types(), signature.ret_type()));
+  }
+  return signatures;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/expression_registry.h b/src/arrow/cpp/src/gandiva/expression_registry.h
new file mode 100644
index 000000000..fb4f177ba
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/expression_registry.h
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "gandiva/arrow.h"
+#include "gandiva/function_signature.h"
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+class NativeFunction;
+class FunctionRegistry;
+/// \brief Exports types supported by Gandiva for processing.
+///
+/// Has helper methods for clients to programmatically discover
+/// data types and functions supported by Gandiva.
+class GANDIVA_EXPORT ExpressionRegistry {
+ public:
+ // Raw-pointer "iterator" types used by FunctionSignatureIterator.
+ using native_func_iterator_type = const NativeFunction*;
+ using func_sig_iterator_type = const FunctionSignature*;
+ ExpressionRegistry();
+ ~ExpressionRegistry();
+ /// Returns a copy of the static list of supported data types.
+ static DataTypeVector supported_types() { return supported_types_; }
+ /// Iterator over function signatures. Dereferencing returns a
+ /// FunctionSignature by value; only post-increment and operator!= are provided.
+ class GANDIVA_EXPORT FunctionSignatureIterator {
+ public:
+ /// Construct from a [nf_it, nf_it_end_) range of native functions.
+ explicit FunctionSignatureIterator(native_func_iterator_type nf_it,
+ native_func_iterator_type nf_it_end_);
+ /// Construct directly from a function-signature pointer.
+ explicit FunctionSignatureIterator(func_sig_iterator_type fs_it);
+
+ bool operator!=(const FunctionSignatureIterator& func_sign_it);
+
+ FunctionSignature operator*();
+
+ // Note: returns the raw signature pointer type, not a FunctionSignatureIterator.
+ func_sig_iterator_type operator++(int);
+
+ private:
+ native_func_iterator_type native_func_it_;
+ const native_func_iterator_type native_func_it_end_;
+ func_sig_iterator_type func_sig_it_;
+ };
+ // Begin/end pair for iterating the registered function signatures, e.g.
+ // for (auto it = r.function_signature_begin(); it != r.function_signature_end(); it++)
+ const FunctionSignatureIterator function_signature_begin();
+ const FunctionSignatureIterator function_signature_end() const;
+
+ private:
+ // Shared by all instances; exposed (by copy) through supported_types().
+ static DataTypeVector supported_types_;
+ std::unique_ptr<FunctionRegistry> function_registry_;
+};
+
+/// \brief Returns one shared_ptr<FunctionSignature> per signature obtained by
+/// iterating an ExpressionRegistry from function_signature_begin() to
+/// function_signature_end().
+GANDIVA_EXPORT
+std::vector<std::shared_ptr<FunctionSignature>> GetRegisteredFunctionSignatures();
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/expression_registry_test.cc b/src/arrow/cpp/src/gandiva/expression_registry_test.cc
new file mode 100644
index 000000000..c254ff4f3
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/expression_registry_test.cc
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/expression_registry.h"
+
+#include <algorithm>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "gandiva/function_registry.h"
+#include "gandiva/function_signature.h"
+#include "gandiva/llvm_types.h"
+
+namespace gandiva {
+
+// NOTE(review): add_vector_func_t is not referenced anywhere else in this test file.
+typedef int64_t (*add_vector_func_t)(int64_t* elements, int nelements);
+
+// Fixture providing a default-constructed FunctionRegistry that the tests
+// compare the ExpressionRegistry against.
+class TestExpressionRegistry : public ::testing::Test {
+ protected:
+ FunctionRegistry registry_;
+};
+
+// Every signature known to the function registry must also be reachable
+// through the expression registry's signature iterator.
+TEST_F(TestExpressionRegistry, VerifySupportedFunctions) {
+  std::vector<FunctionSignature> exported;
+  ExpressionRegistry expr_registry;
+
+  // Materialize everything the expression registry exports.
+  auto sig_it = expr_registry.function_signature_begin();
+  while (sig_it != expr_registry.function_signature_end()) {
+    exported.push_back(*sig_it);
+    sig_it++;
+  }
+
+  // Each native function signature must appear in the exported list.
+  for (auto& native_func : registry_) {
+    for (auto& signature : native_func.signatures()) {
+      auto position = std::find(exported.begin(), exported.end(), signature);
+      EXPECT_NE(position, exported.end()) << "function signature " << signature.ToString()
+                                          << " missing in supported functions.\n";
+    }
+  }
+}
+
+// Verify all types are supported.
+//
+// For every arrow type id reported by LLVMTypes::GetSupportedArrowTypes(), the
+// expression registry must export at least one data type carrying that id.
+TEST_F(TestExpressionRegistry, VerifyDataTypes) {
+  DataTypeVector data_types = ExpressionRegistry::supported_types();
+  llvm::LLVMContext llvm_context;
+  LLVMTypes llvm_types(llvm_context);
+  auto supported_arrow_types = llvm_types.GetSupportedArrowTypes();
+  for (auto& type_id : supported_arrow_types) {
+    // Search the exported data types for a matching id. Searching the id list
+    // for one of its own elements would always succeed and verify nothing.
+    auto element = std::find_if(
+        data_types.begin(), data_types.end(),
+        [&type_id](const DataTypePtr& data_type) { return data_type->id() == type_id; });
+    EXPECT_NE(element, data_types.end())
+        << "data type " << type_id << " missing in supported data types.\n";
+  }
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/field_descriptor.h b/src/arrow/cpp/src/gandiva/field_descriptor.h
new file mode 100644
index 000000000..0fe6fe37f
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/field_descriptor.h
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include "gandiva/arrow.h"
+
+namespace gandiva {
+
+/// \brief Descriptor for an arrow field. Holds indexes into the flattened array of
+/// buffers that is passed to LLVM generated functions.
+class FieldDescriptor {
+ public:
+  /// Value stored for an index that was not supplied; see the Has*Idx() helpers.
+  static const int kInvalidIdx = -1;
+
+  /// Only the field and its data index are mandatory; every other index
+  /// defaults to kInvalidIdx.
+  FieldDescriptor(FieldPtr field, int data_idx, int validity_idx = kInvalidIdx,
+                  int offsets_idx = kInvalidIdx, int data_buffer_ptr_idx = kInvalidIdx)
+      : field_(field),
+        data_idx_(data_idx),
+        validity_idx_(validity_idx),
+        offsets_idx_(offsets_idx),
+        data_buffer_ptr_idx_(data_buffer_ptr_idx) {}
+
+  // ---- Field metadata ----
+
+  FieldPtr field() const { return field_; }
+
+  const std::string& Name() const { return field_->name(); }
+
+  DataTypePtr Type() const { return field_->type(); }
+
+  // ---- Positions in the array-of-buffers ----
+
+  /// Index of data array in the array-of-buffers
+  int data_idx() const { return data_idx_; }
+
+  /// Index of validity array in the array-of-buffers
+  int validity_idx() const { return validity_idx_; }
+
+  /// Index of offsets array in the array-of-buffers
+  int offsets_idx() const { return offsets_idx_; }
+
+  /// Index of data buffer pointer in the array-of-buffers
+  int data_buffer_ptr_idx() const { return data_buffer_ptr_idx_; }
+
+  /// True when an offsets index was supplied.
+  bool HasOffsetsIdx() const { return !(offsets_idx_ == kInvalidIdx); }
+
+  /// True when a data-buffer-pointer index was supplied.
+  bool HasDataBufferPtrIdx() const { return !(data_buffer_ptr_idx_ == kInvalidIdx); }
+
+ private:
+  FieldPtr field_;
+  int data_idx_;
+  int validity_idx_;
+  int offsets_idx_;
+  int data_buffer_ptr_idx_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/filter.cc b/src/arrow/cpp/src/gandiva/filter.cc
new file mode 100644
index 000000000..875cc5447
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/filter.cc
@@ -0,0 +1,171 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/filter.h"
+
+#include <memory>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "arrow/util/hash_util.h"
+
+#include "gandiva/bitmap_accumulator.h"
+#include "gandiva/cache.h"
+#include "gandiva/condition.h"
+#include "gandiva/expr_validator.h"
+#include "gandiva/llvm_generator.h"
+#include "gandiva/selection_vector_impl.h"
+
+namespace gandiva {
+
+// Builds a cache key from the schema, configuration and the string form of
+// the expression, and precomputes its hash code.
+FilterCacheKey::FilterCacheKey(SchemaPtr schema,
+                               std::shared_ptr<Configuration> configuration,
+                               Expression& expression)
+    : schema_(schema), configuration_(configuration), uniqifier_(0) {
+  // The textual form of the expression feeds both the uniqifier and the hash.
+  expression_as_string_ = expression.ToString();
+  UpdateUniqifier(expression_as_string_);
+
+  // Fold the key components into the hash, always in the same order.
+  static const int kSeedValue = 4;
+  size_t seed = kSeedValue;
+  arrow::internal::hash_combine(seed, expression_as_string_);
+  arrow::internal::hash_combine(seed, configuration);
+  arrow::internal::hash_combine(seed, schema_->ToString());
+  arrow::internal::hash_combine(seed, uniqifier_);
+  hash_code_ = seed;
+}
+
+// Two keys are equal when schema, configuration pointer, expression string
+// and uniqifier all match.
+bool FilterCacheKey::operator==(const FilterCacheKey& other) const {
+  // arrow schema does not overload equality operators.
+  const bool same_schema = schema_->Equals(*other.schema().get(), true);
+
+  // Short-circuits in the same order as the individual checks: schema first,
+  // then configuration, expression text and uniqifier.
+  return same_schema && configuration_ == other.configuration_ &&
+         expression_as_string_ == other.expression_as_string_ &&
+         uniqifier_ == other.uniqifier_;
+}
+
+// Renders the key as the pretty-printed schema followed by
+// "Condition: [<expression>]".
+std::string FilterCacheKey::ToString() const {
+  // indent, window, indent_size, null_rep and skip new lines.
+  arrow::PrettyPrintOptions print_options{0, 10, 2, "null", true};
+
+  std::stringstream out;
+  DCHECK_OK(PrettyPrint(*schema_, print_options, &out));
+  out << "Condition: [" << expression_as_string_ << "]";
+  return out.str();
+}
+
+// Sets uniqifier_ to a per-thread value in [0, 16) when the expression text
+// contains " like("; otherwise leaves it untouched.
+void FilterCacheKey::UpdateUniqifier(const std::string& expr) {
+  const bool uses_like = expr.find(" like(") != std::string::npos;
+  if (!uses_like) {
+    return;
+  }
+  // caching of expressions with re2 patterns causes lock contention. So, use
+  // multiple instances to reduce contention.
+  const std::thread::id current_thread = std::this_thread::get_id();
+  uniqifier_ = std::hash<std::thread::id>()(current_thread) % 16;
+}
+
+// Takes ownership of the LLVM generator and keeps the schema and configuration
+// the filter was built with.
+Filter::Filter(std::unique_ptr<LLVMGenerator> llvm_generator, SchemaPtr schema,
+ std::shared_ptr<Configuration> configuration)
+ : llvm_generator_(std::move(llvm_generator)),
+ schema_(schema),
+ configuration_(configuration) {}
+
+// Defined out-of-line (rather than inline in the header) on purpose; the
+// header explains this is needed for MSVC.
+Filter::~Filter() {}
+
+// Builds a filter for `schema` and `condition`, or returns a cached one.
+//
+// Returns Status::Invalid if schema, condition or configuration is null, and
+// propagates any error from generator creation, expression validation or the
+// LLVM build. On success `*filter` holds the (possibly cached) filter.
+Status Filter::Make(SchemaPtr schema, ConditionPtr condition,
+ std::shared_ptr<Configuration> configuration,
+ std::shared_ptr<Filter>* filter) {
+ ARROW_RETURN_IF(schema == nullptr, Status::Invalid("Schema cannot be null"));
+ ARROW_RETURN_IF(condition == nullptr, Status::Invalid("Condition cannot be null"));
+ ARROW_RETURN_IF(configuration == nullptr,
+ Status::Invalid("Configuration cannot be null"));
+
+ // Function-local static: a single cache shared by every call to Make.
+ static Cache<FilterCacheKey, std::shared_ptr<Filter>> cache;
+ FilterCacheKey cache_key(schema, configuration, *(condition.get()));
+ auto cachedFilter = cache.GetModule(cache_key);
+ if (cachedFilter != nullptr) {
+ // Cache hit: no validation or code generation is performed.
+ *filter = cachedFilter;
+ return Status::OK();
+ }
+
+ // Build LLVM generator, and generate code for the specified expression
+ std::unique_ptr<LLVMGenerator> llvm_gen;
+ ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, &llvm_gen));
+
+ // Run the validation on the expression.
+ // Return if the expression is invalid since we will not be able to process further.
+ ExprValidator expr_validator(llvm_gen->types(), schema);
+ ARROW_RETURN_NOT_OK(expr_validator.Validate(condition));
+
+ // Start measuring build time
+ auto begin = std::chrono::high_resolution_clock::now();
+ ARROW_RETURN_NOT_OK(llvm_gen->Build({condition}, SelectionVector::Mode::MODE_NONE));
+ // Stop measuring time and calculate the elapsed time
+ auto end = std::chrono::high_resolution_clock::now();
+ auto elapsed =
+ std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
+
+ // Instantiate the filter with the completely built llvm generator
+ *filter = std::make_shared<Filter>(std::move(llvm_gen), schema, configuration);
+ // The build time in milliseconds is stored alongside the cached filter.
+ ValueCacheObject<std::shared_ptr<Filter>> value_cache(*filter, elapsed);
+ cache.PutModule(cache_key, value_cache);
+
+ return Status::OK();
+}
+
+// Evaluates the filter condition over `batch` and fills `out_selection` from
+// the resulting bitmap.
+//
+// Returns Status::Invalid when the batch schema differs from the filter's
+// schema, the batch is empty, `out_selection` is null, or its capacity
+// (GetMaxSlots) is smaller than the number of rows. Errors from executing the
+// generated code or from populating the selection vector are propagated.
+Status Filter::Evaluate(const arrow::RecordBatch& batch,
+                        std::shared_ptr<SelectionVector> out_selection) {
+  const auto num_rows = batch.num_rows();
+  ARROW_RETURN_IF(!batch.schema()->Equals(*schema_),
+                  Status::Invalid("RecordBatch schema must match expected filter schema"));
+  ARROW_RETURN_IF(num_rows == 0, Status::Invalid("RecordBatch must be non-empty."));
+  ARROW_RETURN_IF(out_selection == nullptr,
+                  Status::Invalid("out_selection must be non-null."));
+  ARROW_RETURN_IF(out_selection->GetMaxSlots() < num_rows,
+                  Status::Invalid("Output selection vector capacity too small"));
+
+  // Allocate three local_bitmaps (one for output, one for validity, one to compute the
+  // intersection).
+  LocalBitMapsHolder bitmaps(num_rows, 3 /*local_bitmaps*/);
+  int64_t bitmap_size = bitmaps.GetLocalBitMapSize();
+
+  // Bitmap 0 receives the validity bits and bitmap 1 the boolean values of the
+  // evaluated condition.
+  auto validity = std::make_shared<arrow::Buffer>(bitmaps.GetLocalBitMap(0), bitmap_size);
+  auto value = std::make_shared<arrow::Buffer>(bitmaps.GetLocalBitMap(1), bitmap_size);
+  auto array_data = arrow::ArrayData::Make(arrow::boolean(), num_rows, {validity, value});
+
+  // Execute the expression(s).
+  ARROW_RETURN_NOT_OK(llvm_generator_->Execute(batch, {array_data}));
+
+  // Compute the intersection of the value and validity.
+  auto result = bitmaps.GetLocalBitMap(2);
+  BitMapAccumulator::IntersectBitMaps(
+      result, {bitmaps.GetLocalBitMap(0), bitmaps.GetLocalBitMap(1)}, {0, 0}, num_rows);
+
+  // The last argument is num_rows - 1, i.e. the index of the last row.
+  return out_selection->PopulateFromBitMap(result, bitmap_size, num_rows - 1);
+}
+
+// Forwards to the LLVM generator's DumpIR() and returns its string result.
+std::string Filter::DumpIR() { return llvm_generator_->DumpIR(); }
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/filter.h b/src/arrow/cpp/src/gandiva/filter.h
new file mode 100644
index 000000000..70ccd7cf0
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/filter.h
@@ -0,0 +1,112 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/status.h"
+
+#include "gandiva/arrow.h"
+#include "gandiva/condition.h"
+#include "gandiva/configuration.h"
+#include "gandiva/selection_vector.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+class LLVMGenerator;
+
+// Key used to look up previously built filters in the filter cache. Combines
+// the schema, the configuration, the expression's string form and a
+// "uniqifier" value.
+class FilterCacheKey {
+ public:
+ FilterCacheKey(SchemaPtr schema, std::shared_ptr<Configuration> configuration,
+ Expression& expression);
+
+ // Hash code computed once in the constructor.
+ std::size_t Hash() const { return hash_code_; }
+
+ bool operator==(const FilterCacheKey& other) const;
+
+ bool operator!=(const FilterCacheKey& other) const { return !(*this == other); }
+
+ SchemaPtr schema() const { return schema_; }
+
+ // Human-readable form: the schema followed by the condition string.
+ std::string ToString() const;
+
+ private:
+ // Sets uniqifier_ based on the expression text (see the definition in filter.cc).
+ void UpdateUniqifier(const std::string& expr);
+
+ const SchemaPtr schema_;
+ const std::shared_ptr<Configuration> configuration_;
+ std::string expression_as_string_;
+ size_t hash_code_;
+ uint32_t uniqifier_;
+};
+
+/// \brief filter records based on a condition.
+///
+/// A filter is built for a specific schema and condition. Once the filter is built, it
+/// can be used to evaluate many row batches.
+class GANDIVA_EXPORT Filter {
+ public:
+ /// Construct a filter that takes ownership of an LLVM generator. Filter::Make
+ /// is the factory that builds the generator and calls this constructor.
+ Filter(std::unique_ptr<LLVMGenerator> llvm_generator, SchemaPtr schema,
+ std::shared_ptr<Configuration> config);
+
+ // Inline dtor will attempt to resolve the destructor for
+ // LLVMGenerator on MSVC, so we compile the dtor in the object code
+ ~Filter();
+
+ /// Build a filter for the given schema and condition, with the default configuration.
+ ///
+ /// \param[in] schema schema for the record batches, and the condition.
+ /// \param[in] condition filter condition.
+ /// \param[out] filter the returned filter object
+ static Status Make(SchemaPtr schema, ConditionPtr condition,
+ std::shared_ptr<Filter>* filter) {
+ return Make(schema, condition, ConfigurationBuilder::DefaultConfiguration(), filter);
+ }
+
+ /// \brief Build a filter for the given schema and condition.
+ /// Customize the filter with runtime configuration.
+ ///
+ /// \param[in] schema schema for the record batches, and the condition.
+ /// \param[in] condition filter conditions.
+ /// \param[in] config run time configuration.
+ /// \param[out] filter the returned filter object
+ static Status Make(SchemaPtr schema, ConditionPtr condition,
+ std::shared_ptr<Configuration> config,
+ std::shared_ptr<Filter>* filter);
+
+ /// Evaluate the specified record batch, and populate output selection vector.
+ ///
+ /// \param[in] batch the record batch. schema should be the same as the one in 'Make'
+ /// \param[in,out] out_selection the selection array with indices of rows that match
+ /// the condition.
+ Status Evaluate(const arrow::RecordBatch& batch,
+ std::shared_ptr<SelectionVector> out_selection);
+
+ /// Return the string produced by the LLVM generator's DumpIR().
+ std::string DumpIR();
+
+ private:
+ std::unique_ptr<LLVMGenerator> llvm_generator_;
+ SchemaPtr schema_;
+ std::shared_ptr<Configuration> configuration_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/formatting_utils.h b/src/arrow/cpp/src/gandiva/formatting_utils.h
new file mode 100644
index 000000000..7bc6a4969
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/formatting_utils.h
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/type.h"
+#include "arrow/util/formatting.h"
+#include "arrow/vendored/double-conversion/double-conversion.h"
+
+namespace gandiva {
+
+/// \brief The entry point for conversion to strings.
+template <typename ARROW_TYPE, typename Enable = void>
+class GdvStringFormatter;
+
+using double_conversion::DoubleToStringConverter;
+
+/// \brief Float-to-string formatter mixin configured to follow Java's
+/// formatting of real numbers.
+template <typename ARROW_TYPE>
+class FloatToStringGdvMixin
+ : public arrow::internal::FloatToStringFormatterMixin<ARROW_TYPE> {
+ public:
+ using arrow::internal::FloatToStringFormatterMixin<
+ ARROW_TYPE>::FloatToStringFormatterMixin;
+
+ // The mixin is a modified version of the existing FloatToStringFormatterMixin, but
+ // it passes specific parameters to FloatToStringFormatterMixin so that floating
+ // point numbers are cast to strings using the same patterns as Java.
+ //
+ // Java represents real numbers in one of two ways, following these rules:
+ //- If the number is greater than or equal to 10^7, or less than 10^(-3),
+ // it is represented using scientific notation, e.g.:
+ // - 0.000012 -> 1.2E-5
+ // - 10000002.3 -> 1.00000023E7
+ //- If the number lies inside that interval, it is shown as is.
+ //
+ // NOTE(review): the -3 and 7 arguments below presumably are the decimal
+ // exponent bounds described above; confirm against FloatToStringFormatterMixin.
+ explicit FloatToStringGdvMixin(const std::shared_ptr<arrow::DataType>& = NULLPTR)
+ : arrow::internal::FloatToStringFormatterMixin<ARROW_TYPE>(
+ DoubleToStringConverter::EMIT_TRAILING_ZERO_AFTER_POINT |
+ DoubleToStringConverter::EMIT_TRAILING_DECIMAL_POINT,
+ "Infinity", "NaN", 'E', -3, 7, 3, 1) {}
+};
+
+/// Formatter specialization for arrow::FloatType; inherits the constructors of
+/// FloatToStringGdvMixin<arrow::FloatType>.
+template <>
+class GdvStringFormatter<arrow::FloatType>
+ : public FloatToStringGdvMixin<arrow::FloatType> {
+ public:
+ using FloatToStringGdvMixin::FloatToStringGdvMixin;
+};
+
+/// Formatter specialization for arrow::DoubleType; inherits the constructors of
+/// FloatToStringGdvMixin<arrow::DoubleType>.
+template <>
+class GdvStringFormatter<arrow::DoubleType>
+ : public FloatToStringGdvMixin<arrow::DoubleType> {
+ public:
+ using FloatToStringGdvMixin::FloatToStringGdvMixin;
+};
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/func_descriptor.h b/src/arrow/cpp/src/gandiva/func_descriptor.h
new file mode 100644
index 000000000..a2bf3a16b
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/func_descriptor.h
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "gandiva/arrow.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// Descriptor for a function in the expression.
+class GANDIVA_EXPORT FuncDescriptor {
+ public:
+  /// Stores copies of the function name, its parameter types and its return type.
+  FuncDescriptor(const std::string& name, const DataTypeVector& params,
+                 DataTypePtr return_type)
+      : name_(name), params_(params), return_type_(return_type) {}
+
+  /// Data type of the return parameter.
+  DataTypePtr return_type() const { return return_type_; }
+
+  /// Data types of the input params.
+  const DataTypeVector& params() const { return params_; }
+
+  /// base function name.
+  const std::string& name() const { return name_; }
+
+ private:
+  std::string name_;
+  DataTypeVector params_;
+  DataTypePtr return_type_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_holder.h b/src/arrow/cpp/src/gandiva/function_holder.h
new file mode 100644
index 000000000..e3576f09c
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_holder.h
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// Holder for a function that can be invoked from LLVM.
+class GANDIVA_EXPORT FunctionHolder {
+ public:
+ // Virtual so that derived holders can be destroyed through a FunctionHolder
+ // pointer; the base class declares no other members.
+ virtual ~FunctionHolder() = default;
+};
+
+using FunctionHolderPtr = std::shared_ptr<FunctionHolder>;
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_holder_registry.h b/src/arrow/cpp/src/gandiva/function_holder_registry.h
new file mode 100644
index 000000000..ced153891
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_holder_registry.h
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "arrow/status.h"
+
+#include "gandiva/function_holder.h"
+#include "gandiva/like_holder.h"
+#include "gandiva/node.h"
+#include "gandiva/random_generator_holder.h"
+#include "gandiva/replace_holder.h"
+#include "gandiva/to_date_holder.h"
+
+namespace gandiva {
+
+// Expands to a lambda (const FunctionNode&, FunctionHolderPtr*) -> status that
+// calls derived::Make(node, &instance) and, only when the returned status is
+// ok, stores the new instance into *holder. The status from derived::Make is
+// returned unchanged; on failure *holder is left untouched.
+#define LAMBDA_MAKER(derived) \
+ [](const FunctionNode& node, FunctionHolderPtr* holder) { \
+ std::shared_ptr<derived> derived_instance; \
+ auto status = derived::Make(node, &derived_instance); \
+ if (status.ok()) { \
+ *holder = derived_instance; \
+ } \
+ return status; \
+ }
+
+/// Static registry of function holders.
+class FunctionHolderRegistry {
+ public:
+  /// Callable that builds a holder for a function node.
+  using maker_type = std::function<Status(const FunctionNode&, FunctionHolderPtr*)>;
+  /// Function name -> maker.
+  using map_type = std::unordered_map<std::string, maker_type>;
+
+  /// Look up the maker registered under `name` and invoke it on `node`.
+  /// Returns Status::Invalid when no maker is registered for `name`; otherwise
+  /// returns whatever the maker returns.
+  static Status Make(const std::string& name, const FunctionNode& node,
+                     FunctionHolderPtr* holder) {
+    map_type& registry = makers();
+    auto entry = registry.find(name);
+    if (entry != registry.end()) {
+      return entry->second(node, holder);
+    }
+    return Status::Invalid("function holder not registered for function " + name);
+  }
+
+ private:
+  /// The name -> maker table, created on first use.
+  static map_type& makers() {
+    static map_type maker_map = {
+        {"like", LAMBDA_MAKER(LikeHolder)},
+        {"ilike", LAMBDA_MAKER(LikeHolder)},
+        {"to_date", LAMBDA_MAKER(ToDateHolder)},
+        {"random", LAMBDA_MAKER(RandomGeneratorHolder)},
+        {"rand", LAMBDA_MAKER(RandomGeneratorHolder)},
+        {"regexp_replace", LAMBDA_MAKER(ReplaceHolder)},
+    };
+    return maker_map;
+  }
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_ir_builder.cc b/src/arrow/cpp/src/gandiva/function_ir_builder.cc
new file mode 100644
index 000000000..194273933
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_ir_builder.cc
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/function_ir_builder.h"
+
+namespace gandiva {
+
+// Emits an if/else at the builder's current insert point and returns the
+// merged value.
+//
+// `then_func` and `else_func` are invoked with the insert point set to the
+// "then" and "else" blocks respectively; the values they return are joined by
+// a two-way PHI node of type `return_type` in the "merge" block. On return,
+// the builder's insert point is inside the merge block.
+llvm::Value* FunctionIRBuilder::BuildIfElse(llvm::Value* condition,
+ llvm::Type* return_type,
+ std::function<llvm::Value*()> then_func,
+ std::function<llvm::Value*()> else_func) {
+ llvm::IRBuilder<>* builder = ir_builder();
+ llvm::Function* function = builder->GetInsertBlock()->getParent();
+ DCHECK_NE(function, nullptr);
+
+ // Create blocks for the then, else and merge cases.
+ llvm::BasicBlock* then_bb = llvm::BasicBlock::Create(*context(), "then", function);
+ llvm::BasicBlock* else_bb = llvm::BasicBlock::Create(*context(), "else", function);
+ llvm::BasicBlock* merge_bb = llvm::BasicBlock::Create(*context(), "merge", function);
+
+ builder->CreateCondBr(condition, then_bb, else_bb);
+
+ // Emit the then block.
+ builder->SetInsertPoint(then_bb);
+ auto then_value = then_func();
+ builder->CreateBr(merge_bb);
+
+ // refresh then_bb for phi (could have changed due to code generation of then_value).
+ then_bb = builder->GetInsertBlock();
+
+ // Emit the else block.
+ builder->SetInsertPoint(else_bb);
+ auto else_value = else_func();
+ builder->CreateBr(merge_bb);
+
+ // refresh else_bb for phi (could have changed due to code generation of else_value).
+ else_bb = builder->GetInsertBlock();
+
+ // Emit the merge block.
+ builder->SetInsertPoint(merge_bb);
+ llvm::PHINode* result_value = builder->CreatePHI(return_type, 2, "res_value");
+ result_value->addIncoming(then_value, then_bb);
+ result_value->addIncoming(else_value, else_bb);
+ return result_value;
+}
+
+// Declares a non-variadic function named `function_name` with external linkage
+// in the engine's module. Its parameter types come from `in_args`, and each
+// LLVM argument is given the name of the corresponding entry.
+llvm::Function* FunctionIRBuilder::BuildFunction(const std::string& function_name,
+                                                 llvm::Type* return_type,
+                                                 std::vector<NamedArg> in_args) {
+  // Collect just the types to form the prototype.
+  std::vector<llvm::Type*> param_types;
+  for (auto& named_arg : in_args) {
+    param_types.push_back(named_arg.type);
+  }
+
+  auto signature = llvm::FunctionType::get(return_type, param_types, false /*isVarArg*/);
+  auto fn = llvm::Function::Create(signature, llvm::GlobalValue::ExternalLinkage,
+                                   function_name, module());
+
+  // Name the arguments positionally.
+  uint32_t position = 0;
+  for (auto& llvm_arg : fn->args()) {
+    DCHECK_LT(position, in_args.size());
+    llvm_arg.setName(in_args[position].name);
+    position += 1;
+  }
+  return fn;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_ir_builder.h b/src/arrow/cpp/src/gandiva/function_ir_builder.h
new file mode 100644
index 000000000..388f55840
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_ir_builder.h
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gandiva/engine.h"
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/llvm_types.h"
+
+namespace gandiva {
+
+/// @brief Base class for building IR functions.
+///
+/// Holds a non-owning pointer to an Engine and exposes that engine's LLVM
+/// objects (types, module, context, IR builder) to derived classes, together
+/// with helpers for emitting common IR constructs.
+class FunctionIRBuilder {
+ public:
+ /// @param engine engine whose module/context/builder are used; the pointer is
+ /// stored as-is and dereferenced by every accessor below.
+ explicit FunctionIRBuilder(Engine* engine) : engine_(engine) {}
+ virtual ~FunctionIRBuilder() = default;
+
+ protected:
+ // Thin forwarding accessors to the underlying engine.
+ LLVMTypes* types() { return engine_->types(); }
+ llvm::Module* module() { return engine_->module(); }
+ llvm::LLVMContext* context() { return engine_->context(); }
+ llvm::IRBuilder<>* ir_builder() { return engine_->ir_builder(); }
+
+ /// Build an if-else block.
+ ///
+ /// Emits a conditional branch on `condition` into separate "then" and "else"
+ /// basic blocks, invokes `then_func` / `else_func` to generate the value of
+ /// each branch, and joins both in a "merge" block.
+ ///
+ /// @param condition value the conditional branch is taken on
+ /// @param return_type LLVM type of the PHI node that merges both branch values
+ /// @param then_func generates the IR (and resulting value) of the then-branch
+ /// @param else_func generates the IR (and resulting value) of the else-branch
+ /// @return the PHI node holding the value of whichever branch was taken
+ llvm::Value* BuildIfElse(llvm::Value* condition, llvm::Type* return_type,
+ std::function<llvm::Value*()> then_func,
+ std::function<llvm::Value*()> else_func);
+
+ /// Name and LLVM type of one function parameter, as consumed by BuildFunction.
+ struct NamedArg {
+ std::string name;
+ llvm::Type* type;
+ };
+
+ /// Build llvm fn.
+ ///
+ /// Creates a function called `function_name` with external linkage in the
+ /// engine's module, returning `return_type` and taking the parameters listed
+ /// in `in_args`; each LLVM argument is named after its NamedArg.
+ llvm::Function* BuildFunction(const std::string& function_name, llvm::Type* return_type,
+ std::vector<NamedArg> in_args);
+
+ private:
+ // Not owned by this class (never deleted here).
+ Engine* engine_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry.cc b/src/arrow/cpp/src/gandiva/function_registry.cc
new file mode 100644
index 000000000..d5d015c10
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry.cc
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/function_registry.h"
+#include "gandiva/function_registry_arithmetic.h"
+#include "gandiva/function_registry_datetime.h"
+#include "gandiva/function_registry_hash.h"
+#include "gandiva/function_registry_math_ops.h"
+#include "gandiva/function_registry_string.h"
+#include "gandiva/function_registry_timestamp_arithmetic.h"
+
+#include <iterator>
+#include <utility>
+#include <vector>
+
+namespace gandiva {
+
+// Returns a pointer to the first registered function (equal to end() when the
+// registry is empty).
+//
+// Uses vector::data() instead of dereferencing begin(): dereferencing the
+// iterator of an empty vector is undefined behaviour, while data() is always
+// valid to call.
+FunctionRegistry::iterator FunctionRegistry::begin() const {
+  return pc_registry_.data();
+}
+
+// Returns the one-past-the-last pointer of the registered functions.
+//
+// The pointer is computed arithmetically from data() and size(); the previous
+// form dereferenced the past-the-end iterator, which is undefined behaviour
+// (and aborts under checked/debug standard-library iterators).
+FunctionRegistry::iterator FunctionRegistry::end() const {
+  return pc_registry_.data() + pc_registry_.size();
+}
+
+// Returns a pointer to the last registered function (not one-past-the-end).
+// Precondition: the registry is non-empty; std::vector::back() on an empty
+// vector is undefined behaviour.
+FunctionRegistry::iterator FunctionRegistry::back() const {
+ return &(pc_registry_.back());
+}
+
+// Flat storage for every pre-compiled function. This definition must stay
+// ahead of pc_registry_map_ in this translation unit: InitPCMap() appends to
+// the vector, so it has to be constructed before the map's initializer runs.
+std::vector<NativeFunction> FunctionRegistry::pc_registry_;
+
+// Signature -> function lookup table, populated by InitPCMap() during static
+// initialization. Its keys and values point into pc_registry_.
+SignatureMap FunctionRegistry::pc_registry_map_ = InitPCMap();
+
+// Fills pc_registry_ with the functions of every category registry and returns
+// a map from each function signature to the NativeFunction that owns it.
+//
+// The map stores raw pointers into pc_registry_, so it is only built after all
+// categories have been appended and the vector no longer reallocates.
+SignatureMap FunctionRegistry::InitPCMap() {
+  const auto append = [](const std::vector<NativeFunction>& functions) {
+    pc_registry_.insert(pc_registry_.end(), functions.begin(), functions.end());
+  };
+
+  // Category order is preserved from the original implementation.
+  append(GetArithmeticFunctionRegistry());
+  append(GetDateTimeFunctionRegistry());
+  append(GetHashFunctionRegistry());
+  append(GetMathOpsFunctionRegistry());
+  append(GetStringFunctionRegistry());
+  append(GetDateTimeArithmeticFunctionRegistry());
+
+  SignatureMap signature_map;
+  for (auto& native_function : pc_registry_) {
+    for (auto& signature : native_function.signatures()) {
+      // A signature that is already present keeps its first mapping.
+      signature_map.emplace(&signature, &native_function);
+    }
+  }
+  return signature_map;
+}
+
+// Finds the pre-compiled function registered for `signature`.
+// Returns nullptr when no function with an equal signature is registered.
+const NativeFunction* FunctionRegistry::LookupSignature(
+    const FunctionSignature& signature) const {
+  const auto match = pc_registry_map_.find(&signature);
+  if (match == pc_registry_map_.end()) {
+    return nullptr;
+  }
+  return match->second;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry.h b/src/arrow/cpp/src/gandiva/function_registry.h
new file mode 100644
index 000000000..d92563260
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry.h
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+#include "gandiva/function_registry_common.h"
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/native_function.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+///\brief Registry of pre-compiled IR functions.
+///
+/// All state is held in static members, so every FunctionRegistry instance
+/// views the same set of functions.
+class GANDIVA_EXPORT FunctionRegistry {
+ public:
+ /// Iteration is exposed as raw pointers into the underlying function vector.
+ using iterator = const NativeFunction*;
+
+ /// Lookup a pre-compiled function by its signature.
+ /// Returns nullptr if no function matches.
+ const NativeFunction* LookupSignature(const FunctionSignature& signature) const;
+
+ /// Pointer to the first registered function.
+ iterator begin() const;
+ /// Pointer one past the last registered function.
+ iterator end() const;
+ /// Pointer to the last registered function (not one-past-the-end).
+ iterator back() const;
+
+ private:
+ /// Populates pc_registry_ and builds the signature lookup map from it.
+ static SignatureMap InitPCMap();
+
+ /// Storage for all registered functions.
+ static std::vector<NativeFunction> pc_registry_;
+ /// Maps each function signature to its entry in pc_registry_.
+ static SignatureMap pc_registry_map_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry_arithmetic.cc b/src/arrow/cpp/src/gandiva/function_registry_arithmetic.cc
new file mode 100644
index 000000000..f34289f37
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry_arithmetic.cc
@@ -0,0 +1,125 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/function_registry_arithmetic.h"
+#include "gandiva/function_registry_common.h"
+
+namespace gandiva {
+
+// Registers `name` as a same-type binary function for every type in
+// NUMERIC_TYPES (the base numeric types plus decimal128).
+#define BINARY_SYMMETRIC_FN(name, ALIASES) \
+ NUMERIC_TYPES(BINARY_SYMMETRIC_SAFE_NULL_IF_NULL, name, ALIASES)
+
+// Registers `name` as a boolean-returning comparison for every type in
+// NUMERIC_BOOL_DATE_TYPES (numeric, date/time and boolean types).
+#define BINARY_RELATIONAL_BOOL_FN(name, ALIASES) \
+ NUMERIC_BOOL_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, name, ALIASES)
+
+// Registers `name` as a boolean-returning comparison for every type in
+// NUMERIC_DATE_TYPES. Despite the macro name, the boolean *input* type is not
+// covered here; "BOOL" refers to the result type.
+#define BINARY_RELATIONAL_BOOL_DATE_FN(name, ALIASES) \
+ NUMERIC_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, name, ALIASES)
+
+// castFLOAT8: cast from `type` to float64.
+#define UNARY_CAST_TO_FLOAT64(type) UNARY_SAFE_NULL_IF_NULL(castFLOAT8, {}, type, float64)
+
+// castFLOAT4: cast from `type` to float32.
+#define UNARY_CAST_TO_FLOAT32(type) UNARY_SAFE_NULL_IF_NULL(castFLOAT4, {}, type, float32)
+
+// castINT: cast from `type` to int32.
+#define UNARY_CAST_TO_INT32(type) UNARY_SAFE_NULL_IF_NULL(castINT, {}, type, int32)
+
+// castBIGINT: cast from `type` to int64.
+#define UNARY_CAST_TO_INT64(type) UNARY_SAFE_NULL_IF_NULL(castBIGINT, {}, type, int64)
+
+// Returns the descriptors of the arithmetic, cast, bitwise, rounding and
+// comparison functions.
+//
+// The table is built once as a function-local static; because the function
+// returns by value, every call hands back a copy of that table.
+std::vector<NativeFunction> GetArithmeticFunctionRegistry() {
+ static std::vector<NativeFunction> arithmetic_fn_registry_ = {
+ UNARY_SAFE_NULL_IF_NULL(not, {}, boolean, boolean),
+ UNARY_SAFE_NULL_IF_NULL(castBIGINT, {}, int32, int64),
+ UNARY_SAFE_NULL_IF_NULL(castINT, {}, int64, int32),
+ UNARY_SAFE_NULL_IF_NULL(castBIGINT, {}, decimal128, int64),
+
+ // cast to float32
+ UNARY_CAST_TO_FLOAT32(int32), UNARY_CAST_TO_FLOAT32(int64),
+ UNARY_CAST_TO_FLOAT32(float64),
+
+ // cast to int32
+ UNARY_CAST_TO_INT32(float32), UNARY_CAST_TO_INT32(float64),
+
+ // cast to int64
+ UNARY_CAST_TO_INT64(float32), UNARY_CAST_TO_INT64(float64),
+
+ // cast to float64
+ UNARY_CAST_TO_FLOAT64(int32), UNARY_CAST_TO_FLOAT64(int64),
+ UNARY_CAST_TO_FLOAT64(float32), UNARY_CAST_TO_FLOAT64(decimal128),
+
+ // cast to decimal
+ UNARY_SAFE_NULL_IF_NULL(castDECIMAL, {}, int32, decimal128),
+ UNARY_SAFE_NULL_IF_NULL(castDECIMAL, {}, int64, decimal128),
+ UNARY_SAFE_NULL_IF_NULL(castDECIMAL, {}, float32, decimal128),
+ UNARY_SAFE_NULL_IF_NULL(castDECIMAL, {}, float64, decimal128),
+ UNARY_SAFE_NULL_IF_NULL(castDECIMAL, {}, decimal128, decimal128),
+ UNARY_UNSAFE_NULL_IF_NULL(castDECIMAL, {}, utf8, decimal128),
+
+ NativeFunction("castDECIMALNullOnOverflow", {}, DataTypeVector{decimal128()},
+ decimal128(), kResultNullInternal,
+ "castDECIMALNullOnOverflow_decimal128"),
+
+ UNARY_SAFE_NULL_IF_NULL(castDATE, {}, int64, date64),
+ UNARY_SAFE_NULL_IF_NULL(castDATE, {}, int32, date32),
+ UNARY_SAFE_NULL_IF_NULL(castDATE, {}, date32, date64),
+
+ // add/sub/multiply/divide/mod
+ BINARY_SYMMETRIC_FN(add, {}), BINARY_SYMMETRIC_FN(subtract, {}),
+ BINARY_SYMMETRIC_FN(multiply, {}),
+ NUMERIC_TYPES(BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL, divide, {}),
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(mod, {"modulo"}, int64, int32, int32),
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(mod, {"modulo"}, int64, int64, int64),
+ BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL(mod, {"modulo"}, decimal128),
+ BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL(mod, {"modulo"}, float64),
+ BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL(div, {}, int32),
+ BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL(div, {}, int64),
+ BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL(div, {}, float32),
+ BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL(div, {}, float64),
+
+ // bitwise operators
+ BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(bitwise_and, {}, int32),
+ BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(bitwise_and, {}, int64),
+ BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(bitwise_or, {}, int32),
+ BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(bitwise_or, {}, int64),
+ BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(bitwise_xor, {}, int32),
+ BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(bitwise_xor, {}, int64),
+ UNARY_SAFE_NULL_IF_NULL(bitwise_not, {}, int32, int32),
+ UNARY_SAFE_NULL_IF_NULL(bitwise_not, {}, int64, int64),
+
+ // round functions
+ UNARY_SAFE_NULL_IF_NULL(round, {}, float32, float32),
+ UNARY_SAFE_NULL_IF_NULL(round, {}, float64, float64),
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(round, {}, float32, int32, float32),
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(round, {}, float64, int32, float64),
+ UNARY_SAFE_NULL_IF_NULL(round, {}, int32, int32),
+ UNARY_SAFE_NULL_IF_NULL(round, {}, int64, int64),
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(round, {}, int32, int32, int32),
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(round, {}, int64, int32, int64),
+
+ // compare functions
+ BINARY_RELATIONAL_BOOL_FN(equal, ({"eq", "same"})),
+ BINARY_RELATIONAL_BOOL_FN(not_equal, {}),
+ BINARY_RELATIONAL_BOOL_DATE_FN(less_than, {}),
+ BINARY_RELATIONAL_BOOL_DATE_FN(less_than_or_equal_to, {}),
+ BINARY_RELATIONAL_BOOL_DATE_FN(greater_than, {}),
+ BINARY_RELATIONAL_BOOL_DATE_FN(greater_than_or_equal_to, {}),
+
+ // binary representation of integer values
+ UNARY_UNSAFE_NULL_IF_NULL(bin, {}, int32, utf8),
+ UNARY_UNSAFE_NULL_IF_NULL(bin, {}, int64, utf8)};
+
+ return arithmetic_fn_registry_;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry_arithmetic.h b/src/arrow/cpp/src/gandiva/function_registry_arithmetic.h
new file mode 100644
index 000000000..693d3b95e
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry_arithmetic.h
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+#include "gandiva/native_function.h"
+
+namespace gandiva {
+
+/// Returns (by value) the descriptors of the arithmetic, cast, bitwise,
+/// rounding and comparison functions.
+std::vector<NativeFunction> GetArithmeticFunctionRegistry();
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry_common.h b/src/arrow/cpp/src/gandiva/function_registry_common.h
new file mode 100644
index 000000000..66f945150
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry_common.h
@@ -0,0 +1,268 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "gandiva/arrow.h"
+#include "gandiva/function_signature.h"
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/native_function.h"
+
+/* This is a private file, intended for internal use by gandiva & must not be included
+ * directly.
+ */
+namespace gandiva {
+
+using arrow::binary;
+using arrow::boolean;
+using arrow::date32;
+using arrow::date64;
+using arrow::day_time_interval;
+using arrow::float32;
+using arrow::float64;
+using arrow::int16;
+using arrow::int32;
+using arrow::int64;
+using arrow::int8;
+using arrow::month_interval;
+using arrow::uint16;
+using arrow::uint32;
+using arrow::uint64;
+using arrow::uint8;
+using arrow::utf8;
+
+// Parameter-free wrappers for the parameterized arrow types, so that the
+// registration macros below can name a type and invoke it as TYPE().
+
+// time32 with millisecond unit.
+inline DataTypePtr time32() {
+  return arrow::time32(arrow::TimeUnit::MILLI);
+}
+
+// time64 with microsecond unit.
+inline DataTypePtr time64() {
+  return arrow::time64(arrow::TimeUnit::MICRO);
+}
+
+// timestamp with millisecond unit.
+inline DataTypePtr timestamp() {
+  return arrow::timestamp(arrow::TimeUnit::MILLI);
+}
+
+// decimal created as arrow::decimal(38, 0).
+inline DataTypePtr decimal128() {
+  return arrow::decimal(38, 0);
+}
+
+// Hash functor for SignatureMap: hashes the signature that the key points to
+// rather than the pointer value itself.
+struct KeyHash {
+  std::size_t operator()(const FunctionSignature* k) const {
+    const std::size_t signature_hash = k->Hash();
+    return signature_hash;
+  }
+};
+
+// Equality functor for SignatureMap: two keys are equal when the signatures
+// they point to compare equal, regardless of the pointer addresses.
+struct KeyEquals {
+  bool operator()(const FunctionSignature* s1, const FunctionSignature* s2) const {
+    const FunctionSignature& lhs = *s1;
+    const FunctionSignature& rhs = *s2;
+    return lhs == rhs;
+  }
+};
+
+// Maps a function signature to the native function implementing it. Keys are
+// hashed and compared by the pointed-to signature (see KeyHash / KeyEquals).
+using SignatureMap = std::unordered_map<const FunctionSignature*,
+                                        const NativeFunction*, KeyHash, KeyEquals>;
+
+// Binary functions that :
+// - have the same input type for both params
+// - output type is same as the input type
+// - NULL handling is of type NULL_IF_NULL
+//
+// The pre-compiled fn name includes the base name & input type names. eg. add_int32_int32
+#define BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, \
+ DataTypeVector{TYPE(), TYPE()}, TYPE(), kResultNullIfNull, \
+ ARROW_STRINGIFY(NAME##_##TYPE##_##TYPE))
+
+// Binary functions that :
+// - have the same input type for both params
+// - NULL handling is of type NULL_IF_NULL
+// - need the execution context and can return errors.
+//
+// The pre-compiled fn name includes the base name & input type names. eg. add_int32_int32
+#define BINARY_UNSAFE_NULL_IF_NULL(NAME, ALIASES, IN_TYPE, OUT_TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, \
+ DataTypeVector{IN_TYPE(), IN_TYPE()}, OUT_TYPE(), kResultNullIfNull, \
+ ARROW_STRINGIFY(NAME##_##IN_TYPE##_##IN_TYPE), \
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)
+
+// Same as BINARY_UNSAFE_NULL_IF_NULL, with the output type equal to the input type.
+#define BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL(NAME, ALIASES, TYPE) \
+ BINARY_UNSAFE_NULL_IF_NULL(NAME, ALIASES, TYPE, TYPE)
+
+// Binary functions that :
+// - have different input types, or output type
+// - NULL handling is of type NULL_IF_NULL
+//
+// The pre-compiled fn name includes the base name & input type names. eg. mod_int64_int32
+#define BINARY_GENERIC_SAFE_NULL_IF_NULL(NAME, ALIASES, IN_TYPE1, IN_TYPE2, OUT_TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, \
+ DataTypeVector{IN_TYPE1(), IN_TYPE2()}, OUT_TYPE(), kResultNullIfNull, \
+ ARROW_STRINGIFY(NAME##_##IN_TYPE1##_##IN_TYPE2))
+
+// Binary functions that :
+// - have the same input type
+// - output type is boolean
+// - NULL handling is of type NULL_IF_NULL
+//
+// The pre-compiled fn name includes the base name & input type names.
+// eg. equal_int32_int32
+#define BINARY_RELATIONAL_SAFE_NULL_IF_NULL(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, \
+ DataTypeVector{TYPE(), TYPE()}, boolean(), kResultNullIfNull, \
+ ARROW_STRINGIFY(NAME##_##TYPE##_##TYPE))
+
+// Unary functions that :
+// - NULL handling is of type NULL_IF_NULL
+//
+// The pre-compiled fn name includes the base name & input type name. eg. castFloat_int32
+#define UNARY_SAFE_NULL_IF_NULL(NAME, ALIASES, IN_TYPE, OUT_TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, DataTypeVector{IN_TYPE()}, \
+ OUT_TYPE(), kResultNullIfNull, ARROW_STRINGIFY(NAME##_##IN_TYPE))
+
+// Unary functions that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. eg. isnull_int32
+#define UNARY_SAFE_NULL_NEVER_BOOL(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, DataTypeVector{TYPE()}, \
+ boolean(), kResultNullNever, ARROW_STRINGIFY(NAME##_##TYPE))
+
+// Unary functions that :
+// - NULL handling is of type NULL_IF_NULL
+// - need the execution context and can return errors
+//
+// The pre-compiled fn name includes the base name & input type name. eg. castFloat_int32
+#define UNARY_UNSAFE_NULL_IF_NULL(NAME, ALIASES, IN_TYPE, OUT_TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, DataTypeVector{IN_TYPE()}, \
+ OUT_TYPE(), kResultNullIfNull, ARROW_STRINGIFY(NAME##_##IN_TYPE), \
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)
+
+// Binary functions that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type names,
+// eg. is_distinct_from_int32_int32
+#define BINARY_SAFE_NULL_NEVER_BOOL(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, \
+ DataTypeVector{TYPE(), TYPE()}, boolean(), kResultNullNever, \
+ ARROW_STRINGIFY(NAME##_##TYPE##_##TYPE))
+
+// Extract functions (used with date/time types) that :
+// - NULL handling is of type NULL_IF_NULL
+// - output type is int64
+//
+// The pre-compiled fn name includes the base name & input type name.
+// eg. extractYear_date64
+#define EXTRACT_SAFE_NULL_IF_NULL(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, DataTypeVector{TYPE()}, \
+ int64(), kResultNullIfNull, ARROW_STRINGIFY(NAME##_##TYPE))
+
+// Truncate functions that :
+// - output type is the same as the input type
+// - NULL handling is of type NULL_IF_NULL
+//
+// The pre-compiled fn name includes the base name & input type name.
+#define TRUNCATE_SAFE_NULL_IF_NULL(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, DataTypeVector{TYPE()}, \
+ TYPE(), kResultNullIfNull, ARROW_STRINGIFY(NAME##_##TYPE))
+
+// Last day functions (used with date/time types) that :
+// - NULL handling is of type NULL_IF_NULL
+//
+// The pre-compiled fn name includes the base name & input type name. eg:
+// - last_day_from_date64
+#define LAST_DAY_SAFE_NULL_IF_NULL(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, DataTypeVector{TYPE()}, \
+ date64(), kResultNullIfNull, ARROW_STRINGIFY(NAME##_from_##TYPE))
+
+// Hash32 functions that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. hash32_int8
+#define HASH32_SAFE_NULL_NEVER(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, DataTypeVector{TYPE()}, \
+ int32(), kResultNullNever, ARROW_STRINGIFY(NAME##_##TYPE))
+
+// Hash64 functions that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. hash64_int8
+#define HASH64_SAFE_NULL_NEVER(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, DataTypeVector{TYPE()}, \
+ int64(), kResultNullNever, ARROW_STRINGIFY(NAME##_##TYPE))
+
+// Hash32 functions with seed that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. hash32WithSeed_int8
+#define HASH32_SEED_SAFE_NULL_NEVER(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, \
+ DataTypeVector{TYPE(), int32()}, int32(), kResultNullNever, \
+ ARROW_STRINGIFY(NAME##WithSeed_##TYPE))
+
+// Hash64 functions with seed that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. hash64WithSeed_int8
+#define HASH64_SEED_SAFE_NULL_NEVER(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, \
+ DataTypeVector{TYPE(), int64()}, int64(), kResultNullNever, \
+ ARROW_STRINGIFY(NAME##WithSeed_##TYPE))
+
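+// HashSHA1 functions that :
+// - NULL handling is of type NULL_NEVER
+// - can return errors
+//
+// The function name is always gdv_fn_sha1_<input type>, eg. gdv_fn_sha1_float64;
+// NAME is used only as the registered name.
+// NOTE: the ALIASES argument is ignored; the aliases are fixed to {"sha", "sha1"}.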
+#define HASH_SHA1_NULL_NEVER(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, {"sha", "sha1"}, DataTypeVector{TYPE()}, utf8(), \
+ kResultNullNever, ARROW_STRINGIFY(gdv_fn_sha1_##TYPE), \
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)
+
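+// HashSHA256 functions that :
+// - NULL handling is of type NULL_NEVER
+// - can return errors
+//
+// The function name is always gdv_fn_sha256_<input type>, eg. gdv_fn_sha256_float64;
+// NAME is used only as the registered name.
+// NOTE: the ALIASES argument is ignored; the alias is fixed to {"sha256"}.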
+#define HASH_SHA256_NULL_NEVER(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, {"sha256"}, DataTypeVector{TYPE()}, utf8(), kResultNullNever, \
+ ARROW_STRINGIFY(gdv_fn_sha256_##TYPE), \
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)
+
+// Iterate the inner macro over all base numeric types (integer and floating point)
+#define BASE_NUMERIC_TYPES(INNER, NAME, ALIASES) \
+ INNER(NAME, ALIASES, int8), INNER(NAME, ALIASES, int16), INNER(NAME, ALIASES, int32), \
+ INNER(NAME, ALIASES, int64), INNER(NAME, ALIASES, uint8), \
+ INNER(NAME, ALIASES, uint16), INNER(NAME, ALIASES, uint32), \
+ INNER(NAME, ALIASES, uint64), INNER(NAME, ALIASES, float32), \
+ INNER(NAME, ALIASES, float64)
+
+// Iterate the inner macro over all numeric types (base numeric types and decimal128)
+#define NUMERIC_TYPES(INNER, NAME, ALIASES) \
+ BASE_NUMERIC_TYPES(INNER, NAME, ALIASES), INNER(NAME, ALIASES, decimal128)
+
+// Iterate the inner macro over numeric and date/time types
+#define NUMERIC_DATE_TYPES(INNER, NAME, ALIASES) \
+ NUMERIC_TYPES(INNER, NAME, ALIASES), DATE_TYPES(INNER, NAME, ALIASES), \
+ TIME_TYPES(INNER, NAME, ALIASES), INNER(NAME, ALIASES, date32)
+
+// Iterate the inner macro over the date types date64 and timestamp
+// (date32 is not part of this list)
+#define DATE_TYPES(INNER, NAME, ALIASES) \
+ INNER(NAME, ALIASES, date64), INNER(NAME, ALIASES, timestamp)
+
+// Iterate the inner macro over all time types
+#define TIME_TYPES(INNER, NAME, ALIASES) INNER(NAME, ALIASES, time32)
+
+// Iterate the inner macro over all variable-length types (utf8 and binary)
+#define VAR_LEN_TYPES(INNER, NAME, ALIASES) \
+ INNER(NAME, ALIASES, utf8), INNER(NAME, ALIASES, binary)
+
+// Iterate the inner macro over all numeric types, date types and bool type
+#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME, ALIASES) \
+ NUMERIC_DATE_TYPES(INNER, NAME, ALIASES), INNER(NAME, ALIASES, boolean)
+
+// Iterate the inner macro over all numeric types, date types, bool and varlen types
+#define NUMERIC_BOOL_DATE_VAR_LEN_TYPES(INNER, NAME, ALIASES) \
+ NUMERIC_BOOL_DATE_TYPES(INNER, NAME, ALIASES), VAR_LEN_TYPES(INNER, NAME, ALIASES)
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry_datetime.cc b/src/arrow/cpp/src/gandiva/function_registry_datetime.cc
new file mode 100644
index 000000000..b8d2e7b6c
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry_datetime.cc
@@ -0,0 +1,132 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/function_registry_datetime.h"
+
+#include "gandiva/function_registry_common.h"
+
+namespace gandiva {
+
+// Expands INNER over DATE_TYPES once per date/time unit (Millennium ... Second),
+// producing functions named <name><Unit>, eg. extractYear.
+// NOTE(review): the alias lists are attached regardless of INNER, so both the
+// extract and the date_trunc_ expansions receive aliases such as "year" —
+// confirm this is intended.
+#define DATE_EXTRACTION_TRUNCATION_FNS(INNER, name) \
+ DATE_TYPES(INNER, name##Millennium, {}), DATE_TYPES(INNER, name##Century, {}), \
+ DATE_TYPES(INNER, name##Decade, {}), DATE_TYPES(INNER, name##Year, {"year"}), \
+ DATE_TYPES(INNER, name##Quarter, {}), DATE_TYPES(INNER, name##Month, {"month"}), \
+ DATE_TYPES(INNER, name##Week, ({"weekofyear", "yearweek"})), \
+ DATE_TYPES(INNER, name##Day, ({"day", "dayofmonth"})), \
+ DATE_TYPES(INNER, name##Hour, {"hour"}), \
+ DATE_TYPES(INNER, name##Minute, {"minute"}), \
+ DATE_TYPES(INNER, name##Second, {"second"})
+
+// Unary functions that :
+// - output type is timestamp
+// - NULL handling is of type NULL_IF_NULL
+//
+// The pre-compiled fn name includes the base name & input type name.
+#define TO_TIMESTAMP_SAFE_NULL_IF_NULL(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, DataTypeVector{TYPE()}, \
+ timestamp(), kResultNullIfNull, ARROW_STRINGIFY(NAME##_##TYPE))
+
+// Unary functions that :
+// - output type is time32
+// - NULL handling is of type NULL_IF_NULL
+//
+// The pre-compiled fn name includes the base name & input type name.
+#define TO_TIME_SAFE_NULL_IF_NULL(NAME, ALIASES, TYPE) \
+ NativeFunction(#NAME, std::vector<std::string> ALIASES, DataTypeVector{TYPE()}, \
+ time32(), kResultNullIfNull, ARROW_STRINGIFY(NAME##_##TYPE))
+
+// Expands the extract macro over TIME_TYPES for the Hour, Minute and Second units.
+#define TIME_EXTRACTION_FNS(name) \
+ TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Hour, {"hour"}), \
+ TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Minute, {"minute"}), \
+ TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Second, {"second"})
+
+// Returns the descriptors of the date/time extraction, truncation, cast and
+// conversion functions.
+//
+// The table is built once as a function-local static; because the function
+// returns by value, every call hands back a copy of that table.
+std::vector<NativeFunction> GetDateTimeFunctionRegistry() {
+ static std::vector<NativeFunction> date_time_fn_registry_ = {
+ DATE_EXTRACTION_TRUNCATION_FNS(EXTRACT_SAFE_NULL_IF_NULL, extract),
+ DATE_EXTRACTION_TRUNCATION_FNS(TRUNCATE_SAFE_NULL_IF_NULL, date_trunc_),
+
+ DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDoy, {}),
+ DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDow, {}),
+ DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractEpoch, {}),
+
+ TIME_EXTRACTION_FNS(extract),
+
+ NativeFunction("castDATE", {}, DataTypeVector{utf8()}, date64(), kResultNullIfNull,
+ "castDATE_utf8",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castTIMESTAMP", {}, DataTypeVector{utf8()}, timestamp(),
+ kResultNullIfNull, "castTIMESTAMP_utf8",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castVARCHAR", {}, DataTypeVector{timestamp(), int64()}, utf8(),
+ kResultNullIfNull, "castVARCHAR_timestamp_int64",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("to_date", {}, DataTypeVector{utf8(), utf8()}, date64(),
+ kResultNullInternal, "gdv_fn_to_date_utf8_utf8",
+ NativeFunction::kNeedsContext |
+ NativeFunction::kNeedsFunctionHolder |
+ NativeFunction::kCanReturnErrors),
+
+ NativeFunction("to_date", {}, DataTypeVector{utf8(), utf8(), int32()}, date64(),
+ kResultNullInternal, "gdv_fn_to_date_utf8_utf8_int32",
+ NativeFunction::kNeedsContext |
+ NativeFunction::kNeedsFunctionHolder |
+ NativeFunction::kCanReturnErrors),
+ NativeFunction("castTIMESTAMP", {}, DataTypeVector{date64()}, timestamp(),
+ kResultNullIfNull, "castTIMESTAMP_date64"),
+
+ NativeFunction("castTIMESTAMP", {}, DataTypeVector{int64()}, timestamp(),
+ kResultNullIfNull, "castTIMESTAMP_int64"),
+
+ NativeFunction("castDATE", {"to_date"}, DataTypeVector{timestamp()}, date64(),
+ kResultNullIfNull, "castDATE_timestamp"),
+
+ NativeFunction("castTIME", {}, DataTypeVector{timestamp()}, time32(),
+ kResultNullIfNull, "castTIME_timestamp"),
+
+ NativeFunction("castBIGINT", {}, DataTypeVector{day_time_interval()}, int64(),
+ kResultNullIfNull, "castBIGINT_daytimeinterval"),
+
+ NativeFunction("castINT", {"castNULLABLEINT"}, DataTypeVector{month_interval()},
+ int32(), kResultNullIfNull, "castINT_year_interval",
+ NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castBIGINT", {"castNULLABLEBIGINT"},
+ DataTypeVector{month_interval()}, int64(), kResultNullIfNull,
+ "castBIGINT_year_interval", NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castNULLABLEINTERVALYEAR", {"castINTERVALYEAR"},
+ DataTypeVector{int32()}, month_interval(), kResultNullIfNull,
+ "castNULLABLEINTERVALYEAR_int32",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castNULLABLEINTERVALYEAR", {"castINTERVALYEAR"},
+ DataTypeVector{int64()}, month_interval(), kResultNullIfNull,
+ "castNULLABLEINTERVALYEAR_int64",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castNULLABLEINTERVALDAY", {"castINTERVALDAY"},
+ DataTypeVector{int32()}, day_time_interval(), kResultNullIfNull,
+ "castNULLABLEINTERVALDAY_int32"),
+
+ NativeFunction("castNULLABLEINTERVALDAY", {"castINTERVALDAY"},
+ DataTypeVector{int64()}, day_time_interval(), kResultNullIfNull,
+ "castNULLABLEINTERVALDAY_int64"),
+
+ NativeFunction("extractDay", {}, DataTypeVector{day_time_interval()}, int64(),
+ kResultNullIfNull, "extractDay_daytimeinterval"),
+
+ DATE_TYPES(LAST_DAY_SAFE_NULL_IF_NULL, last_day, {}),
+ BASE_NUMERIC_TYPES(TO_TIME_SAFE_NULL_IF_NULL, to_time, {}),
+ BASE_NUMERIC_TYPES(TO_TIMESTAMP_SAFE_NULL_IF_NULL, to_timestamp, {})};
+
+ return date_time_fn_registry_;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry_datetime.h b/src/arrow/cpp/src/gandiva/function_registry_datetime.h
new file mode 100644
index 000000000..46172ec62
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry_datetime.h
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+#include "gandiva/native_function.h"
+
+namespace gandiva {
+// Returns the date/time NativeFunction entries as a vector, by value.
+std::vector<NativeFunction> GetDateTimeFunctionRegistry();
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry_hash.cc b/src/arrow/cpp/src/gandiva/function_registry_hash.cc
new file mode 100644
index 000000000..7fad9321e
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry_hash.cc
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/function_registry_hash.h"
+#include "gandiva/function_registry_common.h"
+
+namespace gandiva {
+// Each *_FN helper forwards (name, ALIASES) to NUMERIC_BOOL_DATE_VAR_LEN_TYPES.
+#define HASH32_SAFE_NULL_NEVER_FN(name, ALIASES) \
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, name, ALIASES)
+
+#define HASH32_SEED_SAFE_NULL_NEVER_FN(name, ALIASES) \
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SEED_SAFE_NULL_NEVER, name, ALIASES)
+
+#define HASH64_SAFE_NULL_NEVER_FN(name, ALIASES) \
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SAFE_NULL_NEVER, name, ALIASES)
+
+#define HASH64_SEED_SAFE_NULL_NEVER_FN(name, ALIASES) \
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SEED_SAFE_NULL_NEVER, name, ALIASES)
+
+#define HASH_SHA1_NULL_NEVER_FN(name, ALIASES) \
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH_SHA1_NULL_NEVER, name, ALIASES)
+
+#define HASH_SHA256_NULL_NEVER_FN(name, ALIASES) \
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH_SHA256_NULL_NEVER, name, ALIASES)
+// Returns the hash NativeFunction table (function-local static, copied on return).
+std::vector<NativeFunction> GetHashFunctionRegistry() {
+ static std::vector<NativeFunction> hash_fn_registry_ = {
+ HASH32_SAFE_NULL_NEVER_FN(hash, {}),  // plain "hash" goes through the HASH32 helper
+ HASH32_SAFE_NULL_NEVER_FN(hash32, {}),
+ HASH32_SAFE_NULL_NEVER_FN(hash32AsDouble, {}),
+ // the same two names again, this time through the HASH32_SEED helper
+ HASH32_SEED_SAFE_NULL_NEVER_FN(hash32, {}),
+ HASH32_SEED_SAFE_NULL_NEVER_FN(hash32AsDouble, {}),
+
+ HASH64_SAFE_NULL_NEVER_FN(hash64, {}),
+ HASH64_SAFE_NULL_NEVER_FN(hash64AsDouble, {}),
+ // the same two names again, this time through the HASH64_SEED helper
+ HASH64_SEED_SAFE_NULL_NEVER_FN(hash64, {}),
+ HASH64_SEED_SAFE_NULL_NEVER_FN(hash64AsDouble, {}),
+
+ HASH_SHA1_NULL_NEVER_FN(hashSHA1, {}),
+
+ HASH_SHA256_NULL_NEVER_FN(hashSHA256, {})};
+
+ return hash_fn_registry_;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry_hash.h b/src/arrow/cpp/src/gandiva/function_registry_hash.h
new file mode 100644
index 000000000..4f96d30cf
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry_hash.h
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+#include "gandiva/native_function.h"
+
+namespace gandiva {
+// Returns the hash NativeFunction entries (defined in function_registry_hash.cc).
+std::vector<NativeFunction> GetHashFunctionRegistry();
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry_math_ops.cc b/src/arrow/cpp/src/gandiva/function_registry_math_ops.cc
new file mode 100644
index 000000000..49afd4003
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry_math_ops.cc
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/function_registry_math_ops.h"
+#include "gandiva/function_registry_common.h"
+
+namespace gandiva {
+// Six UNARY_SAFE_NULL_IF_NULL entries for `name`: each listed type paired with float64.
+#define MATH_UNARY_OPS(name, ALIASES) \
+ UNARY_SAFE_NULL_IF_NULL(name, ALIASES, int32, float64), \
+ UNARY_SAFE_NULL_IF_NULL(name, ALIASES, int64, float64), \
+ UNARY_SAFE_NULL_IF_NULL(name, ALIASES, uint32, float64), \
+ UNARY_SAFE_NULL_IF_NULL(name, ALIASES, uint64, float64), \
+ UNARY_SAFE_NULL_IF_NULL(name, ALIASES, float32, float64), \
+ UNARY_SAFE_NULL_IF_NULL(name, ALIASES, float64, float64)
+// The same six type pairings, expanded through BINARY_UNSAFE_NULL_IF_NULL.
+#define MATH_BINARY_UNSAFE(name, ALIASES) \
+ BINARY_UNSAFE_NULL_IF_NULL(name, ALIASES, int32, float64), \
+ BINARY_UNSAFE_NULL_IF_NULL(name, ALIASES, int64, float64), \
+ BINARY_UNSAFE_NULL_IF_NULL(name, ALIASES, uint32, float64), \
+ BINARY_UNSAFE_NULL_IF_NULL(name, ALIASES, uint64, float64), \
+ BINARY_UNSAFE_NULL_IF_NULL(name, ALIASES, float32, float64), \
+ BINARY_UNSAFE_NULL_IF_NULL(name, ALIASES, float64, float64)
+// BINARY_GENERIC_SAFE_NULL_IF_NULL entries: the listed type twice, then float64.
+#define MATH_BINARY_SAFE(name, ALIASES) \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, int32, int32, float64), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, int64, int64, float64), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, uint32, uint32, float64), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, uint64, uint64, float64), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, float32, float32, float64), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, float64, float64, float64)
+// The two helpers below forward a *_NEVER_BOOL macro to NUMERIC_BOOL_DATE_TYPES.
+#define UNARY_SAFE_NULL_NEVER_BOOL_FN(name, ALIASES) \
+ NUMERIC_BOOL_DATE_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, name, ALIASES)
+
+#define BINARY_SAFE_NULL_NEVER_BOOL_FN(name, ALIASES) \
+ NUMERIC_BOOL_DATE_TYPES(BINARY_SAFE_NULL_NEVER_BOOL, name, ALIASES)
+// Returns the math NativeFunction table (function-local static, copied on return).
+std::vector<NativeFunction> GetMathOpsFunctionRegistry() {
+ static std::vector<NativeFunction> math_fn_registry_ = {
+ MATH_UNARY_OPS(cbrt, {}), MATH_UNARY_OPS(exp, {}), MATH_UNARY_OPS(log, {}),
+ MATH_UNARY_OPS(log10, {}),
+ // log is registered a second time, through the binary helper
+ MATH_BINARY_UNSAFE(log, {}),
+
+ BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(power, {"pow"}, float64),
+
+ UNARY_SAFE_NULL_NEVER_BOOL_FN(isnull, {}),
+ UNARY_SAFE_NULL_NEVER_BOOL_FN(isnotnull, {}),
+
+ NUMERIC_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, isnumeric, {}),
+
+ BINARY_SAFE_NULL_NEVER_BOOL_FN(is_distinct_from, {}),
+ BINARY_SAFE_NULL_NEVER_BOOL_FN(is_not_distinct_from, {}),
+
+ // trigonometry functions
+ MATH_UNARY_OPS(sin, {}), MATH_UNARY_OPS(cos, {}), MATH_UNARY_OPS(asin, {}),
+ MATH_UNARY_OPS(acos, {}), MATH_UNARY_OPS(tan, {}), MATH_UNARY_OPS(atan, {}),
+ MATH_UNARY_OPS(sinh, {}), MATH_UNARY_OPS(cosh, {}), MATH_UNARY_OPS(tanh, {}),
+ MATH_UNARY_OPS(cot, {}), MATH_UNARY_OPS(radians, {}), MATH_UNARY_OPS(degrees, {}),
+ MATH_BINARY_SAFE(atan2, {}),
+
+ // decimal functions
+ UNARY_SAFE_NULL_IF_NULL(abs, {}, decimal128, decimal128),
+ UNARY_SAFE_NULL_IF_NULL(ceil, {}, decimal128, decimal128),
+ UNARY_SAFE_NULL_IF_NULL(floor, {}, decimal128, decimal128),
+ UNARY_SAFE_NULL_IF_NULL(round, {}, decimal128, decimal128),
+ UNARY_SAFE_NULL_IF_NULL(truncate, {"trunc"}, decimal128, decimal128),
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(round, {}, decimal128, int32, decimal128),
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(truncate, {"trunc"}, decimal128, int32,
+ decimal128),
+ // explicit entries: both random overloads carry kNeedsFunctionHolder
+ NativeFunction("truncate", {"trunc"}, DataTypeVector{int64(), int32()}, int64(),
+ kResultNullIfNull, "truncate_int64_int32"),
+ NativeFunction("random", {"rand"}, DataTypeVector{}, float64(), kResultNullNever,
+ "gdv_fn_random", NativeFunction::kNeedsFunctionHolder),
+ NativeFunction("random", {"rand"}, DataTypeVector{int32()}, float64(),
+ kResultNullNever, "gdv_fn_random_with_seed",
+ NativeFunction::kNeedsFunctionHolder)};
+
+ return math_fn_registry_;
+}
+
+// Undefine every helper macro defined at the top of this file.
+#undef MATH_UNARY_OPS
+#undef MATH_BINARY_UNSAFE
+#undef MATH_BINARY_SAFE
+
+#undef UNARY_SAFE_NULL_NEVER_BOOL_FN
+#undef BINARY_SAFE_NULL_NEVER_BOOL_FN
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry_math_ops.h b/src/arrow/cpp/src/gandiva/function_registry_math_ops.h
new file mode 100644
index 000000000..2c8a40d53
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry_math_ops.h
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+#include "gandiva/native_function.h"
+
+namespace gandiva {
+// Returns the math NativeFunction entries (defined in function_registry_math_ops.cc).
+std::vector<NativeFunction> GetMathOpsFunctionRegistry();
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry_string.cc b/src/arrow/cpp/src/gandiva/function_registry_string.cc
new file mode 100644
index 000000000..3ea426c85
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry_string.cc
@@ -0,0 +1,422 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/function_registry_string.h"
+
+#include "gandiva/function_registry_common.h"
+
+namespace gandiva {
+// Shorthands for the table below; all four are #undef'd at the end of this file.
+#define BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(name, ALIASES) \
+ VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, name, ALIASES)
+// utf8-only form of the relational helper.
+#define BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN(name, ALIASES) \
+ BINARY_RELATIONAL_SAFE_NULL_IF_NULL(name, ALIASES, utf8)
+// Two UNARY_SAFE_NULL_IF_NULL entries for `name`: utf8 and binary, each with int32.
+#define UNARY_OCTET_LEN_FN(name, ALIASES) \
+ UNARY_SAFE_NULL_IF_NULL(name, ALIASES, utf8, int32), \
+ UNARY_SAFE_NULL_IF_NULL(name, ALIASES, binary, int32)
+// Same helper name as in function_registry_math_ops.cc, but over VAR_LEN_TYPES here.
+#define UNARY_SAFE_NULL_NEVER_BOOL_FN(name, ALIASES) \
+ VAR_LEN_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, name, ALIASES)
+// Returns the string NativeFunction table (function-local static, copied on return).
+std::vector<NativeFunction> GetStringFunctionRegistry() {
+ static std::vector<NativeFunction> string_fn_registry_ = {
+ BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(equal, {}),
+ BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(not_equal, {}),
+ BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(less_than, {}),
+ BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(less_than_or_equal_to, {}),
+ BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(greater_than, {}),
+ BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(greater_than_or_equal_to, {}),
+
+ BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN(starts_with, {}),
+ BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN(ends_with, {}),
+ BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN(is_substr, {}),
+
+ BINARY_UNSAFE_NULL_IF_NULL(locate, {"position"}, utf8, int32),
+ BINARY_UNSAFE_NULL_IF_NULL(strpos, {}, utf8, int32),
+
+ UNARY_OCTET_LEN_FN(octet_length, {}), UNARY_OCTET_LEN_FN(bit_length, {}),
+
+ UNARY_UNSAFE_NULL_IF_NULL(char_length, {}, utf8, int32),
+ UNARY_UNSAFE_NULL_IF_NULL(length, {}, utf8, int32),
+ UNARY_UNSAFE_NULL_IF_NULL(lengthUtf8, {}, binary, int32),
+ UNARY_UNSAFE_NULL_IF_NULL(reverse, {}, utf8, utf8),
+ UNARY_UNSAFE_NULL_IF_NULL(ltrim, {}, utf8, utf8),
+ UNARY_UNSAFE_NULL_IF_NULL(rtrim, {}, utf8, utf8),
+ UNARY_UNSAFE_NULL_IF_NULL(btrim, {}, utf8, utf8),
+ UNARY_UNSAFE_NULL_IF_NULL(space, {}, int32, utf8),
+ UNARY_UNSAFE_NULL_IF_NULL(space, {}, int64, utf8),
+
+ UNARY_SAFE_NULL_NEVER_BOOL_FN(isnull, {}),
+ UNARY_SAFE_NULL_NEVER_BOOL_FN(isnotnull, {}),
+
+ NativeFunction("ascii", {}, DataTypeVector{utf8()}, int32(), kResultNullIfNull,
+ "ascii_utf8"),
+
+ NativeFunction("base64", {}, DataTypeVector{binary()}, utf8(), kResultNullIfNull,
+ "gdv_fn_base64_encode_binary", NativeFunction::kNeedsContext),
+
+ NativeFunction("unbase64", {}, DataTypeVector{utf8()}, binary(), kResultNullIfNull,
+ "gdv_fn_base64_decode_utf8", NativeFunction::kNeedsContext),
+
+ NativeFunction("repeat", {}, DataTypeVector{utf8(), int32()}, utf8(),
+ kResultNullIfNull, "repeat_utf8_int32",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("upper", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
+ "gdv_fn_upper_utf8", NativeFunction::kNeedsContext),
+
+ NativeFunction("lower", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
+ "gdv_fn_lower_utf8", NativeFunction::kNeedsContext),
+
+ NativeFunction("initcap", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
+ "gdv_fn_initcap_utf8",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+ // casts taking a single utf8 or binary argument
+ NativeFunction("castBIT", {"castBOOLEAN"}, DataTypeVector{utf8()}, boolean(),
+ kResultNullIfNull, "castBIT_utf8", NativeFunction::kNeedsContext),
+
+ NativeFunction("castINT", {}, DataTypeVector{utf8()}, int32(), kResultNullIfNull,
+ "gdv_fn_castINT_utf8",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castBIGINT", {}, DataTypeVector{utf8()}, int64(), kResultNullIfNull,
+ "gdv_fn_castBIGINT_utf8",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castFLOAT4", {}, DataTypeVector{utf8()}, float32(),
+ kResultNullIfNull, "gdv_fn_castFLOAT4_utf8",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castFLOAT8", {}, DataTypeVector{utf8()}, float64(),
+ kResultNullIfNull, "gdv_fn_castFLOAT8_utf8",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castINT", {}, DataTypeVector{binary()}, int32(), kResultNullIfNull,
+ "gdv_fn_castINT_varbinary",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castBIGINT", {}, DataTypeVector{binary()}, int64(),
+ kResultNullIfNull, "gdv_fn_castBIGINT_varbinary",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castFLOAT4", {}, DataTypeVector{binary()}, float32(),
+ kResultNullIfNull, "gdv_fn_castFLOAT4_varbinary",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castFLOAT8", {}, DataTypeVector{binary()}, float64(),
+ kResultNullIfNull, "gdv_fn_castFLOAT8_varbinary",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+ // castVARCHAR overloads: (value, int64) -> utf8
+ NativeFunction("castVARCHAR", {}, DataTypeVector{boolean(), int64()}, utf8(),
+ kResultNullIfNull, "castVARCHAR_bool_int64",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("castVARCHAR", {}, DataTypeVector{utf8(), int64()}, utf8(),
+ kResultNullIfNull, "castVARCHAR_utf8_int64",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("castVARCHAR", {}, DataTypeVector{binary(), int64()}, utf8(),
+ kResultNullIfNull, "castVARCHAR_binary_int64",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("castVARCHAR", {}, DataTypeVector{int32(), int64()}, utf8(),
+ kResultNullIfNull, "gdv_fn_castVARCHAR_int32_int64",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castVARCHAR", {}, DataTypeVector{int64(), int64()}, utf8(),
+ kResultNullIfNull, "gdv_fn_castVARCHAR_int64_int64",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castVARCHAR", {}, DataTypeVector{float32(), int64()}, utf8(),
+ kResultNullIfNull, "gdv_fn_castVARCHAR_float32_int64",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castVARCHAR", {}, DataTypeVector{float64(), int64()}, utf8(),
+ kResultNullIfNull, "gdv_fn_castVARCHAR_float64_int64",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castVARCHAR", {}, DataTypeVector{decimal128(), int64()}, utf8(),
+ kResultNullIfNull, "castVARCHAR_decimal128_int64",
+ NativeFunction::kNeedsContext),
+ // like / ilike entries carry kNeedsFunctionHolder
+ NativeFunction("like", {}, DataTypeVector{utf8(), utf8()}, boolean(),
+ kResultNullIfNull, "gdv_fn_like_utf8_utf8",
+ NativeFunction::kNeedsFunctionHolder),
+
+ NativeFunction("like", {}, DataTypeVector{utf8(), utf8(), utf8()}, boolean(),
+ kResultNullIfNull, "gdv_fn_like_utf8_utf8_utf8",
+ NativeFunction::kNeedsFunctionHolder),
+
+ NativeFunction("ilike", {}, DataTypeVector{utf8(), utf8()}, boolean(),
+ kResultNullIfNull, "gdv_fn_ilike_utf8_utf8",
+ NativeFunction::kNeedsFunctionHolder),
+
+ NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
+ kResultNullIfNull, "ltrim_utf8_utf8", NativeFunction::kNeedsContext),
+
+ NativeFunction("rtrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
+ kResultNullIfNull, "rtrim_utf8_utf8", NativeFunction::kNeedsContext),
+
+ NativeFunction("btrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
+ kResultNullIfNull, "btrim_utf8_utf8", NativeFunction::kNeedsContext),
+
+ NativeFunction("substr", {"substring"},
+ DataTypeVector{utf8(), int64() /*offset*/, int64() /*length*/},
+ utf8(), kResultNullIfNull, "substr_utf8_int64_int64",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("substr", {"substring"}, DataTypeVector{utf8(), int64() /*offset*/},
+ utf8(), kResultNullIfNull, "substr_utf8_int64",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("lpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(),
+ kResultNullIfNull, "lpad_utf8_int32_utf8",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("lpad", {}, DataTypeVector{utf8(), int32()}, utf8(),
+ kResultNullIfNull, "lpad_utf8_int32", NativeFunction::kNeedsContext),
+
+ NativeFunction("rpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(),
+ kResultNullIfNull, "rpad_utf8_int32_utf8",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("rpad", {}, DataTypeVector{utf8(), int32()}, utf8(),
+ kResultNullIfNull, "rpad_utf8_int32", NativeFunction::kNeedsContext),
+
+ NativeFunction("regexp_replace", {}, DataTypeVector{utf8(), utf8(), utf8()}, utf8(),
+ kResultNullIfNull, "gdv_fn_regexp_replace_utf8_utf8",
+ NativeFunction::kNeedsContext |
+ NativeFunction::kNeedsFunctionHolder |
+ NativeFunction::kCanReturnErrors),
+ // concatOperator overloads for 2 to 10 utf8 arguments
+ NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(),
+ kResultNullIfNull, "concatOperator_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8(), utf8()}, utf8(),
+ kResultNullIfNull, "concatOperator_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8(), utf8(), utf8()},
+ utf8(), kResultNullIfNull, "concatOperator_utf8_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction("concatOperator", {},
+ DataTypeVector{utf8(), utf8(), utf8(), utf8(), utf8()}, utf8(),
+ kResultNullIfNull, "concatOperator_utf8_utf8_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction("concatOperator", {},
+ DataTypeVector{utf8(), utf8(), utf8(), utf8(), utf8(), utf8()},
+ utf8(), kResultNullIfNull,
+ "concatOperator_utf8_utf8_utf8_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction(
+ "concatOperator", {},
+ DataTypeVector{utf8(), utf8(), utf8(), utf8(), utf8(), utf8(), utf8()}, utf8(),
+ kResultNullIfNull, "concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction(
+ "concatOperator", {},
+ DataTypeVector{utf8(), utf8(), utf8(), utf8(), utf8(), utf8(), utf8(), utf8()},
+ utf8(), kResultNullIfNull,
+ "concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction("concatOperator", {},
+ DataTypeVector{utf8(), utf8(), utf8(), utf8(), utf8(), utf8(),
+ utf8(), utf8(), utf8()},
+ utf8(), kResultNullIfNull,
+ "concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction("concatOperator", {},
+ DataTypeVector{utf8(), utf8(), utf8(), utf8(), utf8(), utf8(),
+ utf8(), utf8(), utf8(), utf8()},
+ utf8(), kResultNullIfNull,
+ "concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+
+ // concat treats null inputs as empty strings whereas concatOperator returns null if
+ // one of the inputs is null
+ NativeFunction("concat", {}, DataTypeVector{utf8(), utf8()}, utf8(),
+ kResultNullNever, "concat_utf8_utf8", NativeFunction::kNeedsContext),
+ NativeFunction("concat", {}, DataTypeVector{utf8(), utf8(), utf8()}, utf8(),
+ kResultNullNever, "concat_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction("concat", {}, DataTypeVector{utf8(), utf8(), utf8(), utf8()}, utf8(),
+ kResultNullNever, "concat_utf8_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction("concat", {}, DataTypeVector{utf8(), utf8(), utf8(), utf8(), utf8()},
+ utf8(), kResultNullNever, "concat_utf8_utf8_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction("concat", {},
+ DataTypeVector{utf8(), utf8(), utf8(), utf8(), utf8(), utf8()},
+ utf8(), kResultNullNever, "concat_utf8_utf8_utf8_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction(
+ "concat", {},
+ DataTypeVector{utf8(), utf8(), utf8(), utf8(), utf8(), utf8(), utf8()}, utf8(),
+ kResultNullNever, "concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction(
+ "concat", {},
+ DataTypeVector{utf8(), utf8(), utf8(), utf8(), utf8(), utf8(), utf8(), utf8()},
+ utf8(), kResultNullNever, "concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction("concat", {},
+ DataTypeVector{utf8(), utf8(), utf8(), utf8(), utf8(), utf8(),
+ utf8(), utf8(), utf8()},
+ utf8(), kResultNullNever,
+ "concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+ NativeFunction("concat", {},
+ DataTypeVector{utf8(), utf8(), utf8(), utf8(), utf8(), utf8(),
+ utf8(), utf8(), utf8(), utf8()},
+ utf8(), kResultNullNever,
+ "concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("byte_substr", {"bytesubstring"},
+ DataTypeVector{binary(), int32(), int32()}, binary(),
+ kResultNullIfNull, "byte_substr_binary_int32_int32",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_fromUTF8", {"convert_fromutf8"}, DataTypeVector{binary()},
+ utf8(), kResultNullIfNull, "convert_fromUTF8_binary",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_replaceUTF8", {"convert_replaceutf8"},
+ DataTypeVector{binary(), utf8()}, utf8(), kResultNullIfNull,
+ "convert_replace_invalid_fromUTF8_binary",
+ NativeFunction::kNeedsContext),
+ // convert_to* entries: every one of them is registered with a binary() result
+ NativeFunction("convert_toDOUBLE", {}, DataTypeVector{float64()}, binary(),
+ kResultNullIfNull, "convert_toDOUBLE",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toDOUBLE_be", {}, DataTypeVector{float64()}, binary(),
+ kResultNullIfNull, "convert_toDOUBLE_be",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toFLOAT", {}, DataTypeVector{float32()}, binary(),
+ kResultNullIfNull, "convert_toFLOAT", NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toFLOAT_be", {}, DataTypeVector{float32()}, binary(),
+ kResultNullIfNull, "convert_toFLOAT_be",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toINT", {}, DataTypeVector{int32()}, binary(),
+ kResultNullIfNull, "convert_toINT", NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toINT_be", {}, DataTypeVector{int32()}, binary(),
+ kResultNullIfNull, "convert_toINT_be",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toBIGINT", {}, DataTypeVector{int64()}, binary(),
+ kResultNullIfNull, "convert_toBIGINT",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toBIGINT_be", {}, DataTypeVector{int64()}, binary(),
+ kResultNullIfNull, "convert_toBIGINT_be",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toBOOLEAN_BYTE", {}, DataTypeVector{boolean()}, binary(),
+ kResultNullIfNull, "convert_toBOOLEAN",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toTIME_EPOCH", {}, DataTypeVector{time32()}, binary(),
+ kResultNullIfNull, "convert_toTIME_EPOCH",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toTIME_EPOCH_be", {}, DataTypeVector{time32()}, binary(),
+ kResultNullIfNull, "convert_toTIME_EPOCH_be",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toTIMESTAMP_EPOCH", {}, DataTypeVector{timestamp()},
+ binary(), kResultNullIfNull, "convert_toTIMESTAMP_EPOCH",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toTIMESTAMP_EPOCH_be", {}, DataTypeVector{timestamp()},
+ binary(), kResultNullIfNull, "convert_toTIMESTAMP_EPOCH_be",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toDATE_EPOCH", {}, DataTypeVector{date64()}, binary(),
+ kResultNullIfNull, "convert_toDATE_EPOCH",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toDATE_EPOCH_be", {}, DataTypeVector{date64()}, binary(),
+ kResultNullIfNull, "convert_toDATE_EPOCH_be",
+ NativeFunction::kNeedsContext),
+
+ NativeFunction("convert_toUTF8", {}, DataTypeVector{utf8()}, binary(),
+ kResultNullIfNull, "convert_toUTF8", NativeFunction::kNeedsContext),
+
+ NativeFunction("locate", {"position"}, DataTypeVector{utf8(), utf8(), int32()},
+ int32(), kResultNullIfNull, "locate_utf8_utf8_int32",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("replace", {}, DataTypeVector{utf8(), utf8(), utf8()}, utf8(),
+ kResultNullIfNull, "replace_utf8_utf8_utf8",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("binary_string", {}, DataTypeVector{utf8()}, binary(),
+ kResultNullIfNull, "binary_string", NativeFunction::kNeedsContext),
+
+ NativeFunction("left", {}, DataTypeVector{utf8(), int32()}, utf8(),
+ kResultNullIfNull, "left_utf8_int32", NativeFunction::kNeedsContext),
+
+ NativeFunction("right", {}, DataTypeVector{utf8(), int32()}, utf8(),
+ kResultNullIfNull, "right_utf8_int32",
+ NativeFunction::kNeedsContext),
+ // castVARBINARY overloads: (value, int64) -> binary
+ NativeFunction("castVARBINARY", {}, DataTypeVector{binary(), int64()}, binary(),
+ kResultNullIfNull, "castVARBINARY_binary_int64",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castVARBINARY", {}, DataTypeVector{utf8(), int64()}, binary(),
+ kResultNullIfNull, "castVARBINARY_utf8_int64",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castVARBINARY", {}, DataTypeVector{int32(), int64()}, binary(),
+ kResultNullIfNull, "gdv_fn_castVARBINARY_int32_int64",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castVARBINARY", {}, DataTypeVector{int64(), int64()}, binary(),
+ kResultNullIfNull, "gdv_fn_castVARBINARY_int64_int64",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castVARBINARY", {}, DataTypeVector{float32(), int64()}, binary(),
+ kResultNullIfNull, "gdv_fn_castVARBINARY_float32_int64",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("castVARBINARY", {}, DataTypeVector{float64(), int64()}, binary(),
+ kResultNullIfNull, "gdv_fn_castVARBINARY_float64_int64",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+ NativeFunction("split_part", {}, DataTypeVector{utf8(), utf8(), int32()}, utf8(),
+ kResultNullIfNull, "split_part",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)};
+
+ return string_fn_registry_;
+}
+
+#undef BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN  // undefine this file's helper macros
+
+#undef BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN
+
+#undef UNARY_OCTET_LEN_FN
+
+#undef UNARY_SAFE_NULL_NEVER_BOOL_FN
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry_string.h b/src/arrow/cpp/src/gandiva/function_registry_string.h
new file mode 100644
index 000000000..f14c95a81
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry_string.h
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+#include "gandiva/native_function.h"
+
+namespace gandiva {
+// Returns the string NativeFunction entries (defined in function_registry_string.cc).
+std::vector<NativeFunction> GetStringFunctionRegistry();
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry_test.cc b/src/arrow/cpp/src/gandiva/function_registry_test.cc
new file mode 100644
index 000000000..e3c1e85f7
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry_test.cc
@@ -0,0 +1,96 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/function_registry.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <string>
+#include <unordered_set>
+
+namespace gandiva {
+// Fixture that owns the FunctionRegistry instance queried by the tests below.
+class TestFunctionRegistry : public ::testing::Test {
+ protected:
+ FunctionRegistry registry_;
+};
+
+// A registered signature resolves to its NativeFunction and precompiled name.
+TEST_F(TestFunctionRegistry, TestFound) {
+  FunctionSignature add_i32_i32("add", {arrow::int32(), arrow::int32()}, arrow::int32());
+  const NativeFunction* function = registry_.LookupSignature(add_i32_i32);
+  ASSERT_NE(function, nullptr);  // fatal: the checks below dereference it
+  EXPECT_THAT(function->signatures(), testing::Contains(add_i32_i32));
+  EXPECT_EQ(function->pc_name(), "add_int32_int32");
+}
+
+// A lookup misses when the name is unknown or the return type differs.
+TEST_F(TestFunctionRegistry, TestNotFound) {
+  const FunctionSignature unknown_name("addX", {arrow::int32(), arrow::int32()},
+                                       arrow::int32());
+  const FunctionSignature wrong_ret("add", {arrow::int32(), arrow::int32()},
+                                    arrow::int64());
+  EXPECT_EQ(registry_.LookupSignature(unknown_name), nullptr);
+  EXPECT_EQ(registry_.LookupSignature(wrong_ret), nullptr);
+}
+
+// Each precompiled function must be referenced by exactly one NativeFunction,
+// and each signature must be registered exactly once.
+TEST_F(TestFunctionRegistry, TestNoDuplicates) {
+  std::unordered_set<std::string> pc_func_sigs;
+  std::unordered_set<std::string> native_func_duplicates;
+  std::unordered_set<std::string> func_sigs;
+  std::unordered_set<std::string> func_sig_duplicates;
+  for (auto native_func_it = registry_.begin(); native_func_it != registry_.end();
+       ++native_func_it) {
+    // Key the precompiled function by pc_name plus the first signature's types.
+    auto& first_sig = native_func_it->signatures().front();
+    auto pc_func_sig = FunctionSignature(native_func_it->pc_name(),
+                                         first_sig.param_types(), first_sig.ret_type())
+                           .ToString();
+    // insert() reports through `.second` whether the key was newly added.
+    if (!pc_func_sigs.insert(pc_func_sig).second) {
+      native_func_duplicates.insert(pc_func_sig);
+    }
+
+    for (auto& sig : native_func_it->signatures()) {
+      auto sig_str = sig.ToString();
+      if (!func_sigs.insert(sig_str).second) {
+        func_sig_duplicates.insert(sig_str);
+      }
+    }
+  }
+  std::ostringstream stream;
+  std::copy(native_func_duplicates.begin(), native_func_duplicates.end(),
+            std::ostream_iterator<std::string>(stream, "\n"));
+  EXPECT_TRUE(native_func_duplicates.empty())
+      << "Registry has duplicates.\nMultiple NativeFunction objects refer to the "
+         "following precompiled functions:\n"
+      << stream.str();
+
+  // Empty the buffer before reuse: clear() would only reset the stream's error
+  // flags and leave the first list in the second message.
+  stream.str("");
+  std::copy(func_sig_duplicates.begin(), func_sig_duplicates.end(),
+            std::ostream_iterator<std::string>(stream, "\n"));
+  EXPECT_TRUE(func_sig_duplicates.empty())
+      << "The following signatures are defined more than once possibly pointing to "
+         "different precompiled functions:\n"
+      << stream.str();
+}
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry_timestamp_arithmetic.cc b/src/arrow/cpp/src/gandiva/function_registry_timestamp_arithmetic.cc
new file mode 100644
index 000000000..c277dab72
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry_timestamp_arithmetic.cc
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/function_registry_timestamp_arithmetic.h"
+
+#include "gandiva/function_registry_common.h"
+
+namespace gandiva {
+// Eight overloads: an int32/int64 paired with a timestamp/date64, in either order.
+#define TIMESTAMP_ADD_FNS(name, ALIASES) \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, int32, timestamp, timestamp), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, int32, date64, date64), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, int64, timestamp, timestamp), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, int64, date64, date64), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, timestamp, int32, timestamp), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, date64, int32, date64), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, timestamp, int64, timestamp), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, date64, int64, date64)
+// One overload over two timestamps; int32 is presumably the result type.
+#define TIMESTAMP_DIFF_FN(name, ALIASES) \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, timestamp, timestamp, int32)
+// Same eight type combinations as TIMESTAMP_ADD_FNS, listed date/timestamp-first.
+#define DATE_ADD_FNS(name, ALIASES) \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, date64, int32, date64), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, timestamp, int32, timestamp), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, date64, int64, date64), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, timestamp, int64, timestamp), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, int32, date64, date64), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, int32, timestamp, timestamp), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, int64, date64, date64), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, int64, timestamp, timestamp)
+// Four overloads: a date64/timestamp first operand with an int32/int64 second operand.
+#define DATE_DIFF_FNS(name, ALIASES) \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, date64, int32, date64), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, timestamp, int32, timestamp), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, date64, int64, date64), \
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, ALIASES, timestamp, int64, timestamp)
+// Returns a copy of the function-local static table of date/time arithmetic functions.
+std::vector<NativeFunction> GetDateTimeArithmeticFunctionRegistry() {
+ static std::vector<NativeFunction> datetime_fn_registry_ = {
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(months_between, {}, date64, date64, float64),
+ BINARY_GENERIC_SAFE_NULL_IF_NULL(months_between, {}, timestamp, timestamp, float64),
+ // timestampdiff<Unit> entries; the Day variant passes "datediff" as ALIASES.
+ TIMESTAMP_DIFF_FN(timestampdiffSecond, {}),
+ TIMESTAMP_DIFF_FN(timestampdiffMinute, {}),
+ TIMESTAMP_DIFF_FN(timestampdiffHour, {}),
+ TIMESTAMP_DIFF_FN(timestampdiffDay, {"datediff"}),
+ TIMESTAMP_DIFF_FN(timestampdiffWeek, {}),
+ TIMESTAMP_DIFF_FN(timestampdiffMonth, {}),
+ TIMESTAMP_DIFF_FN(timestampdiffQuarter, {}),
+ TIMESTAMP_DIFF_FN(timestampdiffYear, {}),
+ // timestampadd<Unit> entries; the Month variant passes "add_months" as ALIASES.
+ TIMESTAMP_ADD_FNS(timestampaddSecond, {}),
+ TIMESTAMP_ADD_FNS(timestampaddMinute, {}),
+ TIMESTAMP_ADD_FNS(timestampaddHour, {}),
+ TIMESTAMP_ADD_FNS(timestampaddDay, {}),
+ TIMESTAMP_ADD_FNS(timestampaddWeek, {}),
+ TIMESTAMP_ADD_FNS(timestampaddMonth, {"add_months"}),
+ TIMESTAMP_ADD_FNS(timestampaddQuarter, {}),
+ TIMESTAMP_ADD_FNS(timestampaddYear, {}),
+ // date_add / add: the eight overloads listed in DATE_ADD_FNS.
+ DATE_ADD_FNS(date_add, {}),
+ DATE_ADD_FNS(add, {}),
+ // Additional "add" entry for (date64, int64) whose declared result is timestamp.
+ NativeFunction("add", {}, DataTypeVector{date64(), int64()}, timestamp(),
+ kResultNullIfNull, "add_date64_int64"),
+ // date_sub / subtract / date_diff: the four overloads listed in DATE_DIFF_FNS.
+ DATE_DIFF_FNS(date_sub, {}),
+ DATE_DIFF_FNS(subtract, {}),
+ DATE_DIFF_FNS(date_diff, {})};
+
+ return datetime_fn_registry_;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_registry_timestamp_arithmetic.h b/src/arrow/cpp/src/gandiva/function_registry_timestamp_arithmetic.h
new file mode 100644
index 000000000..9ac3ab2ec
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_registry_timestamp_arithmetic.h
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+#include "gandiva/native_function.h"
+
+namespace gandiva {
+// Returns the NativeFunction entries for date/time arithmetic functions.
+std::vector<NativeFunction> GetDateTimeArithmeticFunctionRegistry();
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_signature.cc b/src/arrow/cpp/src/gandiva/function_signature.cc
new file mode 100644
index 000000000..6dc641617
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_signature.cc
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/function_signature.h"
+
+#include <cstddef>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/hash_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string.h"
+
+using arrow::internal::AsciiEqualsCaseInsensitive;
+using arrow::internal::AsciiToLower;
+using arrow::internal::checked_cast;
+using arrow::internal::hash_combine;
+
+namespace gandiva {
+
+/// Signature-level type equality: ids must match, and decimals compare by
+/// byte width only, so precision/scale are not part of the signature.
+bool DataTypeEquals(const DataTypePtr& left, const DataTypePtr& right) {
+  if (left->id() != right->id()) {
+    return false;
+  }
+  if (left->id() != arrow::Type::DECIMAL) {
+    return left->Equals(right);
+  }
+  // Both sides are decimals from here on.
+  auto lhs_decimal = checked_cast<arrow::DecimalType*>(left.get());
+  auto rhs_decimal = checked_cast<arrow::DecimalType*>(right.get());
+  if (lhs_decimal == NULL || rhs_decimal == NULL) {
+    return false;
+  }
+  return lhs_decimal->byte_width() == rhs_decimal->byte_width();
+}
+
+/// Stores the name and types by move; DCHECKs a non-empty name and non-null types.
+FunctionSignature::FunctionSignature(std::string base_name, DataTypeVector param_types,
+                                     DataTypePtr ret_type)
+    : base_name_(std::move(base_name)), param_types_(std::move(param_types)),
+      ret_type_(std::move(ret_type)) {
+  DCHECK_GT(base_name_.length(), 0);
+  for (const auto& param_type : param_types_) {
+    DCHECK(param_type);
+  }
+  DCHECK(ret_type_);
+}
+
+/// Signatures match when the names agree ignoring ASCII case and the return and
+/// parameter types agree under DataTypeEquals.
+bool FunctionSignature::operator==(const FunctionSignature& other) const {
+  const size_t num_params = param_types_.size();
+  if (num_params != other.param_types_.size()) return false;
+  if (!DataTypeEquals(ret_type_, other.ret_type_)) return false;
+  if (!AsciiEqualsCaseInsensitive(base_name_, other.base_name_)) return false;
+  for (size_t pos = 0; pos < num_params; ++pos) {
+    if (!DataTypeEquals(param_types_[pos], other.param_types_[pos])) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/// Hash over the lower-cased name plus the type ids (not the full types) of the
+/// return value and of each parameter.
+std::size_t FunctionSignature::Hash() const {
+  static const size_t kSeedValue = 17;
+  size_t seed = kSeedValue;
+  hash_combine(seed, AsciiToLower(base_name_));
+  hash_combine(seed, static_cast<size_t>(ret_type_->id()));
+  // hash_range is avoided on purpose: only each type's id should contribute
+  for (size_t pos = 0; pos < param_types_.size(); ++pos) {
+    hash_combine(seed, static_cast<size_t>(param_types_[pos]->id()));
+  }
+  return seed;
+}
+
+/// Renders the signature as "<return type> <name>(<param types, comma separated>)".
+std::string FunctionSignature::ToString() const {
+  std::stringstream out;
+  out << ret_type_->ToString() << " " << base_name_ << "(";
+
+  // Emit the separator before every parameter except the first.
+  const char* separator = "";
+  for (const auto& param_type : param_types_) {
+    out << separator << param_type->ToString();
+    separator = ", ";
+  }
+
+  out << ")";
+  return out.str();
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_signature.h b/src/arrow/cpp/src/gandiva/function_signature.h
new file mode 100644
index 000000000..c3e363949
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_signature.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "gandiva/arrow.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// \brief Signature for a function: includes the base name, input param types and
+/// output type.
+class GANDIVA_EXPORT FunctionSignature {
+ public:
+ FunctionSignature(std::string base_name, DataTypeVector param_types,
+ DataTypePtr ret_type);
+ /// Equal when the names match ignoring ASCII case and all types match.
+ bool operator==(const FunctionSignature& other) const;
+
+ /// calculated based on name, datatype id of parameters and datatype id
+ /// of return type.
+ std::size_t Hash() const;
+ /// Return type of the function.
+ DataTypePtr ret_type() const { return ret_type_; }
+ /// Function name as stored by the constructor.
+ const std::string& base_name() const { return base_name_; }
+ /// Parameter types, returned as a copy of the stored vector.
+ DataTypeVector param_types() const { return param_types_; }
+ /// Formats the signature as "<ret type> <name>(<param types>)".
+ std::string ToString() const;
+
+ private:
+ std::string base_name_;
+ DataTypeVector param_types_;
+ DataTypePtr ret_type_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/function_signature_test.cc b/src/arrow/cpp/src/gandiva/function_signature_test.cc
new file mode 100644
index 000000000..0eb62d4e7
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/function_signature_test.cc
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/function_signature.h"
+
+#include <memory>
+
+#include <gtest/gtest.h>
+
+namespace gandiva {
+
+// Fixture providing locally allocated type objects for the comparison tests.
+class TestFunctionSignature : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // Use make_shared so these are distinct from the static instances returned
+    // by e.g. arrow::int32()
+    local_i32_type_ = std::make_shared<arrow::Int32Type>();
+    local_i64_type_ = std::make_shared<arrow::Int64Type>();
+    local_date32_type_ = std::make_shared<arrow::Date32Type>();
+  }
+
+  void TearDown() override {
+    local_i32_type_.reset();
+    local_i64_type_.reset();
+    local_date32_type_.reset();
+  }
+  // Non-singleton DataType instances, recreated by SetUp() for every test.
+  DataTypePtr local_i32_type_;
+  DataTypePtr local_i64_type_;
+  DataTypePtr local_date32_type_;
+};
+
+// ToString() prints the return type, then the name and the parameter list.
+TEST_F(TestFunctionSignature, TestToString) {
+  const FunctionSignature signature("myfunc", {arrow::int32(), arrow::float32()},
+                                    arrow::float64());
+  EXPECT_EQ(signature.ToString(), "double myfunc(int32, float)");
+}
+
+// Names compare ignoring ASCII case; type objects compare by value, not identity.
+TEST_F(TestFunctionSignature, TestEqualsName) {
+  const FunctionSignature add_static("add", {arrow::int32()}, arrow::int32());
+  EXPECT_EQ(add_static, FunctionSignature("add", {arrow::int32()}, arrow::int32()));
+  EXPECT_FALSE(add_static == FunctionSignature("sub", {arrow::int32()}, arrow::int32()));
+
+  // Distinct DataType instances of the same type are interchangeable.
+  EXPECT_EQ(FunctionSignature("add", {arrow::int32()}, arrow::int64()),
+            FunctionSignature("add", {local_i32_type_}, local_i64_type_));
+
+  EXPECT_EQ(FunctionSignature("extractDay", {arrow::int64()}, arrow::int64()),
+            FunctionSignature("extractday", {arrow::int64()}, arrow::int64()));
+
+  EXPECT_EQ(
+      FunctionSignature("castVARCHAR", {arrow::utf8(), arrow::int64()}, arrow::utf8()),
+      FunctionSignature("castvarchar", {arrow::utf8(), arrow::int64()}, arrow::utf8()));
+}
+
+TEST_F(TestFunctionSignature, TestEqualsParamCount) {
+  const FunctionSignature unary("add", {arrow::int32()}, arrow::int32());
+  const FunctionSignature binary("add", {arrow::int32(), arrow::int32()}, arrow::int32());
+  EXPECT_FALSE(binary == unary);  // same name and types, different arity
+}
+
+// Parameter types are compared position by position, by value.
+TEST_F(TestFunctionSignature, TestEqualsParamValue) {
+  const FunctionSignature add_i32("add", {arrow::int32()}, arrow::int32());
+  EXPECT_FALSE(add_i32 == FunctionSignature("add", {arrow::int64()}, arrow::int32()));
+  EXPECT_FALSE(add_i32 == FunctionSignature("add", {arrow::float32(), arrow::float32()},
+                                            arrow::int32()));
+
+  // Same types in a different order.
+  EXPECT_FALSE(
+      FunctionSignature("add", {arrow::int32(), arrow::int64()}, arrow::int32()) ==
+      FunctionSignature("add", {arrow::int64(), arrow::int32()}, arrow::int32()));
+  const FunctionSignature month_of_date32("extract_month", {arrow::date32()},
+                                          arrow::int64());
+  EXPECT_EQ(month_of_date32,
+            FunctionSignature("extract_month", {local_date32_type_}, local_i64_type_));
+  EXPECT_FALSE(month_of_date32 ==
+               FunctionSignature("extract_month", {arrow::date64()}, arrow::date32()));
+}
+
+TEST_F(TestFunctionSignature, TestEqualsReturn) {  // only the return type differs
+  const FunctionSignature returns_i32("add", {arrow::int32()}, arrow::int32());
+  EXPECT_FALSE(FunctionSignature("add", {arrow::int32()}, arrow::int64()) == returns_i32);
+}
+
+// Equal signatures hash equally, whatever the type instance or the name's case.
+TEST_F(TestFunctionSignature, TestHash) {
+  const auto static_hash =
+      FunctionSignature("add", {arrow::int32(), arrow::int32()}, arrow::int64()).Hash();
+  const FunctionSignature f2("add", {local_i32_type_, local_i32_type_}, local_i64_type_);
+  EXPECT_EQ(static_hash, f2.Hash());
+  EXPECT_EQ(FunctionSignature("extractDay", {arrow::int64()}, arrow::int64()).Hash(),
+            FunctionSignature("extractday", {arrow::int64()}, arrow::int64()).Hash());
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/gandiva.pc.in b/src/arrow/cpp/src/gandiva/gandiva.pc.in
new file mode 100644
index 000000000..22ff11a4f
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/gandiva.pc.in
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+prefix=@CMAKE_INSTALL_PREFIX@
+libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
+includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
+
+Name: Gandiva
+Description: Gandiva is a toolset for compiling and evaluating expressions on Arrow data.
+Version: @GANDIVA_VERSION@
+Requires: arrow
+Libs: -L${libdir} -lgandiva
+Cflags: -I${includedir}
diff --git a/src/arrow/cpp/src/gandiva/gandiva_aliases.h b/src/arrow/cpp/src/gandiva/gandiva_aliases.h
new file mode 100644
index 000000000..6cbb671ff
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/gandiva_aliases.h
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+namespace gandiva {
+// Forward declarations and smart-pointer / container aliases shared across gandiva.
+class Dex;
+using DexPtr = std::shared_ptr<Dex>;
+using DexVector = std::vector<DexPtr>;
+
+class ValueValidityPair;
+using ValueValidityPairPtr = std::shared_ptr<ValueValidityPair>;
+using ValueValidityPairVector = std::vector<ValueValidityPairPtr>;
+
+class FieldDescriptor;
+using FieldDescriptorPtr = std::shared_ptr<FieldDescriptor>;
+
+class FuncDescriptor;
+using FuncDescriptorPtr = std::shared_ptr<FuncDescriptor>;
+
+class LValue;
+using LValuePtr = std::shared_ptr<LValue>;
+
+class Expression;
+using ExpressionPtr = std::shared_ptr<Expression>;
+using ExpressionVector = std::vector<ExpressionPtr>;
+
+class Condition;
+using ConditionPtr = std::shared_ptr<Condition>;
+
+class Node;
+using NodePtr = std::shared_ptr<Node>;
+using NodeVector = std::vector<NodePtr>;
+
+class EvalBatch;
+using EvalBatchPtr = std::shared_ptr<EvalBatch>;
+
+class FunctionSignature;
+using FuncSignaturePtr = std::shared_ptr<FunctionSignature>;
+using FuncSignatureVector = std::vector<FuncSignaturePtr>;
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/gdv_function_stubs.cc b/src/arrow/cpp/src/gandiva/gdv_function_stubs.cc
new file mode 100644
index 000000000..ed34eef4a
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/gdv_function_stubs.cc
@@ -0,0 +1,1603 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/gdv_function_stubs.h"
+
+#include <utf8proc.h>
+
+#include <string>
+#include <vector>
+
+#include "arrow/util/base64.h"
+#include "arrow/util/double_conversion.h"
+#include "arrow/util/formatting.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/utf8.h"
+#include "arrow/util/value_parsing.h"
+#include "gandiva/engine.h"
+#include "gandiva/exported_funcs.h"
+#include "gandiva/formatting_utils.h"
+#include "gandiva/hash_utils.h"
+#include "gandiva/in_holder.h"
+#include "gandiva/like_holder.h"
+#include "gandiva/precompiled/types.h"
+#include "gandiva/random_generator_holder.h"
+#include "gandiva/replace_holder.h"
+#include "gandiva/to_date_holder.h"
+
+/// Stub functions that can be accessed from LLVM or the pre-compiled library.
+
+extern "C" {
+
+// Applies the LikeHolder stored at `ptr` to the input; `pattern` is not read here.
+bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len,
+                           const char* pattern, int pattern_len) {
+  return (*reinterpret_cast<gandiva::LikeHolder*>(ptr))(std::string(data, data_len));
+}
+
+// Applies the LikeHolder stored at `ptr`; pattern and escape char are not read here.
+bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len,
+                                const char* pattern, int pattern_len,
+                                const char* escape_char, int escape_char_len) {
+  return (*reinterpret_cast<gandiva::LikeHolder*>(ptr))(std::string(data, data_len));
+}
+
+// Applies the LikeHolder stored at `ptr` to the input; `pattern` is not read here.
+bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len,
+                            const char* pattern, int pattern_len) {
+  return (*reinterpret_cast<gandiva::LikeHolder*>(ptr))(std::string(data, data_len));
+}
+
+// Forwards to the ReplaceHolder at `holder_ptr` with the ExecutionContext at `ptr`.
+const char* gdv_fn_regexp_replace_utf8_utf8(
+    int64_t ptr, int64_t holder_ptr, const char* data, int32_t data_len,
+    const char* /*pattern*/, int32_t /*pattern_len*/, const char* replace_string,
+    int32_t replace_string_len, int32_t* out_length) {
+  auto* exec_context = reinterpret_cast<gandiva::ExecutionContext*>(ptr);
+  auto& replacer = *reinterpret_cast<gandiva::ReplaceHolder*>(holder_ptr);
+
+  return replacer(exec_context, data, data_len, replace_string, replace_string_len,
+                  out_length);
+}
+
+// Invokes the RandomGeneratorHolder stored at `ptr` and returns its result.
+double gdv_fn_random(int64_t ptr) {
+  auto& generator = *reinterpret_cast<gandiva::RandomGeneratorHolder*>(ptr);
+  return generator();
+}
+
+// Invokes the RandomGeneratorHolder at `ptr`; the seed arguments are not read here.
+double gdv_fn_random_with_seed(int64_t ptr, int32_t seed, bool seed_validity) {
+  auto& generator = *reinterpret_cast<gandiva::RandomGeneratorHolder*>(ptr);
+  return generator();
+}
+
+// Invokes the ToDateHolder at `holder_ptr`; the pattern arguments are not read here.
+int64_t gdv_fn_to_date_utf8_utf8(int64_t context_ptr, int64_t holder_ptr,
+                                 const char* data, int data_len, bool in1_validity,
+                                 const char* pattern, int pattern_len, bool in2_validity,
+                                 bool* out_valid) {
+  auto* exec_context = reinterpret_cast<gandiva::ExecutionContext*>(context_ptr);
+  auto& to_date = *reinterpret_cast<gandiva::ToDateHolder*>(holder_ptr);
+  return to_date(exec_context, data, data_len, in1_validity, out_valid);
+}
+
+// Invokes the ToDateHolder at `holder_ptr`; pattern and suppress_errors are not read.
+int64_t gdv_fn_to_date_utf8_utf8_int32(int64_t context_ptr, int64_t holder_ptr,
+                                       const char* data, int data_len, bool in1_validity,
+                                       const char* pattern, int pattern_len,
+                                       bool in2_validity, int32_t suppress_errors,
+                                       bool in3_validity, bool* out_valid) {
+  auto* exec_context = reinterpret_cast<gandiva::ExecutionContext*>(context_ptr);
+  auto& to_date = *reinterpret_cast<gandiva::ToDateHolder*>(holder_ptr);
+  return to_date(exec_context, data, data_len, in1_validity, out_valid);
+}
+
+// Membership test against the InHolder<int32_t> stored at `ptr`.
+bool gdv_fn_in_expr_lookup_int32(int64_t ptr, int32_t value, bool in_validity) {
+  auto* in_set = reinterpret_cast<gandiva::InHolder<int32_t>*>(ptr);
+
+  // Short-circuit: an invalid (null) input never matches and skips the lookup.
+  return in_validity && in_set->HasValue(value);
+}
+
+// Membership test against the InHolder<int64_t> stored at `ptr`.
+bool gdv_fn_in_expr_lookup_int64(int64_t ptr, int64_t value, bool in_validity) {
+  auto* in_set = reinterpret_cast<gandiva::InHolder<int64_t>*>(ptr);
+
+  // Short-circuit: an invalid (null) input never matches and skips the lookup.
+  return in_validity && in_set->HasValue(value);
+}
+
+// Membership test for a decimal built from its high/low words, precision and scale.
+bool gdv_fn_in_expr_lookup_decimal(int64_t ptr, int64_t value_high, int64_t value_low,
+                                   int32_t precision, int32_t scale, bool in_validity) {
+  if (!in_validity) return false;  // a null input never matches
+
+  using DecimalInHolder = gandiva::InHolder<gandiva::DecimalScalar128>;
+  auto* in_set = reinterpret_cast<DecimalInHolder*>(ptr);
+  gandiva::DecimalScalar128 needle(value_high, value_low, precision, scale);
+  return in_set->HasValue(needle);
+}
+
+// Membership test against the InHolder<float> stored at `ptr`.
+bool gdv_fn_in_expr_lookup_float(int64_t ptr, float value, bool in_validity) {
+  auto* in_set = reinterpret_cast<gandiva::InHolder<float>*>(ptr);
+
+  // Short-circuit: an invalid (null) input never matches and skips the lookup.
+  return in_validity && in_set->HasValue(value);
+}
+
+// Membership test against the InHolder<double> stored at `ptr`.
+bool gdv_fn_in_expr_lookup_double(int64_t ptr, double value, bool in_validity) {
+  auto* in_set = reinterpret_cast<gandiva::InHolder<double>*>(ptr);
+
+  // Short-circuit: an invalid (null) input never matches and skips the lookup.
+  return in_validity && in_set->HasValue(value);
+}
+
+// Membership test for the string view (data, data_len) in the InHolder at `ptr`.
+bool gdv_fn_in_expr_lookup_utf8(int64_t ptr, const char* data, int data_len,
+                                bool in_validity) {
+  if (!in_validity) return false;  // a null input never matches
+
+  using StringInHolder = gandiva::InHolder<std::string>;
+  auto* in_set = reinterpret_cast<StringInHolder*>(ptr);
+  return in_set->HasValue(arrow::util::string_view(data, data_len));
+}
+
+// Appends `entry_len` bytes from `entry_buf` to the ResizableBuffer behind `data_ptr`
+// and records the entry's begin/end offsets at offsets[slot] and offsets[slot + 1].
+// Returns 0 on success, or -1 after storing the resize error in the execution context.
+int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr,
+                                      int32_t* offsets, int64_t slot,
+                                      const char* entry_buf, int32_t entry_len) {
+  auto* out_buffer = reinterpret_cast<arrow::ResizableBuffer*>(data_ptr);
+  const int32_t begin = static_cast<int32_t>(out_buffer->size());
+  const int32_t end = begin + entry_len;
+
+  // Resizing also sets the size in the buffer.
+  auto resize_status = out_buffer->Resize(end, false /*shrink*/);
+  if (!resize_status.ok()) {
+    auto* exec_context = reinterpret_cast<gandiva::ExecutionContext*>(context_ptr);
+    exec_context->set_error_msg(resize_status.message().c_str());
+    return -1;
+  }
+
+  // Copy the payload, then publish its offsets.
+  memcpy(out_buffer->mutable_data() + begin, entry_buf, entry_len);
+  offsets[slot] = begin;
+  offsets[slot + 1] = end;
+  return 0;
+}
+// gdv_fn_sha1_<TYPE> for scalar types: hashes gdv_double_to_long((double)value).
+#define SHA1_HASH_FUNCTION(TYPE) \
+ GANDIVA_EXPORT \
+ const char* gdv_fn_sha1_##TYPE(int64_t context, gdv_##TYPE value, bool validity, \
+ int32_t* out_length) { \
+ if (!validity) { \
+ return gandiva::gdv_hash_using_sha1(context, NULLPTR, 0, out_length); \
+ } \
+ auto value_as_long = gandiva::gdv_double_to_long((double)value); \
+ const char* result = gandiva::gdv_hash_using_sha1( \
+ context, &value_as_long, sizeof(value_as_long), out_length); \
+ \
+ return result; \
+ }
+// gdv_fn_sha1_<TYPE> for buffer types: hashes the value_length bytes at value.
+#define SHA1_HASH_FUNCTION_BUF(TYPE) \
+ GANDIVA_EXPORT \
+ const char* gdv_fn_sha1_##TYPE(int64_t context, gdv_##TYPE value, \
+ int32_t value_length, bool value_validity, \
+ int32_t* out_length) { \
+ if (!value_validity) { \
+ return gandiva::gdv_hash_using_sha1(context, NULLPTR, 0, out_length); \
+ } \
+ return gandiva::gdv_hash_using_sha1(context, value, value_length, out_length); \
+ }
+// gdv_fn_sha256_<TYPE> for scalar types: hashes gdv_double_to_long((double)value).
+#define SHA256_HASH_FUNCTION(TYPE) \
+ GANDIVA_EXPORT \
+ const char* gdv_fn_sha256_##TYPE(int64_t context, gdv_##TYPE value, bool validity, \
+ int32_t* out_length) { \
+ if (!validity) { \
+ return gandiva::gdv_hash_using_sha256(context, NULLPTR, 0, out_length); \
+ } \
+ auto value_as_long = gandiva::gdv_double_to_long((double)value); \
+ const char* result = gandiva::gdv_hash_using_sha256( \
+ context, &value_as_long, sizeof(value_as_long), out_length); \
+ return result; \
+ }
+// gdv_fn_sha256_<TYPE> for buffer types: hashes the value_length bytes at value.
+#define SHA256_HASH_FUNCTION_BUF(TYPE) \
+ GANDIVA_EXPORT \
+ const char* gdv_fn_sha256_##TYPE(int64_t context, gdv_##TYPE value, \
+ int32_t value_length, bool value_validity, \
+ int32_t* out_length) { \
+ if (!value_validity) { \
+ return gandiva::gdv_hash_using_sha256(context, NULLPTR, 0, out_length); \
+ } \
+ \
+ return gandiva::gdv_hash_using_sha256(context, value, value_length, out_length); \
+ }
+
+// X-macro list: invokes INNER once for each fixed-width type name below
+// (signed/unsigned integers, floats, boolean, and the date/time/timestamp types).
+// INNER receives the bare type suffix, e.g. INNER(int32).
+#define SHA_NUMERIC_BOOL_DATE_PARAMS(INNER) \
+ INNER(int8) \
+ INNER(int16) \
+ INNER(int32) \
+ INNER(int64) \
+ INNER(uint8) \
+ INNER(uint16) \
+ INNER(uint32) \
+ INNER(uint64) \
+ INNER(float32) \
+ INNER(float64) \
+ INNER(boolean) \
+ INNER(date64) \
+ INNER(date32) \
+ INNER(time32) \
+ INNER(timestamp)
+
+// X-macro list: invokes INNER once for each variable-length type name (utf8, binary).
+#define SHA_VAR_LEN_PARAMS(INNER) \
+ INNER(utf8) \
+ INNER(binary)
+
+// Emit the gdv_fn_sha256_<type> definitions: fixed-width types first, then the
+// variable-length (pointer + length) types.
+SHA_NUMERIC_BOOL_DATE_PARAMS(SHA256_HASH_FUNCTION)
+SHA_VAR_LEN_PARAMS(SHA256_HASH_FUNCTION_BUF)
+
+// Emit the gdv_fn_sha1_<type> definitions for the same two type lists.
+SHA_NUMERIC_BOOL_DATE_PARAMS(SHA1_HASH_FUNCTION)
+SHA_VAR_LEN_PARAMS(SHA1_HASH_FUNCTION_BUF)
+
+// The type-list macros are no longer needed once the definitions are emitted.
+#undef SHA_NUMERIC_BOOL_DATE_PARAMS
+#undef SHA_VAR_LEN_PARAMS
+
+// SHA-256 hash of a decimal128 value supplied as its high/low 64-bit halves.
+// Precision and scale are accepted but unused. When x_isvalid is false a null
+// message of length 0 is hashed; otherwise 16 bytes from BasicDecimal128::ToBytes()
+// are hashed.
+GANDIVA_EXPORT
+const char* gdv_fn_sha256_decimal128(int64_t context, int64_t x_high, uint64_t x_low,
+                                     int32_t /*x_precision*/, int32_t /*x_scale*/,
+                                     gdv_boolean x_isvalid, int32_t* out_length) {
+  if (x_isvalid) {
+    const gandiva::BasicDecimal128 decimal_value(x_high, x_low);
+    const auto raw_bytes = decimal_value.ToBytes();
+    return gandiva::gdv_hash_using_sha256(context, raw_bytes.data(), 16, out_length);
+  }
+
+  return gandiva::gdv_hash_using_sha256(context, NULLPTR, 0, out_length);
+}
+
+// SHA-1 hash of a decimal128 value supplied as its high/low 64-bit halves.
+// Precision and scale are accepted but unused. When x_isvalid is false a null
+// message of length 0 is hashed; otherwise 16 bytes from BasicDecimal128::ToBytes()
+// are hashed.
+GANDIVA_EXPORT
+const char* gdv_fn_sha1_decimal128(int64_t context, int64_t x_high, uint64_t x_low,
+                                   int32_t /*x_precision*/, int32_t /*x_scale*/,
+                                   gdv_boolean x_isvalid, int32_t* out_length) {
+  if (x_isvalid) {
+    const gandiva::BasicDecimal128 decimal_value(x_high, x_low);
+    const auto raw_bytes = decimal_value.ToBytes();
+    return gandiva::gdv_hash_using_sha1(context, raw_bytes.data(), 16, out_length);
+  }
+
+  return gandiva::gdv_hash_using_sha1(context, NULLPTR, 0, out_length);
+}
+
+// Parses the decimal literal in `in[0, in_length)` with arrow::Decimal128::FromString.
+//
+// On success returns 0 and writes the parsed precision, scale and the high/low
+// 64-bit halves of the value through the output pointers. On failure returns -1
+// after recording an error message on the execution context; the high/low outputs
+// are not written by this function in that case.
+int32_t gdv_fn_dec_from_string(int64_t context, const char* in, int32_t in_length,
+                               int32_t* precision_from_str, int32_t* scale_from_str,
+                               int64_t* dec_high_from_str, uint64_t* dec_low_from_str) {
+  // A negative length would be converted to a huge size_t by the std::string
+  // constructor below and make it throw; report it as an error instead.
+  if (in_length < 0) {
+    gdv_fn_context_set_error_msg(context, "Buffer length can not be negative");
+    return -1;
+  }
+
+  arrow::Decimal128 dec;
+  auto status = arrow::Decimal128::FromString(std::string(in, in_length), &dec,
+                                              precision_from_str, scale_from_str);
+  if (!status.ok()) {
+    gdv_fn_context_set_error_msg(context, status.message().data());
+    return -1;
+  }
+  *dec_high_from_str = dec.high_bits();
+  *dec_low_from_str = dec.low_bits();
+  return 0;
+}
+
+// Formats the decimal given by (x_high, x_low) with scale `x_scale` into a buffer
+// obtained from gdv_fn_context_arena_malloc.
+//
+// `*dec_str_len` is always set to the length of the formatted text. Returns the
+// buffer (not NUL-terminated), or nullptr after recording an error on the context
+// when the allocation fails.
+char* gdv_fn_dec_to_string(int64_t context, int64_t x_high, uint64_t x_low,
+                           int32_t x_scale, int32_t* dec_str_len) {
+  const arrow::Decimal128 decimal(arrow::BasicDecimal128(x_high, x_low));
+  const std::string text = decimal.ToString(x_scale);
+  const auto text_len = static_cast<int32_t>(text.length());
+  *dec_str_len = text_len;
+
+  auto* buffer = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, text_len));
+  if (buffer != nullptr) {
+    memcpy(buffer, text.data(), text_len);
+    return buffer;
+  }
+
+  const std::string err_msg = "Could not allocate memory for string: " + text;
+  gdv_fn_context_set_error_msg(context, err_msg.data());
+  return nullptr;
+}
+
+// Base64-encodes `in[0, in_len)` using arrow::util::base64_encode.
+//
+// Returns a buffer from gdv_fn_context_arena_malloc and stores its length in
+// `*out_len`. Empty input yields "" with length 0. A negative length or a failed
+// allocation also yields "" with length 0, after recording an error on the context.
+GANDIVA_EXPORT
+const char* gdv_fn_base64_encode_binary(int64_t context, const char* in, int32_t in_len,
+                                        int32_t* out_len) {
+  if (in_len <= 0) {
+    if (in_len < 0) {
+      gdv_fn_context_set_error_msg(context, "Buffer length can not be negative");
+    }
+    *out_len = 0;
+    return "";
+  }
+
+  // Delegate the actual encoding to Arrow.
+  const std::string encoded = arrow::util::base64_encode(arrow::util::string_view(in, in_len));
+  *out_len = static_cast<int32_t>(encoded.length());
+
+  // Copy the result into arena-owned memory for the caller.
+  auto* buffer = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
+  if (buffer == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory");
+    *out_len = 0;
+    return "";
+  }
+  memcpy(buffer, encoded.data(), *out_len);
+  return buffer;
+}
+
+// Base64-decodes `in[0, in_len)` using arrow::util::base64_decode.
+//
+// Returns a buffer from gdv_fn_context_arena_malloc and stores its length in
+// `*out_len`. Empty input yields "" with length 0. A negative length or a failed
+// allocation also yields "" with length 0, after recording an error on the context.
+GANDIVA_EXPORT
+const char* gdv_fn_base64_decode_utf8(int64_t context, const char* in, int32_t in_len,
+                                      int32_t* out_len) {
+  if (in_len <= 0) {
+    if (in_len < 0) {
+      gdv_fn_context_set_error_msg(context, "Buffer length can not be negative");
+    }
+    *out_len = 0;
+    return "";
+  }
+
+  // Delegate the actual decoding to Arrow.
+  const std::string decoded = arrow::util::base64_decode(arrow::util::string_view(in, in_len));
+  *out_len = static_cast<int32_t>(decoded.length());
+
+  // Copy the result into arena-owned memory for the caller.
+  auto* buffer = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
+  if (buffer == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory");
+    *out_len = 0;
+    return "";
+  }
+  memcpy(buffer, decoded.data(), *out_len);
+  return buffer;
+}
+
+// Defines gdv_fn_cast<TYPE_NAME>_<INNER_TYPE>(context, data, len): strips leading and
+// trailing ' ' characters from data[0, len) and parses the remainder with
+// arrow::internal::ParseValue<ARROW_TYPE>. On a parse failure an error is recorded on
+// the context; the function still returns the current value of the result variable,
+// which is initialised to 0.
+#define CAST_NUMERIC_FROM_VARLEN_TYPES(OUT_TYPE, ARROW_TYPE, TYPE_NAME, INNER_TYPE) \
+  GANDIVA_EXPORT \
+  OUT_TYPE gdv_fn_cast##TYPE_NAME##_##INNER_TYPE(int64_t context, const char* data, \
+                                                 int32_t len) { \
+    /* [first, last] is the inclusive range left after trimming spaces */ \
+    int32_t first = 0; \
+    int32_t last = len - 1; \
+    for (; first <= last && data[first] == ' '; ++first) { \
+    } \
+    for (; last >= first && data[last] == ' '; --last) { \
+    } \
+    OUT_TYPE parsed = 0; \
+    const bool parse_ok = arrow::internal::ParseValue<ARROW_TYPE>( \
+        data + first, last - first + 1, &parsed); \
+    if (!parse_ok) { \
+      std::string err = \
+          "Failed to cast the string " + std::string(data, len) + " to " #OUT_TYPE; \
+      gdv_fn_context_set_error_msg(context, err.c_str()); \
+    } \
+    return parsed; \
+  }
+
+// Instantiate the string-to-number casts for utf8 input
+// (gdv_fn_castINT_utf8, gdv_fn_castBIGINT_utf8, gdv_fn_castFLOAT4_utf8,
+// gdv_fn_castFLOAT8_utf8).
+#define CAST_NUMERIC_FROM_STRING(OUT_TYPE, ARROW_TYPE, TYPE_NAME) \
+  CAST_NUMERIC_FROM_VARLEN_TYPES(OUT_TYPE, ARROW_TYPE, TYPE_NAME, utf8)
+
+CAST_NUMERIC_FROM_STRING(int32_t, arrow::Int32Type, INT)
+CAST_NUMERIC_FROM_STRING(int64_t, arrow::Int64Type, BIGINT)
+CAST_NUMERIC_FROM_STRING(float, arrow::FloatType, FLOAT4)
+CAST_NUMERIC_FROM_STRING(double, arrow::DoubleType, FLOAT8)
+
+#undef CAST_NUMERIC_FROM_STRING
+
+// Instantiate the same casts for varbinary input
+// (gdv_fn_castINT_varbinary, ..., gdv_fn_castFLOAT8_varbinary).
+#define CAST_NUMERIC_FROM_VARBINARY(OUT_TYPE, ARROW_TYPE, TYPE_NAME) \
+  CAST_NUMERIC_FROM_VARLEN_TYPES(OUT_TYPE, ARROW_TYPE, TYPE_NAME, varbinary)
+
+CAST_NUMERIC_FROM_VARBINARY(int32_t, arrow::Int32Type, INT)
+CAST_NUMERIC_FROM_VARBINARY(int64_t, arrow::Int64Type, BIGINT)
+CAST_NUMERIC_FROM_VARBINARY(float, arrow::FloatType, FLOAT4)
+CAST_NUMERIC_FROM_VARBINARY(double, arrow::DoubleType, FLOAT8)
+
+// Undefine the helper defined just above so it does not leak into the rest of the
+// translation unit.
+#undef CAST_NUMERIC_FROM_VARBINARY
+
+// Defines gdv_fn_cast<CAST_NAME>_<IN_TYPE>_int64(context, value, len, out_len):
+// formats `value` with arrow::internal::StringFormatter<arrow::ARROW_TYPE> and copies
+// at most `len` bytes of the text into a buffer from gdv_fn_context_arena_malloc.
+// `*out_len` receives the number of bytes copied. len == 0 yields "" with length 0;
+// a negative len, a failed allocation or a formatter error also yields "" with
+// length 0 after recording an error on the context.
+// NOTE(review): `len` is narrowed to int32_t for the allocation while the copy is
+// bounded by the int64 `len` - confirm callers never pass len > INT32_MAX.
+#define GDV_FN_CAST_VARLEN_TYPE_FROM_INTEGER(IN_TYPE, CAST_NAME, ARROW_TYPE) \
+  GANDIVA_EXPORT \
+  const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64( \
+      int64_t context, gdv_##IN_TYPE value, int64_t len, int32_t * out_len) { \
+    if (len <= 0) { \
+      if (len < 0) { \
+        gdv_fn_context_set_error_msg(context, "Buffer length can not be negative"); \
+      } \
+      *out_len = 0; \
+      return ""; \
+    } \
+    arrow::internal::StringFormatter<arrow::ARROW_TYPE> formatter; \
+    auto* buffer = reinterpret_cast<char*>( \
+        gdv_fn_context_arena_malloc(context, static_cast<int32_t>(len))); \
+    if (buffer == nullptr) { \
+      gdv_fn_context_set_error_msg(context, "Could not allocate memory"); \
+      *out_len = 0; \
+      return ""; \
+    } \
+    /* the callback truncates the formatted text to `len` bytes */ \
+    arrow::Status status = formatter(value, [&](arrow::util::string_view formatted) { \
+      const int64_t formatted_len = static_cast<int64_t>(formatted.size()); \
+      *out_len = static_cast<int32_t>(formatted_len > len ? len : formatted_len); \
+      memcpy(buffer, formatted.data(), *out_len); \
+      return arrow::Status::OK(); \
+    }); \
+    if (status.ok()) { \
+      return buffer; \
+    } \
+    std::string err = "Could not cast " + std::to_string(value) + " to string"; \
+    gdv_fn_context_set_error_msg(context, err.c_str()); \
+    *out_len = 0; \
+    return ""; \
+  }
+
+// Defines gdv_fn_cast<CAST_NAME>_<IN_TYPE>_int64(context, value, len, out_len):
+// formats `value` with gandiva::GdvStringFormatter<arrow::ARROW_TYPE> and copies at
+// most `len` bytes of the text into a buffer from gdv_fn_context_arena_malloc.
+// `*out_len` receives the number of bytes copied. len == 0 yields "" with length 0;
+// a negative len, a failed allocation or a formatter error also yields "" with
+// length 0 after recording an error on the context.
+// NOTE(review): `len` is narrowed to int32_t for the allocation while the copy is
+// bounded by the int64 `len` - confirm callers never pass len > INT32_MAX.
+#define GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(IN_TYPE, CAST_NAME, ARROW_TYPE) \
+  GANDIVA_EXPORT \
+  const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64( \
+      int64_t context, gdv_##IN_TYPE value, int64_t len, int32_t * out_len) { \
+    if (len <= 0) { \
+      if (len < 0) { \
+        gdv_fn_context_set_error_msg(context, "Buffer length can not be negative"); \
+      } \
+      *out_len = 0; \
+      return ""; \
+    } \
+    gandiva::GdvStringFormatter<arrow::ARROW_TYPE> formatter; \
+    auto* buffer = reinterpret_cast<char*>( \
+        gdv_fn_context_arena_malloc(context, static_cast<int32_t>(len))); \
+    if (buffer == nullptr) { \
+      gdv_fn_context_set_error_msg(context, "Could not allocate memory"); \
+      *out_len = 0; \
+      return ""; \
+    } \
+    /* the callback truncates the formatted text to `len` bytes */ \
+    arrow::Status status = formatter(value, [&](arrow::util::string_view formatted) { \
+      const int64_t formatted_len = static_cast<int64_t>(formatted.size()); \
+      *out_len = static_cast<int32_t>(formatted_len > len ? len : formatted_len); \
+      memcpy(buffer, formatted.data(), *out_len); \
+      return arrow::Status::OK(); \
+    }); \
+    if (status.ok()) { \
+      return buffer; \
+    } \
+    std::string err = "Could not cast " + std::to_string(value) + " to string"; \
+    gdv_fn_context_set_error_msg(context, err.c_str()); \
+    *out_len = 0; \
+    return ""; \
+  }
+
+// For one variable-length target name (VARLEN_TYPE), emit the numeric-to-text cast
+// stubs for int32, int64, float32 and float64 inputs.
+#define CAST_VARLEN_TYPE_FROM_NUMERIC(VARLEN_TYPE) \
+ GDV_FN_CAST_VARLEN_TYPE_FROM_INTEGER(int32, VARLEN_TYPE, Int32Type) \
+ GDV_FN_CAST_VARLEN_TYPE_FROM_INTEGER(int64, VARLEN_TYPE, Int64Type) \
+ GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(float32, VARLEN_TYPE, FloatType) \
+ GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(float64, VARLEN_TYPE, DoubleType)
+
+// Emits gdv_fn_castVARCHAR_<type>_int64 and gdv_fn_castVARBINARY_<type>_int64.
+CAST_VARLEN_TYPE_FROM_NUMERIC(VARCHAR)
+CAST_VARLEN_TYPE_FROM_NUMERIC(VARBINARY)
+
+// Drop the generator macros now that all stubs are defined.
+#undef CAST_VARLEN_TYPE_FROM_NUMERIC
+#undef GDV_FN_CAST_VARLEN_TYPE_FROM_INTEGER
+#undef GDV_FN_CAST_VARLEN_TYPE_FROM_REAL
+// NOTE(review): no macros with the two names below are defined in the visible part
+// of this file, so these look like leftovers from an earlier naming - confirm.
+#undef GDV_FN_CAST_VARCHAR_INTEGER
+#undef GDV_FN_CAST_VARCHAR_REAL
+
+// Returns the byte length (1 to 4) of the UTF-8 sequence introduced by lead byte `c`,
+// or 0 when `c` matches none of the lead-byte patterns checked below.
+GDV_FORCE_INLINE
+int32_t gdv_fn_utf8_char_length(char c) {
+  const auto lead = static_cast<unsigned char>(c);
+
+  if (lead < 0x80) {  // 0xxxxxxx: 1-byte char (0x00 ~ 0x7F)
+    return 1;
+  }
+  if ((lead & 0xE0) == 0xC0) {  // 110xxxxx: 2-byte char
+    return 2;
+  }
+  if ((lead & 0xF0) == 0xE0) {  // 1110xxxx: 3-byte char
+    return 3;
+  }
+  if ((lead & 0xF8) == 0xF0) {  // 11110xxx: 4-byte char
+    return 4;
+  }
+  // anything else is not a valid lead byte
+  return 0;
+}
+
+GDV_FORCE_INLINE
+void gdv_fn_set_error_for_invalid_utf8(int64_t execution_context, char val) {
+ char const* fmt = "unexpected byte \\%02hhx encountered while decoding utf8 string";
+ int size = static_cast<int>(strlen(fmt)) + 64;
+ char* error = reinterpret_cast<char*>(malloc(size));
+ snprintf(error, size, fmt, (unsigned char)val);
+ gdv_fn_context_set_error_msg(execution_context, error);
+ free(error);
+}
+
+// Convert a UTF-8 string to its corresponding uppercase string.
+//
+// Returns a buffer from gdv_fn_context_arena_malloc and stores its length in
+// `*out_len`. Empty input yields "" with length 0. On allocation failure or invalid
+// UTF-8 (including a multibyte sequence cut off by the end of the input) an error is
+// recorded on the context and "" with length 0 is returned.
+GANDIVA_EXPORT
+const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_len,
+                              int32_t* out_len) {
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // If it is a single-byte character (ASCII), corresponding uppercase is always 1-byte
+  // long; if it is >= 2 bytes long, uppercase can be at most 4 bytes long, so length of
+  // the output can be at most twice the length of the input
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t char_len, out_char_len, out_idx = 0;
+  uint32_t char_codepoint;
+
+  for (int32_t i = 0; i < data_len; i += char_len) {
+    char_len = gdv_fn_utf8_char_length(data[i]);
+    // For single byte characters:
+    // If it is a lowercase ASCII character, set the output to its corresponding uppercase
+    // character; else, set the output to the read character
+    if (char_len == 1) {
+      char cur = data[i];
+      // 'A' - 'Z' : 0x41 - 0x5a
+      // 'a' - 'z' : 0x61 - 0x7a
+      if (cur >= 0x61 && cur <= 0x7a) {
+        out[out_idx++] = static_cast<char>(cur - 0x20);
+      } else {
+        out[out_idx++] = cur;
+      }
+      continue;
+    }
+
+    // Reject an invalid lead byte (char_len == 0) or a sequence that would extend past
+    // the end of the input before decoding: UTF8Decode below is not given the buffer
+    // length, so it cannot stop at the end of `data` on its own.
+    if (char_len == 0 || char_len > data_len - i) {
+      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+      *out_len = 0;
+      return "";
+    }
+
+    // Control reaches here when we encounter a complete multibyte character
+    const auto* in_char = (const uint8_t*)(data + i);
+
+    // Decode the multibyte character
+    bool is_valid_utf8_char =
+        arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint);
+
+    // If it is an invalid utf8 character, UTF8Decode evaluates to false
+    if (!is_valid_utf8_char) {
+      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+      *out_len = 0;
+      return "";
+    }
+
+    // Convert the encoded codepoint to its uppercase codepoint
+    int32_t upper_codepoint = utf8proc_toupper(char_codepoint);
+
+    // UTF8Encode advances the pointer by the number of bytes present in the uppercase
+    // character
+    auto* out_char = (uint8_t*)(out + out_idx);
+    uint8_t* out_char_start = out_char;
+
+    // Encode the uppercase character
+    out_char = arrow::util::UTF8Encode(out_char, upper_codepoint);
+
+    out_char_len = static_cast<int32_t>(out_char - out_char_start);
+    out_idx += out_char_len;
+  }
+
+  *out_len = out_idx;
+  return out;
+}
+
+// Convert a UTF-8 string to its corresponding lowercase string.
+//
+// Returns a buffer from gdv_fn_context_arena_malloc and stores its length in
+// `*out_len`. Empty input yields "" with length 0. On allocation failure or invalid
+// UTF-8 (including a multibyte sequence cut off by the end of the input) an error is
+// recorded on the context and "" with length 0 is returned.
+GANDIVA_EXPORT
+const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_len,
+                              int32_t* out_len) {
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // If it is a single-byte character (ASCII), corresponding lowercase is always 1-byte
+  // long; if it is >= 2 bytes long, lowercase can be at most 4 bytes long, so length of
+  // the output can be at most twice the length of the input
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t char_len, out_char_len, out_idx = 0;
+  uint32_t char_codepoint;
+
+  for (int32_t i = 0; i < data_len; i += char_len) {
+    char_len = gdv_fn_utf8_char_length(data[i]);
+    // For single byte characters:
+    // If it is an uppercase ASCII character, set the output to its corresponding
+    // lowercase character; else, set the output to the read character
+    if (char_len == 1) {
+      char cur = data[i];
+      // 'A' - 'Z' : 0x41 - 0x5a
+      // 'a' - 'z' : 0x61 - 0x7a
+      if (cur >= 0x41 && cur <= 0x5a) {
+        out[out_idx++] = static_cast<char>(cur + 0x20);
+      } else {
+        out[out_idx++] = cur;
+      }
+      continue;
+    }
+
+    // Reject an invalid lead byte (char_len == 0) or a sequence that would extend past
+    // the end of the input before decoding: UTF8Decode below is not given the buffer
+    // length, so it cannot stop at the end of `data` on its own.
+    if (char_len == 0 || char_len > data_len - i) {
+      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+      *out_len = 0;
+      return "";
+    }
+
+    // Control reaches here when we encounter a complete multibyte character
+    const auto* in_char = (const uint8_t*)(data + i);
+
+    // Decode the multibyte character
+    bool is_valid_utf8_char =
+        arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint);
+
+    // If it is an invalid utf8 character, UTF8Decode evaluates to false
+    if (!is_valid_utf8_char) {
+      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+      *out_len = 0;
+      return "";
+    }
+
+    // Convert the encoded codepoint to its lowercase codepoint
+    int32_t lower_codepoint = utf8proc_tolower(char_codepoint);
+
+    // UTF8Encode advances the pointer by the number of bytes present in the lowercase
+    // character
+    auto* out_char = (uint8_t*)(out + out_idx);
+    uint8_t* out_char_start = out_char;
+
+    // Encode the lowercase character
+    out_char = arrow::util::UTF8Encode(out_char, lower_codepoint);
+
+    out_char_len = static_cast<int32_t>(out_char - out_char_start);
+    out_idx += out_char_len;
+  }
+
+  *out_len = out_idx;
+  return out;
+}
+
+// Decides whether a codepoint acts as a word separator.
+//
+// A codepoint is a separator unless its Unicode general category is one of:
+// uppercase letter (Lu), lowercase letter (Ll), titlecase letter (Lt),
+// letter number (Nl) or decimal number (Nd).
+// See https://www.compart.com/en/unicode/category for the category list.
+GDV_FORCE_INLINE
+bool gdv_fn_is_codepoint_for_space(uint32_t val) {
+  switch (utf8proc_category(val)) {
+    case utf8proc_category_t::UTF8PROC_CATEGORY_LU:
+    case utf8proc_category_t::UTF8PROC_CATEGORY_LL:
+    case utf8proc_category_t::UTF8PROC_CATEGORY_LT:
+    case utf8proc_category_t::UTF8PROC_CATEGORY_NL:
+    case utf8proc_category_t::UTF8PROC_CATEGORY_ND:
+      // part of a word
+      return false;
+    default:
+      return true;
+  }
+}
+
+// For a given text, capitalize the first letter after a word-separator and lowercase
+// the others e.g:
+// - "IT is a tEXt str" -> "It Is A Text Str"
+//
+// Returns a buffer from gdv_fn_context_arena_malloc and stores its length in
+// `*out_len`. Empty input yields "" with length 0. On allocation failure or invalid
+// UTF-8 (including a multibyte sequence cut off by the end of the input) an error is
+// recorded on the context and "" with length 0 is returned.
+GANDIVA_EXPORT
+const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
+                                int32_t* out_len) {
+  if (data_len == 0) {
+    *out_len = data_len;
+    return "";
+  }
+
+  // If it is a single-byte character (ASCII), corresponding uppercase is always 1-byte
+  // long; if it is >= 2 bytes long, uppercase can be at most 4 bytes long, so length of
+  // the output can be at most twice the length of the input
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t char_len = 0;
+  int32_t out_char_len = 0;
+  int32_t out_idx = 0;
+  uint32_t char_codepoint;
+
+  // Any character is considered as space, except if it is alphanumeric
+  bool last_char_was_space = true;
+
+  for (int32_t i = 0; i < data_len; i += char_len) {
+    // An optimization for single byte characters:
+    if (static_cast<signed char>(data[i]) >= 0) {  // 1-byte char (0x00 ~ 0x7F)
+      char_len = 1;
+      char cur = data[i];
+
+      if (cur >= 0x61 && cur <= 0x7a && last_char_was_space) {
+        // Check if the character is the first one of the word and it is
+        // lowercase -> 'a' - 'z' : 0x61 - 0x7a.
+        // Then turn it into uppercase -> 'A' - 'Z' : 0x41 - 0x5a
+        out[out_idx++] = static_cast<char>(cur - 0x20);
+        last_char_was_space = false;
+      } else if (cur >= 0x41 && cur <= 0x5a && !last_char_was_space) {
+        // An uppercase letter inside a word is lowered
+        out[out_idx++] = static_cast<char>(cur + 0x20);
+      } else {
+        // Check if the ASCII character is not an alphanumeric character:
+        // '0' - '9': 0x30 - 0x39
+        // 'a' - 'z' : 0x61 - 0x7a
+        // 'A' - 'Z' : 0x41 - 0x5a
+        last_char_was_space = (cur < 0x30) || (cur > 0x39 && cur < 0x41) ||
+                              (cur > 0x5a && cur < 0x61) || (cur > 0x7a);
+        out[out_idx++] = cur;
+      }
+      continue;
+    }
+
+    char_len = gdv_fn_utf8_char_length(data[i]);
+
+    // Reject an invalid lead byte (char_len == 0) or a sequence that would extend past
+    // the end of the input before decoding: UTF8Decode below is not given the buffer
+    // length, so it cannot stop at the end of `data` on its own.
+    if (char_len == 0 || char_len > data_len - i) {
+      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+      *out_len = 0;
+      return "";
+    }
+
+    // Control reaches here when we encounter a complete multibyte character
+    const auto* in_char = (const uint8_t*)(data + i);
+
+    // Decode the multibyte character
+    bool is_valid_utf8_char =
+        arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint);
+
+    // If it is an invalid utf8 character, UTF8Decode evaluates to false
+    if (!is_valid_utf8_char) {
+      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+      *out_len = 0;
+      return "";
+    }
+
+    bool is_char_space = gdv_fn_is_codepoint_for_space(char_codepoint);
+
+    // First character of a word is uppercased; everything else is lowercased
+    int32_t formatted_codepoint;
+    if (last_char_was_space && !is_char_space) {
+      formatted_codepoint = utf8proc_toupper(char_codepoint);
+    } else {
+      formatted_codepoint = utf8proc_tolower(char_codepoint);
+    }
+
+    // UTF8Encode advances the pointer by the number of bytes present in the character
+    auto* out_char = (uint8_t*)(out + out_idx);
+    uint8_t* out_char_start = out_char;
+
+    // Encode the character
+    out_char = arrow::util::UTF8Encode(out_char, formatted_codepoint);
+
+    out_char_len = static_cast<int32_t>(out_char - out_char_start);
+    out_idx += out_char_len;
+
+    last_char_was_space = is_char_space;
+  }
+
+  *out_len = out_idx;
+  return out;
+}
+}
+
+namespace gandiva {
+
+void ExportedStubFunctions::AddMappings(Engine* engine) const {
+ std::vector<llvm::Type*> args;
+ auto types = engine->types();
+
+ // gdv_fn_castVARBINARY_int32
+ args = {
+ types->i64_type(), // context
+ types->i32_type(), // int32_t value
+ types->i64_type(), // int64_t out value length
+ types->i32_ptr_type() // int32_t out_length
+ };
+
+ engine->AddGlobalMappingForFunc(
+ "gdv_fn_castVARBINARY_int32_int64", types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_castVARBINARY_int32_int64));
+
+ // gdv_fn_castVARBINARY_int64
+ args = {
+ types->i64_type(), // context
+ types->i64_type(), // int64_t value
+ types->i64_type(), // int64_t out value length
+ types->i32_ptr_type() // int32_t out_length
+ };
+
+ engine->AddGlobalMappingForFunc(
+ "gdv_fn_castVARBINARY_int64_int64", types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_castVARBINARY_int64_int64));
+
+ // gdv_fn_castVARBINARY_float32
+ args = {
+ types->i64_type(), // context
+ types->float_type(), // float value
+ types->i64_type(), // int64_t out value length
+ types->i64_ptr_type() // int32_t out_length
+ };
+
+ engine->AddGlobalMappingForFunc(
+ "gdv_fn_castVARBINARY_float32_int64", types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_castVARBINARY_float32_int64));
+
+ // gdv_fn_castVARBINARY_float64
+ args = {
+ types->i64_type(), // context
+ types->i64_type(), // double value
+ types->i64_type(), // int64_t out value length
+ types->i32_ptr_type() // int32_t out_length
+ };
+
+ engine->AddGlobalMappingForFunc(
+ "gdv_fn_castVARBINARY_float64_int64", types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_castVARBINARY_float64_int64));
+
+ // gdv_fn_dec_from_string
+ args = {
+ types->i64_type(), // context
+ types->i8_ptr_type(), // const char* in
+ types->i32_type(), // int32_t in_length
+ types->i32_ptr_type(), // int32_t* precision_from_str
+ types->i32_ptr_type(), // int32_t* scale_from_str
+ types->i64_ptr_type(), // int64_t* dec_high_from_str
+ types->i64_ptr_type(), // int64_t* dec_low_from_str
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_dec_from_string",
+ types->i32_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_dec_from_string));
+
+ // gdv_fn_dec_to_string
+ args = {
+ types->i64_type(), // context
+ types->i64_type(), // int64_t x_high
+ types->i64_type(), // int64_t x_low
+ types->i32_type(), // int32_t x_scale
+ types->i64_ptr_type(), // int64_t* dec_str_len
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_dec_to_string",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_dec_to_string));
+
+ // gdv_fn_like_utf8_utf8
+ args = {types->i64_type(), // int64_t ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type(), // int data_len
+ types->i8_ptr_type(), // const char* pattern
+ types->i32_type()}; // int pattern_len
+
+ engine->AddGlobalMappingForFunc("gdv_fn_like_utf8_utf8",
+ types->i1_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_like_utf8_utf8));
+
+ // gdv_fn_like_utf8_utf8_utf8
+ args = {types->i64_type(), // int64_t ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type(), // int data_len
+ types->i8_ptr_type(), // const char* pattern
+ types->i32_type(), // int pattern_len
+ types->i8_ptr_type(), // const char* escape_char
+ types->i32_type()}; // int escape_char_len
+
+ engine->AddGlobalMappingForFunc("gdv_fn_like_utf8_utf8_utf8",
+ types->i1_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_like_utf8_utf8_utf8));
+
+ // gdv_fn_ilike_utf8_utf8
+ args = {types->i64_type(), // int64_t ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type(), // int data_len
+ types->i8_ptr_type(), // const char* pattern
+ types->i32_type()}; // int pattern_len
+
+ engine->AddGlobalMappingForFunc("gdv_fn_ilike_utf8_utf8",
+ types->i1_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_ilike_utf8_utf8));
+
+ // gdv_fn_regexp_replace_utf8_utf8
+ args = {types->i64_type(), // int64_t ptr
+ types->i64_type(), // int64_t holder_ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type(), // int data_len
+ types->i8_ptr_type(), // const char* pattern
+ types->i32_type(), // int pattern_len
+ types->i8_ptr_type(), // const char* replace_string
+ types->i32_type(), // int32_t replace_string_len
+ types->i32_ptr_type()}; // int32_t* out_length
+
+ engine->AddGlobalMappingForFunc(
+ "gdv_fn_regexp_replace_utf8_utf8", types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_regexp_replace_utf8_utf8));
+
+ // gdv_fn_to_date_utf8_utf8
+ args = {types->i64_type(), // int64_t execution_context
+ types->i64_type(), // int64_t holder_ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type(), // int data_len
+ types->i1_type(), // bool in1_validity
+ types->i8_ptr_type(), // const char* pattern
+ types->i32_type(), // int pattern_len
+ types->i1_type(), // bool in2_validity
+ types->ptr_type(types->i8_type())}; // bool* out_valid
+
+ engine->AddGlobalMappingForFunc("gdv_fn_to_date_utf8_utf8",
+ types->i64_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_to_date_utf8_utf8));
+
+ // gdv_fn_to_date_utf8_utf8_int32
+ args = {types->i64_type(), // int64_t execution_context
+ types->i64_type(), // int64_t holder_ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type(), // int data_len
+ types->i1_type(), // bool in1_validity
+ types->i8_ptr_type(), // const char* pattern
+ types->i32_type(), // int pattern_len
+ types->i1_type(), // bool in2_validity
+ types->i32_type(), // int32_t suppress_errors
+ types->i1_type(), // bool in3_validity
+ types->ptr_type(types->i8_type())}; // bool* out_valid
+
+ engine->AddGlobalMappingForFunc(
+ "gdv_fn_to_date_utf8_utf8_int32", types->i64_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_to_date_utf8_utf8_int32));
+
+ // gdv_fn_in_expr_lookup_int32
+ args = {types->i64_type(), // int64_t in holder ptr
+ types->i32_type(), // int32 value
+ types->i1_type()}; // bool in_validity
+
+ engine->AddGlobalMappingForFunc("gdv_fn_in_expr_lookup_int32",
+ types->i1_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_in_expr_lookup_int32));
+
+ // gdv_fn_in_expr_lookup_int64
+ args = {types->i64_type(), // int64_t in holder ptr
+ types->i64_type(), // int64 value
+ types->i1_type()}; // bool in_validity
+
+ engine->AddGlobalMappingForFunc("gdv_fn_in_expr_lookup_int64",
+ types->i1_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_in_expr_lookup_int64));
+
+ // gdv_fn_in_expr_lookup_decimal
+ args = {types->i64_type(), // int64_t in holder ptr
+ types->i64_type(), // high decimal value
+ types->i64_type(), // low decimal value
+ types->i32_type(), // decimal precision value
+ types->i32_type(), // decimal scale value
+ types->i1_type()}; // bool in_validity
+
+ engine->AddGlobalMappingForFunc("gdv_fn_in_expr_lookup_decimal",
+ types->i1_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_in_expr_lookup_decimal));
+
+ // gdv_fn_in_expr_lookup_utf8
+ args = {types->i64_type(), // int64_t in holder ptr
+ types->i8_ptr_type(), // const char* value
+ types->i32_type(), // int value_len
+ types->i1_type()}; // bool in_validity
+
+ engine->AddGlobalMappingForFunc("gdv_fn_in_expr_lookup_utf8",
+ types->i1_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_in_expr_lookup_utf8));
+ // gdv_fn_in_expr_lookup_float
+ args = {types->i64_type(), // int64_t in holder ptr
+ types->float_type(), // float value
+ types->i1_type()}; // bool in_validity
+
+ engine->AddGlobalMappingForFunc("gdv_fn_in_expr_lookup_float",
+ types->i1_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_in_expr_lookup_float));
+ // gdv_fn_in_expr_lookup_double
+ args = {types->i64_type(), // int64_t in holder ptr
+ types->double_type(), // double value
+ types->i1_type()}; // bool in_validity
+
+ engine->AddGlobalMappingForFunc("gdv_fn_in_expr_lookup_double",
+ types->i1_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_in_expr_lookup_double));
+ // gdv_fn_populate_varlen_vector
+ args = {types->i64_type(), // int64_t execution_context
+ types->i8_ptr_type(), // int8_t* data ptr
+ types->i32_ptr_type(), // int32_t* offsets ptr
+ types->i64_type(), // int64_t slot
+ types->i8_ptr_type(), // const char* entry_buf
+ types->i32_type()}; // int32_t entry__len
+
+ engine->AddGlobalMappingForFunc("gdv_fn_populate_varlen_vector",
+ types->i32_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_populate_varlen_vector));
+
+ // gdv_fn_random
+ args = {types->i64_type()};
+ engine->AddGlobalMappingForFunc("gdv_fn_random", types->double_type(), args,
+ reinterpret_cast<void*>(gdv_fn_random));
+
+ args = {types->i64_type(), types->i32_type(), types->i1_type()};
+ engine->AddGlobalMappingForFunc("gdv_fn_random_with_seed", types->double_type(), args,
+ reinterpret_cast<void*>(gdv_fn_random_with_seed));
+
+ args = {types->i64_type(), // int64_t context_ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type()}; // int32_t lenr
+
+ engine->AddGlobalMappingForFunc("gdv_fn_castINT_utf8", types->i32_type(), args,
+ reinterpret_cast<void*>(gdv_fn_castINT_utf8));
+
+ args = {types->i64_type(), // int64_t context_ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type()}; // int32_t lenr
+
+ engine->AddGlobalMappingForFunc("gdv_fn_castBIGINT_utf8", types->i64_type(), args,
+ reinterpret_cast<void*>(gdv_fn_castBIGINT_utf8));
+
+ args = {types->i64_type(), // int64_t context_ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type()}; // int32_t lenr
+
+ engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT4_utf8", types->float_type(), args,
+ reinterpret_cast<void*>(gdv_fn_castFLOAT4_utf8));
+
+ args = {types->i64_type(), // int64_t context_ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type()}; // int32_t lenr
+
+ engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT8_utf8", types->double_type(), args,
+ reinterpret_cast<void*>(gdv_fn_castFLOAT8_utf8));
+
+ // gdv_fn_castVARCHAR_int32_int64
+ args = {types->i64_type(), // int64_t execution_context
+ types->i32_type(), // int32_t value
+ types->i64_type(), // int64_t len
+ types->i32_ptr_type()}; // int32_t* out_len
+ engine->AddGlobalMappingForFunc(
+ "gdv_fn_castVARCHAR_int32_int64", types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_castVARCHAR_int32_int64));
+
+ // gdv_fn_castVARCHAR_int64_int64
+ args = {types->i64_type(), // int64_t execution_context
+ types->i64_type(), // int64_t value
+ types->i64_type(), // int64_t len
+ types->i32_ptr_type()}; // int32_t* out_len
+ engine->AddGlobalMappingForFunc(
+ "gdv_fn_castVARCHAR_int64_int64", types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_castVARCHAR_int64_int64));
+
+ // gdv_fn_castVARCHAR_float32_int64
+ args = {types->i64_type(), // int64_t execution_context
+ types->float_type(), // float value
+ types->i64_type(), // int64_t len
+ types->i32_ptr_type()}; // int32_t* out_len
+ engine->AddGlobalMappingForFunc(
+ "gdv_fn_castVARCHAR_float32_int64", types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_castVARCHAR_float32_int64));
+
+ // gdv_fn_castVARCHAR_float64_int64
+ args = {types->i64_type(), // int64_t execution_context
+ types->double_type(), // double value
+ types->i64_type(), // int64_t len
+ types->i32_ptr_type()}; // int32_t* out_len
+ engine->AddGlobalMappingForFunc(
+ "gdv_fn_castVARCHAR_float64_int64", types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_castVARCHAR_float64_int64));
+
+ args = {types->i64_type(), // int64_t context_ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type()}; // int32_t lenr
+
+ engine->AddGlobalMappingForFunc("gdv_fn_castINT_varbinary", types->i32_type(), args,
+ reinterpret_cast<void*>(gdv_fn_castINT_varbinary));
+
+ args = {types->i64_type(), // int64_t context_ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type()}; // int32_t lenr
+
+ engine->AddGlobalMappingForFunc("gdv_fn_castBIGINT_varbinary", types->i64_type(), args,
+ reinterpret_cast<void*>(gdv_fn_castBIGINT_varbinary));
+
+ args = {types->i64_type(), // int64_t context_ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type()}; // int32_t lenr
+
+ engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT4_varbinary", types->float_type(),
+ args,
+ reinterpret_cast<void*>(gdv_fn_castFLOAT4_varbinary));
+
+ args = {types->i64_type(), // int64_t context_ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type()}; // int32_t lenr
+
+ engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT8_varbinary", types->double_type(),
+ args,
+ reinterpret_cast<void*>(gdv_fn_castFLOAT8_varbinary));
+
+ // gdv_fn_sha1_int8
+ args = {
+ types->i64_type(), // context
+ types->i8_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_int8",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_int8));
+
+ // gdv_fn_sha1_int16
+ args = {
+ types->i64_type(), // context
+ types->i16_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_int16",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_int16));
+
+ // gdv_fn_sha1_int32
+ args = {
+ types->i64_type(), // context
+ types->i32_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_int32",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_int32));
+
+ // gdv_fn_sha1_int64
+ args = {
+ types->i64_type(), // context
+ types->i64_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_int64",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_int64));
+
+ // gdv_fn_sha1_uint8
+ args = {
+ types->i64_type(), // context
+ types->i8_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_uint8",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_uint8));
+
+ // gdv_fn_sha1_uint16
+ args = {
+ types->i64_type(), // context
+ types->i16_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_uint16",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_uint16));
+
+ // gdv_fn_sha1_uint32
+ args = {
+ types->i64_type(), // context
+ types->i32_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_uint32",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_uint32));
+
+ // gdv_fn_sha1_uint64
+ args = {
+ types->i64_type(), // context
+ types->i64_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_uint64",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_uint64));
+
+ // gdv_fn_sha1_float32
+ args = {
+ types->i64_type(), // context
+ types->float_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_float32",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_float32));
+
+ // gdv_fn_sha1_float64
+ args = {
+ types->i64_type(), // context
+ types->double_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_float64",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_float64));
+
+ // gdv_fn_sha1_boolean
+ args = {
+ types->i64_type(), // context
+ types->i1_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_boolean",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_boolean));
+
+ // gdv_fn_sha1_date64
+ args = {
+ types->i64_type(), // context
+ types->i64_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_date64",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_date64));
+
+ // gdv_fn_sha1_date32
+ args = {
+ types->i64_type(), // context
+ types->i32_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_date32",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_date32));
+
+ // gdv_fn_sha1_time32
+ args = {
+ types->i64_type(), // context
+ types->i32_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_time32",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_time32));
+
+ // gdv_fn_sha1_timestamp
+ args = {
+ types->i64_type(), // context
+ types->i64_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_timestamp",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_timestamp));
+
+ // gdv_fn_sha1_from_utf8
+ args = {
+ types->i64_type(), // context
+ types->i8_ptr_type(), // const char*
+ types->i32_type(), // value_length
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_utf8",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_utf8));
+
+ // gdv_fn_sha1_from_binary
+ args = {
+ types->i64_type(), // context
+ types->i8_ptr_type(), // const char*
+ types->i32_type(), // value_length
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_binary",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_binary));
+
+ // gdv_fn_sha256_int8
+ args = {
+ types->i64_type(), // context
+ types->i8_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_int8",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_int8));
+
+ // gdv_fn_sha256_int16
+ args = {
+ types->i64_type(), // context
+ types->i16_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_int16",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_int16));
+
+ // gdv_fn_sha256_int32
+ args = {
+ types->i64_type(), // context
+ types->i32_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_int32",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_int32));
+
+ // gdv_fn_sha256_int64
+ args = {
+ types->i64_type(), // context
+ types->i64_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_int64",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_int64));
+
+ // gdv_fn_sha256_uint8
+ args = {
+ types->i64_type(), // context
+ types->i8_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_uint8",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_uint8));
+
+ // gdv_fn_sha256_uint16
+ args = {
+ types->i64_type(), // context
+ types->i16_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_uint16",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_uint16));
+
+ // gdv_fn_sha256_uint32
+ args = {
+ types->i64_type(), // context
+ types->i32_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_uint32",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_uint32));
+
+ // gdv_fn_sha256_uint64
+ args = {
+ types->i64_type(), // context
+ types->i64_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_uint64",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_uint64));
+
+ // gdv_fn_sha256_float32
+ args = {
+ types->i64_type(), // context
+ types->float_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_float32",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_float32));
+
+ // gdv_fn_sha256_float64
+ args = {
+ types->i64_type(), // context
+ types->double_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_float64",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_float64));
+
+ // gdv_fn_sha256_boolean
+ args = {
+ types->i64_type(), // context
+ types->i1_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_boolean",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_boolean));
+
+ // gdv_fn_sha256_date64
+ args = {
+ types->i64_type(), // context
+ types->i64_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_date64",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_date64));
+
+ // gdv_fn_sha256_date32
+ args = {
+ types->i64_type(), // context
+ types->i32_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_date32",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_date32));
+
+ // gdv_fn_sha256_time32
+ args = {
+ types->i64_type(), // context
+ types->i32_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_time32",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_time32));
+
+ // gdv_fn_sha256_timestamp
+ args = {
+ types->i64_type(), // context
+ types->i64_type(), // value
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out_length
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_timestamp",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_timestamp));
+
+ // gdv_fn_hash_sha256_from_utf8
+ args = {
+ types->i64_type(), // context
+ types->i8_ptr_type(), // const char*
+ types->i32_type(), // value_length
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_utf8",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_utf8));
+
+ // gdv_fn_hash_sha256_from_binary
+ args = {
+ types->i64_type(), // context
+ types->i8_ptr_type(), // const char*
+ types->i32_type(), // value_length
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_binary",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_binary));
+
+ // gdv_fn_sha1_decimal128
+ args = {
+ types->i64_type(), // context
+ types->i64_type(), // high_bits
+ types->i64_type(), // low_bits
+ types->i32_type(), // precision
+ types->i32_type(), // scale
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out length
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_sha1_decimal128",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha1_decimal128));
+ // gdv_fn_sha256_decimal128
+ args = {
+ types->i64_type(), // context
+ types->i64_type(), // high_bits
+ types->i64_type(), // low_bits
+ types->i32_type(), // precision
+ types->i32_type(), // scale
+ types->i1_type(), // validity
+ types->i32_ptr_type() // out length
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_sha256_decimal128",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_sha256_decimal128));
+
+ // gdv_fn_base64_encode_binary
+ args = {
+ types->i64_type(), // context
+ types->i8_ptr_type(), // in
+ types->i32_type(), // in_len
+ types->i32_ptr_type(), // out_len
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_base64_encode_binary",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_base64_encode_binary));
+
+ // gdv_fn_base64_decode_utf8
+ args = {
+ types->i64_type(), // context
+ types->i8_ptr_type(), // in
+ types->i32_type(), // in_len
+ types->i32_ptr_type(), // out_len
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_base64_decode_utf8",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_base64_decode_utf8));
+
+ // gdv_fn_upper_utf8
+ args = {
+ types->i64_type(), // context
+ types->i8_ptr_type(), // data
+ types->i32_type(), // data_len
+ types->i32_ptr_type(), // out_len
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_upper_utf8",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_upper_utf8));
+ // gdv_fn_lower_utf8
+ args = {
+ types->i64_type(), // context
+ types->i8_ptr_type(), // data
+ types->i32_type(), // data_len
+ types->i32_ptr_type(), // out_len
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_lower_utf8",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_lower_utf8));
+
+ // gdv_fn_initcap_utf8
+ args = {
+ types->i64_type(), // context
+ types->i8_ptr_type(), // const char*
+ types->i32_type(), // value_length
+ types->i32_ptr_type() // out_length
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_initcap_utf8",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_initcap_utf8));
+}
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/gdv_function_stubs.h b/src/arrow/cpp/src/gandiva/gdv_function_stubs.h
new file mode 100644
index 000000000..670ac94df
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/gdv_function_stubs.h
@@ -0,0 +1,173 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "gandiva/visibility.h"
+
+/// Stub functions that can be accessed from LLVM.
+extern "C" {
+
+// Everything in this block has C linkage. The aliases below give a gdv_* name to the
+// C/C++ type used for each value kind in the stub signatures.
+using gdv_boolean = bool;
+using gdv_int8 = int8_t;
+using gdv_int16 = int16_t;
+using gdv_int32 = int32_t;
+using gdv_int64 = int64_t;
+using gdv_uint8 = uint8_t;
+using gdv_uint16 = uint16_t;
+using gdv_uint32 = uint32_t;
+using gdv_uint64 = uint64_t;
+using gdv_float32 = float;
+using gdv_float64 = double;
+using gdv_date64 = int64_t;
+using gdv_date32 = int32_t;
+using gdv_time32 = int32_t;
+using gdv_timestamp = int64_t;
+using gdv_utf8 = char*;
+using gdv_binary = char*;
+using gdv_day_time_interval = int64_t;
+using gdv_month_interval = int32_t;
+
+// GDV_FORCE_INLINE expands to nothing when GANDIVA_UNIT_TEST is defined; otherwise it
+// expands to the compiler-specific force-inline spelling (__forceinline on MSVC,
+// inline + always_inline attribute elsewhere).
+#ifdef GANDIVA_UNIT_TEST
+// unit tests may be compiled without O2, so inlining may not happen.
+#define GDV_FORCE_INLINE
+#else
+#ifdef _MSC_VER
+#define GDV_FORCE_INLINE __forceinline
+#else
+#define GDV_FORCE_INLINE inline __attribute__((always_inline))
+#endif
+#endif
+
+// Pattern-matching entry points; each takes the data and the pattern as
+// (pointer, length) pairs.
+// NOTE(review): `ptr` is presumably the integer-encoded address of a pre-built
+// matcher object -- confirm against the definitions.
+bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len,
+ const char* pattern, int pattern_len);
+
+// Same as above with an additional escape character passed as (pointer, length).
+bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len,
+ const char* pattern, int pattern_len,
+ const char* escape_char, int escape_char_len);
+
+bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len,
+ const char* pattern, int pattern_len);
+
+// Each of the three inputs (data, pattern, suppress_errors) is accompanied by its own
+// validity flag. NOTE(review): *out_valid presumably receives the validity of the
+// returned value -- confirm against the definition.
+int64_t gdv_fn_to_date_utf8_utf8_int32(int64_t context, int64_t ptr, const char* data,
+ int data_len, bool in1_validity,
+ const char* pattern, int pattern_len,
+ bool in2_validity, int32_t suppress_errors,
+ bool in3_validity, bool* out_valid);
+
+// Context helpers. NOTE(review): `context_ptr` looks like the same integer-encoded
+// context address that gdv_function_stubs_test.cc builds with
+// reinterpret_cast<int64_t>(&ExecutionContext) for the `context` parameters below --
+// confirm against the definitions.
+void gdv_fn_context_set_error_msg(int64_t context_ptr, const char* err_msg);
+
+uint8_t* gdv_fn_context_arena_malloc(int64_t context_ptr, int32_t data_len);
+
+void gdv_fn_context_arena_reset(int64_t context_ptr);
+
+// IN-expression lookups for int32, int64 and utf8 values; each takes the value plus
+// its validity flag. NOTE(review): `ptr` presumably addresses the holder of the
+// candidate set -- confirm against the definitions.
+bool in_expr_lookup_int32(int64_t ptr, int32_t value, bool in_validity);
+
+bool in_expr_lookup_int64(int64_t ptr, int64_t value, bool in_validity);
+
+bool in_expr_lookup_utf8(int64_t ptr, const char* data, int data_len, bool in_validity);
+
+// NOTE(review): the layout of `time_fields` and the meaning of the int return value
+// are not visible from this header -- see the definition.
+int gdv_fn_time_with_zone(int* time_fields, const char* zone, int zone_len,
+ int64_t* ret_time);
+
+// Base64 helpers. The result length is written to *out_len. The tests in
+// gdv_function_stubs_test.cc show that a negative in_len yields an empty result and
+// records "Buffer length can not be negative" on the context.
+GANDIVA_EXPORT
+const char* gdv_fn_base64_encode_binary(int64_t context, const char* in, int32_t in_len,
+ int32_t* out_len);
+
+GANDIVA_EXPORT
+const char* gdv_fn_base64_decode_utf8(int64_t context, const char* in, int32_t in_len,
+ int32_t* out_len);
+
+// Integer -> VARBINARY casts. `out_len` is the requested maximum length and
+// *out_length receives the number of bytes produced. The tests show the decimal text
+// being cut to `out_len` bytes (34567 with out_len 3 gives "345") and a negative
+// `out_len` recording "Buffer length can not be negative" on the context.
+GANDIVA_EXPORT
+const char* gdv_fn_castVARBINARY_int32_int64(int64_t context, gdv_int32 value,
+ int64_t out_len, int32_t* out_length);
+
+GANDIVA_EXPORT
+const char* gdv_fn_castVARBINARY_int64_int64(int64_t context, gdv_int64 value,
+ int64_t out_len, int32_t* out_length);
+
+// Hash stubs for decimal128 input, passed as high/low 64-bit halves plus precision,
+// scale and a validity flag. NOTE(review): presumably the returned buffer holds the
+// digest text with its length in *out_length -- confirm against the definitions.
+GANDIVA_EXPORT
+const char* gdv_fn_sha256_decimal128(int64_t context, int64_t x_high, uint64_t x_low,
+ int32_t x_precision, int32_t x_scale,
+ gdv_boolean x_isvalid, int32_t* out_length);
+
+GANDIVA_EXPORT
+const char* gdv_fn_sha1_decimal128(int64_t context, int64_t x_high, uint64_t x_low,
+ int32_t x_precision, int32_t x_scale,
+ gdv_boolean x_isvalid, int32_t* out_length);
+
+// Decimal <-> string conversions. The parsed value is returned through the four
+// *_from_str out-parameters. NOTE(review): the meaning of the int32_t return value is
+// not visible from this header -- see the definition.
+int32_t gdv_fn_dec_from_string(int64_t context, const char* in, int32_t in_length,
+ int32_t* precision_from_str, int32_t* scale_from_str,
+ int64_t* dec_high_from_str, uint64_t* dec_low_from_str);
+
+char* gdv_fn_dec_to_string(int64_t context, int64_t x_high, uint64_t x_low,
+ int32_t x_scale, int32_t* dec_str_len);
+
+// String -> numeric casts. The tests in gdv_function_stubs_test.cc show surrounding
+// spaces being accepted (" 12 " parses as 12) and unparsable or out-of-range input
+// recording "Failed to cast the string <input> to int32|int64|float|double" on the
+// context.
+GANDIVA_EXPORT
+int32_t gdv_fn_castINT_utf8(int64_t context, const char* data, int32_t data_len);
+
+GANDIVA_EXPORT
+int64_t gdv_fn_castBIGINT_utf8(int64_t context, const char* data, int32_t data_len);
+
+GANDIVA_EXPORT
+float gdv_fn_castFLOAT4_utf8(int64_t context, const char* data, int32_t data_len);
+
+GANDIVA_EXPORT
+double gdv_fn_castFLOAT8_utf8(int64_t context, const char* data, int32_t data_len);
+
+// Numeric -> VARCHAR casts. `len` is the requested maximum length and *out_len
+// receives the number of bytes produced. The tests show the text being cut to `len`
+// bytes, a negative `len` recording "Buffer length can not be negative" (int32
+// variant), and small floats rendered in exponent form (0.00001f gives "1.0E-5").
+GANDIVA_EXPORT
+const char* gdv_fn_castVARCHAR_int32_int64(int64_t context, int32_t value, int64_t len,
+ int32_t* out_len);
+GANDIVA_EXPORT
+const char* gdv_fn_castVARCHAR_int64_int64(int64_t context, int64_t value, int64_t len,
+ int32_t* out_len);
+GANDIVA_EXPORT
+const char* gdv_fn_castVARCHAR_float32_int64(int64_t context, float value, int64_t len,
+ int32_t* out_len);
+GANDIVA_EXPORT
+const char* gdv_fn_castVARCHAR_float64_int64(int64_t context, double value, int64_t len,
+ int32_t* out_len);
+
+// NOTE(review): presumably the byte length of the UTF-8 sequence introduced by lead
+// byte `c` -- confirm against the definition.
+GANDIVA_EXPORT
+int32_t gdv_fn_utf8_char_length(char c);
+
+// Case-conversion stubs: input is (data, data_len); the result length is written to
+// *out_len.
+GANDIVA_EXPORT
+const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_len,
+ int32_t* out_len);
+
+GANDIVA_EXPORT
+const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_len,
+ int32_t* out_len);
+
+GANDIVA_EXPORT
+const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
+ int32_t* out_len);
+
+// VARBINARY -> numeric casts. NOTE(review): presumably the binary-input counterparts
+// of the *_utf8 casts above; no test for them is visible alongside this header.
+GANDIVA_EXPORT
+int32_t gdv_fn_castINT_varbinary(gdv_int64 context, const char* in, int32_t in_len);
+
+GANDIVA_EXPORT
+int64_t gdv_fn_castBIGINT_varbinary(gdv_int64 context, const char* in, int32_t in_len);
+
+GANDIVA_EXPORT
+float gdv_fn_castFLOAT4_varbinary(gdv_int64 context, const char* in, int32_t in_len);
+
+GANDIVA_EXPORT
+double gdv_fn_castFLOAT8_varbinary(gdv_int64 context, const char* in, int32_t in_len);
+}
diff --git a/src/arrow/cpp/src/gandiva/gdv_function_stubs_test.cc b/src/arrow/cpp/src/gandiva/gdv_function_stubs_test.cc
new file mode 100644
index 000000000..f7c21981c
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/gdv_function_stubs_test.cc
@@ -0,0 +1,769 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/gdv_function_stubs.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "gandiva/execution_context.h"
+
+namespace gandiva {
+
+// Integer -> VARBINARY casts: full rendering, truncation to the requested length, and
+// the error recorded for a negative length.
+TEST(TestGdvFnStubs, TestCastVarbinaryNumeric) {
+  gandiva::ExecutionContext ctx;
+  const int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+  int32_t out_len = 0;
+
+  struct Case {
+    int64_t value;
+    int64_t max_len;
+    const char* expected;
+  };
+
+  // Each helper runs one cast and returns the produced bytes as a std::string.
+  auto from_int32 = [&](int64_t value, int64_t max_len) {
+    const char* bytes = gdv_fn_castVARBINARY_int32_int64(
+        ctx_ptr, static_cast<int32_t>(value), max_len, &out_len);
+    return std::string(bytes, out_len);
+  };
+  auto from_int64 = [&](int64_t value, int64_t max_len) {
+    const char* bytes =
+        gdv_fn_castVARBINARY_int64_int64(ctx_ptr, value, max_len, &out_len);
+    return std::string(bytes, out_len);
+  };
+
+  // int32 inputs; the last two rows request fewer bytes than the full rendering.
+  const Case int32_cases[] = {
+      {-46, 100, "-46"},
+      {2147483647, 100, "2147483647"},
+      {-2147483647 - 1, 100, "-2147483648"},
+      {0, 100, "0"},
+      {34567, 3, "345"},
+      {347, 0, ""},
+  };
+  for (const auto& c : int32_cases) {
+    EXPECT_EQ(from_int32(c.value, c.max_len), c.expected);
+    EXPECT_FALSE(ctx.has_error());
+  }
+
+  // A negative length is rejected; only the recorded error is inspected.
+  gdv_fn_castVARBINARY_int32_int64(ctx_ptr, 347, -1, &out_len);
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer length can not be negative"));
+  ctx.Reset();
+
+  // int64 inputs; the last row requests fewer bytes than the full rendering.
+  const Case int64_cases[] = {
+      {9223372036854775807LL, 100, "9223372036854775807"},
+      {-9223372036854775807LL - 1, 100, "-9223372036854775808"},
+      {0, 100, "0"},
+      {12345, 3, "123"},
+  };
+  for (const auto& c : int64_cases) {
+    EXPECT_EQ(from_int64(c.value, c.max_len), c.expected);
+    EXPECT_FALSE(ctx.has_error());
+  }
+}
+
+// Base64 encoding of a few short inputs, the empty input, and a negative length.
+TEST(TestGdvFnStubs, TestBase64Encode) {
+  gandiva::ExecutionContext ctx;
+  const auto ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+  int32_t out_len = 0;
+
+  // Encode `in_len` bytes of `in` and return the result as a std::string.
+  auto encode = [&](const char* in, int32_t in_len) {
+    const char* encoded = gdv_fn_base64_encode_binary(ctx_ptr, in, in_len, &out_len);
+    return std::string(encoded, out_len);
+  };
+
+  EXPECT_EQ(encode("hello", 5), "aGVsbG8=");
+  EXPECT_EQ(encode("test", 4), "dGVzdA==");
+  EXPECT_EQ(encode("hive", 4), "aGl2ZQ==");
+  EXPECT_EQ(encode("", 0), "");
+
+  // Negative length: empty output and an error recorded on the context.
+  EXPECT_EQ(encode("test", -5), "");
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer length can not be negative"));
+  ctx.Reset();
+}
+
+// Base64 decoding of a few short inputs, the empty input, and a negative length.
+TEST(TestGdvFnStubs, TestBase64Decode) {
+  gandiva::ExecutionContext ctx;
+  const auto ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+  int32_t out_len = 0;
+
+  // Decode `in_len` bytes of `in` and return the result as a std::string.
+  auto decode = [&](const char* in, int32_t in_len) {
+    const char* decoded = gdv_fn_base64_decode_utf8(ctx_ptr, in, in_len, &out_len);
+    return std::string(decoded, out_len);
+  };
+
+  EXPECT_EQ(decode("aGVsbG8=", 8), "hello");
+  EXPECT_EQ(decode("dGVzdA==", 8), "test");
+  EXPECT_EQ(decode("aGl2ZQ==", 8), "hive");
+  EXPECT_EQ(decode("", 0), "");
+
+  // Negative length: empty output and an error recorded on the context.
+  EXPECT_EQ(decode("test", -5), "");
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer length can not be negative"));
+  ctx.Reset();
+}
+
+TEST(TestGdvFnStubs, TestCastINT) {
+ gandiva::ExecutionContext ctx;
+
+ int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+
+ EXPECT_EQ(gdv_fn_castINT_utf8(ctx_ptr, "-45", 3), -45);
+ EXPECT_EQ(gdv_fn_castINT_utf8(ctx_ptr, "0", 1), 0);
+ EXPECT_EQ(gdv_fn_castINT_utf8(ctx_ptr, "2147483647", 10), 2147483647);
+ EXPECT_EQ(gdv_fn_castINT_utf8(ctx_ptr, "02147483647", 11), 2147483647);
+ EXPECT_EQ(gdv_fn_castINT_utf8(ctx_ptr, "-2147483648", 11), -2147483648LL);
+ EXPECT_EQ(gdv_fn_castINT_utf8(ctx_ptr, "-02147483648", 12), -2147483648LL);
+ EXPECT_EQ(gdv_fn_castINT_utf8(ctx_ptr, " 12 ", 4), 12);
+
+ gdv_fn_castINT_utf8(ctx_ptr, "2147483648", 10);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string 2147483648 to int32"));
+ ctx.Reset();
+
+ gdv_fn_castINT_utf8(ctx_ptr, "-2147483649", 11);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string -2147483649 to int32"));
+ ctx.Reset();
+
+ gdv_fn_castINT_utf8(ctx_ptr, "12.34", 5);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string 12.34 to int32"));
+ ctx.Reset();
+
+ gdv_fn_castINT_utf8(ctx_ptr, "abc", 3);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string abc to int32"));
+ ctx.Reset();
+
+ gdv_fn_castINT_utf8(ctx_ptr, "", 0);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string to int32"));
+ ctx.Reset();
+
+ gdv_fn_castINT_utf8(ctx_ptr, "-", 1);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string - to int32"));
+ ctx.Reset();
+}
+
+TEST(TestGdvFnStubs, TestCastBIGINT) {
+ gandiva::ExecutionContext ctx;
+
+ int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+
+ EXPECT_EQ(gdv_fn_castBIGINT_utf8(ctx_ptr, "-45", 3), -45);
+ EXPECT_EQ(gdv_fn_castBIGINT_utf8(ctx_ptr, "0", 1), 0);
+ EXPECT_EQ(gdv_fn_castBIGINT_utf8(ctx_ptr, "9223372036854775807", 19),
+ 9223372036854775807LL);
+ EXPECT_EQ(gdv_fn_castBIGINT_utf8(ctx_ptr, "09223372036854775807", 20),
+ 9223372036854775807LL);
+ EXPECT_EQ(gdv_fn_castBIGINT_utf8(ctx_ptr, "-9223372036854775808", 20),
+ -9223372036854775807LL - 1);
+ EXPECT_EQ(gdv_fn_castBIGINT_utf8(ctx_ptr, "-009223372036854775808", 22),
+ -9223372036854775807LL - 1);
+ EXPECT_EQ(gdv_fn_castBIGINT_utf8(ctx_ptr, " 12 ", 4), 12);
+
+ gdv_fn_castBIGINT_utf8(ctx_ptr, "9223372036854775808", 19);
+ EXPECT_THAT(
+ ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string 9223372036854775808 to int64"));
+ ctx.Reset();
+
+ gdv_fn_castBIGINT_utf8(ctx_ptr, "-9223372036854775809", 20);
+ EXPECT_THAT(
+ ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string -9223372036854775809 to int64"));
+ ctx.Reset();
+
+ gdv_fn_castBIGINT_utf8(ctx_ptr, "12.34", 5);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string 12.34 to int64"));
+ ctx.Reset();
+
+ gdv_fn_castBIGINT_utf8(ctx_ptr, "abc", 3);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string abc to int64"));
+ ctx.Reset();
+
+ gdv_fn_castBIGINT_utf8(ctx_ptr, "", 0);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string to int64"));
+ ctx.Reset();
+
+ gdv_fn_castBIGINT_utf8(ctx_ptr, "-", 1);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string - to int64"));
+ ctx.Reset();
+}
+
+TEST(TestGdvFnStubs, TestCastFloat4) {
+ gandiva::ExecutionContext ctx;
+
+ int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+
+ EXPECT_EQ(gdv_fn_castFLOAT4_utf8(ctx_ptr, "-45.34", 6), -45.34f);
+ EXPECT_EQ(gdv_fn_castFLOAT4_utf8(ctx_ptr, "0", 1), 0.0f);
+ EXPECT_EQ(gdv_fn_castFLOAT4_utf8(ctx_ptr, "5", 1), 5.0f);
+ EXPECT_EQ(gdv_fn_castFLOAT4_utf8(ctx_ptr, " 3.4 ", 5), 3.4f);
+
+ gdv_fn_castFLOAT4_utf8(ctx_ptr, "", 0);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string to float"));
+ ctx.Reset();
+
+ gdv_fn_castFLOAT4_utf8(ctx_ptr, "e", 1);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string e to float"));
+ ctx.Reset();
+}
+
+TEST(TestGdvFnStubs, TestCastFloat8) {
+ gandiva::ExecutionContext ctx;
+
+ int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+
+ EXPECT_EQ(gdv_fn_castFLOAT8_utf8(ctx_ptr, "-45.34", 6), -45.34);
+ EXPECT_EQ(gdv_fn_castFLOAT8_utf8(ctx_ptr, "0", 1), 0.0);
+ EXPECT_EQ(gdv_fn_castFLOAT8_utf8(ctx_ptr, "5", 1), 5.0);
+ EXPECT_EQ(gdv_fn_castFLOAT8_utf8(ctx_ptr, " 3.4 ", 5), 3.4);
+
+ gdv_fn_castFLOAT8_utf8(ctx_ptr, "", 0);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string to double"));
+ ctx.Reset();
+
+ gdv_fn_castFLOAT8_utf8(ctx_ptr, "e", 1);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Failed to cast the string e to double"));
+ ctx.Reset();
+}
+
+// int32 -> VARCHAR cast: full rendering, truncation to the requested length, and the
+// error recorded for a negative length.
+TEST(TestGdvFnStubs, TestCastVARCHARFromInt32) {
+  gandiva::ExecutionContext ctx;
+  // The stub takes its context as int64_t; keeping the local the same type avoids an
+  // unsigned -> signed conversion at every call.
+  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+  int32_t out_len = 0;
+
+  const char* out_str = gdv_fn_castVARCHAR_int32_int64(ctx_ptr, -46, 100, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "-46");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_int32_int64(ctx_ptr, 2147483647, 100, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "2147483647");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_int32_int64(ctx_ptr, -2147483647 - 1, 100, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "-2147483648");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_int32_int64(ctx_ptr, 0, 100, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "0");
+  EXPECT_FALSE(ctx.has_error());
+
+  // test with required length less than actual buffer length
+  out_str = gdv_fn_castVARCHAR_int32_int64(ctx_ptr, 34567, 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "345");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_int32_int64(ctx_ptr, 347, 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  // Negative length: only the recorded error matters, so the result is discarded.
+  gdv_fn_castVARCHAR_int32_int64(ctx_ptr, 347, -1, &out_len);
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer length can not be negative"));
+  ctx.Reset();
+}
+
+// int64 -> VARCHAR cast: full rendering of the extreme values and zero, plus
+// truncation to the requested length.
+TEST(TestGdvFnStubs, TestCastVARCHARFromInt64) {
+  gandiva::ExecutionContext ctx;
+  // The stub takes its context as int64_t; keeping the local the same type avoids an
+  // unsigned -> signed conversion at every call.
+  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+  int32_t out_len = 0;
+
+  const char* out_str =
+      gdv_fn_castVARCHAR_int64_int64(ctx_ptr, 9223372036854775807LL, 100, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "9223372036854775807");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str =
+      gdv_fn_castVARCHAR_int64_int64(ctx_ptr, -9223372036854775807LL - 1, 100, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "-9223372036854775808");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_int64_int64(ctx_ptr, 0, 100, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "0");
+  EXPECT_FALSE(ctx.has_error());
+
+  // test with required length less than actual buffer length
+  out_str = gdv_fn_castVARCHAR_int64_int64(ctx_ptr, 12345, 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "123");
+  EXPECT_FALSE(ctx.has_error());
+}
+
+TEST(TestGdvFnStubs, TestCastVARCHARFromFloat) {
+  // Each row: input float, requested maximum output length, expected text.
+  struct Case {
+    float value;
+    int64_t max_len;
+    const char* expected;
+  };
+  const Case kCases[] = {
+      {4.567f, 100, "4.567"},
+      {-3.4567f, 100, "-3.4567"},
+      {0.00001f, 100, "1.0E-5"},
+      {0.00099999f, 100, "9.9999E-4"},
+      {0.0f, 100, "0.0"},
+      {10.00000f, 100, "10.0"},
+      // test with required length less than actual buffer length
+      {1.2345f, 3, "1.2"},
+  };
+
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+  int32_t out_len = 0;
+
+  for (const Case& c : kCases) {
+    const char* out_str =
+        gdv_fn_castVARCHAR_float32_int64(ctx_ptr, c.value, c.max_len, &out_len);
+    EXPECT_EQ(std::string(out_str, out_len), c.expected);
+    EXPECT_FALSE(ctx.has_error());
+  }
+}
+
+// Exercises gdv_fn_castVARCHAR_float64_int64: plain decimals, scientific notation for
+// small magnitudes, zero, and truncation to the requested length.
+TEST(TestGdvFnStubs, TestCastVARCHARFromDouble) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+  int32_t out_len = 0;
+
+  const char* out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, 4.567, 100, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "4.567");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, -3.4567, 100, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "-3.4567");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, 0.00001, 100, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "1.0E-5");
+  EXPECT_FALSE(ctx.has_error());
+
+  // This case previously called the float32 stub with a float literal (copy-paste from
+  // the float test), so the double path was never covered for this value.
+  out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, 0.00099999, 100, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "9.9999E-4");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, 0.0, 100, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "0.0");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, 10.0000000000, 100, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "10.0");
+  EXPECT_FALSE(ctx.has_error());
+
+  // test with required length less than actual buffer length
+  out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, 1.2345, 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "1.2");
+  EXPECT_FALSE(ctx.has_error());
+}
+
+TEST(TestGdvFnStubs, TestUpper) {
+  // Well-formed inputs: text, byte length handed to the stub, expected upper-cased text.
+  struct ValidCase {
+    const char* input;
+    gdv_int32 in_len;
+    const char* expected;
+  };
+  const ValidCase kValid[] = {
+      {"AbcDEfGh", 8, "ABCDEFGH"},
+      {"asdfj", 5, "ASDFJ"},
+      {"s;dcGS,jO!l", 11, "S;DCGS,JO!L"},
+      {"münchen", 8, "MÜNCHEN"},
+      {"CITROËN", 8, "CITROËN"},
+      {"âBćDëFGH", 11, "ÂBĆDËFGH"},
+      {"øhpqRšvñ", 11, "ØHPQRŠVÑ"},
+      {"Möbelträgerfüße", 19, "MÖBELTRÄGERFÜẞE"},
+      {"{õhp,PQŚv}ń+", 15, "{ÕHP,PQŚV}Ń+"},
+      {"", 0, ""},
+  };
+
+  // Malformed UTF-8 inputs: the stub must return an empty string and record an error.
+  struct InvalidCase {
+    std::string input;
+    const char* error;
+  };
+  const InvalidCase kInvalid[] = {
+      {std::string("AbOJjÜoß\xc3"),
+       "unexpected byte \\c3 encountered while decoding utf8 string"},
+      {std::string("åbÑg\xe0\xa0"
+                   "åBUå"),
+       "unexpected byte \\e0 encountered while decoding utf8 string"},
+  };
+
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+
+  for (const ValidCase& c : kValid) {
+    const char* out_str = gdv_fn_upper_utf8(ctx_ptr, c.input, c.in_len, &out_len);
+    EXPECT_EQ(std::string(out_str, out_len), c.expected);
+    EXPECT_FALSE(ctx.has_error());
+  }
+
+  for (const InvalidCase& c : kInvalid) {
+    const char* out_str = gdv_fn_upper_utf8(
+        ctx_ptr, c.input.data(), static_cast<int>(c.input.length()), &out_len);
+    EXPECT_EQ(std::string(out_str, out_len), "");
+    EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr(c.error));
+    ctx.Reset();
+  }
+}
+
+TEST(TestGdvFnStubs, TestLower) {
+  // Well-formed inputs: text, byte length handed to the stub, expected lower-cased text.
+  struct ValidCase {
+    const char* input;
+    gdv_int32 in_len;
+    const char* expected;
+  };
+  const ValidCase kValid[] = {
+      {"AbcDEfGh", 8, "abcdefgh"},
+      {"asdfj", 5, "asdfj"},
+      {"S;DCgs,Jo!L", 11, "s;dcgs,jo!l"},
+      {"MÜNCHEN", 8, "münchen"},
+      {"citroën", 8, "citroën"},
+      {"ÂbĆDËFgh", 11, "âbćdëfgh"},
+      {"ØHPQrŠvÑ", 11, "øhpqršvñ"},
+      {"MÖBELTRÄGERFÜẞE", 20, "möbelträgerfüße"},
+      {"{ÕHP,pqśv}Ń+", 15, "{õhp,pqśv}ń+"},
+      {"", 0, ""},
+  };
+
+  // Malformed UTF-8 inputs: the stub must return an empty string and record an error.
+  struct InvalidCase {
+    std::string input;
+    const char* error;
+  };
+  const InvalidCase kInvalid[] = {
+      {std::string("AbOJjÜoß\xc3"),
+       "unexpected byte \\c3 encountered while decoding utf8 string"},
+      {std::string("åbÑg\xe0\xa0"
+                   "åBUå"),
+       "unexpected byte \\e0 encountered while decoding utf8 string"},
+  };
+
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+
+  for (const ValidCase& c : kValid) {
+    const char* out_str = gdv_fn_lower_utf8(ctx_ptr, c.input, c.in_len, &out_len);
+    EXPECT_EQ(std::string(out_str, out_len), c.expected);
+    EXPECT_FALSE(ctx.has_error());
+  }
+
+  for (const InvalidCase& c : kInvalid) {
+    const char* out_str = gdv_fn_lower_utf8(
+        ctx_ptr, c.input.data(), static_cast<int>(c.input.length()), &out_len);
+    EXPECT_EQ(std::string(out_str, out_len), "");
+    EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr(c.error));
+    ctx.Reset();
+  }
+}
+
+TEST(TestGdvFnStubs, TestInitCap) {
+  // Well-formed inputs: text, byte length handed to the stub, expected init-capped text.
+  struct ValidCase {
+    const char* input;
+    gdv_int32 in_len;
+    const char* expected;
+  };
+  const ValidCase kValid[] = {
+      {"test string", 11, "Test String"},
+      {"asdfj\nhlqf", 10, "Asdfj\nHlqf"},
+      {"s;DCgs,Jo!l", 11, "S;Dcgs,Jo!L"},
+      {" mÜNCHEN", 9, " München"},
+      {"citroën CaR", 12, "Citroën Car"},
+      {"ÂbĆDËFgh\néll", 16, "Âbćdëfgh\nÉll"},
+      {" øhpqršvñ \n\n", 17, " Øhpqršvñ \n\n"},
+      {"möbelträgerfüße \nmöbelträgerfüße", 42, "Möbelträgerfüße \nMöbelträgerfüße"},
+      {"{ÕHP,pqśv}Ń+", 15, "{Õhp,Pqśv}Ń+"},
+      {"sɦasasdsɦsd\"sdsdɦ", 19, "Sɦasasdsɦsd\"Sdsdɦ"},
+      {"mysuperscipt@number²isfine", 27, "Mysuperscipt@Number²Isfine"},
+      {"Ő<tŵas̓老ƕɱ¢vIYwށ", 25, "Ő<Tŵas̓老Ƕɱ¢Viywށ"},
+      {"ↆcheckↆnumberisspace", 24, "ↆcheckↆnumberisspace"},
+      {"testing ᾌTitleᾌcase", 23, "Testing ᾌtitleᾄcase"},
+      {"ʳTesting mʳodified", 20, "ʳTesting MʳOdified"},
+      {"", 0, ""},
+  };
+
+  // Malformed UTF-8 inputs: the stub must return an empty string and record an error.
+  struct InvalidCase {
+    std::string input;
+    const char* error;
+  };
+  const InvalidCase kInvalid[] = {
+      {std::string("AbOJjÜoß\xc3"),
+       "unexpected byte \\c3 encountered while decoding utf8 string"},
+      {std::string("åbÑg\xe0\xa0"
+                   "åBUå"),
+       "unexpected byte \\e0 encountered while decoding utf8 string"},
+  };
+
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+
+  for (const ValidCase& c : kValid) {
+    const char* out_str = gdv_fn_initcap_utf8(ctx_ptr, c.input, c.in_len, &out_len);
+    EXPECT_EQ(std::string(out_str, out_len), c.expected);
+    EXPECT_FALSE(ctx.has_error());
+  }
+
+  for (const InvalidCase& c : kInvalid) {
+    const char* out_str = gdv_fn_initcap_utf8(
+        ctx_ptr, c.input.data(), static_cast<int>(c.input.length()), &out_len);
+    EXPECT_EQ(std::string(out_str, out_len), "");
+    EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr(c.error));
+    ctx.Reset();
+  }
+}
+
+TEST(TestGdvFnStubs, TestCastVarbinaryINT) {
+  // Inputs that must parse: bytes, byte length, expected integer.
+  struct ValidCase {
+    const char* input;
+    int32_t in_len;
+    int64_t expected;
+  };
+  const ValidCase kValid[] = {
+      {"-45", 3, -45},
+      {"0", 1, 0},
+      {"2147483647", 10, 2147483647},
+      {"\x32\x33", 2, 23},
+      {"02147483647", 11, 2147483647},
+      {"-2147483648", 11, -2147483648LL},
+      {"-02147483648", 12, -2147483648LL},
+      {" 12 ", 4, 12},
+  };
+
+  // Inputs that must be rejected: bytes, byte length, expected error fragment.
+  struct InvalidCase {
+    const char* input;
+    int32_t in_len;
+    const char* error;
+  };
+  const InvalidCase kInvalid[] = {
+      {"2147483648", 10, "Failed to cast the string 2147483648 to int32"},
+      {"-2147483649", 11, "Failed to cast the string -2147483649 to int32"},
+      {"12.34", 5, "Failed to cast the string 12.34 to int32"},
+      {"abc", 3, "Failed to cast the string abc to int32"},
+      {"", 0, "Failed to cast the string to int32"},
+      {"-", 1, "Failed to cast the string - to int32"},
+  };
+
+  gandiva::ExecutionContext ctx;
+  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+
+  for (const ValidCase& c : kValid) {
+    EXPECT_EQ(gdv_fn_castINT_varbinary(ctx_ptr, c.input, c.in_len), c.expected);
+  }
+
+  for (const InvalidCase& c : kInvalid) {
+    gdv_fn_castINT_varbinary(ctx_ptr, c.input, c.in_len);
+    EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr(c.error));
+    ctx.Reset();
+  }
+}
+
+TEST(TestGdvFnStubs, TestCastVarbinaryBIGINT) {
+  // Inputs that must parse: bytes, byte length, expected 64-bit integer.
+  struct ValidCase {
+    const char* input;
+    int32_t in_len;
+    int64_t expected;
+  };
+  const ValidCase kValid[] = {
+      {"-45", 3, -45},
+      {"0", 1, 0},
+      {"9223372036854775807", 19, 9223372036854775807LL},
+      {"09223372036854775807", 20, 9223372036854775807LL},
+      {"-9223372036854775808", 20, -9223372036854775807LL - 1},
+      {"-009223372036854775808", 22, -9223372036854775807LL - 1},
+      {" 12 ", 4, 12},
+      {"\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39", 10, 9999999999LL},
+  };
+
+  // Inputs that must be rejected: bytes, byte length, expected error fragment.
+  struct InvalidCase {
+    const char* input;
+    int32_t in_len;
+    const char* error;
+  };
+  const InvalidCase kInvalid[] = {
+      {"9223372036854775808", 19,
+       "Failed to cast the string 9223372036854775808 to int64"},
+      {"-9223372036854775809", 20,
+       "Failed to cast the string -9223372036854775809 to int64"},
+      {"12.34", 5, "Failed to cast the string 12.34 to int64"},
+      {"abc", 3, "Failed to cast the string abc to int64"},
+      {"", 0, "Failed to cast the string to int64"},
+      {"-", 1, "Failed to cast the string - to int64"},
+  };
+
+  gandiva::ExecutionContext ctx;
+  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+
+  for (const ValidCase& c : kValid) {
+    EXPECT_EQ(gdv_fn_castBIGINT_varbinary(ctx_ptr, c.input, c.in_len), c.expected);
+  }
+
+  for (const InvalidCase& c : kInvalid) {
+    gdv_fn_castBIGINT_varbinary(ctx_ptr, c.input, c.in_len);
+    EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr(c.error));
+    ctx.Reset();
+  }
+}
+
+TEST(TestGdvFnStubs, TestCastVarbinaryFloat4) {
+  // Inputs that must parse: bytes, byte length, expected float.
+  struct ValidCase {
+    const char* input;
+    int32_t in_len;
+    float expected;
+  };
+  const ValidCase kValid[] = {
+      {"-45.34", 6, -45.34f},
+      {"0", 1, 0.0f},
+      {"5", 1, 5.0f},
+      {" 3.4 ", 5, 3.4f},
+      {" \x33\x2E\x34 ", 5, 3.4f},
+  };
+
+  // Inputs that must be rejected: bytes, byte length, expected error fragment.
+  struct InvalidCase {
+    const char* input;
+    int32_t in_len;
+    const char* error;
+  };
+  const InvalidCase kInvalid[] = {
+      {"", 0, "Failed to cast the string to float"},
+      {"e", 1, "Failed to cast the string e to float"},
+  };
+
+  gandiva::ExecutionContext ctx;
+  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+
+  for (const ValidCase& c : kValid) {
+    EXPECT_EQ(gdv_fn_castFLOAT4_varbinary(ctx_ptr, c.input, c.in_len), c.expected);
+  }
+
+  for (const InvalidCase& c : kInvalid) {
+    gdv_fn_castFLOAT4_varbinary(ctx_ptr, c.input, c.in_len);
+    EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr(c.error));
+    ctx.Reset();
+  }
+}
+
+TEST(TestGdvFnStubs, TestCastVarbinaryFloat8) {
+  // Inputs that must parse: bytes, byte length, expected double.
+  struct ValidCase {
+    const char* input;
+    int32_t in_len;
+    double expected;
+  };
+  const ValidCase kValid[] = {
+      {"-45.34", 6, -45.34},
+      {"0", 1, 0.0},
+      {"5", 1, 5.0},
+      {" \x33\x2E\x34 ", 5, 3.4},
+  };
+
+  // Inputs that must be rejected: bytes, byte length, expected error fragment.
+  struct InvalidCase {
+    const char* input;
+    int32_t in_len;
+    const char* error;
+  };
+  const InvalidCase kInvalid[] = {
+      {"", 0, "Failed to cast the string to double"},
+      {"e", 1, "Failed to cast the string e to double"},
+  };
+
+  gandiva::ExecutionContext ctx;
+  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+
+  for (const ValidCase& c : kValid) {
+    EXPECT_EQ(gdv_fn_castFLOAT8_varbinary(ctx_ptr, c.input, c.in_len), c.expected);
+  }
+
+  for (const InvalidCase& c : kInvalid) {
+    gdv_fn_castFLOAT8_varbinary(ctx_ptr, c.input, c.in_len);
+    EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr(c.error));
+    ctx.Reset();
+  }
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/greedy_dual_size_cache.h b/src/arrow/cpp/src/gandiva/greedy_dual_size_cache.h
new file mode 100644
index 000000000..cb5c38e07
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/greedy_dual_size_cache.h
@@ -0,0 +1,154 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <list>
+#include <queue>
+#include <set>
+#include <unordered_map>
+#include <utility>
+
+#include "arrow/util/optional.h"
+
+// modified cache to support evict policy using the GreedyDual-Size algorithm.
+namespace gandiva {
+// Wrapper for a value stored in the cache: the value itself (`module`) plus the cost
+// associated with it. Objects are ordered by cost alone.
+template <typename ValueType>
+class ValueCacheObject {
+ public:
+  ValueCacheObject(ValueType module_in, uint64_t cost_in)
+      : module(module_in), cost(cost_in) {}
+
+  // The cached value.
+  ValueType module;
+  // The cost attached to the value.
+  uint64_t cost;
+
+  // Strict ordering on cost only; `module` does not take part in the comparison.
+  bool operator<(const ValueCacheObject& rhs) const { return rhs.cost > cost; }
+};
+
+// A particular cache based on the GreedyDual-Size cache which is a generalization of LRU
+// which defines costs for each cache values.
+// The algorithm associates a cost, C, with each cache value. Initially, when the value
+// is brought into cache, C is set to be the cost related to the value (the cost is
+// always non-negative). When a replacement needs to be made, the value with the lowest C
+// cost is replaced, and then all values reduce their C costs by the minimum value of C
+// over all the values already in the cache.
+// If a value is accessed, its C value is restored to its initial cost. Thus, the C costs
+// of recently accessed values retain a larger portion of the original cost than those of
+// values that have not been accessed for a long time. The C costs are reduced as time
+// goes and are restored when accessed.
+
+template <class Key, class Value>
+class GreedyDualSizeCache {
+  // Entry of the priority set. One entry exists per cached key.
+  class PriorityItem {
+   public:
+    PriorityItem(uint64_t actual_priority, uint64_t original_priority, Key key,
+                 uint64_t sequence)
+        : actual_priority(actual_priority),
+          original_priority(original_priority),
+          cache_key(key),
+          sequence(sequence) {}
+    // Items with the lowest priority sort first so that evict() can take them from the
+    // beginning of the set. The sequence number breaks ties: std::set treats items that
+    // compare equivalent as duplicates and refuses to insert them, which would leave
+    // two keys sharing (and later dangling on) a single set entry.
+    bool operator<(const PriorityItem& other) const {
+      if (actual_priority != other.actual_priority) {
+        return actual_priority < other.actual_priority;
+      }
+      return sequence < other.sequence;
+    }
+    // current priority: original cost plus the inflation at insertion/refresh time
+    uint64_t actual_priority;
+    // cost supplied by the caller when the value was inserted
+    uint64_t original_priority;
+    Key cache_key;
+    // unique, monotonically increasing tie-breaker (older items sort first)
+    uint64_t sequence;
+  };
+
+ public:
+  // Hash functor that delegates to the key's own Hash() method.
+  struct hasher {
+    template <typename I>
+    std::size_t operator()(const I& i) const {
+      return i.Hash();
+    }
+  };
+  // a map from 'key' to a pair of Value and a pointer to the priority value
+  using map_type = std::unordered_map<
+      Key, std::pair<ValueCacheObject<Value>, typename std::set<PriorityItem>::iterator>,
+      hasher>;
+
+  // Creates a cache holding at most `capacity` entries.
+  explicit GreedyDualSizeCache(size_t capacity)
+      : inflation_(0), capacity_(capacity), next_sequence_(0) {}
+
+  ~GreedyDualSizeCache() = default;
+
+  size_t size() const { return map_.size(); }
+
+  size_t capacity() const { return capacity_; }
+
+  bool empty() const { return map_.empty(); }
+
+  bool contains(const Key& key) const { return map_.find(key) != map_.end(); }
+
+  // Adds `value` under `key` unless the key is already cached (existing entries are
+  // left untouched). When the cache is full, the lowest-priority entry is evicted first.
+  // A cache created with capacity 0 stores nothing.
+  void insert(const Key& key, const ValueCacheObject<Value>& value) {
+    if (capacity_ == 0 || map_.find(key) != map_.end()) {
+      return;
+    }
+    // make room before inserting
+    if (size() >= capacity_) {
+      evict();
+    }
+
+    // insert the new item; the unique sequence guarantees the set accepts it
+    auto item = priority_set_.insert(
+        PriorityItem(value.cost + inflation_, value.cost, key, next_sequence_++));
+    // save on map the value and the priority item iterator position
+    map_.emplace(key, std::make_pair(value, item.first));
+  }
+
+  // Returns the cached object for `key`, or nullopt when the key is not cached. A hit
+  // restores the entry's priority to (original cost + current inflation).
+  arrow::util::optional<ValueCacheObject<Value>> get(const Key& key) {
+    typename map_type::iterator value_for_key = map_.find(key);
+    if (value_for_key == map_.end()) {
+      // value not in cache
+      return arrow::util::nullopt;
+    }
+    PriorityItem item = *value_for_key->second.second;
+    // if the value was found on the cache, update its cost (original + inflation)
+    if (item.actual_priority != item.original_priority + inflation_) {
+      priority_set_.erase(value_for_key->second.second);
+      auto iter = priority_set_.insert(
+          PriorityItem(item.original_priority + inflation_, item.original_priority,
+                       item.cache_key, next_sequence_++));
+      value_for_key->second.second = iter.first;
+    }
+    return value_for_key->second.first;
+  }
+
+  // Drops every cached entry.
+  void clear() {
+    map_.clear();
+    priority_set_.clear();
+  }
+
+ private:
+  // Removes the entry with the lowest priority and raises the inflation to that
+  // entry's priority. Does nothing when the cache holds no entries.
+  void evict() {
+    // TODO: inflation overflow is unlikely to happen but needs to be handled
+    // for correctness.
+    if (priority_set_.empty()) {
+      return;
+    }
+    // This set is ordered from the lower priority value to the higher priority value.
+    typename std::set<PriorityItem>::iterator i = priority_set_.begin();
+    // update the inflation cost related to the evicted item
+    inflation_ = (*i).actual_priority;
+    map_.erase((*i).cache_key);
+    priority_set_.erase(i);
+  }
+
+  map_type map_;
+  std::set<PriorityItem> priority_set_;
+  uint64_t inflation_;
+  size_t capacity_;
+  // source of PriorityItem::sequence values
+  uint64_t next_sequence_;
+};
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/greedy_dual_size_cache_test.cc b/src/arrow/cpp/src/gandiva/greedy_dual_size_cache_test.cc
new file mode 100644
index 000000000..3c72eef70
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/greedy_dual_size_cache_test.cc
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/greedy_dual_size_cache.h"
+
+#include <string>
+#include <typeinfo>
+
+#include <gtest/gtest.h>
+
+namespace gandiva {
+
+// Minimal cache key for the tests: wraps an int, hashes to that int and compares by it.
+class GreedyDualSizeCacheKey {
+ public:
+  explicit GreedyDualSizeCacheKey(int id) : id_(id) {}
+
+  // Hash value required by GreedyDualSizeCache::hasher.
+  std::size_t Hash() const { return static_cast<std::size_t>(id_); }
+
+  bool operator==(const GreedyDualSizeCacheKey& rhs) const { return rhs.id_ == id_; }
+
+ private:
+  int id_;
+};
+
+// Fixture giving every test a fresh cache with room for two entries.
+class TestGreedyDualSizeCache : public ::testing::Test {
+ public:
+  TestGreedyDualSizeCache() : cache_{2} {}
+
+ protected:
+  // cache under test
+  GreedyDualSizeCache<GreedyDualSizeCacheKey, std::string> cache_;
+};
+
+TEST_F(TestGreedyDualSizeCache, TestEvict) {
+  // check if the cache is evicting the items with low priority on cache
+  struct Entry {
+    int key;
+    const char* module;
+    uint64_t cost;
+  };
+  // insertion order matters: the cache only holds two entries at a time
+  const Entry kInserts[] = {
+      {1, "1", 1}, {2, "2", 10}, {3, "3", 20}, {4, "4", 15}, {1, "5", 1},
+  };
+  for (const Entry& e : kInserts) {
+    cache_.insert(GreedyDualSizeCacheKey(e.key),
+                  ValueCacheObject<std::string>(e.module, e.cost));
+  }
+  ASSERT_EQ(2, cache_.size());
+
+  // we check initially the values that won't be on the cache, since the get operation
+  // may affect the entity costs, which is not the purpose of this test
+  for (int missing_key : {2, 3}) {
+    ASSERT_EQ(cache_.get(GreedyDualSizeCacheKey(missing_key)), arrow::util::nullopt);
+  }
+  ASSERT_EQ(cache_.get(GreedyDualSizeCacheKey(1))->module, "5");
+  ASSERT_EQ(cache_.get(GreedyDualSizeCacheKey(4))->module, "4");
+}
+
+TEST_F(TestGreedyDualSizeCache, TestGreedyDualSizeBehavior) {
+  // shorthand for inserting (key, module, cost) into the fixture's cache
+  auto put = [this](int key, const char* module, uint64_t cost) {
+    cache_.insert(GreedyDualSizeCacheKey(key),
+                  ValueCacheObject<std::string>(module, cost));
+  };
+
+  // insert 1 and 3 evicting 2 (this eviction will increase the inflation cost by 20)
+  put(1, "1", 40);
+  put(2, "2", 20);
+  put(3, "3", 30);
+
+  // when accessing key 3, its actual cost will be increased by the inflation, so in the
+  // next eviction, the key 1 will be evicted, since the key 1 actual cost (original(40))
+  // is smaller than key 3 actual increased cost (original(30) + inflation(20))
+  ASSERT_EQ(cache_.get(GreedyDualSizeCacheKey(3))->module, "3");
+
+  // try to insert key 2 and expect the eviction of key 1
+  put(2, "2", 20);
+  ASSERT_EQ(cache_.get(GreedyDualSizeCacheKey(1)), arrow::util::nullopt);
+
+  // when accessing key 2, its original cost should be increased by inflation, so when
+  // inserting the key 1 again, now the key 3 should be evicted
+  ASSERT_EQ(cache_.get(GreedyDualSizeCacheKey(2))->module, "2");
+  put(1, "1", 20);
+
+  ASSERT_EQ(cache_.get(GreedyDualSizeCacheKey(1))->module, "1");
+  ASSERT_EQ(cache_.get(GreedyDualSizeCacheKey(2))->module, "2");
+  ASSERT_EQ(cache_.get(GreedyDualSizeCacheKey(3)), arrow::util::nullopt);
+  ASSERT_EQ(2, cache_.size());
+}
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/hash_utils.cc b/src/arrow/cpp/src/gandiva/hash_utils.cc
new file mode 100644
index 000000000..8ebf60a9b
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/hash_utils.cc
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/hash_utils.h"
+#include <cstring>
+#include "arrow/util/logging.h"
+#include "gandiva/gdv_function_stubs.h"
+#include "openssl/evp.h"
+
+namespace gandiva {
+/// Computes the SHA256 hash of \b message by delegating to gdv_hash_using_sha
+/// with a 64-character result buffer.
+GANDIVA_EXPORT
+const char* gdv_hash_using_sha256(int64_t context, const void* message,
+                                  size_t message_length, int32_t* out_length) {
+  // size in characters of the result buffer requested for SHA256
+  constexpr int kSha256ResultLength = 64;
+  const EVP_MD* digest_type = EVP_sha256();
+  return gdv_hash_using_sha(context, message, message_length, digest_type,
+                            kSha256ResultLength, out_length);
+}
+
+/// Computes the SHA1 hash of \b message by delegating to gdv_hash_using_sha
+/// with a 40-character result buffer.
+GANDIVA_EXPORT
+const char* gdv_hash_using_sha1(int64_t context, const void* message,
+                                size_t message_length, int32_t* out_length) {
+  // size in characters of the result buffer requested for SHA1
+  constexpr int kSha1ResultLength = 40;
+  const EVP_MD* digest_type = EVP_sha1();
+  return gdv_hash_using_sha(context, message, message_length, digest_type,
+                            kSha1ResultLength, out_length);
+}
+
+/// \brief Hashes a generic message using SHA algorithm.
+///
+/// It uses the EVP API in the OpenSSL library to generate
+/// the hash. The type of the hash is defined by the
+/// \b hash_type \b parameter.
+///
+/// \param context execution context used for error reporting and arena allocation
+/// \param message bytes to hash
+/// \param message_length number of bytes in \b message
+/// \param hash_type OpenSSL digest to apply
+/// \param result_buf_size size of the hex output; must be twice the digest size
+/// \param out_length receives \b result_buf_size on success and 0 on failure
+/// \return the lowercase hex digest (not NUL-terminated) allocated from the context
+/// arena, or "" after an error has been set on the context
+GANDIVA_EXPORT
+const char* gdv_hash_using_sha(int64_t context, const void* message,
+                               size_t message_length, const EVP_MD* hash_type,
+                               uint32_t result_buf_size, int32_t* out_length) {
+  // pessimistic default: every early return below reports an empty result
+  *out_length = 0;
+
+  EVP_MD_CTX* md_ctx = EVP_MD_CTX_new();
+  if (md_ctx == nullptr) {
+    gdv_fn_context_set_error_msg(context,
+                                 "Could not create the context for SHA processing.");
+    return "";
+  }
+
+  constexpr int evp_success_status = 1;
+
+  if (EVP_DigestInit_ex(md_ctx, hash_type, nullptr) != evp_success_status ||
+      EVP_DigestUpdate(md_ctx, message, message_length) != evp_success_status) {
+    gdv_fn_context_set_error_msg(context,
+                                 "Could not obtain the hash for the defined value.");
+    EVP_MD_CTX_free(md_ctx);
+    return "";
+  }
+
+  // EVP_DigestFinal_ex writes at most EVP_MAX_MD_SIZE bytes, so a stack buffer is
+  // enough and removes a heap allocation that had to be freed on every exit path.
+  unsigned char digest[EVP_MAX_MD_SIZE];
+  unsigned int digest_length = 0;
+  const int final_status = EVP_DigestFinal_ex(md_ctx, digest, &digest_length);
+  EVP_MD_CTX_free(md_ctx);
+
+  // EVP_MD_size reports a non-positive value on error; keep it signed for the check.
+  const int expected_digest_size = EVP_MD_size(hash_type);
+
+  // Reject the result if finalisation failed, if the digest size is not the one the
+  // algorithm announces, or if the caller's buffer is not exactly two hex characters
+  // per digest byte. Any single mismatch is an error.
+  if (final_status != evp_success_status || expected_digest_size <= 0 ||
+      digest_length != static_cast<unsigned int>(expected_digest_size) ||
+      result_buf_size != 2 * digest_length) {
+    gdv_fn_context_set_error_msg(context,
+                                 "Could not obtain the hash for the defined value");
+    return "";
+  }
+
+  auto result_buffer =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, result_buf_size));
+
+  if (result_buffer == nullptr) {
+    gdv_fn_context_set_error_msg(context,
+                                 "Could not allocate memory for the result buffer");
+    return "";
+  }
+
+  // Hex-encode by hand: snprintf would append a NUL terminator, and for the last
+  // digest byte that terminator falls one byte past the end of the result buffer.
+  static const char kHexDigits[] = "0123456789abcdef";
+  for (unsigned int j = 0; j < digest_length; j++) {
+    result_buffer[2 * j] = kHexDigits[digest[j] >> 4];
+    result_buffer[2 * j + 1] = kHexDigits[digest[j] & 0x0F];
+  }
+
+  *out_length = static_cast<int32_t>(result_buf_size);
+  return result_buffer;
+}
+
+/// Returns the object representation of \b value copied byte-for-byte into a
+/// 64-bit unsigned integer (no numeric conversion takes place).
+GANDIVA_EXPORT
+uint64_t gdv_double_to_long(double value) {
+  uint64_t bits;
+  std::memcpy(&bits, &value, sizeof(bits));
+  return bits;
+}
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/hash_utils.h b/src/arrow/cpp/src/gandiva/hash_utils.h
new file mode 100644
index 000000000..483993f30
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/hash_utils.h
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_SRC_HASH_UTILS_H_
+#define ARROW_SRC_HASH_UTILS_H_
+
+#include <cstdint>
+#include <cstdlib>
+#include "gandiva/visibility.h"
+#include "openssl/evp.h"
+
+namespace gandiva {
+// Hashes `message_length` bytes starting at `message` and returns the digest
+// as text; the number of characters returned is written to `*out_length`.
+// The unit tests for this function expect a 64-character result.
+GANDIVA_EXPORT
+const char* gdv_hash_using_sha256(int64_t context, const void* message,
+ size_t message_length, int32_t* out_length);
+
+// Same contract as gdv_hash_using_sha256; the unit tests for this function
+// expect a 40-character result.
+GANDIVA_EXPORT
+const char* gdv_hash_using_sha1(int64_t context, const void* message,
+ size_t message_length, int32_t* out_length);
+
+// Generic entry point: hashes the message with the OpenSSL digest `hash_type`
+// and writes the digest as lowercase hex into a buffer of `result_buf_size`
+// bytes obtained from the context arena. On success `*out_length` is set to
+// `result_buf_size`; on failure an error message is recorded on `context`,
+// `*out_length` is set to 0 and an empty string is returned.
+GANDIVA_EXPORT
+const char* gdv_hash_using_sha(int64_t context, const void* message,
+ size_t message_length, const EVP_MD* hash_type,
+ uint32_t result_buf_size, int32_t* out_length);
+
+// Copies the bytes of `value` into a uint64_t and returns it.
+GANDIVA_EXPORT
+uint64_t gdv_double_to_long(double value);
+} // namespace gandiva
+
+#endif // ARROW_SRC_HASH_UTILS_H_
diff --git a/src/arrow/cpp/src/gandiva/hash_utils_test.cc b/src/arrow/cpp/src/gandiva/hash_utils_test.cc
new file mode 100644
index 000000000..a8f55e1ed
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/hash_utils_test.cc
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include <unordered_set>
+
+#include "gandiva/execution_context.h"
+#include "gandiva/hash_utils.h"
+
+// SHA1 of the raw bits of several nearby doubles: every digest must be 40
+// characters long and no two inputs may share a digest.
+TEST(TestShaHashUtils, TestSha1Numeric) {
+  gandiva::ExecutionContext ctx;
+  const auto ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+
+  // Bit patterns of the doubles whose SHA1 hashes are compared.
+  const std::vector<uint64_t> inputs = {
+      gandiva::gdv_double_to_long(0.0),
+      gandiva::gdv_double_to_long(0.1),
+      gandiva::gdv_double_to_long(0.2),
+      gandiva::gdv_double_to_long(-0.10000001),
+      gandiva::gdv_double_to_long(-0.0000001),
+      gandiva::gdv_double_to_long(1.000000),
+      gandiva::gdv_double_to_long(-0.0000002),
+      gandiva::gdv_double_to_long(0.999999),
+  };
+
+  const int sha1_size = 40;
+  std::unordered_set<std::string> seen_digests;
+
+  for (auto input : inputs) {
+    int out_length;
+    const char* digest =
+        gandiva::gdv_hash_using_sha1(ctx_ptr, &input, sizeof(input), &out_length);
+    const std::string digest_str(digest, out_length);
+    EXPECT_EQ(digest_str.size(), sha1_size);
+
+    // insert() reports `false` in .second when the digest was already present,
+    // i.e. when two different inputs produced the same hash.
+    EXPECT_TRUE(seen_digests.insert(digest_str).second);
+  }
+}
+
+// SHA256 of the raw bits of several nearby doubles: every digest must be 64
+// characters long and no two inputs may share a digest.
+TEST(TestShaHashUtils, TestSha256Numeric) {
+  gandiva::ExecutionContext ctx;
+  const auto ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+
+  // Bit patterns of the doubles whose SHA256 hashes are compared.
+  const std::vector<uint64_t> inputs = {
+      gandiva::gdv_double_to_long(0.0),
+      gandiva::gdv_double_to_long(0.1),
+      gandiva::gdv_double_to_long(0.2),
+      gandiva::gdv_double_to_long(-0.10000001),
+      gandiva::gdv_double_to_long(-0.0000001),
+      gandiva::gdv_double_to_long(1.000000),
+      gandiva::gdv_double_to_long(-0.0000002),
+      gandiva::gdv_double_to_long(0.999999),
+  };
+
+  const int sha256_size = 64;
+  std::unordered_set<std::string> seen_digests;
+
+  for (auto input : inputs) {
+    int out_length;
+    const char* digest =
+        gandiva::gdv_hash_using_sha256(ctx_ptr, &input, sizeof(input), &out_length);
+    const std::string digest_str(digest, out_length);
+    EXPECT_EQ(digest_str.size(), sha256_size);
+
+    // insert() reports `false` in .second when the digest was already present,
+    // i.e. when two different inputs produced the same hash.
+    EXPECT_TRUE(seen_digests.insert(digest_str).second);
+  }
+}
+
+// SHA1 of two multi-byte UTF-8 strings, checked against reference digests.
+TEST(TestShaHashUtils, TestSha1Varlen) {
+  gandiva::ExecutionContext ctx;
+  const auto ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+
+  struct HashCase {
+    std::string input;
+    std::string expected;
+  };
+
+  // The expected digests were obtained from a shell with:
+  //   echo -n <input-string> | openssl dgst -sha1
+  const HashCase cases[] = {
+      {"ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn\nY [ˈʏpsilɔn], "
+       "Yen [jɛn], Yoga [ˈjoːgɑ]",
+       "160fcdbc2fa694d884868f5fae7a4bae82706185"},
+      {"ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeın\nY [ˈʏpsilɔn], "
+       "Yen [jɛn], Yoga [ˈjoːgɑ] コンニチハ",
+       "a456b3e0f88669d2482170a42fade226a815bee1"},
+  };
+
+  const int sha1_size = 40;
+
+  for (const HashCase& hash_case : cases) {
+    int out_length;
+    const char* digest = gandiva::gdv_hash_using_sha1(
+        ctx_ptr, hash_case.input.c_str(), hash_case.input.size(), &out_length);
+    const std::string digest_str(digest, out_length);
+    EXPECT_EQ(digest_str.size(), sha1_size);
+    EXPECT_EQ(digest_str, hash_case.expected);
+  }
+}
+
+// SHA256 of two multi-byte UTF-8 strings, checked against reference digests.
+TEST(TestShaHashUtils, TestSha256Varlen) {
+  gandiva::ExecutionContext ctx;
+  const auto ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+
+  struct HashCase {
+    std::string input;
+    std::string expected;
+  };
+
+  // The expected digests were obtained from a shell with:
+  //   echo -n <input-string> | openssl dgst -sha256
+  const HashCase cases[] = {
+      {"ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn\nY [ˈʏpsilɔn], "
+       "Yen [jɛn], Yoga [ˈjoːgɑ]",
+       "55aeb2e789871dbd289edae94d4c1c82a1c25ca0bcd5a873924da2fefdd57acb"},
+      {"ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeın\nY [ˈʏpsilɔn], "
+       "Yen [jɛn], Yoga [ˈjoːgɑ] コンニチハ",
+       "86b29c13d0d0e26ea8f85bfa649dc9b8622ae59a4da2409d7d9b463e86e796f2"},
+  };
+
+  const int sha256_size = 64;
+
+  for (const HashCase& hash_case : cases) {
+    int out_length;
+    const char* digest = gandiva::gdv_hash_using_sha256(
+        ctx_ptr, hash_case.input.c_str(), hash_case.input.size(), &out_length);
+    const std::string digest_str(digest, out_length);
+    EXPECT_EQ(digest_str.size(), sha256_size);
+    EXPECT_EQ(digest_str, hash_case.expected);
+  }
+}
diff --git a/src/arrow/cpp/src/gandiva/in_holder.h b/src/arrow/cpp/src/gandiva/in_holder.h
new file mode 100644
index 000000000..d55ab5ec5
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/in_holder.h
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <unordered_set>
+
+#include "arrow/util/hashing.h"
+#include "gandiva/arrow.h"
+#include "gandiva/decimal_scalar.h"
+#include "gandiva/gandiva_aliases.h"
+
+namespace gandiva {
+
+/// Holder for the constant value list of an IN expression.
+///
+/// The values are copied into a private hash set whose maximum load factor is
+/// lowered to 0.25 before the copy; HasValue() is a membership test on that set.
+template <typename Type>
+class InHolder {
+ public:
+  explicit InHolder(const std::unordered_set<Type>& values) {
+    values_.max_load_factor(0.25f);
+    values_.insert(values.begin(), values.end());
+  }
+
+  /// Returns true when `value` is one of the values given to the constructor.
+  bool HasValue(Type value) const { return values_.find(value) != values_.end(); }
+
+ private:
+  std::unordered_set<Type> values_;
+};
+
+/// IN-expression holder specialised for 128-bit decimal scalars. Behaves like
+/// the generic holder: the values are copied into a private hash set with a
+/// maximum load factor of 0.25 and HasValue() is a membership test.
+template <>
+class InHolder<gandiva::DecimalScalar128> {
+ public:
+  explicit InHolder(const std::unordered_set<gandiva::DecimalScalar128>& values) {
+    values_.max_load_factor(0.25f);
+    values_.insert(values.begin(), values.end());
+  }
+
+  /// Returns true when `value` is one of the values given to the constructor.
+  bool HasValue(gandiva::DecimalScalar128 value) const {
+    return values_.find(value) != values_.end();
+  }
+
+ private:
+  std::unordered_set<gandiva::DecimalScalar128> values_;
+};
+
+/// IN-expression holder specialised for strings.
+///
+/// The holder owns the strings in `values_` and keeps a second set of
+/// string_views that refer to those owned strings. Lookups take a string_view,
+/// so the caller does not have to build a std::string to probe the set.
+template <>
+class InHolder<std::string> {
+ public:
+  explicit InHolder(std::unordered_set<std::string> values)
+      : values_(std::move(values)) {
+    values_lookup_.max_load_factor(0.25f);
+    // Index every owned string by a view onto it.
+    for (auto it = values_.cbegin(); it != values_.cend(); ++it) {
+      values_lookup_.emplace(*it);
+    }
+  }
+
+  /// Returns true when `value` equals one of the strings given to the constructor.
+  bool HasValue(arrow::util::string_view value) const {
+    return values_lookup_.find(value) != values_lookup_.end();
+  }
+
+ private:
+  /// Hash functor for string_view keys, delegating to Arrow's string hash.
+  struct string_view_hash {
+   public:
+    std::size_t operator()(arrow::util::string_view v) const {
+      return arrow::internal::ComputeStringHash<0>(v.data(), v.length());
+    }
+  };
+
+  // Views into the strings stored in values_.
+  std::unordered_set<arrow::util::string_view, string_view_hash> values_lookup_;
+  // Owning storage for the strings; const so the views above stay valid.
+  const std::unordered_set<std::string> values_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/jni/CMakeLists.txt b/src/arrow/cpp/src/gandiva/jni/CMakeLists.txt
new file mode 100644
index 000000000..046934141
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/jni/CMakeLists.txt
@@ -0,0 +1,107 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Abort configuration early when CMake is older than 3.11.
+if(CMAKE_VERSION VERSION_LESS 3.11)
+ message(FATAL_ERROR "Building the Gandiva JNI bindings requires CMake version >= 3.11")
+endif()
+
+# With MSVC, add the PROTOBUF_USE_DLLS compile definition.
+if(MSVC)
+ add_definitions(-DPROTOBUF_USE_DLLS)
+endif()
+
+# Find JNI
+find_package(JNI REQUIRED)
+
+# The protobuf compiler output (Types.pb.cc / Types.pb.h) is written to the
+# current binary directory.
+set(PROTO_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
+set(PROTO_OUTPUT_FILES "${PROTO_OUTPUT_DIR}/Types.pb.cc")
+set(PROTO_OUTPUT_FILES ${PROTO_OUTPUT_FILES} "${PROTO_OUTPUT_DIR}/Types.pb.h")
+
+# Mark the two files as generated so they need not exist at configure time.
+set_source_files_properties(${PROTO_OUTPUT_FILES} PROPERTIES GENERATED TRUE)
+
+# Absolute path of Types.proto, used as a dependency of the command below.
+get_filename_component(ABS_GANDIVA_PROTO
+ ${CMAKE_SOURCE_DIR}/src/gandiva/proto/Types.proto ABSOLUTE)
+
+# Run protoc on Types.proto to produce the C++ sources listed in
+# PROTO_OUTPUT_FILES.
+add_custom_command(OUTPUT ${PROTO_OUTPUT_FILES}
+ COMMAND ${ARROW_PROTOBUF_PROTOC} --proto_path
+ ${CMAKE_SOURCE_DIR}/src/gandiva/proto --cpp_out
+ ${PROTO_OUTPUT_DIR}
+ ${CMAKE_SOURCE_DIR}/src/gandiva/proto/Types.proto
+ DEPENDS ${ABS_GANDIVA_PROTO} ${ARROW_PROTOBUF_LIBPROTOBUF}
+ COMMENT "Running PROTO compiler on Types.proto"
+ VERBATIM)
+
+# Target that builds the generated protobuf files; gandiva_jni depends on it.
+add_custom_target(gandiva_jni_proto ALL DEPENDS ${PROTO_OUTPUT_FILES})
+set(PROTO_SRCS "${PROTO_OUTPUT_DIR}/Types.pb.cc")
+set(PROTO_HDRS "${PROTO_OUTPUT_DIR}/Types.pb.h")
+
+# Create the jni header file (from the java class).
+set(JNI_HEADERS_DIR "${CMAKE_CURRENT_BINARY_DIR}/java")
+add_subdirectory(../../../../java/gandiva ./java/gandiva)
+
+# Link against protobuf plus either the static or the shared gandiva library,
+# depending on ARROW_BUILD_STATIC.
+set(GANDIVA_LINK_LIBS ${ARROW_PROTOBUF_LIBPROTOBUF})
+if(ARROW_BUILD_STATIC)
+ list(APPEND GANDIVA_LINK_LIBS gandiva_static)
+else()
+ list(APPEND GANDIVA_LINK_LIBS gandiva_shared)
+endif()
+
+# Hand-written JNI sources plus the generated protobuf source.
+set(GANDIVA_JNI_SOURCES
+ config_builder.cc
+ config_holder.cc
+ expression_registry_helper.cc
+ jni_common.cc
+ ${PROTO_SRCS})
+
+# For users of gandiva_jni library (including integ tests), include-dir is :
+# /usr/**/include dir after install,
+# cpp/include during build
+# For building gandiva_jni library itself, include-dir (in addition to above) is :
+# cpp/src
+add_arrow_lib(gandiva_jni
+ SOURCES
+ ${GANDIVA_JNI_SOURCES}
+ OUTPUTS
+ GANDIVA_JNI_LIBRARIES
+ SHARED_PRIVATE_LINK_LIBS
+ ${GANDIVA_LINK_LIBS}
+ STATIC_LINK_LIBS
+ ${GANDIVA_LINK_LIBS}
+ DEPENDENCIES
+ ${GANDIVA_LINK_LIBS}
+ gandiva_java
+ gandiva_jni_headers
+ gandiva_jni_proto
+ EXTRA_INCLUDES
+ $<INSTALL_INTERFACE:include>
+ $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/include>
+ $<BUILD_INTERFACE:${JNI_HEADERS_DIR}>
+ PRIVATE_INCLUDES
+ ${JNI_INCLUDE_DIRS}
+ ${CMAKE_CURRENT_BINARY_DIR})
+
+# Make the `gandiva` target depend on the JNI library targets created above.
+add_dependencies(gandiva ${GANDIVA_JNI_LIBRARIES})
+
+if(ARROW_BUILD_SHARED)
+ # filter out everything that is not needed for the jni bridge
+ # statically linked stdc++ has conflicts with stdc++ loaded by other libraries.
+ if(CXX_LINKER_SUPPORTS_VERSION_SCRIPT)
+ set_target_properties(gandiva_jni_shared
+ PROPERTIES LINK_FLAGS
+ "-Wl,--version-script=${CMAKE_SOURCE_DIR}/src/gandiva/jni/symbols.map"
+ )
+ endif()
+endif()
diff --git a/src/arrow/cpp/src/gandiva/jni/config_builder.cc b/src/arrow/cpp/src/gandiva/jni/config_builder.cc
new file mode 100644
index 000000000..b115210ce
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/jni/config_builder.cc
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <string>
+
+#include "gandiva/configuration.h"
+#include "gandiva/jni/config_holder.h"
+#include "gandiva/jni/env_helper.h"
+#include "jni/org_apache_arrow_gandiva_evaluator_ConfigurationBuilder.h"
+
+using gandiva::ConfigHolder;
+using gandiva::Configuration;
+using gandiva::ConfigurationBuilder;
+
+/*
+ * Class:     org_apache_arrow_gandiva_evaluator_ConfigurationBuilder
+ * Method:    buildConfigInstance
+ * Signature: (ZZ)J
+ *
+ * Builds a Configuration, applies the two flags received from Java, stores it
+ * in ConfigHolder and returns the id under which it was stored.
+ */
+JNIEXPORT jlong JNICALL
+Java_org_apache_arrow_gandiva_evaluator_ConfigurationBuilder_buildConfigInstance(
+    JNIEnv* env, jobject configuration, jboolean optimize, jboolean target_host_cpu) {
+  ConfigurationBuilder builder;
+  std::shared_ptr<Configuration> new_config = builder.build();
+
+  new_config->set_optimize(optimize);
+  new_config->target_host_cpu(target_host_cpu);
+
+  const auto stored_id = ConfigHolder::MapInsert(new_config);
+  return stored_id;
+}
+
+/*
+ * Class:     org_apache_arrow_gandiva_evaluator_ConfigurationBuilder
+ * Method:    releaseConfigInstance
+ * Signature: (J)V
+ *
+ * Removes the configuration stored under `config_id` from ConfigHolder.
+ */
+JNIEXPORT void JNICALL
+Java_org_apache_arrow_gandiva_evaluator_ConfigurationBuilder_releaseConfigInstance(
+    JNIEnv* env, jobject configuration, jlong config_id) {
+  // MapErase takes an int64_t id; spell out the conversion from jlong.
+  const int64_t holder_id = config_id;
+  ConfigHolder::MapErase(holder_id);
+}
diff --git a/src/arrow/cpp/src/gandiva/jni/config_holder.cc b/src/arrow/cpp/src/gandiva/jni/config_holder.cc
new file mode 100644
index 000000000..11d305c81
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/jni/config_holder.cc
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/jni/config_holder.h"
+
+#include <cstdint>
+
+namespace gandiva {
+// Next id to be handed out by ConfigHolder::MapInsert; ids start at 1.
+int64_t ConfigHolder::config_id_ = 1;
+
+// map of configuration objects created so far
+std::unordered_map<int64_t, std::shared_ptr<Configuration>>
+ ConfigHolder::configuration_map_;
+
+// Mutex that ConfigHolder's static methods lock while mutating the map.
+std::mutex ConfigHolder::g_mtx_;
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/jni/config_holder.h b/src/arrow/cpp/src/gandiva/jni/config_holder.h
new file mode 100644
index 000000000..3fdb7a01d
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/jni/config_holder.h
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+
+#include "gandiva/configuration.h"
+
+namespace gandiva {
+
+/// Process-wide registry that maps integer ids to Configuration objects.
+///
+/// All three operations take `g_mtx_` for their whole duration, so inserts,
+/// erases and lookups may be issued from different threads.
+class ConfigHolder {
+ public:
+  /// Stores `config` under a newly allocated id and returns that id.
+  static int64_t MapInsert(std::shared_ptr<Configuration> config) {
+    // lock_guard releases the mutex even if the insertion throws.
+    std::lock_guard<std::mutex> lock(g_mtx_);
+
+    const int64_t result = config_id_++;
+    configuration_map_.emplace(result, std::move(config));
+    return result;
+  }
+
+  /// Removes the entry stored under `config_id`; unknown ids are ignored.
+  static void MapErase(int64_t config_id) {
+    std::lock_guard<std::mutex> lock(g_mtx_);
+    configuration_map_.erase(config_id);
+  }
+
+  /// Returns the configuration stored under `config_id`, or nullptr when no
+  /// such entry exists.
+  static std::shared_ptr<Configuration> MapLookup(int64_t config_id) {
+    // The lookup must hold the mutex too: reading the map while another thread
+    // inserts or erases is a data race.
+    std::lock_guard<std::mutex> lock(g_mtx_);
+
+    std::shared_ptr<Configuration> result = nullptr;
+    auto entry = configuration_map_.find(config_id);
+    if (entry != configuration_map_.end()) {
+      result = entry->second;
+    }
+    return result;
+  }
+
+ private:
+  // map of configuration objects created so far
+  static std::unordered_map<int64_t, std::shared_ptr<Configuration>> configuration_map_;
+
+  // protects configuration_map_ and config_id_
+  static std::mutex g_mtx_;
+
+  // next configuration id to hand out; only modified while g_mtx_ is held
+  static int64_t config_id_;
+};
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/jni/env_helper.h b/src/arrow/cpp/src/gandiva/jni/env_helper.h
new file mode 100644
index 000000000..5ae13c807
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/jni/env_helper.h
@@ -0,0 +1,23 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <jni.h>
+
+// class references
+// Only declared here; the definition is in another translation unit.
+// NOTE(review): presumably a cached reference to the Java ConfigurationBuilder
+// class - confirm where it is initialised and released.
+extern jclass configuration_builder_class_;
diff --git a/src/arrow/cpp/src/gandiva/jni/expression_registry_helper.cc b/src/arrow/cpp/src/gandiva/jni/expression_registry_helper.cc
new file mode 100644
index 000000000..0d1f74ba6
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/jni/expression_registry_helper.cc
@@ -0,0 +1,190 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "jni/org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper.h"
+
+#include <memory>
+
+#include "Types.pb.h"
+#include "arrow/util/logging.h"
+#include "gandiva/arrow.h"
+#include "gandiva/expression_registry.h"
+
+using gandiva::DataTypePtr;
+using gandiva::ExpressionRegistry;
+
+// Translates an Arrow time unit into the matching protobuf TimeUnit value.
+types::TimeUnit MapTimeUnit(arrow::TimeUnit::type& unit) {
+  // SEC doubles as the fallback for the (expected unreachable) case where
+  // `unit` matches none of the enumerators; it also keeps gcc satisfied.
+  types::TimeUnit mapped = types::TimeUnit::SEC;
+  switch (unit) {
+    case arrow::TimeUnit::SECOND:
+      mapped = types::TimeUnit::SEC;
+      break;
+    case arrow::TimeUnit::MILLI:
+      mapped = types::TimeUnit::MILLISEC;
+      break;
+    case arrow::TimeUnit::MICRO:
+      mapped = types::TimeUnit::MICROSEC;
+      break;
+    case arrow::TimeUnit::NANO:
+      mapped = types::TimeUnit::NANOSEC;
+      break;
+  }
+  return mapped;
+}
+
+// Fills `gandiva_data_type` with the protobuf description of the Arrow type
+// `type`: the GandivaType tag plus, for time-like types, the time unit and,
+// for intervals, the interval kind. Decimals are reported with precision and
+// scale set to 0. Any other Arrow type id trips a DCHECK.
+void ArrowToProtobuf(DataTypePtr type, types::ExtGandivaType* gandiva_data_type) {
+  switch (type->id()) {
+    case arrow::Type::BOOL:
+      gandiva_data_type->set_type(types::GandivaType::BOOL);
+      break;
+    case arrow::Type::UINT8:
+      gandiva_data_type->set_type(types::GandivaType::UINT8);
+      break;
+    case arrow::Type::INT8:
+      gandiva_data_type->set_type(types::GandivaType::INT8);
+      break;
+    case arrow::Type::UINT16:
+      gandiva_data_type->set_type(types::GandivaType::UINT16);
+      break;
+    case arrow::Type::INT16:
+      gandiva_data_type->set_type(types::GandivaType::INT16);
+      break;
+    case arrow::Type::UINT32:
+      gandiva_data_type->set_type(types::GandivaType::UINT32);
+      break;
+    case arrow::Type::INT32:
+      gandiva_data_type->set_type(types::GandivaType::INT32);
+      break;
+    case arrow::Type::UINT64:
+      gandiva_data_type->set_type(types::GandivaType::UINT64);
+      break;
+    case arrow::Type::INT64:
+      gandiva_data_type->set_type(types::GandivaType::INT64);
+      break;
+    case arrow::Type::HALF_FLOAT:
+      gandiva_data_type->set_type(types::GandivaType::HALF_FLOAT);
+      break;
+    case arrow::Type::FLOAT:
+      gandiva_data_type->set_type(types::GandivaType::FLOAT);
+      break;
+    case arrow::Type::DOUBLE:
+      gandiva_data_type->set_type(types::GandivaType::DOUBLE);
+      break;
+    case arrow::Type::STRING:
+      gandiva_data_type->set_type(types::GandivaType::UTF8);
+      break;
+    case arrow::Type::BINARY:
+      gandiva_data_type->set_type(types::GandivaType::BINARY);
+      break;
+    case arrow::Type::DATE32:
+      gandiva_data_type->set_type(types::GandivaType::DATE32);
+      break;
+    case arrow::Type::DATE64:
+      gandiva_data_type->set_type(types::GandivaType::DATE64);
+      break;
+    case arrow::Type::TIMESTAMP: {
+      gandiva_data_type->set_type(types::GandivaType::TIMESTAMP);
+      std::shared_ptr<arrow::TimestampType> cast_time_stamp_type =
+          std::dynamic_pointer_cast<arrow::TimestampType>(type);
+      arrow::TimeUnit::type unit = cast_time_stamp_type->unit();
+      types::TimeUnit time_unit = MapTimeUnit(unit);
+      gandiva_data_type->set_timeunit(time_unit);
+      break;
+    }
+    case arrow::Type::TIME32: {
+      gandiva_data_type->set_type(types::GandivaType::TIME32);
+      std::shared_ptr<arrow::Time32Type> cast_time_32_type =
+          std::dynamic_pointer_cast<arrow::Time32Type>(type);
+      arrow::TimeUnit::type unit = cast_time_32_type->unit();
+      types::TimeUnit time_unit = MapTimeUnit(unit);
+      gandiva_data_type->set_timeunit(time_unit);
+      break;
+    }
+    case arrow::Type::TIME64: {
+      // A 64-bit time must be reported as TIME64; tagging it TIME32 would
+      // make it indistinguishable from a 32-bit time on the receiving side.
+      gandiva_data_type->set_type(types::GandivaType::TIME64);
+      std::shared_ptr<arrow::Time64Type> cast_time_64_type =
+          std::dynamic_pointer_cast<arrow::Time64Type>(type);
+      arrow::TimeUnit::type unit = cast_time_64_type->unit();
+      types::TimeUnit time_unit = MapTimeUnit(unit);
+      gandiva_data_type->set_timeunit(time_unit);
+      break;
+    }
+    case arrow::Type::NA:
+      gandiva_data_type->set_type(types::GandivaType::NONE);
+      break;
+    case arrow::Type::DECIMAL: {
+      gandiva_data_type->set_type(types::GandivaType::DECIMAL);
+      // NOTE(review): precision and scale are always sent as 0 here, not the
+      // values of `type` - confirm that consumers do not rely on them.
+      gandiva_data_type->set_precision(0);
+      gandiva_data_type->set_scale(0);
+      break;
+    }
+    case arrow::Type::INTERVAL_MONTHS:
+      gandiva_data_type->set_type(types::GandivaType::INTERVAL);
+      gandiva_data_type->set_intervaltype(types::IntervalType::YEAR_MONTH);
+      break;
+    case arrow::Type::INTERVAL_DAY_TIME:
+      gandiva_data_type->set_type(types::GandivaType::INTERVAL);
+      gandiva_data_type->set_intervaltype(types::IntervalType::DAY_TIME);
+      break;
+    default:
+      // un-supported types. test ensures that
+      // when one of these are added build breaks.
+      DCHECK(false);
+  }
+}
+
+/*
+ * Class:     org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper
+ * Method:    getGandivaSupportedDataTypes
+ * Signature: ()[B
+ *
+ * Serializes the data types reported by ExpressionRegistry::supported_types()
+ * into a types::GandivaDataTypes protobuf message and returns its bytes as a
+ * Java byte array. Returns null when the Java array cannot be allocated.
+ */
+JNIEXPORT jbyteArray JNICALL
+Java_org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper_getGandivaSupportedDataTypes(  // NOLINT
+    JNIEnv* env, jobject types_helper) {
+  types::GandivaDataTypes gandiva_data_types;
+  auto supported_types = ExpressionRegistry::supported_types();
+  for (auto const& type : supported_types) {
+    types::ExtGandivaType* gandiva_data_type = gandiva_data_types.add_datatype();
+    ArrowToProtobuf(type, gandiva_data_type);
+  }
+
+  const auto size = gandiva_data_types.ByteSizeLong();
+  std::unique_ptr<jbyte[]> buffer{new jbyte[size]};
+  gandiva_data_types.SerializeToArray(reinterpret_cast<void*>(buffer.get()),
+                                      static_cast<int>(size));
+
+  jbyteArray ret = env->NewByteArray(static_cast<jsize>(size));
+  if (ret == nullptr) {
+    // NewByteArray returns NULL when the array cannot be constructed; it must
+    // not be handed to SetByteArrayRegion in that case.
+    return nullptr;
+  }
+  env->SetByteArrayRegion(ret, 0, static_cast<jsize>(size), buffer.get());
+  return ret;
+}
+
+/*
+ * Class:     org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper
+ * Method:    getGandivaSupportedFunctions
+ * Signature: ()[B
+ *
+ * Serializes every function signature known to ExpressionRegistry (name,
+ * return type and parameter types) into a types::GandivaFunctions protobuf
+ * message and returns its bytes as a Java byte array. Returns null when the
+ * Java array cannot be allocated.
+ */
+JNIEXPORT jbyteArray JNICALL
+Java_org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper_getGandivaSupportedFunctions(  // NOLINT
+    JNIEnv* env, jobject types_helper) {
+  ExpressionRegistry expr_registry;
+  types::GandivaFunctions gandiva_functions;
+  for (auto function = expr_registry.function_signature_begin();
+       function != expr_registry.function_signature_end(); function++) {
+    // Dereference once and keep the result alive for the whole iteration, so
+    // that whatever its accessors return stays valid while it is being read.
+    const auto& signature = *function;
+
+    types::FunctionSignature* function_signature = gandiva_functions.add_function();
+    function_signature->set_name(signature.base_name());
+    types::ExtGandivaType* return_type = function_signature->mutable_returntype();
+    ArrowToProtobuf(signature.ret_type(), return_type);
+    for (auto& param_type : signature.param_types()) {
+      types::ExtGandivaType* proto_param_type = function_signature->add_paramtypes();
+      ArrowToProtobuf(param_type, proto_param_type);
+    }
+  }
+
+  const auto size = gandiva_functions.ByteSizeLong();
+  std::unique_ptr<jbyte[]> buffer{new jbyte[size]};
+  gandiva_functions.SerializeToArray(reinterpret_cast<void*>(buffer.get()),
+                                     static_cast<int>(size));
+
+  jbyteArray ret = env->NewByteArray(static_cast<jsize>(size));
+  if (ret == nullptr) {
+    // NewByteArray returns NULL when the array cannot be constructed; it must
+    // not be handed to SetByteArrayRegion in that case.
+    return nullptr;
+  }
+  env->SetByteArrayRegion(ret, 0, static_cast<jsize>(size), buffer.get());
+  return ret;
+}
diff --git a/src/arrow/cpp/src/gandiva/jni/id_to_module_map.h b/src/arrow/cpp/src/gandiva/jni/id_to_module_map.h
new file mode 100644
index 000000000..98100955b
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/jni/id_to_module_map.h
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <unordered_map>
+#include <utility>
+
+namespace gandiva {
+
+/// Registry that hands out integer ids for holder objects and resolves those
+/// ids back to the holders. Ids start at kInitModuleId and grow by one per
+/// insertion. Every operation holds `mtx_` for its whole duration.
+///
+/// HOLDER must be constructible from nullptr, since that is what Lookup()
+/// returns for an unknown id.
+template <typename HOLDER>
+class IdToModuleMap {
+ public:
+  IdToModuleMap() : module_id_(kInitModuleId) {}
+
+  /// Stores `holder` under a newly allocated id and returns that id.
+  jlong Insert(HOLDER holder) {
+    // lock_guard releases the mutex even if the insertion throws.
+    std::lock_guard<std::mutex> guard(mtx_);
+    jlong result = module_id_++;
+    map_.emplace(result, std::move(holder));
+    return result;
+  }
+
+  /// Removes the entry stored under `module_id`; unknown ids are ignored.
+  void Erase(jlong module_id) {
+    std::lock_guard<std::mutex> guard(mtx_);
+    map_.erase(module_id);
+  }
+
+  /// Returns the holder stored under `module_id`, or a null holder when the id
+  /// is unknown.
+  HOLDER Lookup(jlong module_id) {
+    HOLDER result = nullptr;
+    std::lock_guard<std::mutex> guard(mtx_);
+    auto entry = map_.find(module_id);
+    if (entry != map_.end()) {
+      result = entry->second;
+    }
+    return result;
+  }
+
+ private:
+  static const int kInitModuleId = 4;
+
+  // next id to hand out; only modified while mtx_ is held
+  int64_t module_id_;
+  std::mutex mtx_;
+  // map from module ids returned to Java and module pointers
+  std::unordered_map<jlong, HOLDER> map_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/jni/jni_common.cc b/src/arrow/cpp/src/gandiva/jni/jni_common.cc
new file mode 100644
index 000000000..5a4cbb031
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/jni/jni_common.cc
@@ -0,0 +1,1055 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <google/protobuf/io/coded_stream.h>
+
+#include <map>
+#include <memory>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <arrow/builder.h>
+#include <arrow/record_batch.h>
+#include <arrow/type.h>
+
+#include "Types.pb.h"
+#include "gandiva/configuration.h"
+#include "gandiva/decimal_scalar.h"
+#include "gandiva/filter.h"
+#include "gandiva/jni/config_holder.h"
+#include "gandiva/jni/env_helper.h"
+#include "gandiva/jni/id_to_module_map.h"
+#include "gandiva/jni/module_holder.h"
+#include "gandiva/projector.h"
+#include "gandiva/selection_vector.h"
+#include "gandiva/tree_expr_builder.h"
+#include "jni/org_apache_arrow_gandiva_evaluator_JniWrapper.h"
+
+using gandiva::ConditionPtr;
+using gandiva::DataTypePtr;
+using gandiva::ExpressionPtr;
+using gandiva::ExpressionVector;
+using gandiva::FieldPtr;
+using gandiva::FieldVector;
+using gandiva::Filter;
+using gandiva::NodePtr;
+using gandiva::NodeVector;
+using gandiva::Projector;
+using gandiva::SchemaPtr;
+using gandiva::Status;
+using gandiva::TreeExprBuilder;
+
+using gandiva::ArrayDataVector;
+using gandiva::ConfigHolder;
+using gandiva::Configuration;
+using gandiva::ConfigurationBuilder;
+using gandiva::FilterHolder;
+using gandiva::ProjectorHolder;
+
+// Forward declaration: the per-node converters below call ProtoTypeToNode before it is defined.
+NodePtr ProtoTypeToNode(const types::TreeNode& node);
+
+static jint JNI_VERSION = JNI_VERSION_1_6;
+
+// Class ref with external linkage (used by other modules); set in JNI_OnLoad, released in JNI_OnUnload.
+jclass configuration_builder_class_;
+
+// Class, method and field handles used only in this file; all are assigned in JNI_OnLoad.
+static jclass gandiva_exception_;
+static jclass vector_expander_class_;
+static jclass vector_expander_ret_class_;
+static jmethodID vector_expander_method_;
+static jfieldID vector_expander_ret_address_;
+static jfieldID vector_expander_ret_capacity_;
+
+// Registries mapping the module ids returned to Java to projector and filter holders.
+gandiva::IdToModuleMap<std::shared_ptr<ProjectorHolder>> projector_modules_;
+gandiva::IdToModuleMap<std::shared_ptr<FilterHolder>> filter_modules_;
+
+// Look up `class_name` and promote it to a global reference.
+// Returns nullptr when the class cannot be found or the global reference
+// cannot be created; an exception raised by FindClass is left pending.
+static jclass MakeGlobalClassRef(JNIEnv* env, const char* class_name) {
+  jclass local_class = env->FindClass(class_name);
+  if (local_class == nullptr) {
+    return nullptr;
+  }
+  jclass global_class = static_cast<jclass>(env->NewGlobalRef(local_class));
+  env->DeleteLocalRef(local_class);
+  return global_class;
+}
+
+// Library load hook: caches the Java classes, method id and field ids used by
+// this file.
+//
+// Returns JNI_VERSION on success. Returns JNI_ERR when the JNIEnv cannot be
+// obtained or when any class/method/field lookup fails; each lookup is checked
+// before the next JNI call is made so that no JNI function is invoked on a
+// null class handle. Global refs created before a failed lookup are not
+// released here.
+jint JNI_OnLoad(JavaVM* vm, void* reserved) {
+  JNIEnv* env;
+  if (vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION) != JNI_OK) {
+    return JNI_ERR;
+  }
+
+  configuration_builder_class_ = MakeGlobalClassRef(
+      env, "org/apache/arrow/gandiva/evaluator/ConfigurationBuilder");
+  if (configuration_builder_class_ == nullptr) {
+    return JNI_ERR;
+  }
+
+  gandiva_exception_ =
+      MakeGlobalClassRef(env, "org/apache/arrow/gandiva/exceptions/GandivaException");
+  if (gandiva_exception_ == nullptr) {
+    return JNI_ERR;
+  }
+
+  vector_expander_class_ =
+      MakeGlobalClassRef(env, "org/apache/arrow/gandiva/evaluator/VectorExpander");
+  if (vector_expander_class_ == nullptr) {
+    return JNI_ERR;
+  }
+
+  vector_expander_method_ = env->GetMethodID(
+      vector_expander_class_, "expandOutputVectorAtIndex",
+      "(IJ)Lorg/apache/arrow/gandiva/evaluator/VectorExpander$ExpandResult;");
+  if (vector_expander_method_ == nullptr) {
+    return JNI_ERR;
+  }
+
+  vector_expander_ret_class_ = MakeGlobalClassRef(
+      env, "org/apache/arrow/gandiva/evaluator/VectorExpander$ExpandResult");
+  if (vector_expander_ret_class_ == nullptr) {
+    return JNI_ERR;
+  }
+
+  vector_expander_ret_address_ =
+      env->GetFieldID(vector_expander_ret_class_, "address", "J");
+  if (vector_expander_ret_address_ == nullptr) {
+    return JNI_ERR;
+  }
+
+  vector_expander_ret_capacity_ =
+      env->GetFieldID(vector_expander_ret_class_, "capacity", "J");
+  if (vector_expander_ret_capacity_ == nullptr) {
+    return JNI_ERR;
+  }
+
+  return JNI_VERSION;
+}
+
+// Library unload hook: releases the global class references created in
+// JNI_OnLoad. Does nothing when a JNIEnv cannot be obtained, since the refs
+// cannot be deleted without one.
+void JNI_OnUnload(JavaVM* vm, void* reserved) {
+  JNIEnv* env = nullptr;
+  if (vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION) != JNI_OK) {
+    return;
+  }
+  env->DeleteGlobalRef(configuration_builder_class_);
+  env->DeleteGlobalRef(gandiva_exception_);
+  env->DeleteGlobalRef(vector_expander_class_);
+  env->DeleteGlobalRef(vector_expander_ret_class_);
+}
+
+// Build an arrow time32 type from the time unit stored in `ext_type`.
+// SEC and MILLISEC are accepted; any other unit is reported on stderr and
+// nullptr is returned.
+DataTypePtr ProtoTypeToTime32(const types::ExtGandivaType& ext_type) {
+  const auto unit = ext_type.timeunit();
+  if (unit == types::SEC) {
+    return arrow::time32(arrow::TimeUnit::SECOND);
+  }
+  if (unit == types::MILLISEC) {
+    return arrow::time32(arrow::TimeUnit::MILLI);
+  }
+  std::cerr << "Unknown time unit: " << unit << " for time32\n";
+  return nullptr;
+}
+
+// Build an arrow time64 type from the time unit stored in `ext_type`.
+// MICROSEC and NANOSEC are accepted; any other unit is reported on stderr and
+// nullptr is returned.
+DataTypePtr ProtoTypeToTime64(const types::ExtGandivaType& ext_type) {
+  const auto unit = ext_type.timeunit();
+  if (unit == types::MICROSEC) {
+    return arrow::time64(arrow::TimeUnit::MICRO);
+  }
+  if (unit == types::NANOSEC) {
+    return arrow::time64(arrow::TimeUnit::NANO);
+  }
+  std::cerr << "Unknown time unit: " << unit << " for time64\n";
+  return nullptr;
+}
+
+// Build an arrow timestamp type from the time unit stored in `ext_type`.
+// All four units (SEC, MILLISEC, MICROSEC, NANOSEC) are accepted; any other
+// value is reported on stderr and nullptr is returned.
+DataTypePtr ProtoTypeToTimestamp(const types::ExtGandivaType& ext_type) {
+  const auto proto_unit = ext_type.timeunit();
+  arrow::TimeUnit::type arrow_unit = arrow::TimeUnit::SECOND;
+  switch (proto_unit) {
+    case types::SEC:
+      arrow_unit = arrow::TimeUnit::SECOND;
+      break;
+    case types::MILLISEC:
+      arrow_unit = arrow::TimeUnit::MILLI;
+      break;
+    case types::MICROSEC:
+      arrow_unit = arrow::TimeUnit::MICRO;
+      break;
+    case types::NANOSEC:
+      arrow_unit = arrow::TimeUnit::NANO;
+      break;
+    default:
+      std::cerr << "Unknown time unit: " << proto_unit << " for timestamp\n";
+      return nullptr;
+  }
+  return arrow::timestamp(arrow_unit);
+}
+
+// Build an arrow interval type from the interval kind stored in `ext_type`.
+// YEAR_MONTH and DAY_TIME are accepted; any other kind is reported on stderr
+// and nullptr is returned.
+DataTypePtr ProtoTypeToInterval(const types::ExtGandivaType& ext_type) {
+  const auto interval_kind = ext_type.intervaltype();
+  if (interval_kind == types::YEAR_MONTH) {
+    return arrow::month_interval();
+  }
+  if (interval_kind == types::DAY_TIME) {
+    return arrow::day_time_interval();
+  }
+  std::cerr << "Unknown interval type: " << interval_kind << "\n";
+  return nullptr;
+}
+
+// Translate a protobuf ExtGandivaType into the matching arrow data type.
+//
+// Time, timestamp and interval types are delegated to their dedicated
+// converters. Types that are listed but not handled here, and unknown type
+// codes, are reported on stderr and yield nullptr.
+DataTypePtr ProtoTypeToDataType(const types::ExtGandivaType& ext_type) {
+  const auto type_code = ext_type.type();
+  switch (type_code) {
+    // null / boolean
+    case types::NONE:
+      return arrow::null();
+    case types::BOOL:
+      return arrow::boolean();
+
+    // integers
+    case types::UINT8:
+      return arrow::uint8();
+    case types::INT8:
+      return arrow::int8();
+    case types::UINT16:
+      return arrow::uint16();
+    case types::INT16:
+      return arrow::int16();
+    case types::UINT32:
+      return arrow::uint32();
+    case types::INT32:
+      return arrow::int32();
+    case types::UINT64:
+      return arrow::uint64();
+    case types::INT64:
+      return arrow::int64();
+
+    // floating point
+    case types::HALF_FLOAT:
+      return arrow::float16();
+    case types::FLOAT:
+      return arrow::float32();
+    case types::DOUBLE:
+      return arrow::float64();
+
+    // variable length
+    case types::UTF8:
+      return arrow::utf8();
+    case types::BINARY:
+      return arrow::binary();
+
+    // dates
+    case types::DATE32:
+      return arrow::date32();
+    case types::DATE64:
+      return arrow::date64();
+
+    case types::DECIMAL:
+      // TODO: error handling
+      return arrow::decimal(ext_type.precision(), ext_type.scale());
+
+    // parameterised types handled by helper converters
+    case types::TIME32:
+      return ProtoTypeToTime32(ext_type);
+    case types::TIME64:
+      return ProtoTypeToTime64(ext_type);
+    case types::TIMESTAMP:
+      return ProtoTypeToTimestamp(ext_type);
+    case types::INTERVAL:
+      return ProtoTypeToInterval(ext_type);
+
+    // listed in the protobuf enum but not converted here
+    case types::FIXED_SIZE_BINARY:
+    case types::LIST:
+    case types::STRUCT:
+    case types::UNION:
+    case types::DICTIONARY:
+    case types::MAP:
+      std::cerr << "Unhandled data type: " << type_code << "\n";
+      return nullptr;
+
+    default:
+      std::cerr << "Unknown data type: " << type_code << "\n";
+      return nullptr;
+  }
+}
+
+// Convert a protobuf Field into an arrow field.
+//
+// The field is nullable unless the protobuf explicitly says otherwise.
+// Returns nullptr when the field's data type cannot be converted, so that the
+// nullptr checks performed by callers actually catch unsupported types
+// instead of receiving a field whose type is null.
+FieldPtr ProtoTypeToField(const types::Field& f) {
+  const std::string& name = f.name();
+  DataTypePtr type = ProtoTypeToDataType(f.type());
+  if (type == nullptr) {
+    std::cerr << "Unable to resolve data type for field: " << name << "\n";
+    return nullptr;
+  }
+
+  bool nullable = true;
+  if (f.has_nullable()) {
+    nullable = f.nullable();
+  }
+
+  return field(name, type, nullable);
+}
+
+// Convert a protobuf FieldNode into an expression-tree field node.
+// Returns nullptr (after logging to stderr) when the field conversion yields
+// nullptr.
+NodePtr ProtoTypeToFieldNode(const types::FieldNode& node) {
+  FieldPtr converted = ProtoTypeToField(node.field());
+  if (converted != nullptr) {
+    return TreeExprBuilder::MakeField(converted);
+  }
+
+  std::cerr << "Unable to create field node from protobuf\n";
+  return nullptr;
+}
+
+// Convert a protobuf FunctionNode into an expression-tree function node.
+//
+// Arguments are converted first, in order; the return type is converted
+// afterwards. Returns nullptr (after logging to stderr) as soon as any
+// argument or the return type cannot be converted.
+NodePtr ProtoTypeToFnNode(const types::FunctionNode& node) {
+  const std::string& fn_name = node.functionname();
+
+  NodeVector args;
+  const int arg_count = node.inargs_size();
+  for (int idx = 0; idx < arg_count; ++idx) {
+    NodePtr converted = ProtoTypeToNode(node.inargs(idx));
+    if (converted == nullptr) {
+      std::cerr << "Unable to create argument for function: " << fn_name << "\n";
+      return nullptr;
+    }
+    args.push_back(converted);
+  }
+
+  DataTypePtr result_type = ProtoTypeToDataType(node.returntype());
+  if (result_type != nullptr) {
+    return TreeExprBuilder::MakeFunction(fn_name, args, result_type);
+  }
+
+  std::cerr << "Unknown return type for function: " << fn_name << "\n";
+  return nullptr;
+}
+
+// Convert a protobuf IfNode into an expression-tree if node.
+//
+// The condition, then-branch, else-branch and return type are converted in
+// that order; the first one that fails is reported on stderr and nullptr is
+// returned without converting the rest.
+NodePtr ProtoTypeToIfNode(const types::IfNode& node) {
+  // Converts one child node, logging `failure_msg` when it cannot be built.
+  auto convert_child = [](const types::TreeNode& child,
+                          const char* failure_msg) -> NodePtr {
+    NodePtr converted = ProtoTypeToNode(child);
+    if (converted == nullptr) {
+      std::cerr << failure_msg;
+    }
+    return converted;
+  };
+
+  NodePtr cond = convert_child(node.cond(), "Unable to create cond node for if node\n");
+  if (cond == nullptr) {
+    return nullptr;
+  }
+
+  NodePtr then_branch =
+      convert_child(node.thennode(), "Unable to create then node for if node\n");
+  if (then_branch == nullptr) {
+    return nullptr;
+  }
+
+  NodePtr else_branch =
+      convert_child(node.elsenode(), "Unable to create else node for if node\n");
+  if (else_branch == nullptr) {
+    return nullptr;
+  }
+
+  DataTypePtr result_type = ProtoTypeToDataType(node.returntype());
+  if (result_type != nullptr) {
+    return TreeExprBuilder::MakeIf(cond, then_branch, else_branch, result_type);
+  }
+
+  std::cerr << "Unknown return type for if node\n";
+  return nullptr;
+}
+
+// Convert a protobuf AndNode into a boolean AND over its converted arguments.
+// Returns nullptr (after logging to stderr) if any argument fails to convert.
+NodePtr ProtoTypeToAndNode(const types::AndNode& node) {
+  NodeVector operands;
+  const int operand_count = node.args_size();
+
+  for (int idx = 0; idx < operand_count; ++idx) {
+    NodePtr converted = ProtoTypeToNode(node.args(idx));
+    if (converted == nullptr) {
+      std::cerr << "Unable to create argument for boolean and\n";
+      return nullptr;
+    }
+    operands.push_back(converted);
+  }
+
+  return TreeExprBuilder::MakeAnd(operands);
+}
+
+// Convert a protobuf OrNode into a boolean OR over its converted arguments.
+// Returns nullptr (after logging to stderr) if any argument fails to convert.
+NodePtr ProtoTypeToOrNode(const types::OrNode& node) {
+  NodeVector operands;
+  const int operand_count = node.args_size();
+
+  for (int idx = 0; idx < operand_count; ++idx) {
+    NodePtr converted = ProtoTypeToNode(node.args(idx));
+    if (converted == nullptr) {
+      std::cerr << "Unable to create argument for boolean or\n";
+      return nullptr;
+    }
+    operands.push_back(converted);
+  }
+
+  return TreeExprBuilder::MakeOr(operands);
+}
+
+// Convert a protobuf InNode into an IN expression.
+//
+// The node being tested is converted first; if that fails nullptr is returned
+// instead of handing a null node to the expression builder. The constant list
+// is then taken from the first populated value set, checked in the order
+// int, long, decimal, float, double, string, binary. If none is populated the
+// failure is logged to stderr and nullptr is returned.
+NodePtr ProtoTypeToInNode(const types::InNode& node) {
+  NodePtr field = ProtoTypeToNode(node.node());
+  if (field == nullptr) {
+    std::cerr << "Unable to create field node for in expression\n";
+    return nullptr;
+  }
+
+  if (node.has_intvalues()) {
+    std::unordered_set<int32_t> int_values;
+    for (int i = 0; i < node.intvalues().intvalues_size(); i++) {
+      int_values.insert(node.intvalues().intvalues(i).value());
+    }
+    return TreeExprBuilder::MakeInExpressionInt32(field, int_values);
+  }
+
+  if (node.has_longvalues()) {
+    std::unordered_set<int64_t> long_values;
+    for (int i = 0; i < node.longvalues().longvalues_size(); i++) {
+      long_values.insert(node.longvalues().longvalues(i).value());
+    }
+    return TreeExprBuilder::MakeInExpressionInt64(field, long_values);
+  }
+
+  if (node.has_decimalvalues()) {
+    std::unordered_set<gandiva::DecimalScalar128> decimal_values;
+    for (int i = 0; i < node.decimalvalues().decimalvalues_size(); i++) {
+      const auto& decimal = node.decimalvalues().decimalvalues(i);
+      decimal_values.insert(gandiva::DecimalScalar128(
+          decimal.value(), decimal.precision(), decimal.scale()));
+    }
+    return TreeExprBuilder::MakeInExpressionDecimal(field, decimal_values);
+  }
+
+  if (node.has_floatvalues()) {
+    std::unordered_set<float> float_values;
+    for (int i = 0; i < node.floatvalues().floatvalues_size(); i++) {
+      float_values.insert(node.floatvalues().floatvalues(i).value());
+    }
+    return TreeExprBuilder::MakeInExpressionFloat(field, float_values);
+  }
+
+  if (node.has_doublevalues()) {
+    std::unordered_set<double> double_values;
+    for (int i = 0; i < node.doublevalues().doublevalues_size(); i++) {
+      double_values.insert(node.doublevalues().doublevalues(i).value());
+    }
+    return TreeExprBuilder::MakeInExpressionDouble(field, double_values);
+  }
+
+  if (node.has_stringvalues()) {
+    std::unordered_set<std::string> stringvalues;
+    for (int i = 0; i < node.stringvalues().stringvalues_size(); i++) {
+      stringvalues.insert(node.stringvalues().stringvalues(i).value());
+    }
+    return TreeExprBuilder::MakeInExpressionString(field, stringvalues);
+  }
+
+  if (node.has_binaryvalues()) {
+    std::unordered_set<std::string> stringvalues;
+    for (int i = 0; i < node.binaryvalues().binaryvalues_size(); i++) {
+      stringvalues.insert(node.binaryvalues().binaryvalues(i).value());
+    }
+    return TreeExprBuilder::MakeInExpressionBinary(field, stringvalues);
+  }
+  // not supported yet.
+  std::cerr << "Unknown constant type for in expression.\n";
+  return nullptr;
+}
+
+// Convert a protobuf NullNode into a typed null literal.
+// Returns nullptr (after logging to stderr) when the node's type cannot be
+// converted. The log message does not touch `data_type`, which is null on
+// that path.
+NodePtr ProtoTypeToNullNode(const types::NullNode& node) {
+  DataTypePtr data_type = ProtoTypeToDataType(node.type());
+  if (data_type == nullptr) {
+    std::cerr << "Unknown type for null node\n";
+    return nullptr;
+  }
+
+  return TreeExprBuilder::MakeNull(data_type);
+}
+
+// Convert a protobuf TreeNode into an expression-tree node by dispatching on
+// whichever member is set. The members are tested in a fixed order and the
+// first one present wins. Returns nullptr (after logging to stderr) when no
+// known member is set; the delegated converters may also return nullptr.
+NodePtr ProtoTypeToNode(const types::TreeNode& node) {
+  // structural nodes, handled by dedicated converters
+  if (node.has_fieldnode()) return ProtoTypeToFieldNode(node.fieldnode());
+  if (node.has_fnnode()) return ProtoTypeToFnNode(node.fnnode());
+  if (node.has_ifnode()) return ProtoTypeToIfNode(node.ifnode());
+  if (node.has_andnode()) return ProtoTypeToAndNode(node.andnode());
+  if (node.has_ornode()) return ProtoTypeToOrNode(node.ornode());
+  if (node.has_innode()) return ProtoTypeToInNode(node.innode());
+  if (node.has_nullnode()) return ProtoTypeToNullNode(node.nullnode());
+
+  // scalar literals
+  if (node.has_intnode()) return TreeExprBuilder::MakeLiteral(node.intnode().value());
+  if (node.has_floatnode()) {
+    return TreeExprBuilder::MakeLiteral(node.floatnode().value());
+  }
+  if (node.has_longnode()) return TreeExprBuilder::MakeLiteral(node.longnode().value());
+  if (node.has_booleannode()) {
+    return TreeExprBuilder::MakeLiteral(node.booleannode().value());
+  }
+  if (node.has_doublenode()) {
+    return TreeExprBuilder::MakeLiteral(node.doublenode().value());
+  }
+
+  // string / binary literals
+  if (node.has_stringnode()) {
+    return TreeExprBuilder::MakeStringLiteral(node.stringnode().value());
+  }
+  if (node.has_binarynode()) {
+    return TreeExprBuilder::MakeBinaryLiteral(node.binarynode().value());
+  }
+
+  // decimal literal: value string plus precision and scale
+  if (node.has_decimalnode()) {
+    const auto& decimal = node.decimalnode();
+    std::string digits = decimal.value();
+    gandiva::DecimalScalar128 literal(digits, decimal.precision(), decimal.scale());
+    return TreeExprBuilder::MakeDecimalLiteral(literal);
+  }
+
+  std::cerr << "Unknown node type in protobuf\n";
+  return nullptr;
+}
+
+// Convert a protobuf ExpressionRoot into an expression: the root node is
+// converted first, then the result field. Returns nullptr (after logging to
+// stderr) when either conversion yields nullptr.
+ExpressionPtr ProtoTypeToExpression(const types::ExpressionRoot& root) {
+  NodePtr root_node = ProtoTypeToNode(root.root());
+  if (root_node == nullptr) {
+    std::cerr << "Unable to create expression node from expression protobuf\n";
+    return nullptr;
+  }
+
+  FieldPtr field = ProtoTypeToField(root.resulttype());
+  if (field == nullptr) {
+    std::cerr << "Unable to extract return field from expression protobuf\n";
+    return nullptr;
+  }
+
+  return TreeExprBuilder::MakeExpression(root_node, field);
+}
+
+// Convert a protobuf Condition into a condition object built from its root
+// node. Returns nullptr when the root node cannot be converted.
+ConditionPtr ProtoTypeToCondition(const types::Condition& condition) {
+  NodePtr converted_root = ProtoTypeToNode(condition.root());
+  if (converted_root != nullptr) {
+    return TreeExprBuilder::MakeCondition(converted_root);
+  }
+  return nullptr;
+}
+
+// Convert a protobuf Schema into an arrow schema, converting the columns in
+// order. Returns nullptr (after logging to stderr) if any column conversion
+// yields nullptr.
+SchemaPtr ProtoTypeToSchema(const types::Schema& schema) {
+  std::vector<FieldPtr> arrow_fields;
+  const int column_count = schema.columns_size();
+
+  for (int col = 0; col < column_count; ++col) {
+    FieldPtr converted = ProtoTypeToField(schema.columns(col));
+    if (converted == nullptr) {
+      std::cerr << "Unable to extract arrow field from schema\n";
+      return nullptr;
+    }
+    arrow_fields.push_back(converted);
+  }
+
+  return arrow::schema(arrow_fields);
+}
+
+// Common for both projector and filters.
+
+// Parse `bufLen` bytes starting at `buf` into `msg`, with the input stream's
+// recursion limit set to 1000. Returns the result of the protobuf parse call.
+bool ParseProtobuf(uint8_t* buf, int bufLen, google::protobuf::Message* msg) {
+  google::protobuf::io::CodedInputStream input(buf, bufLen);
+  input.SetRecursionLimit(1000);
+  const bool parsed = msg->ParseFromCodedStream(&input);
+  return parsed;
+}
+
+// Build a record batch for `schema` over the buffers described by the
+// parallel arrays `in_buf_addrs` / `in_buf_sizes` (both of length
+// `in_bufs_len`).
+//
+// Buffers are consumed in order: two per field, plus a third for fields whose
+// type is binary-like. Returns Status::Invalid when the arrays run out before
+// every field has its buffers; on success `*batch` holds the new batch.
+Status make_record_batch_with_buf_addrs(SchemaPtr schema, int num_rows,
+                                        jlong* in_buf_addrs, jlong* in_buf_sizes,
+                                        int in_bufs_len,
+                                        std::shared_ptr<arrow::RecordBatch>* batch) {
+  using BufferVector = std::vector<std::shared_ptr<arrow::Buffer>>;
+
+  std::vector<std::shared_ptr<arrow::ArrayData>> columns;
+  const auto num_fields = schema->num_fields();
+  // position of the next unread (address, size) pair
+  int next = 0;
+
+  // Wraps the next (address, size) pair in an arrow::Buffer and appends it to
+  // `out`. Returns false, without consuming anything, when no pair is left.
+  auto take_buffer = [&](BufferVector* out) -> bool {
+    if (next >= in_bufs_len) {
+      return false;
+    }
+    const jlong addr = in_buf_addrs[next];
+    const jlong size = in_buf_sizes[next];
+    ++next;
+    out->push_back(std::shared_ptr<arrow::Buffer>(
+        new arrow::Buffer(reinterpret_cast<uint8_t*>(addr), size)));
+    return true;
+  };
+
+  for (int i = 0; i < num_fields; i++) {
+    auto field = schema->field(i);
+    BufferVector buffers;
+
+    // first buffer: validity
+    if (!take_buffer(&buffers)) {
+      return Status::Invalid("insufficient number of in_buf_addrs");
+    }
+    // second buffer
+    if (!take_buffer(&buffers)) {
+      return Status::Invalid("insufficient number of in_buf_addrs");
+    }
+    // variable-len fields carry one more buffer
+    if (arrow::is_binary_like(field->type()->id())) {
+      if (!take_buffer(&buffers)) {
+        return Status::Invalid("insufficient number of in_buf_addrs");
+      }
+    }
+
+    columns.push_back(
+        arrow::ArrayData::Make(field->type(), num_rows, std::move(buffers)));
+  }
+
+  *batch = arrow::RecordBatch::Make(schema, num_rows, columns);
+  return Status::OK();
+}
+
+// projector related functions.
+// Hand the schema and expression byte arrays back to the JVM. JNI_ABORT is
+// passed for both, i.e. the elements are released without copying back.
+void releaseProjectorInput(jbyteArray schema_arr, jbyte* schema_bytes,
+                           jbyteArray exprs_arr, jbyte* exprs_bytes, JNIEnv* env) {
+  const jint release_mode = JNI_ABORT;
+  env->ReleaseByteArrayElements(schema_arr, schema_bytes, release_mode);
+  env->ReleaseByteArrayElements(exprs_arr, exprs_bytes, release_mode);
+}
+
+// JNI entry point: build a projector from serialized schema and expression
+// protobufs and register it.
+//
+// On success returns the id under which the projector holder was inserted
+// into projector_modules_. On any failure a GandivaException carrying the
+// collected message is thrown and 0 is returned. The input byte arrays are
+// released on every path before returning.
+JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_buildProjector(
+    JNIEnv* env, jobject obj, jbyteArray schema_arr, jbyteArray exprs_arr,
+    jint selection_vector_type, jlong configuration_id) {
+  // All locals are declared up front because the error paths use goto.
+  jlong module_id = 0LL;
+  std::shared_ptr<Projector> projector;
+  std::shared_ptr<ProjectorHolder> holder;
+
+  types::Schema schema;
+  jsize schema_len = env->GetArrayLength(schema_arr);
+  jbyte* schema_bytes = env->GetByteArrayElements(schema_arr, 0);
+
+  types::ExpressionList exprs;
+  jsize exprs_len = env->GetArrayLength(exprs_arr);
+  jbyte* exprs_bytes = env->GetByteArrayElements(exprs_arr, 0);
+
+  ExpressionVector expr_vector;
+  SchemaPtr schema_ptr;
+  FieldVector ret_types;
+  gandiva::Status status;
+  auto mode = gandiva::SelectionVector::MODE_NONE;
+
+  std::shared_ptr<Configuration> config = ConfigHolder::MapLookup(configuration_id);
+  std::stringstream ss;
+
+  if (config == nullptr) {
+    ss << "configuration is mandatory.";
+    goto err_out;
+  }
+
+  if (!ParseProtobuf(reinterpret_cast<uint8_t*>(schema_bytes), schema_len, &schema)) {
+    ss << "Unable to parse schema protobuf\n";
+    goto err_out;
+  }
+
+  if (!ParseProtobuf(reinterpret_cast<uint8_t*>(exprs_bytes), exprs_len, &exprs)) {
+    ss << "Unable to parse expressions protobuf\n";
+    goto err_out;
+  }
+
+  // convert types::Schema to arrow::Schema
+  schema_ptr = ProtoTypeToSchema(schema);
+  if (schema_ptr == nullptr) {
+    ss << "Unable to construct arrow schema object from schema protobuf\n";
+    goto err_out;
+  }
+
+  // create Expression out of the list of exprs
+  for (int i = 0; i < exprs.exprs_size(); i++) {
+    ExpressionPtr root = ProtoTypeToExpression(exprs.exprs(i));
+
+    if (root == nullptr) {
+      ss << "Unable to construct expression object from expression protobuf\n";
+      goto err_out;
+    }
+
+    expr_vector.push_back(root);
+    ret_types.push_back(root->result());
+  }
+
+  // any value other than the three below leaves the mode at MODE_NONE
+  if (selection_vector_type == types::SV_NONE) {
+    mode = gandiva::SelectionVector::MODE_NONE;
+  } else if (selection_vector_type == types::SV_INT16) {
+    mode = gandiva::SelectionVector::MODE_UINT16;
+  } else if (selection_vector_type == types::SV_INT32) {
+    mode = gandiva::SelectionVector::MODE_UINT32;
+  }
+
+  // good to invoke the evaluator now
+  status = Projector::Make(schema_ptr, expr_vector, mode, config, &projector);
+  if (!status.ok()) {
+    ss << "Failed to make LLVM module due to " << status.message() << "\n";
+    goto err_out;
+  }
+
+  // store the result in a map
+  holder = std::shared_ptr<ProjectorHolder>(
+      new ProjectorHolder(schema_ptr, ret_types, std::move(projector)));
+  module_id = projector_modules_.Insert(holder);
+  releaseProjectorInput(schema_arr, schema_bytes, exprs_arr, exprs_bytes, env);
+  return module_id;
+
+err_out:
+  // single cleanup point for every failure: release inputs, then throw
+  releaseProjectorInput(schema_arr, schema_bytes, exprs_arr, exprs_bytes, env);
+  env->ThrowNew(gandiva_exception_, ss.str().c_str());
+  return module_id;
+}
+
+///
+/// \brief Resizable buffer which resizes by doing a callback into java.
+///
+class JavaResizableBuffer : public arrow::ResizableBuffer {
+ public:
+  /// \param env JNI environment used for the resize callback
+  /// \param jexpander Java object whose expand method is invoked on resize
+  /// \param vector_idx index passed to the expand method
+  /// \param buffer start of the initial memory region
+  /// \param len length of the initial region; the base class is constructed
+  ///        with it, after which the logical size is reset to 0
+  ///
+  /// `len` is 64-bit so that the jlong sizes passed by callers are not
+  /// truncated.
+  JavaResizableBuffer(JNIEnv* env, jobject jexpander, int32_t vector_idx, uint8_t* buffer,
+                      int64_t len)
+      : ResizableBuffer(buffer, len),
+        env_(env),
+        jexpander_(jexpander),
+        vector_idx_(vector_idx) {
+    size_ = 0;
+  }
+
+  /// Grow the buffer to \p new_size; defined out of line.
+  Status Resize(const int64_t new_size, bool shrink_to_fit) override;
+
+  /// Not supported: always returns NotImplemented.
+  Status Reserve(const int64_t new_capacity) override {
+    return Status::NotImplemented("reserve not implemented");
+  }
+
+ private:
+  JNIEnv* env_;
+  jobject jexpander_;
+  int32_t vector_idx_;
+};
+
+// Resize the buffer to `new_size` bytes.
+//
+// Shrinking (`shrink_to_fit == true`) is not supported. When `new_size` is
+// below the current capacity only the logical size is updated. Otherwise the
+// Java expander is called back; on success the buffer is re-pointed at the
+// address and capacity it reports.
+//
+// Returns NotImplemented for shrink requests and OutOfMemory when the Java
+// callback throws, returns null, or reports a capacity smaller than
+// `new_size`.
+Status JavaResizableBuffer::Resize(const int64_t new_size, bool shrink_to_fit) {
+  if (shrink_to_fit == true) {
+    return Status::NotImplemented("shrink not implemented");
+  }
+
+  if (ARROW_PREDICT_TRUE(new_size < capacity())) {
+    // no need to expand.
+    size_ = new_size;
+    return Status::OK();
+  }
+
+  // callback into java to expand the buffer; the arguments are cast to the
+  // exact JNI types because they are passed through varargs.
+  jobject ret = env_->CallObjectMethod(jexpander_, vector_expander_method_,
+                                       static_cast<jint>(vector_idx_),
+                                       static_cast<jlong>(new_size));
+  if (env_->ExceptionCheck()) {
+    env_->ExceptionDescribe();
+    env_->ExceptionClear();
+    return Status::OutOfMemory("buffer expand failed in java");
+  }
+  if (ret == nullptr) {
+    // reading fields of a null object would crash
+    return Status::OutOfMemory("buffer expand failed in java");
+  }
+
+  jlong ret_address = env_->GetLongField(ret, vector_expander_ret_address_);
+  jlong ret_capacity = env_->GetLongField(ret, vector_expander_ret_capacity_);
+  // drop the local ref now so repeated resizes do not accumulate references
+  env_->DeleteLocalRef(ret);
+
+  if (ret_capacity < new_size) {
+    // checked at runtime so the buffer is never marked larger than its memory
+    return Status::OutOfMemory("buffer expand returned insufficient capacity");
+  }
+
+  data_ = reinterpret_cast<uint8_t*>(ret_address);
+  size_ = new_size;
+  capacity_ = ret_capacity;
+  return Status::OK();
+}
+
+// Sets `status` to Invalid and breaks out of the enclosing loop when `idx` is
+// not a valid position in an array of `len` entries. Requires a variable named
+// `status` in scope. It expands to a bare `if` (it cannot be wrapped in
+// do/while because of the `break`), so do not follow it with `else`.
+#define CHECK_OUT_BUFFER_IDX_AND_BREAK(idx, len)                               \
+  if ((idx) >= (len)) {                                                        \
+    status = gandiva::Status::Invalid("insufficient number of out_buf_addrs"); \
+    break;                                                                     \
+  }
+
+// JNI entry point: evaluate the projector registered under `module_id`.
+//
+// Input columns are described by `buf_addrs` / `buf_sizes`; output buffers by
+// `out_buf_addrs` / `out_buf_sizes`. Each pair of arrays must have equal
+// length. For every result field the output arrays supply a validity buffer,
+// an offsets buffer when the field type is binary-like, and a value buffer.
+// Binary-like value buffers are wrapped in a JavaResizableBuffer and
+// therefore need a non-null `jexpander`.
+//
+// Throws GandivaException for an unknown module id, mismatched array lengths,
+// or any failure status produced while preparing or running the evaluation.
+JNIEXPORT void JNICALL
+Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector(
+    JNIEnv* env, jobject object, jobject jexpander, jlong module_id, jint num_rows,
+    jlongArray buf_addrs, jlongArray buf_sizes, jint sel_vec_type, jint sel_vec_rows,
+    jlong sel_vec_addr, jlong sel_vec_size, jlongArray out_buf_addrs,
+    jlongArray out_buf_sizes) {
+  Status status;
+  std::shared_ptr<ProjectorHolder> holder = projector_modules_.Lookup(module_id);
+  if (holder == nullptr) {
+    std::stringstream ss;
+    ss << "Unknown module id " << module_id;
+    env->ThrowNew(gandiva_exception_, ss.str().c_str());
+    return;
+  }
+
+  int in_bufs_len = env->GetArrayLength(buf_addrs);
+  if (in_bufs_len != env->GetArrayLength(buf_sizes)) {
+    env->ThrowNew(gandiva_exception_, "mismatch in arraylen of buf_addrs and buf_sizes");
+    return;
+  }
+
+  int out_bufs_len = env->GetArrayLength(out_buf_addrs);
+  if (out_bufs_len != env->GetArrayLength(out_buf_sizes)) {
+    env->ThrowNew(gandiva_exception_,
+                  "mismatch in arraylen of out_buf_addrs and out_buf_sizes");
+    return;
+  }
+
+  jlong* in_buf_addrs = env->GetLongArrayElements(buf_addrs, 0);
+  jlong* in_buf_sizes = env->GetLongArrayElements(buf_sizes, 0);
+
+  jlong* out_bufs = env->GetLongArrayElements(out_buf_addrs, 0);
+  jlong* out_sizes = env->GetLongArrayElements(out_buf_sizes, 0);
+
+  // do/while(0) so that any failure can `break` to the common release code.
+  do {
+    std::shared_ptr<arrow::RecordBatch> in_batch;
+    status = make_record_batch_with_buf_addrs(holder->schema(), num_rows, in_buf_addrs,
+                                              in_buf_sizes, in_bufs_len, &in_batch);
+    if (!status.ok()) {
+      break;
+    }
+
+    std::shared_ptr<gandiva::SelectionVector> selection_vector;
+    auto selection_buffer = std::make_shared<arrow::Buffer>(
+        reinterpret_cast<uint8_t*>(sel_vec_addr), sel_vec_size);
+    int output_row_count = 0;
+    switch (sel_vec_type) {
+      case types::SV_NONE: {
+        output_row_count = num_rows;
+        break;
+      }
+      case types::SV_INT16: {
+        status = gandiva::SelectionVector::MakeImmutableInt16(
+            sel_vec_rows, selection_buffer, &selection_vector);
+        output_row_count = sel_vec_rows;
+        break;
+      }
+      case types::SV_INT32: {
+        status = gandiva::SelectionVector::MakeImmutableInt32(
+            sel_vec_rows, selection_buffer, &selection_vector);
+        output_row_count = sel_vec_rows;
+        break;
+      }
+    }
+    if (!status.ok()) {
+      break;
+    }
+
+    auto ret_types = holder->rettypes();
+    ArrayDataVector output;
+    int buf_idx = 0;
+    int sz_idx = 0;
+    int output_vector_idx = 0;
+    // The CHECK_OUT_BUFFER_IDX_AND_BREAK uses below leave this for-loop with
+    // `status` set; the status is re-checked right after the loop.
+    for (const FieldPtr& field : ret_types) {
+      std::vector<std::shared_ptr<arrow::Buffer>> buffers;
+
+      CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len);
+      uint8_t* validity_buf = reinterpret_cast<uint8_t*>(out_bufs[buf_idx++]);
+      jlong bitmap_sz = out_sizes[sz_idx++];
+      buffers.push_back(std::make_shared<arrow::MutableBuffer>(validity_buf, bitmap_sz));
+
+      if (arrow::is_binary_like(field->type()->id())) {
+        CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len);
+        uint8_t* offsets_buf = reinterpret_cast<uint8_t*>(out_bufs[buf_idx++]);
+        jlong offsets_sz = out_sizes[sz_idx++];
+        buffers.push_back(
+            std::make_shared<arrow::MutableBuffer>(offsets_buf, offsets_sz));
+      }
+
+      CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len);
+      uint8_t* value_buf = reinterpret_cast<uint8_t*>(out_bufs[buf_idx++]);
+      jlong data_sz = out_sizes[sz_idx++];
+      if (arrow::is_binary_like(field->type()->id())) {
+        if (jexpander == nullptr) {
+          status = Status::Invalid(
+              "expression has variable len output columns, but the expander object is "
+              "null");
+          break;
+        }
+        buffers.push_back(std::make_shared<JavaResizableBuffer>(
+            env, jexpander, output_vector_idx, value_buf, data_sz));
+      } else {
+        buffers.push_back(std::make_shared<arrow::MutableBuffer>(value_buf, data_sz));
+      }
+
+      auto array_data = arrow::ArrayData::Make(field->type(), output_row_count, buffers);
+      output.push_back(array_data);
+      ++output_vector_idx;
+    }
+    if (!status.ok()) {
+      break;
+    }
+    status = holder->projector()->Evaluate(*in_batch, selection_vector.get(), output);
+  } while (0);
+
+  env->ReleaseLongArrayElements(buf_addrs, in_buf_addrs, JNI_ABORT);
+  env->ReleaseLongArrayElements(buf_sizes, in_buf_sizes, JNI_ABORT);
+  env->ReleaseLongArrayElements(out_buf_addrs, out_bufs, JNI_ABORT);
+  env->ReleaseLongArrayElements(out_buf_sizes, out_sizes, JNI_ABORT);
+
+  if (!status.ok()) {
+    // the exception carries the status message as-is
+    env->ThrowNew(gandiva_exception_, status.message().c_str());
+    return;
+  }
+}
+
+// JNI entry point: remove the projector registered under `module_id` from
+// projector_modules_.
+JNIEXPORT void JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_closeProjector(
+    JNIEnv* env, jobject cls, jlong module_id) {
+  const jlong projector_id = module_id;
+  projector_modules_.Erase(projector_id);
+}
+
+// filter related functions.
+// Hand the schema and condition byte arrays back to the JVM. JNI_ABORT is
+// passed for both, i.e. the elements are released without copying back.
+void releaseFilterInput(jbyteArray schema_arr, jbyte* schema_bytes,
+                        jbyteArray condition_arr, jbyte* condition_bytes, JNIEnv* env) {
+  const jint release_mode = JNI_ABORT;
+  env->ReleaseByteArrayElements(schema_arr, schema_bytes, release_mode);
+  env->ReleaseByteArrayElements(condition_arr, condition_bytes, release_mode);
+}
+
+// JNI entry point: build a filter from serialized schema and condition
+// protobufs and register it.
+//
+// On success returns the id under which the filter holder was inserted into
+// filter_modules_. On any failure a GandivaException carrying the collected
+// message is thrown and 0 is returned. The input byte arrays are released on
+// every path before returning.
+JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_buildFilter(
+    JNIEnv* env, jobject obj, jbyteArray schema_arr, jbyteArray condition_arr,
+    jlong configuration_id) {
+  // All locals are declared up front because the error paths use goto.
+  jlong module_id = 0LL;
+  std::shared_ptr<Filter> filter;
+  std::shared_ptr<FilterHolder> holder;
+
+  types::Schema schema;
+  jsize schema_len = env->GetArrayLength(schema_arr);
+  jbyte* schema_bytes = env->GetByteArrayElements(schema_arr, 0);
+
+  types::Condition condition;
+  jsize condition_len = env->GetArrayLength(condition_arr);
+  jbyte* condition_bytes = env->GetByteArrayElements(condition_arr, 0);
+
+  ConditionPtr condition_ptr;
+  SchemaPtr schema_ptr;
+  gandiva::Status status;
+
+  std::shared_ptr<Configuration> config = ConfigHolder::MapLookup(configuration_id);
+  std::stringstream ss;
+
+  if (config == nullptr) {
+    ss << "configuration is mandatory.";
+    goto err_out;
+  }
+
+  if (!ParseProtobuf(reinterpret_cast<uint8_t*>(schema_bytes), schema_len, &schema)) {
+    ss << "Unable to parse schema protobuf\n";
+    goto err_out;
+  }
+
+  if (!ParseProtobuf(reinterpret_cast<uint8_t*>(condition_bytes), condition_len,
+                     &condition)) {
+    ss << "Unable to parse condition protobuf\n";
+    goto err_out;
+  }
+
+  // convert types::Schema to arrow::Schema
+  schema_ptr = ProtoTypeToSchema(schema);
+  if (schema_ptr == nullptr) {
+    ss << "Unable to construct arrow schema object from schema protobuf\n";
+    goto err_out;
+  }
+
+  condition_ptr = ProtoTypeToCondition(condition);
+  if (condition_ptr == nullptr) {
+    ss << "Unable to construct condition object from condition protobuf\n";
+    goto err_out;
+  }
+
+  // good to invoke the filter builder now
+  status = Filter::Make(schema_ptr, condition_ptr, config, &filter);
+  if (!status.ok()) {
+    ss << "Failed to make LLVM module due to " << status.message() << "\n";
+    goto err_out;
+  }
+
+  // store the result in a map
+  holder = std::shared_ptr<FilterHolder>(new FilterHolder(schema_ptr, std::move(filter)));
+  module_id = filter_modules_.Insert(holder);
+  releaseFilterInput(schema_arr, schema_bytes, condition_arr, condition_bytes, env);
+  return module_id;
+
+err_out:
+  // single cleanup point for every failure: release inputs, then throw
+  releaseFilterInput(schema_arr, schema_bytes, condition_arr, condition_bytes, env);
+  env->ThrowNew(gandiva_exception_, ss.str().c_str());
+  return module_id;
+}
+
+// JNI entry point: evaluate the filter registered under `module_id`.
+//
+// Input columns are described by `buf_addrs` / `buf_sizes` (equal length
+// required). The selection vector is created over the caller-supplied output
+// buffer (`out_buf_addr`, `out_buf_size`) with the requested element width;
+// only SV_INT16 and SV_INT32 are accepted.
+//
+// Returns the number of slots in the selection vector. On any failure
+// (unknown module id, mismatched array lengths, unknown selection vector
+// type, evaluation error, or a slot count above INT_MAX) a GandivaException
+// is thrown and -1 is returned.
+JNIEXPORT jint JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateFilter(
+    JNIEnv* env, jobject cls, jlong module_id, jint num_rows, jlongArray buf_addrs,
+    jlongArray buf_sizes, jint jselection_vector_type, jlong out_buf_addr,
+    jlong out_buf_size) {
+  gandiva::Status status;
+  std::shared_ptr<FilterHolder> holder = filter_modules_.Lookup(module_id);
+  if (holder == nullptr) {
+    env->ThrowNew(gandiva_exception_, "Unknown module id\n");
+    return -1;
+  }
+
+  int in_bufs_len = env->GetArrayLength(buf_addrs);
+  if (in_bufs_len != env->GetArrayLength(buf_sizes)) {
+    env->ThrowNew(gandiva_exception_, "mismatch in arraylen of buf_addrs and buf_sizes");
+    return -1;
+  }
+
+  jlong* in_buf_addrs = env->GetLongArrayElements(buf_addrs, 0);
+  jlong* in_buf_sizes = env->GetLongArrayElements(buf_sizes, 0);
+  std::shared_ptr<gandiva::SelectionVector> selection_vector;
+
+  // do/while(0) so that any failure can `break` to the common release code.
+  do {
+    std::shared_ptr<arrow::RecordBatch> in_batch;
+
+    status = make_record_batch_with_buf_addrs(holder->schema(), num_rows, in_buf_addrs,
+                                              in_buf_sizes, in_bufs_len, &in_batch);
+    if (!status.ok()) {
+      break;
+    }
+
+    auto selection_vector_type =
+        static_cast<types::SelectionVectorType>(jselection_vector_type);
+    auto out_buffer = std::make_shared<arrow::MutableBuffer>(
+        reinterpret_cast<uint8_t*>(out_buf_addr), out_buf_size);
+    switch (selection_vector_type) {
+      case types::SV_INT16:
+        status =
+            gandiva::SelectionVector::MakeInt16(num_rows, out_buffer, &selection_vector);
+        break;
+      case types::SV_INT32:
+        status =
+            gandiva::SelectionVector::MakeInt32(num_rows, out_buffer, &selection_vector);
+        break;
+      default:
+        status = gandiva::Status::Invalid("unknown selection vector type");
+    }
+    if (!status.ok()) {
+      break;
+    }
+
+    status = holder->filter()->Evaluate(*in_batch, selection_vector);
+  } while (0);
+
+  env->ReleaseLongArrayElements(buf_addrs, in_buf_addrs, JNI_ABORT);
+  env->ReleaseLongArrayElements(buf_sizes, in_buf_sizes, JNI_ABORT);
+
+  if (!status.ok()) {
+    // the exception carries the status message as-is
+    env->ThrowNew(gandiva_exception_, status.message().c_str());
+    return -1;
+  }
+
+  int64_t num_slots = selection_vector->GetNumSlots();
+  // Check integer overflow
+  if (num_slots > INT_MAX) {
+    std::stringstream ss;
+    ss << "The selection vector has " << num_slots
+       << " slots, which is larger than the " << INT_MAX << " limit.\n";
+    const std::string message = ss.str();
+    env->ThrowNew(gandiva_exception_, message.c_str());
+    return -1;
+  }
+  return static_cast<int>(num_slots);
+}
+
+// JNI entry point: remove the filter registered under `module_id` from
+// filter_modules_.
+JNIEXPORT void JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_closeFilter(
+    JNIEnv* env, jobject cls, jlong module_id) {
+  const jlong filter_id = module_id;
+  filter_modules_.Erase(filter_id);
+}
diff --git a/src/arrow/cpp/src/gandiva/jni/module_holder.h b/src/arrow/cpp/src/gandiva/jni/module_holder.h
new file mode 100644
index 000000000..929c64231
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/jni/module_holder.h
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <utility>
+
+#include "gandiva/arrow.h"
+
+namespace gandiva {
+
+class Projector;
+class Filter;
+
+/// Keeps a Projector together with the schema and the return fields that were
+/// supplied when the holder was created.
+class ProjectorHolder {
+ public:
+  /// \param[in] schema schema stored alongside the projector
+  /// \param[in] ret_types fields stored as the projector's return types
+  /// \param[in] projector the projector to hold
+  ///
+  /// All three arguments are taken by value and moved into the members, so a
+  /// caller passing temporaries pays for no extra copy.
+  ProjectorHolder(SchemaPtr schema, FieldVector ret_types,
+                  std::shared_ptr<Projector> projector)
+      : schema_(std::move(schema)),
+        ret_types_(std::move(ret_types)),
+        projector_(std::move(projector)) {}
+
+  /// \return a copy of the stored schema handle
+  SchemaPtr schema() const { return schema_; }
+  /// \return a copy of the stored return fields
+  FieldVector rettypes() const { return ret_types_; }
+  /// \return a shared handle to the held projector
+  std::shared_ptr<Projector> projector() const { return projector_; }
+
+ private:
+  SchemaPtr schema_;
+  FieldVector ret_types_;
+  std::shared_ptr<Projector> projector_;
+};
+
+/// Keeps a Filter together with the schema that was supplied when the holder
+/// was created.
+class FilterHolder {
+ public:
+  /// \param[in] schema schema stored alongside the filter
+  /// \param[in] filter the filter to hold
+  ///
+  /// Both arguments are taken by value and moved into the members, so a caller
+  /// passing temporaries pays for no extra copy.
+  FilterHolder(SchemaPtr schema, std::shared_ptr<Filter> filter)
+      : schema_(std::move(schema)), filter_(std::move(filter)) {}
+
+  /// \return a copy of the stored schema handle
+  SchemaPtr schema() const { return schema_; }
+  /// \return a shared handle to the held filter
+  std::shared_ptr<Filter> filter() const { return filter_; }
+
+ private:
+  SchemaPtr schema_;
+  std::shared_ptr<Filter> filter_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/jni/symbols.map b/src/arrow/cpp/src/gandiva/jni/symbols.map
new file mode 100644
index 000000000..e0f5def41
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/jni/symbols.map
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# Linker version script: symbols whose demangled C++ names match gandiva*, and
+# symbols matching Java* or JNI*, stay global; every other symbol is made local.
+{
+ global: extern "C++" { gandiva*; }; Java*; JNI*;
+ local: *;
+};
diff --git a/src/arrow/cpp/src/gandiva/like_holder.cc b/src/arrow/cpp/src/gandiva/like_holder.cc
new file mode 100644
index 000000000..af9ac67d6
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/like_holder.cc
@@ -0,0 +1,156 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/like_holder.h"
+
+#include <regex>
+#include "gandiva/node.h"
+#include "gandiva/regex_util.h"
+
+namespace gandiva {
+
+// Shapes that TryOptimize() looks for in a holder's converted pattern string.
+// Each one must match the whole pattern (RE2::FullMatch).
+// starts_with: any run of word/whitespace characters followed by a literal ".*"
+RE2 LikeHolder::starts_with_regex_(R"((\w|\s)*\.\*)");
+// ends_with: a literal ".*" followed by any run of word/whitespace characters
+RE2 LikeHolder::ends_with_regex_(R"(\.\*(\w|\s)*)");
+// is_substr: word/whitespace characters enclosed by a literal ".*" on both sides
+RE2 LikeHolder::is_substr_regex_(R"(\.\*(\w|\s)*\.\*)");
+
+// Replace a 'like' node by a cheaper function when its converted pattern is one
+// of the common shapes:
+//   <text>.*     -> starts_with(<first child>, <text>)
+//   .*<text>     -> ends_with(<first child>, <text>)
+//   .*<text>.*   -> is_substr(<first child>, <text>)
+// The node is returned unchanged when no holder can be built for it or when the
+// pattern has none of these shapes.
+const FunctionNode LikeHolder::TryOptimize(const FunctionNode& node) {
+  std::shared_ptr<LikeHolder> holder;
+  if (!Make(node, &holder).ok()) {
+    // Could not optimize, return original node.
+    return node;
+  }
+
+  const std::string& regex_text = holder->pattern_;
+  const auto regex_len = regex_text.length();
+
+  // Builds fn_name(<first child>, <text literal>) with the original return type.
+  // The new literal reuses the type of the original pattern literal.
+  auto rewrite_as = [&node](const char* fn_name, const std::string& text) {
+    auto text_node = std::make_shared<LiteralNode>(node.children().at(1)->return_type(),
+                                                   LiteralHolder(text), false);
+    return FunctionNode(fn_name, {node.children().at(0), text_node},
+                        node.return_type());
+  };
+
+  if (RE2::FullMatch(regex_text, starts_with_regex_)) {
+    // drop the trailing .*
+    return rewrite_as("starts_with", regex_text.substr(0, regex_len - 2));
+  }
+  if (RE2::FullMatch(regex_text, ends_with_regex_)) {
+    // drop the leading .*
+    return rewrite_as("ends_with", regex_text.substr(2));
+  }
+  if (RE2::FullMatch(regex_text, is_substr_regex_)) {
+    // drop both the leading and the trailing .*
+    return rewrite_as("is_substr", regex_text.substr(2, regex_len - 4));
+  }
+
+  // Could not optimize, return original node.
+  return node;
+}
+
+// True for the two Arrow type ids accepted as string literals here: STRING and
+// BINARY.
+static bool IsArrowStringLiteral(arrow::Type::type type) {
+  switch (type) {
+    case arrow::Type::STRING:
+    case arrow::Type::BINARY:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Build a holder from a function node.
+//
+// The node must have two or three children. The second child must be a
+// string/binary literal holding the SQL pattern. For a node named "ilike" the
+// pattern is compiled case-insensitively and a third child is not consulted.
+// Otherwise, a third child (if present) must be a string/binary literal holding
+// the escape character.
+Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr<LikeHolder>* holder) {
+  const auto& args = node.children();
+  ARROW_RETURN_IF(args.size() != 2 && args.size() != 3,
+                  Status::Invalid("'like' function requires two or three parameters"));
+
+  auto pattern_literal = dynamic_cast<LiteralNode*>(args.at(1).get());
+  ARROW_RETURN_IF(
+      pattern_literal == nullptr,
+      Status::Invalid("'like' function requires a literal as the second parameter"));
+  ARROW_RETURN_IF(
+      !IsArrowStringLiteral(pattern_literal->return_type()->id()),
+      Status::Invalid(
+          "'like' function requires a string literal as the second parameter"));
+
+  if (node.descriptor()->name() == "ilike") {
+    // set case-insensitive for ilike function.
+    RE2::Options regex_op;
+    regex_op.set_case_sensitive(false);
+    return Make(arrow::util::get<std::string>(pattern_literal->holder()), holder,
+                regex_op);
+  }
+
+  if (args.size() == 2) {
+    return Make(arrow::util::get<std::string>(pattern_literal->holder()), holder);
+  }
+
+  // Three children: validate the escape-character literal as well.
+  auto escape_literal = dynamic_cast<LiteralNode*>(args.at(2).get());
+  ARROW_RETURN_IF(
+      escape_literal == nullptr,
+      Status::Invalid("'like' function requires a literal as the third parameter"));
+  ARROW_RETURN_IF(
+      !IsArrowStringLiteral(escape_literal->return_type()->id()),
+      Status::Invalid(
+          "'like' function requires a string literal as the third parameter"));
+
+  return Make(arrow::util::get<std::string>(pattern_literal->holder()),
+              arrow::util::get<std::string>(escape_literal->holder()), holder);
+}
+
+// Build a holder from a SQL pattern without escape handling and with default
+// regex options. `*holder` is only written when the converted pattern compiles.
+Status LikeHolder::Make(const std::string& sql_pattern,
+                        std::shared_ptr<LikeHolder>* holder) {
+  // Convert the SQL pattern into the regex text handed to RE2.
+  std::string pcre_pattern;
+  ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern));
+
+  std::shared_ptr<LikeHolder> built(new LikeHolder(pcre_pattern));
+  ARROW_RETURN_IF(!built->regex_.ok(),
+                  Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
+
+  *holder = built;
+  return Status::OK();
+}
+
+// Build a holder from a SQL pattern and an escape string.
+//
+// `escape_char` must be empty (the pattern is converted without escape handling)
+// or exactly one character long; anything longer is rejected as Invalid.
+// `*holder` is only written when the converted pattern compiles.
+Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escape_char,
+                        std::shared_ptr<LikeHolder>* holder) {
+  ARROW_RETURN_IF(escape_char.length() > 1,
+                  Status::Invalid("The length of escape char ", escape_char,
+                                  " in 'like' function is greater than 1"));
+
+  // From here on escape_char holds zero or one character.
+  std::string pcre_pattern;
+  if (escape_char.empty()) {
+    ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern));
+  } else {
+    ARROW_RETURN_NOT_OK(
+        RegexUtil::SqlLikePatternToPcre(sql_pattern, escape_char.at(0), pcre_pattern));
+  }
+
+  std::shared_ptr<LikeHolder> built(new LikeHolder(pcre_pattern));
+  ARROW_RETURN_IF(!built->regex_.ok(),
+                  Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
+
+  *holder = built;
+  return Status::OK();
+}
+
+// Build a holder from a SQL pattern, compiling the converted pattern with the
+// supplied RE2 options. `*holder` is only written when the pattern compiles.
+Status LikeHolder::Make(const std::string& sql_pattern,
+                        std::shared_ptr<LikeHolder>* holder, RE2::Options regex_op) {
+  // Convert the SQL pattern into the regex text handed to RE2.
+  std::string pcre_pattern;
+  ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern));
+
+  std::shared_ptr<LikeHolder> built(new LikeHolder(pcre_pattern, regex_op));
+  ARROW_RETURN_IF(!built->regex_.ok(),
+                  Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
+
+  *holder = built;
+  return Status::OK();
+}
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/like_holder.h b/src/arrow/cpp/src/gandiva/like_holder.h
new file mode 100644
index 000000000..73e58017d
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/like_holder.h
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include <re2/re2.h>
+
+#include "arrow/status.h"
+
+#include "gandiva/function_holder.h"
+#include "gandiva/node.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// Function Holder for SQL 'like'
+///
+/// Owns an RE2 regex built from a pattern string. Instances are created through
+/// the Make() factories; the constructors are private.
+class GANDIVA_EXPORT LikeHolder : public FunctionHolder {
+ public:
+  ~LikeHolder() override = default;
+
+  /// Build a holder from a function node.
+  /// \param[in] node function node carrying the pattern (and optional escape)
+  /// \param[out] holder receives the new holder
+  static Status Make(const FunctionNode& node, std::shared_ptr<LikeHolder>* holder);
+
+  /// Build a holder from a SQL pattern.
+  /// \param[in] sql_pattern the SQL 'like' pattern
+  /// \param[out] holder receives the new holder
+  static Status Make(const std::string& sql_pattern, std::shared_ptr<LikeHolder>* holder);
+
+  /// Build a holder from a SQL pattern and an escape character.
+  /// \param[in] sql_pattern the SQL 'like' pattern
+  /// \param[in] escape_char the escape character, passed as a string
+  /// \param[out] holder receives the new holder
+  static Status Make(const std::string& sql_pattern, const std::string& escape_char,
+                     std::shared_ptr<LikeHolder>* holder);
+
+  /// Build a holder from a SQL pattern using the given RE2 options.
+  /// \param[in] sql_pattern the SQL 'like' pattern
+  /// \param[out] holder receives the new holder
+  /// \param[in] regex_op options passed to the RE2 constructor
+  static Status Make(const std::string& sql_pattern, std::shared_ptr<LikeHolder>* holder,
+                     RE2::Options regex_op);
+
+  // Try and optimise a function node with a "like" pattern.
+  static const FunctionNode TryOptimize(const FunctionNode& node);
+
+  /// Return true if the data matches the pattern.
+  /// The whole of `data` must match (RE2::FullMatch). Matching does not modify
+  /// the holder, so the operator is const.
+  bool operator()(const std::string& data) const { return RE2::FullMatch(data, regex_); }
+
+ private:
+  explicit LikeHolder(const std::string& pattern) : pattern_(pattern), regex_(pattern) {}
+
+  LikeHolder(const std::string& pattern, RE2::Options regex_op)
+      : pattern_(pattern), regex_(pattern, regex_op) {}
+
+  std::string pattern_;  // regex text that regex_ was built from, to help debugging
+  RE2 regex_;            // compiled regex for the pattern
+
+  static RE2 starts_with_regex_;  // pre-compiled pattern for matching starts_with
+  static RE2 ends_with_regex_;    // pre-compiled pattern for matching ends_with
+  static RE2 is_substr_regex_;    // pre-compiled pattern for matching is_substr
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/like_holder_test.cc b/src/arrow/cpp/src/gandiva/like_holder_test.cc
new file mode 100644
index 000000000..a52533a11
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/like_holder_test.cc
@@ -0,0 +1,281 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/like_holder.h"
+#include "gandiva/regex_util.h"
+
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+namespace gandiva {
+
+/// Fixture providing default RE2 options and builders for 'like' function nodes.
+class TestLikeHolder : public ::testing::Test {
+ public:
+  RE2::Options regex_op;
+
+  /// like(in, pattern) over a utf8 field named "in".
+  FunctionNode BuildLike(std::string pattern) {
+    auto input_node = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
+    auto literal_node =
+        std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false);
+    return FunctionNode("like", {input_node, literal_node}, arrow::boolean());
+  }
+
+  /// like(in, pattern, escape_char); the escape literal is built with int8 type.
+  FunctionNode BuildLike(std::string pattern, char escape_char) {
+    auto input_node = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
+    auto literal_node =
+        std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false);
+    auto escape_node = std::make_shared<LiteralNode>(
+        arrow::int8(), LiteralHolder(static_cast<int8_t>(escape_char)), false);
+    return FunctionNode("like", {input_node, literal_node, escape_node},
+                        arrow::boolean());
+  }
+};
+
+// '%' matches any run of characters, including an empty one.
+TEST_F(TestLikeHolder, TestMatchAny) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  auto status = LikeHolder::Make("ab%", &like_holder, regex_op);
+  // Fatal assertion: like_holder is only set on success and is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  auto& like = *like_holder;
+  EXPECT_TRUE(like("ab"));
+  EXPECT_TRUE(like("abc"));
+  EXPECT_TRUE(like("abcd"));
+
+  EXPECT_FALSE(like("a"));
+  EXPECT_FALSE(like("cab"));
+}
+
+// '_' matches exactly one character.
+TEST_F(TestLikeHolder, TestMatchOne) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  auto status = LikeHolder::Make("ab_", &like_holder, regex_op);
+  // Fatal assertion: like_holder is only set on success and is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  auto& like = *like_holder;
+  EXPECT_TRUE(like("abc"));
+  EXPECT_TRUE(like("abd"));
+
+  EXPECT_FALSE(like("a"));
+  EXPECT_FALSE(like("abcd"));
+  EXPECT_FALSE(like("dabc"));
+}
+
+// Regex metacharacters in the SQL pattern are matched literally.
+TEST_F(TestLikeHolder, TestPcreSpecial) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  auto status = LikeHolder::Make(".*ab_", &like_holder, regex_op);
+  // Fatal assertion: like_holder is only set on success and is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  auto& like = *like_holder;
+  EXPECT_TRUE(like(".*abc"));  // . and * aren't special in sql regex
+  EXPECT_FALSE(like("xxabc"));
+}
+
+// With '#' as the escape character, "#%", "#_" and "##" come out as the literal
+// characters '%', '_' and '#', while the unescaped '_' becomes '.'.
+TEST_F(TestLikeHolder, TestRegexEscape) {
+  std::string converted;
+  auto status = RegexUtil::SqlLikePatternToPcre("#%hello#_abc_def##", '#', converted);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  EXPECT_EQ(converted, "%hello_abc.def#");
+}
+
+// A '.' in the SQL pattern does not act as a single-character wildcard.
+TEST_F(TestLikeHolder, TestDot) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  auto status = LikeHolder::Make("abc.", &like_holder, regex_op);
+  // Fatal assertion: like_holder is only set on success and is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  auto& like = *like_holder;
+  EXPECT_FALSE(like("abcd"));
+}
+
+// Checks which 'like' patterns TryOptimize() rewrites into starts_with,
+// ends_with or is_substr, and which ones it leaves as 'like'.
+TEST_F(TestLikeHolder, TestOptimise) {
+  // optimise for 'starts_with'
+  auto prefix_node = LikeHolder::TryOptimize(BuildLike("xy 123z%"));
+  EXPECT_EQ(prefix_node.descriptor()->name(), "starts_with");
+  EXPECT_EQ(prefix_node.ToString(),
+            "bool starts_with((string) in, (const string) xy 123z)");
+
+  // optimise for 'ends_with'
+  auto suffix_node = LikeHolder::TryOptimize(BuildLike("%xyz"));
+  EXPECT_EQ(suffix_node.descriptor()->name(), "ends_with");
+  EXPECT_EQ(suffix_node.ToString(), "bool ends_with((string) in, (const string) xyz)");
+
+  // optimise for 'is_substr'
+  auto substr_node = LikeHolder::TryOptimize(BuildLike("%abc%"));
+  EXPECT_EQ(substr_node.descriptor()->name(), "is_substr");
+  EXPECT_EQ(substr_node.ToString(), "bool is_substr((string) in, (const string) abc)");
+
+  // no optimisation for others.
+  for (const char* pattern : {"xyz_", "_xyz", "_xyz_", "%xyz_", "x_yz%"}) {
+    auto unchanged = LikeHolder::TryOptimize(BuildLike(pattern));
+    EXPECT_EQ(unchanged.descriptor()->name(), "like");
+  }
+
+  // no optimisation for escaped pattern.
+  // NOTE(review): BuildLike() creates the escape literal with int8 type, which
+  // the node-based Make() rejects as a non-string third parameter, so this case
+  // appears to pass because no holder is built - confirm that is intended.
+  auto escaped_node = LikeHolder::TryOptimize(BuildLike("\\%xyz", '\\'));
+  EXPECT_EQ(escaped_node.descriptor()->name(), "like");
+  EXPECT_EQ(escaped_node.ToString(),
+            "bool like((string) in, (const string) \\%xyz, (const int8) \\)");
+}
+
+// An escaped '_' matches only a literal underscore.
+TEST_F(TestLikeHolder, TestMatchOneEscape) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  auto status = LikeHolder::Make("ab\\_", "\\", &like_holder);
+  // Fatal assertion: like_holder is only set on success and is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  auto& like = *like_holder;
+
+  EXPECT_TRUE(like("ab_"));
+
+  EXPECT_FALSE(like("abc"));
+  EXPECT_FALSE(like("abd"));
+  EXPECT_FALSE(like("a"));
+  EXPECT_FALSE(like("abcd"));
+  EXPECT_FALSE(like("dabc"));
+}
+
+// An escaped '%' matches only a literal percent sign.
+TEST_F(TestLikeHolder, TestMatchManyEscape) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  auto status = LikeHolder::Make("ab\\%", "\\", &like_holder);
+  // Fatal assertion: like_holder is only set on success and is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  auto& like = *like_holder;
+
+  EXPECT_TRUE(like("ab%"));
+
+  EXPECT_FALSE(like("abc"));
+  EXPECT_FALSE(like("abd"));
+  EXPECT_FALSE(like("a"));
+  EXPECT_FALSE(like("abcd"));
+  EXPECT_FALSE(like("dabc"));
+}
+
+// A doubled escape character matches one literal escape character.
+TEST_F(TestLikeHolder, TestMatchEscape) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  auto status = LikeHolder::Make("ab\\\\", "\\", &like_holder);
+  // Fatal assertion: like_holder is only set on success and is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  auto& like = *like_holder;
+
+  EXPECT_TRUE(like("ab\\"));
+
+  EXPECT_FALSE(like("abc"));
+}
+
+// With an empty escape string the backslash is an ordinary character and '_'
+// keeps its single-character wildcard meaning.
+TEST_F(TestLikeHolder, TestEmptyEscapeChar) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  auto status = LikeHolder::Make("ab\\_", "", &like_holder);
+  // Fatal assertion: like_holder is only set on success and is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  auto& like = *like_holder;
+
+  EXPECT_TRUE(like("ab\\c"));
+  EXPECT_TRUE(like("ab\\_"));
+
+  EXPECT_FALSE(like("ab\\_d"));
+  EXPECT_FALSE(like("ab__"));
+}
+
+// An escape string longer than one character must be rejected.
+TEST_F(TestLikeHolder, TestMultipleEscapeChar) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  auto status = LikeHolder::Make("ab\\_", "\\\\", &like_holder);
+  EXPECT_FALSE(status.ok()) << status.message();
+}
+/// Fixture providing RE2 options and a builder for 'ilike' function nodes.
+class TestILikeHolder : public ::testing::Test {
+ public:
+  RE2::Options regex_op;
+
+  /// ilike(in, pattern) over a utf8 field named "in".
+  FunctionNode BuildILike(std::string pattern) {
+    auto input_node = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
+    auto literal_node =
+        std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false);
+    return FunctionNode("ilike", {input_node, literal_node}, arrow::boolean());
+  }
+};
+
+// Case-insensitive variant: '%' matches any run of characters.
+TEST_F(TestILikeHolder, TestMatchAny) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  regex_op.set_case_sensitive(false);
+  auto status = LikeHolder::Make("ab%", &like_holder, regex_op);
+  // Fatal assertion: like_holder is only set on success and is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  auto& like = *like_holder;
+  EXPECT_TRUE(like("ab"));
+  EXPECT_TRUE(like("aBc"));
+  EXPECT_TRUE(like("ABCD"));
+
+  EXPECT_FALSE(like("a"));
+  EXPECT_FALSE(like("cab"));
+}
+
+// Case-insensitive variant: '_' matches exactly one character.
+TEST_F(TestILikeHolder, TestMatchOne) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  regex_op.set_case_sensitive(false);
+  auto status = LikeHolder::Make("Ab_", &like_holder, regex_op);
+  // Fatal assertion: like_holder is only set on success and is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  auto& like = *like_holder;
+  EXPECT_TRUE(like("abc"));
+  EXPECT_TRUE(like("aBd"));
+
+  EXPECT_FALSE(like("A"));
+  EXPECT_FALSE(like("Abcd"));
+  EXPECT_FALSE(like("DaBc"));
+}
+
+// Case-insensitive variant: regex metacharacters are matched literally.
+TEST_F(TestILikeHolder, TestPcreSpecial) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  regex_op.set_case_sensitive(false);
+  auto status = LikeHolder::Make(".*aB_", &like_holder, regex_op);
+  // Fatal assertion: like_holder is only set on success and is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  auto& like = *like_holder;
+  EXPECT_TRUE(like(".*Abc"));  // . and * aren't special in sql regex
+  EXPECT_FALSE(like("xxAbc"));
+}
+
+// Case-insensitive variant: '.' does not act as a single-character wildcard.
+TEST_F(TestILikeHolder, TestDot) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  regex_op.set_case_sensitive(false);
+  auto status = LikeHolder::Make("aBc.", &like_holder, regex_op);
+  // Fatal assertion: like_holder is only set on success and is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  auto& like = *like_holder;
+  EXPECT_FALSE(like("abcd"));
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/literal_holder.cc b/src/arrow/cpp/src/gandiva/literal_holder.cc
new file mode 100644
index 000000000..beed8119c
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/literal_holder.cc
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <sstream>
+
+#include "gandiva/literal_holder.h"
+
+namespace gandiva {
+
+namespace {
+
+// Visitor that writes whichever value it is called with to the bound stream
+// using operator<<. Kept as an aggregate so it can be brace-initialised with
+// the stream.
+template <typename OStream>
+struct LiteralToStream {
+  OStream& ostream_;
+
+  template <typename T>
+  void operator()(const T& value) {
+    ostream_ << value;
+  }
+};
+
+} // namespace
+
+// Formats the value currently stored in `holder` by streaming it with
+// operator<< and returns the resulting text.
+std::string ToString(const LiteralHolder& holder) {
+  std::stringstream buffer;
+  LiteralToStream<std::stringstream> to_stream{buffer};
+  ::arrow::util::visit(to_stream, holder);
+  return buffer.str();
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/literal_holder.h b/src/arrow/cpp/src/gandiva/literal_holder.h
new file mode 100644
index 000000000..c4712aafc
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/literal_holder.h
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include <arrow/util/variant.h>
+
+#include <arrow/type.h>
+#include "gandiva/decimal_scalar.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// Storage for a literal value: a variant over bool, the floating point and
+/// fixed-width integer types listed below, std::string and DecimalScalar128.
+using LiteralHolder =
+ arrow::util::Variant<bool, float, double, int8_t, int16_t, int32_t, int64_t, uint8_t,
+ uint16_t, uint32_t, uint64_t, std::string, DecimalScalar128>;
+
+/// Return a string rendering of the value currently stored in `holder`.
+GANDIVA_EXPORT std::string ToString(const LiteralHolder& holder);
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/llvm_generator.cc b/src/arrow/cpp/src/gandiva/llvm_generator.cc
new file mode 100644
index 000000000..0129e5278
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/llvm_generator.cc
@@ -0,0 +1,1400 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/llvm_generator.h"
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "gandiva/bitmap_accumulator.h"
+#include "gandiva/decimal_ir.h"
+#include "gandiva/dex.h"
+#include "gandiva/expr_decomposer.h"
+#include "gandiva/expression.h"
+#include "gandiva/lvalue.h"
+
+namespace gandiva {
+
+#define ADD_TRACE(...) \
+ if (enable_ir_traces_) { \
+ AddTrace(__VA_ARGS__); \
+ }
+
+// Constructs a generator with IR tracing (enable_ir_traces_) switched off.
+LLVMGenerator::LLVMGenerator() : enable_ir_traces_(false) {}
+
+/// Factory: creates a generator and its Engine (built from `config`), then
+/// transfers ownership of the generator to `*llvm_generator`.
+/// If Engine::Make fails its status is returned and `*llvm_generator` is not
+/// assigned.
+Status LLVMGenerator::Make(std::shared_ptr<Configuration> config,
+ std::unique_ptr<LLVMGenerator>* llvm_generator) {
+ std::unique_ptr<LLVMGenerator> llvmgen_obj(new LLVMGenerator());
+
+ // The engine is created directly into the new generator's engine_ member.
+ ARROW_RETURN_NOT_OK(Engine::Make(config, &(llvmgen_obj->engine_)));
+ *llvm_generator = std::move(llvmgen_obj);
+
+ return Status::OK();
+}
+
+/// Decompose `expr`, generate its IR function and append the result to
+/// compiled_exprs_.
+/// \param[in] expr expression to add
+/// \param[in] output field descriptor the expression's result is written to
+/// \return the first error from decomposition or code generation, else OK
+Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr output) {
+ // Position this expression will take in compiled_exprs_; passed to
+ // CodeGenExprValue as the suffix index.
+ int idx = static_cast<int>(compiled_exprs_.size());
+ // decompose the expression to separate out value and validities.
+ ExprDecomposer decomposer(function_registry_, annotator_);
+ ValueValidityPairPtr value_validity;
+ ARROW_RETURN_NOT_OK(decomposer.Decompose(*expr->root(), &value_validity));
+ // Generate the IR function for the decomposed expression.
+ std::unique_ptr<CompiledExpr> compiled_expr(new CompiledExpr(value_validity, output));
+ llvm::Function* ir_function = nullptr;
+ ARROW_RETURN_NOT_OK(CodeGenExprValue(value_validity->value_expr(),
+ annotator_.buffer_count(), output, idx,
+ &ir_function, selection_vector_mode_));
+ // Record the IR function under the selection vector mode it was generated for.
+ compiled_expr->SetIRFunction(selection_vector_mode_, ir_function);
+
+ compiled_exprs_.push_back(std::move(compiled_expr));
+ return Status::OK();
+}
+
+/// Build and optimise module for projection expression.
+///
+/// Records `mode` as the generator's selection vector mode, adds every
+/// expression in `exprs`, finalizes the engine's module and then stores the
+/// JIT function of each compiled expression.
+/// \return the first error from Add() or from finalizing the module, else OK
+Status LLVMGenerator::Build(const ExpressionVector& exprs, SelectionVector::Mode mode) {
+ selection_vector_mode_ = mode;
+ for (auto& expr : exprs) {
+ auto output = annotator_.AddOutputFieldDescriptor(expr->result());
+ ARROW_RETURN_NOT_OK(Add(expr, output));
+ }
+
+ // Compile and inject into the process' memory the generated function.
+ ARROW_RETURN_NOT_OK(engine_->FinalizeModule());
+
+ // setup the jit functions for each expression.
+ // `mode` and selection_vector_mode_ hold the same value here (assigned above).
+ for (auto& compiled_expr : compiled_exprs_) {
+ auto ir_fn = compiled_expr->GetIRFunction(mode);
+ auto jit_fn = reinterpret_cast<EvalFunc>(engine_->CompiledFunction(ir_fn));
+ compiled_expr->SetJITFunction(selection_vector_mode_, jit_fn);
+ }
+
+ return Status::OK();
+}
+
+/// Execute the compiled module against the provided vectors.
+/// Delegates to the selection-vector overload with a null selection vector.
+Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch,
+ const ArrayDataVector& output_vector) {
+ return Execute(record_batch, nullptr, output_vector);
+}
+
+/// Execute the compiled module against the provided vectors based on the type of
+/// selection vector.
+///
+/// \param[in] record_batch input batch; expected to have at least one row
+/// \param[in] selection_vector optional selection vector, may be null
+/// \param[in] output_vector output arrays handed to the annotator
+/// \return Invalid if the selection vector's mode differs from the mode the
+///         expressions were built for, ExecutionError if the execution context
+///         reports an error after an expression ran, else OK
+Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch,
+                              const SelectionVector* selection_vector,
+                              const ArrayDataVector& output_vector) {
+  DCHECK_GT(record_batch.num_rows(), 0);
+
+  auto eval_batch = annotator_.PrepareEvalBatch(record_batch, output_vector);
+  DCHECK_GT(eval_batch->GetNumBuffers(), 0);
+
+  auto mode = SelectionVector::MODE_NONE;
+  if (selection_vector != nullptr) {
+    mode = selection_vector->GetMode();
+  }
+  if (mode != selection_vector_mode_) {
+    return Status::Invalid("llvm expression built for selection vector mode ",
+                           selection_vector_mode_, " received vector with mode ", mode);
+  }
+
+  // The selection buffer and the number of output rows are the same for every
+  // expression, so resolve them once instead of once per loop iteration.
+  const uint8_t* selection_buffer = nullptr;
+  auto num_output_rows = record_batch.num_rows();
+  if (selection_vector != nullptr) {
+    selection_buffer = selection_vector->GetBuffer().data();
+    num_output_rows = selection_vector->GetNumSlots();
+  }
+
+  for (auto& compiled_expr : compiled_exprs_) {
+    // generate data/offset vectors.
+    EvalFunc jit_function = compiled_expr->GetJITFunction(mode);
+    // The execution context is handed to the generated code as an integer.
+    jit_function(eval_batch->GetBufferArray(), eval_batch->GetBufferOffsetArray(),
+                 eval_batch->GetLocalBitMapArray(), selection_buffer,
+                 reinterpret_cast<int64_t>(eval_batch->GetExecutionContext()),
+                 num_output_rows);
+
+    // check for execution errors
+    ARROW_RETURN_IF(
+        eval_batch->GetExecutionContext()->has_error(),
+        Status::ExecutionError(eval_batch->GetExecutionContext()->get_error()));
+
+    // generate validity vectors.
+    ComputeBitMapsForExpr(*compiled_expr, *eval_batch, selection_vector);
+  }
+
+  return Status::OK();
+}
+
+/// Emit IR that loads the element at position `idx` of the array `arg_addrs`.
+/// `name` prefixes the names given to the emitted address and load values.
+llvm::Value* LLVMGenerator::LoadVectorAtIndex(llvm::Value* arg_addrs, int idx,
+                                              const std::string& name) {
+  auto* slot_index = types()->i32_constant(idx);
+  auto* slot_addr = CreateGEP(ir_builder(), arg_addrs, slot_index, name + "_mem_addr");
+  return CreateLoad(ir_builder(), slot_addr, name + "_mem");
+}
+
+/// Get reference to validity array at specified index in the args list.
+/// The loaded address is cast to an i64 pointer named "<field name>_varray".
+llvm::Value* LLVMGenerator::GetValidityReference(llvm::Value* arg_addrs, int idx,
+                                                 FieldPtr field) {
+  llvm::Value* raw_addr = LoadVectorAtIndex(arg_addrs, idx, field->name());
+  return ir_builder()->CreateIntToPtr(raw_addr, types()->i64_ptr_type(),
+                                      field->name() + "_varray");
+}
+
+/// Get reference to the data buffer pointer at specified index in the args list.
+/// The loaded address is cast to an i8 pointer named "<field name>_buf_ptr".
+llvm::Value* LLVMGenerator::GetDataBufferPtrReference(llvm::Value* arg_addrs, int idx,
+                                                      FieldPtr field) {
+  llvm::Value* raw_addr = LoadVectorAtIndex(arg_addrs, idx, field->name());
+  return ir_builder()->CreateIntToPtr(raw_addr, types()->i8_ptr_type(),
+                                      field->name() + "_buf_ptr");
+}
+
+/// Get reference to data array at specified index in the args list.
+/// The loaded address is cast to the field's data-vector type when that type is
+/// already a pointer, and to a pointer to that type otherwise.
+llvm::Value* LLVMGenerator::GetDataReference(llvm::Value* arg_addrs, int idx,
+                                             FieldPtr field) {
+  const std::string& name = field->name();
+  llvm::Value* load = LoadVectorAtIndex(arg_addrs, idx, name);
+
+  llvm::Type* target_type = types()->DataVecType(field->type());
+  if (!target_type->isPointerTy()) {
+    target_type = types()->ptr_type(target_type);
+  }
+  return ir_builder()->CreateIntToPtr(load, target_type, name + "_darray");
+}
+
+/// Get reference to offsets array at specified index in the args list.
+/// The loaded address is cast to an i32 pointer named "<field name>_oarray".
+llvm::Value* LLVMGenerator::GetOffsetsReference(llvm::Value* arg_addrs, int idx,
+                                                FieldPtr field) {
+  llvm::Value* raw_addr = LoadVectorAtIndex(arg_addrs, idx, field->name());
+  return ir_builder()->CreateIntToPtr(raw_addr, types()->i32_ptr_type(),
+                                      field->name() + "_oarray");
+}
+
+/// Get reference to local bitmap array at specified index in the args list.
+/// The loaded address is cast to an i64 pointer named "<idx>_lbmap".
+llvm::Value* LLVMGenerator::GetLocalBitMapReference(llvm::Value* arg_bitmaps, int idx) {
+  const std::string bitmap_name = std::to_string(idx) + "_lbmap";
+  llvm::Value* raw_addr = LoadVectorAtIndex(arg_bitmaps, idx, "");
+  return ir_builder()->CreateIntToPtr(raw_addr, types()->i64_ptr_type(), bitmap_name);
+}
+
+/// \brief Generate code for one expression.
+
+// Sample IR code for "c1:int + c2:int"
+//
+// The C-code equivalent is :
+// ------------------------------
+// int expr_0(int64_t *addrs, int64_t *local_bitmaps,
+// int64_t execution_context_ptr, int64_t nrecords) {
+// int *outVec = (int *) addrs[5];
+// int *c0Vec = (int *) addrs[1];
+// int *c1Vec = (int *) addrs[3];
+// for (int loop_var = 0; loop_var < nrecords; ++loop_var) {
+// int c0 = c0Vec[loop_var];
+// int c1 = c1Vec[loop_var];
+// int out = c0 + c1;
+// outVec[loop_var] = out;
+// }
+// }
+//
+// IR Code
+// --------
+//
+// define i32 @expr_0(i64* %args, i64* %local_bitmaps, i64 %execution_context_ptr,
+//                    i64 %nrecords) {
+// entry:
+// %outmemAddr = getelementptr i64, i64* %args, i32 5
+// %outmem = load i64, i64* %outmemAddr
+// %outVec = inttoptr i64 %outmem to i32*
+// %c0memAddr = getelementptr i64, i64* %args, i32 1
+// %c0mem = load i64, i64* %c0memAddr
+// %c0Vec = inttoptr i64 %c0mem to i32*
+// %c1memAddr = getelementptr i64, i64* %args, i32 3
+// %c1mem = load i64, i64* %c1memAddr
+// %c1Vec = inttoptr i64 %c1mem to i32*
+// br label %loop
+// loop: ; preds = %loop, %entry
+// %loop_var = phi i64 [ 0, %entry ], [ %"loop_var+1", %loop ]
+// %"loop_var+1" = add i64 %loop_var, 1
+// %0 = getelementptr i32, i32* %c0Vec, i32 %loop_var
+// %c0 = load i32, i32* %0
+// %1 = getelementptr i32, i32* %c1Vec, i32 %loop_var
+// %c1 = load i32, i32* %1
+// %add_int_int = call i32 @add_int_int(i32 %c0, i32 %c1)
+// %2 = getelementptr i32, i32* %outVec, i32 %loop_var
+// store i32 %add_int_int, i32* %2
+// %"loop_var < nrec" = icmp slt i64 %"loop_var+1", %nrecords
+// br i1 %"loop_var < nrec", label %loop, label %exit
+// exit: ; preds = %loop
+// ret i32 0
+// }
+Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count,
+ FieldDescriptorPtr output, int suffix_idx,
+ llvm::Function** fn,
+ SelectionVector::Mode selection_vector_mode) {
+ llvm::IRBuilder<>* builder = ir_builder();
+ // Create fn prototype (six arguments, in the order pushed below) :
+ // int expr_<suffix_idx>_<mode> (i64 *addrs, i64 *offsets, i64 *bitmaps,
+ // <i16|i32|i64> *selection_vector, i64 context_ptr, i64 nrec)
+ std::vector<llvm::Type*> arguments;
+ arguments.push_back(types()->i64_ptr_type()); // addrs
+ arguments.push_back(types()->i64_ptr_type()); // offsets
+ arguments.push_back(types()->i64_ptr_type()); // bitmaps
+ switch (selection_vector_mode) {
+ case SelectionVector::MODE_NONE: // shares the i16* slot; the vector is only read below when mode != MODE_NONE
+ case SelectionVector::MODE_UINT16:
+ arguments.push_back(types()->ptr_type(types()->i16_type()));
+ break;
+ case SelectionVector::MODE_UINT32:
+ arguments.push_back(types()->i32_ptr_type());
+ break;
+ case SelectionVector::MODE_UINT64:
+ arguments.push_back(types()->i64_ptr_type());
+ }
+ arguments.push_back(types()->i64_type()); // ctx_ptr
+ arguments.push_back(types()->i64_type()); // nrec
+ llvm::FunctionType* prototype =
+ llvm::FunctionType::get(types()->i32_type(), arguments, false /*isVarArg*/);
+
+ // Create fn : the name encodes both the expression index and the selection vector mode.
+ std::string func_name = "expr_" + std::to_string(suffix_idx) + "_" +
+ std::to_string(static_cast<int>(selection_vector_mode));
+ engine_->AddFunctionToCompile(func_name);
+ *fn = llvm::Function::Create(prototype, llvm::GlobalValue::ExternalLinkage, func_name,
+ module());
+ ARROW_RETURN_IF((*fn == nullptr), Status::CodeGenError("Error creating function."));
+
+ // Name the arguments (same order as the prototype above)
+ llvm::Function::arg_iterator args = (*fn)->arg_begin();
+ llvm::Value* arg_addrs = &*args;
+ arg_addrs->setName("inputs_addr");
+ ++args;
+ llvm::Value* arg_addr_offsets = &*args;
+ arg_addr_offsets->setName("inputs_addr_offsets");
+ ++args;
+ llvm::Value* arg_local_bitmaps = &*args;
+ arg_local_bitmaps->setName("local_bitmaps");
+ ++args;
+ llvm::Value* arg_selection_vector = &*args;
+ arg_selection_vector->setName("selection_vector");
+ ++args;
+ llvm::Value* arg_context_ptr = &*args;
+ arg_context_ptr->setName("context_ptr");
+ ++args;
+ llvm::Value* arg_nrecords = &*args;
+ arg_nrecords->setName("nrecords");
+
+ llvm::BasicBlock* loop_entry = llvm::BasicBlock::Create(*context(), "entry", *fn);
+ llvm::BasicBlock* loop_body = llvm::BasicBlock::Create(*context(), "loop", *fn);
+ llvm::BasicBlock* loop_exit = llvm::BasicBlock::Create(*context(), "exit", *fn);
+
+ // Add reference to output vector (in entry block)
+ builder->SetInsertPoint(loop_entry);
+ llvm::Value* output_ref =
+ GetDataReference(arg_addrs, output->data_idx(), output->field());
+ llvm::Value* output_buffer_ptr_ref = GetDataBufferPtrReference(
+ arg_addrs, output->data_buffer_ptr_idx(), output->field());
+ llvm::Value* output_offset_ref =
+ GetOffsetsReference(arg_addrs, output->offsets_idx(), output->field());
+
+ std::vector<llvm::Value*> slice_offsets;
+ for (int idx = 0; idx < buffer_count; idx++) { // still in the entry block: one load per buffer
+ auto offsetAddr = CreateGEP(builder, arg_addr_offsets, types()->i32_constant(idx));
+ auto offset = CreateLoad(builder, offsetAddr);
+ slice_offsets.push_back(offset);
+ }
+
+ // Loop body
+ builder->SetInsertPoint(loop_body);
+
+ // define loop_var : start with 0, +1 after each iter
+ llvm::PHINode* loop_var = builder->CreatePHI(types()->i64_type(), 2, "loop_var");
+
+ llvm::Value* position_var = loop_var;
+ if (selection_vector_mode != SelectionVector::MODE_NONE) { // position_var = (i64) selection_vector[loop_var]
+ position_var = builder->CreateIntCast(
+ CreateLoad(builder, CreateGEP(builder, arg_selection_vector, loop_var),
+ "uncasted_position_var"),
+ types()->i64_type(), true, "position_var"); // 'true' is the isSigned flag of CreateIntCast
+ }
+
+ // The visitor can add code to both the entry/loop blocks.
+ Visitor visitor(this, *fn, loop_entry, arg_addrs, arg_local_bitmaps, slice_offsets,
+ arg_context_ptr, position_var);
+ value_expr->Accept(visitor);
+ LValuePtr output_value = visitor.result();
+
+ // The "current" block may have changed due to code generation in the visitor.
+ llvm::BasicBlock* loop_body_tail = builder->GetInsertBlock();
+
+ // add jump to "loop block" at the end of the "setup block".
+ builder->SetInsertPoint(loop_entry);
+ builder->CreateBr(loop_body);
+
+ // save the value in the output vector.
+ builder->SetInsertPoint(loop_body_tail);
+
+ auto output_type_id = output->Type()->id(); // every store path below writes at index loop_var, not position_var
+ if (output_type_id == arrow::Type::BOOL) {
+ SetPackedBitValue(output_ref, loop_var, output_value->data());
+ } else if (arrow::is_primitive(output_type_id) ||
+ output_type_id == arrow::Type::DECIMAL) {
+ llvm::Value* slot_offset = CreateGEP(builder, output_ref, loop_var);
+ builder->CreateStore(output_value->data(), slot_offset);
+ } else if (arrow::is_binary_like(output_type_id)) {
+ // Var-len output. Make a function call to populate the data.
+ // if there is an error, the fn sets it in the context. And, will be returned at the
+ // end of this row batch.
+ AddFunctionCall("gdv_fn_populate_varlen_vector", types()->i32_type(),
+ {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, loop_var,
+ output_value->data(), output_value->length()});
+ } else {
+ return Status::NotImplemented("output type ", output->Type()->ToString(),
+ " not supported");
+ }
+ ADD_TRACE("saving result " + output->Name() + " value %T", output_value->data());
+
+ if (visitor.has_arena_allocs()) {
+ // Reset allocations to avoid excessive memory usage. Once the result is copied to
+ // the output vector (store instruction above), any memory allocations in this
+ // iteration of the loop are no longer needed.
+ std::vector<llvm::Value*> reset_args;
+ reset_args.push_back(arg_context_ptr);
+ AddFunctionCall("gdv_fn_context_arena_reset", types()->void_type(), reset_args);
+ }
+
+ // check loop_var : the phi is 0 when arriving from entry, loop_var+1 when arriving from the loop tail
+ loop_var->addIncoming(types()->i64_constant(0), loop_entry);
+ llvm::Value* loop_update =
+ builder->CreateAdd(loop_var, types()->i64_constant(1), "loop_var+1");
+ loop_var->addIncoming(loop_update, loop_body_tail);
+
+ llvm::Value* loop_var_check =
+ builder->CreateICmpSLT(loop_update, arg_nrecords, "loop_var < nrec");
+ builder->CreateCondBr(loop_var_check, loop_body, loop_exit); // tested after the body: entry branches to the loop unconditionally
+
+ // Loop exit
+ builder->SetInsertPoint(loop_exit);
+ builder->CreateRet(types()->i32_constant(0)); // the generated function always returns 0
+ return Status::OK();
+}
+
+/// Return value of a bit in bitMap.
+llvm::Value* LLVMGenerator::GetPackedBitValue(llvm::Value* bitmap,
+                                              llvm::Value* position) {
+  ADD_TRACE("fetch bit at position %T", position);
+
+  // The bitmap is handed to bitMapGetBit as an i8*.
+  llvm::Type* byte_ptr_type = types()->ptr_type(types()->i8_type());
+  llvm::Value* bitmap_bytes =
+      ir_builder()->CreateBitCast(bitmap, byte_ptr_type, "bitMapCast");
+  std::vector<llvm::Value*> call_args = {bitmap_bytes, position};
+  return AddFunctionCall("bitMapGetBit", types()->i1_type(), call_args);
+}
+
+/// Set the value of a bit in bitMap.
+void LLVMGenerator::SetPackedBitValue(llvm::Value* bitmap, llvm::Value* position,
+                                      llvm::Value* value) {
+  ADD_TRACE("set bit at position %T", position);
+  ADD_TRACE(" to value %T ", value);
+
+  // The bitmap is handed to bitMapSetBit as an i8*.
+  llvm::Type* byte_ptr_type = types()->ptr_type(types()->i8_type());
+  llvm::Value* bitmap_bytes =
+      ir_builder()->CreateBitCast(bitmap, byte_ptr_type, "bitMapCast");
+  std::vector<llvm::Value*> call_args = {bitmap_bytes, position, value};
+  AddFunctionCall("bitMapSetBit", types()->void_type(), call_args);
+}
+
+/// Return value of a bit in validity bitMap (handles null bitmaps too).
+llvm::Value* LLVMGenerator::GetPackedValidityBitValue(llvm::Value* bitmap,
+                                                      llvm::Value* position) {
+  ADD_TRACE("fetch validity bit at position %T", position);
+
+  // The bitmap is handed to bitMapValidityGetBit as an i8*.
+  llvm::Type* byte_ptr_type = types()->ptr_type(types()->i8_type());
+  llvm::Value* bitmap_bytes =
+      ir_builder()->CreateBitCast(bitmap, byte_ptr_type, "bitMapCast");
+  std::vector<llvm::Value*> call_args = {bitmap_bytes, position};
+  return AddFunctionCall("bitMapValidityGetBit", types()->i1_type(), call_args);
+}
+
+/// Clear the bit in bitMap if value = false.
+void LLVMGenerator::ClearPackedBitValueIfFalse(llvm::Value* bitmap, llvm::Value* position,
+                                               llvm::Value* value) {
+  ADD_TRACE("ClearIfFalse bit at position %T", position);
+  ADD_TRACE(" value %T ", value);
+
+  // The bitmap is handed to bitMapClearBitIfFalse as an i8*.
+  llvm::Type* byte_ptr_type = types()->ptr_type(types()->i8_type());
+  llvm::Value* bitmap_bytes =
+      ir_builder()->CreateBitCast(bitmap, byte_ptr_type, "bitMapCast");
+  std::vector<llvm::Value*> call_args = {bitmap_bytes, position, value};
+  AddFunctionCall("bitMapClearBitIfFalse", types()->void_type(), call_args);
+}
+
+/// Extract the bitmap addresses, and do an intersection.
+void LLVMGenerator::ComputeBitMapsForExpr(const CompiledExpr& compiled_expr,
+                                          const EvalBatch& eval_batch,
+                                          const SelectionVector* selection_vector) {
+  auto validity_exprs = compiled_expr.value_validity()->validity_exprs();
+
+  // Let the accumulator visit every validity expression so it can collect the
+  // source bitmap addresses.
+  BitMapAccumulator accumulator(eval_batch);
+  for (auto& validity_dex : validity_exprs) {
+    validity_dex->Accept(accumulator);
+  }
+
+  // Destination is the output's validity buffer.
+  int validity_buffer_idx = compiled_expr.output()->validity_idx();
+  uint8_t* dst_bitmap = eval_batch.GetBuffer(validity_buffer_idx);
+
+  // Without a selection vector the result goes straight into the destination.
+  if (selection_vector == nullptr) {
+    accumulator.ComputeResult(dst_bitmap);
+    return;
+  }
+
+  /// The output bitmap is an intersection of some input/local bitmaps. However, with a
+  /// selection vector, only the bits corresponding to the indices in the selection
+  /// vector need to be set in the output bitmap. This is done in two steps :
+  ///
+  /// 1. Do the intersection of input/local bitmaps to generate a temporary bitmap.
+  /// 2. copy just the relevant bits from the temporary bitmap to the output bitmap.
+  LocalBitMapsHolder scratch_holder(eval_batch.num_records(), 1);
+  uint8_t* scratch_bitmap = scratch_holder.GetLocalBitMap(0);
+  accumulator.ComputeResult(scratch_bitmap);
+
+  auto selected_count = selection_vector->GetNumSlots();
+  // the memset isn't required, doing it just for valgrind.
+  memset(dst_bitmap, 0, arrow::BitUtil::BytesForBits(selected_count));
+  for (auto out_pos = 0; out_pos < selected_count; ++out_pos) {
+    auto selected_bit =
+        arrow::BitUtil::GetBit(scratch_bitmap, selection_vector->GetIndex(out_pos));
+    arrow::BitUtil::SetBitTo(dst_bitmap, out_pos, selected_bit);
+  }
+}
+
+/// \brief Emit a call to a function that is looked up by name in the module.
+///
+/// \param[in] full_name name used to find the llvm function in the module.
+/// \param[in] ret_type expected return type; for non-void calls it is checked
+///            (DCHECK) against the type of the emitted call.
+/// \param[in] args arguments passed to the call.
+/// \return the value of the emitted call instruction.
+llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name,
+                                            llvm::Type* ret_type,
+                                            const std::vector<llvm::Value*>& args) {
+  // find the llvm function.
+  llvm::Function* fn = module()->getFunction(full_name);
+  DCHECK_NE(fn, nullptr) << "missing function " << full_name;
+
+  // Trace for debugging. Calls to the print functions themselves are not
+  // traced (presumably because the trace is emitted through them - AddTrace is
+  // not visible here, confirm). The previous condition used
+  // !compare("printf") && !compare("printff"), which is true only when the
+  // name equals both strings, so the trace was never emitted.
+  if (enable_ir_traces_ && full_name != "printf" && full_name != "printff") {
+    ADD_TRACE("invoke native fn " + full_name);
+  }
+
+  // build a call to the llvm function.
+  llvm::Value* value;
+  if (ret_type->isVoidTy()) {
+    // void functions can't have a name for the call.
+    value = ir_builder()->CreateCall(fn, args);
+  } else {
+    value = ir_builder()->CreateCall(fn, args, full_name);
+    DCHECK(value->getType() == ret_type);
+  }
+
+  return value;
+}
+
+std::shared_ptr<DecimalLValue> LLVMGenerator::BuildDecimalLValue(llvm::Value* value,
+                                                                 DataTypePtr arrow_type) {
+  // only decimals of size 128-bit supported.
+  DCHECK(is_decimal_128(arrow_type));
+
+  // Precision and scale of the arrow type become i32 constants carried by the
+  // l-value; the second constructor argument is passed as nullptr.
+  auto decimal_type =
+      arrow::internal::checked_cast<arrow::DecimalType*>(arrow_type.get());
+  auto precision_const = types()->i32_constant(decimal_type->precision());
+  auto scale_const = types()->i32_constant(decimal_type->scale());
+  return std::make_shared<DecimalLValue>(value, nullptr, precision_const, scale_const);
+}
+
+#define ADD_VISITOR_TRACE(...) \
+ if (generator_->enable_ir_traces_) { \
+ generator_->AddTrace(__VA_ARGS__); \
+ }
+
+// Visitor that generates the code for a decomposed expression. The constructor
+// only records its arguments and starts with no arena allocations noted.
+LLVMGenerator::Visitor::Visitor(LLVMGenerator* generator, llvm::Function* function,
+                                llvm::BasicBlock* entry_block, llvm::Value* arg_addrs,
+                                llvm::Value* arg_local_bitmaps,
+                                std::vector<llvm::Value*> slice_offsets,
+                                llvm::Value* arg_context_ptr, llvm::Value* loop_var)
+    : generator_(generator), function_(function), entry_block_(entry_block),
+      arg_addrs_(arg_addrs), arg_local_bitmaps_(arg_local_bitmaps),
+      slice_offsets_(slice_offsets), arg_context_ptr_(arg_context_ptr),
+      loop_var_(loop_var), has_arena_allocs_(false) {
+  ADD_VISITOR_TRACE("Iteration %T", loop_var);
+}
+
+// Read one fixed-length value from a data vector at (loop position + slice offset).
+void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) {
+  llvm::IRBuilder<>* builder = ir_builder();
+  llvm::Value* data_ref = GetBufferReference(dex.DataIdx(), kBufferTypeData, dex.Field());
+  llvm::Value* row_index = builder->CreateAdd(loop_var_, GetSliceOffset(dex.DataIdx()));
+
+  const auto field_type_id = dex.FieldType()->id();
+  llvm::Value* slot_value;
+  std::shared_ptr<LValue> lvalue;
+  if (field_type_id == arrow::Type::BOOL) {
+    // Booleans are bit-packed, so the value is fetched through the bitmap helper.
+    slot_value = generator_->GetPackedBitValue(data_ref, row_index);
+    lvalue = std::make_shared<LValue>(slot_value);
+  } else {
+    // Everything else is a plain element load; decimals additionally get
+    // wrapped in a decimal l-value.
+    auto slot_addr = CreateGEP(builder, data_ref, row_index);
+    slot_value = CreateLoad(builder, slot_addr, dex.FieldName());
+    if (field_type_id == arrow::Type::DECIMAL) {
+      lvalue = generator_->BuildDecimalLValue(slot_value, dex.FieldType());
+    } else {
+      lvalue = std::make_shared<LValue>(slot_value);
+    }
+  }
+  ADD_VISITOR_TRACE("visit fixed-len data vector " + dex.FieldName() + " value %T",
+                    slot_value);
+  result_ = lvalue;
+}
+
+// Read one variable-length value: a pointer into the data buffer plus a length
+// derived from two consecutive entries of the offsets buffer.
+void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueDex& dex) {
+  llvm::IRBuilder<>* builder = ir_builder();
+
+  // Row position inside the offsets buffer (loop position + slice offset).
+  llvm::Value* offsets_ref =
+      GetBufferReference(dex.OffsetsIdx(), kBufferTypeOffsets, dex.Field());
+  llvm::Value* row_index =
+      builder->CreateAdd(loop_var_, GetSliceOffset(dex.OffsetsIdx()));
+
+  // offset_start = offsets[row]
+  llvm::Value* start_addr = CreateGEP(builder, offsets_ref, row_index);
+  llvm::Value* offset_start = CreateLoad(builder, start_addr, "offset_start");
+
+  // offset_end = offsets[row + 1]
+  llvm::Value* next_row_index = builder->CreateAdd(
+      row_index, generator_->types()->i64_constant(1), "loop_var+1");
+  llvm::Value* end_addr = CreateGEP(builder, offsets_ref, next_row_index);
+  llvm::Value* offset_end = CreateLoad(builder, end_addr, "offset_end");
+
+  // length = offset_end - offset_start
+  llvm::Value* len_value =
+      builder->CreateSub(offset_end, offset_start, dex.FieldName() + "Len");
+
+  // The value starts at data + offset_start.
+  llvm::Value* data_ref =
+      GetBufferReference(dex.DataIdx(), kBufferTypeData, dex.Field());
+  llvm::Value* data_value = CreateGEP(builder, data_ref, offset_start);
+  ADD_VISITOR_TRACE("visit var-len data vector " + dex.FieldName() + " len %T",
+                    len_value);
+  result_ = std::make_shared<LValue>(data_value, len_value);
+}
+
+// Read the validity bit of a vector at (loop position + slice offset).
+void LLVMGenerator::Visitor::Visit(const VectorReadValidityDex& dex) {
+  llvm::IRBuilder<>* builder = ir_builder();
+  llvm::Value* validity_ref =
+      GetBufferReference(dex.ValidityIdx(), kBufferTypeValidity, dex.Field());
+  llvm::Value* row_index =
+      builder->CreateAdd(loop_var_, GetSliceOffset(dex.ValidityIdx()));
+  llvm::Value* is_valid = generator_->GetPackedValidityBitValue(validity_ref, row_index);
+
+  ADD_VISITOR_TRACE("visit validity vector " + dex.FieldName() + " value %T", is_valid);
+  result_ = std::make_shared<LValue>(is_valid);
+}
+
+// Read the bit for the current loop position from a local bitmap (no slice
+// offset is added here).
+void LLVMGenerator::Visitor::Visit(const LocalBitMapValidityDex& dex) {
+  llvm::Value* bitmap_ref = GetLocalBitMapReference(dex.local_bitmap_idx());
+  llvm::Value* is_valid = generator_->GetPackedBitValue(bitmap_ref, loop_var_);
+
+  ADD_VISITOR_TRACE(
+      "visit local bitmap " + std::to_string(dex.local_bitmap_idx()) + " value %T",
+      is_valid);
+  result_ = std::make_shared<LValue>(is_valid);
+}
+
+// The result is the constant `true`.
+void LLVMGenerator::Visitor::Visit(const TrueDex& dex) {
+  result_ = std::make_shared<LValue>(generator_->types()->true_constant());
+}
+
+// The result is the constant `false`.
+void LLVMGenerator::Visitor::Visit(const FalseDex& dex) {
+  result_ = std::make_shared<LValue>(generator_->types()->false_constant());
+}
+
+// Turn a literal into an IR constant. String/binary literals also carry a
+// length; decimal literals produce a decimal l-value and return early.
+void LLVMGenerator::Visitor::Visit(const LiteralDex& dex) {
+  LLVMTypes* types = generator_->types();
+  llvm::Value* literal = nullptr;
+  llvm::Value* literal_len = nullptr;
+
+  switch (dex.type()->id()) {
+    case arrow::Type::BOOL:
+      literal = types->i1_constant(arrow::util::get<bool>(dex.holder()));
+      break;
+
+    // unsigned integers
+    case arrow::Type::UINT8:
+      literal = types->i8_constant(arrow::util::get<uint8_t>(dex.holder()));
+      break;
+    case arrow::Type::UINT16:
+      literal = types->i16_constant(arrow::util::get<uint16_t>(dex.holder()));
+      break;
+    case arrow::Type::UINT32:
+      literal = types->i32_constant(arrow::util::get<uint32_t>(dex.holder()));
+      break;
+    case arrow::Type::UINT64:
+      literal = types->i64_constant(arrow::util::get<uint64_t>(dex.holder()));
+      break;
+
+    // signed integers, and the types stored as int32/int64
+    case arrow::Type::INT8:
+      literal = types->i8_constant(arrow::util::get<int8_t>(dex.holder()));
+      break;
+    case arrow::Type::INT16:
+      literal = types->i16_constant(arrow::util::get<int16_t>(dex.holder()));
+      break;
+    case arrow::Type::INT32:
+    case arrow::Type::DATE32:
+    case arrow::Type::TIME32:
+    case arrow::Type::INTERVAL_MONTHS:
+      literal = types->i32_constant(arrow::util::get<int32_t>(dex.holder()));
+      break;
+    case arrow::Type::INT64:
+    case arrow::Type::DATE64:
+    case arrow::Type::TIME64:
+    case arrow::Type::TIMESTAMP:
+    case arrow::Type::INTERVAL_DAY_TIME:
+      literal = types->i64_constant(arrow::util::get<int64_t>(dex.holder()));
+      break;
+
+    // floating point
+    case arrow::Type::FLOAT:
+      literal = types->float_constant(arrow::util::get<float>(dex.holder()));
+      break;
+    case arrow::Type::DOUBLE:
+      literal = types->double_constant(arrow::util::get<double>(dex.holder()));
+      break;
+
+    case arrow::Type::STRING:
+    case arrow::Type::BINARY: {
+      // The address of the holder's character data is embedded as an i64
+      // constant and converted to an i8*; the length is an i32 constant.
+      const std::string& text = arrow::util::get<std::string>(dex.holder());
+      llvm::Constant* text_addr =
+          types->i64_constant(reinterpret_cast<int64_t>(text.c_str()));
+      literal = llvm::ConstantExpr::getIntToPtr(text_addr, types->i8_ptr_type());
+      literal_len = types->i32_constant(static_cast<int32_t>(text.length()));
+      break;
+    }
+
+    case arrow::Type::DECIMAL: {
+      // build code for struct
+      auto scalar = arrow::util::get<DecimalScalar128>(dex.holder());
+      // ConstantInt doesn't have a get method that takes int128 or a pair of int64. so,
+      // passing the string representation instead.
+      auto int128_type = llvm::Type::getInt128Ty(*generator_->context());
+      auto int128_value = llvm::ConstantInt::get(
+          int128_type, Decimal128(scalar.value()).ToIntegerString(), 10);
+      auto decimal_type = arrow::decimal(scalar.precision(), scalar.scale());
+      // set the decimal l-value as the result and return.
+      result_ = generator_->BuildDecimalLValue(int128_value, decimal_type);
+      return;
+    }
+
+    default:
+      DCHECK(0);
+  }
+  ADD_VISITOR_TRACE("visit Literal %T", literal);
+  result_ = std::make_shared<LValue>(literal, literal_len);
+}
+
+// Generate a call to a function whose params are built without validity.
+// Functions that can return errors are only invoked when all args are valid.
+void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) {
+  const std::string& function_name = dex.func_descriptor()->name();
+  ADD_VISITOR_TRACE("visit NonNullableFunc base function " + function_name);
+
+  const NativeFunction* native_function = dex.native_function();
+
+  // build the function params (ignore validity).
+  auto params = BuildParams(dex.function_holder().get(), dex.args(), false,
+                            native_function->NeedsContext());
+
+  auto arrow_return_type = dex.func_descriptor()->return_type();
+  if (!native_function->CanReturnErrors()) {
+    // fast path : invoke function without computing validities.
+    result_ = BuildFunctionCall(native_function, arrow_return_type, &params);
+    return;
+  }
+
+  // slow path : if a function can return errors, skip invoking the function
+  // unless all of the input args are valid. Otherwise, it can cause spurious errors.
+  llvm::IRBuilder<>* builder = ir_builder();
+  LLVMTypes* types = generator_->types();
+  auto return_type_id = arrow_return_type->id();
+  auto ir_return_type = types->IRType(return_type_id);
+
+  // AND together the combined validity of every argument.
+  llvm::Value* all_args_valid = types->true_constant();
+  for (auto& arg : dex.args()) {
+    auto arg_validity = BuildCombinedValidity(arg->validity_exprs());
+    all_args_valid = builder->CreateAnd(all_args_valid, arg_validity, "validityBitAnd");
+  }
+
+  // all args valid : make the real call.
+  auto invoke_function = [&] {
+    ADD_VISITOR_TRACE("fn " + function_name +
+                      " can return errors : all args valid, invoke fn");
+    return BuildFunctionCall(native_function, arrow_return_type, &params);
+  };
+
+  // some arg invalid : produce a null constant (with length 0 for binary-like
+  // return types) instead of calling.
+  auto produce_dummy = [&] {
+    ADD_VISITOR_TRACE("fn " + function_name +
+                      " can return errors : not all args valid, return dummy value");
+    llvm::Value* dummy_value = types->NullConstant(ir_return_type);
+    llvm::Value* dummy_len = nullptr;
+    if (arrow::is_binary_like(return_type_id)) {
+      dummy_len = types->i32_constant(0);
+    }
+    return std::make_shared<LValue>(dummy_value, dummy_len);
+  };
+
+  result_ = BuildIfElse(all_args_valid, invoke_function, produce_dummy, arrow_return_type);
+}
+
+// Generate a call to a function whose params are built together with their
+// validity; the call result becomes the visitor result.
+void LLVMGenerator::Visitor::Visit(const NullableNeverFuncDex& dex) {
+  ADD_VISITOR_TRACE("visit NullableNever base function " + dex.func_descriptor()->name());
+  const NativeFunction* callee = dex.native_function();
+
+  // build function params along with validity.
+  auto call_params = BuildParams(dex.function_holder().get(), dex.args(), true,
+                                 callee->NeedsContext());
+
+  auto return_type = dex.func_descriptor()->return_type();
+  result_ = BuildFunctionCall(callee, return_type, &call_params);
+}
+
+// Generate a call to a function that reports the validity of its result
+// through an extra out-parameter; that validity is then applied to the local
+// bitmap of this expression.
+void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) {
+  ADD_VISITOR_TRACE("visit NullableInternal base function " +
+                    dex.func_descriptor()->name());
+  llvm::IRBuilder<>* builder = ir_builder();
+  LLVMTypes* types = generator_->types();
+
+  const NativeFunction* callee = dex.native_function();
+
+  // build function params along with validity.
+  auto call_params = BuildParams(dex.function_holder().get(), dex.args(), true,
+                                 callee->NeedsContext());
+
+  // The last param is an i8 stack slot, allocated in the entry block, that
+  // receives the validity of the result.
+  llvm::AllocaInst* valid_slot =
+      new llvm::AllocaInst(types->i8_type(), 0, "result_valid", entry_block_);
+  call_params.push_back(valid_slot);
+
+  auto return_type = dex.func_descriptor()->return_type();
+  result_ = BuildFunctionCall(callee, return_type, &call_params);
+
+  // Read the slot back and narrow it to i1.
+  llvm::Value* valid_i8 = CreateLoad(builder, valid_slot);
+  llvm::Value* valid_i1 = builder->CreateTrunc(valid_i8, types->i1_type());
+
+  // set validity bit in the local bitmap.
+  ClearLocalBitMapIfNotValid(dex.local_bitmap_idx(), valid_i1);
+}
+
+// Generate an if/else: the "then" side is taken when the condition is both
+// valid and true. Result validity is recorded in the local bitmap, except for
+// a non-terminal else, where the nested expression handles it.
+void LLVMGenerator::Visitor::Visit(const IfDex& dex) {
+  ADD_VISITOR_TRACE("visit IfExpression");
+  llvm::IRBuilder<>* builder = ir_builder();
+
+  // Evaluate condition.
+  LValuePtr condition = BuildValueAndValidity(dex.condition_vv());
+
+  // The branch is taken only when the condition is valid AND matched.
+  llvm::Value* take_then =
+      builder->CreateAnd(condition->data(), condition->validity(), "validAndMatch");
+
+  // then block
+  auto build_then = [&] {
+    ADD_VISITOR_TRACE("branch to then block");
+    LValuePtr then_result = BuildValueAndValidity(dex.then_vv());
+    ClearLocalBitMapIfNotValid(dex.local_bitmap_idx(), then_result->validity());
+    ADD_VISITOR_TRACE("IfExpression result validity %T in matching then",
+                      then_result->validity());
+    return then_result;
+  };
+
+  // else block
+  auto build_else = [&]() -> LValuePtr {
+    if (!dex.is_terminal_else()) {
+      ADD_VISITOR_TRACE("branch to non-terminal else block");
+
+      // this is a non-terminal else. let the child (nested if/else) handle validity.
+      auto nested_expr = dex.else_vv().value_expr();
+      nested_expr->Accept(*this);
+      return result();
+    }
+
+    ADD_VISITOR_TRACE("branch to terminal else block");
+
+    LValuePtr else_result = BuildValueAndValidity(dex.else_vv());
+    // update the local bitmap with the validity.
+    ClearLocalBitMapIfNotValid(dex.local_bitmap_idx(), else_result->validity());
+    ADD_VISITOR_TRACE("IfExpression result validity %T in terminal else",
+                      else_result->validity());
+    return else_result;
+  };
+
+  // build the if-else condition.
+  result_ = BuildIfElse(take_then, build_then, build_else, dex.result_type());
+  if (arrow::is_binary_like(dex.result_type()->id())) {
+    ADD_VISITOR_TRACE("IfElse result length %T", result_->length());
+  }
+  ADD_VISITOR_TRACE("IfElse result value %T", result_->data());
+}
+
+// Boolean AND (null-aware, evaluated with short-circuit)
+// if any arg is valid and false,
+// short-circuit and return FALSE (value=false, valid=true)
+// else if all args are valid and true
+// return TRUE (value=true, valid=true)
+// else
+// return NULL (value=true, valid=false)
+
+void LLVMGenerator::Visitor::Visit(const BooleanAndDex& dex) {
+ ADD_VISITOR_TRACE("visit BooleanAndExpression");
+ llvm::IRBuilder<>* builder = ir_builder();
+ LLVMTypes* types = generator_->types();
+ llvm::LLVMContext* context = generator_->context();
+
+ // Create blocks for short-circuit.
+ llvm::BasicBlock* short_circuit_bb =
+ llvm::BasicBlock::Create(*context, "short_circuit", function_);
+ llvm::BasicBlock* non_short_circuit_bb =
+ llvm::BasicBlock::Create(*context, "non_short_circuit", function_);
+ llvm::BasicBlock* merge_bb = llvm::BasicBlock::Create(*context, "merge", function_);
+
+ llvm::Value* all_exprs_valid = types->true_constant(); // running AND of the arg validities
+ for (auto& pair : dex.args()) {
+ LValuePtr current = BuildValueAndValidity(*pair);
+
+ ADD_VISITOR_TRACE("BooleanAndExpression arg value %T", current->data());
+ ADD_VISITOR_TRACE("BooleanAndExpression arg validity %T", current->validity());
+
+ // short-circuit if valid and false
+ llvm::Value* is_false = builder->CreateNot(current->data());
+ llvm::Value* valid_and_false =
+ builder->CreateAnd(is_false, current->validity(), "valid_and_false");
+
+ llvm::BasicBlock* else_bb = llvm::BasicBlock::Create(*context, "else", function_); // one new "else" block per argument
+ builder->CreateCondBr(valid_and_false, short_circuit_bb, else_bb);
+
+ // Emit the else block.
+ builder->SetInsertPoint(else_bb); // the next argument (if any) is evaluated starting in this block
+ // remember if any nulls were encountered.
+ all_exprs_valid =
+ builder->CreateAnd(all_exprs_valid, current->validity(), "validityBitAnd");
+ // continue to evaluate the next pair in list.
+ }
+ builder->CreateBr(non_short_circuit_bb); // terminates whichever block is current after the loop
+
+ // Short-circuit case (at least one of the expressions is valid and false).
+ // No need to set validity bit (valid by default).
+ builder->SetInsertPoint(short_circuit_bb);
+ ADD_VISITOR_TRACE("BooleanAndExpression result value false");
+ ADD_VISITOR_TRACE("BooleanAndExpression result validity true");
+ builder->CreateBr(merge_bb);
+
+ // non short-circuit case (All expressions are either true or null).
+ // result valid if all of the exprs are non-null.
+ builder->SetInsertPoint(non_short_circuit_bb);
+ ClearLocalBitMapIfNotValid(dex.local_bitmap_idx(), all_exprs_valid);
+ ADD_VISITOR_TRACE("BooleanAndExpression result value true");
+ ADD_VISITOR_TRACE("BooleanAndExpression result validity %T", all_exprs_valid);
+ builder->CreateBr(merge_bb);
+
+ builder->SetInsertPoint(merge_bb);
+ llvm::PHINode* result_value = builder->CreatePHI(types->i1_type(), 2, "res_value"); // false via short_circuit, true via non_short_circuit
+ result_value->addIncoming(types->false_constant(), short_circuit_bb);
+ result_value->addIncoming(types->true_constant(), non_short_circuit_bb);
+ result_.reset(new LValue(result_value)); // only the value is attached; validity went to the local bitmap above
+}
+
+// Boolean OR (null-aware, evaluated with short-circuit)
+// if any arg is valid and true,
+// short-circuit and return TRUE (value=true, valid=true)
+// else if all args are valid and false
+// return FALSE (value=false, valid=true)
+// else
+// return NULL (value=false, valid=false)
+
+void LLVMGenerator::Visitor::Visit(const BooleanOrDex& dex) {
+ ADD_VISITOR_TRACE("visit BooleanOrExpression");
+ llvm::IRBuilder<>* builder = ir_builder();
+ LLVMTypes* types = generator_->types();
+ llvm::LLVMContext* context = generator_->context();
+
+ // Create blocks for short-circuit.
+ llvm::BasicBlock* short_circuit_bb =
+ llvm::BasicBlock::Create(*context, "short_circuit", function_);
+ llvm::BasicBlock* non_short_circuit_bb =
+ llvm::BasicBlock::Create(*context, "non_short_circuit", function_);
+ llvm::BasicBlock* merge_bb = llvm::BasicBlock::Create(*context, "merge", function_);
+
+ llvm::Value* all_exprs_valid = types->true_constant(); // running AND of the arg validities
+ for (auto& pair : dex.args()) {
+ LValuePtr current = BuildValueAndValidity(*pair);
+
+ ADD_VISITOR_TRACE("BooleanOrExpression arg value %T", current->data());
+ ADD_VISITOR_TRACE("BooleanOrExpression arg validity %T", current->validity());
+
+ // short-circuit if valid and true.
+ llvm::Value* valid_and_true =
+ builder->CreateAnd(current->data(), current->validity(), "valid_and_true");
+
+ llvm::BasicBlock* else_bb = llvm::BasicBlock::Create(*context, "else", function_); // one new "else" block per argument
+ builder->CreateCondBr(valid_and_true, short_circuit_bb, else_bb);
+
+ // Emit the else block.
+ builder->SetInsertPoint(else_bb); // the next argument (if any) is evaluated starting in this block
+ // remember if any nulls were encountered.
+ all_exprs_valid =
+ builder->CreateAnd(all_exprs_valid, current->validity(), "validityBitAnd");
+ // continue to evaluate the next pair in list.
+ }
+ builder->CreateBr(non_short_circuit_bb); // terminates whichever block is current after the loop
+
+ // Short-circuit case (at least one of the expressions is valid and true).
+ // No need to set validity bit (valid by default).
+ builder->SetInsertPoint(short_circuit_bb);
+ ADD_VISITOR_TRACE("BooleanOrExpression result value true");
+ ADD_VISITOR_TRACE("BooleanOrExpression result validity true");
+ builder->CreateBr(merge_bb);
+
+ // non short-circuit case (All expressions are either false or null).
+ // result valid if all of the exprs are non-null.
+ builder->SetInsertPoint(non_short_circuit_bb);
+ ClearLocalBitMapIfNotValid(dex.local_bitmap_idx(), all_exprs_valid);
+ ADD_VISITOR_TRACE("BooleanOrExpression result value false");
+ ADD_VISITOR_TRACE("BooleanOrExpression result validity %T", all_exprs_valid);
+ builder->CreateBr(merge_bb);
+
+ builder->SetInsertPoint(merge_bb);
+ llvm::PHINode* result_value = builder->CreatePHI(types->i1_type(), 2, "res_value"); // true via short_circuit, false via non_short_circuit
+ result_value->addIncoming(types->true_constant(), short_circuit_bb);
+ result_value->addIncoming(types->false_constant(), non_short_circuit_bb);
+ result_.reset(new LValue(result_value)); // only the value is attached; validity went to the local bitmap above
+}
+
+// Generate the call for an IN expression. The call arguments are: the address
+// of the in-holder (as an i64 constant), then for every arg its value, its
+// length when it has one, and its combined validity.
+template <typename Type>
+void LLVMGenerator::Visitor::VisitInExpression(const InExprDexBase<Type>& dex) {
+  ADD_VISITOR_TRACE("visit In Expression");
+  LLVMTypes* types = generator_->types();
+  std::vector<llvm::Value*> call_args;
+
+  const InExprDex<Type>& in_dex = dynamic_cast<const InExprDex<Type>&>(dex);
+  /* the holder goes first */
+  llvm::Constant* holder_addr =
+      types->i64_constant(reinterpret_cast<int64_t>(in_dex.in_holder().get()));
+  call_args.push_back(holder_addr);
+
+  /* evaluate each arg and append its pieces */
+  for (auto& arg : dex.args()) {
+    DexPtr arg_value_expr = arg->value_expr();
+    arg_value_expr->Accept(*this);
+    LValue& arg_value = *result();
+    call_args.push_back(arg_value.data());
+
+    /* a non-null length is passed along too (string results) */
+    llvm::Value* arg_len = arg_value.length();
+    if (arg_len != nullptr) {
+      call_args.push_back(arg_len);
+    }
+
+    /* validity of the evaluated arg */
+    llvm::Value* arg_validity = BuildCombinedValidity(arg->validity_exprs());
+    call_args.push_back(arg_validity);
+  }
+
+  llvm::Type* bool_type = types->IRType(arrow::Type::type::BOOL);
+  llvm::Value* in_result =
+      generator_->AddFunctionCall(dex.runtime_function(), bool_type, call_args);
+
+  result_ = std::make_shared<LValue>(in_result);
+}
+
+// Decimal flavour of the IN expression codegen.
+//
+// Same argument layout as the generic version, except that every value is
+// followed by the precision and scale reported by the dex (instead of an
+// optional length), and the call is emitted through DecimalIR.
+template <>
+void LLVMGenerator::Visitor::VisitInExpression<gandiva::DecimalScalar128>(
+    const InExprDexBase<gandiva::DecimalScalar128>& dex) {
+  ADD_VISITOR_TRACE("visit In Expression");
+  LLVMTypes* types = generator_->types();
+  std::vector<llvm::Value*> call_args;
+  DecimalIR decimal_ir(generator_->engine_.get());
+
+  // The holder address is embedded as an i64 constant and goes first.
+  using DecimalInExprDex = InExprDex<gandiva::DecimalScalar128>;
+  const auto& typed_dex = dynamic_cast<const DecimalInExprDex&>(dex);
+  const auto holder_addr = reinterpret_cast<int64_t>(typed_dex.in_holder().get());
+  call_args.push_back(types->i64_constant(holder_addr));
+
+  for (const auto& arg : dex.args()) {
+    // Value of the evaluated expression.
+    arg->value_expr()->Accept(*this);
+    call_args.push_back(result()->data());
+
+    // Precision and scale accompany every decimal value.
+    llvm::Constant* precision_const = types->i32_constant(dex.get_precision());
+    llvm::Constant* scale_const = types->i32_constant(dex.get_scale());
+    call_args.push_back(precision_const);
+    call_args.push_back(scale_const);
+
+    // Combined validity of the evaluated expression.
+    call_args.push_back(BuildCombinedValidity(arg->validity_exprs()));
+  }
+
+  llvm::Type* bool_type = types->IRType(arrow::Type::type::BOOL);
+  llvm::Value* in_result =
+      decimal_ir.CallDecimalFunction(dex.runtime_function(), bool_type, call_args);
+  result_ = std::make_shared<LValue>(in_result);
+}
+
+// Visitor entry points for IN expressions: one overload per supported element
+// type, each forwarding to the matching VisitInExpression instantiation.
+void LLVMGenerator::Visitor::Visit(const InExprDexBase<int32_t>& dex) {
+  this->VisitInExpression<int32_t>(dex);
+}
+
+void LLVMGenerator::Visitor::Visit(const InExprDexBase<int64_t>& dex) {
+  this->VisitInExpression<int64_t>(dex);
+}
+
+void LLVMGenerator::Visitor::Visit(const InExprDexBase<float>& dex) {
+  this->VisitInExpression<float>(dex);
+}
+
+void LLVMGenerator::Visitor::Visit(const InExprDexBase<double>& dex) {
+  this->VisitInExpression<double>(dex);
+}
+
+// Decimals are routed to the explicit specialization of VisitInExpression.
+void LLVMGenerator::Visitor::Visit(const InExprDexBase<gandiva::DecimalScalar128>& dex) {
+  this->VisitInExpression<gandiva::DecimalScalar128>(dex);
+}
+
+void LLVMGenerator::Visitor::Visit(const InExprDexBase<std::string>& dex) {
+  this->VisitInExpression<std::string>(dex);
+}
+
+// Emits an if/else diamond in the current function.
+//
+// `condition` selects between the code produced by `then_func` and
+// `else_func`; both branches jump to a common merge block where their values
+// (and, for string/binary results, their lengths) are joined with PHI nodes.
+// Returns an LValue wrapping the merged value, shaped according to
+// `result_type`.
+LValuePtr LLVMGenerator::Visitor::BuildIfElse(llvm::Value* condition,
+                                              std::function<LValuePtr()> then_func,
+                                              std::function<LValuePtr()> else_func,
+                                              DataTypePtr result_type) {
+  llvm::IRBuilder<>* builder = ir_builder();
+  llvm::LLVMContext* context = generator_->context();
+  LLVMTypes* types = generator_->types();
+
+  // One block per branch plus the block where both paths meet again.
+  llvm::BasicBlock* then_entry = llvm::BasicBlock::Create(*context, "then", function_);
+  llvm::BasicBlock* else_entry = llvm::BasicBlock::Create(*context, "else", function_);
+  llvm::BasicBlock* merge_block = llvm::BasicBlock::Create(*context, "merge", function_);
+
+  builder->CreateCondBr(condition, then_entry, else_entry);
+
+  // "then" branch. Generating its body may itself create new blocks, so the
+  // PHI predecessor is whatever block the builder ends up in afterwards.
+  builder->SetInsertPoint(then_entry);
+  LValuePtr then_lvalue = then_func();
+  builder->CreateBr(merge_block);
+  llvm::BasicBlock* then_exit = builder->GetInsertBlock();
+
+  // "else" branch, same treatment.
+  builder->SetInsertPoint(else_entry);
+  LValuePtr else_lvalue = else_func();
+  builder->CreateBr(merge_block);
+  llvm::BasicBlock* else_exit = builder->GetInsertBlock();
+
+  // Merge: pick the value coming from whichever branch was taken.
+  builder->SetInsertPoint(merge_block);
+  const auto type_id = result_type->id();
+  llvm::Type* value_type = types->IRType(type_id);
+  llvm::PHINode* value_phi = builder->CreatePHI(value_type, 2, "res_value");
+  value_phi->addIncoming(then_lvalue->data(), then_exit);
+  value_phi->addIncoming(else_lvalue->data(), else_exit);
+
+  if (type_id == arrow::Type::STRING || type_id == arrow::Type::BINARY) {
+    // Variable-length results also need their length merged.
+    llvm::PHINode* length_phi = builder->CreatePHI(types->i32_type(), 2, "res_length");
+    length_phi->addIncoming(then_lvalue->length(), then_exit);
+    length_phi->addIncoming(else_lvalue->length(), else_exit);
+    return std::make_shared<LValue>(value_phi, length_phi);
+  }
+
+  if (type_id == arrow::Type::DECIMAL) {
+    return generator_->BuildDecimalLValue(value_phi, result_type);
+  }
+
+  return std::make_shared<LValue>(value_phi);
+}
+
+// Generates the code for both halves of a value/validity pair and returns
+// them bundled as a single LValue (data, length, validity).
+LValuePtr LLVMGenerator::Visitor::BuildValueAndValidity(const ValueValidityPair& pair) {
+  // Value first. Keep the resulting LValue alive locally, since generating
+  // the validity afterwards may replace the visitor's current result.
+  pair.value_expr()->Accept(*this);
+  LValuePtr value_lvalue = result();
+  llvm::Value* data = value_lvalue->data();
+  llvm::Value* data_len = value_lvalue->length();
+
+  // Then the combined validity of the pair.
+  llvm::Value* is_valid = BuildCombinedValidity(pair.validity_exprs());
+
+  return std::make_shared<LValue>(data, data_len, is_valid);
+}
+
+// Emits the call to the pre-compiled function `func` with `params`.
+//
+// `params` is extended in place with the extra trailing arguments the callee
+// expects: output precision/scale for decimal return types, or a pointer to
+// an i32 "result_len" slot for binary-like return types. Returns the LValue
+// describing the call's result.
+LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func,
+                                                    DataTypePtr arrow_return_type,
+                                                    std::vector<llvm::Value*>* params) {
+  LLVMTypes* types = generator_->types();
+  const auto return_type_id = arrow_return_type->id();
+  llvm::Type* llvm_return_type = types->IRType(return_type_id);
+  DecimalIR decimal_ir(generator_->engine_.get());
+
+  if (return_type_id == arrow::Type::DECIMAL) {
+    // Decimal functions receive the output precision/scale as trailing
+    // parameters, i.e. a call written as
+    //   out = add_decimal(v1, p1, s1, v2, p2, s2)
+    // is emitted as
+    //   out = add_decimal(v1, p1, s1, v2, p2, s2, out_p, out_s)
+    auto decimal_lvalue = generator_->BuildDecimalLValue(nullptr, arrow_return_type);
+    params->push_back(decimal_lvalue->precision());
+    params->push_back(decimal_lvalue->scale());
+
+    llvm::Value* decimal_out =
+        decimal_ir.CallDecimalFunction(func->pc_name(), llvm_return_type, *params);
+    decimal_lvalue->set_data(decimal_out);
+    return std::move(decimal_lvalue);
+  }
+
+  // A non-decimal result can still come from a decimal function: that is the
+  // case as soon as one of the arguments is an i128.
+  bool has_decimal_arg = false;
+  for (llvm::Value* arg : *params) {
+    if (arg->getType() == types->i128_type()) {
+      has_decimal_arg = true;
+      break;
+    }
+  }
+
+  // Variable-length results report their length through an extra out
+  // parameter, backed by an alloca placed in the entry block.
+  llvm::AllocaInst* out_len_slot = nullptr;
+  if (arrow::is_binary_like(return_type_id)) {
+    out_len_slot =
+        new llvm::AllocaInst(types->i32_type(), 0, "result_len", entry_block_);
+    params->push_back(out_len_slot);
+    has_arena_allocs_ = true;
+  }
+
+  // Emit the call itself, then read the length back if there is one.
+  llvm::IRBuilder<>* builder = ir_builder();
+  llvm::Value* out_value = nullptr;
+  if (has_decimal_arg) {
+    out_value = decimal_ir.CallDecimalFunction(func->pc_name(), llvm_return_type, *params);
+  } else {
+    out_value = generator_->AddFunctionCall(func->pc_name(), llvm_return_type, *params);
+  }
+
+  llvm::Value* out_len = nullptr;
+  if (out_len_slot != nullptr) {
+    out_len = CreateLoad(builder, out_len_slot);
+  }
+  return std::make_shared<LValue>(out_value, out_len);
+}
+
+// Assembles the argument list for a function call.
+//
+// Layout: [context pointer] [holder address] then, for every entry of `args`,
+// the parameters contributed by its value followed (when `with_validity` is
+// set) by its combined validity.
+std::vector<llvm::Value*> LLVMGenerator::Visitor::BuildParams(
+    FunctionHolder* holder, const ValueValidityPairVector& args, bool with_validity,
+    bool with_context) {
+  LLVMTypes* types = generator_->types();
+  std::vector<llvm::Value*> call_args;
+
+  // Optional leading context pointer.
+  if (with_context) {
+    call_args.push_back(arg_context_ptr_);
+  }
+
+  // Functions that own a holder receive its address as an i64 constant.
+  if (holder != nullptr) {
+    call_args.push_back(types->i64_constant(reinterpret_cast<int64_t>(holder)));
+  }
+
+  for (const auto& arg : args) {
+    // Generate the value and let the resulting LValue append whatever
+    // parameters represent it.
+    arg->value_expr()->Accept(*this);
+    result()->AppendFunctionParams(&call_args);
+
+    // Validity is only passed along when the callee asks for it.
+    if (with_validity) {
+      call_args.push_back(BuildCombinedValidity(arg->validity_exprs()));
+    }
+  }
+
+  return call_args;
+}
+
+// Folds a list of validity expressions into a single bit by AND-ing them
+// together, starting from `true` (so an empty list yields `true`).
+llvm::Value* LLVMGenerator::Visitor::BuildCombinedValidity(const DexVector& validities) {
+  llvm::IRBuilder<>* builder = ir_builder();
+  LLVMTypes* types = generator_->types();
+
+  llvm::Value* combined = types->true_constant();
+  for (const auto& validity_dex : validities) {
+    // Generate this validity, then fold it into the running conjunction.
+    validity_dex->Accept(*this);
+    llvm::Value* this_bit = result()->data();
+    combined = builder->CreateAnd(combined, this_bit, "validityBitAnd");
+  }
+  ADD_VISITOR_TRACE("combined validity is %T", combined);
+  return combined;
+}
+
+// Returns a reference to the validity, data or offsets buffer of the vector at
+// `idx`. The code producing the reference is emitted in the entry block; the
+// builder is put back on the block it was working on before returning.
+// Yields nullptr if `buffer_type` is none of the known kinds.
+llvm::Value* LLVMGenerator::Visitor::GetBufferReference(int idx, BufferType buffer_type,
+                                                        FieldPtr field) {
+  llvm::IRBuilder<>* builder = ir_builder();
+
+  // Remember the current block and hop over to the entry block.
+  llvm::BasicBlock* resume_block = builder->GetInsertBlock();
+  builder->SetInsertPoint(entry_block_);
+
+  llvm::Value* buffer_ref = nullptr;
+  if (buffer_type == kBufferTypeValidity) {
+    buffer_ref = generator_->GetValidityReference(arg_addrs_, idx, field);
+  } else if (buffer_type == kBufferTypeData) {
+    buffer_ref = generator_->GetDataReference(arg_addrs_, idx, field);
+  } else if (buffer_type == kBufferTypeOffsets) {
+    buffer_ref = generator_->GetOffsetsReference(arg_addrs_, idx, field);
+  }
+
+  // Back to where code generation was happening.
+  builder->SetInsertPoint(resume_block);
+  return buffer_ref;
+}
+
+// Looks up the slice offset stored for buffer index `idx`.
+// No bounds check is performed: `idx` must be a valid index into
+// slice_offsets_.
+llvm::Value* LLVMGenerator::Visitor::GetSliceOffset(int idx) {
+  llvm::Value* slice_offset = slice_offsets_[idx];
+  return slice_offset;
+}
+
+// Returns a reference to the local bitmap at `idx`. The code producing the
+// reference is emitted in the entry block; the builder is put back on the
+// block it was working on before returning.
+llvm::Value* LLVMGenerator::Visitor::GetLocalBitMapReference(int idx) {
+  llvm::IRBuilder<>* builder = ir_builder();
+
+  // Remember the current block and hop over to the entry block.
+  llvm::BasicBlock* resume_block = builder->GetInsertBlock();
+  builder->SetInsertPoint(entry_block_);
+
+  llvm::Value* bitmap_ref = generator_->GetLocalBitMapReference(arg_local_bitmaps_, idx);
+
+  // Back to where code generation was happening.
+  builder->SetInsertPoint(resume_block);
+  return bitmap_ref;
+}
+
+/// Local bitmaps start out filled with 1s, so the only thing left to do is to
+/// clear the bit of the current row (loop_var_) when `is_valid` is false.
+void LLVMGenerator::Visitor::ClearLocalBitMapIfNotValid(int local_bitmap_idx,
+                                                        llvm::Value* is_valid) {
+  generator_->ClearPackedBitValueIfFalse(GetLocalBitMapReference(local_bitmap_idx),
+                                         loop_var_, is_valid);
+}
+
+// Hooks for tracing/printfs.
+//
+// Returns a copy of `in_msg` where the first "%T" is replaced by the printf
+// conversion matching the LLVM type of `value`. If `in_msg` has no "%T" it is
+// returned unchanged (after a debug check).
+//
+// float/double literals were observed to get lost when printed through the
+// generic printf, so for those types `*print_fn` is redirected to a dedicated
+// wrapper; it is left untouched for every other type.
+std::string LLVMGenerator::ReplaceFormatInTrace(const std::string& in_msg,
+                                                llvm::Value* value,
+                                                std::string* print_fn) {
+  const std::size_t placeholder_pos = in_msg.find("%T");
+  if (placeholder_pos == std::string::npos) {
+    DCHECK(0);
+    return in_msg;
+  }
+
+  llvm::Type* value_type = value->getType();
+  const bool fits_in_int = value_type->isIntegerTy(1) || value_type->isIntegerTy(8) ||
+                           value_type->isIntegerTy(16) || value_type->isIntegerTy(32);
+
+  const char* specifier = "";
+  if (fits_in_int) {
+    specifier = "%d";
+  } else if (value_type->isIntegerTy(64)) {
+    // bigint
+    specifier = "%lld";
+  } else if (value_type->isFloatTy()) {
+    // float
+    specifier = "%f";
+    *print_fn = "print_float";
+  } else if (value_type->isDoubleTy()) {
+    // double
+    specifier = "%lf";
+    *print_fn = "print_double";
+  } else if (value_type->isPointerTy()) {
+    // string
+    specifier = "%s";
+  } else {
+    // Unsupported type: flagged in debug builds, "%T" is simply dropped.
+    DCHECK(0);
+  }
+
+  std::string out_msg = in_msg;
+  out_msg.replace(placeholder_pos, 2, specifier);
+  return out_msg;
+}
+
+// Emits a call that prints `msg` (prefixed with "IR_TRACE:: ") when the
+// generated code runs. If `value` is given, the "%T" in `msg` is replaced by
+// the matching format specifier and `value` is passed as the second argument.
+// Does nothing unless IR traces are enabled.
+void LLVMGenerator::AddTrace(const std::string& msg, llvm::Value* value) {
+  if (!enable_ir_traces_) {
+    return;
+  }
+
+  const bool has_value = (value != nullptr);
+
+  std::string trace_line = "IR_TRACE:: " + msg + "\n";
+  std::string print_fn_name = "printf";
+  if (has_value) {
+    trace_line = ReplaceFormatInTrace(trace_line, value, &print_fn_name);
+  }
+
+  // The format string is referenced from the IR by raw address, so it is
+  // stored in trace_strings_ to keep it alive.
+  // NOTE(review): the address is taken from an element of a std::vector of
+  // std::string; later push_backs may relocate the elements. Presumably the
+  // character buffer stays put for heap-allocated strings, but that is not
+  // obviously true for short (small-string-optimized) messages - confirm.
+  trace_strings_.push_back(trace_line);
+  const char* raw_text = trace_strings_.back().c_str();
+
+  // Turn the address into an i8* constant.
+  llvm::Constant* text_addr = types()->i64_constant(reinterpret_cast<int64_t>(raw_text));
+  llvm::Constant* text_ptr =
+      llvm::ConstantExpr::getIntToPtr(text_addr, types()->i8_ptr_type());
+
+  std::vector<llvm::Value*> call_args;
+  call_args.push_back(text_ptr);
+  if (has_value) {
+    call_args.push_back(value);
+  }
+  AddFunctionCall(print_fn_name, types()->i32_type(), call_args);
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/llvm_generator.h b/src/arrow/cpp/src/gandiva/llvm_generator.h
new file mode 100644
index 000000000..ff6d84602
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/llvm_generator.h
@@ -0,0 +1,253 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/util/macros.h"
+
+#include "gandiva/annotator.h"
+#include "gandiva/compiled_expr.h"
+#include "gandiva/configuration.h"
+#include "gandiva/dex_visitor.h"
+#include "gandiva/engine.h"
+#include "gandiva/execution_context.h"
+#include "gandiva/function_registry.h"
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/llvm_types.h"
+#include "gandiva/lvalue.h"
+#include "gandiva/selection_vector.h"
+#include "gandiva/value_validity_pair.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+class FunctionHolder;
+
+/// Builds an LLVM module and generates code for the specified set of expressions.
+class GANDIVA_EXPORT LLVMGenerator {
+ public:
+ /// \brief Factory method to initialize the generator.
+ static Status Make(std::shared_ptr<Configuration> config,
+ std::unique_ptr<LLVMGenerator>* llvm_generator);
+
+ /// \brief Build the code for the expression trees for the given selection
+ /// vector mode. Each element in the vector represents an expression tree
+ Status Build(const ExpressionVector& exprs, SelectionVector::Mode mode);
+
+ /// \brief Build the code for the expression trees for default mode. Each
+ /// element in the vector represents an expression tree
+ Status Build(const ExpressionVector& exprs) {
+ return Build(exprs, SelectionVector::Mode::MODE_NONE);
+ }
+
+ /// \brief Execute the built expression against the provided arguments for
+ /// default mode.
+ Status Execute(const arrow::RecordBatch& record_batch,
+ const ArrayDataVector& output_vector);
+
+ /// \brief Execute the built expression against the provided arguments for
+ /// all modes. Only works on the records specified in the selection_vector.
+ Status Execute(const arrow::RecordBatch& record_batch,
+ const SelectionVector* selection_vector,
+ const ArrayDataVector& output_vector);
+
+ SelectionVector::Mode selection_vector_mode() { return selection_vector_mode_; }
+ LLVMTypes* types() { return engine_->types(); }
+ llvm::Module* module() { return engine_->module(); }
+ std::string DumpIR() { return engine_->DumpIR(); }
+
+ private:
+ LLVMGenerator();
+
+ FRIEND_TEST(TestLLVMGenerator, VerifyPCFunctions);
+ FRIEND_TEST(TestLLVMGenerator, TestAdd);
+ FRIEND_TEST(TestLLVMGenerator, TestNullInternal);
+
+ llvm::LLVMContext* context() { return engine_->context(); }
+ llvm::IRBuilder<>* ir_builder() { return engine_->ir_builder(); }
+
+ /// Visitor to generate the code for a decomposed expression.
+ class Visitor : public DexVisitor {
+ public:
+ Visitor(LLVMGenerator* generator, llvm::Function* function,
+ llvm::BasicBlock* entry_block, llvm::Value* arg_addrs,
+ llvm::Value* arg_local_bitmaps, std::vector<llvm::Value*> slice_offsets,
+ llvm::Value* arg_context_ptr, llvm::Value* loop_var);
+
+ void Visit(const VectorReadValidityDex& dex) override;
+ void Visit(const VectorReadFixedLenValueDex& dex) override;
+ void Visit(const VectorReadVarLenValueDex& dex) override;
+ void Visit(const LocalBitMapValidityDex& dex) override;
+ void Visit(const TrueDex& dex) override;
+ void Visit(const FalseDex& dex) override;
+ void Visit(const LiteralDex& dex) override;
+ void Visit(const NonNullableFuncDex& dex) override;
+ void Visit(const NullableNeverFuncDex& dex) override;
+ void Visit(const NullableInternalFuncDex& dex) override;
+ void Visit(const IfDex& dex) override;
+ void Visit(const BooleanAndDex& dex) override;
+ void Visit(const BooleanOrDex& dex) override;
+ void Visit(const InExprDexBase<int32_t>& dex) override;
+ void Visit(const InExprDexBase<int64_t>& dex) override;
+ void Visit(const InExprDexBase<float>& dex) override;
+ void Visit(const InExprDexBase<double>& dex) override;
+ void Visit(const InExprDexBase<gandiva::DecimalScalar128>& dex) override;
+ void Visit(const InExprDexBase<std::string>& dex) override;
+ template <typename Type>
+ void VisitInExpression(const InExprDexBase<Type>& dex);
+
+ LValuePtr result() { return result_; }  // LValue left behind by the most recent Visit
+
+ bool has_arena_allocs() { return has_arena_allocs_; }  // set when a call returning a binary-like type was generated
+
+ private:
+ enum BufferType { kBufferTypeValidity = 0, kBufferTypeData, kBufferTypeOffsets };
+
+ llvm::IRBuilder<>* ir_builder() { return generator_->ir_builder(); }
+ llvm::Module* module() { return generator_->module(); }
+
+ // Generate the code to build the combined validity (bitwise and) from the
+ // vector of validities.
+ llvm::Value* BuildCombinedValidity(const DexVector& validities);
+
+ // Generate the code to build the validity and the value for the given pair.
+ LValuePtr BuildValueAndValidity(const ValueValidityPair& pair);
+
+ // Generate code to build the params.
+ std::vector<llvm::Value*> BuildParams(FunctionHolder* holder,
+ const ValueValidityPairVector& args,
+ bool with_validity, bool with_context);
+
+ // Generate code to invoke a function call.
+ LValuePtr BuildFunctionCall(const NativeFunction* func, DataTypePtr arrow_return_type,
+ std::vector<llvm::Value*>* params);
+
+ // Generate code for an if-else condition.
+ LValuePtr BuildIfElse(llvm::Value* condition, std::function<LValuePtr()> then_func,
+ std::function<LValuePtr()> else_func,
+ DataTypePtr arrow_return_type);
+
+ // Switch to the entry_block and get reference of the validity/value/offsets buffer
+ llvm::Value* GetBufferReference(int idx, BufferType buffer_type, FieldPtr field);
+
+ // Get the slice offset of the validity/value/offsets buffer
+ llvm::Value* GetSliceOffset(int idx);
+
+ // Switch to the entry_block and get reference to the local bitmap.
+ llvm::Value* GetLocalBitMapReference(int idx);
+
+ // Clear the bit in the local bitmap, if is_valid is 'false'
+ void ClearLocalBitMapIfNotValid(int local_bitmap_idx, llvm::Value* is_valid);
+
+ LLVMGenerator* generator_;
+ LValuePtr result_;
+ llvm::Function* function_;
+ llvm::BasicBlock* entry_block_;
+ llvm::Value* arg_addrs_;
+ llvm::Value* arg_local_bitmaps_;
+ std::vector<llvm::Value*> slice_offsets_;
+ llvm::Value* arg_context_ptr_;
+ llvm::Value* loop_var_;
+ bool has_arena_allocs_;
+ };
+
+ // Generate the code for one expression for default mode, with the output of
+ // the expression going to 'output'.
+ Status Add(const ExpressionPtr expr, const FieldDescriptorPtr output);
+
+ /// Generate code to load the vector at specified index in the 'arg_addrs' array.
+ llvm::Value* LoadVectorAtIndex(llvm::Value* arg_addrs, int idx,
+ const std::string& name);
+
+ /// Generate code to load the vector at specified index and cast it as bitmap.
+ llvm::Value* GetValidityReference(llvm::Value* arg_addrs, int idx, FieldPtr field);
+
+ /// Generate code to load the vector at specified index and cast it as data array.
+ llvm::Value* GetDataReference(llvm::Value* arg_addrs, int idx, FieldPtr field);
+
+ /// Generate code to load the vector at specified index and cast it as offsets array.
+ llvm::Value* GetOffsetsReference(llvm::Value* arg_addrs, int idx, FieldPtr field);
+
+ /// Generate code to load the vector at specified index and cast it as buffer pointer.
+ llvm::Value* GetDataBufferPtrReference(llvm::Value* arg_addrs, int idx, FieldPtr field);
+
+ /// Generate code for the value array of one expression.
+ Status CodeGenExprValue(DexPtr value_expr, int num_buffers, FieldDescriptorPtr output,
+ int suffix_idx, llvm::Function** fn,
+ SelectionVector::Mode selection_vector_mode);
+
+ /// Generate code to load the local bitmap at the specified index and cast it as bitmap.
+ llvm::Value* GetLocalBitMapReference(llvm::Value* arg_bitmaps, int idx);
+
+ /// Generate code to get the bit value at 'position' in the bitmap.
+ llvm::Value* GetPackedBitValue(llvm::Value* bitmap, llvm::Value* position);
+
+ /// Generate code to get the bit value at 'position' in the validity bitmap.
+ llvm::Value* GetPackedValidityBitValue(llvm::Value* bitmap, llvm::Value* position);
+
+ /// Generate code to set the bit value at 'position' in the bitmap to 'value'.
+ void SetPackedBitValue(llvm::Value* bitmap, llvm::Value* position, llvm::Value* value);
+
+ /// Generate code to clear the bit value at 'position' in the bitmap if 'value'
+ /// is false.
+ void ClearPackedBitValueIfFalse(llvm::Value* bitmap, llvm::Value* position,
+ llvm::Value* value);
+
+ // Generate code to build a DecimalLValue with specified value/precision/scale.
+ std::shared_ptr<DecimalLValue> BuildDecimalLValue(llvm::Value* value,
+ DataTypePtr arrow_type);
+
+ /// Generate code to make a function call (to a pre-compiled IR function) which takes
+ /// 'args' and has a return type 'ret_type'.
+ llvm::Value* AddFunctionCall(const std::string& full_name, llvm::Type* ret_type,
+ const std::vector<llvm::Value*>& args);
+
+ /// Compute the result bitmap for the expression.
+ ///
+ /// \param[in] compiled_expr the compiled expression (includes the bitmap indices to be
+ /// used for computing the validity bitmap of the result).
+ /// \param[in] eval_batch (includes input/output buffer addresses)
+ /// \param[in] selection_vector the list of selected positions
+ void ComputeBitMapsForExpr(const CompiledExpr& compiled_expr,
+ const EvalBatch& eval_batch,
+ const SelectionVector* selection_vector);
+
+ /// Replace the %T in the trace msg with the correct type corresponding to 'type'
+ /// eg. %d for int32, %lld for int64, ..
+ std::string ReplaceFormatInTrace(const std::string& msg, llvm::Value* value,
+ std::string* print_fn);
+
+ /// Generate the code to print a trace msg with one optional argument (%T)
+ void AddTrace(const std::string& msg, llvm::Value* value = NULLPTR);
+
+ std::unique_ptr<Engine> engine_;
+ std::vector<std::unique_ptr<CompiledExpr>> compiled_exprs_;
+ FunctionRegistry function_registry_;
+ Annotator annotator_;
+ SelectionVector::Mode selection_vector_mode_;
+
+ // used for debug
+ bool enable_ir_traces_;
+ std::vector<std::string> trace_strings_;  // owns the trace format strings whose addresses are embedded in the IR
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/llvm_generator_test.cc b/src/arrow/cpp/src/gandiva/llvm_generator_test.cc
new file mode 100644
index 000000000..bdc3b0051
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/llvm_generator_test.cc
@@ -0,0 +1,116 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/llvm_generator.h"
+
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "gandiva/configuration.h"
+#include "gandiva/dex.h"
+#include "gandiva/expression.h"
+#include "gandiva/func_descriptor.h"
+#include "gandiva/function_registry.h"
+#include "gandiva/tests/test_util.h"
+
+namespace gandiva {
+
+typedef int64_t (*add_vector_func_t)(int64_t* elements, int nelements);
+
+class TestLLVMGenerator : public ::testing::Test {  // Fixture shared by the LLVMGenerator tests.
+ protected:
+ FunctionRegistry registry_;  // iterated by VerifyPCFunctions to enumerate the registered functions
+};
+
+// Every function listed in the registry must have a matching pre-compiled
+// function (looked up by its pc_name) in the generator's LLVM module.
+TEST_F(TestLLVMGenerator, VerifyPCFunctions) {
+  std::unique_ptr<LLVMGenerator> generator;
+  ASSERT_OK(LLVMGenerator::Make(TestConfiguration(), &generator));
+
+  llvm::Module* ir_module = generator->module();
+  for (auto it = registry_.begin(); it != registry_.end(); ++it) {
+    llvm::Function* pc_function = ir_module->getFunction(it->pc_name());
+    EXPECT_NE(pc_function, nullptr);
+  }
+}
+
+TEST_F(TestLLVMGenerator, TestAdd) {
+ // Set up an LLVM generator that adds two int32 vectors, compile it and run it on 4 records.
+ std::unique_ptr<LLVMGenerator> generator;
+ ASSERT_OK(LLVMGenerator::Make(TestConfiguration(), &generator));
+ Annotator annotator;
+
+ auto field0 = std::make_shared<arrow::Field>("f0", arrow::int32());
+ auto desc0 = annotator.CheckAndAddInputFieldDescriptor(field0);
+ auto validity_dex0 = std::make_shared<VectorReadValidityDex>(desc0);
+ auto value_dex0 = std::make_shared<VectorReadFixedLenValueDex>(desc0);
+ auto pair0 = std::make_shared<ValueValidityPair>(validity_dex0, value_dex0);
+
+ auto field1 = std::make_shared<arrow::Field>("f1", arrow::int32());
+ auto desc1 = annotator.CheckAndAddInputFieldDescriptor(field1);
+ auto validity_dex1 = std::make_shared<VectorReadValidityDex>(desc1);
+ auto value_dex1 = std::make_shared<VectorReadFixedLenValueDex>(desc1);
+ auto pair1 = std::make_shared<ValueValidityPair>(validity_dex1, value_dex1);
+
+ DataTypeVector params{arrow::int32(), arrow::int32()};
+ auto func_desc = std::make_shared<FuncDescriptor>("add", params, arrow::int32());
+ FunctionSignature signature(func_desc->name(), func_desc->params(),
+ func_desc->return_type());
+ const NativeFunction* native_func =
+ generator->function_registry_.LookupSignature(signature);
+
+ std::vector<ValueValidityPairPtr> pairs{pair0, pair1};
+ auto func_dex = std::make_shared<NonNullableFuncDex>(func_desc, native_func,
+ FunctionHolderPtr(nullptr), pairs);
+
+ auto field_sum = std::make_shared<arrow::Field>("out", arrow::int32());
+ auto desc_sum = annotator.CheckAndAddInputFieldDescriptor(field_sum);
+
+ llvm::Function* ir_func = nullptr;
+
+ ASSERT_OK(generator->CodeGenExprValue(func_dex, 4, desc_sum, 0, &ir_func,
+ SelectionVector::MODE_NONE));
+
+ ASSERT_OK(generator->engine_->FinalizeModule());
+ auto ir = generator->engine_->DumpIR();
+ EXPECT_THAT(ir, testing::HasSubstr("vector.body"));  // presumably checks that the loop got vectorized - confirm
+
+ EvalFunc eval_func = (EvalFunc)generator->engine_->CompiledFunction(ir_func);
+
+ constexpr size_t kNumRecords = 4;
+ std::array<uint32_t, kNumRecords> a0{1, 2, 3, 4};
+ std::array<uint32_t, kNumRecords> a1{5, 6, 7, 8};
+ uint64_t in_bitmap = 0xffffffffffffffffull;  // all bits set; shared by both inputs
+
+ std::array<uint32_t, kNumRecords> out{0, 0, 0, 0};
+ uint64_t out_bitmap = 0;
+
+ std::array<uint8_t*, 6> addrs{
+ reinterpret_cast<uint8_t*>(a0.data()), reinterpret_cast<uint8_t*>(&in_bitmap),
+ reinterpret_cast<uint8_t*>(a1.data()), reinterpret_cast<uint8_t*>(&in_bitmap),
+ reinterpret_cast<uint8_t*>(out.data()), reinterpret_cast<uint8_t*>(&out_bitmap),
+ };
+ std::array<int64_t, 6> addr_offsets{0, 0, 0, 0, 0, 0};
+ eval_func(addrs.data(), addr_offsets.data(), nullptr, nullptr,
+ 0 /* dummy context ptr */, kNumRecords);
+
+ EXPECT_THAT(out, testing::ElementsAre(6, 8, 10, 12));
+ EXPECT_EQ(out_bitmap, 0ULL);  // the value function is expected to leave the output bitmap untouched
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/llvm_includes.h b/src/arrow/cpp/src/gandiva/llvm_includes.h
new file mode 100644
index 000000000..37f915eb5
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/llvm_includes.h
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4141)
+#pragma warning(disable : 4146)
+#pragma warning(disable : 4244)
+#pragma warning(disable : 4267)
+#pragma warning(disable : 4291)
+#pragma warning(disable : 4624)
+#endif
+
+#include <llvm/ExecutionEngine/ExecutionEngine.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+
+#if LLVM_VERSION_MAJOR >= 10
+#define LLVM_ALIGN(alignment) (llvm::Align((alignment)))
+#else
+#define LLVM_ALIGN(alignment) (alignment)
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+// Workaround for deprecated builder methods as of LLVM 13: ARROW-14363
+//
+// Builds a GEP on `Ptr`, passing the element type explicitly. The element
+// type is derived from the (scalar) pointer type of `Ptr` itself.
+inline llvm::Value* CreateGEP(llvm::IRBuilder<>* builder, llvm::Value* Ptr,
+                              llvm::ArrayRef<llvm::Value*> IdxList,
+                              const llvm::Twine& Name = "") {
+  llvm::Type* pointee_type = Ptr->getType()->getScalarType()->getPointerElementType();
+  return builder->CreateGEP(pointee_type, Ptr, IdxList, Name);
+}
+
+// Builds a load from `Ptr`, passing the loaded type explicitly. The type is
+// derived from the pointer type of `Ptr` itself.
+inline llvm::LoadInst* CreateLoad(llvm::IRBuilder<>* builder, llvm::Value* Ptr,
+                                  const llvm::Twine& Name = "") {
+  llvm::Type* pointee_type = Ptr->getType()->getPointerElementType();
+  return builder->CreateLoad(pointee_type, Ptr, Name);
+}
diff --git a/src/arrow/cpp/src/gandiva/llvm_types.cc b/src/arrow/cpp/src/gandiva/llvm_types.cc
new file mode 100644
index 000000000..de322a8c0
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/llvm_types.cc
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/llvm_types.h"
+
+namespace gandiva {
+
+// LLVM doesn't distinguish between signed and unsigned types.
+
+// Populates the arrow type id -> LLVM type table. Signed and unsigned arrow
+// integers of the same width share one LLVM integer type.
+LLVMTypes::LLVMTypes(llvm::LLVMContext& context) : context_(context) {
+  llvm::Type* const i8 = i8_type();
+  llvm::Type* const i16 = i16_type();
+  llvm::Type* const i32 = i32_type();
+  llvm::Type* const i64 = i64_type();
+  llvm::Type* const i8_ptr = i8_ptr_type();
+
+  arrow_id_to_llvm_type_map_ = {
+      // boolean and integers
+      {arrow::Type::type::BOOL, i1_type()},
+      {arrow::Type::type::INT8, i8},
+      {arrow::Type::type::INT16, i16},
+      {arrow::Type::type::INT32, i32},
+      {arrow::Type::type::INT64, i64},
+      {arrow::Type::type::UINT8, i8},
+      {arrow::Type::type::UINT16, i16},
+      {arrow::Type::type::UINT32, i32},
+      {arrow::Type::type::UINT64, i64},
+      // floating point
+      {arrow::Type::type::FLOAT, float_type()},
+      {arrow::Type::type::DOUBLE, double_type()},
+      // date/time types
+      {arrow::Type::type::DATE32, i32},
+      {arrow::Type::type::DATE64, i64},
+      {arrow::Type::type::TIME32, i32},
+      {arrow::Type::type::TIME64, i64},
+      {arrow::Type::type::TIMESTAMP, i64},
+      // variable-length types
+      {arrow::Type::type::STRING, i8_ptr},
+      {arrow::Type::type::BINARY, i8_ptr},
+      // decimal
+      {arrow::Type::type::DECIMAL, i128_type()},
+      // intervals
+      {arrow::Type::type::INTERVAL_MONTHS, i32},
+      {arrow::Type::type::INTERVAL_DAY_TIME, i64},
+  };
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/llvm_types.h b/src/arrow/cpp/src/gandiva/llvm_types.h
new file mode 100644
index 000000000..d6f095271
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/llvm_types.h
@@ -0,0 +1,130 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <map>
+#include <vector>
+
+#include "arrow/util/logging.h"
+#include "gandiva/arrow.h"
+#include "gandiva/llvm_includes.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// \brief Holder for llvm types, and mappings between arrow types and llvm types.
+///
+/// All types and constants handed out by this class are created in the
+/// llvm::LLVMContext passed to the constructor; the context is held by reference.
+class GANDIVA_EXPORT LLVMTypes {
+ public:
+  /// \param[in] context llvm context in which all types/constants are created.
+  explicit LLVMTypes(llvm::LLVMContext& context);
+
+  // --- Basic IR types -------------------------------------------------------
+
+  llvm::Type* void_type() { return llvm::Type::getVoidTy(context_); }
+
+  llvm::Type* i1_type() { return llvm::Type::getInt1Ty(context_); }
+
+  llvm::Type* i8_type() { return llvm::Type::getInt8Ty(context_); }
+
+  llvm::Type* i16_type() { return llvm::Type::getInt16Ty(context_); }
+
+  llvm::Type* i32_type() { return llvm::Type::getInt32Ty(context_); }
+
+  llvm::Type* i64_type() { return llvm::Type::getInt64Ty(context_); }
+
+  llvm::Type* i128_type() { return llvm::Type::getInt128Ty(context_); }
+
+  /// Non-packed struct of two i64 members, used to pass a 128-bit value in halves.
+  llvm::StructType* i128_split_type() {
+    // struct with high/low bits (see decimal_ops.cc:DecimalSplit)
+    return llvm::StructType::get(context_, {i64_type(), i64_type()}, false);
+  }
+
+  llvm::Type* float_type() { return llvm::Type::getFloatTy(context_); }
+
+  llvm::Type* double_type() { return llvm::Type::getDoubleTy(context_); }
+
+  // --- Pointer types --------------------------------------------------------
+
+  /// Pointer-to-\p type.
+  llvm::PointerType* ptr_type(llvm::Type* type) { return type->getPointerTo(); }
+
+  llvm::PointerType* i8_ptr_type() { return ptr_type(i8_type()); }
+
+  llvm::PointerType* i32_ptr_type() { return ptr_type(i32_type()); }
+
+  llvm::PointerType* i64_ptr_type() { return ptr_type(i64_type()); }
+
+  llvm::PointerType* i128_ptr_type() { return ptr_type(i128_type()); }
+
+  // --- Constants ------------------------------------------------------------
+
+  /// Integer constant of width \p N bits (defaults to the bit width of \p ctype).
+  template <typename ctype, size_t N = (sizeof(ctype) * CHAR_BIT)>
+  llvm::Constant* int_constant(ctype val) {
+    return llvm::ConstantInt::get(context_, llvm::APInt(N, val));
+  }
+
+  llvm::Constant* i1_constant(bool val) { return int_constant<bool, 1>(val); }
+  llvm::Constant* i8_constant(int8_t val) { return int_constant(val); }
+  llvm::Constant* i16_constant(int16_t val) { return int_constant(val); }
+  llvm::Constant* i32_constant(int32_t val) { return int_constant(val); }
+  llvm::Constant* i64_constant(int64_t val) { return int_constant(val); }
+
+  /// 128-bit constant holding the (signed) 64-bit value \p val.
+  llvm::Constant* i128_constant(int64_t val) {
+    // The value is widened from 64 to 128 bits. Flag it as signed so that a
+    // negative input is sign-extended; with APInt's default (unsigned) the upper
+    // 64 bits would be zero and e.g. -1 would turn into 2^64 - 1.
+    return llvm::ConstantInt::get(
+        context_, llvm::APInt(128, static_cast<uint64_t>(val), /*isSigned=*/true));
+  }
+
+  llvm::Constant* true_constant() { return i1_constant(true); }
+  llvm::Constant* false_constant() { return i1_constant(false); }
+
+  llvm::Constant* i128_zero() { return i128_constant(0); }
+  llvm::Constant* i128_one() { return i128_constant(1); }
+
+  llvm::Constant* float_constant(float val) {
+    return llvm::ConstantFP::get(float_type(), val);
+  }
+
+  llvm::Constant* double_constant(double val) {
+    return llvm::ConstantFP::get(double_type(), val);
+  }
+
+  /// Zero-like constant for \p type: integer 0, floating point 0, or a null
+  /// pointer. Any other kind of type trips the DCHECK in debug builds.
+  llvm::Constant* NullConstant(llvm::Type* type) {
+    if (type->isIntegerTy()) {
+      return llvm::ConstantInt::get(type, 0);
+    } else if (type->isFloatingPointTy()) {
+      return llvm::ConstantFP::get(type, 0);
+    } else {
+      DCHECK(type->isPointerTy());
+      return llvm::ConstantPointerNull::getNullValue(type);
+    }
+  }
+
+  // --- arrow <-> IR mapping -------------------------------------------------
+
+  /// For a given data type, find the ir type used for the data vector slot.
+  /// Returns nullptr when the type's id has no mapping.
+  llvm::Type* DataVecType(const DataTypePtr& data_type) {
+    return IRType(data_type->id());
+  }
+
+  /// For a given minor type, find the corresponding ir type.
+  /// Returns nullptr when \p arrow_type has no mapping.
+  llvm::Type* IRType(arrow::Type::type arrow_type) {
+    auto found = arrow_id_to_llvm_type_map_.find(arrow_type);
+    return (found == arrow_id_to_llvm_type_map_.end()) ? nullptr : found->second;
+  }
+
+  /// All arrow type ids that have an IR mapping, in the map's key order.
+  std::vector<arrow::Type::type> GetSupportedArrowTypes() {
+    std::vector<arrow::Type::type> retval;
+    retval.reserve(arrow_id_to_llvm_type_map_.size());
+    for (auto const& element : arrow_id_to_llvm_type_map_) {
+      retval.push_back(element.first);
+    }
+    return retval;
+  }
+
+ private:
+  /// Lookup table from arrow type id to IR type.
+  std::map<arrow::Type::type, llvm::Type*> arrow_id_to_llvm_type_map_;
+
+  llvm::LLVMContext& context_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/llvm_types_test.cc b/src/arrow/cpp/src/gandiva/llvm_types_test.cc
new file mode 100644
index 000000000..666968306
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/llvm_types_test.cc
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/llvm_types.h"
+
+#include <gtest/gtest.h>
+
+namespace gandiva {
+
+// Fixture that gives every test case its own LLVMTypes instance, created in the
+// fixture-owned llvm context.
+class TestLLVMTypes : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    types_ = new LLVMTypes(context_);
+  }
+
+  void TearDown() override {
+    delete types_;
+  }
+
+  llvm::LLVMContext context_;
+  LLVMTypes* types_;
+};
+
+// Supported arrow types resolve to the expected IR type, both when looked up by
+// type id and when looked up by data type.
+TEST_F(TestLLVMTypes, TestFound) {
+  LLVMTypes& types = *types_;
+
+  // Lookup by arrow type id.
+  EXPECT_EQ(types.IRType(arrow::Type::BOOL), types.i1_type());
+  EXPECT_EQ(types.IRType(arrow::Type::INT32), types.i32_type());
+  EXPECT_EQ(types.IRType(arrow::Type::INT64), types.i64_type());
+  EXPECT_EQ(types.IRType(arrow::Type::FLOAT), types.float_type());
+  EXPECT_EQ(types.IRType(arrow::Type::DOUBLE), types.double_type());
+  EXPECT_EQ(types.IRType(arrow::Type::DATE64), types.i64_type());
+  EXPECT_EQ(types.IRType(arrow::Type::TIME64), types.i64_type());
+  EXPECT_EQ(types.IRType(arrow::Type::TIMESTAMP), types.i64_type());
+
+  // Lookup by arrow data type.
+  EXPECT_EQ(types.DataVecType(arrow::boolean()), types.i1_type());
+  EXPECT_EQ(types.DataVecType(arrow::int32()), types.i32_type());
+  EXPECT_EQ(types.DataVecType(arrow::int64()), types.i64_type());
+  EXPECT_EQ(types.DataVecType(arrow::float32()), types.float_type());
+  EXPECT_EQ(types.DataVecType(arrow::float64()), types.double_type());
+  EXPECT_EQ(types.DataVecType(arrow::date64()), types.i64_type());
+  EXPECT_EQ(types.DataVecType(arrow::time64(arrow::TimeUnit::MICRO)), types.i64_type());
+  EXPECT_EQ(types.DataVecType(arrow::timestamp(arrow::TimeUnit::MILLI)),
+            types.i64_type());
+}
+
+// Types without an IR mapping yield a null pointer from both lookup entry points.
+TEST_F(TestLLVMTypes, TestNotFound) {
+  LLVMTypes& types = *types_;
+
+  EXPECT_EQ(types.IRType(arrow::Type::SPARSE_UNION), nullptr);
+  EXPECT_EQ(types.IRType(arrow::Type::DENSE_UNION), nullptr);
+  EXPECT_EQ(types.DataVecType(arrow::null()), nullptr);
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/local_bitmaps_holder.h b/src/arrow/cpp/src/gandiva/local_bitmaps_holder.h
new file mode 100644
index 000000000..a172fb973
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/local_bitmaps_holder.h
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include <arrow/util/logging.h>
+#include "gandiva/arrow.h"
+#include "gandiva/gandiva_aliases.h"
+
+namespace gandiva {
+
+/// \brief Owns the local bitmaps allocated for one batch of records, used for
+/// expression evaluation.
+///
+/// The holder allocates a fixed number of equally sized bitmaps up front and
+/// exposes them both individually and as a raw array of pointers.
+class LocalBitMapsHolder {
+ public:
+  /// \param[in] num_records number of records each bitmap must accommodate.
+  /// \param[in] num_local_bitmaps number of bitmaps to allocate.
+  LocalBitMapsHolder(int64_t num_records, int num_local_bitmaps);
+
+  /// Number of bitmaps owned by this holder.
+  int GetNumLocalBitMaps() const { return static_cast<int>(local_bitmaps_vec_.size()); }
+
+  /// Size, in bytes, of each individual bitmap.
+  int64_t GetLocalBitMapSize() const { return local_bitmap_size_; }
+
+  /// Raw array of pointers to the bitmaps; null if no bitmaps were allocated.
+  uint8_t** GetLocalBitMapArray() const { return local_bitmaps_array_.get(); }
+
+  /// Pointer to the bitmap at position \p idx.
+  uint8_t* GetLocalBitMap(int idx) const {
+    // Valid indices are [0, count): idx == count is already one past the end of
+    // the pointer array, and a negative idx would read before it.
+    DCHECK(idx >= 0 && idx < GetNumLocalBitMaps());
+    return local_bitmaps_array_.get()[idx];
+  }
+
+ private:
+  /// number of records in the current batch.
+  int64_t num_records_;
+
+  /// A container of 'local_bitmaps_', each sized to accommodate 'num_records'.
+  std::vector<std::unique_ptr<uint8_t[]>> local_bitmaps_vec_;
+
+  /// An array of the local bitmaps.
+  std::unique_ptr<uint8_t*[]> local_bitmaps_array_;
+
+  /// Size in bytes of each bitmap.
+  int64_t local_bitmap_size_;
+};
+
+/// Allocates 'num_local_bitmaps' bitmaps, each large enough for 'num_records' bits
+/// and rounded up to a whole number of 64-bit words, and sets every bit in them.
+inline LocalBitMapsHolder::LocalBitMapsHolder(int64_t num_records, int num_local_bitmaps)
+    : num_records_(num_records) {
+  // Bitmap size in bytes: number of 64-bit words needed for num_records_ bits,
+  // times 8 bytes per word.
+  const int64_t num_words = (num_records_ + 63) >> 6;
+  local_bitmap_size_ = num_words * 8;
+
+  // The pointer array is only allocated when there is at least one bitmap.
+  if (num_local_bitmaps > 0) {
+    local_bitmaps_array_.reset(new uint8_t*[num_local_bitmaps]);
+  }
+  uint8_t** slots = local_bitmaps_array_.get();
+
+  for (int idx = 0; idx < num_local_bitmaps; ++idx) {
+    // TODO : round-up to a slab friendly multiple.
+    std::unique_ptr<uint8_t[]> buffer(new uint8_t[local_bitmap_size_]);
+
+    // Start with all bits set (assuming that the probability of is_valid is higher).
+    memset(buffer.get(), 0xff, local_bitmap_size_);
+
+    // Publish the raw pointer in the array, then hand ownership to the vector.
+    slots[idx] = buffer.get();
+    local_bitmaps_vec_.push_back(std::move(buffer));
+  }
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/lvalue.h b/src/arrow/cpp/src/gandiva/lvalue.h
new file mode 100644
index 000000000..df292855b
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/lvalue.h
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/util/macros.h"
+
+#include "arrow/util/logging.h"
+#include "gandiva/llvm_includes.h"
+
+namespace gandiva {
+
+/// \brief Bundles the llvm values (data, optional length, optional validity)
+/// that together represent one value during code generation.
+class GANDIVA_EXPORT LValue {
+ public:
+  /// \param[in] data llvm value holding the data.
+  /// \param[in] length llvm value holding the length, or NULLPTR if there is none.
+  /// \param[in] validity llvm value holding the validity, or NULLPTR if there is none.
+  explicit LValue(llvm::Value* data, llvm::Value* length = NULLPTR,
+                  llvm::Value* validity = NULLPTR)
+      : data_(data), length_(length), validity_(validity) {}
+
+  virtual ~LValue() = default;
+
+  llvm::Value* data() { return data_; }
+
+  llvm::Value* length() { return length_; }
+
+  llvm::Value* validity() { return validity_; }
+
+  void set_data(llvm::Value* data) { data_ = data; }
+
+  /// Appends the values needed to pass this LValue as a function argument:
+  /// the data, followed by the length when one is present.
+  virtual void AppendFunctionParams(std::vector<llvm::Value*>* params) {
+    params->push_back(data_);
+    if (length_ == NULLPTR) {
+      return;
+    }
+    params->push_back(length_);
+  }
+
+ private:
+  llvm::Value* data_;
+  llvm::Value* length_;
+  llvm::Value* validity_;
+};
+
+/// \brief LValue for decimals: carries precision and scale in addition to the
+/// data and validity, and has no length.
+class GANDIVA_EXPORT DecimalLValue : public LValue {
+ public:
+  DecimalLValue(llvm::Value* data, llvm::Value* validity, llvm::Value* precision,
+                llvm::Value* scale)
+      : LValue(data, NULLPTR, validity), precision_(precision), scale_(scale) {}
+
+  llvm::Value* precision() { return precision_; }
+
+  llvm::Value* scale() { return scale_; }
+
+  /// Appends the base-class parameters, followed by precision and then scale.
+  void AppendFunctionParams(std::vector<llvm::Value*>* params) override {
+    LValue::AppendFunctionParams(params);
+    params->insert(params->end(), {precision_, scale_});
+  }
+
+ private:
+  llvm::Value* precision_;
+  llvm::Value* scale_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/make_precompiled_bitcode.py b/src/arrow/cpp/src/gandiva/make_precompiled_bitcode.py
new file mode 100644
index 000000000..97d96f8a8
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/make_precompiled_bitcode.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+# Placeholder that must occur exactly once in the template; apply_template()
+# replaces it with the expanded data bytes.
+marker = b"<DATA_CHARS>"
+
+def expand(data):
+ """
+ Expand *data* as a initializer list of hexadecimal char escapes.
+ """
+ expanded_data = ", ".join([hex(c) for c in bytearray(data)])
+ return expanded_data.encode('ascii')
+
+
+def apply_template(template, data):
+ if template.count(marker) != 1:
+ raise ValueError("Invalid template")
+ return template.replace(marker, expand(data))
+
+
+if __name__ == "__main__":
+ if len(sys.argv) != 4:
+ raise ValueError("Usage: {0} <template file> <data file> "
+ "<output file>".format(sys.argv[0]))
+ with open(sys.argv[1], "rb") as f:
+ template = f.read()
+ with open(sys.argv[2], "rb") as f:
+ data = f.read()
+
+ expanded_data = apply_template(template, data)
+ with open(sys.argv[3], "wb") as f:
+ f.write(expanded_data)
diff --git a/src/arrow/cpp/src/gandiva/native_function.h b/src/arrow/cpp/src/gandiva/native_function.h
new file mode 100644
index 000000000..1268a2567
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/native_function.h
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gandiva/arrow.h"
+#include "gandiva/function_signature.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// \brief How the validity (null-ness) of a function's result is determined.
+enum ResultNullableType {
+ /// result validity is an intersection of the validity of the children.
+ kResultNullIfNull,
+ /// result is always valid.
+ kResultNullNever,
+ /// result validity depends on some internal logic.
+ kResultNullInternal,
+};
+
+/// \brief Describes one precompiled function: the signatures (base name plus
+/// aliases) under which it can appear in an expression, the name of the
+/// precompiled function, and its attributes.
+class GANDIVA_EXPORT NativeFunction {
+ public:
+  // Attribute bits that can be or-ed together in the 'flags' constructor argument.
+  static constexpr int32_t kNeedsContext = (1 << 1);
+  static constexpr int32_t kNeedsFunctionHolder = (1 << 2);
+  static constexpr int32_t kCanReturnErrors = (1 << 3);
+
+  /// Registers one signature for \p base_name and one for every entry of
+  /// \p aliases; all of them share \p param_types and \p ret_type.
+  NativeFunction(const std::string& base_name, const std::vector<std::string>& aliases,
+                 const DataTypeVector& param_types, DataTypePtr ret_type,
+                 const ResultNullableType& result_nullable_type,
+                 const std::string& pc_name, int32_t flags = 0)
+      : flags_(flags), result_nullable_type_(result_nullable_type), pc_name_(pc_name) {
+    signatures_.reserve(aliases.size() + 1);
+    // The base name comes first, followed by the aliases in the given order.
+    signatures_.emplace_back(base_name, param_types, ret_type);
+    for (const auto& alias : aliases) {
+      signatures_.emplace_back(alias, param_types, ret_type);
+    }
+  }
+
+  const std::vector<FunctionSignature>& signatures() const { return signatures_; }
+
+  std::string pc_name() const { return pc_name_; }
+
+  ResultNullableType result_nullable_type() const { return result_nullable_type_; }
+
+  // Attribute queries: each one tests a single bit of the flags.
+  bool NeedsContext() const { return (flags_ & kNeedsContext) != 0; }
+
+  bool NeedsFunctionHolder() const { return (flags_ & kNeedsFunctionHolder) != 0; }
+
+  bool CanReturnErrors() const { return (flags_ & kCanReturnErrors) != 0; }
+
+ private:
+  /// one signature per name (base name and aliases).
+  std::vector<FunctionSignature> signatures_;
+
+  /// attributes
+  int32_t flags_;
+  ResultNullableType result_nullable_type_;
+
+  /// pre-compiled function name.
+  std::string pc_name_;
+};
+
+} // end namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/node.h b/src/arrow/cpp/src/gandiva/node.h
new file mode 100644
index 000000000..20807d4a0
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/node.h
@@ -0,0 +1,299 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <sstream>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "arrow/status.h"
+
+#include "gandiva/arrow.h"
+#include "gandiva/func_descriptor.h"
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/literal_holder.h"
+#include "gandiva/node_visitor.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// \brief Represents a node in the expression tree. Validity and value are
+/// in a joined state.
+///
+/// Abstract base class: it stores the node's result type and requires derived
+/// classes to provide visitor dispatch (Accept) and a textual form (ToString).
+class GANDIVA_EXPORT Node {
+ public:
+ /// \param[in] return_type data type of the value produced by this node.
+ explicit Node(DataTypePtr return_type) : return_type_(return_type) {}
+
+ virtual ~Node() = default;
+
+ /// The data type this node was constructed with.
+ const DataTypePtr& return_type() const { return return_type_; }
+
+ /// Derived classes should simply invoke the Visit api of the visitor.
+ virtual Status Accept(NodeVisitor& visitor) const = 0;
+
+ /// Textual rendering of the node; the format is defined by each derived class.
+ virtual std::string ToString() const = 0;
+
+ protected:
+ DataTypePtr return_type_;
+};
+
+/// \brief Expression-tree node that carries a constant value, or a typed null.
+class GANDIVA_EXPORT LiteralNode : public Node {
+ public:
+  LiteralNode(DataTypePtr type, const LiteralHolder& holder, bool is_null)
+      : Node(type), holder_(holder), is_null_(is_null) {}
+
+  Status Accept(NodeVisitor& visitor) const override { return visitor.Visit(*this); }
+
+  const LiteralHolder& holder() const { return holder_; }
+
+  bool is_null() const { return is_null_; }
+
+  /// Renders as "(const <type>) <value>", or "(const <type>) null" for a null
+  /// literal. FLOAT and DOUBLE literals additionally get " raw(<bits in hex>)".
+  std::string ToString() const override {
+    std::stringstream out;
+    out << "(const " << return_type()->ToString() << ") ";
+    if (is_null_) {
+      out << "null";
+      return out.str();
+    }
+
+    out << gandiva::ToString(holder_);
+
+    // Decimal formatting of a floating point value can lose precision, so the
+    // exact bit pattern is appended in hex. hexfloat is not used because
+    // gcc 4.9 doesn't support it.
+    const auto type_id = return_type()->id();
+    if (type_id == arrow::Type::FLOAT) {
+      const float as_float = arrow::util::get<float>(holder_);
+      uint32_t raw_bits;
+      memcpy(&raw_bits, &as_float, sizeof(raw_bits));
+      out << " raw(" << std::hex << raw_bits << ")";
+    } else if (type_id == arrow::Type::DOUBLE) {
+      const double as_double = arrow::util::get<double>(holder_);
+      uint64_t raw_bits;
+      memcpy(&raw_bits, &as_double, sizeof(raw_bits));
+      out << " raw(" << std::hex << raw_bits << ")";
+    }
+    return out.str();
+  }
+
+ private:
+  LiteralHolder holder_;
+  bool is_null_;
+};
+
+/// \brief Expression-tree node that refers to an arrow field; the node's result
+/// type is the field's type.
+class GANDIVA_EXPORT FieldNode : public Node {
+ public:
+  explicit FieldNode(FieldPtr field) : Node(field->type()), field_(field) {}
+
+  Status Accept(NodeVisitor& visitor) const override { return visitor.Visit(*this); }
+
+  const FieldPtr& field() const { return field_; }
+
+  /// Renders as "(<field type>) <field name>".
+  std::string ToString() const override {
+    std::string text("(");
+    text += field_->type()->ToString();
+    text += ") ";
+    text += field_->name();
+    return text;
+  }
+
+ private:
+  FieldPtr field_;
+};
+
+/// \brief Expression-tree node for a function call: a function descriptor plus
+/// the child nodes that supply the arguments.
+class GANDIVA_EXPORT FunctionNode : public Node {
+ public:
+  FunctionNode(const std::string& name, const NodeVector& children, DataTypePtr retType);
+
+  Status Accept(NodeVisitor& visitor) const override { return visitor.Visit(*this); }
+
+  const FuncDescriptorPtr& descriptor() const { return descriptor_; }
+
+  const NodeVector& children() const { return children_; }
+
+  /// Renders as "<return type> <name>(<child>, <child>, ...)".
+  std::string ToString() const override {
+    std::stringstream out;
+    out << descriptor_->return_type()->ToString() << " " << descriptor_->name() << "(";
+
+    // The separator is empty for the first child and ", " for all later ones.
+    const char* separator = "";
+    for (const auto& arg : children_) {
+      out << separator << arg->ToString();
+      separator = ", ";
+    }
+
+    out << ")";
+    return out.str();
+  }
+
+ private:
+  FuncDescriptorPtr descriptor_;
+  NodeVector children_;
+};
+
+/// Stores the children and builds the function descriptor from the function
+/// name, the children's return types (in order) and the given return type.
+inline FunctionNode::FunctionNode(const std::string& name, const NodeVector& children,
+                                  DataTypePtr return_type)
+    : Node(return_type), children_(children) {
+  DataTypeVector arg_types;
+  for (const auto& arg : children_) {
+    arg_types.push_back(arg->return_type());
+  }
+
+  descriptor_ = FuncDescriptorPtr(new FuncDescriptor(name, arg_types, return_type));
+}
+
+/// \brief Expression-tree node for an if-else expression, made of a condition,
+/// a then-branch and an else-branch.
+class GANDIVA_EXPORT IfNode : public Node {
+ public:
+  IfNode(NodePtr condition, NodePtr then_node, NodePtr else_node, DataTypePtr result_type)
+      : Node(result_type),
+        condition_(condition),
+        then_node_(then_node),
+        else_node_(else_node) {}
+
+  Status Accept(NodeVisitor& visitor) const override { return visitor.Visit(*this); }
+
+  const NodePtr& condition() const { return condition_; }
+
+  const NodePtr& then_node() const { return then_node_; }
+
+  const NodePtr& else_node() const { return else_node_; }
+
+  /// Renders as "if (<condition>) { <then> } else { <else> }".
+  std::string ToString() const override {
+    // Render the parts in the same order in which they appear in the output.
+    const std::string condition_text = condition_->ToString();
+    const std::string then_text = then_node_->ToString();
+    const std::string else_text = else_node_->ToString();
+
+    std::string text("if (");
+    text += condition_text;
+    text += ") { ";
+    text += then_text;
+    text += " } else { ";
+    text += else_text;
+    text += " }";
+    return text;
+  }
+
+ private:
+  NodePtr condition_;
+  NodePtr then_node_;
+  NodePtr else_node_;
+};
+
+/// \brief Expression-tree node that combines its children with a boolean AND or
+/// OR; the result type is always arrow::boolean().
+class GANDIVA_EXPORT BooleanNode : public Node {
+ public:
+  enum ExprType : char { AND, OR };
+
+  BooleanNode(ExprType expr_type, const NodeVector& children)
+      : Node(arrow::boolean()), expr_type_(expr_type), children_(children) {}
+
+  Status Accept(NodeVisitor& visitor) const override { return visitor.Visit(*this); }
+
+  ExprType expr_type() const { return expr_type_; }
+
+  const NodeVector& children() const { return children_; }
+
+  /// Renders the children joined by " && " (AND) or " || " (OR).
+  std::string ToString() const override {
+    // The operator is the same for every pair of children, so pick it once.
+    const char* op_text = (expr_type_ == BooleanNode::AND) ? " && " : " || ";
+
+    std::stringstream out;
+    const char* separator = "";
+    for (const auto& operand : children_) {
+      out << separator << operand->ToString();
+      separator = op_text;
+    }
+    return out.str();
+  }
+
+ private:
+  ExprType expr_type_;
+  NodeVector children_;
+};
+
+/// \brief Expression-tree node for an IN expression: tests the evaluated
+/// expression against a set of constant values. The result type is
+/// arrow::boolean().
+template <typename Type>
+class InExpressionNode : public Node {
+ public:
+  InExpressionNode(NodePtr eval_expr, const std::unordered_set<Type>& values)
+      : Node(arrow::boolean()), eval_expr_(eval_expr), values_(values) {}
+
+  const NodePtr& eval_expr() const { return eval_expr_; }
+
+  const std::unordered_set<Type>& values() const { return values_; }
+
+  Status Accept(NodeVisitor& visitor) const override { return visitor.Visit(*this); }
+
+  /// Renders as "<expr> IN (<v1>, <v2>, ...)"; the values appear in the set's
+  /// iteration order.
+  std::string ToString() const override {
+    std::stringstream out;
+    out << eval_expr_->ToString() << " IN (";
+
+    const char* separator = "";
+    for (const auto& candidate : values_) {
+      out << separator << candidate;
+      separator = ", ";
+    }
+
+    out << ")";
+    return out.str();
+  }
+
+ private:
+  NodePtr eval_expr_;
+  std::unordered_set<Type> values_;
+};
+
+/// \brief IN expression over decimal values. In addition to the evaluated
+/// expression and the set of constants, it records a precision and a scale.
+template <>
+class InExpressionNode<gandiva::DecimalScalar128> : public Node {
+ public:
+  /// \param[in] eval_expr expression whose value is tested for membership.
+  /// \param[in] values the constants to test against.
+  /// \param[in] precision precision recorded for this node.
+  /// \param[in] scale scale recorded for this node.
+  ///
+  /// 'values' is taken by value and moved into the node: an lvalue argument is
+  /// copied (the caller's set is left untouched), while a temporary or an
+  /// explicitly std::move'd set is moved without a copy.
+  InExpressionNode(NodePtr eval_expr,
+                   std::unordered_set<gandiva::DecimalScalar128> values,
+                   int32_t precision, int32_t scale)
+      : Node(arrow::boolean()),
+        eval_expr_(std::move(eval_expr)),
+        values_(std::move(values)),
+        precision_(precision),
+        scale_(scale) {}
+
+  int32_t get_precision() const { return precision_; }
+
+  int32_t get_scale() const { return scale_; }
+
+  const NodePtr& eval_expr() const { return eval_expr_; }
+
+  const std::unordered_set<gandiva::DecimalScalar128>& values() const { return values_; }
+
+  Status Accept(NodeVisitor& visitor) const override { return visitor.Visit(*this); }
+
+  /// Renders as "<expr> IN (<v1>, <v2>, ...)"; the values appear in the set's
+  /// iteration order.
+  std::string ToString() const override {
+    std::stringstream ss;
+    ss << eval_expr_->ToString() << " IN (";
+    bool add_comma = false;
+    for (auto& value : values_) {
+      if (add_comma) {
+        ss << ", ";
+      }
+      ss << value;
+      add_comma = true;
+    }
+    ss << ")";
+    return ss.str();
+  }
+
+ private:
+  NodePtr eval_expr_;
+  std::unordered_set<gandiva::DecimalScalar128> values_;
+  int32_t precision_, scale_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/node_visitor.h b/src/arrow/cpp/src/gandiva/node_visitor.h
new file mode 100644
index 000000000..8f233f5b7
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/node_visitor.h
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cmath>
+#include <string>
+
+#include "arrow/status.h"
+
+#include "arrow/util/logging.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+// Forward declarations of the node classes accepted by the visitor.
+class FieldNode;
+class FunctionNode;
+class IfNode;
+class LiteralNode;
+class BooleanNode;
+template <typename Type>
+class InExpressionNode;
+
+/// \brief Visitor for nodes in the expression tree.
+///
+/// Pure interface: there is one Visit overload per concrete node class, and one
+/// per supported value type of InExpressionNode. Every overload returns a Status.
+class GANDIVA_EXPORT NodeVisitor {
+ public:
+ virtual ~NodeVisitor() = default;
+
+ virtual Status Visit(const FieldNode& node) = 0;
+ virtual Status Visit(const FunctionNode& node) = 0;
+ virtual Status Visit(const IfNode& node) = 0;
+ virtual Status Visit(const LiteralNode& node) = 0;
+ virtual Status Visit(const BooleanNode& node) = 0;
+ // IN expressions: one overload per value type of the constant set.
+ virtual Status Visit(const InExpressionNode<int32_t>& node) = 0;
+ virtual Status Visit(const InExpressionNode<int64_t>& node) = 0;
+ virtual Status Visit(const InExpressionNode<float>& node) = 0;
+ virtual Status Visit(const InExpressionNode<double>& node) = 0;
+ virtual Status Visit(const InExpressionNode<gandiva::DecimalScalar128>& node) = 0;
+ virtual Status Visit(const InExpressionNode<std::string>& node) = 0;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/pch.h b/src/arrow/cpp/src/gandiva/pch.h
new file mode 100644
index 000000000..f3d9b2fad
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/pch.h
@@ -0,0 +1,24 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Often-used headers, for precompiling.
+// If updating this header, please make sure you check compilation speed
+// before checking in. Adding headers which are not used extremely often
+// may incur a slowdown, since it makes the precompiled header heavier to load.
+
+#include "arrow/pch.h"
+#include "gandiva/llvm_types.h"
diff --git a/src/arrow/cpp/src/gandiva/precompiled/CMakeLists.txt b/src/arrow/cpp/src/gandiva/precompiled/CMakeLists.txt
new file mode 100644
index 000000000..650b80f6b
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/CMakeLists.txt
@@ -0,0 +1,142 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+project(gandiva)
+
+# Sources that are compiled to LLVM bitcode by the foreach() loop in this file.
+# basic_decimal.cc is taken from the arrow/util directory via a relative path.
+set(PRECOMPILED_SRCS
+ arithmetic_ops.cc
+ bitmap.cc
+ decimal_ops.cc
+ decimal_wrapper.cc
+ extended_math_ops.cc
+ hash.cc
+ print.cc
+ string_ops.cc
+ time.cc
+ timestamp_arithmetic.cc
+ ../../arrow/util/basic_decimal.cc)
+
+# Select the language-standard / compatibility flags handed to clang when it compiles
+# the precompiled sources to bitcode. Any MSVC_VERSION other than 191x aborts
+# configuration with a fatal error.
+if(MSVC)
+ # clang pretends to be a particular version of MSVC. 191[0-9] is
+ # Visual Studio 2017, and the standard library uses C++14 features,
+ # so we have to use that -std version to get the IR compilation to work
+ if(MSVC_VERSION MATCHES "^191[0-9]$")
+ set(FMS_COMPATIBILITY 19.10)
+ else()
+ message(FATAL_ERROR "Unsupported MSVC_VERSION=${MSVC_VERSION}")
+ endif()
+ set(PLATFORM_CLANG_OPTIONS -std=c++14 -fms-compatibility
+ -fms-compatibility-version=${FMS_COMPATIBILITY})
+else()
+ set(PLATFORM_CLANG_OPTIONS -std=c++11)
+endif()
+
+# Create bitcode for each of the source files.
+# Every source is compiled by clang into ${CMAKE_CURRENT_BINARY_DIR}/<basename>.bc and
+# the resulting path is appended to BC_FILES.
+foreach(SRC_FILE ${PRECOMPILED_SRCS})
+ get_filename_component(SRC_BASE ${SRC_FILE} NAME_WE)
+ get_filename_component(ABSOLUTE_SRC ${SRC_FILE} ABSOLUTE)
+ set(BC_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SRC_BASE}.bc)
+ # Start from an empty command line for this source file.
+ set(PRECOMPILE_COMMAND)
+ # When an OSX sysroot is configured, run clang through `cmake -E env` so that
+ # SDKROOT is set in its environment.
+ if(CMAKE_OSX_SYSROOT)
+ list(APPEND
+ PRECOMPILE_COMMAND
+ ${CMAKE_COMMAND}
+ -E
+ env
+ SDKROOT=${CMAKE_OSX_SYSROOT})
+ endif()
+ list(APPEND
+ PRECOMPILE_COMMAND
+ ${CLANG_EXECUTABLE}
+ ${PLATFORM_CLANG_OPTIONS}
+ -DGANDIVA_IR
+ -DNDEBUG # DCHECK macros not implemented in precompiled code
+ -DARROW_STATIC # Do not set __declspec(dllimport) on MSVC on Arrow symbols
+ -DGANDIVA_STATIC # Do not set __declspec(dllimport) on MSVC on Gandiva symbols
+ -fno-use-cxa-atexit # Workaround for unresolved __dso_handle
+ -emit-llvm
+ -O3
+ -c
+ ${ABSOLUTE_SRC}
+ -o
+ ${BC_FILE}
+ ${ARROW_GANDIVA_PC_CXX_FLAGS}
+ -I${CMAKE_SOURCE_DIR}/src
+ -I${ARROW_BINARY_DIR}/src)
+
+ # Without native int128 support the Boost include directory is added to the
+ # include path.
+ if(NOT ARROW_USE_NATIVE_INT128)
+ list(APPEND PRECOMPILE_COMMAND -I${Boost_INCLUDE_DIR})
+ endif()
+ # NOTE(review): only the .cc file is listed in DEPENDS; edits to headers it includes
+ # presumably do not retrigger this command - confirm.
+ add_custom_command(OUTPUT ${BC_FILE}
+ COMMAND ${PRECOMPILE_COMMAND}
+ DEPENDS ${SRC_FILE})
+ list(APPEND BC_FILES ${BC_FILE})
+endforeach()
+
+# link all of the bitcode files into a single bitcode file.
+# The combined output is written to ${GANDIVA_PRECOMPILED_BC_PATH} by llvm-link.
+add_custom_command(OUTPUT ${GANDIVA_PRECOMPILED_BC_PATH}
+ COMMAND ${LLVM_LINK_EXECUTABLE} -o ${GANDIVA_PRECOMPILED_BC_PATH}
+ ${BC_FILES}
+ DEPENDS ${BC_FILES})
+
+# turn the bitcode file into a C++ static data variable.
+# make_precompiled_bitcode.py receives the template (.cc.in) path, the linked bitcode
+# path and the output .cc path, in that order.
+add_custom_command(OUTPUT ${GANDIVA_PRECOMPILED_CC_PATH}
+ COMMAND ${PYTHON_EXECUTABLE}
+ "${CMAKE_CURRENT_SOURCE_DIR}/../make_precompiled_bitcode.py"
+ ${GANDIVA_PRECOMPILED_CC_IN_PATH}
+ ${GANDIVA_PRECOMPILED_BC_PATH} ${GANDIVA_PRECOMPILED_CC_PATH}
+ DEPENDS ${GANDIVA_PRECOMPILED_CC_IN_PATH}
+ ${GANDIVA_PRECOMPILED_BC_PATH})
+
+# Target built as part of ALL that drives generation of both the linked bitcode file
+# and the generated C++ source.
+add_custom_target(precompiled ALL DEPENDS ${GANDIVA_PRECOMPILED_BC_PATH}
+ ${GANDIVA_PRECOMPILED_CC_PATH})
+
+# testing
+# The precompiled sources are compiled a second time as ordinary C++ (together with
+# their *_test.cc files and the helpers they need) into one test executable.
+if(ARROW_BUILD_TESTS)
+ add_executable(gandiva-precompiled-test
+ ../context_helper.cc
+ bitmap_test.cc
+ bitmap.cc
+ epoch_time_point_test.cc
+ time_test.cc
+ time.cc
+ timestamp_arithmetic.cc
+ ../cast_time.cc
+ ../../arrow/vendored/datetime/tz.cpp
+ hash_test.cc
+ hash.cc
+ string_ops_test.cc
+ string_ops.cc
+ arithmetic_ops_test.cc
+ arithmetic_ops.cc
+ extended_math_ops_test.cc
+ extended_math_ops.cc
+ decimal_ops_test.cc
+ decimal_ops.cc
+ ../decimal_type_util.cc
+ ../decimal_xlarge.cc)
+ target_include_directories(gandiva-precompiled-test PRIVATE ${CMAKE_SOURCE_DIR}/src)
+ target_link_libraries(gandiva-precompiled-test PRIVATE ${ARROW_TEST_LINK_LIBS})
+ # GANDIVA_UNIT_TEST=1 marks this as the unit-test build; the *_STATIC definitions
+ # match the ones used for the bitcode build in this file.
+ target_compile_definitions(gandiva-precompiled-test PRIVATE GANDIVA_UNIT_TEST=1
+ ARROW_STATIC GANDIVA_STATIC)
+ set(TEST_PATH "${EXECUTABLE_OUTPUT_PATH}/gandiva-precompiled-test")
+ add_test(gandiva-precompiled-test ${TEST_PATH})
+ set_property(TEST gandiva-precompiled-test
+ APPEND
+ PROPERTY LABELS "unittest;gandiva-tests")
+ # Building the gandiva-tests target also builds this executable.
+ add_dependencies(gandiva-tests gandiva-precompiled-test)
+endif()
diff --git a/src/arrow/cpp/src/gandiva/precompiled/arithmetic_ops.cc b/src/arrow/cpp/src/gandiva/precompiled/arithmetic_ops.cc
new file mode 100644
index 000000000..c736c38d3
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/arithmetic_ops.cc
@@ -0,0 +1,274 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern "C" {
+
+#include <math.h>
+#include "./types.h"
+
+// Expand inner macro for all numeric types.
+// INNER is invoked once per type as INNER(NAME, <type>, OP); the INNER macros in this
+// file paste <type> onto the gdv_ prefix and into the generated function name.
+#define NUMERIC_TYPES(INNER, NAME, OP) \
+ INNER(NAME, int8, OP) \
+ INNER(NAME, int16, OP) \
+ INNER(NAME, int32, OP) \
+ INNER(NAME, int64, OP) \
+ INNER(NAME, uint8, OP) \
+ INNER(NAME, uint16, OP) \
+ INNER(NAME, uint32, OP) \
+ INNER(NAME, uint64, OP) \
+ INNER(NAME, float32, OP) \
+ INNER(NAME, float64, OP)
+
+// Expand inner macros for all date/time types.
+#define DATE_TYPES(INNER, NAME, OP) \
+ INNER(NAME, date64, OP) \
+ INNER(NAME, date32, OP) \
+ INNER(NAME, timestamp, OP) \
+ INNER(NAME, time32, OP)
+
+// Numeric types followed by the date/time types.
+#define NUMERIC_DATE_TYPES(INNER, NAME, OP) \
+ NUMERIC_TYPES(INNER, NAME, OP) \
+ DATE_TYPES(INNER, NAME, OP)
+
+// Numeric types, date/time types and boolean.
+#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME, OP) \
+ NUMERIC_TYPES(INNER, NAME, OP) \
+ DATE_TYPES(INNER, NAME, OP) \
+ INNER(NAME, boolean, OP)
+
+// Integer modulo: generates NAME_<IN_TYPE1>_<IN_TYPE2>(left, right) -> gdv_<OUT_TYPE>.
+// - A zero divisor returns `left` narrowed to the output type instead of dividing.
+// - A divisor of -1 always has remainder 0 and is answered directly: evaluating
+// `left % -1` overflows (undefined behaviour, SIGFPE on x86) when `left` is the
+// minimum value of its type.
+// Intended for signed operand types only (the `right == -1` test relies on that).
+#define MOD_OP(NAME, IN_TYPE1, IN_TYPE2, OUT_TYPE) \
+ FORCE_INLINE \
+ gdv_##OUT_TYPE NAME##_##IN_TYPE1##_##IN_TYPE2(gdv_##IN_TYPE1 left, \
+ gdv_##IN_TYPE2 right) { \
+ if (right == 0) { \
+ return static_cast<gdv_##OUT_TYPE>(left); \
+ } \
+ if (right == -1) { \
+ return static_cast<gdv_##OUT_TYPE>(0); \
+ } \
+ return static_cast<gdv_##OUT_TYPE>(left % right); \
+ }
+
+// Symmetric binary fns : left, right params and return type are same.
+// Generates NAME_<TYPE>_<TYPE>(left, right) returning `left OP right` cast back to
+// gdv_<TYPE>.
+// NOTE(review): no overflow check is performed; assuming gdv_int32/gdv_int64 are the
+// signed fixed-width types, add/subtract/multiply overflow on them would be undefined
+// behaviour - confirm.
+#define BINARY_SYMMETRIC(NAME, TYPE, OP) \
+ FORCE_INLINE \
+ gdv_##TYPE NAME##_##TYPE##_##TYPE(gdv_##TYPE left, gdv_##TYPE right) { \
+ return static_cast<gdv_##TYPE>(left OP right); \
+ }
+
+// add / subtract / multiply exist for every numeric type; the bitwise operators are
+// generated for int32 and int64 only.
+NUMERIC_TYPES(BINARY_SYMMETRIC, add, +)
+NUMERIC_TYPES(BINARY_SYMMETRIC, subtract, -)
+NUMERIC_TYPES(BINARY_SYMMETRIC, multiply, *)
+BINARY_SYMMETRIC(bitwise_and, int32, &)
+BINARY_SYMMETRIC(bitwise_and, int64, &)
+BINARY_SYMMETRIC(bitwise_or, int32, |)
+BINARY_SYMMETRIC(bitwise_or, int64, |)
+BINARY_SYMMETRIC(bitwise_xor, int32, ^)
+BINARY_SYMMETRIC(bitwise_xor, int64, ^)
+
+#undef BINARY_SYMMETRIC
+
+// mod_int64_int32 : (int64, int32) -> int32
+// mod_int64_int64 : (int64, int64) -> int64
+MOD_OP(mod, int64, int32, int32)
+MOD_OP(mod, int64, int64, int64)
+
+#undef MOD_OP
+
+// Floating-point remainder of x / y, computed with fmod().
+// A zero divisor is reported through the execution context ("divide by zero error")
+// and the function then returns 0.0.
+gdv_float64 mod_float64_float64(int64_t context, gdv_float64 x, gdv_float64 y) {
+ if (y != 0.0) {
+ return fmod(x, y);
+ }
+ gdv_fn_context_set_error_msg(context, "divide by zero error");
+ return 0.0;
+}
+
+// Relational binary fns : left, right params are same, return is bool.
+// Generates NAME_<TYPE>_<TYPE>(left, right) returning `left OP right`.
+#define BINARY_RELATIONAL(NAME, TYPE, OP) \
+ FORCE_INLINE \
+ bool NAME##_##TYPE##_##TYPE(gdv_##TYPE left, gdv_##TYPE right) { return left OP right; }
+
+// equal / not_equal also cover boolean; the ordering comparisons are generated for
+// the numeric and date/time types only.
+NUMERIC_BOOL_DATE_TYPES(BINARY_RELATIONAL, equal, ==)
+NUMERIC_BOOL_DATE_TYPES(BINARY_RELATIONAL, not_equal, !=)
+NUMERIC_DATE_TYPES(BINARY_RELATIONAL, less_than, <)
+NUMERIC_DATE_TYPES(BINARY_RELATIONAL, less_than_or_equal_to, <=)
+NUMERIC_DATE_TYPES(BINARY_RELATIONAL, greater_than, >)
+NUMERIC_DATE_TYPES(BINARY_RELATIONAL, greater_than_or_equal_to, >=)
+
+#undef BINARY_RELATIONAL
+
+// cast fns : takes one param type, returns another type.
+// Generates NAME_<IN_TYPE>(in) returning static_cast<gdv_<OUT_TYPE>>(in); no range
+// check is performed before the conversion.
+#define CAST_UNARY(NAME, IN_TYPE, OUT_TYPE) \
+ FORCE_INLINE \
+ gdv_##OUT_TYPE NAME##_##IN_TYPE(gdv_##IN_TYPE in) { \
+ return static_cast<gdv_##OUT_TYPE>(in); \
+ }
+
+// Integer <-> integer, integer -> float and float <-> float conversions.
+CAST_UNARY(castBIGINT, int32, int64)
+CAST_UNARY(castINT, int64, int32)
+CAST_UNARY(castFLOAT4, int32, float32)
+CAST_UNARY(castFLOAT4, int64, float32)
+CAST_UNARY(castFLOAT8, int32, float64)
+CAST_UNARY(castFLOAT8, int64, float64)
+CAST_UNARY(castFLOAT8, float32, float64)
+CAST_UNARY(castFLOAT4, float64, float32)
+
+#undef CAST_UNARY
+
+// cast float types to int types.
+// The input is passed through round() before being converted with static_cast, so the
+// result is the rounded value rather than the truncated one.
+// NOTE(review): the rounded value is not range-checked before the static_cast; inputs
+// outside the target integer range (or NaN) presumably give an undefined result -
+// confirm whether callers guard against this.
+#define CAST_INT_FLOAT(NAME, IN_TYPE, OUT_TYPE) \
+ FORCE_INLINE \
+ gdv_##OUT_TYPE NAME##_##IN_TYPE(gdv_##IN_TYPE in) { \
+ gdv_##OUT_TYPE out = static_cast<gdv_##OUT_TYPE>(round(in)); \
+ return out; \
+ }
+
+CAST_INT_FLOAT(castBIGINT, float32, int64)
+CAST_INT_FLOAT(castBIGINT, float64, int64)
+CAST_INT_FLOAT(castINT, float32, int32)
+CAST_INT_FLOAT(castINT, float64, int32)
+
+#undef CAST_INT_FLOAT
+
+// simple nullable functions, result value = fn(input validity)
+// OP is applied as a unary prefix operator to is_valid: `!` negates it (isnull), unary
+// `+` leaves it unchanged (isnotnull / isnumeric). The value argument `in` is not used.
+#define VALIDITY_OP(NAME, TYPE, OP) \
+ FORCE_INLINE \
+ bool NAME##_##TYPE(gdv_##TYPE in, gdv_boolean is_valid) { return OP is_valid; }
+
+NUMERIC_BOOL_DATE_TYPES(VALIDITY_OP, isnull, !)
+NUMERIC_BOOL_DATE_TYPES(VALIDITY_OP, isnotnull, +)
+NUMERIC_TYPES(VALIDITY_OP, isnumeric, +)
+
+#undef VALIDITY_OP
+
+// Single-argument variants of the type lists: INNER is invoked once per type as
+// INNER(<type>).
+#define NUMERIC_FUNCTION(INNER) \
+ INNER(int8) \
+ INNER(int16) \
+ INNER(int32) \
+ INNER(int64) \
+ INNER(uint8) \
+ INNER(uint16) \
+ INNER(uint32) \
+ INNER(uint64) \
+ INNER(float32) \
+ INNER(float64)
+
+// Date/time types.
+#define DATE_FUNCTION(INNER) \
+ INNER(date32) \
+ INNER(date64) \
+ INNER(timestamp) \
+ INNER(time32)
+
+// Numeric types, date/time types and boolean.
+#define NUMERIC_BOOL_DATE_FUNCTION(INNER) \
+ NUMERIC_FUNCTION(INNER) \
+ DATE_FUNCTION(INNER) \
+ INNER(boolean)
+
+// Logical negation of a boolean value.
+FORCE_INLINE
+gdv_boolean not_boolean(gdv_boolean in) {
+ return in == false;
+}
+
+// is_distinct_from
+// Null-aware inequality: returns true when exactly one side is valid, false when both
+// sides are null, and otherwise in1 != in2.
+#define IS_DISTINCT_FROM(TYPE) \
+ FORCE_INLINE \
+ bool is_distinct_from_##TYPE##_##TYPE(gdv_##TYPE in1, gdv_boolean is_valid1, \
+ gdv_##TYPE in2, gdv_boolean is_valid2) { \
+ if (is_valid1 != is_valid2) { \
+ return true; \
+ } \
+ if (!is_valid1) { \
+ return false; \
+ } \
+ return in1 != in2; \
+ }
+
+// is_not_distinct_from
+// Null-aware equality: returns false when exactly one side is valid, true when both
+// sides are null, and otherwise in1 == in2.
+#define IS_NOT_DISTINCT_FROM(TYPE) \
+ FORCE_INLINE \
+ bool is_not_distinct_from_##TYPE##_##TYPE(gdv_##TYPE in1, gdv_boolean is_valid1, \
+ gdv_##TYPE in2, gdv_boolean is_valid2) { \
+ if (is_valid1 != is_valid2) { \
+ return false; \
+ } \
+ if (!is_valid1) { \
+ return true; \
+ } \
+ return in1 == in2; \
+ }
+
+// Both functions are generated for every numeric, date/time and boolean type.
+NUMERIC_BOOL_DATE_FUNCTION(IS_DISTINCT_FROM)
+NUMERIC_BOOL_DATE_FUNCTION(IS_NOT_DISTINCT_FROM)
+
+#undef IS_DISTINCT_FROM
+#undef IS_NOT_DISTINCT_FROM
+
+// divide_<TYPE>_<TYPE>(context, in1, in2) for every numeric type.
+// A zero divisor sets "divide by zero error" on the execution context and returns 0;
+// otherwise the result is in1 / in2 cast back to gdv_<TYPE>.
+// NOTE(review): for the signed 32/64-bit instantiations, in1 == minimum value with
+// in2 == -1 overflows in the division - presumably undefined behaviour; confirm
+// whether this input can reach the function.
+#define DIVIDE(TYPE) \
+ FORCE_INLINE \
+ gdv_##TYPE divide_##TYPE##_##TYPE(gdv_int64 context, gdv_##TYPE in1, gdv_##TYPE in2) { \
+ if (in2 == 0) { \
+ char const* err_msg = "divide by zero error"; \
+ gdv_fn_context_set_error_msg(context, err_msg); \
+ return 0; \
+ } \
+ return static_cast<gdv_##TYPE>(in1 / in2); \
+ }
+
+NUMERIC_FUNCTION(DIVIDE)
+
+#undef DIVIDE
+
+// div_<TYPE>_<TYPE>(context, in1, in2) for int32 and int64: integer division.
+// A zero divisor sets "divide by zero error" on the execution context and returns 0.
+// NOTE(review): in1 == minimum value with in2 == -1 overflows in the division -
+// presumably undefined behaviour; confirm whether this input can reach the function.
+#define DIV(TYPE) \
+ FORCE_INLINE \
+ gdv_##TYPE div_##TYPE##_##TYPE(gdv_int64 context, gdv_##TYPE in1, gdv_##TYPE in2) { \
+ if (in2 == 0) { \
+ char const* err_msg = "divide by zero error"; \
+ gdv_fn_context_set_error_msg(context, err_msg); \
+ return 0; \
+ } \
+ return static_cast<gdv_##TYPE>(in1 / in2); \
+ }
+
+DIV(int32)
+DIV(int64)
+
+#undef DIV
+
+// div_<TYPE>_<TYPE>(context, in1, in2) for float32 and float64: the quotient in1 / in2
+// passed through trunc(), i.e. with its fractional part dropped.
+// A zero divisor sets "divide by zero error" on the execution context and returns 0.
+#define DIV_FLOAT(TYPE) \
+ FORCE_INLINE \
+ gdv_##TYPE div_##TYPE##_##TYPE(gdv_int64 context, gdv_##TYPE in1, gdv_##TYPE in2) { \
+ if (in2 == 0) { \
+ char const* err_msg = "divide by zero error"; \
+ gdv_fn_context_set_error_msg(context, err_msg); \
+ return 0; \
+ } \
+ return static_cast<gdv_##TYPE>(::trunc(in1 / in2)); \
+ }
+
+DIV_FLOAT(float32)
+DIV_FLOAT(float64)
+
+#undef DIV_FLOAT
+
+// bitwise_not_<TYPE>(in): bitwise complement of the input, cast back to gdv_<TYPE>.
+// Generated for int32 and int64.
+#define BITWISE_NOT(TYPE) \
+ FORCE_INLINE \
+ gdv_##TYPE bitwise_not_##TYPE(gdv_##TYPE in) { return static_cast<gdv_##TYPE>(~in); }
+
+BITWISE_NOT(int32)
+BITWISE_NOT(int64)
+
+#undef BITWISE_NOT
+
+// Drop every type-list helper macro defined in this file, including
+// NUMERIC_BOOL_DATE_FUNCTION, so none of them remains defined past this point.
+#undef DATE_FUNCTION
+#undef DATE_TYPES
+#undef NUMERIC_BOOL_DATE_FUNCTION
+#undef NUMERIC_BOOL_DATE_TYPES
+#undef NUMERIC_DATE_TYPES
+#undef NUMERIC_FUNCTION
+#undef NUMERIC_TYPES
+
+} // extern "C"
diff --git a/src/arrow/cpp/src/gandiva/precompiled/arithmetic_ops_test.cc b/src/arrow/cpp/src/gandiva/precompiled/arithmetic_ops_test.cc
new file mode 100644
index 000000000..36b50bcfd
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/arithmetic_ops_test.cc
@@ -0,0 +1,180 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "../execution_context.h"
+#include "gandiva/precompiled/types.h"
+
+namespace gandiva {
+
+// Null-aware comparisons: the values are equal (1000) in every call, so the outcome is
+// decided by the validity flags alone.
+TEST(TestArithmeticOps, TestIsDistinctFrom) {
+ // Exactly one side valid -> distinct.
+ EXPECT_TRUE(is_distinct_from_timestamp_timestamp(1000, true, 1000, false));
+ EXPECT_TRUE(is_distinct_from_timestamp_timestamp(1000, false, 1000, true));
+ // Both null, or both valid and equal -> not distinct.
+ EXPECT_FALSE(is_distinct_from_timestamp_timestamp(1000, false, 1000, false));
+ EXPECT_FALSE(is_distinct_from_timestamp_timestamp(1000, true, 1000, true));
+
+ // is_not_distinct_from gives the opposite answer for the same inputs.
+ EXPECT_FALSE(is_not_distinct_from_int32_int32(1000, true, 1000, false));
+ EXPECT_FALSE(is_not_distinct_from_int32_int32(1000, false, 1000, true));
+ EXPECT_TRUE(is_not_distinct_from_int32_int32(1000, false, 1000, false));
+ EXPECT_TRUE(is_not_distinct_from_int32_int32(1000, true, 1000, true));
+}
+
+// Integer mod with a zero divisor returns the dividend; float mod with a zero divisor
+// reports an error through the context and returns 0.0.
+TEST(TestArithmeticOps, TestMod) {
+ gandiva::ExecutionContext context;
+ EXPECT_EQ(mod_int64_int32(10, 0), 10);
+
+ const double acceptable_abs_error = 0.00000000001; // 1e-11
+
+ EXPECT_DOUBLE_EQ(mod_float64_float64(reinterpret_cast<gdv_int64>(&context), 2.5, 0.0),
+ 0.0);
+ EXPECT_TRUE(context.has_error());
+ EXPECT_EQ(context.get_error(), "divide by zero error");
+
+ // The error state is cleared between cases so each check sees only its own result.
+ context.Reset();
+ EXPECT_NEAR(mod_float64_float64(reinterpret_cast<gdv_int64>(&context), 2.5, 1.2), 0.1,
+ acceptable_abs_error);
+ EXPECT_FALSE(context.has_error());
+
+ context.Reset();
+ EXPECT_DOUBLE_EQ(mod_float64_float64(reinterpret_cast<gdv_int64>(&context), 2.5, 2.5),
+ 0.0);
+ EXPECT_FALSE(context.has_error());
+
+ context.Reset();
+ EXPECT_NEAR(mod_float64_float64(reinterpret_cast<gdv_int64>(&context), 9.2, 3.7), 1.8,
+ acceptable_abs_error);
+ EXPECT_FALSE(context.has_error());
+}
+
+TEST(TestArithmeticOps, TestDivide) {
+ gandiva::ExecutionContext context;
+ EXPECT_EQ(divide_int64_int64(reinterpret_cast<gdv_int64>(&context), 10, 0), 0);
+ EXPECT_EQ(context.has_error(), true);
+ EXPECT_EQ(context.get_error(), "divide by zero error");
+
+ context.Reset();
+ EXPECT_EQ(divide_int64_int64(reinterpret_cast<gdv_int64>(&context), 10, 2), 5);
+ EXPECT_EQ(context.has_error(), false);
+}
+
+// div(): integer division for int64 and truncated division for the float types, with
+// the same divide-by-zero reporting as divide().
+TEST(TestArithmeticOps, TestDiv) {
+ gandiva::ExecutionContext context;
+ const gdv_int64 context_ptr = reinterpret_cast<gdv_int64>(&context);
+
+ // int64: zero divisor.
+ EXPECT_EQ(div_int64_int64(context_ptr, 101, 0), 0);
+ EXPECT_TRUE(context.has_error());
+ EXPECT_EQ(context.get_error(), "divide by zero error");
+ context.Reset();
+
+ // int64: quotient smaller than one.
+ EXPECT_EQ(div_int64_int64(context_ptr, 101, 111), 0);
+ EXPECT_FALSE(context.has_error());
+ context.Reset();
+
+ // float64: fractional part of the quotient is dropped.
+ EXPECT_EQ(div_float64_float64(context_ptr, 1010.1010, 2.1), 481.0);
+ EXPECT_FALSE(context.has_error());
+ context.Reset();
+
+ // float64: zero divisor.
+ EXPECT_EQ(div_float64_float64(context_ptr, 1010.1010, 0.00000), 0.0);
+ EXPECT_TRUE(context.has_error());
+ EXPECT_EQ(context.get_error(), "divide by zero error");
+ context.Reset();
+
+ // float32: fractional part of the quotient is dropped.
+ EXPECT_EQ(div_float32_float32(context_ptr, 1010.1010f, 2.1f), 481.0f);
+ EXPECT_FALSE(context.has_error());
+ context.Reset();
+}
+
+// Checks the int32 and int64 bitwise AND / OR / XOR / NOT functions against
+// hand-computed hexadecimal results.
+// NOTE(review): several arguments and expected values are hex literals with the top
+// bit set (e.g. 0xFFFFFFCC for an int32 parameter), so they presumably rely on an
+// unsigned-to-signed conversion at the call and in the comparison - confirm this is
+// intended.
+TEST(TestArithmeticOps, TestBitwiseOps) {
+ // bitwise AND
+ EXPECT_EQ(bitwise_and_int32_int32(0x0147D, 0x17159), 0x01059);
+ EXPECT_EQ(bitwise_and_int32_int32(0xFFFFFFCC, 0x00000297), 0x00000284);
+ EXPECT_EQ(bitwise_and_int32_int32(0x000, 0x285), 0x000);
+ EXPECT_EQ(bitwise_and_int64_int64(0x563672F83, 0x0D9FCF85B), 0x041642803);
+ EXPECT_EQ(bitwise_and_int64_int64(0xFFFFFFFFFFDA8F6A, 0xFFFFFFFFFFFF791C),
+ 0xFFFFFFFFFFDA0908);
+ EXPECT_EQ(bitwise_and_int64_int64(0x6A5B1, 0x00000), 0x00000);
+
+ // bitwise OR
+ EXPECT_EQ(bitwise_or_int32_int32(0x0147D, 0x17159), 0x1757D);
+ EXPECT_EQ(bitwise_or_int32_int32(0xFFFFFFCC, 0x00000297), 0xFFFFFFDF);
+ EXPECT_EQ(bitwise_or_int32_int32(0x000, 0x285), 0x285);
+ EXPECT_EQ(bitwise_or_int64_int64(0x563672F83, 0x0D9FCF85B), 0x5FBFFFFDB);
+ EXPECT_EQ(bitwise_or_int64_int64(0xFFFFFFFFFFDA8F6A, 0xFFFFFFFFFFFF791C),
+ 0xFFFFFFFFFFFFFF7E);
+ EXPECT_EQ(bitwise_or_int64_int64(0x6A5B1, 0x00000), 0x6A5B1);
+
+ // bitwise XOR
+ EXPECT_EQ(bitwise_xor_int32_int32(0x0147D, 0x17159), 0x16524);
+ EXPECT_EQ(bitwise_xor_int32_int32(0xFFFFFFCC, 0x00000297), 0XFFFFFD5B);
+ EXPECT_EQ(bitwise_xor_int32_int32(0x000, 0x285), 0x285);
+ EXPECT_EQ(bitwise_xor_int64_int64(0x563672F83, 0x0D9FCF85B), 0x5BA9BD7D8);
+ EXPECT_EQ(bitwise_xor_int64_int64(0xFFFFFFFFFFDA8F6A, 0xFFFFFFFFFFFF791C), 0X25F676);
+ EXPECT_EQ(bitwise_xor_int64_int64(0x6A5B1, 0x00000), 0x6A5B1);
+ EXPECT_EQ(bitwise_xor_int64_int64(0x6A5B1, 0x6A5B1), 0x00000);
+
+ // bitwise NOT
+ EXPECT_EQ(bitwise_not_int32(0x00017159), 0xFFFE8EA6);
+ EXPECT_EQ(bitwise_not_int32(0xFFFFF226), 0x00000DD9);
+ EXPECT_EQ(bitwise_not_int64(0x000000008BCAE9B4), 0xFFFFFFFF7435164B);
+ EXPECT_EQ(bitwise_not_int64(0xFFFFFF966C8D7997), 0x0000006993728668);
+ EXPECT_EQ(bitwise_not_int64(0x0000000000000000), 0xFFFFFFFFFFFFFFFF);
+}
+
+// castINT from float32/float64 rounds to the nearest integer (6.6 -> 7, -6.6 -> -7).
+// The negative-zero cases use floating-point literals: the integer expression `-0` is
+// plain 0 and would never exercise a negative zero input.
+TEST(TestArithmeticOps, TestIntCastFloatDouble) {
+ // castINT from floats
+ EXPECT_EQ(castINT_float32(6.6f), 7);
+ EXPECT_EQ(castINT_float32(-6.6f), -7);
+ EXPECT_EQ(castINT_float32(-6.3f), -6);
+ EXPECT_EQ(castINT_float32(0.0f), 0);
+ EXPECT_EQ(castINT_float32(-0.0f), 0);
+
+ // castINT from doubles
+ EXPECT_EQ(castINT_float64(6.6), 7);
+ EXPECT_EQ(castINT_float64(-6.6), -7);
+ EXPECT_EQ(castINT_float64(-6.3), -6);
+ EXPECT_EQ(castINT_float64(0.0), 0);
+ EXPECT_EQ(castINT_float64(-0.0), 0);
+ EXPECT_EQ(castINT_float64(999999.99999999999999999999999), 1000000);
+ EXPECT_EQ(castINT_float64(-999999.99999999999999999999999), -1000000);
+ EXPECT_EQ(castINT_float64(INT32_MAX), 2147483647);
+ EXPECT_EQ(castINT_float64(-2147483647), -2147483647);
+}
+
+// castBIGINT from float32/float64 rounds to the nearest integer (6.6 -> 7, -6.6 -> -7).
+// The negative-zero cases use floating-point literals: the integer expression `-0` is
+// plain 0 and would never exercise a negative zero input.
+TEST(TestArithmeticOps, TestBigIntCastFloatDouble) {
+ // castBIGINT from floats
+ EXPECT_EQ(castBIGINT_float32(6.6f), 7);
+ EXPECT_EQ(castBIGINT_float32(-6.6f), -7);
+ EXPECT_EQ(castBIGINT_float32(-6.3f), -6);
+ EXPECT_EQ(castBIGINT_float32(0.0f), 0);
+ EXPECT_EQ(castBIGINT_float32(-0.0f), 0);
+
+ // castBIGINT from doubles
+ EXPECT_EQ(castBIGINT_float64(6.6), 7);
+ EXPECT_EQ(castBIGINT_float64(-6.6), -7);
+ EXPECT_EQ(castBIGINT_float64(-6.3), -6);
+ EXPECT_EQ(castBIGINT_float64(0.0), 0);
+ EXPECT_EQ(castBIGINT_float64(-0.0), 0);
+ EXPECT_EQ(castBIGINT_float64(999999.99999999999999999999999), 1000000);
+ EXPECT_EQ(castBIGINT_float64(-999999.99999999999999999999999), -1000000);
+ EXPECT_EQ(castBIGINT_float64(INT32_MAX), 2147483647);
+ EXPECT_EQ(castBIGINT_float64(-2147483647), -2147483647);
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/precompiled/bitmap.cc b/src/arrow/cpp/src/gandiva/precompiled/bitmap.cc
new file mode 100644
index 000000000..332f08dbe
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/bitmap.cc
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// BitMap functions
+
+#include "arrow/util/bit_util.h"
+
+extern "C" {
+
+#include "./types.h"
+
+// Size and position helpers for bitmaps. Each macro argument is parenthesised so that
+// an expression argument groups as intended (e.g. POS_TO_BYTE_INDEX(a + b) divides the
+// whole sum by 8 rather than only b).
+#define BITS_TO_BYTES(x) (((x) + 7) / 8)
+#define BITS_TO_WORDS(x) (((x) + 63) / 64)
+
+#define POS_TO_BYTE_INDEX(p) ((p) / 8)
+#define POS_TO_BIT_INDEX(p) ((p) % 8)
+
+// Returns the bit at `position` in `bmap`, as reported by arrow::BitUtil::GetBit.
+FORCE_INLINE
+bool bitMapGetBit(const uint8_t* bmap, int64_t position) {
+ const bool bit_value = arrow::BitUtil::GetBit(bmap, position);
+ return bit_value;
+}
+
+// Validity lookup: a null validity bitmap means every entry is valid, otherwise the
+// bit at `position` decides.
+FORCE_INLINE
+bool bitMapValidityGetBit(const uint8_t* bmap, int64_t position) {
+ return bmap == nullptr || bitMapGetBit(bmap, position);
+}
+
+// Writes `value` to the bit at `position` in `bmap` by forwarding to
+// arrow::BitUtil::SetBitTo.
+FORCE_INLINE
+void bitMapSetBit(uint8_t* bmap, int64_t position, bool value) {
+ arrow::BitUtil::SetBitTo(bmap, position, value);
+}
+
+// Clears the bit at `position` when `value` is false; a true `value` leaves the bitmap
+// untouched.
+FORCE_INLINE
+void bitMapClearBitIfFalse(uint8_t* bmap, int64_t position, bool value) {
+ if (value) {
+ return;
+ }
+ arrow::BitUtil::ClearBit(bmap, position);
+}
+
+} // extern "C"
diff --git a/src/arrow/cpp/src/gandiva/precompiled/bitmap_test.cc b/src/arrow/cpp/src/gandiva/precompiled/bitmap_test.cc
new file mode 100644
index 000000000..ac3084ade
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/bitmap_test.cc
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include "gandiva/precompiled/types.h"
+
+namespace gandiva {
+
+// Set and clear a single bit in a zeroed 16-byte bitmap and read it back each time.
+TEST(TestBitMap, TestSimple) {
+ constexpr int kNumBytes = 16;
+ uint8_t bit_map[kNumBytes] = {};
+
+ // a freshly zeroed map has the bit clear
+ EXPECT_FALSE(bitMapGetBit(bit_map, 100));
+
+ // set 100th bit and verify
+ bitMapSetBit(bit_map, 100, true);
+ EXPECT_TRUE(bitMapGetBit(bit_map, 100));
+
+ // clear 100th bit and verify
+ bitMapSetBit(bit_map, 100, false);
+ EXPECT_FALSE(bitMapGetBit(bit_map, 100));
+}
+
+// bitMapClearBitIfFalse only ever clears: value=true is a no-op, value=false clears the
+// bit, and neither call can set a bit that is already clear.
+TEST(TestBitMap, TestClearIfFalse) {
+ constexpr int kNumBytes = 32;
+ uint8_t bit_map[kNumBytes] = {};
+
+ bitMapSetBit(bit_map, 24, true);
+
+ // value=true leaves a set bit as it is.
+ bitMapClearBitIfFalse(bit_map, 24, true);
+ EXPECT_TRUE(bitMapGetBit(bit_map, 24));
+
+ // value=false clears it.
+ bitMapClearBitIfFalse(bit_map, 24, false);
+ EXPECT_FALSE(bitMapGetBit(bit_map, 24));
+
+ // value=true does not set an already-clear bit.
+ bitMapClearBitIfFalse(bit_map, 24, true);
+ EXPECT_FALSE(bitMapGetBit(bit_map, 24));
+
+ // clearing an already-clear bit keeps it clear.
+ bitMapClearBitIfFalse(bit_map, 24, false);
+ EXPECT_FALSE(bitMapGetBit(bit_map, 24));
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/precompiled/decimal_ops.cc b/src/arrow/cpp/src/gandiva/precompiled/decimal_ops.cc
new file mode 100644
index 000000000..61cac6062
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/decimal_ops.cc
@@ -0,0 +1,723 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Algorithms adapted from Apache Impala
+
+#include "gandiva/precompiled/decimal_ops.h"
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+
+#include "arrow/util/logging.h"
+#include "gandiva/decimal_type_util.h"
+#include "gandiva/decimal_xlarge.h"
+#include "gandiva/gdv_function_stubs.h"
+
+// Several operations (multiply, divide, mod, ..) require converting to 256-bit, and we
+// use the boost library for doing 256-bit operations. To avoid references to boost from
+// the precompiled-to-ir code (this causes issues with symbol resolution at runtime), we
+// use a wrapper exported from the CPP code. The wrapper functions are named gdv_xlarge_xx
+
+namespace gandiva {
+namespace decimalops {
+
+using arrow::BasicDecimal128;
+
+/// Apply IncreaseScaleBy(delta) to 'in' when delta is positive; otherwise return 'in'
+/// unchanged.
+static BasicDecimal128 CheckAndIncreaseScale(const BasicDecimal128& in, int32_t delta) {
+  if (delta > 0) {
+    return in.IncreaseScaleBy(delta);
+  }
+  return in;
+}
+
+/// Apply ReduceScaleBy(delta) to 'in' when delta is positive; otherwise return 'in'
+/// unchanged.
+static BasicDecimal128 CheckAndReduceScale(const BasicDecimal128& in, int32_t delta) {
+  if (delta > 0) {
+    return in.ReduceScaleBy(delta);
+  }
+  return in;
+}
+
+/// Bring x and y up to the larger of their two scales and return the plain sum.
+/// 'out_scale' is accepted for signature symmetry but is not consulted here: the
+/// returned value is expressed at max(x.scale(), y.scale()).
+static BasicDecimal128 AddFastPath(const BasicDecimalScalar128& x,
+                                   const BasicDecimalScalar128& y, int32_t out_scale) {
+  const auto common_scale = std::max(x.scale(), y.scale());
+
+  const auto lhs = CheckAndIncreaseScale(x.value(), common_scale - x.scale());
+  const auto rhs = CheckAndIncreaseScale(y.value(), common_scale - y.scale());
+  return lhs + rhs;
+}
+
+/// Add x and y and bring the sum down to 'out_scale'. The caller must already have
+/// established that the intermediate sum cannot overflow.
+static BasicDecimal128 AddNoOverflow(const BasicDecimalScalar128& x,
+                                     const BasicDecimalScalar128& y, int32_t out_scale) {
+  const auto unreduced_sum = AddFastPath(x, y, out_scale);
+  // AddFastPath leaves the sum at the larger input scale; trim any excess digits.
+  const auto sum_scale = std::max(x.scale(), y.scale());
+  return CheckAndReduceScale(unreduced_sum, sum_scale - out_scale);
+}
+
+/// Add two non-negative decimals by summing their whole and fractional parts
+/// separately and then recombining them at 'out_scale'.
+/// Both x_value and y_value must be >= 0
+static BasicDecimal128 AddLargePositive(const BasicDecimalScalar128& x,
+ const BasicDecimalScalar128& y,
+ int32_t out_scale) {
+ DCHECK_GE(x.value(), 0);
+ DCHECK_GE(y.value(), 0);
+
+ // separate out whole/fractions.
+ BasicDecimal128 x_left, x_right, y_left, y_right;
+ x.value().GetWholeAndFraction(x.scale(), &x_left, &x_right);
+ y.value().GetWholeAndFraction(y.scale(), &y_left, &y_right);
+
+ // Adjust fractional parts to higher scale.
+ auto higher_scale = std::max(x.scale(), y.scale());
+ auto x_right_scaled = CheckAndIncreaseScale(x_right, higher_scale - x.scale());
+ auto y_right_scaled = CheckAndIncreaseScale(y_right, higher_scale - y.scale());
+
+ BasicDecimal128 right;
+ BasicDecimal128 carry_to_left;
+ auto multiplier = BasicDecimal128::GetScaleMultiplier(higher_scale);
+ // The carry test is phrased as 'x >= multiplier - y' so that the comparison itself
+ // never forms x_right_scaled + y_right_scaled.
+ if (x_right_scaled >= multiplier - y_right_scaled) {
+ // fractional sum reaches 10^higher_scale: keep the excess and carry one unit.
+ right = x_right_scaled - (multiplier - y_right_scaled);
+ carry_to_left = 1;
+ } else {
+ right = x_right_scaled + y_right_scaled;
+ carry_to_left = 0;
+ }
+ // bring the fractional part down to the output scale (no-op if already there).
+ right = CheckAndReduceScale(right, higher_scale - out_scale);
+
+ // recombine: whole part shifted left by out_scale digits, plus the fraction.
+ auto left = x_left + y_left + carry_to_left;
+ return (left * BasicDecimal128::GetScaleMultiplier(out_scale)) + right;
+}
+
+/// Add two decimals of opposite sign by summing their whole and fractional parts
+/// separately, fixing up the signs, and recombining them at 'out_scale'.
+/// x_value and y_value cannot be 0, and one must be positive and the other negative.
+static BasicDecimal128 AddLargeNegative(const BasicDecimalScalar128& x,
+ const BasicDecimalScalar128& y,
+ int32_t out_scale) {
+ DCHECK_NE(x.value(), 0);
+ DCHECK_NE(y.value(), 0);
+ DCHECK((x.value() < 0 && y.value() > 0) || (x.value() > 0 && y.value() < 0));
+
+ // separate out whole/fractions.
+ BasicDecimal128 x_left, x_right, y_left, y_right;
+ x.value().GetWholeAndFraction(x.scale(), &x_left, &x_right);
+ y.value().GetWholeAndFraction(y.scale(), &y_left, &y_right);
+
+ // Adjust fractional parts to higher scale.
+ auto higher_scale = std::max(x.scale(), y.scale());
+ x_right = CheckAndIncreaseScale(x_right, higher_scale - x.scale());
+ y_right = CheckAndIncreaseScale(y_right, higher_scale - y.scale());
+
+ // Overflow not possible because one is +ve and the other is -ve.
+ auto left = x_left + y_left;
+ auto right = x_right + y_right;
+
+ // If the whole and fractional parts have different signs, then we need to make the
+ // fractional part have the same sign as the whole part. If either left or right is
+ // zero, then nothing needs to be done.
+ if (left < 0 && right > 0) {
+ // move one whole unit from 'left' into 'right' so that both are non-positive.
+ left += 1;
+ right -= BasicDecimal128::GetScaleMultiplier(higher_scale);
+ } else if (left > 0 && right < 0) {
+ // move one whole unit from 'left' into 'right' so that both are non-negative.
+ left -= 1;
+ right += BasicDecimal128::GetScaleMultiplier(higher_scale);
+ }
+ // bring the fractional part down to the output scale (no-op if already there).
+ right = CheckAndReduceScale(right, higher_scale - out_scale);
+ return (left * BasicDecimal128::GetScaleMultiplier(out_scale)) + right;
+}
+
+/// Slow-path addition: dispatch on the signs of the operands to the helper that adds
+/// the whole and fractional parts separately.
+static BasicDecimal128 AddLarge(const BasicDecimalScalar128& x,
+                                const BasicDecimalScalar128& y, int32_t out_scale) {
+  // both operands are positive or zero.
+  if (x.value() >= 0 && y.value() >= 0) {
+    return AddLargePositive(x, y, out_scale);
+  }
+
+  // both operands are negative or zero: add the magnitudes and negate the sum.
+  if (x.value() <= 0 && y.value() <= 0) {
+    BasicDecimalScalar128 x_magnitude(-x.value(), x.precision(), x.scale());
+    BasicDecimalScalar128 y_magnitude(-y.value(), y.precision(), y.scale());
+    return -AddLargePositive(x_magnitude, y_magnitude, out_scale);
+  }
+
+  // mixed signs.
+  return AddLargeNegative(x, y, out_scale);
+}
+
+// Suppose we have a number that requires x bits to be represented and we scale it up by
+// 10^scale_by. Let's say now y bits are required to represent it. This function returns
+// the maximum possible y - x for a given 'scale_by'.
+// 'scale_by' must lie in [0, 76]; it is used directly as an index into a 77-entry table.
+inline int32_t MaxBitsRequiredIncreaseAfterScaling(int32_t scale_by) {
+ // We rely on the following formula:
+ // bits_required(x * 10^y) <= bits_required(x) + floor(log2(10^y)) + 1
+ // We precompute floor(log2(10^x)) + 1 for x = 0, 1, 2...75, 76
+ // Note that the entry for x = 0 is stored as 0 (not 1), i.e. scaling by 10^0 is
+ // treated as adding no bits.
+ DCHECK_GE(scale_by, 0);
+ DCHECK_LE(scale_by, 76);
+ static const int32_t floor_log2_plus_one[] = {
+ 0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40, 44, 47, 50,
+ 54, 57, 60, 64, 67, 70, 74, 77, 80, 84, 87, 90, 94, 97, 100, 103,
+ 107, 110, 113, 117, 120, 123, 127, 130, 133, 137, 140, 143, 147, 150, 153, 157,
+ 160, 163, 167, 170, 173, 177, 180, 183, 187, 190, 193, 196, 200, 203, 206, 210,
+ 213, 216, 220, 223, 226, 230, 233, 236, 240, 243, 246, 250, 253};
+ return floor_log2_plus_one[scale_by];
+}
+
+// Given a value with 'num_lz' leading zero bits, return a lower bound on the number of
+// leading zero bits left after multiplying it by 10^scale_by. The bound may be negative
+// when the scaled value might not fit in the original width.
+inline int32_t MinLeadingZerosAfterScaling(int32_t num_lz, int32_t scale_by) {
+  DCHECK_GE(scale_by, 0);
+  DCHECK_LE(scale_by, 76);
+  const int32_t max_extra_bits = MaxBitsRequiredIncreaseAfterScaling(scale_by);
+  return num_lz - max_extra_bits;
+}
+
+// Upper bound on the number of bits needed to hold |num| * 10^scale_by.
+inline int32_t MaxBitsRequiredAfterScaling(const BasicDecimalScalar128& num,
+                                           int32_t scale_by) {
+  DCHECK_GE(scale_by, 0);
+  DCHECK_LE(scale_by, 76);
+
+  // bits currently occupied by the magnitude of the 128-bit value.
+  const auto magnitude = BasicDecimal128::Abs(num.value());
+  const int32_t bits_in_use = 128 - magnitude.CountLeadingBinaryZeros();
+  return bits_in_use + MaxBitsRequiredIncreaseAfterScaling(scale_by);
+}
+
+// Lower bound on the leading zero bits of x and y once the operand with the smaller
+// scale has been scaled up to match the other. The operand that already has the larger
+// scale contributes its actual leading-zero count.
+inline int32_t MinLeadingZeros(const BasicDecimalScalar128& x,
+                               const BasicDecimalScalar128& y) {
+  int32_t x_lz = BasicDecimal128::Abs(x.value()).CountLeadingBinaryZeros();
+  int32_t y_lz = BasicDecimal128::Abs(y.value()).CountLeadingBinaryZeros();
+
+  const int32_t scale_gap = y.scale() - x.scale();
+  if (scale_gap > 0) {
+    // x has the smaller scale and would be scaled up.
+    x_lz = MinLeadingZerosAfterScaling(x_lz, scale_gap);
+  } else if (scale_gap < 0) {
+    // y has the smaller scale and would be scaled up.
+    y_lz = MinLeadingZerosAfterScaling(y_lz, -scale_gap);
+  }
+  return std::min(x_lz, y_lz);
+}
+
+/// Return x + y. Picks the cheapest strategy that is safe for the given output
+/// precision and for the magnitudes of the operands.
+BasicDecimal128 Add(const BasicDecimalScalar128& x, const BasicDecimalScalar128& y,
+                    int32_t out_precision, int32_t out_scale) {
+  // Below the maximum precision the plain scaled add is used as-is.
+  if (out_precision < DecimalTypeUtil::kMaxPrecision) {
+    return AddFastPath(x, y, out_scale);
+  }
+
+  // We want the result to have at least 2 leading zeros, which ensures that it fits
+  // into the maximum decimal because 2^126 - 1 < 10^38 - 1. If both x and y have at
+  // least 3 leading zeros (after scale alignment), then we are guaranteed that the
+  // result will have at least 2 leading zeros, so they can be added directly without
+  // the risk of overflow.
+  const int32_t min_lz = MinLeadingZeros(x, y);
+  if (min_lz >= 3) {
+    return AddNoOverflow(x, y, out_scale);
+  }
+
+  // slower version: add whole/fraction parts separately, and then combine.
+  return AddLarge(x, y, out_scale);
+}
+
+/// Return x - y, implemented as x + (-y) with y's precision and scale kept as-is.
+BasicDecimal128 Subtract(const BasicDecimalScalar128& x, const BasicDecimalScalar128& y,
+                         int32_t out_precision, int32_t out_scale) {
+  const BasicDecimalScalar128 negated_y(-y.value(), y.precision(), y.scale());
+  return Add(x, negated_y, out_precision, out_scale);
+}
+
+// Multiply for out_precision == 38 when the scale is not trimmed, i.e. the product is
+// already at 'out_scale' and the intermediate value is the final value.
+// Sets *overflow and returns a default-constructed value when the product would exceed
+// BasicDecimal128::GetMaxValue(). Callers must ensure y is non-zero (it is a divisor in
+// the overflow check).
+static BasicDecimal128 MultiplyMaxPrecisionNoScaleDown(const BasicDecimalScalar128& x,
+                                                       const BasicDecimalScalar128& y,
+                                                       int32_t out_scale,
+                                                       bool* overflow) {
+  DCHECK_EQ(x.scale() + y.scale(), out_scale);
+
+  const auto x_magnitude = BasicDecimal128::Abs(x.value());
+  const auto y_magnitude = BasicDecimal128::Abs(y.value());
+
+  // |x| * |y| > max  <=>  |x| > max / |y|, checked without forming the product.
+  const bool product_too_large =
+      x_magnitude > BasicDecimal128::GetMaxValue() / y_magnitude;
+  *overflow = product_too_large;
+  if (product_too_large) {
+    return BasicDecimal128();
+  }
+
+  // We've verified that the result will fit into 128 bits.
+  return x.value() * y.value();
+}
+
+// Multiply when the out_precision is 38, and there is trimming of the scale i.e
+// the intermediate value could be larger than the final value.
+//
+// The product is computed at scale x.scale() + y.scale() and then reduced by
+// delta_scale digits to reach 'out_scale'. *overflow is reset to false on entry; on the
+// 256-bit path it is whatever gdv_xlarge_multiply_and_scale_down reports.
+static BasicDecimal128 MultiplyMaxPrecisionAndScaleDown(const BasicDecimalScalar128& x,
+ const BasicDecimalScalar128& y,
+ int32_t out_scale,
+ bool* overflow) {
+ auto delta_scale = x.scale() + y.scale() - out_scale;
+ DCHECK_GT(delta_scale, 0);
+
+ *overflow = false;
+ BasicDecimal128 result;
+ auto x_abs = BasicDecimal128::Abs(x.value());
+ auto y_abs = BasicDecimal128::Abs(y.value());
+
+ // It's possible that the intermediate value does not fit in 128-bits, but the
+ // final value will (after scaling down).
+ bool needs_int256 = false;
+ int32_t total_leading_zeros =
+ x_abs.CountLeadingBinaryZeros() + y_abs.CountLeadingBinaryZeros();
+ // This check is quick, but conservative. In some cases it will indicate that
+ // converting to 256 bits is necessary, when it's not actually the case.
+ needs_int256 = total_leading_zeros <= 128;
+ if (ARROW_PREDICT_FALSE(needs_int256)) {
+ int64_t result_high;
+ uint64_t result_low;
+
+ // This requires converting to 256-bit, and we use the boost library for that. To
+ // avoid references to boost from the precompiled-to-ir code (this causes issues
+ // with symbol resolution at runtime), we use a wrapper exported from the CPP code.
+ gdv_xlarge_multiply_and_scale_down(x.value().high_bits(), x.value().low_bits(),
+ y.value().high_bits(), y.value().low_bits(),
+ delta_scale, &result_high, &result_low, overflow);
+ result = BasicDecimal128(result_high, result_low);
+ } else {
+ if (ARROW_PREDICT_TRUE(delta_scale <= 38)) {
+ // The largest value that result can have here is (2^64 - 1) * (2^63 - 1), which is
+ // greater than BasicDecimal128::kMaxValue.
+ result = x.value() * y.value();
+ // Since delta_scale is greater than zero, result can now be at most
+ // ((2^64 - 1) * (2^63 - 1)) / 10, which is less than BasicDecimal128::kMaxValue, so
+ // there cannot be any overflow.
+ result = result.ReduceScaleBy(delta_scale);
+ } else {
+ // We are multiplying decimal(38, 38) by decimal(38, 38). The result should be a
+ // decimal(38, 37), so delta scale = 38 + 38 - 37 = 39. Since we are not in the
+ // 256 bit intermediate value case and we are scaling down by 39, then we are
+ // guaranteed that the result is 0 (even if we try to round). The largest possible
+ // intermediate result is 38 "9"s. If we scale down by 39, the leftmost 9 is now
+ // two digits to the right of the rightmost "visible" one. The reason why we have
+ // to handle this case separately is because a scale multiplier with a delta_scale
+ // 39 does not fit into 128 bit.
+ DCHECK_EQ(delta_scale, 39);
+ result = 0;
+ }
+ }
+ return result;
+}
+
+// Multiply when the out_precision is 38: choose the helper according to whether the
+// natural product scale (x.scale() + y.scale()) has to be trimmed to reach out_scale.
+static BasicDecimal128 MultiplyMaxPrecision(const BasicDecimalScalar128& x,
+                                            const BasicDecimalScalar128& y,
+                                            int32_t out_scale, bool* overflow) {
+  const auto excess_scale = x.scale() + y.scale() - out_scale;
+  DCHECK_GE(excess_scale, 0);
+  if (excess_scale != 0) {
+    return MultiplyMaxPrecisionAndScaleDown(x, y, out_scale, overflow);
+  }
+  return MultiplyMaxPrecisionNoScaleDown(x, y, out_scale, overflow);
+}
+
+/// Return x * y at 'out_scale'. *overflow is cleared on entry and is set only by the
+/// max-precision path when the product does not fit.
+BasicDecimal128 Multiply(const BasicDecimalScalar128& x, const BasicDecimalScalar128& y,
+                         int32_t out_precision, int32_t out_scale, bool* overflow) {
+  *overflow = false;
+  BasicDecimal128 product;
+
+  if (out_precision >= DecimalTypeUtil::kMaxPrecision) {
+    if (x.value() == 0 || y.value() == 0) {
+      // Handle this separately to avoid divide-by-zero errors.
+      product = BasicDecimal128(0, 0);
+    } else {
+      product = MultiplyMaxPrecision(x, y, out_scale, overflow);
+    }
+  } else {
+    // fast-path multiply
+    product = x.value() * y.value();
+    DCHECK_EQ(x.scale() + y.scale(), out_scale);
+    DCHECK_LE(BasicDecimal128::Abs(product), BasicDecimal128::GetMaxValue());
+  }
+
+  DCHECK(*overflow || BasicDecimal128::Abs(product) <= BasicDecimal128::GetMaxValue());
+  return product;
+}
+
+/// Return x / y at 'out_scale', rounding half away from zero on the fast path.
+///
+/// \param context    handle passed to gdv_fn_context_set_error_msg on divide-by-zero.
+/// \param overflow   written on every path except divide-by-zero, where it is left
+///                   untouched.
+/// 'out_precision' is not read by this function.
+///
+/// When y is zero, an error message is recorded on the context and 0 is returned.
+BasicDecimal128 Divide(int64_t context, const BasicDecimalScalar128& x,
+ const BasicDecimalScalar128& y, int32_t out_precision,
+ int32_t out_scale, bool* overflow) {
+ if (y.value() == 0) {
+ char const* err_msg = "divide by zero error";
+ gdv_fn_context_set_error_msg(context, err_msg);
+ return 0;
+ }
+
+ // scale up to the output scale, and do an integer division.
+ int32_t delta_scale = out_scale + y.scale() - x.scale();
+ DCHECK_GE(delta_scale, 0);
+
+ BasicDecimal128 result;
+ auto num_bits_required_after_scaling = MaxBitsRequiredAfterScaling(x, delta_scale);
+ if (num_bits_required_after_scaling <= 127) {
+ // fast-path. The dividend fits in 128-bit after scaling too.
+ *overflow = false;
+
+ // do the division.
+ auto x_scaled = CheckAndIncreaseScale(x.value(), delta_scale);
+ BasicDecimal128 remainder;
+ auto status = x_scaled.Divide(y.value(), &result, &remainder);
+ DCHECK_EQ(status, arrow::DecimalStatus::kSuccess);
+
+ // round-up
+ // NOTE(review): presumably Sign() yields +1 or -1, so the XOR is 0 for like signs
+ // and -2 for unlike signs, making the adjustment +1 or -1 respectively - confirm
+ // against BasicDecimal128::Sign.
+ if (BasicDecimal128::Abs(2 * remainder) >= BasicDecimal128::Abs(y.value())) {
+ result += (x.value().Sign() ^ y.value().Sign()) + 1;
+ }
+ } else {
+ // convert to 256-bit and do the divide.
+ // Flag overflow up front when even the bit-count estimate exceeds 255; otherwise
+ // the wrapper below decides.
+ *overflow = delta_scale > 38 && num_bits_required_after_scaling > 255;
+ if (!*overflow) {
+ int64_t result_high;
+ uint64_t result_low;
+
+ gdv_xlarge_scale_up_and_divide(x.value().high_bits(), x.value().low_bits(),
+ y.value().high_bits(), y.value().low_bits(),
+ delta_scale, &result_high, &result_low, overflow);
+ result = BasicDecimal128(result_high, result_low);
+ }
+ }
+ return result;
+}
+
+/// Return the remainder of x / y, computed at the higher of the two input scales.
+///
+/// \param context    handle passed to gdv_fn_context_set_error_msg on divide-by-zero.
+/// \param overflow   set to false on every path except divide-by-zero, where it is
+///                   left untouched.
+/// 'out_precision' and 'out_scale' are not read by this function.
+///
+/// When y is zero, an error message is recorded on the context and 0 is returned.
+BasicDecimal128 Mod(int64_t context, const BasicDecimalScalar128& x,
+ const BasicDecimalScalar128& y, int32_t out_precision,
+ int32_t out_scale, bool* overflow) {
+ if (y.value() == 0) {
+ char const* err_msg = "divide by zero error";
+ gdv_fn_context_set_error_msg(context, err_msg);
+ return 0;
+ }
+
+ // Adjust x and y to the same scale (higher one), and then, do an integer mod.
+ *overflow = false;
+ BasicDecimal128 result;
+ int32_t min_lz = MinLeadingZeros(x, y);
+ if (min_lz >= 2) {
+ // both operands still fit in 128 bits after scale alignment: plain integer mod.
+ auto higher_scale = std::max(x.scale(), y.scale());
+ auto x_scaled = CheckAndIncreaseScale(x.value(), higher_scale - x.scale());
+ auto y_scaled = CheckAndIncreaseScale(y.value(), higher_scale - y.scale());
+ result = x_scaled % y_scaled;
+ DCHECK_LE(BasicDecimal128::Abs(result), BasicDecimal128::GetMaxValue());
+ } else {
+ // scale alignment might not fit in 128 bits: delegate to the xlarge wrapper.
+ int64_t result_high;
+ uint64_t result_low;
+
+ gdv_xlarge_mod(x.value().high_bits(), x.value().low_bits(), x.scale(),
+ y.value().high_bits(), y.value().low_bits(), y.scale(), &result_high,
+ &result_low);
+ result = BasicDecimal128(result_high, result_low);
+ }
+ DCHECK(BasicDecimal128::Abs(result) <= BasicDecimal128::Abs(x.value()) ||
+ BasicDecimal128::Abs(result) <= BasicDecimal128::Abs(y.value()));
+ return result;
+}
+
+/// Three-way comparison of two values that are already at the same scale.
+/// Returns -1 if x < y, 0 if x == y and 1 if x > y.
+int32_t CompareSameScale(const BasicDecimal128& x, const BasicDecimal128& y) {
+  if (x == y) {
+    return 0;
+  }
+  return (x < y) ? -1 : 1;
+}
+
+/// Three-way comparison of two decimals that may have different scales.
+/// Returns -1 if x < y, 0 if x == y and 1 if x > y.
+int32_t Compare(const BasicDecimalScalar128& x, const BasicDecimalScalar128& y) {
+  const int32_t delta_scale = x.scale() - y.scale();
+
+  // fast-path : both are of the same scale.
+  if (delta_scale == 0) {
+    return CompareSameScale(x.value(), y.value());
+  }
+
+  // Check whether scaling the lower-scale operand up would push its precision past
+  // the maximum, i.e. whether more than 128 bits may be needed after the adjustment.
+  const bool x_too_wide =
+      delta_scale < 0 && x.precision() - delta_scale > DecimalTypeUtil::kMaxPrecision;
+  const bool y_too_wide = y.precision() + delta_scale > DecimalTypeUtil::kMaxPrecision;
+  if (x_too_wide || y_too_wide) {
+    return gdv_xlarge_compare(x.value().high_bits(), x.value().low_bits(), x.scale(),
+                              y.value().high_bits(), y.value().low_bits(), y.scale());
+  }
+
+  // Scale up whichever operand has the smaller scale, then compare directly.
+  if (delta_scale < 0) {
+    return CompareSameScale(x.value().IncreaseScaleBy(-delta_scale), y.value());
+  }
+  return CompareSameScale(x.value(), y.value().IncreaseScaleBy(delta_scale));
+}
+
+// Early-exit helper for functions that report overflow through a 'bool*' out-param.
+// If *overflow is already set, or 'condition' holds, it sets *overflow to true and
+// makes the ENCLOSING function return 0. Because it expands to a 'return', it can only
+// be used in functions whose return type is constructible from 0.
+#define DECIMAL_OVERFLOW_IF(condition, overflow) \
+ do { \
+ if (*overflow || (condition)) { \
+ *overflow = true; \
+ return 0; \
+ } \
+ } while (0)
+
+// Largest unscaled magnitude for the given precision: GetScaleMultiplier(precision) - 1.
+static BasicDecimal128 GetMaxValue(int32_t precision) {
+  const BasicDecimal128 one_past_max = BasicDecimal128::GetScaleMultiplier(precision);
+  return one_past_max - 1;
+}
+
+// Powers of ten as doubles, indexed by scale: entry i holds 10^i for i in
+// [0, kMaxPrecision]. Built once by an immediately-invoked lambda.
+static std::array<double, DecimalTypeUtil::kMaxPrecision + 1> kDoubleScaleMultipliers =
+    ([]() -> std::array<double, DecimalTypeUtil::kMaxPrecision + 1> {
+      std::array<double, DecimalTypeUtil::kMaxPrecision + 1> table;
+      double power_of_ten = 1.0;
+      table[0] = power_of_ten;
+      for (int32_t exponent = 1; exponent <= DecimalTypeUtil::kMaxPrecision; ++exponent) {
+        power_of_ten = power_of_ten * 10;
+        table[exponent] = power_of_ten;
+      }
+      return table;
+    })();
+
+/// Convert a double to a decimal with the given precision and scale.
+///
+/// The input is multiplied by 10^scale, rounded with std::round, and split into the
+/// high and low 64-bit words of a 128-bit integer.
+///
+/// \param in         value to convert.
+/// \param precision  target precision; the result magnitude must not exceed
+///                   GetMaxValue(precision).
+/// \param scale      target scale; used as an index into kDoubleScaleMultipliers.
+/// \param overflow   set to true (and 0 is returned) when the scaled value is NaN, does
+///                   not fit in a signed 128-bit integer, or exceeds the precision.
+BasicDecimal128 FromDouble(double in, int32_t precision, int32_t scale, bool* overflow) {
+  // Multiply decimal with the scale
+  auto unscaled = in * kDoubleScaleMultipliers[scale];
+  DECIMAL_OVERFLOW_IF(std::isnan(unscaled), overflow);
+
+  unscaled = std::round(unscaled);
+
+  // convert scaled double to int128
+  int32_t sign = unscaled < 0 ? -1 : 1;
+  auto unscaled_abs = std::abs(unscaled);
+
+  // overflow if >= 2^127.
+  // The bound must be tested with '>=' against 2^127 itself: 2^127 - 1 is not
+  // representable as a double and rounds to 2^127, so a '> 2^127 - 1' test would let
+  // an input of exactly 2^127 through. Its high word (2^63) would then wrap to a
+  // negative int64_t and slip past the precision check below.
+  DECIMAL_OVERFLOW_IF(unscaled_abs >= std::ldexp(static_cast<double>(1), 127),
+                      overflow);
+
+  // split the magnitude into its high and low 64-bit words.
+  uint64_t high_bits = static_cast<uint64_t>(std::ldexp(unscaled_abs, -64));
+  uint64_t low_bits = static_cast<uint64_t>(
+      unscaled_abs - std::ldexp(static_cast<double>(high_bits), 64));
+
+  auto result = BasicDecimal128(static_cast<int64_t>(high_bits), low_bits);
+
+  // overflow if > max value based on precision
+  DECIMAL_OVERFLOW_IF(result > GetMaxValue(precision), overflow);
+  return result * sign;
+}
+
+/// Convert a decimal to a double: rebuild the magnitude from its two 64-bit words,
+/// reapply the sign and divide by 10^scale. 'overflow' is not written by this function.
+double ToDouble(const BasicDecimalScalar128& in, bool* overflow) {
+  // convert int128 to double
+  const auto magnitude = BasicDecimal128::Abs(in.value());
+  const double low_part = static_cast<double>(magnitude.low_bits());
+  const double high_part = std::ldexp(static_cast<double>(magnitude.high_bits()), 64);
+  const double unscaled = low_part + high_part;
+
+  // scale double.
+  const int64_t sign = in.value().Sign();
+  return (unscaled * sign) / kDoubleScaleMultipliers[in.scale()];
+}
+
+/// Convert a 64-bit integer to a decimal with the given precision and scale.
+///
+/// \param in         value to convert.
+/// \param precision  target precision.
+/// \param scale      target scale; the result is in * 10^scale.
+/// \param overflow   set to true (and 0 is returned) when |in| needs more than
+///                   (precision - scale) digits before the decimal point.
+BasicDecimal128 FromInt64(int64_t in, int32_t precision, int32_t scale, bool* overflow) {
+  // Take the magnitude in 128-bit space: std::abs(in) is undefined for INT64_MIN,
+  // whereas the 128-bit absolute value of any int64_t is always representable.
+  const BasicDecimal128 in_abs = BasicDecimal128::Abs(BasicDecimal128(in));
+
+  // check if multiplying by scale will cause an overflow.
+  DECIMAL_OVERFLOW_IF(in_abs > GetMaxValue(precision - scale), overflow);
+  return in * BasicDecimal128::GetScaleMultiplier(scale);
+}
+
+// Re-express x at 'out_scale' and check that the result fits in 'out_precision'.
+// Scaling down truncates; any rounding is the caller's responsibility.
+// On overflow, *overflow is set and 0 is returned.
+static BasicDecimal128 ModifyScaleAndPrecision(const BasicDecimalScalar128& x,
+                                               int32_t out_precision, int32_t out_scale,
+                                               bool* overflow) {
+  const int32_t delta_scale = out_scale - x.scale();
+
+  if (delta_scale < 0) {
+    // Dropping digits: do not do any rounding, that is handled by the caller.
+    const auto reduced = x.value().ReduceScaleBy(-delta_scale, false);
+    DECIMAL_OVERFLOW_IF(BasicDecimal128::Abs(reduced) > GetMaxValue(out_precision),
+                        overflow);
+    return reduced;
+  }
+
+  // Adding digits: make sure the value still fits once delta_scale digits are
+  // reserved for the new fractional positions.
+  DECIMAL_OVERFLOW_IF(
+      BasicDecimal128::Abs(x.value()) > GetMaxValue(out_precision - delta_scale),
+      overflow);
+  return x.value().IncreaseScaleBy(delta_scale);
+}
+
+// Rounding strategies understood by ComputeRoundingDelta. Each one determines whether
+// the truncated value is adjusted by +1, -1 or 0 based on the digits being dropped.
+enum RoundType {
+ kRoundTypeCeil, // +1 if +ve and trailing value is > 0, else no rounding.
+ kRoundTypeFloor, // -1 if -ve and trailing value is < 0, else no rounding.
+ kRoundTypeTrunc, // no rounding, truncate the trailing digits.
+ kRoundTypeHalfRoundUp, // if +ve and trailing value is >= half of base, +1.
+ // else if -ve and trailing value is >= half of base, -1.
+};
+
+// Compute the rounding delta for the given rounding type: the amount (-1, 0 or +1) to
+// add to x after it has been truncated from 'x_scale' to 'out_scale'.
+static int32_t ComputeRoundingDelta(const BasicDecimal128& x, int32_t x_scale,
+                                    int32_t out_scale, RoundType type) {
+  // Truncation never rounds, and nothing is rounded when no digits are dropped.
+  if (type == kRoundTypeTrunc || out_scale >= x_scale) {
+    return 0;
+  }
+
+  switch (type) {
+    case kRoundTypeHalfRoundUp: {
+      // round away from zero when the dropped digits are at least half of base.
+      auto base = BasicDecimal128::GetScaleMultiplier(x_scale - out_scale);
+      auto dropped = x % base;
+      if (dropped == 0 || dropped.Abs() < base / 2) {
+        return 0;
+      }
+      return (x < 0) ? -1 : 1;
+    }
+
+    case kRoundTypeCeil: {
+      // no rounding for -ve
+      if (x < 0) {
+        return 0;
+      }
+      auto base = BasicDecimal128::GetScaleMultiplier(x_scale - out_scale);
+      return (x % base == 0) ? 0 : 1;
+    }
+
+    case kRoundTypeFloor: {
+      // no rounding for +ve
+      if (x > 0) {
+        return 0;
+      }
+      auto base = BasicDecimal128::GetScaleMultiplier(x_scale - out_scale);
+      return (x % base == 0) ? 0 : -1;
+    }
+
+    case kRoundTypeTrunc:
+      break;
+  }
+  return 0;
+}
+
+// Re-express x at 'out_scale' (>= 0) and apply the requested rounding.
+// On overflow, *overflow is set and 0 is returned.
+static BasicDecimal128 RoundWithPositiveScale(const BasicDecimalScalar128& x,
+                                              int32_t out_precision, int32_t out_scale,
+                                              RoundType round_type, bool* overflow) {
+  DCHECK_GE(out_scale, 0);
+
+  const auto rescaled = ModifyScaleAndPrecision(x, out_precision, out_scale, overflow);
+  if (*overflow) {
+    return 0;
+  }
+
+  const auto adjustment =
+      ComputeRoundingDelta(x.value(), x.scale(), out_scale, round_type);
+  if (adjustment != 0) {
+    // If there is a rounding delta, the output scale must be less than the input
+    // scale. That means at least one digit is dropped after the decimal. Adding the
+    // delta can add at most one digit before the decimal. So, overflow will occur only
+    // if the output precision has changed.
+    DCHECK_GT(x.scale(), out_scale);
+    const auto rounded = rescaled + adjustment;
+    const bool precision_shrunk = out_precision < x.precision();
+    DECIMAL_OVERFLOW_IF(
+        precision_shrunk && BasicDecimal128::Abs(rounded) > GetMaxValue(out_precision),
+        overflow);
+    return rounded;
+  }
+  return rescaled;
+}
+
+// Modify scale to drop all digits to the right of the decimal and round.
+// Then, zero out 'rounding_scale' number of digits to the left of the decimal point.
+//
+// 'rounding_scale' must be negative; its magnitude is the number of integer digits to
+// zero out. On overflow, *overflow is set and 0 is returned.
+static BasicDecimal128 RoundWithNegativeScale(const BasicDecimalScalar128& x,
+ int32_t out_precision,
+ int32_t rounding_scale,
+ RoundType round_type, bool* overflow) {
+ DCHECK_LT(rounding_scale, 0);
+
+ // get rid of the fractional part.
+ auto scaled = ModifyScaleAndPrecision(x, out_precision, 0, overflow);
+ // Treat the integer as if it had -rounding_scale "fractional" digits being dropped,
+ // so the same delta logic decides whether to round.
+ auto rounding_delta = ComputeRoundingDelta(scaled, 0, -rounding_scale, round_type);
+
+ auto base = BasicDecimal128::GetScaleMultiplier(-rounding_scale);
+ // delta removes the low digits (scaled % base) and adds one 'base' in the rounding
+ // direction when required.
+ auto delta = rounding_delta * base - (scaled % base);
+ // A failure in ModifyScaleAndPrecision above is also caught here, because the macro
+ // tests *overflow first.
+ DECIMAL_OVERFLOW_IF(BasicDecimal128::Abs(scaled) >
+ GetMaxValue(out_precision) - BasicDecimal128::Abs(delta),
+ overflow);
+ return scaled + delta;
+}
+
+/// Round x half-up to 'rounding_scale' digits. A negative 'rounding_scale' rounds to
+/// the left of the decimal point.
+BasicDecimal128 Round(const BasicDecimalScalar128& x, int32_t out_precision,
+                      int32_t out_scale, int32_t rounding_scale, bool* overflow) {
+  if (rounding_scale < 0) {
+    return RoundWithNegativeScale(x, out_precision, rounding_scale,
+                                  RoundType::kRoundTypeHalfRoundUp, overflow);
+  }
+
+  // no-op if target scale is same as arg scale
+  if (x.scale() == out_scale) {
+    return x.value();
+  }
+
+  return RoundWithPositiveScale(x, out_precision, rounding_scale,
+                                RoundType::kRoundTypeHalfRoundUp, overflow);
+}
+
+/// Truncate x to 'rounding_scale' digits without rounding. A negative 'rounding_scale'
+/// truncates to the left of the decimal point.
+BasicDecimal128 Truncate(const BasicDecimalScalar128& x, int32_t out_precision,
+                         int32_t out_scale, int32_t rounding_scale, bool* overflow) {
+  if (rounding_scale < 0) {
+    return RoundWithNegativeScale(x, out_precision, rounding_scale,
+                                  RoundType::kRoundTypeTrunc, overflow);
+  }
+
+  // no-op if target scale is same as arg scale
+  if (x.scale() == out_scale) {
+    return x.value();
+  }
+
+  return RoundWithPositiveScale(x, out_precision, rounding_scale,
+                                RoundType::kRoundTypeTrunc, overflow);
+}
+
+/// Ceiling of x: rescale to 0 fractional digits using ceil rounding, keeping x's
+/// precision.
+BasicDecimal128 Ceil(const BasicDecimalScalar128& x, bool* overflow) {
+  const int32_t kIntegerScale = 0;
+  return RoundWithPositiveScale(x, x.precision(), kIntegerScale,
+                                RoundType::kRoundTypeCeil, overflow);
+}
+
+/// Floor of x: rescale to 0 fractional digits using floor rounding, keeping x's
+/// precision.
+BasicDecimal128 Floor(const BasicDecimalScalar128& x, bool* overflow) {
+  const int32_t kIntegerScale = 0;
+  return RoundWithPositiveScale(x, x.precision(), kIntegerScale,
+                                RoundType::kRoundTypeFloor, overflow);
+}
+
+/// Convert x to a decimal with the given output precision and scale, rounding half-up
+/// when digits are dropped.
+///
+/// \param out_precision  target precision, in (0, DecimalTypeUtil::kMaxPrecision].
+/// \param out_scale      target scale, in [0, DecimalTypeUtil::kMaxScale].
+/// \param overflow       set to true (and 0 is returned) when the converted value does
+///                       not fit in the target precision.
+BasicDecimal128 Convert(const BasicDecimalScalar128& x, int32_t out_precision,
+                        int32_t out_scale, bool* overflow) {
+  DCHECK_GE(out_scale, 0);
+  DCHECK_LE(out_scale, DecimalTypeUtil::kMaxScale);
+  DCHECK_GT(out_precision, 0);
+  // The precision is bounded by the maximum precision, not by the maximum scale.
+  DCHECK_LE(out_precision, DecimalTypeUtil::kMaxPrecision);
+
+  return RoundWithPositiveScale(x, out_precision, out_scale,
+                                RoundType::kRoundTypeHalfRoundUp, overflow);
+}
+
+/// Convert a decimal to int64 by rounding half-up to zero fractional digits.
+/// Sets *overflow and returns 0 when the rounded value is outside the int64 range (or
+/// when the rounding step itself overflowed).
+int64_t ToInt64(const BasicDecimalScalar128& in, bool* overflow) {
+  const auto whole = RoundWithPositiveScale(in, in.precision(), 0 /*scale*/,
+                                            RoundType::kRoundTypeHalfRoundUp, overflow);
+
+  const bool above_max = whole > std::numeric_limits<int64_t>::max();
+  const bool below_min = whole < std::numeric_limits<int64_t>::min();
+  DECIMAL_OVERFLOW_IF(above_max || below_min, overflow);
+
+  // the value is known to fit in 64 bits, so the low word carries all of it.
+  return static_cast<int64_t>(whole.low_bits());
+}
+
+} // namespace decimalops
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/precompiled/decimal_ops.h b/src/arrow/cpp/src/gandiva/precompiled/decimal_ops.h
new file mode 100644
index 000000000..292dce220
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/decimal_ops.h
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include "gandiva/basic_decimal_scalar.h"
+
+namespace gandiva {
+namespace decimalops {
+
+/// Return the sum of 'x' and 'y'.
+/// out_precision and out_scale are passed along for efficiency, they must match
+/// the rules in DecimalTypeSql::GetResultType.
+arrow::BasicDecimal128 Add(const BasicDecimalScalar128& x, const BasicDecimalScalar128& y,
+ int32_t out_precision, int32_t out_scale);
+
+/// Subtract 'y' from 'x', and return the result.
+arrow::BasicDecimal128 Subtract(const BasicDecimalScalar128& x,
+ const BasicDecimalScalar128& y, int32_t out_precision,
+ int32_t out_scale);
+
+/// Multiply 'x' by 'y', and return the result.
+/// 'overflow' is set to true when the product does not fit in the output type.
+arrow::BasicDecimal128 Multiply(const BasicDecimalScalar128& x,
+ const BasicDecimalScalar128& y, int32_t out_precision,
+ int32_t out_scale, bool* overflow);
+
+/// Divide 'x' by 'y', and return the result.
+/// 'context' receives an error message when 'y' is zero.
+arrow::BasicDecimal128 Divide(int64_t context, const BasicDecimalScalar128& x,
+ const BasicDecimalScalar128& y, int32_t out_precision,
+ int32_t out_scale, bool* overflow);
+
+/// Divide 'x' by 'y', and return the remainder.
+/// 'context' receives an error message when 'y' is zero.
+arrow::BasicDecimal128 Mod(int64_t context, const BasicDecimalScalar128& x,
+ const BasicDecimalScalar128& y, int32_t out_precision,
+ int32_t out_scale, bool* overflow);
+
+/// Compare two decimals. Returns :
+/// 0 if x == y
+/// 1 if x > y
+/// -1 if x < y
+int32_t Compare(const BasicDecimalScalar128& x, const BasicDecimalScalar128& y);
+
+/// Convert to decimal from double.
+BasicDecimal128 FromDouble(double in, int32_t precision, int32_t scale, bool* overflow);
+
+/// Convert from decimal to double.
+double ToDouble(const BasicDecimalScalar128& in, bool* overflow);
+
+/// Convert to decimal from gdv_int64.
+BasicDecimal128 FromInt64(int64_t in, int32_t precision, int32_t scale, bool* overflow);
+
+/// Convert from decimal to gdv_int64.
+int64_t ToInt64(const BasicDecimalScalar128& in, bool* overflow);
+
+/// Convert from one decimal scale/precision to another.
+BasicDecimal128 Convert(const BasicDecimalScalar128& x, int32_t out_precision,
+ int32_t out_scale, bool* overflow);
+
+/// Round decimal to 'rounding_scale' digits; a negative 'rounding_scale' rounds to the
+/// left of the decimal point.
+BasicDecimal128 Round(const BasicDecimalScalar128& x, int32_t out_precision,
+ int32_t out_scale, int32_t rounding_scale, bool* overflow);
+
+/// Truncate decimal to 'rounding_scale' digits; a negative 'rounding_scale' truncates
+/// to the left of the decimal point.
+BasicDecimal128 Truncate(const BasicDecimalScalar128& x, int32_t out_precision,
+ int32_t out_scale, int32_t rounding_scale, bool* overflow);
+
+/// Ceil decimal (result has scale 0).
+BasicDecimal128 Ceil(const BasicDecimalScalar128& x, bool* overflow);
+
+/// Floor decimal (result has scale 0).
+BasicDecimal128 Floor(const BasicDecimalScalar128& x, bool* overflow);
+
+} // namespace decimalops
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/precompiled/decimal_ops_test.cc b/src/arrow/cpp/src/gandiva/precompiled/decimal_ops_test.cc
new file mode 100644
index 000000000..be8a1fe8a
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/decimal_ops_test.cc
@@ -0,0 +1,1095 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <limits>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "arrow/testing/gtest_util.h"
+#include "gandiva/decimal_scalar.h"
+#include "gandiva/decimal_type_util.h"
+#include "gandiva/execution_context.h"
+#include "gandiva/precompiled/decimal_ops.h"
+#include "gandiva/precompiled/types.h"
+
+namespace gandiva {
+
+// Decimal128 values whose digits are 35, 36 and 38 nines respectively. The tests
+// below use them as very large operands.
+const arrow::Decimal128 kThirtyFive9s(std::string(35, '9'));
+const arrow::Decimal128 kThirtySix9s(std::string(36, '9'));
+const arrow::Decimal128 kThirtyEight9s(std::string(38, '9'));
+
+// Fixture for the decimal precompiled-op tests. Each XxxAndVerify helper forwards
+// to Verify() / VerifyAllSign() with the matching DecimalTypeUtil operation.
+class TestDecimalSql : public ::testing::Test {
+ protected:
+  // Runs `op` on (x, y) and checks the produced value and overflow flag against
+  // the expectations.
+  static void Verify(DecimalTypeUtil::Op op, const DecimalScalar128& x,
+                     const DecimalScalar128& y, const DecimalScalar128& expected_result,
+                     bool expected_overflow);
+
+  // Runs Verify() for all four sign combinations of the operands, negating the
+  // expected output where the operation requires it.
+  static void VerifyAllSign(DecimalTypeUtil::Op op, const DecimalScalar128& left,
+                            const DecimalScalar128& right,
+                            const DecimalScalar128& expected_output,
+                            bool expected_overflow);
+
+  // x + y; overflow is always expected to be false.
+  void AddAndVerify(const DecimalScalar128& x, const DecimalScalar128& y,
+                    const DecimalScalar128& expected_result) {
+    // TODO: overflow checks
+    Verify(DecimalTypeUtil::kOpAdd, x, y, expected_result, false);
+  }
+
+  // x - y; overflow is always expected to be false.
+  void SubtractAndVerify(const DecimalScalar128& x, const DecimalScalar128& y,
+                         const DecimalScalar128& expected_result) {
+    // TODO: overflow checks
+    Verify(DecimalTypeUtil::kOpSubtract, x, y, expected_result, false);
+  }
+
+  // x * y for the signs as given.
+  void MultiplyAndVerify(const DecimalScalar128& x, const DecimalScalar128& y,
+                         const DecimalScalar128& expected_result,
+                         bool expected_overflow) {
+    Verify(DecimalTypeUtil::kOpMultiply, x, y, expected_result, expected_overflow);
+  }
+
+  // x * y for every sign combination.
+  void MultiplyAndVerifyAllSign(const DecimalScalar128& x, const DecimalScalar128& y,
+                                const DecimalScalar128& expected_result,
+                                bool expected_overflow) {
+    VerifyAllSign(DecimalTypeUtil::kOpMultiply, x, y, expected_result,
+                  expected_overflow);
+  }
+
+  // x / y for the signs as given.
+  void DivideAndVerify(const DecimalScalar128& x, const DecimalScalar128& y,
+                       const DecimalScalar128& expected_result, bool expected_overflow) {
+    Verify(DecimalTypeUtil::kOpDivide, x, y, expected_result, expected_overflow);
+  }
+
+  // x / y for every sign combination.
+  void DivideAndVerifyAllSign(const DecimalScalar128& x, const DecimalScalar128& y,
+                              const DecimalScalar128& expected_result,
+                              bool expected_overflow) {
+    VerifyAllSign(DecimalTypeUtil::kOpDivide, x, y, expected_result, expected_overflow);
+  }
+
+  // x mod y for the signs as given.
+  void ModAndVerify(const DecimalScalar128& x, const DecimalScalar128& y,
+                    const DecimalScalar128& expected_result, bool expected_overflow) {
+    Verify(DecimalTypeUtil::kOpMod, x, y, expected_result, expected_overflow);
+  }
+
+  // x mod y for every sign combination.
+  void ModAndVerifyAllSign(const DecimalScalar128& x, const DecimalScalar128& y,
+                           const DecimalScalar128& expected_result,
+                           bool expected_overflow) {
+    VerifyAllSign(DecimalTypeUtil::kOpMod, x, y, expected_result, expected_overflow);
+  }
+};
+
+#define EXPECT_DECIMAL_EQ(op, x, y, expected_result, expected_overflow, actual_result, \
+ actual_overflow) \
+ { \
+ EXPECT_TRUE(expected_overflow == actual_overflow) \
+ << op << "(" << (x).ToString() << " and " << (y).ToString() << ")" \
+ << " expected overflow : " << expected_overflow \
+ << " actual overflow : " << actual_overflow; \
+ if (!expected_overflow) { \
+ EXPECT_TRUE(expected_result == actual_result) \
+ << op << "(" << (x).ToString() << " and " << (y).ToString() << ")" \
+ << " expected : " << expected_result.ToString() \
+ << " actual : " << actual_result.ToString(); \
+ } \
+ }
+
+// Applies `op` to (x, y) through the decimalops entry points and checks the
+// result. The output precision/scale come from DecimalTypeUtil::GetResultType;
+// the produced value and overflow flag are compared with `expected_result` and
+// `expected_overflow` via EXPECT_DECIMAL_EQ. Unknown ops fail the test.
+void TestDecimalSql::Verify(DecimalTypeUtil::Op op, const DecimalScalar128& x,
+                            const DecimalScalar128& y,
+                            const DecimalScalar128& expected_result,
+                            bool expected_overflow) {
+  // Arrow types describing the two operands.
+  const auto lhs_type =
+      std::make_shared<arrow::Decimal128Type>(x.precision(), x.scale());
+  const auto rhs_type =
+      std::make_shared<arrow::Decimal128Type>(y.precision(), y.scale());
+  bool actual_overflow = false;
+  // Divide and Mod take a context handle; a zero handle is passed here.
+  const int64_t context_handle = 0;
+
+  Decimal128TypePtr result_type;
+  ARROW_EXPECT_OK(
+      DecimalTypeUtil::GetResultType(op, {lhs_type, rhs_type}, &result_type));
+
+  arrow::BasicDecimal128 actual_value;
+  std::string op_label;
+  switch (op) {
+    case DecimalTypeUtil::kOpAdd:
+      op_label = "add";
+      actual_value =
+          decimalops::Add(x, y, result_type->precision(), result_type->scale());
+      break;
+
+    case DecimalTypeUtil::kOpSubtract:
+      op_label = "subtract";
+      actual_value =
+          decimalops::Subtract(x, y, result_type->precision(), result_type->scale());
+      break;
+
+    case DecimalTypeUtil::kOpMultiply:
+      op_label = "multiply";
+      actual_value = decimalops::Multiply(x, y, result_type->precision(),
+                                          result_type->scale(), &actual_overflow);
+      break;
+
+    case DecimalTypeUtil::kOpDivide:
+      op_label = "divide";
+      actual_value =
+          decimalops::Divide(context_handle, x, y, result_type->precision(),
+                             result_type->scale(), &actual_overflow);
+      break;
+
+    case DecimalTypeUtil::kOpMod:
+      op_label = "mod";
+      actual_value = decimalops::Mod(context_handle, x, y, result_type->precision(),
+                                     result_type->scale(), &actual_overflow);
+      break;
+
+    default:
+      // not implemented.
+      ASSERT_FALSE(true);
+  }
+
+  // Wrap the raw value in the derived output type before comparing.
+  const DecimalScalar128 actual_result(actual_value, result_type->precision(),
+                                       result_type->scale());
+  EXPECT_DECIMAL_EQ(op_label, x, y, expected_result, expected_overflow, actual_result,
+                    actual_overflow);
+}
+
+// Runs Verify() for the four sign combinations of (left, right).
+// For mod the expected output follows the sign of `left` only; for multiply and
+// divide it is negated once for each negated operand. Any other op fails the
+// test after the first two combinations have been checked.
+void TestDecimalSql::VerifyAllSign(DecimalTypeUtil::Op op, const DecimalScalar128& left,
+                                   const DecimalScalar128& right,
+                                   const DecimalScalar128& expected_output,
+                                   bool expected_overflow) {
+  const DecimalScalar128 neg_left = -left;
+  const DecimalScalar128 neg_right = -right;
+  const DecimalScalar128 neg_expected = -expected_output;
+
+  // both +ve
+  Verify(op, left, right, expected_output, expected_overflow);
+
+  // left -ve
+  Verify(op, neg_left, right, neg_expected, expected_overflow);
+
+  const bool is_mod = (op == DecimalTypeUtil::kOpMod);
+  if (!is_mod) {
+    ASSERT_TRUE(op == DecimalTypeUtil::kOpMultiply || op == DecimalTypeUtil::kOpDivide);
+  }
+
+  // right -ve
+  const DecimalScalar128& right_neg_expected = is_mod ? expected_output : neg_expected;
+  Verify(op, left, neg_right, right_neg_expected, expected_overflow);
+
+  // both -ve
+  const DecimalScalar128& both_neg_expected = is_mod ? neg_expected : expected_output;
+  Verify(op, neg_left, neg_right, both_neg_expected, expected_overflow);
+}
+
+// Exercises decimalops::Add through AddAndVerify. Each call lists x, y and the
+// expected sum; Verify() compares against a scalar built with the type returned
+// by DecimalTypeUtil::GetResultType, and no overflow is expected.
+TEST_F(TestDecimalSql, Add) {
+ // fast-path
+ AddAndVerify(DecimalScalar128{"201", 30, 3}, // x
+ DecimalScalar128{"301", 30, 3}, // y
+ DecimalScalar128{"502", 31, 3}); // expected
+
+ // max precision
+ AddAndVerify(DecimalScalar128{"09999999999999999999999999999999000000", 38, 5}, // x
+ DecimalScalar128{"100", 38, 7}, // y
+ DecimalScalar128{"99999999999999999999999999999990000010", 38, 6});
+
+ // Both -ve
+ AddAndVerify(DecimalScalar128{"-201", 30, 3}, // x
+ DecimalScalar128{"-301", 30, 2}, // y
+ DecimalScalar128{"-3211", 32, 3}); // expected
+
+ // -ve and max precision
+ AddAndVerify(DecimalScalar128{"-09999999999999999999999999999999000000", 38, 5}, // x
+ DecimalScalar128{"-100", 38, 7}, // y
+ DecimalScalar128{"-99999999999999999999999999999990000010", 38, 6});
+}
+
+// Exercises decimalops::Subtract through SubtractAndVerify. Each call lists x, y
+// and the expected difference (x - y); no overflow is expected.
+TEST_F(TestDecimalSql, Subtract) {
+ // fast-path
+ SubtractAndVerify(DecimalScalar128{"201", 30, 3}, // x
+ DecimalScalar128{"301", 30, 3}, // y
+ DecimalScalar128{"-100", 31, 3}); // expected
+
+ // max precision
+ SubtractAndVerify(
+ DecimalScalar128{"09999999999999999999999999999999000000", 38, 5}, // x
+ DecimalScalar128{"100", 38, 7}, // y
+ DecimalScalar128{"99999999999999999999999999999989999990", 38, 6});
+
+ // Both -ve
+ SubtractAndVerify(DecimalScalar128{"-201", 30, 3}, // x
+ DecimalScalar128{"-301", 30, 2}, // y
+ DecimalScalar128{"2809", 32, 3}); // expected
+
+ // -ve and max precision
+ SubtractAndVerify(
+ DecimalScalar128{"-09999999999999999999999999999999000000", 38, 5}, // x
+ DecimalScalar128{"-100", 38, 7}, // y
+ DecimalScalar128{"-99999999999999999999999999999989999990", 38, 6});
+}
+
+// Exercises decimalops::Multiply. Each call lists x, y, the expected product and
+// whether overflow is expected; when overflow is expected the product value is
+// not compared (see EXPECT_DECIMAL_EQ). The *AllSign variant repeats the check
+// for every sign combination of the operands.
+TEST_F(TestDecimalSql, Multiply) {
+ // fast-path : out_precision < 38
+ MultiplyAndVerifyAllSign(DecimalScalar128{"201", 10, 3}, // x
+ DecimalScalar128{"301", 10, 2}, // y
+ DecimalScalar128{"60501", 21, 5}, // expected
+ false); // overflow
+
+ // right 0
+ MultiplyAndVerify(DecimalScalar128{"201", 20, 3}, // x
+ DecimalScalar128{"0", 20, 2}, // y
+ DecimalScalar128{"0", 38, 5}, // expected
+ false); // overflow
+
+ // left 0
+ MultiplyAndVerify(DecimalScalar128{"0", 20, 3}, // x
+ DecimalScalar128{"301", 20, 2}, // y
+ DecimalScalar128{"0", 38, 5}, // expected
+ false); // overflow
+
+ // out_precision == 38, small input values, no trimming of scale (scale <= 6 doesn't
+ // get trimmed).
+ MultiplyAndVerify(DecimalScalar128{"201", 20, 3}, // x
+ DecimalScalar128{"301", 20, 2}, // y
+ DecimalScalar128{"60501", 38, 5}, // expected
+ false); // overflow
+
+ // out_precision == 38, large values, no trimming of scale (scale <= 6 doesn't
+ // get trimmed).
+ MultiplyAndVerifyAllSign(
+ DecimalScalar128{"201", 20, 3}, // x
+ DecimalScalar128{kThirtyFive9s, 35, 2}, // y
+ DecimalScalar128{"20099999999999999999999999999999999799", 38, 5}, // expected
+ false); // overflow
+
+ // out_precision == 38, very large values, no trimming of scale (scale <= 6 doesn't
+ // get trimmed). overflow expected.
+ MultiplyAndVerifyAllSign(DecimalScalar128{"201", 20, 3}, // x
+ DecimalScalar128{kThirtySix9s, 35, 2}, // y
+ DecimalScalar128{"0", 38, 5}, // expected
+ true); // overflow
+
+ MultiplyAndVerifyAllSign(DecimalScalar128{"201", 20, 3}, // x
+ DecimalScalar128{kThirtyEight9s, 35, 2}, // y
+ DecimalScalar128{"0", 38, 5}, // expected
+ true); // overflow
+
+ // out_precision == 38, small input values, trimming of scale.
+ MultiplyAndVerifyAllSign(DecimalScalar128{"201", 20, 5}, // x
+ DecimalScalar128{"301", 20, 5}, // y
+ DecimalScalar128{"61", 38, 7}, // expected
+ false); // overflow
+
+ // out_precision == 38, large values, trimming of scale.
+ MultiplyAndVerifyAllSign(
+ DecimalScalar128{"201", 20, 5}, // x
+ DecimalScalar128{kThirtyFive9s, 35, 5}, // y
+ DecimalScalar128{"2010000000000000000000000000000000", 38, 6}, // expected
+ false); // overflow
+
+ // out_precision == 38, very large values, trimming of scale (requires convert to 256).
+ MultiplyAndVerifyAllSign(
+ DecimalScalar128{kThirtyFive9s, 38, 20}, // x
+ DecimalScalar128{kThirtySix9s, 38, 20}, // y
+ DecimalScalar128{"9999999999999999999999999999999999890", 38, 6}, // expected
+ false); // overflow
+
+ // out_precision == 38, very large values, trimming of scale (requires convert to 256).
+ // should cause overflow.
+ MultiplyAndVerifyAllSign(DecimalScalar128{kThirtyFive9s, 38, 4}, // x
+ DecimalScalar128{kThirtySix9s, 38, 4}, // y
+ DecimalScalar128{"0", 38, 6}, // expected
+ true); // overflow
+
+ // corner cases.
+ MultiplyAndVerifyAllSign(
+ DecimalScalar128{0, UINT64_MAX, 38, 4}, // x
+ DecimalScalar128{0, UINT64_MAX, 38, 4}, // y
+ DecimalScalar128{"3402823669209384634264811192843491082", 38, 6}, // expected
+ false); // overflow
+
+ MultiplyAndVerifyAllSign(
+ DecimalScalar128{0, UINT64_MAX, 38, 4}, // x
+ DecimalScalar128{0, INT64_MAX, 38, 4}, // y
+ DecimalScalar128{"1701411834604692317040171876053197783", 38, 6}, // expected
+ false); // overflow
+
+ MultiplyAndVerifyAllSign(DecimalScalar128{"201", 38, 38}, // x
+ DecimalScalar128{"301", 38, 38}, // y
+ DecimalScalar128{"0", 38, 37}, // expected
+ false); // overflow
+
+ MultiplyAndVerifyAllSign(DecimalScalar128{0, UINT64_MAX, 38, 38}, // x
+ DecimalScalar128{0, UINT64_MAX, 38, 38}, // y
+ DecimalScalar128{"0", 38, 37}, // expected
+ false); // overflow
+
+ MultiplyAndVerifyAllSign(
+ DecimalScalar128{kThirtyFive9s, 38, 38}, // x
+ DecimalScalar128{kThirtySix9s, 38, 38}, // y
+ DecimalScalar128{"100000000000000000000000000000000", 38, 37}, // expected
+ false); // overflow
+}
+
+// Exercises decimalops::Divide for every sign combination of the operands. Each
+// call lists x, y, the expected quotient and whether overflow is expected; the
+// final argument is the overflow expectation even where the trailing comment is
+// missing.
+TEST_F(TestDecimalSql, Divide) {
+ DivideAndVerifyAllSign(DecimalScalar128{"201", 10, 3}, // x
+ DecimalScalar128{"301", 10, 2}, // y
+ DecimalScalar128{"6677740863787", 23, 14}, // expected
+ false); // overflow
+
+ DivideAndVerifyAllSign(DecimalScalar128{"201", 20, 3}, // x
+ DecimalScalar128{"301", 20, 2}, // y
+ DecimalScalar128{"667774086378737542", 38, 19}, // expected
+ false); // overflow
+
+ DivideAndVerifyAllSign(DecimalScalar128{"201", 20, 3}, // x
+ DecimalScalar128{kThirtyFive9s, 35, 2}, // y
+ DecimalScalar128{"0", 38, 19}, // expected
+ false); // overflow
+
+ DivideAndVerifyAllSign(
+ DecimalScalar128{kThirtyFive9s, 35, 6}, // x
+ DecimalScalar128{"201", 20, 3}, // y
+ DecimalScalar128{"497512437810945273631840796019900493", 38, 6}, // expected
+ false); // overflow
+
+ DivideAndVerifyAllSign(DecimalScalar128{kThirtyEight9s, 38, 20}, // x
+ DecimalScalar128{kThirtyFive9s, 38, 20}, // y
+ DecimalScalar128{"1000000000", 38, 6}, // expected
+ false); // overflow
+
+ DivideAndVerifyAllSign(DecimalScalar128{"31939128063561476055", 38, 8}, // x
+ DecimalScalar128{"10000", 20, 0}, // y
+ DecimalScalar128{"3193912806356148", 38, 8}, // expected
+ false);
+
+ // Corner cases
+ DivideAndVerifyAllSign(DecimalScalar128{0, UINT64_MAX, 38, 4}, // x
+ DecimalScalar128{0, UINT64_MAX, 38, 4}, // y
+ DecimalScalar128{"1000000", 38, 6}, // expected
+ false); // overflow
+
+ DivideAndVerifyAllSign(DecimalScalar128{0, UINT64_MAX, 38, 4}, // x
+ DecimalScalar128{0, INT64_MAX, 38, 4}, // y
+ DecimalScalar128{"2000000", 38, 6}, // expected
+ false); // overflow
+
+ DivideAndVerifyAllSign(DecimalScalar128{0, UINT64_MAX, 19, 5}, // x
+ DecimalScalar128{0, INT64_MAX, 19, 5}, // y
+ DecimalScalar128{"20000000000000000001", 38, 19}, // expected
+ false); // overflow
+
+ DivideAndVerifyAllSign(DecimalScalar128{kThirtyFive9s, 38, 37}, // x
+ DecimalScalar128{kThirtyFive9s, 38, 38}, // y
+ DecimalScalar128{"10000000", 38, 6}, // expected
+ false); // overflow
+
+ // overflow
+ DivideAndVerifyAllSign(DecimalScalar128{kThirtyEight9s, 38, 6}, // x
+ DecimalScalar128{"201", 20, 3}, // y
+ DecimalScalar128{"0", 38, 6}, // expected
+ true);
+}
+
+// Exercises decimalops::Mod. Each call lists x, y, the expected remainder and
+// whether overflow is expected. In the *AllSign variant the expected remainder
+// takes the sign of x (see VerifyAllSign).
+TEST_F(TestDecimalSql, Mod) {
+ ModAndVerifyAllSign(DecimalScalar128{"201", 10, 3}, // x
+ DecimalScalar128{"301", 10, 2}, // y
+ DecimalScalar128{"201", 10, 3}, // expected
+ false); // overflow
+
+ ModAndVerify(DecimalScalar128{"201", 20, 2}, // x
+ DecimalScalar128{"301", 20, 3}, // y
+ DecimalScalar128{"204", 20, 3}, // expected
+ false); // overflow
+
+ ModAndVerifyAllSign(DecimalScalar128{"201", 20, 3}, // x
+ DecimalScalar128{kThirtyFive9s, 35, 2}, // y
+ DecimalScalar128{"201", 20, 3}, // expected
+ false); // overflow
+
+ ModAndVerifyAllSign(DecimalScalar128{kThirtyFive9s, 35, 6}, // x
+ DecimalScalar128{"201", 20, 3}, // y
+ DecimalScalar128{"180999", 23, 6}, // expected
+ false); // overflow
+
+ ModAndVerifyAllSign(DecimalScalar128{kThirtyEight9s, 38, 20}, // x
+ DecimalScalar128{kThirtyFive9s, 38, 21}, // y
+ DecimalScalar128{"9990", 38, 21}, // expected
+ false); // overflow
+
+ ModAndVerifyAllSign(DecimalScalar128{"31939128063561476055", 38, 8}, // x
+ DecimalScalar128{"10000", 20, 0}, // y
+ DecimalScalar128{"63561476055", 28, 8}, // expected
+ false);
+
+ ModAndVerifyAllSign(DecimalScalar128{0, UINT64_MAX, 38, 4}, // x
+ DecimalScalar128{0, UINT64_MAX, 38, 4}, // y
+ DecimalScalar128{"0", 38, 4}, // expected
+ false); // overflow
+
+ ModAndVerifyAllSign(DecimalScalar128{0, UINT64_MAX, 38, 4}, // x
+ DecimalScalar128{0, INT64_MAX, 38, 4}, // y
+ DecimalScalar128{"1", 38, 4}, // expected
+ false); // overflow
+}
+
+// Checks that Divide and Mod report a zero divisor on the ExecutionContext as
+// "divide by zero error", and that a non-zero divisor leaves the context free of
+// errors. The returned values are intentionally ignored.
+TEST_F(TestDecimalSql, DivideByZero) {
+  gandiva::ExecutionContext context;
+  // The ops take the context as an integer handle.
+  const gdv_int64 context_handle = reinterpret_cast<gdv_int64>(&context);
+  // Initialised so the ops are never handed a pointer to an indeterminate value.
+  bool overflow = false;
+
+  // divide-by-zero should cause an error.
+  context.Reset();
+  int32_t result_precision = 38;
+  int32_t result_scale = 19;
+  decimalops::Divide(context_handle, DecimalScalar128{"201", 20, 3},
+                     DecimalScalar128{"0", 20, 2}, result_precision, result_scale,
+                     &overflow);
+  EXPECT_TRUE(context.has_error());
+  EXPECT_EQ(context.get_error(), "divide by zero error");
+
+  // divide-by-nonzero should not cause an error.
+  context.Reset();
+  decimalops::Divide(context_handle, DecimalScalar128{"201", 20, 3},
+                     DecimalScalar128{"1", 20, 2}, result_precision, result_scale,
+                     &overflow);
+  EXPECT_FALSE(context.has_error());
+
+  // mod-by-zero should cause an error.
+  context.Reset();
+  result_precision = 20;
+  result_scale = 3;
+  decimalops::Mod(context_handle, DecimalScalar128{"201", 20, 3},
+                  DecimalScalar128{"0", 20, 2}, result_precision, result_scale,
+                  &overflow);
+  EXPECT_TRUE(context.has_error());
+  EXPECT_EQ(context.get_error(), "divide by zero error");
+
+  // mod-by-nonzero should not cause an error.
+  context.Reset();
+  decimalops::Mod(context_handle, DecimalScalar128{"201", 20, 3},
+                  DecimalScalar128{"1", 20, 2}, result_precision, result_scale,
+                  &overflow);
+  EXPECT_FALSE(context.has_error());
+}
+
+// Exercises decimalops::Compare, which is expected to return 0, 1 or -1 for
+// x == y, x > y and x < y. Covers equal scales, differing scales (for several
+// precisions), negative operands and very large values.
+TEST_F(TestDecimalSql, Compare) {
+ // x.scale == y.scale
+ EXPECT_EQ(
+ 0, decimalops::Compare(DecimalScalar128{100, 38, 6}, DecimalScalar128{100, 38, 6}));
+ EXPECT_EQ(
+ 1, decimalops::Compare(DecimalScalar128{200, 38, 6}, DecimalScalar128{100, 38, 6}));
+ EXPECT_EQ(-1, decimalops::Compare(DecimalScalar128{100, 38, 6},
+ DecimalScalar128{200, 38, 6}));
+
+ // x.scale == y.scale, with -ve.
+ EXPECT_EQ(0, decimalops::Compare(DecimalScalar128{-100, 38, 6},
+ DecimalScalar128{-100, 38, 6}));
+ EXPECT_EQ(-1, decimalops::Compare(DecimalScalar128{-200, 38, 6},
+ DecimalScalar128{-100, 38, 6}));
+ EXPECT_EQ(1, decimalops::Compare(DecimalScalar128{-100, 38, 6},
+ DecimalScalar128{-200, 38, 6}));
+ EXPECT_EQ(1, decimalops::Compare(DecimalScalar128{100, 38, 6},
+ DecimalScalar128{-200, 38, 6}));
+
+ // The scale-mismatch cases are repeated for several operand precisions.
+ for (int32_t precision : {16, 36, 38}) {
+ // x_scale > y_scale
+ EXPECT_EQ(0, decimalops::Compare(DecimalScalar128{10000, precision, 6},
+ DecimalScalar128{100, precision, 4}));
+ EXPECT_EQ(1, decimalops::Compare(DecimalScalar128{20000, precision, 6},
+ DecimalScalar128{100, precision, 4}));
+ EXPECT_EQ(-1, decimalops::Compare(DecimalScalar128{10000, precision, 6},
+ DecimalScalar128{200, precision, 4}));
+
+ // x.scale > y.scale, with -ve
+ EXPECT_EQ(0, decimalops::Compare(DecimalScalar128{-10000, precision, 6},
+ DecimalScalar128{-100, precision, 4}));
+ EXPECT_EQ(-1, decimalops::Compare(DecimalScalar128{-20000, precision, 6},
+ DecimalScalar128{-100, precision, 4}));
+ EXPECT_EQ(1, decimalops::Compare(DecimalScalar128{-10000, precision, 6},
+ DecimalScalar128{-200, precision, 4}));
+ EXPECT_EQ(1, decimalops::Compare(DecimalScalar128{10000, precision, 6},
+ DecimalScalar128{-200, precision, 4}));
+
+ // x.scale < y.scale
+ EXPECT_EQ(0, decimalops::Compare(DecimalScalar128{100, precision, 4},
+ DecimalScalar128{10000, precision, 6}));
+ EXPECT_EQ(1, decimalops::Compare(DecimalScalar128{200, precision, 4},
+ DecimalScalar128{10000, precision, 6}));
+ EXPECT_EQ(-1, decimalops::Compare(DecimalScalar128{100, precision, 4},
+ DecimalScalar128{20000, precision, 6}));
+
+ // x.scale < y.scale, with -ve
+ EXPECT_EQ(0, decimalops::Compare(DecimalScalar128{-100, precision, 4},
+ DecimalScalar128{-10000, precision, 6}));
+ EXPECT_EQ(-1, decimalops::Compare(DecimalScalar128{-200, precision, 4},
+ DecimalScalar128{-10000, precision, 6}));
+ EXPECT_EQ(1, decimalops::Compare(DecimalScalar128{-100, precision, 4},
+ DecimalScalar128{-20000, precision, 6}));
+ EXPECT_EQ(1, decimalops::Compare(DecimalScalar128{100, precision, 4},
+ DecimalScalar128{-200, precision, 6}));
+ }
+
+ // large cases.
+ EXPECT_EQ(0, decimalops::Compare(DecimalScalar128{kThirtyEight9s, 38, 6},
+ DecimalScalar128{kThirtyEight9s, 38, 6}));
+
+ EXPECT_EQ(1, decimalops::Compare(DecimalScalar128{kThirtyEight9s, 38, 6},
+ DecimalScalar128{kThirtySix9s, 38, 4}));
+
+ EXPECT_EQ(-1, decimalops::Compare(DecimalScalar128{kThirtyEight9s, 38, 6},
+ DecimalScalar128{kThirtyEight9s, 38, 4}));
+}
+
+// Table-driven check of decimalops::Round. Every row holds the expected scalar
+// (whose precision/scale are also passed as the output type), the input, the
+// rounding scale and the expected overflow flag.
+TEST_F(TestDecimalSql, Round) {
+  // expected, input, rounding_scale, overflow
+  using TupleType = std::tuple<DecimalScalar128, DecimalScalar128, int32_t, bool>;
+  const std::vector<TupleType> test_values = {
+      // examples from
+      // https://dev.mysql.com/doc/refman/5.7/en/mathematical-functions.html#function_round
+      std::make_tuple(DecimalScalar128{-1, 36, 0}, DecimalScalar128{-123, 38, 2}, 0,
+                      false),
+      std::make_tuple(DecimalScalar128{-2, 36, 0}, DecimalScalar128{-158, 38, 2}, 0,
+                      false),
+      std::make_tuple(DecimalScalar128{2, 36, 0}, DecimalScalar128{158, 38, 2}, 0, false),
+      std::make_tuple(DecimalScalar128{-13, 36, 1}, DecimalScalar128{-1298, 38, 3}, 1,
+                      false),
+      std::make_tuple(DecimalScalar128{-1, 35, 0}, DecimalScalar128{-1298, 38, 3}, 0,
+                      false),
+      std::make_tuple(DecimalScalar128{20, 35, 0}, DecimalScalar128{23298, 38, 3}, -1,
+                      false),
+      std::make_tuple(DecimalScalar128{100, 38, 0}, DecimalScalar128{122, 38, 0}, -2,
+                      false),
+      std::make_tuple(DecimalScalar128{3, 37, 0}, DecimalScalar128{25, 38, 1}, 0, false),
+
+      // border cases
+      std::make_tuple(DecimalScalar128{INT64_MIN / 100, 36, 0},
+                      DecimalScalar128{INT64_MIN, 38, 2}, 0, false),
+
+      std::make_tuple(DecimalScalar128{INT64_MIN, 38, 0},
+                      DecimalScalar128{INT64_MIN, 38, 0}, 0, false),
+      std::make_tuple(DecimalScalar128{0, 0, 36, 0}, DecimalScalar128{0, 0, 38, 2}, 0,
+                      false),
+      std::make_tuple(DecimalScalar128{INT64_MAX, 38, 0},
+                      DecimalScalar128{INT64_MAX, 38, 0}, 0, false),
+
+      std::make_tuple(DecimalScalar128{INT64_MAX / 100, 36, 0},
+                      DecimalScalar128{INT64_MAX, 38, 2}, 0, false),
+
+      // large scales
+      std::make_tuple(DecimalScalar128{0, 0, 22, 0}, DecimalScalar128{12345, 38, 16}, 0,
+                      false),
+
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128{124}, 22, 0},
+          DecimalScalar128{BasicDecimal128{12389}.IncreaseScaleBy(14), 38, 16}, 0, false),
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128{-124}, 22, 0},
+          DecimalScalar128{BasicDecimal128{-12389}.IncreaseScaleBy(14), 38, 16}, 0,
+          false),
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128{124}, 6, 0},
+          DecimalScalar128{BasicDecimal128{12389}.IncreaseScaleBy(30), 38, 32}, 0, false),
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128{-124}, 6, 0},
+          DecimalScalar128{BasicDecimal128{-12389}.IncreaseScaleBy(30), 38, 32}, 0,
+          false),
+
+      // scale bigger than arg
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128{12389}.IncreaseScaleBy(32), 38, 32},
+          DecimalScalar128{BasicDecimal128{12389}.IncreaseScaleBy(32), 38, 32}, 35,
+          false),
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128{-12389}.IncreaseScaleBy(32), 38, 32},
+          DecimalScalar128{BasicDecimal128{-12389}.IncreaseScaleBy(32), 38, 32}, 35,
+          false),
+
+      // overflow
+      std::make_tuple(DecimalScalar128{0, 0, 1, 0}, DecimalScalar128{99, 2, 1}, 0, true),
+  };
+
+  // Iterate by const reference so the tuple is not copied on every iteration.
+  for (const auto& iter : test_values) {
+    auto expected = std::get<0>(iter);
+    auto input = std::get<1>(iter);
+    auto rounding_scale = std::get<2>(iter);
+    auto expected_overflow = std::get<3>(iter);
+    bool overflow = false;
+
+    EXPECT_EQ(expected.value(),
+              decimalops::Round(input, expected.precision(), expected.scale(),
+                                rounding_scale, &overflow))
+        << " failed on input " << input << " rounding scale " << rounding_scale;
+    if (expected_overflow) {
+      ASSERT_TRUE(overflow) << "overflow expected for input " << input;
+    } else {
+      ASSERT_FALSE(overflow) << "overflow not expected for input " << input;
+    }
+  }
+}
+
+// Table-driven check of decimalops::Truncate. Every row holds the expected scalar
+// (whose precision/scale are also passed as the output type), the input, the
+// rounding scale and the expected overflow flag.
+TEST_F(TestDecimalSql, Truncate) {
+  // expected, input, rounding_scale, overflow
+  using TupleType = std::tuple<DecimalScalar128, DecimalScalar128, int32_t, bool>;
+  const std::vector<TupleType> test_values = {
+      // examples from
+      // https://dev.mysql.com/doc/refman/5.7/en/mathematical-functions.html#function_truncate
+      std::make_tuple(DecimalScalar128{12, 36, 1}, DecimalScalar128{1223, 38, 3}, 1,
+                      false),
+      std::make_tuple(DecimalScalar128{19, 36, 1}, DecimalScalar128{1999, 38, 3}, 1,
+                      false),
+      std::make_tuple(DecimalScalar128{1, 35, 0}, DecimalScalar128{1999, 38, 3}, 0,
+                      false),
+      std::make_tuple(DecimalScalar128{-19, 36, 1}, DecimalScalar128{-1999, 38, 3}, 1,
+                      false),
+      std::make_tuple(DecimalScalar128{100, 38, 0}, DecimalScalar128{122, 38, 0}, -2,
+                      false),
+      std::make_tuple(DecimalScalar128{1028, 38, 0}, DecimalScalar128{1028, 38, 0}, 0,
+                      false),
+
+      // border cases
+      std::make_tuple(DecimalScalar128{BasicDecimal128{INT64_MIN / 100}, 36, 0},
+                      DecimalScalar128{INT64_MIN, 38, 2}, 0, false),
+
+      std::make_tuple(DecimalScalar128{INT64_MIN, 38, 0},
+                      DecimalScalar128{INT64_MIN, 38, 0}, 0, false),
+      std::make_tuple(DecimalScalar128{0, 0, 38, 0}, DecimalScalar128{0, 0, 38, 2}, 0,
+                      false),
+      std::make_tuple(DecimalScalar128{INT64_MAX, 38, 0},
+                      DecimalScalar128{INT64_MAX, 38, 0}, 0, false),
+
+      std::make_tuple(DecimalScalar128{BasicDecimal128(INT64_MAX / 100), 36, 0},
+                      DecimalScalar128{INT64_MAX, 38, 2}, 0, false),
+
+      // large scales
+      std::make_tuple(DecimalScalar128{BasicDecimal128{0, 0}, 22, 0},
+                      DecimalScalar128{12345, 38, 16}, 0, false),
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128{123}, 22, 0},
+          DecimalScalar128{BasicDecimal128{12389}.IncreaseScaleBy(14), 38, 16}, 0, false),
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128{-123}, 22, 0},
+          DecimalScalar128{BasicDecimal128{-12389}.IncreaseScaleBy(14), 38, 16}, 0,
+          false),
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128{123}, 6, 0},
+          DecimalScalar128{BasicDecimal128{12389}.IncreaseScaleBy(30), 38, 32}, 0, false),
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128{-123}, 6, 0},
+          DecimalScalar128{BasicDecimal128{-12389}.IncreaseScaleBy(30), 38, 32}, 0,
+          false),
+
+      // scale bigger than arg (input is expected back unchanged, no overflow)
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128{12389}.IncreaseScaleBy(32), 38, 32},
+          DecimalScalar128{BasicDecimal128{12389}.IncreaseScaleBy(32), 38, 32}, 35,
+          false),
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128{-12389}.IncreaseScaleBy(32), 38, 32},
+          DecimalScalar128{BasicDecimal128{-12389}.IncreaseScaleBy(32), 38, 32}, 35,
+          false),
+  };
+
+  // Iterate by const reference so the tuple is not copied on every iteration.
+  for (const auto& iter : test_values) {
+    auto expected = std::get<0>(iter);
+    auto input = std::get<1>(iter);
+    auto rounding_scale = std::get<2>(iter);
+    auto expected_overflow = std::get<3>(iter);
+    bool overflow = false;
+
+    EXPECT_EQ(expected.value(),
+              decimalops::Truncate(input, expected.precision(), expected.scale(),
+                                   rounding_scale, &overflow))
+        << " failed on input " << input << " rounding scale " << rounding_scale;
+    if (expected_overflow) {
+      ASSERT_TRUE(overflow) << "overflow expected for input " << input;
+    } else {
+      ASSERT_FALSE(overflow) << "overflow not expected for input " << input;
+    }
+  }
+}
+
+// Table-driven check of decimalops::Ceil. Every row holds the expected integer
+// value, the input scalar and the expected overflow flag.
+TEST_F(TestDecimalSql, Ceil) {
+  // expected, input, overflow
+  const std::vector<std::tuple<BasicDecimal128, DecimalScalar128, bool>> test_values = {
+      // https://dev.mysql.com/doc/refman/5.7/en/mathematical-functions.html#function_ceil
+      std::make_tuple(2, DecimalScalar128{123, 38, 2}, false),
+      std::make_tuple(-1, DecimalScalar128{-123, 38, 2}, false),
+
+      // border cases
+      std::make_tuple(BasicDecimal128{INT64_MIN / 100},
+                      DecimalScalar128{INT64_MIN, 38, 2}, false),
+
+      std::make_tuple(INT64_MIN, DecimalScalar128{INT64_MIN, 38, 0}, false),
+      std::make_tuple(BasicDecimal128{0, 0}, DecimalScalar128{0, 0, 38, 2}, false),
+      std::make_tuple(INT64_MAX, DecimalScalar128{INT64_MAX, 38, 0}, false),
+
+      std::make_tuple(BasicDecimal128(INT64_MAX / 100 + 1),
+                      DecimalScalar128{INT64_MAX, 38, 2}, false),
+
+      // large scales
+      std::make_tuple(BasicDecimal128{0, 1}, DecimalScalar128{12345, 38, 16}, false),
+      std::make_tuple(
+          BasicDecimal128{124},
+          DecimalScalar128{BasicDecimal128{12389}.IncreaseScaleBy(14), 38, 16}, false),
+      std::make_tuple(
+          BasicDecimal128{-123},
+          DecimalScalar128{BasicDecimal128{-12389}.IncreaseScaleBy(14), 38, 16}, false),
+      std::make_tuple(
+          BasicDecimal128{124},
+          DecimalScalar128{BasicDecimal128{12389}.IncreaseScaleBy(30), 38, 32}, false),
+      std::make_tuple(
+          BasicDecimal128{-123},
+          DecimalScalar128{BasicDecimal128{-12389}.IncreaseScaleBy(30), 38, 32}, false),
+  };
+
+  // Iterate by const reference so the tuple is not copied on every iteration.
+  for (const auto& iter : test_values) {
+    auto expected = std::get<0>(iter);
+    auto input = std::get<1>(iter);
+    auto expected_overflow = std::get<2>(iter);
+    bool overflow = false;
+
+    EXPECT_EQ(expected, decimalops::Ceil(input, &overflow))
+        << " failed on input " << input;
+    if (expected_overflow) {
+      ASSERT_TRUE(overflow) << "overflow expected for input " << input;
+    } else {
+      ASSERT_FALSE(overflow) << "overflow not expected for input " << input;
+    }
+  }
+}
+
+// Table-driven check of decimalops::Floor. Every row holds the expected integer
+// value, the input scalar and the expected overflow flag.
+TEST_F(TestDecimalSql, Floor) {
+  // expected, input, overflow
+  const std::vector<std::tuple<BasicDecimal128, DecimalScalar128, bool>> test_values = {
+      // https://dev.mysql.com/doc/refman/5.7/en/mathematical-functions.html#function_floor
+      std::make_tuple(1, DecimalScalar128{123, 38, 2}, false),
+      std::make_tuple(-2, DecimalScalar128{-123, 38, 2}, false),
+
+      // border cases
+      std::make_tuple(BasicDecimal128{INT64_MIN / 100 - 1},
+                      DecimalScalar128{INT64_MIN, 38, 2}, false),
+
+      std::make_tuple(INT64_MIN, DecimalScalar128{INT64_MIN, 38, 0}, false),
+      std::make_tuple(BasicDecimal128{0, 0}, DecimalScalar128{0, 0, 38, 2}, false),
+      std::make_tuple(INT64_MAX, DecimalScalar128{INT64_MAX, 38, 0}, false),
+
+      std::make_tuple(BasicDecimal128{INT64_MAX / 100},
+                      DecimalScalar128{INT64_MAX, 38, 2}, false),
+
+      // large scales
+      std::make_tuple(BasicDecimal128{0, 0}, DecimalScalar128{12345, 38, 16}, false),
+      std::make_tuple(
+          BasicDecimal128{123},
+          DecimalScalar128{BasicDecimal128{12389}.IncreaseScaleBy(14), 38, 16}, false),
+      std::make_tuple(
+          BasicDecimal128{-124},
+          DecimalScalar128{BasicDecimal128{-12389}.IncreaseScaleBy(14), 38, 16}, false),
+      std::make_tuple(
+          BasicDecimal128{123},
+          DecimalScalar128{BasicDecimal128{12389}.IncreaseScaleBy(30), 38, 32}, false),
+      std::make_tuple(
+          BasicDecimal128{-124},
+          DecimalScalar128{BasicDecimal128{-12389}.IncreaseScaleBy(30), 38, 32}, false),
+  };
+
+  // Iterate by const reference so the tuple is not copied on every iteration.
+  for (const auto& iter : test_values) {
+    auto expected = std::get<0>(iter);
+    auto input = std::get<1>(iter);
+    auto expected_overflow = std::get<2>(iter);
+    bool overflow = false;
+
+    EXPECT_EQ(expected, decimalops::Floor(input, &overflow))
+        << " failed on input " << input;
+    if (expected_overflow) {
+      ASSERT_TRUE(overflow) << "overflow expected for input " << input;
+    } else {
+      ASSERT_FALSE(overflow) << "overflow not expected for input " << input;
+    }
+  }
+}
+
+// Table-driven check of decimalops::Convert. Every row holds the expected scalar
+// (whose precision/scale are passed as the target type), the input scalar and
+// the expected overflow flag.
+TEST_F(TestDecimalSql, Convert) {
+  // expected, input, overflow
+  const std::vector<std::tuple<DecimalScalar128, DecimalScalar128, bool>> test_values = {
+      // simple cases
+      std::make_tuple(DecimalScalar128{12, 38, 1}, DecimalScalar128{123, 38, 2}, false),
+      std::make_tuple(DecimalScalar128{1230, 38, 3}, DecimalScalar128{123, 38, 2}, false),
+      std::make_tuple(DecimalScalar128{123, 38, 2}, DecimalScalar128{123, 38, 2}, false),
+
+      std::make_tuple(DecimalScalar128{-12, 38, 1}, DecimalScalar128{-123, 38, 2}, false),
+      std::make_tuple(DecimalScalar128{-1230, 38, 3}, DecimalScalar128{-123, 38, 2},
+                      false),
+      std::make_tuple(DecimalScalar128{-123, 38, 2}, DecimalScalar128{-123, 38, 2},
+                      false),
+
+      // border cases
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128(INT64_MIN).ReduceScaleBy(1), 38, 1},
+          DecimalScalar128{INT64_MIN, 38, 2}, false),
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128(INT64_MIN).IncreaseScaleBy(1), 38, 3},
+          DecimalScalar128{INT64_MIN, 38, 2}, false),
+      std::make_tuple(DecimalScalar128{-3, 38, 1}, DecimalScalar128{-32, 38, 2}, false),
+      std::make_tuple(DecimalScalar128{0, 0, 38, 1}, DecimalScalar128{0, 0, 38, 2},
+                      false),
+      std::make_tuple(DecimalScalar128{3, 38, 1}, DecimalScalar128{32, 38, 2}, false),
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128(INT64_MAX).ReduceScaleBy(1), 38, 1},
+          DecimalScalar128{INT64_MAX, 38, 2}, false),
+      std::make_tuple(
+          DecimalScalar128{BasicDecimal128(INT64_MAX).IncreaseScaleBy(1), 38, 3},
+          DecimalScalar128{INT64_MAX, 38, 2}, false),
+
+      // large scales
+      std::make_tuple(DecimalScalar128{BasicDecimal128(123).IncreaseScaleBy(16), 38, 18},
+                      DecimalScalar128{123, 38, 2}, false),
+      std::make_tuple(DecimalScalar128{BasicDecimal128(-123).IncreaseScaleBy(16), 38, 18},
+                      DecimalScalar128{-123, 38, 2}, false),
+      std::make_tuple(DecimalScalar128{BasicDecimal128(123).IncreaseScaleBy(30), 38, 32},
+                      DecimalScalar128{123, 38, 2}, false),
+      std::make_tuple(DecimalScalar128{BasicDecimal128(-123).IncreaseScaleBy(30), 38, 32},
+                      DecimalScalar128{-123, 38, 2}, false),
+
+      // overflow due to scaling up.
+      std::make_tuple(DecimalScalar128{0, 0, 38, 36}, DecimalScalar128{12345, 38, 2},
+                      true),
+      std::make_tuple(DecimalScalar128{0, 0, 38, 36}, DecimalScalar128{-12345, 38, 2},
+                      true),
+
+      // overflow due to precision.
+      std::make_tuple(DecimalScalar128{0, 0, 5, 3}, DecimalScalar128{12345, 5, 2}, true),
+  };
+
+  // Iterate by const reference so the tuple is not copied on every iteration.
+  for (const auto& iter : test_values) {
+    auto expected = std::get<0>(iter);
+    auto input = std::get<1>(iter);
+    auto expected_overflow = std::get<2>(iter);
+    bool overflow = false;
+
+    EXPECT_EQ(expected.value(), decimalops::Convert(input, expected.precision(),
+                                                    expected.scale(), &overflow))
+        << " failed on input " << input;
+
+    if (expected_overflow) {
+      ASSERT_TRUE(overflow) << "overflow expected for input " << input;
+    } else {
+      ASSERT_FALSE(overflow) << "overflow not expected for input " << input;
+    }
+  }
+}
+
+// 2^53; every integer of magnitude up to this value is exactly representable as a double
+static const int64_t kMaxDoubleInt = 1ull << 53;
+
+TEST_F(TestDecimalSql, FromDouble) {
+  // Each case is {expected, input double, overflow expected}. The precision and
+  // scale of the expected scalar are the ones requested from FromDouble.
+  using FromDoubleCase = std::tuple<DecimalScalar128, double, bool>;
+  const std::vector<FromDoubleCase> cases = {
+      // simple cases
+      std::make_tuple(DecimalScalar128{-16285, 38, 3}, -16.285, false),
+      std::make_tuple(DecimalScalar128{-162850, 38, 4}, -16.285, false),
+      std::make_tuple(DecimalScalar128{-1629, 38, 2}, -16.285, false),
+
+      std::make_tuple(DecimalScalar128{16285, 38, 3}, 16.285, false),
+      std::make_tuple(DecimalScalar128{162850, 38, 4}, 16.285, false),
+      std::make_tuple(DecimalScalar128{1629, 38, 2}, 16.285, false),
+
+      // round up
+      std::make_tuple(DecimalScalar128{1, 18, 0}, 1.15470053838, false),
+      std::make_tuple(DecimalScalar128{-1, 18, 0}, -1.15470053838, false),
+      std::make_tuple(DecimalScalar128{2, 18, 0}, 1.55470053838, false),
+      std::make_tuple(DecimalScalar128{-2, 18, 0}, -1.55470053838, false),
+
+      // border cases
+      std::make_tuple(DecimalScalar128{-kMaxDoubleInt, 38, 0},
+                      static_cast<double>(-kMaxDoubleInt), false),
+      std::make_tuple(DecimalScalar128{-32, 38, 0}, -32, false),
+      std::make_tuple(DecimalScalar128{0, 0, 38, 0}, 0, false),
+      std::make_tuple(DecimalScalar128{32, 38, 0}, 32, false),
+      std::make_tuple(DecimalScalar128{kMaxDoubleInt, 38, 0},
+                      static_cast<double>(kMaxDoubleInt), false),
+
+      // large scales
+      std::make_tuple(DecimalScalar128{123, 38, 16}, 1.23E-14, false),
+      std::make_tuple(DecimalScalar128{123, 38, 32}, 1.23E-30, false),
+      std::make_tuple(DecimalScalar128{1230, 38, 33}, 1.23E-30, false),
+      std::make_tuple(DecimalScalar128{123, 38, 38}, 1.23E-36, false),
+
+      // very small doubles
+      std::make_tuple(DecimalScalar128{0, 0, 38, 0}, std::numeric_limits<double>::min(),
+                      false),
+      std::make_tuple(DecimalScalar128{0, 0, 38, 0}, -std::numeric_limits<double>::min(),
+                      false),
+
+      // overflow due to large -ve double
+      std::make_tuple(DecimalScalar128{0, 0, 38, 0}, -std::numeric_limits<double>::max(),
+                      true),
+      // overflow due to large +ve double
+      std::make_tuple(DecimalScalar128{0, 0, 38, 0}, std::numeric_limits<double>::max(),
+                      true),
+      // overflow due to scaling up.
+      std::make_tuple(DecimalScalar128{0, 0, 38, 36}, 123.45, true),
+      // overflow due to precision.
+      std::make_tuple(DecimalScalar128{0, 0, 4, 2}, 12345.67, true),
+  };
+
+  for (const auto& entry : cases) {
+    auto want = std::get<0>(entry);
+    const double input = std::get<1>(entry);
+    const bool want_overflow = std::get<2>(entry);
+    bool overflow = false;
+
+    EXPECT_EQ(want.value(),
+              decimalops::FromDouble(input, want.precision(), want.scale(), &overflow))
+        << " failed on input " << input;
+
+    // The overflow flag must match the expectation exactly; a mismatch aborts the test.
+    if (want_overflow) {
+      ASSERT_TRUE(overflow) << "overflow expected for input " << input;
+    } else {
+      ASSERT_FALSE(overflow) << "overflow not expected for input " << input;
+    }
+  }
+}
+
+// Passes when x and y differ by at most 0.00001 in either direction. The previous
+// form tested `x - y <= 0.00001` without an absolute value, so it accepted any y
+// larger than x; EXPECT_NEAR compares |x - y| and evaluates each argument once.
+#define EXPECT_FUZZY_EQ(x, y) EXPECT_NEAR((x), (y), 0.00001)
+
+TEST_F(TestDecimalSql, ToDouble) {
+  // Each case is {expected double, input decimal}. None of the inputs is expected
+  // to overflow, so there is no overflow column.
+  using ToDoubleCase = std::tuple<double, DecimalScalar128>;
+  const std::vector<ToDoubleCase> cases = {
+      // simple ones
+      std::make_tuple(-16.285, DecimalScalar128{-16285, 38, 3}),
+      std::make_tuple(-162.85, DecimalScalar128{-16285, 38, 2}),
+      std::make_tuple(-1.6285, DecimalScalar128{-16285, 38, 4}),
+
+      // large scales
+      std::make_tuple(1.23E-14, DecimalScalar128{123, 38, 16}),
+      std::make_tuple(1.23E-30, DecimalScalar128{123, 38, 32}),
+      std::make_tuple(1.23E-36, DecimalScalar128{123, 38, 38}),
+
+      // border cases
+      std::make_tuple(static_cast<double>(-kMaxDoubleInt),
+                      DecimalScalar128{-kMaxDoubleInt, 38, 0}),
+      std::make_tuple(-32, DecimalScalar128{-32, 38, 0}),
+      std::make_tuple(0, DecimalScalar128{0, 0, 38, 0}),
+      std::make_tuple(32, DecimalScalar128{32, 38, 0}),
+      std::make_tuple(static_cast<double>(kMaxDoubleInt),
+                      DecimalScalar128{kMaxDoubleInt, 38, 0}),
+  };
+
+  for (const auto& entry : cases) {
+    auto input = std::get<1>(entry);
+    bool overflow = false;
+
+    EXPECT_FUZZY_EQ(std::get<0>(entry), decimalops::ToDouble(input, &overflow));
+    ASSERT_FALSE(overflow) << "overflow not expected for input " << input;
+  }
+}
+
+TEST_F(TestDecimalSql, FromInt64) {
+  // Each case is {expected, input integer, overflow expected}. The precision and
+  // scale of the expected scalar are the ones requested from FromInt64.
+  using FromInt64Case = std::tuple<DecimalScalar128, int64_t, bool>;
+  const std::vector<FromInt64Case> cases = {
+      // simple cases
+      std::make_tuple(DecimalScalar128{-16000, 38, 3}, -16, false),
+      std::make_tuple(DecimalScalar128{-160000, 38, 4}, -16, false),
+      std::make_tuple(DecimalScalar128{-1600, 38, 2}, -16, false),
+
+      std::make_tuple(DecimalScalar128{16000, 38, 3}, 16, false),
+      std::make_tuple(DecimalScalar128{160000, 38, 4}, 16, false),
+      std::make_tuple(DecimalScalar128{1600, 38, 2}, 16, false),
+
+      // border cases
+      std::make_tuple(DecimalScalar128{INT64_MIN, 38, 0}, INT64_MIN, false),
+      std::make_tuple(DecimalScalar128{-32, 38, 0}, -32, false),
+      std::make_tuple(DecimalScalar128{0, 0, 38, 0}, 0, false),
+      std::make_tuple(DecimalScalar128{32, 38, 0}, 32, false),
+      std::make_tuple(DecimalScalar128{INT64_MAX, 38, 0}, INT64_MAX, false),
+
+      // large scales
+      std::make_tuple(DecimalScalar128{BasicDecimal128(123).IncreaseScaleBy(16), 38, 16},
+                      123, false),
+      std::make_tuple(DecimalScalar128{BasicDecimal128(123).IncreaseScaleBy(32), 38, 32},
+                      123, false),
+      std::make_tuple(DecimalScalar128{BasicDecimal128(-123).IncreaseScaleBy(16), 38, 16},
+                      -123, false),
+      std::make_tuple(DecimalScalar128{BasicDecimal128(-123).IncreaseScaleBy(32), 38, 32},
+                      -123, false),
+
+      // overflow due to scaling up.
+      std::make_tuple(DecimalScalar128{0, 0, 38, 36}, 123, true),
+      // overflow due to precision.
+      std::make_tuple(DecimalScalar128{0, 0, 4, 2}, 12345, true),
+  };
+
+  for (const auto& entry : cases) {
+    auto want = std::get<0>(entry);
+    const int64_t input = std::get<1>(entry);
+    const bool want_overflow = std::get<2>(entry);
+    bool overflow = false;
+
+    EXPECT_EQ(want.value(),
+              decimalops::FromInt64(input, want.precision(), want.scale(), &overflow))
+        << " failed on input " << input;
+
+    // The overflow flag must match the expectation exactly; a mismatch aborts the test.
+    if (want_overflow) {
+      ASSERT_TRUE(overflow) << "overflow expected for input " << input;
+    } else {
+      ASSERT_FALSE(overflow) << "overflow not expected for input " << input;
+    }
+  }
+}
+
+TEST_F(TestDecimalSql, ToInt64) {
+  // Each case is {expected integer, input decimal, overflow expected}.
+  using ToInt64Case = std::tuple<int64_t, DecimalScalar128, bool>;
+  const std::vector<ToInt64Case> cases = {
+      // simple ones
+      std::make_tuple(-16, DecimalScalar128{-16285, 38, 3}, false),
+      std::make_tuple(-163, DecimalScalar128{-16285, 38, 2}, false),
+      std::make_tuple(-2, DecimalScalar128{-16285, 38, 4}, false),
+
+      // border cases
+      std::make_tuple(INT64_MIN, DecimalScalar128{INT64_MIN, 38, 0}, false),
+      std::make_tuple(-32, DecimalScalar128{-32, 38, 0}, false),
+      std::make_tuple(0, DecimalScalar128{0, 0, 38, 0}, false),
+      std::make_tuple(32, DecimalScalar128{32, 38, 0}, false),
+      std::make_tuple(INT64_MAX, DecimalScalar128{INT64_MAX, 38, 0}, false),
+
+      // large scales
+      std::make_tuple(0, DecimalScalar128{123, 38, 16}, false),
+      std::make_tuple(0, DecimalScalar128{123, 38, 32}, false),
+      std::make_tuple(0, DecimalScalar128{123, 38, 38}, false),
+
+      // overflow test cases
+      // very large
+      std::make_tuple(0, DecimalScalar128{32768, 16, 38, 2}, true),
+      std::make_tuple(0, DecimalScalar128{INT64_MAX, UINT64_MAX, 38, 10}, true),
+      // very small
+      std::make_tuple(0, -DecimalScalar128{32768, 16, 38, 2}, true),
+      std::make_tuple(0, -DecimalScalar128{INT64_MAX, UINT64_MAX, 38, 10}, true),
+  };
+
+  for (const auto& entry : cases) {
+    const int64_t want = std::get<0>(entry);
+    auto input = std::get<1>(entry);
+    const bool want_overflow = std::get<2>(entry);
+    bool overflow = false;
+
+    EXPECT_EQ(want, decimalops::ToInt64(input, &overflow))
+        << " failed on input " << input;
+
+    // The overflow flag must match the expectation exactly; a mismatch aborts the test.
+    if (want_overflow) {
+      ASSERT_TRUE(overflow) << "overflow expected for input " << input;
+    } else {
+      ASSERT_FALSE(overflow) << "overflow not expected for input " << input;
+    }
+  }
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/precompiled/decimal_wrapper.cc b/src/arrow/cpp/src/gandiva/precompiled/decimal_wrapper.cc
new file mode 100644
index 000000000..082d5832d
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/decimal_wrapper.cc
@@ -0,0 +1,433 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/precompiled/decimal_ops.h"
+#include "gandiva/precompiled/types.h"
+
+extern "C" {
+
+// Adds two decimals passed as (high, low, precision, scale) and writes the two
+// 64-bit halves of the sum to *out_high / *out_low.
+FORCE_INLINE
+void add_large_decimal128_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                                     int32_t x_scale, int64_t y_high, uint64_t y_low,
+                                     int32_t y_precision, int32_t y_scale,
+                                     int32_t out_precision, int32_t out_scale,
+                                     int64_t* out_high, uint64_t* out_low) {
+  gandiva::BasicDecimalScalar128 lhs(x_high, x_low, x_precision, x_scale);
+  gandiva::BasicDecimalScalar128 rhs(y_high, y_low, y_precision, y_scale);
+
+  arrow::BasicDecimal128 sum =
+      gandiva::decimalops::Add(lhs, rhs, out_precision, out_scale);
+
+  *out_high = sum.high_bits();
+  *out_low = sum.low_bits();
+}
+
+// Multiplies two decimals passed as (high, low, precision, scale) and writes the
+// two 64-bit halves of the product to *out_high / *out_low.
+FORCE_INLINE
+void multiply_decimal128_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                                    int32_t x_scale, int64_t y_high, uint64_t y_low,
+                                    int32_t y_precision, int32_t y_scale,
+                                    int32_t out_precision, int32_t out_scale,
+                                    int64_t* out_high, uint64_t* out_low) {
+  gandiva::BasicDecimalScalar128 x(x_high, x_low, x_precision, x_scale);
+  gandiva::BasicDecimalScalar128 y(y_high, y_low, y_precision, y_scale);
+  // Start from a defined value, as the other wrappers in this file do; the flag
+  // is not acted upon yet.
+  bool overflow = false;
+
+  // TODO ravindra: generate error on overflows (ARROW-4570).
+  arrow::BasicDecimal128 out =
+      gandiva::decimalops::Multiply(x, y, out_precision, out_scale, &overflow);
+  *out_high = out.high_bits();
+  *out_low = out.low_bits();
+}
+
+// Divides x by y (both passed as high, low, precision, scale) and writes the two
+// 64-bit halves of the quotient to *out_high / *out_low. `context` is forwarded
+// to decimalops::Divide unchanged.
+FORCE_INLINE
+void divide_decimal128_decimal128(int64_t context, int64_t x_high, uint64_t x_low,
+                                  int32_t x_precision, int32_t x_scale, int64_t y_high,
+                                  uint64_t y_low, int32_t y_precision, int32_t y_scale,
+                                  int32_t out_precision, int32_t out_scale,
+                                  int64_t* out_high, uint64_t* out_low) {
+  gandiva::BasicDecimalScalar128 x(x_high, x_low, x_precision, x_scale);
+  gandiva::BasicDecimalScalar128 y(y_high, y_low, y_precision, y_scale);
+  // Start from a defined value, as the other wrappers in this file do; the flag
+  // is not acted upon yet.
+  bool overflow = false;
+
+  // TODO ravindra: generate error on overflows (ARROW-4570).
+  arrow::BasicDecimal128 out =
+      gandiva::decimalops::Divide(context, x, y, out_precision, out_scale, &overflow);
+  *out_high = out.high_bits();
+  *out_low = out.low_bits();
+}
+
+// Computes x mod y (both passed as high, low, precision, scale) and writes the
+// two 64-bit halves of the result to *out_high / *out_low. `context` is
+// forwarded to decimalops::Mod unchanged.
+FORCE_INLINE
+void mod_decimal128_decimal128(int64_t context, int64_t x_high, uint64_t x_low,
+                               int32_t x_precision, int32_t x_scale, int64_t y_high,
+                               uint64_t y_low, int32_t y_precision, int32_t y_scale,
+                               int32_t out_precision, int32_t out_scale,
+                               int64_t* out_high, uint64_t* out_low) {
+  gandiva::BasicDecimalScalar128 x(x_high, x_low, x_precision, x_scale);
+  gandiva::BasicDecimalScalar128 y(y_high, y_low, y_precision, y_scale);
+  // Start from a defined value, as the other wrappers in this file do; the flag
+  // is not acted upon yet.
+  bool overflow = false;
+
+  // TODO ravindra: generate error on overflows (ARROW-4570).
+  arrow::BasicDecimal128 out =
+      gandiva::decimalops::Mod(context, x, y, out_precision, out_scale, &overflow);
+  *out_high = out.high_bits();
+  *out_low = out.low_bits();
+}
+
+// Returns the result of decimalops::Compare for the two decimals, each passed
+// as (high, low, precision, scale).
+FORCE_INLINE
+int32_t compare_decimal128_decimal128_internal(int64_t x_high, uint64_t x_low,
+                                               int32_t x_precision, int32_t x_scale,
+                                               int64_t y_high, uint64_t y_low,
+                                               int32_t y_precision, int32_t y_scale) {
+  gandiva::BasicDecimalScalar128 lhs(x_high, x_low, x_precision, x_scale);
+  gandiva::BasicDecimalScalar128 rhs(y_high, y_low, y_precision, y_scale);
+  return gandiva::decimalops::Compare(lhs, rhs);
+}
+
+// Writes the two 64-bit halves of the value after BasicDecimal128::Abs() to
+// *out_high / *out_low. The precision and scale arguments are not used.
+FORCE_INLINE
+void abs_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision, int32_t x_scale,
+                    int32_t out_precision, int32_t out_scale, int64_t* out_high,
+                    uint64_t* out_low) {
+  gandiva::BasicDecimal128 magnitude(x_high, x_low);
+  magnitude.Abs();
+
+  *out_high = magnitude.high_bits();
+  *out_low = magnitude.low_bits();
+}
+
+// Applies decimalops::Ceil to the input decimal and writes the two 64-bit halves
+// of the result to *out_high / *out_low. The overflow flag is discarded and the
+// out_precision / out_scale arguments are not used.
+FORCE_INLINE
+void ceil_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision, int32_t x_scale,
+                     int32_t out_precision, int32_t out_scale, int64_t* out_high,
+                     uint64_t* out_low) {
+  bool overflow = false;
+  gandiva::BasicDecimalScalar128 input({x_high, x_low}, x_precision, x_scale);
+
+  auto ceiled = gandiva::decimalops::Ceil(input, &overflow);
+  *out_high = ceiled.high_bits();
+  *out_low = ceiled.low_bits();
+}
+
+// Applies decimalops::Floor to the input decimal and writes the two 64-bit
+// halves of the result to *out_high / *out_low. The overflow flag is discarded
+// and the out_precision / out_scale arguments are not used.
+FORCE_INLINE
+void floor_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                      int32_t x_scale, int32_t out_precision, int32_t out_scale,
+                      int64_t* out_high, uint64_t* out_low) {
+  bool overflow = false;
+  gandiva::BasicDecimalScalar128 input({x_high, x_low}, x_precision, x_scale);
+
+  auto floored = gandiva::decimalops::Floor(input, &overflow);
+  *out_high = floored.high_bits();
+  *out_low = floored.low_bits();
+}
+
+// Rounds the input decimal via decimalops::Round, passing 0 for both the output
+// scale and the rounding scale, and writes the two 64-bit halves of the result
+// to *out_high / *out_low. The overflow flag is discarded; out_scale is not used.
+FORCE_INLINE
+void round_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                      int32_t x_scale, int32_t out_precision, int32_t out_scale,
+                      int64_t* out_high, uint64_t* out_low) {
+  bool overflow = false;
+  gandiva::BasicDecimalScalar128 input({x_high, x_low}, x_precision, x_scale);
+
+  auto rounded = gandiva::decimalops::Round(input, out_precision, 0, 0, &overflow);
+  *out_high = rounded.high_bits();
+  *out_low = rounded.low_bits();
+}
+
+// Rounds the input decimal via decimalops::Round using the caller-supplied
+// rounding_scale and output precision/scale, then writes the two 64-bit halves
+// of the result to *out_high / *out_low. The overflow flag is discarded.
+FORCE_INLINE
+void round_decimal128_int32(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                            int32_t x_scale, int32_t rounding_scale,
+                            int32_t out_precision, int32_t out_scale, int64_t* out_high,
+                            uint64_t* out_low) {
+  bool overflow = false;
+  gandiva::BasicDecimalScalar128 input({x_high, x_low}, x_precision, x_scale);
+
+  auto rounded = gandiva::decimalops::Round(input, out_precision, out_scale,
+                                            rounding_scale, &overflow);
+  *out_high = rounded.high_bits();
+  *out_low = rounded.low_bits();
+}
+
+// Truncates the input decimal via decimalops::Truncate, passing 0 for both the
+// output scale and the rounding scale, and writes the two 64-bit halves of the
+// result to *out_high / *out_low. The overflow flag is discarded; out_scale is
+// not used.
+FORCE_INLINE
+void truncate_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                         int32_t x_scale, int32_t out_precision, int32_t out_scale,
+                         int64_t* out_high, uint64_t* out_low) {
+  bool overflow = false;
+  gandiva::BasicDecimalScalar128 input({x_high, x_low}, x_precision, x_scale);
+
+  auto truncated =
+      gandiva::decimalops::Truncate(input, out_precision, 0, 0, &overflow);
+  *out_high = truncated.high_bits();
+  *out_low = truncated.low_bits();
+}
+
+// Truncates the input decimal via decimalops::Truncate using the caller-supplied
+// rounding_scale and output precision/scale, then writes the two 64-bit halves
+// of the result to *out_high / *out_low. The overflow flag is discarded.
+FORCE_INLINE
+void truncate_decimal128_int32(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                               int32_t x_scale, int32_t rounding_scale,
+                               int32_t out_precision, int32_t out_scale,
+                               int64_t* out_high, uint64_t* out_low) {
+  bool overflow = false;
+  gandiva::BasicDecimalScalar128 input({x_high, x_low}, x_precision, x_scale);
+
+  auto truncated = gandiva::decimalops::Truncate(input, out_precision, out_scale,
+                                                 rounding_scale, &overflow);
+  *out_high = truncated.high_bits();
+  *out_low = truncated.low_bits();
+}
+
+// Converts the input decimal to double through decimalops::ToDouble. The
+// overflow flag reported by the conversion is discarded.
+FORCE_INLINE
+double castFLOAT8_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                             int32_t x_scale) {
+  bool overflow = false;
+  gandiva::BasicDecimalScalar128 input({x_high, x_low}, x_precision, x_scale);
+  return gandiva::decimalops::ToDouble(input, &overflow);
+}
+
+// Converts the input decimal to int64 through decimalops::ToInt64. The overflow
+// flag reported by the conversion is discarded.
+FORCE_INLINE
+int64_t castBIGINT_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                              int32_t x_scale) {
+  bool overflow = false;
+  gandiva::BasicDecimalScalar128 input({x_high, x_low}, x_precision, x_scale);
+  return gandiva::decimalops::ToInt64(input, &overflow);
+}
+
+// Converts an int64 to a decimal of the given precision and scale through
+// decimalops::FromInt64 and writes its two 64-bit halves to *out_high /
+// *out_low. The overflow flag is discarded.
+FORCE_INLINE
+void castDECIMAL_int64(int64_t in, int32_t x_precision, int32_t x_scale,
+                       int64_t* out_high, uint64_t* out_low) {
+  bool overflow = false;
+  auto converted =
+      gandiva::decimalops::FromInt64(in, x_precision, x_scale, &overflow);
+
+  *out_high = converted.high_bits();
+  *out_low = converted.low_bits();
+}
+
+// int32 flavour of the integer-to-decimal cast: widens the input to int64 and
+// delegates to castDECIMAL_int64.
+FORCE_INLINE
+void castDECIMAL_int32(int32_t in, int32_t x_precision, int32_t x_scale,
+                       int64_t* out_high, uint64_t* out_low) {
+  const int64_t widened = in;
+  castDECIMAL_int64(widened, x_precision, x_scale, out_high, out_low);
+}
+
+// Converts a double to a decimal of the given precision and scale through
+// decimalops::FromDouble and writes its two 64-bit halves to *out_high /
+// *out_low. The overflow flag is discarded.
+FORCE_INLINE
+void castDECIMAL_float64(double in, int32_t x_precision, int32_t x_scale,
+                         int64_t* out_high, uint64_t* out_low) {
+  bool overflow = false;
+  auto converted =
+      gandiva::decimalops::FromDouble(in, x_precision, x_scale, &overflow);
+
+  *out_high = converted.high_bits();
+  *out_low = converted.low_bits();
+}
+
+// float flavour of the floating-point-to-decimal cast: widens the input to
+// double and delegates to castDECIMAL_float64.
+FORCE_INLINE
+void castDECIMAL_float32(float in, int32_t x_precision, int32_t x_scale,
+                         int64_t* out_high, uint64_t* out_low) {
+  const double widened = in;
+  castDECIMAL_float64(widened, x_precision, x_scale, out_high, out_low);
+}
+
+// Converts the input decimal to out_precision / out_scale through
+// decimalops::Convert, writes the two 64-bit halves of the result to *out_high /
+// *out_low, and returns the overflow flag set by the conversion.
+FORCE_INLINE
+bool castDecimal_internal(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                          int32_t x_scale, int32_t out_precision, int32_t out_scale,
+                          int64_t* out_high, int64_t* out_low) {
+  bool overflowed = false;
+  gandiva::BasicDecimalScalar128 input({x_high, x_low}, x_precision, x_scale);
+
+  auto converted =
+      gandiva::decimalops::Convert(input, out_precision, out_scale, &overflowed);
+  *out_high = converted.high_bits();
+  *out_low = converted.low_bits();
+
+  return overflowed;
+}
+
+// Decimal-to-decimal cast: forwards every argument to castDecimal_internal and
+// ignores the overflow flag it returns.
+FORCE_INLINE
+void castDECIMAL_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+ int32_t x_scale, int32_t out_precision, int32_t out_scale,
+ int64_t* out_high, int64_t* out_low) {
+ castDecimal_internal(x_high, x_low, x_precision, x_scale, out_precision, out_scale,
+ out_high, out_low);
+}
+
+// Decimal-to-decimal cast that reports validity through *out_valid: the result
+// is marked invalid when the input is invalid (in which case no conversion is
+// attempted and the outputs are not written) or when the conversion overflows.
+FORCE_INLINE
+void castDECIMALNullOnOverflow_decimal128(int64_t x_high, uint64_t x_low,
+                                          int32_t x_precision, int32_t x_scale,
+                                          bool x_isvalid, bool* out_valid,
+                                          int32_t out_precision, int32_t out_scale,
+                                          int64_t* out_high, int64_t* out_low) {
+  if (!x_isvalid) {
+    *out_valid = false;
+    return;
+  }
+
+  const bool overflowed =
+      castDecimal_internal(x_high, x_low, x_precision, x_scale, out_precision,
+                           out_scale, out_high, out_low);
+  *out_valid = !overflowed;
+}
+
+// 32-bit hash of a decimal: 0 for an invalid input, otherwise hash32_buf over
+// the 16 bytes returned by BasicDecimal128::ToBytes() with seed 0.
+FORCE_INLINE
+int32_t hash32_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                          int32_t x_scale, gdv_boolean x_isvalid) {
+  if (!x_isvalid) {
+    return 0;
+  }
+  auto bytes = gandiva::BasicDecimal128(x_high, x_low).ToBytes();
+  return hash32_buf(bytes.data(), 16, 0);
+}
+
+// Same result as hash32_decimal128: forwards all arguments to it unchanged.
+FORCE_INLINE
+int32_t hash_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+ int32_t x_scale, gdv_boolean x_isvalid) {
+ return hash32_decimal128(x_high, x_low, x_precision, x_scale, x_isvalid);
+}
+
+// 64-bit hash of a decimal: 0 for an invalid input, otherwise hash64_buf over
+// the 16 bytes returned by BasicDecimal128::ToBytes() with seed 0.
+FORCE_INLINE
+int64_t hash64_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                          int32_t x_scale, gdv_boolean x_isvalid) {
+  if (!x_isvalid) {
+    return 0;
+  }
+  auto bytes = gandiva::BasicDecimal128(x_high, x_low).ToBytes();
+  return hash64_buf(bytes.data(), 16, 0);
+}
+
+// Seeded 32-bit hash of a decimal: an invalid input yields the seed itself,
+// otherwise hash32_buf over the 16 bytes from BasicDecimal128::ToBytes().
+// seed_isvalid is not consulted.
+FORCE_INLINE
+int32_t hash32WithSeed_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                                  int32_t x_scale, gdv_boolean x_isvalid, int32_t seed,
+                                  gdv_boolean seed_isvalid) {
+  if (x_isvalid) {
+    auto bytes = gandiva::BasicDecimal128(x_high, x_low).ToBytes();
+    return hash32_buf(bytes.data(), 16, seed);
+  }
+  return seed;
+}
+
+// Seeded 64-bit hash of a decimal: an invalid input yields the seed itself,
+// otherwise hash64_buf over the 16 bytes from BasicDecimal128::ToBytes().
+// seed_isvalid is not consulted.
+FORCE_INLINE
+int64_t hash64WithSeed_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                                  int32_t x_scale, gdv_boolean x_isvalid, int64_t seed,
+                                  gdv_boolean seed_isvalid) {
+  if (x_isvalid) {
+    auto bytes = gandiva::BasicDecimal128(x_high, x_low).ToBytes();
+    return hash64_buf(bytes.data(), 16, seed);
+  }
+  return seed;
+}
+
+// 32-bit hash of a decimal: 0 for an invalid input, otherwise hash32_buf over
+// the 16 bytes returned by BasicDecimal128::ToBytes() with seed 0. The raw
+// decimal bytes are hashed; no conversion to double happens here.
+FORCE_INLINE
+int32_t hash32AsDouble_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                                  int32_t x_scale, gdv_boolean x_isvalid) {
+  if (!x_isvalid) {
+    return 0;
+  }
+  auto bytes = gandiva::BasicDecimal128(x_high, x_low).ToBytes();
+  return hash32_buf(bytes.data(), 16, 0);
+}
+
+// 64-bit hash of a decimal: 0 for an invalid input, otherwise hash64_buf over
+// the 16 bytes returned by BasicDecimal128::ToBytes() with seed 0. The raw
+// decimal bytes are hashed; no conversion to double happens here.
+FORCE_INLINE
+int64_t hash64AsDouble_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+                                  int32_t x_scale, gdv_boolean x_isvalid) {
+  if (!x_isvalid) {
+    return 0;
+  }
+  auto bytes = gandiva::BasicDecimal128(x_high, x_low).ToBytes();
+  return hash64_buf(bytes.data(), 16, 0);
+}
+
+// Seeded 32-bit hash of a decimal: an invalid input yields the seed itself,
+// otherwise hash32_buf over the 16 bytes from BasicDecimal128::ToBytes(). The
+// raw decimal bytes are hashed; seed_isvalid is not consulted.
+FORCE_INLINE
+int32_t hash32AsDoubleWithSeed_decimal128(int64_t x_high, uint64_t x_low,
+                                          int32_t x_precision, int32_t x_scale,
+                                          gdv_boolean x_isvalid, int32_t seed,
+                                          gdv_boolean seed_isvalid) {
+  if (x_isvalid) {
+    auto bytes = gandiva::BasicDecimal128(x_high, x_low).ToBytes();
+    return hash32_buf(bytes.data(), 16, seed);
+  }
+  return seed;
+}
+
+// Seeded 64-bit hash of a decimal: an invalid input yields the seed itself,
+// otherwise hash64_buf over the 16 bytes from BasicDecimal128::ToBytes(). The
+// raw decimal bytes are hashed; seed_isvalid is not consulted.
+FORCE_INLINE
+int64_t hash64AsDoubleWithSeed_decimal128(int64_t x_high, uint64_t x_low,
+                                          int32_t x_precision, int32_t x_scale,
+                                          gdv_boolean x_isvalid, int64_t seed,
+                                          gdv_boolean seed_isvalid) {
+  if (x_isvalid) {
+    auto bytes = gandiva::BasicDecimal128(x_high, x_low).ToBytes();
+    return hash64_buf(bytes.data(), 16, seed);
+  }
+  return seed;
+}
+
+// Returns the negation of x_isvalid; the decimal value itself is not inspected.
+FORCE_INLINE
+gdv_boolean isnull_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+ int32_t x_scale, gdv_boolean x_isvalid) {
+ return !x_isvalid;
+}
+
+// Returns x_isvalid unchanged; the decimal value itself is not inspected.
+FORCE_INLINE
+gdv_boolean isnotnull_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+ int32_t x_scale, gdv_boolean x_isvalid) {
+ return x_isvalid;
+}
+
+// Returns x_isvalid unchanged, i.e. every valid decimal input yields true; the
+// decimal value itself is not inspected.
+FORCE_INLINE
+gdv_boolean isnumeric_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision,
+ int32_t x_scale, gdv_boolean x_isvalid) {
+ return x_isvalid;
+}
+
+// Null-aware equality: two invalid inputs are not distinct, an invalid and a
+// valid input are distinct, and two valid inputs are not distinct exactly when
+// compare_decimal128_decimal128_internal returns 0.
+FORCE_INLINE
+gdv_boolean is_not_distinct_from_decimal128_decimal128(
+    int64_t x_high, uint64_t x_low, int32_t x_precision, int32_t x_scale,
+    gdv_boolean x_isvalid, int64_t y_high, uint64_t y_low, int32_t y_precision,
+    int32_t y_scale, gdv_boolean y_isvalid) {
+  // With at least one invalid side the answer depends on validity alone.
+  if (!x_isvalid || !y_isvalid) {
+    return x_isvalid == y_isvalid;
+  }
+
+  int32_t ordering = compare_decimal128_decimal128_internal(
+      x_high, x_low, x_precision, x_scale, y_high, y_low, y_precision, y_scale);
+  return ordering == 0;
+}
+
+// Logical negation of is_not_distinct_from_decimal128_decimal128 for the same
+// arguments.
+FORCE_INLINE
+gdv_boolean is_distinct_from_decimal128_decimal128(int64_t x_high, uint64_t x_low,
+                                                   int32_t x_precision, int32_t x_scale,
+                                                   gdv_boolean x_isvalid, int64_t y_high,
+                                                   uint64_t y_low, int32_t y_precision,
+                                                   int32_t y_scale,
+                                                   gdv_boolean y_isvalid) {
+  gdv_boolean same = is_not_distinct_from_decimal128_decimal128(
+      x_high, x_low, x_precision, x_scale, x_isvalid, y_high, y_low, y_precision,
+      y_scale, y_isvalid);
+  return !same;
+}
+
+// Parses the `in_length` bytes at `in` as a decimal via gdv_fn_dec_from_string,
+// converts the parsed value to out_precision / out_scale through
+// decimalops::Convert, and writes the two 64-bit halves of the result to
+// *out_high / *out_low.
+//
+// When the parse helper returns a non-zero status the cast stops early and the
+// outputs are left at 0 (previously they were left unwritten). Presumably the
+// helper records the failure on `context` - confirm against its definition.
+// The overflow flag from the conversion is discarded.
+FORCE_INLINE
+void castDECIMAL_utf8(int64_t context, const char* in, int32_t in_length,
+                      int32_t out_precision, int32_t out_scale, int64_t* out_high,
+                      uint64_t* out_low) {
+  // Give the outputs a defined value before anything can bail out.
+  *out_high = 0;
+  *out_low = 0;
+
+  int64_t dec_high_from_str = 0;
+  uint64_t dec_low_from_str = 0;
+  int32_t precision_from_str = 0;
+  int32_t scale_from_str = 0;
+  int32_t status =
+      gdv_fn_dec_from_string(context, in, in_length, &precision_from_str, &scale_from_str,
+                             &dec_high_from_str, &dec_low_from_str);
+  if (status != 0) {
+    return;
+  }
+
+  gandiva::BasicDecimalScalar128 x({dec_high_from_str, dec_low_from_str},
+                                   precision_from_str, scale_from_str);
+  bool overflow = false;
+  auto out = gandiva::decimalops::Convert(x, out_precision, out_scale, &overflow);
+  *out_high = out.high_bits();
+  *out_low = out.low_bits();
+}
+
+// Renders the decimal as text via gdv_fn_dec_to_string and returns that buffer.
+// *out_length is set to the smaller of out_len_param and the full text length,
+// so callers see at most out_len_param characters.
+// NOTE(review): a negative out_len_param is passed through to *out_length as
+// is, and the returned pointer is not checked - confirm callers handle both.
+FORCE_INLINE
+char* castVARCHAR_decimal128_int64(int64_t context, int64_t x_high, uint64_t x_low,
+                                   int32_t x_precision, int32_t x_scale,
+                                   int64_t out_len_param, int32_t* out_length) {
+  int32_t full_len;
+  char* text = gdv_fn_dec_to_string(context, x_high, x_low, x_scale, &full_len);
+
+  if (out_len_param < full_len) {
+    *out_length = static_cast<int32_t>(out_len_param);
+  } else {
+    *out_length = full_len;
+  }
+  return text;
+}
+
+} // extern "C"
diff --git a/src/arrow/cpp/src/gandiva/precompiled/epoch_time_point.h b/src/arrow/cpp/src/gandiva/precompiled/epoch_time_point.h
new file mode 100644
index 000000000..45cfb28ca
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/epoch_time_point.h
@@ -0,0 +1,118 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+// TODO(wesm): IR compilation does not have any include directories set
+#include "../../arrow/vendored/datetime/date.h"
+
+bool is_leap_year(int yy);
+bool did_days_overflow(arrow_vendored::date::year_month_day ymd);
+int last_possible_day_in_month(int month, int year);
+
+// A point of time measured in millis since epoch, with accessors for its
+// calendar fields and helpers that shift it by years, months or days.
+class EpochTimePoint {
+ public:
+  // Wraps a millisecond duration counted from the epoch.
+  explicit EpochTimePoint(std::chrono::milliseconds millis_since_epoch)
+      : tp_(millis_since_epoch) {}
+
+  // Convenience overload taking the raw millisecond count.
+  explicit EpochTimePoint(int64_t millis_since_epoch)
+      : EpochTimePoint(std::chrono::milliseconds(millis_since_epoch)) {}
+
+  // Calendar year minus 1900.
+  int TmYear() const {
+    const int calendar_year = static_cast<int>(YearMonthDay().year());
+    return calendar_year - 1900;
+  }
+
+  // Calendar month number minus 1.
+  int TmMon() const { return static_cast<unsigned int>(YearMonthDay().month()) - 1; }
+
+  // Number of days between January 1st of this point's year and its day.
+  int TmYday() const {
+    auto jan_first = arrow_vendored::date::sys_days{
+        YearMonthDay().year() / arrow_vendored::date::jan / 1};
+    return (DayStart() - jan_first).count();
+  }
+
+  // Day of the month.
+  int TmMday() const { return static_cast<unsigned int>(YearMonthDay().day()); }
+
+  // Number of days since Sunday.
+  int TmWday() const {
+    auto since_sunday = arrow_vendored::date::weekday{DayStart()} -  // NOLINT
+                        arrow_vendored::date::Sunday;
+    return since_sunday.count();
+  }
+
+  int TmHour() const { return static_cast<int>(TimeOfDay().hours().count()); }
+
+  int TmMin() const { return static_cast<int>(TimeOfDay().minutes().count()); }
+
+  int TmSec() const {
+    // TODO(wesm): UNIX y2k issue on int=gdv_int32 platforms
+    return static_cast<int>(TimeOfDay().seconds().count());
+  }
+
+  // Same time of day, num_years calendar years later (earlier when negative).
+  EpochTimePoint AddYears(int num_years) const {
+    auto shifted = YearMonthDay() + arrow_vendored::date::years(num_years);
+    return AtSameTimeOn(arrow_vendored::date::sys_days{shifted});  // NOLINT
+  }
+
+  // Same time of day, num_months calendar months later (earlier when negative).
+  EpochTimePoint AddMonths(int num_months) const {
+    auto shifted = YearMonthDay() + arrow_vendored::date::months(num_months);
+    EpochTimePoint moved =
+        AtSameTimeOn(arrow_vendored::date::sys_days{shifted});  // NOLINT
+
+    if (!did_days_overflow(shifted)) {
+      return moved;
+    }
+
+    // The shifted date was flagged by did_days_overflow(): move by the difference
+    // between the value from last_possible_day_in_month() and the day of month.
+    // NOTE(review): the prototype names the parameters (month, year) while this
+    // call passes (year, month) - confirm against the definition.
+    int days_to_offset =
+        last_possible_day_in_month(static_cast<int>(shifted.year()),
+                                   static_cast<unsigned int>(shifted.month())) -
+        static_cast<unsigned int>(shifted.day());
+    return moved.AddDays(days_to_offset);
+  }
+
+  // Same time of day, num_days days later (earlier when negative).
+  EpochTimePoint AddDays(int num_days) const {
+    auto target_day = arrow_vendored::date::sys_days{YearMonthDay()} +  // NOLINT
+                      arrow_vendored::date::days(num_days);
+    return AtSameTimeOn(target_day);
+  }
+
+  // The start of the day containing this time point.
+  EpochTimePoint ClearTimeOfDay() const {
+    auto midnight = tp_ - TimeOfDay().to_duration();
+    return EpochTimePoint(midnight.time_since_epoch());
+  }
+
+  bool operator==(const EpochTimePoint& other) const { return tp_ == other.tp_; }
+
+  int64_t MillisSinceEpoch() const { return tp_.time_since_epoch().count(); }
+
+  // Time elapsed since the start of the day, broken into hours/minutes/seconds.
+  arrow_vendored::date::time_of_day<std::chrono::milliseconds> TimeOfDay() const {
+    auto millis_since_midnight = tp_ - DayStart();
+    return arrow_vendored::date::time_of_day<std::chrono::milliseconds>(
+        millis_since_midnight);
+  }
+
+ private:
+  // This time point rounded down to a whole number of days.
+  arrow_vendored::date::sys_days DayStart() const {
+    return arrow_vendored::date::floor<arrow_vendored::date::days>(tp_);
+  }
+
+  // Calendar date of the day containing this time point.
+  arrow_vendored::date::year_month_day YearMonthDay() const {
+    return arrow_vendored::date::year_month_day{DayStart()};  // NOLINT
+  }
+
+  // A time point on `day` carrying this point's time of day.
+  EpochTimePoint AtSameTimeOn(arrow_vendored::date::sys_days day) const {
+    return EpochTimePoint((day + TimeOfDay().to_duration()).time_since_epoch());
+  }
+
+  std::chrono::time_point<std::chrono::system_clock, std::chrono::milliseconds> tp_;
+};
diff --git a/src/arrow/cpp/src/gandiva/precompiled/epoch_time_point_test.cc b/src/arrow/cpp/src/gandiva/precompiled/epoch_time_point_test.cc
new file mode 100644
index 000000000..9180aac07
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/epoch_time_point_test.cc
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <ctime>
+
+#include <gtest/gtest.h>
+#include "./epoch_time_point.h"
+#include "gandiva/precompiled/testing.h"
+#include "gandiva/precompiled/types.h"
+
+#include "gandiva/date_utils.h"
+
+namespace gandiva {
+
+TEST(TestEpochTimePoint, TestTm) {
+  const auto millis = StringToTimestamp("2015-05-07 10:20:34");
+  EpochTimePoint point(millis);
+
+  // Break the same instant down with the C runtime to obtain reference values.
+  struct tm* reference;
+#if defined(_WIN32)
+  __time64_t seconds = millis / 1000;
+  reference = _gmtime64(&seconds);
+#else
+  struct tm broken_down;
+  time_t seconds = millis / 1000;
+  reference = gmtime_r(&seconds, &broken_down);
+#endif
+
+  // Every Tm* accessor must agree with the matching struct tm field.
+  EXPECT_EQ(point.TmYear(), reference->tm_year);
+  EXPECT_EQ(point.TmMon(), reference->tm_mon);
+  EXPECT_EQ(point.TmYday(), reference->tm_yday);
+  EXPECT_EQ(point.TmMday(), reference->tm_mday);
+  EXPECT_EQ(point.TmWday(), reference->tm_wday);
+  EXPECT_EQ(point.TmHour(), reference->tm_hour);
+  EXPECT_EQ(point.TmMin(), reference->tm_min);
+  EXPECT_EQ(point.TmSec(), reference->tm_sec);
+}
+
+TEST(TestEpochTimePoint, TestAddYears) {
+  // Builds the time point described by a "YYYY-MM-DD hh:mm:ss" string.
+  auto at = [](const char* text) { return EpochTimePoint(StringToTimestamp(text)); };
+
+  // Forward, no-op and backward shifts from the same starting point.
+  EXPECT_EQ(at("2015-05-05 10:20:34").AddYears(2), at("2017-05-05 10:20:34"));
+
+  EXPECT_EQ(at("2015-05-05 10:20:34").AddYears(0), at("2015-05-05 10:20:34"));
+
+  EXPECT_EQ(at("2015-05-05 10:20:34").AddYears(-1), at("2014-05-05 10:20:34"));
+}
+
+TEST(TestEpochTimePoint, TestAddMonths) {
+  // Builds the time point described by a "YYYY-MM-DD hh:mm:ss" string.
+  auto at = [](const char* text) { return EpochTimePoint(StringToTimestamp(text)); };
+
+  // Shifts within the year, across a year boundary in both directions, and by 0.
+  EXPECT_EQ(at("2015-05-05 10:20:34").AddMonths(2), at("2015-07-05 10:20:34"));
+
+  EXPECT_EQ(at("2015-05-05 10:20:34").AddMonths(11), at("2016-04-05 10:20:34"));
+
+  EXPECT_EQ(at("2015-05-05 10:20:34").AddMonths(0), at("2015-05-05 10:20:34"));
+
+  EXPECT_EQ(at("2015-05-05 10:20:34").AddMonths(-1), at("2015-04-05 10:20:34"));
+
+  EXPECT_EQ(at("2015-05-05 10:20:34").AddMonths(-10), at("2014-07-05 10:20:34"));
+}
+
+TEST(TestEpochTimePoint, TestAddDays) {
+  // Builds the time point described by a "YYYY-MM-DD hh:mm:ss" string.
+  auto at = [](const char* text) { return EpochTimePoint(StringToTimestamp(text)); };
+
+  // Shifts within the month, across a month boundary backwards, and by 0.
+  EXPECT_EQ(at("2015-05-05 10:20:34").AddDays(2), at("2015-05-07 10:20:34"));
+
+  EXPECT_EQ(at("2015-05-05 10:20:34").AddDays(11), at("2015-05-16 10:20:34"));
+
+  EXPECT_EQ(at("2015-05-05 10:20:34").AddDays(0), at("2015-05-05 10:20:34"));
+
+  EXPECT_EQ(at("2015-05-05 10:20:34").AddDays(-1), at("2015-05-04 10:20:34"));
+
+  EXPECT_EQ(at("2015-05-05 10:20:34").AddDays(-10), at("2015-04-25 10:20:34"));
+}
+
+TEST(TestEpochTimePoint, TestClearTimeOfDay) {
+  // Clearing the time of day must land on midnight of the same date.
+  EpochTimePoint with_time(StringToTimestamp("2015-05-05 10:20:34"));
+  EpochTimePoint midnight(StringToTimestamp("2015-05-05 00:00:00"));
+
+  EXPECT_EQ(with_time.ClearTimeOfDay(), midnight);
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/precompiled/extended_math_ops.cc b/src/arrow/cpp/src/gandiva/precompiled/extended_math_ops.cc
new file mode 100644
index 000000000..365b08a6d
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/extended_math_ops.cc
@@ -0,0 +1,410 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+#include "arrow/util/logging.h"
+#include "gandiva/precompiled/decimal_ops.h"
+
+extern "C" {
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./types.h"
+
+// Expands INNER(IN_TYPE, OUT_TYPE) once for every input type that supports
+// extended math: int32, uint32, int64, uint64, float32 and float64. OUT_TYPE is
+// forwarded unchanged to each expansion.
+#define ENUMERIC_TYPES_UNARY(INNER, OUT_TYPE) \
+ INNER(int32, OUT_TYPE) \
+ INNER(uint32, OUT_TYPE) \
+ INNER(int64, OUT_TYPE) \
+ INNER(uint64, OUT_TYPE) \
+ INNER(float32, OUT_TYPE) \
+ INNER(float64, OUT_TYPE)
+
+// Cube root: the argument is widened to long double, handed to cbrtl() and the
+// result is narrowed to gdv_float64.
+#define CBRT(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE cbrt_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double widened = static_cast<long double>(in); \
+    return static_cast<gdv_float64>(cbrtl(widened)); \
+  }
+
+ENUMERIC_TYPES_UNARY(CBRT, float64)
+
+// e raised to the argument: the argument is widened to long double, handed to
+// expl() and the result is narrowed to gdv_float64.
+#define EXP(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE exp_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double widened = static_cast<long double>(in); \
+    return static_cast<gdv_float64>(expl(widened)); \
+  }
+
+ENUMERIC_TYPES_UNARY(EXP, float64)
+
+// Natural logarithm: the argument is widened to long double, handed to logl()
+// and the result is narrowed to gdv_float64.
+#define LOG(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE log_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double widened = static_cast<long double>(in); \
+    return static_cast<gdv_float64>(logl(widened)); \
+  }
+
+ENUMERIC_TYPES_UNARY(LOG, float64)
+
+// Base-10 logarithm: the argument is widened to long double, handed to
+// log10l() and the result is narrowed to gdv_float64.
+#define LOG10(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE log10_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double widened = static_cast<long double>(in); \
+    return static_cast<gdv_float64>(log10l(widened)); \
+  }
+
+// Natural logarithm of VALUE, computed in long double and narrowed to gdv_float64.
+#define LOGL(VALUE) static_cast<gdv_float64>(logl(static_cast<long double>(VALUE)))
+
+ENUMERIC_TYPES_UNARY(LOG10, float64)
+
+FORCE_INLINE
+void set_error_for_logbase(int64_t execution_context, double base) {
+ char const* prefix = "divide by zero error with log of base";
+ int size = static_cast<int>(strlen(prefix)) + 64;
+ char* error = reinterpret_cast<char*>(malloc(size));
+ snprintf(error, size, "%s %f", prefix, base);
+ gdv_fn_context_set_error_msg(execution_context, error);
+ free(static_cast<char*>(error));
+}
+
+// Logarithm of `value` in an arbitrary `base`, computed as ln(value) / ln(base).
+// When ln(base) is zero the division is skipped: an error is recorded on the
+// execution context and 0 is returned.
+#define LOG_WITH_BASE(IN_TYPE1, IN_TYPE2, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE log_##IN_TYPE1##_##IN_TYPE2(gdv_int64 context, gdv_##IN_TYPE1 base, \
+                                             gdv_##IN_TYPE2 value) { \
+    const gdv_##OUT_TYPE log_of_base = LOGL(base); \
+    if (log_of_base != 0) { \
+      return LOGL(value) / log_of_base; \
+    } \
+    set_error_for_logbase(context, static_cast<gdv_float64>(base)); \
+    return 0; \
+  }
+
+LOG_WITH_BASE(int32, int32, float64)
+LOG_WITH_BASE(uint32, uint32, float64)
+LOG_WITH_BASE(int64, int64, float64)
+LOG_WITH_BASE(uint64, uint64, float64)
+LOG_WITH_BASE(float32, float32, float64)
+LOG_WITH_BASE(float64, float64, float64)
+
+// Sine: the argument is widened to long double, passed to sin() and the result
+// is narrowed to the output type.
+#define SIN(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE sin_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double angle = static_cast<long double>(in); \
+    return static_cast<gdv_##OUT_TYPE>(sin(angle)); \
+  }
+ENUMERIC_TYPES_UNARY(SIN, float64)
+
+// Arc sine: the argument is widened to long double, passed to asin() and the
+// result is narrowed to the output type.
+#define ASIN(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE asin_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double operand = static_cast<long double>(in); \
+    return static_cast<gdv_##OUT_TYPE>(asin(operand)); \
+  }
+ENUMERIC_TYPES_UNARY(ASIN, float64)
+
+// Cosine: the argument is widened to long double, passed to cos() and the
+// result is narrowed to the output type.
+#define COS(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE cos_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double angle = static_cast<long double>(in); \
+    return static_cast<gdv_##OUT_TYPE>(cos(angle)); \
+  }
+ENUMERIC_TYPES_UNARY(COS, float64)
+
+// Arc cosine: the argument is widened to long double, passed to acos() and the
+// result is narrowed to the output type.
+#define ACOS(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE acos_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double operand = static_cast<long double>(in); \
+    return static_cast<gdv_##OUT_TYPE>(acos(operand)); \
+  }
+ENUMERIC_TYPES_UNARY(ACOS, float64)
+
+// Tangent: the argument is widened to long double, passed to tan() and the
+// result is narrowed to the output type.
+#define TAN(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE tan_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double angle = static_cast<long double>(in); \
+    return static_cast<gdv_##OUT_TYPE>(tan(angle)); \
+  }
+ENUMERIC_TYPES_UNARY(TAN, float64)
+
+// Arc tangent: the argument is widened to long double, passed to atan() and
+// the result is narrowed to the output type.
+#define ATAN(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE atan_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double operand = static_cast<long double>(in); \
+    return static_cast<gdv_##OUT_TYPE>(atan(operand)); \
+  }
+ENUMERIC_TYPES_UNARY(ATAN, float64)
+
+// Hyperbolic sine: the argument is widened to long double, passed to sinh()
+// and the result is narrowed to the output type.
+#define SINH(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE sinh_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double operand = static_cast<long double>(in); \
+    return static_cast<gdv_##OUT_TYPE>(sinh(operand)); \
+  }
+ENUMERIC_TYPES_UNARY(SINH, float64)
+
+// Hyperbolic cosine: the argument is widened to long double, passed to cosh()
+// and the result is narrowed to the output type.
+#define COSH(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE cosh_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double operand = static_cast<long double>(in); \
+    return static_cast<gdv_##OUT_TYPE>(cosh(operand)); \
+  }
+ENUMERIC_TYPES_UNARY(COSH, float64)
+
+// Hyperbolic tangent: the argument is widened to long double, passed to tanh()
+// and the result is narrowed to the output type.
+#define TANH(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE tanh_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double operand = static_cast<long double>(in); \
+    return static_cast<gdv_##OUT_TYPE>(tanh(operand)); \
+  }
+ENUMERIC_TYPES_UNARY(TANH, float64)
+
+// Two-argument arc tangent: in1 is passed as the first argument of atan2() and
+// in2 as the second, both widened to long double. Both inputs share IN_TYPE,
+// which is why the unary type list can drive the expansion.
+#define ATAN2(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE atan2_##IN_TYPE##_##IN_TYPE(gdv_##IN_TYPE in1, gdv_##IN_TYPE in2) { \
+    const long double first = static_cast<long double>(in1); \
+    const long double second = static_cast<long double>(in2); \
+    return static_cast<gdv_##OUT_TYPE>(atan2(first, second)); \
+  }
+ENUMERIC_TYPES_UNARY(ATAN2, float64)
+
+// Cotangent, evaluated through the identity cot(x) = tan(pi/2 - x).
+#define COT(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE cot_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double angle = static_cast<long double>(in); \
+    return static_cast<gdv_##OUT_TYPE>(tan(M_PI / 2 - angle)); \
+  }
+ENUMERIC_TYPES_UNARY(COT, float64)
+
+// Degrees to radians: multiplies by pi, then divides by 180.
+#define RADIANS(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE radians_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double degrees = static_cast<long double>(in); \
+    return static_cast<gdv_##OUT_TYPE>(degrees * M_PI / 180.0); \
+  }
+ENUMERIC_TYPES_UNARY(RADIANS, float64)
+
+// Radians to degrees: multiplies by 180, then divides by pi.
+#define DEGREES(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE degrees_##IN_TYPE(gdv_##IN_TYPE in) { \
+    const long double radians = static_cast<long double>(in); \
+    return static_cast<gdv_##OUT_TYPE>(radians * 180.0 / M_PI); \
+  }
+ENUMERIC_TYPES_UNARY(DEGREES, float64)
+
+// in1 raised to the power in2, evaluated with powl() and narrowed to
+// gdv_float64. Only the float64/float64 combination is instantiated here.
+#define POWER(IN_TYPE1, IN_TYPE2, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE power_##IN_TYPE1##_##IN_TYPE2(gdv_##IN_TYPE1 in1, gdv_##IN_TYPE2 in2) { \
+    const long double raised = powl(in1, in2); \
+    return static_cast<gdv_float64>(raised); \
+  }
+POWER(float64, float64, float64)
+
+// Rounding a 32-bit integer to the nearest integer is the identity.
+FORCE_INLINE
+gdv_int32 round_int32(gdv_int32 num) { return num; }
+
+// Rounding a 64-bit integer to the nearest integer is the identity.
+FORCE_INLINE
+gdv_int64 round_int64(gdv_int64 num) { return num; }
+
+// Rounds the number to the nearest integer, with halfway cases going away from
+// zero.
+//
+// round() is used instead of trunc(num +/- 0.5): the addition in the latter is
+// itself rounded, which gives wrong answers for inputs such as
+// 0.49999999999999994 (0.5 is added, the sum rounds up to 1.0) and for odd
+// integers above 2^52 (the sum rounds to a neighbouring even integer).
+#define ROUND_DECIMAL(TYPE) \
+  FORCE_INLINE \
+  gdv_##TYPE round_##TYPE(gdv_##TYPE num) { \
+    return static_cast<gdv_##TYPE>(round(static_cast<double>(num))); \
+  }
+
+ROUND_DECIMAL(float32)
+ROUND_DECIMAL(float64)
+
+// Rounds the number to `out_scale` decimal places (a negative scale rounds to
+// tens, hundreds, ...), with halfway cases going away from zero.
+//
+// The value is scaled by 10^out_scale, rounded to an integer and scaled back.
+// round() is used for the middle step instead of trunc(x +/- 0.5): the addition
+// in the latter is itself rounded, so a scaled value just below one half (for
+// example 0.49999999999999994) would be pushed up to the next integer.
+#define ROUND_DECIMAL_TO_SCALE(TYPE) \
+  FORCE_INLINE \
+  gdv_##TYPE round_##TYPE##_int32(gdv_##TYPE number, gdv_int32 out_scale) { \
+    gdv_float64 scale_multiplier = get_scale_multiplier(out_scale); \
+    gdv_float64 scaled = number * scale_multiplier; \
+    return static_cast<gdv_##TYPE>(round(scaled) / scale_multiplier); \
+  }
+
+ROUND_DECIMAL_TO_SCALE(float32)
+ROUND_DECIMAL_TO_SCALE(float64)
+
+// Rounds a 32-bit integer to `precision` decimal places. A negative precision
+// rounds to the left of the decimal point (-1 -> tens, -2 -> hundreds, ...),
+// with halfway cases going away from zero.
+FORCE_INLINE
+gdv_int32 round_int32_int32(gdv_int32 number, gdv_int32 precision) {
+  // for integers, there is nothing following the decimal point,
+  // so round() always returns the same number if precision >= 0
+  if (precision >= 0) {
+    return number;
+  }
+  // 10 is the smallest N for which 10^N does not fit into 32 bits. Every 32-bit
+  // value is below half of 10^10, so the rounded result is always 0. Comparing
+  // before negating also keeps -precision from overflowing for INT32_MIN.
+  if (precision < -9) {
+    return 0;
+  }
+  const gdv_int64 power_of_10 = get_power_of_10(-precision);
+  // Work in 64 bits: the magnitude of INT32_MIN and a round-up next to
+  // INT32_MAX both exceed the 32-bit range.
+  const gdv_int64 num_sign = (number > 0) ? 1 : -1;
+  gdv_int64 abs_number = static_cast<gdv_int64>(number) * num_sign;
+  const gdv_int64 remainder = abs_number % power_of_10;
+  abs_number -= remainder;
+  // if the fractional part of the quotient >= 0.5, round to next higher integer
+  if (remainder >= power_of_10 / 2) {
+    abs_number += power_of_10;
+  }
+  // A result outside the 32-bit range is narrowed by this conversion rather
+  // than produced by overflowing signed arithmetic.
+  return static_cast<gdv_int32>(abs_number * num_sign);
+}
+
+// Rounds a 64-bit integer to `precision` decimal places. A negative precision
+// rounds to the left of the decimal point (-1 -> tens, -2 -> hundreds, ...),
+// with halfway cases going away from zero.
+FORCE_INLINE
+gdv_int64 round_int64_int32(gdv_int64 number, gdv_int32 precision) {
+  // for long integers, there is nothing following the decimal point,
+  // so round() always returns the same number if precision >= 0
+  if (precision >= 0) {
+    return number;
+  }
+  // 19 is the smallest N for which 10^N does not fit into 64 bits; such requests
+  // yield 0. Comparing before negating also keeps -precision from overflowing
+  // for INT32_MIN.
+  if (precision < -18) {
+    return 0;
+  }
+  const gdv_uint64 power_of_10 = static_cast<gdv_uint64>(get_power_of_10(-precision));
+  // Take the magnitude in unsigned arithmetic: negating INT64_MIN as a signed
+  // value would overflow.
+  const bool is_negative = number < 0;
+  gdv_uint64 abs_number = static_cast<gdv_uint64>(number);
+  if (is_negative) {
+    abs_number = 0 - abs_number;
+  }
+  const gdv_uint64 remainder = abs_number % power_of_10;
+  abs_number -= remainder;
+  // if the fractional part of the quotient >= 0.5, round to next higher integer
+  if (remainder >= power_of_10 / 2) {
+    // at most 2^63 + 10^18, which still fits into 64 unsigned bits
+    abs_number += power_of_10;
+  }
+  if (is_negative) {
+    abs_number = 0 - abs_number;
+  }
+  // A result outside the signed 64-bit range is narrowed by this conversion
+  // rather than produced by overflowing signed arithmetic.
+  return static_cast<gdv_int64>(abs_number);
+}
+
+// Returns 10^exp as a 64-bit integer by table lookup.
+// exp must lie in [0, 18]; this is only verified through DCHECK_GE/DCHECK_LE,
+// and the table holds exactly the 19 entries for that range.
+FORCE_INLINE
+gdv_int64 get_power_of_10(gdv_int32 exp) {
+ DCHECK_GE(exp, 0);
+ DCHECK_LE(exp, 18);
+ static const gdv_int64 power_of_10[] = {1,
+ 10,
+ 100,
+ 1000,
+ 10000,
+ 100000,
+ 1000000,
+ 10000000,
+ 100000000,
+ 1000000000,
+ 10000000000,
+ 100000000000,
+ 1000000000000,
+ 10000000000000,
+ 100000000000000,
+ 1000000000000000,
+ 10000000000000000,
+ 100000000000000000,
+ 1000000000000000000};
+ return power_of_10[exp];
+}
+
+// Truncates a 64-bit integer at `out_scale` by round-tripping through the
+// 128-bit decimal helpers: the input becomes a decimal with precision 38 and
+// scale 0, decimalops::Truncate is applied with out_scale, and the result is
+// converted back with decimalops::ToInt64.
+// NOTE(review): the `overflow` flag is filled in by the helpers but never
+// inspected here - confirm that ignoring it is intended.
+FORCE_INLINE
+gdv_int64 truncate_int64_int32(gdv_int64 in, gdv_int32 out_scale) {
+ bool overflow = false;
+ arrow::BasicDecimal128 decimal = gandiva::decimalops::FromInt64(in, 38, 0, &overflow);
+ arrow::BasicDecimal128 decimal_with_outscale =
+ gandiva::decimalops::Truncate(gandiva::BasicDecimalScalar128(decimal, 38, 0), 38,
+ out_scale, out_scale, &overflow);
+ // a negative scale is clamped to 0 before the truncated value is read back
+ if (out_scale < 0) {
+ out_scale = 0;
+ }
+ return gandiva::decimalops::ToInt64(
+ gandiva::BasicDecimalScalar128(decimal_with_outscale, 38, out_scale), &overflow);
+}
+
+// Returns 10^scale as a double. Scales 0..19 come from a lookup table; any
+// other scale (negative, or 20 and above) is computed with
+// power_float64_float64.
+FORCE_INLINE
+gdv_float64 get_scale_multiplier(gdv_int32 scale) {
+  static const gdv_float64 values[] = {1.0,
+                                       10.0,
+                                       100.0,
+                                       1000.0,
+                                       10000.0,
+                                       100000.0,
+                                       1000000.0,
+                                       10000000.0,
+                                       100000000.0,
+                                       1000000000.0,
+                                       10000000000.0,
+                                       100000000000.0,
+                                       1000000000000.0,
+                                       10000000000000.0,
+                                       100000000000000.0,
+                                       1000000000000000.0,
+                                       10000000000000000.0,
+                                       100000000000000000.0,
+                                       1000000000000000000.0,
+                                       10000000000000000000.0};
+  const bool outside_table = (scale < 0) || (scale >= 20);
+  if (outside_table) {
+    return power_float64_float64(10.0, scale);
+  }
+  return values[scale];
+}
+
+// returns the binary representation of a given integer (e.g. 928 -> 1110100000)
+//
+// Defines bin_<IN_TYPE>(context, value, out_len):
+//  - a buffer of 8 * sizeof(value) bytes (one per bit) is requested from
+//    gdv_fn_context_arena_malloc; if that returns null, an error message is set
+//    on the context and "" is returned with *out_len == 0;
+//  - a value of 0 yields the single character '0';
+//  - otherwise the bits are visited from the most significant to the least
+//    significant, and zeros are skipped until the first 1 has been written. A
+//    value whose top bit is set therefore produces all 8 * sizeof(value) digits.
+// The number of characters written is stored in *out_len; no terminating NUL is
+// written, so callers must use the length.
+#define BIN_INTEGER(IN_TYPE) \
+ FORCE_INLINE \
+ const char* bin_##IN_TYPE(int64_t context, gdv_##IN_TYPE value, int32_t* out_len) { \
+ *out_len = 0; \
+ int32_t len = 8 * sizeof(value); \
+ char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, len)); \
+ if (ret == nullptr) { \
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for output"); \
+ return ""; \
+ } \
+ /* handle case when value is zero */ \
+ if (value == 0) { \
+ *out_len = 1; \
+ ret[0] = '0'; \
+ return ret; \
+ } \
+ /* generate binary representation iteratively */ \
+ gdv_u##IN_TYPE i; \
+ int8_t count = 0; \
+ bool first = false; /* flag for not printing left zeros in positive numbers */ \
+ for (i = static_cast<gdv_u##IN_TYPE>(1) << (len - 1); i > 0; i = i / 2) { \
+ if ((value & i) != 0) { \
+ ret[count] = '1'; \
+ if (!first) first = true; \
+ } else { \
+ if (!first) continue; \
+ ret[count] = '0'; \
+ } \
+ count += 1; \
+ } \
+ *out_len = count; \
+ return ret; \
+ }
+
+BIN_INTEGER(int32)
+BIN_INTEGER(int64)
+
+#undef BIN_INTEGER
+
+} // extern "C"
diff --git a/src/arrow/cpp/src/gandiva/precompiled/extended_math_ops_test.cc b/src/arrow/cpp/src/gandiva/precompiled/extended_math_ops_test.cc
new file mode 100644
index 000000000..147b4035c
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/extended_math_ops_test.cc
@@ -0,0 +1,349 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+#include <gtest/gtest.h>
+#include <cmath>
+#include "gandiva/execution_context.h"
+#include "gandiva/precompiled/types.h"
+
+namespace gandiva {
+
+// Default tolerance used when comparing floating point results.
+static const double MAX_ERROR = 0.00005;
+
+// Records a test failure unless |actual - expected| is strictly below max_error.
+void VerifyFuzzyEquals(double actual, double expected, double max_error = MAX_ERROR) {
+  const double abs_diff = fabs(actual - expected);
+  EXPECT_TRUE(abs_diff < max_error) << actual << " != " << expected;
+}
+
+// Cube roots of exact cubes, for integer and floating point inputs.
+TEST(TestExtendedMathOps, TestCbrt) {
+  const double three = 3;
+  VerifyFuzzyEquals(cbrt_int32(27), three);
+  VerifyFuzzyEquals(cbrt_int64(27), three);
+  VerifyFuzzyEquals(cbrt_float32(27), three);
+  VerifyFuzzyEquals(cbrt_float64(27), three);
+  // a negative input keeps its sign
+  VerifyFuzzyEquals(cbrt_float64(-27), -three);
+
+  // non-integral root
+  const double two_and_a_half = 2.5;
+  VerifyFuzzyEquals(cbrt_float32(15.625), two_and_a_half);
+  VerifyFuzzyEquals(cbrt_float64(15.625), two_and_a_half);
+}
+
+// exp(3) must agree across all tested input types.
+TEST(TestExtendedMathOps, TestExp) {
+  const double e_cubed = 20.085536923187668;
+
+  VerifyFuzzyEquals(exp_int32(3), e_cubed);
+  VerifyFuzzyEquals(exp_int64(3), e_cubed);
+  VerifyFuzzyEquals(exp_float32(3), e_cubed);
+  VerifyFuzzyEquals(exp_float64(3), e_cubed);
+}
+
+// ln(64) must agree across all tested input types; ln(0) is negative infinity.
+TEST(TestExtendedMathOps, TestLog) {
+  const double ln_64 = 4.1588830833596715;
+
+  VerifyFuzzyEquals(log_int32(64), ln_64);
+  VerifyFuzzyEquals(log_int64(64), ln_64);
+  VerifyFuzzyEquals(log_float32(64), ln_64);
+  VerifyFuzzyEquals(log_float64(64), ln_64);
+
+  const double negative_infinity = -std::numeric_limits<double>::infinity();
+  EXPECT_EQ(log_int32(0), negative_infinity);
+}
+
+// log10(100) == 2 for all tested input types.
+TEST(TestExtendedMathOps, TestLog10) {
+  const double two = 2;
+  VerifyFuzzyEquals(log10_int32(100), two);
+  VerifyFuzzyEquals(log10_int64(100), two);
+  VerifyFuzzyEquals(log10_float32(100), two);
+  VerifyFuzzyEquals(log10_float64(100), two);
+}
+
+// Fractional exponent and fractional base.
+TEST(TestExtendedMathOps, TestPower) {
+  const gdv_float64 two_to_5_4 = power_float64_float64(2, 5.4);
+  VerifyFuzzyEquals(two_to_5_4, 42.22425314473263);
+
+  const gdv_float64 five_4_squared = power_float64_float64(5.4, 2);
+  VerifyFuzzyEquals(five_4_squared, 29.160000000000004);
+}
+
+TEST(TestExtendedMathOps, TestLogWithBase) {
+ gandiva::ExecutionContext context;
+ gdv_float64 out =
+ log_int32_int32(reinterpret_cast<gdv_int64>(&context), 1 /*base*/, 10 /*value*/);
+ VerifyFuzzyEquals(out, 0);
+ EXPECT_EQ(context.has_error(), true);
+ EXPECT_TRUE(context.get_error().find("divide by zero error") != std::string::npos)
+ << context.get_error();
+
+ gandiva::ExecutionContext context1;
+ out = log_int32_int32(reinterpret_cast<gdv_int64>(&context), 2 /*base*/, 64 /*value*/);
+ VerifyFuzzyEquals(out, 6);
+ EXPECT_EQ(context1.has_error(), false);
+}
+
+// Rounding of float32/float64 values, to the nearest integer and to a scale.
+TEST(TestExtendedMathOps, TestRoundDecimal) {
+  // float32, nearest integer
+  EXPECT_FLOAT_EQ(round_float32(1234.245f), 1234);
+  EXPECT_FLOAT_EQ(round_float32(-11.7892f), -12);
+  EXPECT_FLOAT_EQ(round_float32(1.4999999f), 1);
+  EXPECT_EQ(std::signbit(round_float32(0)), 0);
+
+  // float32, explicit scale
+  EXPECT_FLOAT_EQ(round_float32_int32(1234.789f, 2), 1234.79f);
+  EXPECT_FLOAT_EQ(round_float32_int32(1234.12345f, -3), 1000);
+  EXPECT_FLOAT_EQ(round_float32_int32(-1234.4567f, 3), -1234.457f);
+  EXPECT_FLOAT_EQ(round_float32_int32(-1234.4567f, -3), -1000);
+  EXPECT_FLOAT_EQ(round_float32_int32(1234.4567f, 0), 1234);
+  EXPECT_FLOAT_EQ(round_float32_int32(1.5499999523162842f, 1), 1.5f);
+  EXPECT_EQ(std::signbit(round_float32_int32(0, 5)), 0);
+  EXPECT_FLOAT_EQ(round_float32_int32(static_cast<float>(1.55), 1), 1.5f);
+  EXPECT_FLOAT_EQ(round_float32_int32(static_cast<float>(9.134123), 2), 9.13f);
+  EXPECT_FLOAT_EQ(round_float32_int32(static_cast<float>(-1.923), 1), -1.9f);
+
+  // float64, nearest integer
+  VerifyFuzzyEquals(round_float64(1234.245), 1234);
+  VerifyFuzzyEquals(round_float64(-11.7892), -12);
+  VerifyFuzzyEquals(round_float64(1.4999999), 1);
+  EXPECT_EQ(std::signbit(round_float64(0)), 0);
+
+  // float64, explicit scale
+  VerifyFuzzyEquals(round_float64_int32(1234.789, 2), 1234.79);
+  VerifyFuzzyEquals(round_float64_int32(1234.12345, -3), 1000);
+  VerifyFuzzyEquals(round_float64_int32(-1234.4567, 3), -1234.457);
+  VerifyFuzzyEquals(round_float64_int32(-1234.4567, -3), -1000);
+  VerifyFuzzyEquals(round_float64_int32(1234.4567, 0), 1234);
+  EXPECT_EQ(std::signbit(round_float64_int32(0, -2)), 0);
+
+  // values just outside the int range must pass through unchanged
+  const double above_int_max = static_cast<double>(INT_MAX) + 1;
+  const double below_int_min = static_cast<double>(INT_MIN) - 1;
+  VerifyFuzzyEquals(round_float64_int32(above_int_max, 0), above_int_max);
+  VerifyFuzzyEquals(round_float64_int32(below_int_min, 0), below_int_min);
+}
+
+// Rounding of int32/int64 values, with and without an explicit precision.
+TEST(TestExtendedMathOps, TestRound) {
+  // int32: without a precision the value is returned as is
+  EXPECT_EQ(round_int32(21134), 21134);
+  EXPECT_EQ(round_int32(-132422), -132422);
+
+  // int32: negative precision rounds to tens, hundreds, ...
+  EXPECT_EQ(round_int32_int32(7589, -1), 7590);
+  EXPECT_EQ(round_int32_int32(8532, -2), 8500);
+  EXPECT_EQ(round_int32_int32(-8579, -1), -8580);
+  EXPECT_EQ(round_int32_int32(-8612, -2), -8600);
+  // int32: positive precision is a no-op, too coarse a precision gives 0
+  EXPECT_EQ(round_int32_int32(758, 2), 758);
+  EXPECT_EQ(round_int32_int32(8612, -5), 0);
+
+  // int64: without a precision the value is returned as is
+  EXPECT_EQ(round_int64(3453562312), 3453562312);
+  EXPECT_EQ(round_int64(-23453462343), -23453462343);
+
+  // int64: negative and positive precisions
+  EXPECT_EQ(round_int64_int32(3453562312, -2), 3453562300);
+  EXPECT_EQ(round_int64_int32(3453562343, -5), 3453600000);
+  EXPECT_EQ(round_int64_int32(345353425343, 12), 345353425343);
+  EXPECT_EQ(round_int64_int32(-23453462343, -4), -23453460000);
+  EXPECT_EQ(round_int64_int32(-23453462343, -5), -23453500000);
+  EXPECT_EQ(round_int64_int32(345353425343, -12), 0);
+}
+
+// Truncation of int64 values at positive, zero and negative scales.
+TEST(TestExtendedMathOps, TestTruncate) {
+  // a positive scale leaves an integer untouched
+  EXPECT_EQ(truncate_int64_int32(1234, 4), 1234);
+  EXPECT_EQ(truncate_int64_int32(-1234, 4), -1234);
+
+  // a negative scale zeroes the low-order digits
+  EXPECT_EQ(truncate_int64_int32(1234, -4), 0);
+  EXPECT_EQ(truncate_int64_int32(-1234, -2), -1200);
+
+  // values close to the top of the int64 range
+  const gdv_int64 large = 8124674407369523212;
+  EXPECT_EQ(truncate_int64_int32(large, 0), 8124674407369523212);
+  EXPECT_EQ(truncate_int64_int32(large, -2), 8124674407369523200);
+}
+
+// Compares every trigonometric wrapper against the corresponding <cmath>
+// function (or a known constant) for float32, float64, int32 and int64 inputs.
+TEST(TestExtendedMathOps, TestTrigonometricFunctions) {
+  auto pi_float = static_cast<float>(M_PI);
+  // Sin functions
+  VerifyFuzzyEquals(sin_float32(0), sin(0));
+  VerifyFuzzyEquals(sin_float32(pi_float / 2), sin(M_PI / 2));
+  VerifyFuzzyEquals(sin_float32(pi_float), sin(M_PI));
+  VerifyFuzzyEquals(sin_float32(-pi_float / 2), sin(-M_PI / 2));
+  VerifyFuzzyEquals(sin_float64(0), sin(0));
+  VerifyFuzzyEquals(sin_float64(M_PI / 2), sin(M_PI / 2));
+  VerifyFuzzyEquals(sin_float64(M_PI), sin(M_PI));
+  VerifyFuzzyEquals(sin_float64(-M_PI / 2), sin(-M_PI / 2));
+  VerifyFuzzyEquals(sin_int32(0), sin(0));
+  VerifyFuzzyEquals(sin_int64(0), sin(0));
+
+  // Cos functions
+  VerifyFuzzyEquals(cos_float32(0), cos(0));
+  VerifyFuzzyEquals(cos_float32(pi_float / 2), cos(M_PI / 2));
+  VerifyFuzzyEquals(cos_float32(pi_float), cos(M_PI));
+  VerifyFuzzyEquals(cos_float32(-pi_float / 2), cos(-M_PI / 2));
+  VerifyFuzzyEquals(cos_float64(0), cos(0));
+  VerifyFuzzyEquals(cos_float64(M_PI / 2), cos(M_PI / 2));
+  VerifyFuzzyEquals(cos_float64(M_PI), cos(M_PI));
+  VerifyFuzzyEquals(cos_float64(-M_PI / 2), cos(-M_PI / 2));
+  VerifyFuzzyEquals(cos_int32(0), cos(0));
+  VerifyFuzzyEquals(cos_int64(0), cos(0));
+
+  // Asin functions
+  VerifyFuzzyEquals(asin_float32(-1.0), asin(-1.0));
+  VerifyFuzzyEquals(asin_float32(1.0), asin(1.0));
+  VerifyFuzzyEquals(asin_float64(-1.0), asin(-1.0));
+  VerifyFuzzyEquals(asin_float64(1.0), asin(1.0));
+  VerifyFuzzyEquals(asin_int32(0), asin(0));
+  VerifyFuzzyEquals(asin_int64(0), asin(0));
+
+  // Acos functions
+  VerifyFuzzyEquals(acos_float32(-1.0), acos(-1.0));
+  VerifyFuzzyEquals(acos_float32(1.0), acos(1.0));
+  VerifyFuzzyEquals(acos_float64(-1.0), acos(-1.0));
+  VerifyFuzzyEquals(acos_float64(1.0), acos(1.0));
+  VerifyFuzzyEquals(acos_int32(0), acos(0));
+  VerifyFuzzyEquals(acos_int64(0), acos(0));
+
+  // Tan
+  VerifyFuzzyEquals(tan_float32(pi_float), tan(M_PI));
+  VerifyFuzzyEquals(tan_float32(-pi_float), tan(-M_PI));
+  VerifyFuzzyEquals(tan_float64(M_PI), tan(M_PI));
+  VerifyFuzzyEquals(tan_float64(-M_PI), tan(-M_PI));
+  VerifyFuzzyEquals(tan_int32(0), tan(0));
+  VerifyFuzzyEquals(tan_int64(0), tan(0));
+
+  // Atan
+  VerifyFuzzyEquals(atan_float32(pi_float), atan(M_PI));
+  VerifyFuzzyEquals(atan_float32(-pi_float), atan(-M_PI));
+  VerifyFuzzyEquals(atan_float64(M_PI), atan(M_PI));
+  VerifyFuzzyEquals(atan_float64(-M_PI), atan(-M_PI));
+  VerifyFuzzyEquals(atan_int32(0), atan(0));
+  VerifyFuzzyEquals(atan_int64(0), atan(0));
+
+  // Sinh functions
+  VerifyFuzzyEquals(sinh_float32(0), sinh(0));
+  VerifyFuzzyEquals(sinh_float32(pi_float / 2), sinh(M_PI / 2));
+  VerifyFuzzyEquals(sinh_float32(pi_float), sinh(M_PI));
+  VerifyFuzzyEquals(sinh_float32(-pi_float / 2), sinh(-M_PI / 2));
+  VerifyFuzzyEquals(sinh_float64(0), sinh(0));
+  VerifyFuzzyEquals(sinh_float64(M_PI / 2), sinh(M_PI / 2));
+  VerifyFuzzyEquals(sinh_float64(M_PI), sinh(M_PI));
+  VerifyFuzzyEquals(sinh_float64(-M_PI / 2), sinh(-M_PI / 2));
+  VerifyFuzzyEquals(sinh_int32(0), sinh(0));
+  VerifyFuzzyEquals(sinh_int64(0), sinh(0));
+
+  // Cosh functions
+  VerifyFuzzyEquals(cosh_float32(0), cosh(0));
+  VerifyFuzzyEquals(cosh_float32(pi_float / 2), cosh(M_PI / 2));
+  VerifyFuzzyEquals(cosh_float32(pi_float), cosh(M_PI));
+  VerifyFuzzyEquals(cosh_float32(-pi_float / 2), cosh(-M_PI / 2));
+  VerifyFuzzyEquals(cosh_float64(0), cosh(0));
+  VerifyFuzzyEquals(cosh_float64(M_PI / 2), cosh(M_PI / 2));
+  VerifyFuzzyEquals(cosh_float64(M_PI), cosh(M_PI));
+  VerifyFuzzyEquals(cosh_float64(-M_PI / 2), cosh(-M_PI / 2));
+  VerifyFuzzyEquals(cosh_int32(0), cosh(0));
+  VerifyFuzzyEquals(cosh_int64(0), cosh(0));
+
+  // Tanh
+  VerifyFuzzyEquals(tanh_float32(pi_float), tanh(M_PI));
+  VerifyFuzzyEquals(tanh_float32(-pi_float), tanh(-M_PI));
+  VerifyFuzzyEquals(tanh_float64(M_PI), tanh(M_PI));
+  VerifyFuzzyEquals(tanh_float64(-M_PI), tanh(-M_PI));
+  VerifyFuzzyEquals(tanh_int32(0), tanh(0));
+  VerifyFuzzyEquals(tanh_int64(0), tanh(0));
+
+  // Atan2
+  VerifyFuzzyEquals(atan2_float32_float32(1, 0), atan2(1, 0));
+  VerifyFuzzyEquals(atan2_float32_float32(-1.0, 0), atan2(-1, 0));
+  VerifyFuzzyEquals(atan2_float64_float64(1.0, 0.0), atan2(1, 0));
+  VerifyFuzzyEquals(atan2_float64_float64(-1, 0), atan2(-1, 0));
+  VerifyFuzzyEquals(atan2_int32_int32(1, 0), atan2(1, 0));
+  VerifyFuzzyEquals(atan2_int64_int64(-1, 0), atan2(-1, 0));
+
+  // Radians
+  VerifyFuzzyEquals(radians_float32(0), 0);
+  VerifyFuzzyEquals(radians_float32(180.0), M_PI);
+  VerifyFuzzyEquals(radians_float32(90.0), M_PI / 2);
+  VerifyFuzzyEquals(radians_float64(0), 0);
+  VerifyFuzzyEquals(radians_float64(180.0), M_PI);
+  VerifyFuzzyEquals(radians_float64(90.0), M_PI / 2);
+  VerifyFuzzyEquals(radians_int32(180), M_PI);
+  VerifyFuzzyEquals(radians_int64(90), M_PI / 2);
+
+  // Degrees
+  VerifyFuzzyEquals(degrees_float32(0), 0.0);
+  VerifyFuzzyEquals(degrees_float32(pi_float), 180.0);
+  VerifyFuzzyEquals(degrees_float32(pi_float / 2), 90.0);
+  VerifyFuzzyEquals(degrees_float64(0), 0.0);
+  VerifyFuzzyEquals(degrees_float64(M_PI), 180.0);
+  VerifyFuzzyEquals(degrees_float64(M_PI / 2), 90.0);
+  VerifyFuzzyEquals(degrees_int32(1), 57.2958);
+  VerifyFuzzyEquals(degrees_int64(1), 57.2958);
+
+  // Cot
+  VerifyFuzzyEquals(cot_float32(pi_float / 2), tan(M_PI / 2 - M_PI / 2));
+  VerifyFuzzyEquals(cot_float64(M_PI / 2), tan(M_PI / 2 - M_PI / 2));
+}
+
+// bin_int32 / bin_int64 must print the binary digits of the value without
+// leading zeros, all bits for negative values, and never set an error on the
+// context.
+TEST(TestExtendedMathOps, TestBinRepresentation) {
+  gandiva::ExecutionContext ctx;
+  // The functions take the context as a signed 64-bit handle, so it is kept in
+  // that type instead of being converted to unsigned and back.
+  gdv_int64 ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+
+  // Runs bin_int32 on `value` and checks the produced digits and the context.
+  auto expect_bin32 = [&](gdv_int32 value, const std::string& expected) {
+    gdv_int32 out_len = 0;
+    const char* out_str = bin_int32(ctx_ptr, value, &out_len);
+    EXPECT_EQ(std::string(out_str, out_len), expected) << "bin_int32(" << value << ")";
+    EXPECT_FALSE(ctx.has_error());
+  };
+
+  // Runs bin_int64 on `value` and checks the produced digits and the context.
+  auto expect_bin64 = [&](gdv_int64 value, const std::string& expected) {
+    gdv_int32 out_len = 0;
+    const char* out_str = bin_int64(ctx_ptr, value, &out_len);
+    EXPECT_EQ(std::string(out_str, out_len), expected) << "bin_int64(" << value << ")";
+    EXPECT_FALSE(ctx.has_error());
+  };
+
+  expect_bin32(7, "111");
+  expect_bin32(0, "0");
+  expect_bin32(28550, "110111110000110");
+  expect_bin32(-28550, "11111111111111111001000001111010");
+  expect_bin32(58117, "1110001100000101");
+  expect_bin32(-58117, "11111111111111110001110011111011");
+  expect_bin32(INT32_MAX, "1111111111111111111111111111111");
+  expect_bin32(INT32_MIN, "10000000000000000000000000000000");
+
+  expect_bin64(7, "111");
+  expect_bin64(0, "0");
+  expect_bin64(28550, "110111110000110");
+  expect_bin64(-28550,
+               "1111111111111111111111111111111111111111111111111001000001111010");
+  expect_bin64(58117, "1110001100000101");
+  expect_bin64(-58117,
+               "1111111111111111111111111111111111111111111111110001110011111011");
+  expect_bin64(INT64_MAX,
+               "111111111111111111111111111111111111111111111111111111111111111");
+  expect_bin64(INT64_MIN,
+               "1000000000000000000000000000000000000000000000000000000000000000");
+}
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/precompiled/hash.cc b/src/arrow/cpp/src/gandiva/precompiled/hash.cc
new file mode 100644
index 000000000..eacf36230
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/hash.cc
@@ -0,0 +1,407 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern "C" {
+
+#include <string.h>
+
+#include "./types.h"
+
+// Rotates the 64 bits of `val` to the left by `distance` positions.
+// The distance is reduced modulo 64, and the complementary right shift is
+// masked as well, so a distance of 0 never shifts by the full width of the
+// type (which would be undefined behaviour).
+static inline gdv_uint64 rotate_left(gdv_uint64 val, int distance) {
+  const int left = distance & 63;
+  const int right = (64 - left) & 63;
+  return (val << left) | (val >> right);
+}
+
+//
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain.
+// See http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp
+// MurmurHash3_x64_128
+//
+// 64-bit finalization mix: two rounds of "xor with the value shifted right by
+// 33, then multiply by a constant", followed by one last xor-shift.
+static inline gdv_uint64 fmix64(gdv_uint64 k) {
+  gdv_uint64 mixed = k;
+  mixed = (mixed ^ (mixed >> 33)) * 0xff51afd7ed558ccduLL;
+  mixed = (mixed ^ (mixed >> 33)) * 0xc4ceb9fe1a85ec53uLL;
+  return mixed ^ (mixed >> 33);
+}
+
+// Hashes a single 64-bit value with a MurmurHash3 x64_128 style mix and returns
+// 64 bits of the result (h1 after the final combination).
+// `seed` initialises both lanes; being a signed 32-bit value it is converted to
+// gdv_uint64 on assignment.
+static inline gdv_uint64 murmur3_64(gdv_uint64 val, gdv_int32 seed) {
+ gdv_uint64 h1 = seed;
+ gdv_uint64 h2 = seed;
+
+ gdv_uint64 c1 = 0x87c37b91114253d5ull;
+ gdv_uint64 c2 = 0x4cf5ad432745937full;
+
+ // the input is always one 8-byte word
+ int length = 8;
+ gdv_uint64 k1 = 0;
+
+ // mix the single word into h1
+ k1 = val;
+ k1 *= c1;
+ k1 = rotate_left(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+
+ // finalization: fold in the length, cross-add the lanes, avalanche each one
+ h1 ^= length;
+ h2 ^= length;
+
+ h1 += h2;
+ h2 += h1;
+
+ h1 = fmix64(h1);
+ h2 = fmix64(h2);
+
+ h1 += h2;
+
+ // h2 += h1;
+ // murmur3_128 should return 128 bit (h1,h2), now we return only 64bits,
+ return h1;
+}
+
+// Hashes a single 64-bit value into 32 bits. The value is consumed as two
+// 32-bit words, low word first. All arithmetic is carried out in 64-bit
+// variables and masked back to 32 bits after every step that can carry.
+static inline gdv_uint32 murmur3_32(gdv_uint64 val, gdv_int32 seed) {
+  gdv_uint64 c1 = 0xcc9e2d51ull;
+  gdv_uint64 c2 = 0x1b873593ull;
+  int length = 8;
+  // A plain constant: the mask never changes, so it must not be a mutable
+  // function-local static.
+  const gdv_uint64 UINT_MASK = 0xffffffffull;
+  gdv_uint64 lh1 = seed & UINT_MASK;
+  for (int i = 0; i < 2; i++) {
+    // i-th 32-bit word of the input
+    gdv_uint64 lk1 = ((val >> i * 32) & UINT_MASK);
+    lk1 *= c1;
+    lk1 &= UINT_MASK;
+
+    // 32-bit rotate left by 15
+    lk1 = ((lk1 << 15) & UINT_MASK) | (lk1 >> 17);
+
+    lk1 *= c2;
+    lk1 &= UINT_MASK;
+
+    lh1 ^= lk1;
+    // 32-bit rotate left by 13
+    lh1 = ((lh1 << 13) & UINT_MASK) | (lh1 >> 19);
+
+    lh1 = lh1 * 5 + 0xe6546b64L;
+    lh1 = UINT_MASK & lh1;
+  }
+  lh1 ^= length;
+
+  // final avalanche
+  lh1 ^= lh1 >> 16;
+  lh1 *= 0x85ebca6bull;
+  lh1 = UINT_MASK & lh1;
+  lh1 ^= lh1 >> 13;
+  lh1 *= 0xc2b2ae35ull;
+  lh1 = UINT_MASK & lh1;
+  lh1 ^= lh1 >> 16;
+
+  return static_cast<gdv_uint32>(lh1);
+}
+
+// Returns the bytes of `value` reinterpreted as a 64-bit unsigned integer.
+static inline gdv_uint64 double_to_long_bits(double value) {
+  gdv_uint64 bits;
+  // copy the object representation instead of casting pointers
+  memcpy(&bits, &value, sizeof(bits));
+  return bits;
+}
+
+// 64-bit hash of a double: the bit pattern of `val` is hashed with murmur3_64.
+// Note that the 64-bit `seed` is narrowed to gdv_int32 before it is used.
+FORCE_INLINE gdv_int64 hash64(double val, gdv_int64 seed) {
+ return murmur3_64(double_to_long_bits(val), static_cast<gdv_int32>(seed));
+}
+
+// 32-bit hash of a double: the bit pattern of `val` is hashed with murmur3_32
+// using `seed`.
+FORCE_INLINE gdv_int32 hash32(double val, gdv_int32 seed) {
+ return murmur3_32(double_to_long_bits(val), seed);
+}
+
+// Wrappers for all the numeric/date/time arrow types
+//
+// Every wrapper converts its input to double before hashing, so all input
+// types share the double-based hash64/hash32 above.
+
+// NAME_TYPE(in, is_valid, seed, seed_isvalid) -> 64-bit hash of `in` using
+// `seed`. An invalid (null) input returns the seed unchanged. seed_isvalid is
+// accepted but not consulted.
+#define HASH64_WITH_SEED_OP(NAME, TYPE) \
+ FORCE_INLINE \
+ gdv_int64 NAME##_##TYPE(gdv_##TYPE in, gdv_boolean is_valid, gdv_int64 seed, \
+ gdv_boolean seed_isvalid) { \
+ if (!is_valid) { \
+ return seed; \
+ } \
+ return hash64(static_cast<double>(in), seed); \
+ }
+
+// NAME_TYPE(in, is_valid, seed, seed_isvalid) -> 32-bit hash of `in` using
+// `seed`. An invalid (null) input returns the seed unchanged. seed_isvalid is
+// accepted but not consulted.
+#define HASH32_WITH_SEED_OP(NAME, TYPE) \
+ FORCE_INLINE \
+ gdv_int32 NAME##_##TYPE(gdv_##TYPE in, gdv_boolean is_valid, gdv_int32 seed, \
+ gdv_boolean seed_isvalid) { \
+ if (!is_valid) { \
+ return seed; \
+ } \
+ return hash32(static_cast<double>(in), seed); \
+ }
+
+// NAME_TYPE(in, is_valid) -> 64-bit hash of `in` with seed 0; an invalid (null)
+// input hashes to 0.
+#define HASH64_OP(NAME, TYPE) \
+ FORCE_INLINE \
+ gdv_int64 NAME##_##TYPE(gdv_##TYPE in, gdv_boolean is_valid) { \
+ return is_valid ? hash64(static_cast<double>(in), 0) : 0; \
+ }
+
+// NAME_TYPE(in, is_valid) -> 32-bit hash of `in` with seed 0; an invalid (null)
+// input hashes to 0.
+#define HASH32_OP(NAME, TYPE) \
+ FORCE_INLINE \
+ gdv_int32 NAME##_##TYPE(gdv_##TYPE in, gdv_boolean is_valid) { \
+ return is_valid ? hash32(static_cast<double>(in), 0) : 0; \
+ }
+
+// Expand inner macro for all numeric types, plus boolean and the date/time
+// types listed below. INNER is invoked as INNER(NAME, TYPE) for each of them.
+#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME) \
+ INNER(NAME, int8) \
+ INNER(NAME, int16) \
+ INNER(NAME, int32) \
+ INNER(NAME, int64) \
+ INNER(NAME, uint8) \
+ INNER(NAME, uint16) \
+ INNER(NAME, uint32) \
+ INNER(NAME, uint64) \
+ INNER(NAME, float32) \
+ INNER(NAME, float64) \
+ INNER(NAME, boolean) \
+ INNER(NAME, date64) \
+ INNER(NAME, date32) \
+ INNER(NAME, time32) \
+ INNER(NAME, timestamp)
+
+// 32-bit hashes: `hash`, `hash32` and `hash32AsDouble` expand the same macro
+// and therefore behave identically; likewise the two seeded variants.
+NUMERIC_BOOL_DATE_TYPES(HASH32_OP, hash)
+NUMERIC_BOOL_DATE_TYPES(HASH32_OP, hash32)
+NUMERIC_BOOL_DATE_TYPES(HASH32_OP, hash32AsDouble)
+NUMERIC_BOOL_DATE_TYPES(HASH32_WITH_SEED_OP, hash32WithSeed)
+NUMERIC_BOOL_DATE_TYPES(HASH32_WITH_SEED_OP, hash32AsDoubleWithSeed)
+
+// 64-bit hashes: `hash64` and `hash64AsDouble` expand the same macro; likewise
+// the two seeded variants.
+NUMERIC_BOOL_DATE_TYPES(HASH64_OP, hash64)
+NUMERIC_BOOL_DATE_TYPES(HASH64_OP, hash64AsDouble)
+NUMERIC_BOOL_DATE_TYPES(HASH64_WITH_SEED_OP, hash64WithSeed)
+NUMERIC_BOOL_DATE_TYPES(HASH64_WITH_SEED_OP, hash64AsDoubleWithSeed)
+
+#undef NUMERIC_BOOL_DATE_TYPES
+
+// Murmur3-style hash of the `len` bytes at `key`, seeded with `seed`.
+// Two 64-bit lanes (h1, h2) both start from the seed and are updated from each full
+// 16-byte block; the remaining 1..15 bytes are folded in by the switch below. After
+// the final mixing only h1 is returned.
+// NOTE(review): the block loop reads `key` through a gdv_uint64 pointer, so it
+// presumably relies on the target tolerating unaligned 64-bit loads - confirm.
+static inline gdv_uint64 murmur3_64_buf(const gdv_uint8* key, gdv_int32 len,
+ gdv_int32 seed) {
+ gdv_uint64 h1 = seed;
+ gdv_uint64 h2 = seed;
+ gdv_uint64 c1 = 0x87c37b91114253d5ull;
+ gdv_uint64 c2 = 0x4cf5ad432745937full;
+
+ // body: consume the input 16 bytes (two 64-bit words) at a time
+ const gdv_uint64* blocks = reinterpret_cast<const gdv_uint64*>(key);
+ int nblocks = len / 16;
+ for (int i = 0; i < nblocks; i++) {
+ gdv_uint64 k1 = blocks[i * 2 + 0];
+ gdv_uint64 k2 = blocks[i * 2 + 1];
+
+ // mix the first word into h1
+ k1 *= c1;
+ k1 = rotate_left(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+ h1 = rotate_left(h1, 27);
+ h1 += h2;
+ h1 = h1 * 5 + 0x52dce729;
+ // mix the second word into h2
+ k2 *= c2;
+ k2 = rotate_left(k2, 33);
+ k2 *= c1;
+ h2 ^= k2;
+ h2 = rotate_left(h2, 31);
+ h2 += h1;
+ h2 = h2 * 5 + 0x38495ab5;
+ }
+
+ // tail
+ gdv_uint64 k1 = 0;
+ gdv_uint64 k2 = 0;
+
+ const gdv_uint8* tail = reinterpret_cast<const gdv_uint8*>(key + nblocks * 16);
+ // Every case falls through to the next one on purpose: entering at case N picks up
+ // tail bytes N-1 down to 0. Bytes 8..14 are assembled into k2 (mixed into h2 at
+ // case 9), bytes 0..7 into k1 (mixed into h1 at case 1).
+ switch (len & 15) {
+ case 15:
+ k2 = static_cast<gdv_uint64>(tail[14]) << 48;
+ case 14:
+ k2 ^= static_cast<gdv_uint64>(tail[13]) << 40;
+ case 13:
+ k2 ^= static_cast<gdv_uint64>(tail[12]) << 32;
+ case 12:
+ k2 ^= static_cast<gdv_uint64>(tail[11]) << 24;
+ case 11:
+ k2 ^= static_cast<gdv_uint64>(tail[10]) << 16;
+ case 10:
+ k2 ^= static_cast<gdv_uint64>(tail[9]) << 8;
+ case 9:
+ k2 ^= static_cast<gdv_uint64>(tail[8]);
+ k2 *= c2;
+ k2 = rotate_left(k2, 33);
+ k2 *= c1;
+ h2 ^= k2;
+ case 8:
+ k1 ^= static_cast<gdv_uint64>(tail[7]) << 56;
+ case 7:
+ k1 ^= static_cast<gdv_uint64>(tail[6]) << 48;
+ case 6:
+ k1 ^= static_cast<gdv_uint64>(tail[5]) << 40;
+ case 5:
+ k1 ^= static_cast<gdv_uint64>(tail[4]) << 32;
+ case 4:
+ k1 ^= static_cast<gdv_uint64>(tail[3]) << 24;
+ case 3:
+ k1 ^= static_cast<gdv_uint64>(tail[2]) << 16;
+ case 2:
+ k1 ^= static_cast<gdv_uint64>(tail[1]) << 8;
+ case 1:
+ k1 ^= static_cast<gdv_uint64>(tail[0]) << 0;
+ k1 *= c1;
+ k1 = rotate_left(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+ }
+
+ // finalization: fold in the length, cross-mix the lanes, then run fmix64 on each
+ h1 ^= len;
+ h2 ^= len;
+
+ h1 += h2;
+ h2 += h1;
+
+ h1 = fmix64(h1);
+ h2 = fmix64(h2);
+
+ h1 += h2;
+ // h2 += h1;
+ // returning 64-bits of the 128-bit hash.
+ return h1;
+}
+
+// Murmur3-style 32-bit hash of the `len` bytes at `key`, seeded with `seed`.
+// All arithmetic is carried out in 64-bit variables and masked back to the low 32
+// bits with UINT_MASK after every multiply/add; the shift-and-or expressions are
+// 32-bit left rotations written out by hand.
+// NOTE(review): the block loop reads `key` through a gdv_uint32 pointer, so it
+// presumably relies on the target tolerating unaligned 32-bit loads - confirm.
+static gdv_uint32 murmur3_32_buf(const gdv_uint8* key, gdv_int32 len, gdv_int32 seed) {
+ static const gdv_uint64 c1 = 0xcc9e2d51ull;
+ static const gdv_uint64 c2 = 0x1b873593ull;
+ static const gdv_uint64 UINT_MASK = 0xffffffffull;
+ gdv_uint64 lh1 = seed;
+ const gdv_uint32* blocks = reinterpret_cast<const gdv_uint32*>(key);
+ int nblocks = len / 4;
+ const gdv_uint8* tail = reinterpret_cast<const gdv_uint8*>(key + nblocks * 4);
+ // body: consume the input one 32-bit word at a time
+ for (int i = 0; i < nblocks; i++) {
+ gdv_uint64 lk1 = static_cast<gdv_uint64>(blocks[i]);
+
+ // k1 *= c1;
+ lk1 *= c1;
+ lk1 &= UINT_MASK;
+
+ // rotate left by 15 within 32 bits
+ lk1 = ((lk1 << 15) & UINT_MASK) | (lk1 >> 17);
+
+ lk1 *= c2;
+ lk1 = lk1 & UINT_MASK;
+ lh1 ^= lk1;
+ // rotate left by 13 within 32 bits
+ lh1 = ((lh1 << 13) & UINT_MASK) | (lh1 >> 19);
+
+ lh1 = lh1 * 5 + 0xe6546b64ull;
+ lh1 = UINT_MASK & lh1;
+ }
+
+ // tail
+ gdv_uint64 lk1 = 0;
+
+ // Each case falls through on purpose so that the 1..3 leftover bytes are gathered
+ // into lk1 before it is mixed into lh1 at case 1.
+ switch (len & 3) {
+ case 3:
+ lk1 = (tail[2] & 0xff) << 16;
+ case 2:
+ lk1 |= (tail[1] & 0xff) << 8;
+ case 1:
+ lk1 |= (tail[0] & 0xff);
+ lk1 *= c1;
+ lk1 = UINT_MASK & lk1;
+ lk1 = ((lk1 << 15) & UINT_MASK) | (lk1 >> 17);
+
+ lk1 *= c2;
+ lk1 = lk1 & UINT_MASK;
+
+ lh1 ^= lk1;
+ }
+
+ // finalization
+ lh1 ^= len;
+
+ lh1 ^= lh1 >> 16;
+ lh1 *= 0x85ebca6b;
+ lh1 = UINT_MASK & lh1;
+ lh1 ^= lh1 >> 13;
+
+ lh1 *= 0xc2b2ae35;
+ lh1 = UINT_MASK & lh1;
+ lh1 ^= lh1 >> 16;
+
+ return static_cast<gdv_uint32>(lh1 & UINT_MASK);
+}
+
+// 64-bit hash of a byte buffer: forwards to murmur3_64_buf with the seed narrowed to
+// gdv_int32.
+FORCE_INLINE gdv_int64 hash64_buf(const gdv_uint8* buf, int len, gdv_int64 seed) {
+  const gdv_int32 narrowed_seed = static_cast<gdv_int32>(seed);
+  return murmur3_64_buf(buf, len, narrowed_seed);
+}
+
+// 32-bit hash of a byte buffer: forwards to murmur3_32_buf and returns its result as
+// a gdv_int32.
+FORCE_INLINE gdv_int32 hash32_buf(const gdv_uint8* buf, int len, gdv_int32 seed) {
+  const gdv_uint32 digest = murmur3_32_buf(buf, len, seed);
+  return digest;
+}
+
+// Wrappers for the varlen types
+
+// Defines NAME_TYPE(in, len, is_valid, seed, seed_isvalid): the seeded 64-bit hash of
+// the `len` bytes at `in`. A null input returns the seed unchanged. seed_isvalid is
+// accepted but never read.
+#define HASH64_BUF_WITH_SEED_OP(NAME, TYPE) \
+  FORCE_INLINE \
+  gdv_int64 NAME##_##TYPE(gdv_##TYPE in, gdv_int32 len, gdv_boolean is_valid, \
+                          gdv_int64 seed, gdv_boolean seed_isvalid) { \
+    return is_valid ? hash64_buf(reinterpret_cast<const uint8_t*>(in), len, seed) \
+                    : seed; \
+  }
+
+// Defines NAME_TYPE(in, len, is_valid, seed, seed_isvalid): the seeded 32-bit hash of
+// the `len` bytes at `in`. A null input returns the seed unchanged. seed_isvalid is
+// accepted but never read.
+#define HASH32_BUF_WITH_SEED_OP(NAME, TYPE) \
+  FORCE_INLINE \
+  gdv_int32 NAME##_##TYPE(gdv_##TYPE in, gdv_int32 len, gdv_boolean is_valid, \
+                          gdv_int32 seed, gdv_boolean seed_isvalid) { \
+    return is_valid ? hash32_buf(reinterpret_cast<const uint8_t*>(in), len, seed) \
+                    : seed; \
+  }
+
+// Defines NAME_TYPE(in, len, is_valid): the 64-bit hash of the `len` bytes at `in`
+// with seed 0. A null input hashes to 0.
+#define HASH64_BUF_OP(NAME, TYPE) \
+  FORCE_INLINE \
+  gdv_int64 NAME##_##TYPE(gdv_##TYPE in, gdv_int32 len, gdv_boolean is_valid) { \
+    if (!is_valid) { \
+      return 0; \
+    } \
+    return hash64_buf(reinterpret_cast<const uint8_t*>(in), len, 0); \
+  }
+
+// Defines NAME_TYPE(in, len, is_valid): the 32-bit hash of the `len` bytes at `in`
+// with seed 0. A null input hashes to 0.
+#define HASH32_BUF_OP(NAME, TYPE) \
+  FORCE_INLINE \
+  gdv_int32 NAME##_##TYPE(gdv_##TYPE in, gdv_int32 len, gdv_boolean is_valid) { \
+    if (!is_valid) { \
+      return 0; \
+    } \
+    return hash32_buf(reinterpret_cast<const uint8_t*>(in), len, 0); \
+  }
+
+// Expand INNER(NAME, TYPE) once for each variable-length type: utf8 and binary.
+#define VAR_LEN_TYPES(INNER, NAME) \
+ INNER(NAME, utf8) \
+ INNER(NAME, binary)
+
+// 32-bit buffer hashes for utf8 and binary. hash, hash32 and hash32AsDouble are all
+// generated from HASH32_BUF_OP, so they differ only in name.
+VAR_LEN_TYPES(HASH32_BUF_OP, hash)
+VAR_LEN_TYPES(HASH32_BUF_OP, hash32)
+VAR_LEN_TYPES(HASH32_BUF_OP, hash32AsDouble)
+VAR_LEN_TYPES(HASH32_BUF_WITH_SEED_OP, hash32WithSeed)
+VAR_LEN_TYPES(HASH32_BUF_WITH_SEED_OP, hash32AsDoubleWithSeed)
+
+// 64-bit buffer hashes, generated the same way.
+VAR_LEN_TYPES(HASH64_BUF_OP, hash64)
+VAR_LEN_TYPES(HASH64_BUF_OP, hash64AsDouble)
+VAR_LEN_TYPES(HASH64_BUF_WITH_SEED_OP, hash64WithSeed)
+VAR_LEN_TYPES(HASH64_BUF_WITH_SEED_OP, hash64AsDoubleWithSeed)
+
+// Drop the generator macros now that every wrapper has been instantiated.
+#undef HASH32_BUF_OP
+#undef HASH32_BUF_WITH_SEED_OP
+#undef HASH32_OP
+#undef HASH32_WITH_SEED_OP
+#undef HASH64_BUF_OP
+#undef HASH64_BUF_WITH_SEED_OP
+#undef HASH64_OP
+#undef HASH64_WITH_SEED_OP
+
+} // extern "C"
diff --git a/src/arrow/cpp/src/gandiva/precompiled/hash_test.cc b/src/arrow/cpp/src/gandiva/precompiled/hash_test.cc
new file mode 100644
index 000000000..0a51dced2
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/hash_test.cc
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <time.h>
+
+#include <gtest/gtest.h>
+#include "gandiva/precompiled/types.h"
+
+namespace gandiva {
+
+// The 32-bit hash of zero must be non-zero, must not depend on which numeric type
+// carries the value, and must change when the seed changes.
+TEST(TestHash, TestHash32) {
+  gdv_int8 i8 = 0;
+  gdv_int16 i16 = 0;
+  gdv_int32 i32 = 0;
+  gdv_int64 i64 = 0;
+  gdv_uint8 ui8 = 0;
+  gdv_uint16 ui16 = 0;
+  gdv_uint32 ui32 = 0;
+  gdv_uint64 ui64 = 0;
+  gdv_float32 flt32 = 0;
+  gdv_float64 flt64 = 0;
+
+  // zero is reserved as the hash of a null, so a real value must not produce it.
+  gdv_int32 unseeded = hash32(i8, 0);
+  EXPECT_NE(unseeded, 0);
+
+  // same value, seed 0: every numeric type agrees.
+  EXPECT_EQ(hash32(ui8, 0), unseeded);
+  EXPECT_EQ(hash32(i16, 0), unseeded);
+  EXPECT_EQ(hash32(ui16, 0), unseeded);
+  EXPECT_EQ(hash32(i32, 0), unseeded);
+  EXPECT_EQ(hash32(ui32, 0), unseeded);
+  EXPECT_EQ(hash32(static_cast<double>(i64), 0), unseeded);
+  EXPECT_EQ(hash32(static_cast<double>(ui64), 0), unseeded);
+  EXPECT_EQ(hash32(flt32, 0), unseeded);
+  EXPECT_EQ(hash32(flt64, 0), unseeded);
+
+  // a different seed gives a different hash.
+  EXPECT_NE(hash32(i8, 1), unseeded);
+
+  // same value, seed 1: every numeric type still agrees.
+  EXPECT_EQ(hash32(i8, 1), hash32(i16, 1));
+  EXPECT_EQ(hash32(i8, 1), hash32(ui32, 1));
+  EXPECT_EQ(hash32(i8, 1), hash32(flt32, 1));
+  EXPECT_EQ(hash32(i8, 1), hash32(flt64, 1));
+}
+
+// The 64-bit hash of zero must be non-zero, must differ from the 32-bit hash, must
+// not depend on which numeric type carries the value, and must change with the seed.
+TEST(TestHash, TestHash64) {
+  gdv_int8 s8 = 0;
+  gdv_uint8 u8 = 0;
+  gdv_int16 s16 = 0;
+  gdv_uint16 u16 = 0;
+  gdv_int32 s32 = 0;
+  gdv_uint32 u32 = 0;
+  gdv_int64 s64 = 0;
+  gdv_uint64 u64 = 0;
+  gdv_float32 f32 = 0;
+  gdv_float64 f64 = 0;
+
+  // hash of 0 should be non-zero (zero is the hash value for nulls).
+  gdv_int64 zero_hash = hash64(s8, 0);
+  EXPECT_NE(zero_hash, 0);
+  EXPECT_NE(hash64(u8, 0), hash32(u8, 0));
+
+  // for a given value, all numeric types must have the same hash.
+  EXPECT_EQ(hash64(u8, 0), zero_hash);
+  EXPECT_EQ(hash64(s16, 0), zero_hash);
+  EXPECT_EQ(hash64(u16, 0), zero_hash);
+  EXPECT_EQ(hash64(s32, 0), zero_hash);
+  EXPECT_EQ(hash64(u32, 0), zero_hash);
+  EXPECT_EQ(hash64(static_cast<double>(s64), 0), zero_hash);
+  EXPECT_EQ(hash64(static_cast<double>(u64), 0), zero_hash);
+  EXPECT_EQ(hash64(f32, 0), zero_hash);
+  EXPECT_EQ(hash64(f64, 0), zero_hash);
+
+  // hash must change with a change in seed.
+  EXPECT_NE(hash64(s8, 1), zero_hash);
+
+  // for a given value and seed, all numeric types must have the same hash.
+  EXPECT_EQ(hash64(s8, 1), hash64(s16, 1));
+  EXPECT_EQ(hash64(s8, 1), hash64(u32, 1));
+  EXPECT_EQ(hash64(s8, 1), hash64(f32, 1));
+  // float64 is covered too, mirroring the seeded checks in TestHash32.
+  EXPECT_EQ(hash64(s8, 1), hash64(f64, 1));
+}
+
+// Buffer hashes must be non-zero and must react to both the hashed length and the
+// seed.
+TEST(TestHash, TestHashBuf) {
+  const char* text = "hello";
+  const gdv_uint8* bytes = (const gdv_uint8*)text;
+  int full_len = 5;
+  int shorter_len = full_len - 1;
+
+  // zero is reserved as the hash of a null, so a real value must not produce it.
+  EXPECT_NE(hash32_buf(bytes, full_len, 0), 0);
+  EXPECT_NE(hash64_buf(bytes, full_len, 0), 0);
+
+  // hashing one byte less must give a different result.
+  EXPECT_NE(hash32_buf(bytes, full_len, 0), hash32_buf(bytes, shorter_len, 0));
+  EXPECT_NE(hash64_buf(bytes, full_len, 0), hash64_buf(bytes, shorter_len, 0));
+
+  // a different seed must give a different result.
+  EXPECT_NE(hash32_buf(bytes, full_len, 0), hash32_buf(bytes, full_len, 1));
+  EXPECT_NE(hash64_buf(bytes, full_len, 0), hash64_buf(bytes, full_len, 1));
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/precompiled/print.cc b/src/arrow/cpp/src/gandiva/precompiled/print.cc
new file mode 100644
index 000000000..ecb90e1a3
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/print.cc
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern "C" {
+
+#include <stdio.h>
+
+#include "./types.h"
+
+// Prints `val` with printf, using `msg` as the format string, and returns printf's
+// result.
+int print_double(char* msg, double val) {
+  const int written = printf(msg, val);
+  return written;
+}
+
+// Prints `val` with printf, using `msg` as the format string, and returns printf's
+// result.
+int print_float(char* msg, float val) {
+  const int written = printf(msg, val);
+  return written;
+}
+
+} // extern "C"
diff --git a/src/arrow/cpp/src/gandiva/precompiled/string_ops.cc b/src/arrow/cpp/src/gandiva/precompiled/string_ops.cc
new file mode 100644
index 000000000..48c24b862
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/string_ops.cc
@@ -0,0 +1,2198 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// String functions
+#include "arrow/util/value_parsing.h"
+
+extern "C" {
+
+#include <algorithm>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "./types.h"
+
+// Length of a utf8 value in bytes. The data pointer is not inspected.
+FORCE_INLINE
+gdv_int32 octet_length_utf8(const gdv_utf8 input, gdv_int32 length) {
+  const gdv_int32 num_bytes = length;
+  return num_bytes;
+}
+
+// Length of a utf8 value in bits (8 per byte). The data pointer is not inspected.
+FORCE_INLINE
+gdv_int32 bit_length_utf8(const gdv_utf8 input, gdv_int32 length) {
+  const gdv_int32 num_bits = length * 8;
+  return num_bits;
+}
+
+// Length of a binary value in bytes. The data pointer is not inspected.
+FORCE_INLINE
+gdv_int32 octet_length_binary(const gdv_binary input, gdv_int32 length) {
+  const gdv_int32 num_bytes = length;
+  return num_bytes;
+}
+
+// Length of a binary value in bits (8 per byte). The data pointer is not inspected.
+FORCE_INLINE
+gdv_int32 bit_length_binary(const gdv_binary input, gdv_int32 length) {
+  const gdv_int32 num_bits = length * 8;
+  return num_bits;
+}
+
+// Searches `input` for the first occurrence of `delim` at or after byte offset
+// start_pos. Returns the offset just past the match, or -1 when there is none.
+FORCE_INLINE
+int match_string(const char* input, gdv_int32 input_len, gdv_int32 start_pos,
+                 const char* delim, gdv_int32 delim_len) {
+  int pos = start_pos;
+  while (pos < input_len) {
+    const int bytes_left = input_len - pos;
+    const bool delim_fits = bytes_left >= delim_len;
+    if (delim_fits && memcmp(input + pos, delim, delim_len) == 0) {
+      return pos + delim_len;
+    }
+    ++pos;
+  }
+
+  return -1;
+}
+
+// Three-way comparison of two byte ranges: the common prefix is compared with memcmp
+// and, when it is equal, the result is the difference of the lengths.
+FORCE_INLINE
+gdv_int32 mem_compare(const char* left, gdv_int32 left_len, const char* right,
+                      gdv_int32 right_len) {
+  const int common_len = (right_len < left_len) ? right_len : left_len;
+  const int prefix_cmp = memcmp(left, right, common_len);
+  return (prefix_cmp != 0) ? prefix_cmp : left_len - right_len;
+}
+
+// Expand INNER(NAME, TYPE, OP) once for each variable-length type: utf8 and binary.
+#define VAR_LEN_OP_TYPES(INNER, NAME, OP) \
+ INNER(NAME, utf8, OP) \
+ INNER(NAME, binary, OP)
+
+// Relational binary fns : left, right params are same, return is bool.
+// Defines NAME_TYPE_TYPE(left, left_len, right, right_len), which applies OP to the
+// three-way result of mem_compare and 0.
+#define BINARY_RELATIONAL(NAME, TYPE, OP) \
+ FORCE_INLINE \
+ bool NAME##_##TYPE##_##TYPE(const gdv_##TYPE left, gdv_int32 left_len, \
+ const gdv_##TYPE right, gdv_int32 right_len) { \
+ return mem_compare(left, left_len, right, right_len) OP 0; \
+ }
+
+// One comparison function per operator, for both utf8 and binary.
+VAR_LEN_OP_TYPES(BINARY_RELATIONAL, equal, ==)
+VAR_LEN_OP_TYPES(BINARY_RELATIONAL, not_equal, !=)
+VAR_LEN_OP_TYPES(BINARY_RELATIONAL, less_than, <)
+VAR_LEN_OP_TYPES(BINARY_RELATIONAL, less_than_or_equal_to, <=)
+VAR_LEN_OP_TYPES(BINARY_RELATIONAL, greater_than, >)
+VAR_LEN_OP_TYPES(BINARY_RELATIONAL, greater_than_or_equal_to, >=)
+
+#undef BINARY_RELATIONAL
+#undef VAR_LEN_OP_TYPES
+
+// Expand INNER(NAME, TYPE) once for each variable-length type: utf8 and binary.
+#define VAR_LEN_TYPES(INNER, NAME) \
+ INNER(NAME, utf8) \
+ INNER(NAME, binary)
+
+// Converts one hexadecimal digit to its numeric value. 'A'-'F' and 'a'-'f' map to
+// 10-15; every other character is returned as (ch - '0') without validation.
+FORCE_INLINE
+int to_binary_from_hex(char ch) {
+  const bool is_upper_hex = (ch >= 'A' && ch <= 'F');
+  if (is_upper_hex) {
+    return 10 + (ch - 'A');
+  }
+  const bool is_lower_hex = (ch >= 'a' && ch <= 'f');
+  if (is_lower_hex) {
+    return 10 + (ch - 'a');
+  }
+  return ch - '0';
+}
+
+// True when the first prefix_len bytes of `data` equal `prefix`.
+FORCE_INLINE
+bool starts_with_utf8_utf8(const char* data, gdv_int32 data_len, const char* prefix,
+                           gdv_int32 prefix_len) {
+  if (data_len < prefix_len) {
+    return false;
+  }
+  return memcmp(data, prefix, prefix_len) == 0;
+}
+
+// True when the last suffix_len bytes of `data` equal `suffix`.
+FORCE_INLINE
+bool ends_with_utf8_utf8(const char* data, gdv_int32 data_len, const char* suffix,
+                         gdv_int32 suffix_len) {
+  if (data_len < suffix_len) {
+    return false;
+  }
+  const char* data_tail = data + data_len - suffix_len;
+  return memcmp(data_tail, suffix, suffix_len) == 0;
+}
+
+// True when the substr_len bytes at `substr` occur at some byte offset inside `data`.
+FORCE_INLINE
+bool is_substr_utf8_utf8(const char* data, int32_t data_len, const char* substr,
+                         int32_t substr_len) {
+  const int32_t last_start = data_len - substr_len;
+  int32_t offset = 0;
+  while (offset <= last_start) {
+    if (memcmp(data + offset, substr, substr_len) == 0) {
+      return true;
+    }
+    ++offset;
+  }
+  return false;
+}
+
+// Byte length of the utf8 glyph whose first byte is `c`: 1 for 0x00-0x7F, then 2, 3 or
+// 4 according to the head-byte bit pattern. Returns 0 for any other byte (including
+// continuation bytes).
+FORCE_INLINE
+gdv_int32 utf8_char_length(char c) {
+  const bool is_single_byte = static_cast<signed char>(c) >= 0;  // 0x00 ~ 0x7F
+  if (is_single_byte) {
+    return 1;
+  }
+  if ((c & 0xE0) == 0xC0) {  // 110xxxxx
+    return 2;
+  }
+  if ((c & 0xF0) == 0xE0) {  // 1110xxxx
+    return 3;
+  }
+  if ((c & 0xF8) == 0xF0) {  // 11110xxx
+    return 4;
+  }
+  // not a valid head byte
+  return 0;
+}
+
+FORCE_INLINE
+void set_error_for_invalid_utf(int64_t execution_context, char val) {
+ char const* fmt = "unexpected byte \\%02hhx encountered while decoding utf8 string";
+ int size = static_cast<int>(strlen(fmt)) + 64;
+ char* error = reinterpret_cast<char*>(malloc(size));
+ snprintf(error, size, fmt, (unsigned char)val);
+ gdv_fn_context_set_error_msg(execution_context, error);
+ free(error);
+}
+
+// Returns true when every byte data[char_index + 1] .. data[char_index + data_len - 1]
+// has the utf8 continuation form 10xxxxxx. Note that data_len bounds the offset from
+// char_index, not the absolute index into `data`.
+FORCE_INLINE
+bool validate_utf8_following_bytes(const char* data, int32_t data_len,
+                                   int32_t char_index) {
+  int offset = 1;
+  while (offset < data_len) {
+    const bool is_continuation = (data[char_index + offset] & 0xC0) == 0x80;
+    if (!is_continuation) {
+      return false;
+    }
+    ++offset;
+  }
+  return true;
+}
+
+// Counts the utf8 characters in `data`. On an invalid head byte, a glyph that runs
+// past the end of the input, or a malformed continuation byte, an error is set on the
+// context and 0 is returned.
+FORCE_INLINE
+gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len) {
+  int num_chars = 0;
+  int pos = 0;
+  while (pos < data_len) {
+    const int glyph_len = utf8_char_length(data[pos]);
+    if (glyph_len == 0 || pos + glyph_len > data_len) {
+      // invalid head byte or incomplete glyph
+      set_error_for_invalid_utf(context, data[pos]);
+      return 0;
+    }
+    for (int k = 1; k < glyph_len; ++k) {
+      const char follower = data[pos + k];
+      if ((follower & 0xC0) != 0x80) {  // must look like 10xxxxxx
+        set_error_for_invalid_utf(context, follower);
+        return 0;
+      }
+    }
+    pos += glyph_len;
+    ++num_chars;
+  }
+  return num_chars;
+}
+
+// Counts the utf8 characters in `data` without reporting errors. A head byte that is
+// invalid, or whose glyph would run past the end of the input, is counted as a
+// one-byte character. When an expected continuation byte is missing, the current glyph
+// is widened by one byte and scanning continues.
+FORCE_INLINE
+gdv_int32 utf8_length_ignore_invalid(const char* data, gdv_int32 data_len) {
+  int char_len = 0;
+  int count = 0;
+  for (int i = 0; i < data_len; i += char_len) {
+    char_len = utf8_char_length(data[i]);
+    if (char_len == 0 || i + char_len > data_len) {
+      // invalid byte or incomplete glyph: treat it as a single-byte character
+      char_len = 1;
+    }
+    // char_len can grow inside this loop, so the read index is bounded by data_len as
+    // well; without that bound data[i + j] could be read past the end of the input.
+    for (int j = 1; j < char_len && i + j < data_len; ++j) {
+      if ((data[i + j] & 0xC0) != 0x80) {  // bytes following head-byte of glyph
+        char_len += 1;
+      }
+    }
+    ++count;
+  }
+  return count;
+}
+
+// Converts a character position into a byte offset by walking `str` glyph by glyph.
+// Stops after char_pos glyphs or at the end of the input, whichever comes first.
+// Sets an error and returns -1 on an invalid head byte or an incomplete glyph.
+FORCE_INLINE
+gdv_int32 utf8_byte_pos(gdv_int64 context, const char* str, gdv_int32 str_len,
+                        gdv_int32 char_pos) {
+  gdv_int32 chars_seen = 0;
+  int offset = 0;
+  while (chars_seen < char_pos && offset < str_len) {
+    const int glyph_len = utf8_char_length(str[offset]);
+    const bool malformed = (glyph_len == 0) || (offset + glyph_len > str_len);
+    if (malformed) {
+      set_error_for_invalid_utf(context, str[offset]);
+      return -1;
+    }
+    offset += glyph_len;
+    ++chars_seen;
+  }
+  return offset;
+}
+
+// Defines NAME_TYPE(context, in, in_len) as a thin wrapper around utf8_length, so the
+// generated functions count utf8 characters and report invalid input through the
+// context.
+#define UTF8_LENGTH(NAME, TYPE) \
+ FORCE_INLINE \
+ gdv_int32 NAME##_##TYPE(gdv_int64 context, gdv_##TYPE in, gdv_int32 in_len) { \
+ return utf8_length(context, in, in_len); \
+ }
+
+// char_length_utf8, length_utf8 and lengthUtf8_binary all share that implementation.
+UTF8_LENGTH(char_length, utf8)
+UTF8_LENGTH(length, utf8)
+UTF8_LENGTH(lengthUtf8, binary)
+
+// Defines space_<IN_TYPE>(ctx, n, out_len), which returns a string of 'n' spaces
+// allocated from the context arena. `n` is first converted to gdv_int32; a
+// non-positive count yields "" with *out_len = 0. If the arena allocation fails an
+// error is set on the context and "" is returned with *out_len = 0.
+#define SPACE_STR(IN_TYPE) \
+  GANDIVA_EXPORT \
+  const char* space_##IN_TYPE(gdv_int64 ctx, gdv_##IN_TYPE n, int32_t* out_len) { \
+    const gdv_int32 count = static_cast<gdv_int32>(n); \
+    *out_len = 0; \
+    if (count <= 0) { \
+      return ""; \
+    } \
+    char* spaces = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(ctx, count)); \
+    if (spaces == nullptr) { \
+      gdv_fn_context_set_error_msg(ctx, "Could not allocate memory for output string"); \
+      return ""; \
+    } \
+    memset(spaces, ' ', count); \
+    *out_len = count; \
+    return spaces; \
+  }
+
+SPACE_STR(int32)
+SPACE_STR(int64)
+
+// Reverses a utf8 sequence glyph by glyph, so the bytes of each multi-byte character
+// keep their order. The result lives in the context arena. An empty input, an
+// allocation failure or malformed utf8 yields "" with *out_len = 0 (the latter two
+// also set an error on the context).
+FORCE_INLINE
+const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+                         int32_t* out_len) {
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  char* reversed =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, data_len));
+  if (reversed == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  gdv_int32 pos = 0;
+  while (pos < data_len) {
+    const gdv_int32 glyph_len = utf8_char_length(data[pos]);
+    if (glyph_len == 0 || pos + glyph_len > data_len) {
+      // invalid head byte or incomplete glyph
+      set_error_for_invalid_utf(context, data[pos]);
+      *out_len = 0;
+      return "";
+    }
+
+    // every byte after the head byte must look like 10xxxxxx
+    for (gdv_int32 k = 1; k < glyph_len; ++k) {
+      if ((data[pos + k] & 0xC0) != 0x80) {
+        set_error_for_invalid_utf(context, data[pos + k]);
+        *out_len = 0;
+        return "";
+      }
+    }
+
+    // the glyph that starts at `pos` ends up mirrored at the other end of the output
+    memcpy(reversed + (data_len - pos - glyph_len), data + pos, glyph_len);
+    pos += glyph_len;
+  }
+  *out_len = data_len;
+  return reversed;
+}
+
+// Removes leading ' ' characters. No copy is made: the result points into `data`.
+FORCE_INLINE
+const char* ltrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+                       int32_t* out_len) {
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // count how many leading bytes are spaces
+  gdv_int32 skipped = 0;
+  for (; skipped < data_len; ++skipped) {
+    if (data[skipped] != ' ') {
+      break;
+    }
+  }
+
+  *out_len = data_len - skipped;
+  return data + skipped;
+}
+
+// Removes trailing ' ' characters. No copy is made: `data` is returned and only the
+// reported length shrinks.
+FORCE_INLINE
+const char* rtrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+                       int32_t* out_len) {
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // `kept` is the number of leading bytes that survive the trim
+  gdv_int32 kept = data_len;
+  while (kept > 0 && data[kept - 1] == ' ') {
+    --kept;
+  }
+
+  *out_len = kept;
+  return data;
+}
+
+// Removes ' ' characters from both ends. No copy is made: the result points into
+// `data`.
+FORCE_INLINE
+const char* btrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+                       int32_t* out_len) {
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // [first, past_last) is the half-open range of bytes that survive the trim
+  gdv_int32 first = 0;
+  gdv_int32 past_last = data_len;
+  while (first < past_last && data[first] == ' ') {
+    ++first;
+  }
+  while (past_last > first && data[past_last - 1] == ' ') {
+    --past_last;
+  }
+
+  *out_len = past_last - first;
+  return data + first;
+}
+
+// Removes, from the left end of the base text, every leading glyph whose bytes occur
+// in the trim text. The result points into `basetext`. Malformed utf8 in the base text
+// sets an error on the context and yields "" with *out_len = 0.
+FORCE_INLINE
+const char* ltrim_utf8_utf8(gdv_int64 context, const char* basetext,
+                            gdv_int32 basetext_len, const char* trimtext,
+                            gdv_int32 trimtext_len, int32_t* out_len) {
+  if (basetext_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+  if (trimtext_len == 0) {
+    *out_len = basetext_len;
+    return basetext;
+  }
+
+  // advance glyph by glyph until one is found that is not part of the trim text
+  gdv_int32 kept_from = 0;
+  while (kept_from < basetext_len) {
+    const gdv_int32 glyph_len = utf8_char_length(basetext[kept_from]);
+    if (glyph_len == 0 || kept_from + glyph_len > basetext_len) {
+      // invalid byte or incomplete glyph
+      set_error_for_invalid_utf(context, basetext[kept_from]);
+      *out_len = 0;
+      return "";
+    }
+    const bool trimmable =
+        is_substr_utf8_utf8(trimtext, trimtext_len, basetext + kept_from, glyph_len);
+    if (!trimmable) {
+      break;
+    }
+    kept_from += glyph_len;
+  }
+
+  *out_len = basetext_len - kept_from;
+  return basetext + kept_from;
+}
+
+// Trims characters present in the trim text from the right end of the base text.
+// A glyph is trimmed when its bytes occur as a byte substring of the trim text.
+// Returns basetext itself and reports the trimmed byte length through out_len.
+// Returns "" with *out_len = 0 when the base text is empty, when the scan reaches the
+// start of the text without finding a glyph to keep, or when an invalid glyph is
+// found (in which case an error is also set on the context).
+FORCE_INLINE
+const char* rtrim_utf8_utf8(gdv_int64 context, const char* basetext,
+ gdv_int32 basetext_len, const char* trimtext,
+ gdv_int32 trimtext_len, int32_t* out_len) {
+ if (basetext_len == 0) {
+ *out_len = 0;
+ return "";
+ } else if (trimtext_len == 0) {
+ *out_len = basetext_len;
+ return basetext;
+ }
+
+ // byte_cnt counts the bytes seen so far for the glyph currently being walked
+ // backwards (continuation bytes first, head byte last).
+ gdv_int32 char_len, end_ptr, byte_cnt = 1;
+ // scan the base text from right to left and decrement the end pointer till
+ // there is a character which is not present in the trim text
+ for (end_ptr = basetext_len - 1; end_ptr >= 0; --end_ptr) {
+ char_len = utf8_char_length(basetext[end_ptr]);
+ if (char_len == 0) { // trailing bytes of multibyte character
+ ++byte_cnt;
+ continue;
+ }
+ // this is the first byte of a character, hence check if char_len = byte_cnt
+ if (byte_cnt != char_len) { // invalid byte or incomplete glyph
+ set_error_for_invalid_utf(context, basetext[end_ptr]);
+ *out_len = 0;
+ return "";
+ }
+ byte_cnt = 1; // reset the counter
+ if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + end_ptr, char_len)) {
+ break;
+ }
+ }
+
+ // when all characters in the basetext are part of the trimtext
+ if (end_ptr == -1) {
+ *out_len = 0;
+ return "";
+ }
+
+ // end_ptr is on the head byte of the last glyph to keep; step past that glyph so
+ // that end_ptr becomes the output length.
+ end_ptr += utf8_char_length(basetext[end_ptr]); // point to the next character
+ *out_len = end_ptr;
+ return basetext;
+}
+
+// Trims characters present in the trim text from both ends of the base text.
+// A glyph is trimmed when its bytes occur as a byte substring of the trim text.
+// The result points into basetext and its byte length is reported through out_len.
+// Returns "" with *out_len = 0 when the base text is empty, when every glyph is
+// trimmed, or when an invalid glyph is found (in which case an error is also set on
+// the context).
+FORCE_INLINE
+const char* btrim_utf8_utf8(gdv_int64 context, const char* basetext,
+ gdv_int32 basetext_len, const char* trimtext,
+ gdv_int32 trimtext_len, int32_t* out_len) {
+ if (basetext_len == 0) {
+ *out_len = 0;
+ return "";
+ } else if (trimtext_len == 0) {
+ *out_len = basetext_len;
+ return basetext;
+ }
+
+ // byte_cnt is only used by the backward scan: it counts the bytes seen so far for
+ // the glyph currently being walked (continuation bytes first, head byte last).
+ gdv_int32 start_ptr, end_ptr, char_len, byte_cnt = 1;
+ // scan the base text from left to right and increment the start and decrement the
+ // end pointers till there are characters which are not present in the trim text
+ for (start_ptr = 0; start_ptr < basetext_len; start_ptr += char_len) {
+ char_len = utf8_char_length(basetext[start_ptr]);
+ if (char_len == 0 || start_ptr + char_len > basetext_len) {
+ // invalid byte or incomplete glyph
+ set_error_for_invalid_utf(context, basetext[start_ptr]);
+ *out_len = 0;
+ return "";
+ }
+ if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + start_ptr, char_len)) {
+ break;
+ }
+ }
+ // backward scan; it never moves left of start_ptr
+ for (end_ptr = basetext_len - 1; end_ptr >= start_ptr; --end_ptr) {
+ char_len = utf8_char_length(basetext[end_ptr]);
+ if (char_len == 0) { // trailing byte in multibyte character
+ ++byte_cnt;
+ continue;
+ }
+ // this is the first byte of a character, hence check if char_len = byte_cnt
+ if (byte_cnt != char_len) { // invalid byte or incomplete glyph
+ set_error_for_invalid_utf(context, basetext[end_ptr]);
+ *out_len = 0;
+ return "";
+ }
+ byte_cnt = 1; // reset the counter
+ if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + end_ptr, char_len)) {
+ break;
+ }
+ }
+
+ // when all characters are trimmed, start_ptr has been incremented to basetext_len and
+ // end_ptr still points to basetext_len - 1, hence we need to handle this case
+ if (start_ptr > end_ptr) {
+ *out_len = 0;
+ return "";
+ }
+
+ // end_ptr is on the head byte of the last glyph to keep; step past that glyph so
+ // that [start_ptr, end_ptr) is the surviving byte range.
+ end_ptr += utf8_char_length(basetext[end_ptr]); // point to the next character
+ *out_len = end_ptr - start_ptr;
+ return basetext + start_ptr;
+}
+
+// Compares `str` against `base_str` after lower-casing the ASCII letters of `str`
+// only; `base_str` is used as given. Returns true when the lengths are equal and
+// non-zero and every lower-cased byte matches. Two empty strings compare as false.
+FORCE_INLINE
+gdv_boolean compare_lower_strings(const char* base_str, gdv_int32 base_str_len,
+                                  const char* str, gdv_int32 str_len) {
+  if (base_str_len != str_len) {
+    return false;
+  }
+  for (int idx = 0; idx < str_len; ++idx) {
+    char lowered = str[idx];
+    // 'A' - 'Z' : 0x41 - 0x5a
+    // 'a' - 'z' : 0x61 - 0x7a
+    const bool is_upper_ascii = (lowered >= 0x41 && lowered <= 0x5a);
+    if (is_upper_ascii) {
+      lowered = static_cast<char>(lowered + 0x20);
+    }
+    if (lowered != base_str[idx]) {
+      return false;
+    }
+  }
+  // every byte matched; an empty input never entered the loop and is not a match
+  return str_len > 0;
+}
+
+// Casts a string to boolean. After leading and trailing spaces are removed, the
+// accepted values are '1' and '0', plus 'true' and 'false' in any letter case.
+// Anything else (including an empty input) sets an error on the context and returns
+// false.
+FORCE_INLINE
+gdv_boolean castBIT_utf8(gdv_int64 context, const char* data, gdv_int32 data_len) {
+  if (data_len <= 0) {
+    gdv_fn_context_set_error_msg(context, "Invalid value for boolean.");
+    return false;
+  }
+
+  // locate the token between the surrounding spaces
+  int32_t first = 0;
+  int32_t last = data_len - 1;
+  while (first <= last && data[first] == ' ') {
+    ++first;
+  }
+  while (last >= first && data[last] == ' ') {
+    --last;
+  }
+  const char* token = data + first;
+  const int32_t token_len = last - first + 1;
+
+  // only tokens of length 1, 4 or 5 can be valid
+  switch (token_len) {
+    case 1:
+      if (token[0] == '1') return true;
+      if (token[0] == '0') return false;
+      break;
+    case 4:
+      if (compare_lower_strings("true", 4, token, token_len)) return true;
+      break;
+    case 5:
+      if (compare_lower_strings("false", 5, token, token_len)) return false;
+      break;
+    default:
+      break;
+  }
+
+  // no 'true', 'false', '0' or '1' value was found
+  gdv_fn_context_set_error_msg(context, "Invalid value for boolean.");
+  return false;
+}
+
+// Casts a boolean to "true" / "false", truncated to at most out_len bytes.
+// A negative out_len sets an error on the context and yields "" with
+// *out_length = 0. The returned pointer is a string literal; nothing is taken from
+// the context arena.
+FORCE_INLINE
+const char* castVARCHAR_bool_int64(gdv_int64 context, gdv_boolean value,
+                                   gdv_int64 out_len, gdv_int32* out_length) {
+  const gdv_int32 len = static_cast<gdv_int32>(out_len);
+  if (len < 0) {
+    gdv_fn_context_set_error_msg(context, "Output buffer length can't be negative");
+    *out_length = 0;
+    return "";
+  }
+  const char* text = value ? "true" : "false";
+  const gdv_int32 text_len = value ? 4 : 5;
+  // report at most `len` bytes of the literal
+  *out_length = (len > text_len) ? text_len : len;
+  return text;
+}
+
+// Truncates the string to given length, counted in utf8 characters.
+//  - A negative out_len sets an error on the context and yields "" with
+//    *out_length = 0.
+//  - out_len == 0, or out_len >= data_len, returns the input untouched.
+//  - Otherwise the first out_len bytes are scanned for the all-ASCII fast path; as
+//    soon as a byte with the high bit set is seen, the rest is measured glyph by
+//    glyph with utf8_byte_pos.
+// The result always points at `data`; only the reported length changes.
+#define CAST_VARCHAR_FROM_VARLEN_TYPE(TYPE) \
+  FORCE_INLINE \
+  const char* castVARCHAR_##TYPE##_int64(gdv_int64 context, const char* data, \
+                                         gdv_int32 data_len, int64_t out_len, \
+                                         int32_t* out_length) { \
+    int32_t len = static_cast<int32_t>(out_len); \
+ \
+    if (len < 0) { \
+      gdv_fn_context_set_error_msg(context, "Output buffer length can't be negative"); \
+      *out_length = 0; \
+      return ""; \
+    } \
+ \
+    if (len >= data_len || len == 0) { \
+      *out_length = data_len; \
+      return data; \
+    } \
+ \
+    int32_t remaining = len; \
+    int32_t index = 0; \
+    bool is_multibyte = false; \
+    do { \
+      /* In utf8, MSB of a single byte unicode char is always 0, \
+       * whereas for a multibyte character the MSB of each byte is 1. \
+       * So for a single byte char, a bitwise-and with x80 (10000000) will be 0 \
+       * and it won't be 0 for bytes of a multibyte char. \
+       */ \
+      char* data_ptr = const_cast<char*>(data); \
+ \
+      /* advance byte by byte till the 8-byte boundary then advance 8 bytes. */ \
+      /* The scan stops once `remaining` reaches 0, so it never inspects bytes */ \
+      /* beyond the requested length while aligning. */ \
+      auto num_bytes = reinterpret_cast<uintptr_t>(data_ptr) & 0x07; \
+      num_bytes = (8 - num_bytes) & 0x07; \
+      while (num_bytes > 0 && remaining > 0) { \
+        uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr + index); \
+        if ((*ptr & 0x80) != 0) { \
+          is_multibyte = true; \
+          break; \
+        } \
+        index++; \
+        remaining--; \
+        num_bytes--; \
+      } \
+      if (is_multibyte) break; \
+      /* test 8 bytes at a time */ \
+      while (remaining >= 8) { \
+        uint64_t* ptr = reinterpret_cast<uint64_t*>(data_ptr + index); \
+        if ((*ptr & 0x8080808080808080) != 0) { \
+          is_multibyte = true; \
+          break; \
+        } \
+        index += 8; \
+        remaining -= 8; \
+      } \
+      if (is_multibyte) break; \
+      /* then at most one group of 4 bytes */ \
+      if (remaining >= 4) { \
+        uint32_t* ptr = reinterpret_cast<uint32_t*>(data_ptr + index); \
+        if ((*ptr & 0x80808080) != 0) break; \
+        index += 4; \
+        remaining -= 4; \
+      } \
+      /* and finally the last few bytes one at a time */ \
+      while (remaining > 0) { \
+        uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr + index); \
+        if ((*ptr & 0x80) != 0) { \
+          is_multibyte = true; \
+          break; \
+        } \
+        index++; \
+        remaining--; \
+      } \
+      if (is_multibyte) break; \
+      /* reached here; all are single byte characters */ \
+      *out_length = len; \
+      return data; \
+    } while (false); \
+ \
+    /* detected multibyte utf8 characters; slow path. The first `index` bytes are */ \
+    /* single-byte characters, so len - index characters are still to be located. */ \
+    int32_t byte_pos = \
+        utf8_byte_pos(context, data + index, data_len - index, len - index); \
+    if (byte_pos < 0) { \
+      *out_length = 0; \
+      return ""; \
+    } \
+ \
+    *out_length = index + byte_pos; \
+    return data; \
+  }
+
+CAST_VARCHAR_FROM_VARLEN_TYPE(utf8)
+CAST_VARCHAR_FROM_VARLEN_TYPE(binary)
+
+#undef CAST_VARCHAR_FROM_VARLEN_TYPE
+
+// Defines castVARBINARY_<TYPE>_int64: truncates the input to at most out_len bytes.
+// A negative out_len sets an error on the context and yields "" with
+// *out_length = 0. An out_len of 0, or one that is >= data_len, keeps the whole
+// input. The result always points at `data`.
+#define CAST_VARBINARY_FROM_STRING_AND_BINARY(TYPE) \
+  GANDIVA_EXPORT \
+  const char* castVARBINARY_##TYPE##_int64(gdv_int64 context, const char* data, \
+                                           gdv_int32 data_len, int64_t out_len, \
+                                           int32_t* out_length) { \
+    const int32_t limit = static_cast<int32_t>(out_len); \
+    if (limit < 0) { \
+      gdv_fn_context_set_error_msg(context, "Output buffer length can't be negative"); \
+      *out_length = 0; \
+      return ""; \
+    } \
+ \
+    const bool keep_everything = (limit >= data_len) || (limit == 0); \
+    *out_length = keep_everything ? data_len : limit; \
+    return data; \
+  }
+
+CAST_VARBINARY_FROM_STRING_AND_BINARY(utf8)
+CAST_VARBINARY_FROM_STRING_AND_BINARY(binary)
+
+#undef CAST_VARBINARY_FROM_STRING_AND_BINARY
+
+// Generates <FN_NAME>_<VALUE_TYPE>(in, len, is_valid), which reports whether the
+// value is null, i.e. the negation of its validity flag. `in` and `len` are not
+// inspected. VAR_LEN_TYPES is defined outside this view; presumably it
+// instantiates the macro once per variable-length type (TODO confirm).
+#define IS_NULL(FN_NAME, VALUE_TYPE) \
+  FORCE_INLINE \
+  bool FN_NAME##_##VALUE_TYPE(gdv_##VALUE_TYPE in, gdv_int32 len, \
+                              gdv_boolean is_valid) { \
+    const bool value_is_null = !is_valid; \
+    return value_is_null; \
+  }
+
+VAR_LEN_TYPES(IS_NULL, isnull)
+
+#undef IS_NULL
+
+// Generates <FN_NAME>_<VALUE_TYPE>(in, len, is_valid), which reports whether the
+// value is non-null by returning its validity flag. `in` and `len` are not
+// inspected. VAR_LEN_TYPES is defined outside this view and is undefined here
+// once the last family of functions has been generated.
+#define IS_NOT_NULL(FN_NAME, VALUE_TYPE) \
+  FORCE_INLINE \
+  bool FN_NAME##_##VALUE_TYPE(gdv_##VALUE_TYPE in, gdv_int32 len, \
+                              gdv_boolean is_valid) { \
+    const bool value_is_present = is_valid; \
+    return value_is_present; \
+  }
+
+VAR_LEN_TYPES(IS_NOT_NULL, isnotnull)
+
+#undef IS_NOT_NULL
+#undef VAR_LEN_TYPES
+
+/*
+ Extracts up to `substring_length` glyphs of `input`, starting at `position`.
+
+ The offset follows Oracle semantics:
+ - position > 0: counted forward from the beginning of the input, 1-based
+   (position == 1 starts at the first glyph).
+ - position < 0: counted backward from the end of the input
+   (position == -1 starts at the last glyph).
+ - position == 0: treated as 1.
+
+ An empty string is returned when the length is not positive, the input is null
+ or empty, utf8_length reports zero glyphs, or the start lies outside the input.
+ Otherwise the result is copied into a buffer obtained from
+ gdv_fn_context_arena_malloc; an allocation failure sets an error on the context.
+ */
+FORCE_INLINE
+const char* substr_utf8_int64_int64(gdv_int64 context, const char* input,
+                                    gdv_int32 in_data_len, gdv_int64 position,
+                                    gdv_int64 substring_length, gdv_int32* out_data_len) {
+  const bool nothing_to_extract =
+      substring_length <= 0 || input == nullptr || in_data_len <= 0;
+  if (nothing_to_extract) {
+    *out_data_len = 0;
+    return "";
+  }
+
+  const gdv_int64 total_glyphs =
+      static_cast<gdv_int64>(utf8_length(context, input, in_data_len));
+
+  // a glyph count of zero is how invalid input shows up here
+  if (total_glyphs == 0) {
+    *out_data_len = 0;
+    return "";
+  }
+
+  // zero-based index of the first glyph to keep
+  gdv_int64 first_glyph = 0;
+  if (position > 0) {
+    first_glyph = position - 1;
+  } else if (position < 0) {
+    first_glyph = total_glyphs + position;
+  }
+
+  if (first_glyph < 0 || first_glyph >= total_glyphs) {
+    *out_data_len = 0;
+    return "";
+  }
+
+  // never ask for more glyphs than remain after the start
+  const gdv_int64 glyphs_available = total_glyphs - first_glyph;
+  const gdv_int64 wanted_glyphs =
+      substring_length > glyphs_available ? glyphs_available : substring_length;
+
+  const gdv_int64 byte_len = static_cast<gdv_int64>(in_data_len);
+  gdv_int64 begin_byte = 0;
+  gdv_int64 end_byte = byte_len;
+
+  // walk the input glyph by glyph, recording the byte offsets of both ends
+  gdv_int64 byte_idx = 0;
+  for (gdv_int64 glyph_idx = 0; byte_idx < byte_len; ++glyph_idx) {
+    if (glyph_idx == first_glyph) {
+      begin_byte = byte_idx;
+    }
+    byte_idx += static_cast<gdv_int64>(utf8_char_length(input[byte_idx]));
+    if (glyph_idx - first_glyph + 1 == wanted_glyphs) {
+      end_byte = byte_idx;
+    }
+  }
+
+  if (end_byte > byte_len || end_byte > INT_MAX) {
+    end_byte = byte_len;
+  }
+
+  *out_data_len = static_cast<gdv_int32>(end_byte - begin_byte);
+  char* result =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_data_len));
+  if (result == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_data_len = 0;
+    return "";
+  }
+  memcpy(result, input + begin_byte, *out_data_len);
+  return result;
+}
+
+// Substring from `offset64` to the end of the input. The byte length of the
+// input is passed as the glyph limit to substr_utf8_int64_int64.
+FORCE_INLINE
+const char* substr_utf8_int64(gdv_int64 context, const char* input, gdv_int32 in_len,
+                              gdv_int64 offset64, gdv_int32* out_len) {
+  const gdv_int64 glyph_limit = in_len;
+  return substr_utf8_int64_int64(context, input, in_len, offset64, glyph_limit,
+                                 out_len);
+}
+
+// Returns `in` repeated `repeat_number` times in a buffer obtained from
+// gdv_fn_context_arena_malloc.
+//  - repeat_number == 0 or in_len <= 0: returns "" with *out_len = 0.
+//  - repeat_number < 0: sets an error on the context, returns "" / 0.
+//  - result larger than INT_MAX bytes: sets an error on the context, returns "" / 0.
+//  - allocation failure: sets an error on the context, returns "" / 0.
+FORCE_INLINE
+const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_len,
+                              gdv_int32 repeat_number, gdv_int32* out_len) {
+  // if the repeat number is zero or the input is empty, then return empty string
+  if (repeat_number == 0 || in_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+  // if the repeat number is a negative number, an error is set on context
+  if (repeat_number < 0) {
+    gdv_fn_context_set_error_msg(context, "Repeat number can't be negative");
+    *out_len = 0;
+    return "";
+  }
+  // Compute the size in 64 bits: a 32-bit repeat_number * in_len can overflow,
+  // which would under-size the buffer while the copy loop still writes
+  // repeat_number * in_len bytes.
+  const int64_t total_len = static_cast<int64_t>(repeat_number) * in_len;
+  if (total_len > INT_MAX) {
+    gdv_fn_context_set_error_msg(context, "Would overflow maximum output size");
+    *out_len = 0;
+    return "";
+  }
+  *out_len = static_cast<gdv_int32>(total_len);
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+  // total_len fits in gdv_int32, so every offset below fits as well
+  char* cursor = ret;
+  for (gdv_int32 i = 0; i < repeat_number; ++i) {
+    memcpy(cursor, in, in_len);
+    cursor += in_len;
+  }
+  return ret;
+}
+
+// Null-tolerant two-argument concat: an input whose validity flag is false
+// contributes zero bytes. The copy itself is done by concatOperator_utf8_utf8.
+FORCE_INLINE
+const char* concat_utf8_utf8(gdv_int64 context, const char* left, gdv_int32 left_len,
+                             bool left_validity, const char* right, gdv_int32 right_len,
+                             bool right_validity, gdv_int32* out_len) {
+  const gdv_int32 left_bytes = left_validity ? left_len : 0;
+  const gdv_int32 right_bytes = right_validity ? right_len : 0;
+  return concatOperator_utf8_utf8(context, left, left_bytes, right, right_bytes,
+                                  out_len);
+}
+
+// Concatenates `left` and `right` into a buffer obtained from
+// gdv_fn_context_arena_malloc. Returns "" with *out_len = 0 when the combined
+// length is not positive or when the allocation fails (the latter also sets an
+// error on the context).
+FORCE_INLINE
+const char* concatOperator_utf8_utf8(gdv_int64 context, const char* left,
+                                     gdv_int32 left_len, const char* right,
+                                     gdv_int32 right_len, gdv_int32* out_len) {
+  const gdv_int32 total_len = left_len + right_len;
+  if (total_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  char* buffer =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, total_len));
+  if (buffer == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  *out_len = total_len;
+  char* cursor = buffer;
+  memcpy(cursor, left, left_len);
+  cursor += left_len;
+  memcpy(cursor, right, right_len);
+  return buffer;
+}
+
+// Null-tolerant three-argument concat: inputs flagged invalid contribute zero
+// bytes; the copy is done by concatOperator_utf8_utf8_utf8.
+FORCE_INLINE
+const char* concat_utf8_utf8_utf8(gdv_int64 context, const char* in1, gdv_int32 in1_len,
+                                  bool in1_validity, const char* in2, gdv_int32 in2_len,
+                                  bool in2_validity, const char* in3, gdv_int32 in3_len,
+                                  bool in3_validity, gdv_int32* out_len) {
+  const gdv_int32 len1 = in1_validity ? in1_len : 0;
+  const gdv_int32 len2 = in2_validity ? in2_len : 0;
+  const gdv_int32 len3 = in3_validity ? in3_len : 0;
+  return concatOperator_utf8_utf8_utf8(context, in1, len1, in2, len2, in3, len3,
+                                       out_len);
+}
+
+// Concatenates three byte strings, in argument order, into a buffer obtained
+// from gdv_fn_context_arena_malloc. Returns "" with *out_len = 0 when the
+// combined length is not positive or the allocation fails (error set on context).
+FORCE_INLINE
+const char* concatOperator_utf8_utf8_utf8(gdv_int64 context, const char* in1,
+                                          gdv_int32 in1_len, const char* in2,
+                                          gdv_int32 in2_len, const char* in3,
+                                          gdv_int32 in3_len, gdv_int32* out_len) {
+  const char* const pieces[] = {in1, in2, in3};
+  const gdv_int32 piece_lens[] = {in1_len, in2_len, in3_len};
+  const int num_pieces = 3;
+
+  gdv_int32 total_len = 0;
+  for (int i = 0; i < num_pieces; ++i) {
+    total_len += piece_lens[i];
+  }
+  if (total_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  char* buffer =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, total_len));
+  if (buffer == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  *out_len = total_len;
+  char* cursor = buffer;
+  for (int i = 0; i < num_pieces; ++i) {
+    memcpy(cursor, pieces[i], piece_lens[i]);
+    cursor += piece_lens[i];
+  }
+  return buffer;
+}
+
+// Null-tolerant four-argument concat: inputs flagged invalid contribute zero
+// bytes; the copy is done by concatOperator_utf8_utf8_utf8_utf8.
+FORCE_INLINE
+const char* concat_utf8_utf8_utf8_utf8(gdv_int64 context, const char* in1,
+                                       gdv_int32 in1_len, bool in1_validity,
+                                       const char* in2, gdv_int32 in2_len,
+                                       bool in2_validity, const char* in3,
+                                       gdv_int32 in3_len, bool in3_validity,
+                                       const char* in4, gdv_int32 in4_len,
+                                       bool in4_validity, gdv_int32* out_len) {
+  const gdv_int32 len1 = in1_validity ? in1_len : 0;
+  const gdv_int32 len2 = in2_validity ? in2_len : 0;
+  const gdv_int32 len3 = in3_validity ? in3_len : 0;
+  const gdv_int32 len4 = in4_validity ? in4_len : 0;
+  return concatOperator_utf8_utf8_utf8_utf8(context, in1, len1, in2, len2, in3, len3,
+                                            in4, len4, out_len);
+}
+
+// Concatenates four byte strings, in argument order, into a buffer obtained
+// from gdv_fn_context_arena_malloc. Returns "" with *out_len = 0 when the
+// combined length is not positive or the allocation fails (error set on context).
+FORCE_INLINE
+const char* concatOperator_utf8_utf8_utf8_utf8(gdv_int64 context, const char* in1,
+                                               gdv_int32 in1_len, const char* in2,
+                                               gdv_int32 in2_len, const char* in3,
+                                               gdv_int32 in3_len, const char* in4,
+                                               gdv_int32 in4_len, gdv_int32* out_len) {
+  const char* const pieces[] = {in1, in2, in3, in4};
+  const gdv_int32 piece_lens[] = {in1_len, in2_len, in3_len, in4_len};
+  const int num_pieces = 4;
+
+  gdv_int32 total_len = 0;
+  for (int i = 0; i < num_pieces; ++i) {
+    total_len += piece_lens[i];
+  }
+  if (total_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  char* buffer =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, total_len));
+  if (buffer == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  *out_len = total_len;
+  char* cursor = buffer;
+  for (int i = 0; i < num_pieces; ++i) {
+    memcpy(cursor, pieces[i], piece_lens[i]);
+    cursor += piece_lens[i];
+  }
+  return buffer;
+}
+
+// Null-tolerant five-argument concat: inputs flagged invalid contribute zero
+// bytes; the copy is done by concatOperator_utf8_utf8_utf8_utf8_utf8.
+FORCE_INLINE
+const char* concat_utf8_utf8_utf8_utf8_utf8(
+    gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
+    const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
+    gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
+    bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
+    gdv_int32* out_len) {
+  const gdv_int32 len1 = in1_validity ? in1_len : 0;
+  const gdv_int32 len2 = in2_validity ? in2_len : 0;
+  const gdv_int32 len3 = in3_validity ? in3_len : 0;
+  const gdv_int32 len4 = in4_validity ? in4_len : 0;
+  const gdv_int32 len5 = in5_validity ? in5_len : 0;
+  return concatOperator_utf8_utf8_utf8_utf8_utf8(context, in1, len1, in2, len2, in3,
+                                                 len3, in4, len4, in5, len5, out_len);
+}
+
+// Concatenates five byte strings, in argument order, into a buffer obtained
+// from gdv_fn_context_arena_malloc. Returns "" with *out_len = 0 when the
+// combined length is not positive or the allocation fails (error set on context).
+FORCE_INLINE
+const char* concatOperator_utf8_utf8_utf8_utf8_utf8(
+    gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
+    gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
+    gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, gdv_int32* out_len) {
+  const char* const pieces[] = {in1, in2, in3, in4, in5};
+  const gdv_int32 piece_lens[] = {in1_len, in2_len, in3_len, in4_len, in5_len};
+  const int num_pieces = 5;
+
+  gdv_int32 total_len = 0;
+  for (int i = 0; i < num_pieces; ++i) {
+    total_len += piece_lens[i];
+  }
+  if (total_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  char* buffer =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, total_len));
+  if (buffer == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  *out_len = total_len;
+  char* cursor = buffer;
+  for (int i = 0; i < num_pieces; ++i) {
+    memcpy(cursor, pieces[i], piece_lens[i]);
+    cursor += piece_lens[i];
+  }
+  return buffer;
+}
+
+// Null-tolerant six-argument concat: inputs flagged invalid contribute zero
+// bytes; the copy is done by concatOperator_utf8_utf8_utf8_utf8_utf8_utf8.
+FORCE_INLINE
+const char* concat_utf8_utf8_utf8_utf8_utf8_utf8(
+    gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
+    const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
+    gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
+    bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
+    const char* in6, gdv_int32 in6_len, bool in6_validity, gdv_int32* out_len) {
+  const gdv_int32 len1 = in1_validity ? in1_len : 0;
+  const gdv_int32 len2 = in2_validity ? in2_len : 0;
+  const gdv_int32 len3 = in3_validity ? in3_len : 0;
+  const gdv_int32 len4 = in4_validity ? in4_len : 0;
+  const gdv_int32 len5 = in5_validity ? in5_len : 0;
+  const gdv_int32 len6 = in6_validity ? in6_len : 0;
+  return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8(context, in1, len1, in2, len2,
+                                                      in3, len3, in4, len4, in5, len5,
+                                                      in6, len6, out_len);
+}
+
+// Concatenates six byte strings, in argument order, into a buffer obtained
+// from gdv_fn_context_arena_malloc. Returns "" with *out_len = 0 when the
+// combined length is not positive or the allocation fails (error set on context).
+FORCE_INLINE
+const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8(
+    gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
+    gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
+    gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
+    gdv_int32 in6_len, gdv_int32* out_len) {
+  const char* const pieces[] = {in1, in2, in3, in4, in5, in6};
+  const gdv_int32 piece_lens[] = {in1_len, in2_len, in3_len,
+                                  in4_len, in5_len, in6_len};
+  const int num_pieces = 6;
+
+  gdv_int32 total_len = 0;
+  for (int i = 0; i < num_pieces; ++i) {
+    total_len += piece_lens[i];
+  }
+  if (total_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  char* buffer =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, total_len));
+  if (buffer == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  *out_len = total_len;
+  char* cursor = buffer;
+  for (int i = 0; i < num_pieces; ++i) {
+    memcpy(cursor, pieces[i], piece_lens[i]);
+    cursor += piece_lens[i];
+  }
+  return buffer;
+}
+
+// Null-tolerant seven-argument concat: inputs flagged invalid contribute zero
+// bytes; the copy is done by concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8.
+FORCE_INLINE
+const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+    gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
+    const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
+    gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
+    bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
+    const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
+    gdv_int32 in7_len, bool in7_validity, gdv_int32* out_len) {
+  const gdv_int32 len1 = in1_validity ? in1_len : 0;
+  const gdv_int32 len2 = in2_validity ? in2_len : 0;
+  const gdv_int32 len3 = in3_validity ? in3_len : 0;
+  const gdv_int32 len4 = in4_validity ? in4_len : 0;
+  const gdv_int32 len5 = in5_validity ? in5_len : 0;
+  const gdv_int32 len6 = in6_validity ? in6_len : 0;
+  const gdv_int32 len7 = in7_validity ? in7_len : 0;
+  return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+      context, in1, len1, in2, len2, in3, len3, in4, len4, in5, len5, in6, len6, in7,
+      len7, out_len);
+}
+
+// Concatenates seven byte strings, in argument order, into a buffer obtained
+// from gdv_fn_context_arena_malloc. Returns "" with *out_len = 0 when the
+// combined length is not positive or the allocation fails (error set on context).
+FORCE_INLINE
+const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+    gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
+    gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
+    gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
+    gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, gdv_int32* out_len) {
+  const char* const pieces[] = {in1, in2, in3, in4, in5, in6, in7};
+  const gdv_int32 piece_lens[] = {in1_len, in2_len, in3_len, in4_len,
+                                  in5_len, in6_len, in7_len};
+  const int num_pieces = 7;
+
+  gdv_int32 total_len = 0;
+  for (int i = 0; i < num_pieces; ++i) {
+    total_len += piece_lens[i];
+  }
+  if (total_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  char* buffer =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, total_len));
+  if (buffer == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  *out_len = total_len;
+  char* cursor = buffer;
+  for (int i = 0; i < num_pieces; ++i) {
+    memcpy(cursor, pieces[i], piece_lens[i]);
+    cursor += piece_lens[i];
+  }
+  return buffer;
+}
+
+// Null-tolerant eight-argument concat: inputs flagged invalid contribute zero
+// bytes; the copy is done by the matching eight-argument concatOperator.
+FORCE_INLINE
+const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+    gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
+    const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
+    gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
+    bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
+    const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
+    gdv_int32 in7_len, bool in7_validity, const char* in8, gdv_int32 in8_len,
+    bool in8_validity, gdv_int32* out_len) {
+  const gdv_int32 len1 = in1_validity ? in1_len : 0;
+  const gdv_int32 len2 = in2_validity ? in2_len : 0;
+  const gdv_int32 len3 = in3_validity ? in3_len : 0;
+  const gdv_int32 len4 = in4_validity ? in4_len : 0;
+  const gdv_int32 len5 = in5_validity ? in5_len : 0;
+  const gdv_int32 len6 = in6_validity ? in6_len : 0;
+  const gdv_int32 len7 = in7_validity ? in7_len : 0;
+  const gdv_int32 len8 = in8_validity ? in8_len : 0;
+  return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+      context, in1, len1, in2, len2, in3, len3, in4, len4, in5, len5, in6, len6, in7,
+      len7, in8, len8, out_len);
+}
+
+// Concatenates eight byte strings, in argument order, into a buffer obtained
+// from gdv_fn_context_arena_malloc. Returns "" with *out_len = 0 when the
+// combined length is not positive or the allocation fails (error set on context).
+FORCE_INLINE
+const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+    gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
+    gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
+    gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
+    gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, const char* in8,
+    gdv_int32 in8_len, gdv_int32* out_len) {
+  const char* const pieces[] = {in1, in2, in3, in4, in5, in6, in7, in8};
+  const gdv_int32 piece_lens[] = {in1_len, in2_len, in3_len, in4_len,
+                                  in5_len, in6_len, in7_len, in8_len};
+  const int num_pieces = 8;
+
+  gdv_int32 total_len = 0;
+  for (int i = 0; i < num_pieces; ++i) {
+    total_len += piece_lens[i];
+  }
+  if (total_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  char* buffer =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, total_len));
+  if (buffer == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  *out_len = total_len;
+  char* cursor = buffer;
+  for (int i = 0; i < num_pieces; ++i) {
+    memcpy(cursor, pieces[i], piece_lens[i]);
+    cursor += piece_lens[i];
+  }
+  return buffer;
+}
+
+// Null-tolerant nine-argument concat: inputs flagged invalid contribute zero
+// bytes; the copy is done by the matching nine-argument concatOperator.
+FORCE_INLINE
+const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+    gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
+    const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
+    gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
+    bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
+    const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
+    gdv_int32 in7_len, bool in7_validity, const char* in8, gdv_int32 in8_len,
+    bool in8_validity, const char* in9, gdv_int32 in9_len, bool in9_validity,
+    gdv_int32* out_len) {
+  const gdv_int32 len1 = in1_validity ? in1_len : 0;
+  const gdv_int32 len2 = in2_validity ? in2_len : 0;
+  const gdv_int32 len3 = in3_validity ? in3_len : 0;
+  const gdv_int32 len4 = in4_validity ? in4_len : 0;
+  const gdv_int32 len5 = in5_validity ? in5_len : 0;
+  const gdv_int32 len6 = in6_validity ? in6_len : 0;
+  const gdv_int32 len7 = in7_validity ? in7_len : 0;
+  const gdv_int32 len8 = in8_validity ? in8_len : 0;
+  const gdv_int32 len9 = in9_validity ? in9_len : 0;
+  return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+      context, in1, len1, in2, len2, in3, len3, in4, len4, in5, len5, in6, len6, in7,
+      len7, in8, len8, in9, len9, out_len);
+}
+
+// Concatenates nine byte strings, in argument order, into a buffer obtained
+// from gdv_fn_context_arena_malloc. Returns "" with *out_len = 0 when the
+// combined length is not positive or the allocation fails (error set on context).
+FORCE_INLINE
+const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+    gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
+    gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
+    gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
+    gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, const char* in8,
+    gdv_int32 in8_len, const char* in9, gdv_int32 in9_len, gdv_int32* out_len) {
+  const char* const pieces[] = {in1, in2, in3, in4, in5, in6, in7, in8, in9};
+  const gdv_int32 piece_lens[] = {in1_len, in2_len, in3_len, in4_len, in5_len,
+                                  in6_len, in7_len, in8_len, in9_len};
+  const int num_pieces = 9;
+
+  gdv_int32 total_len = 0;
+  for (int i = 0; i < num_pieces; ++i) {
+    total_len += piece_lens[i];
+  }
+  if (total_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  char* buffer =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, total_len));
+  if (buffer == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  *out_len = total_len;
+  char* cursor = buffer;
+  for (int i = 0; i < num_pieces; ++i) {
+    memcpy(cursor, pieces[i], piece_lens[i]);
+    cursor += piece_lens[i];
+  }
+  return buffer;
+}
+
+// Null-tolerant ten-argument concat: inputs flagged invalid contribute zero
+// bytes; the copy is done by the matching ten-argument concatOperator.
+FORCE_INLINE
+const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+    gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
+    const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
+    gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
+    bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
+    const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
+    gdv_int32 in7_len, bool in7_validity, const char* in8, gdv_int32 in8_len,
+    bool in8_validity, const char* in9, gdv_int32 in9_len, bool in9_validity,
+    const char* in10, gdv_int32 in10_len, bool in10_validity, gdv_int32* out_len) {
+  const gdv_int32 len1 = in1_validity ? in1_len : 0;
+  const gdv_int32 len2 = in2_validity ? in2_len : 0;
+  const gdv_int32 len3 = in3_validity ? in3_len : 0;
+  const gdv_int32 len4 = in4_validity ? in4_len : 0;
+  const gdv_int32 len5 = in5_validity ? in5_len : 0;
+  const gdv_int32 len6 = in6_validity ? in6_len : 0;
+  const gdv_int32 len7 = in7_validity ? in7_len : 0;
+  const gdv_int32 len8 = in8_validity ? in8_len : 0;
+  const gdv_int32 len9 = in9_validity ? in9_len : 0;
+  const gdv_int32 len10 = in10_validity ? in10_len : 0;
+  return concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+      context, in1, len1, in2, len2, in3, len3, in4, len4, in5, len5, in6, len6, in7,
+      len7, in8, len8, in9, len9, in10, len10, out_len);
+}
+
+// Concatenates ten byte strings, in argument order, into a buffer obtained
+// from gdv_fn_context_arena_malloc. Returns "" with *out_len = 0 when the
+// combined length is not positive or the allocation fails (error set on context).
+FORCE_INLINE
+const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+    gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
+    gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
+    gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
+    gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, const char* in8,
+    gdv_int32 in8_len, const char* in9, gdv_int32 in9_len, const char* in10,
+    gdv_int32 in10_len, gdv_int32* out_len) {
+  const char* const pieces[] = {in1, in2, in3, in4, in5, in6, in7, in8, in9, in10};
+  const gdv_int32 piece_lens[] = {in1_len, in2_len, in3_len, in4_len, in5_len,
+                                  in6_len, in7_len, in8_len, in9_len, in10_len};
+  const int num_pieces = 10;
+
+  gdv_int32 total_len = 0;
+  for (int i = 0; i < num_pieces; ++i) {
+    total_len += piece_lens[i];
+  }
+  if (total_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  char* buffer =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, total_len));
+  if (buffer == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  *out_len = total_len;
+  char* cursor = buffer;
+  for (int i = 0; i < num_pieces; ++i) {
+    memcpy(cursor, pieces[i], piece_lens[i]);
+    cursor += piece_lens[i];
+  }
+  return buffer;
+}
+
+// Returns the first byte of `data` converted to gdv_int32, or 0 when
+// data_len is 0.
+GANDIVA_EXPORT
+gdv_int32 ascii_utf8(const char* data, gdv_int32 data_len) {
+  return data_len == 0 ? 0 : static_cast<gdv_int32>(data[0]);
+}
+
+// Copies `len` bytes of `bin_in` into a buffer obtained from
+// gdv_fn_context_arena_malloc and returns the copy. On allocation failure an
+// error is set on the context and "" is returned with *out_len = 0.
+FORCE_INLINE
+const char* convert_fromUTF8_binary(gdv_int64 context, const char* bin_in, gdv_int32 len,
+                                    gdv_int32* out_len) {
+  char* copy = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, len));
+  if (copy == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+  *out_len = len;
+  memcpy(copy, bin_in, len);
+  return copy;
+}
+
+// Scans text_in for bytes that do not form a valid UTF-8 character and replaces
+// each such byte with char_to_replace[0], or drops it when char_to_replace_len
+// is 0.
+//  - char_to_replace_len > 1: an error is set on the context, "" / 0 returned.
+//  - no invalid byte found: text_in itself is returned with *out_len = text_len
+//    (the buffer allocated below is then left unused).
+//  - otherwise: the result is written to a text_len-byte buffer obtained from
+//    gdv_fn_context_arena_malloc and *out_len is the number of bytes written.
+// NOTE(review): validity is decided by utf8_char_length and
+// validate_utf8_following_bytes, both defined outside this view; a return of 0
+// from utf8_char_length is treated as "invalid lead byte" here.
+FORCE_INLINE
+const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const char* text_in,
+ int32_t text_len,
+ const char* char_to_replace,
+ int32_t char_to_replace_len,
+ int32_t* out_len) {
+ if (char_to_replace_len > 1) {
+ gdv_fn_context_set_error_msg(context, "Replacement of multiple bytes not supported");
+ *out_len = 0;
+ return "";
+ }
+ // each invalid byte is replaced by at most one byte, so the output can never
+ // be longer than the input
+ *out_len = text_len;
+ char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
+ if (ret == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+ *out_len = 0;
+ return "";
+ }
+ // valid_bytes_to_cpy: length of the run of valid bytes seen since the last flush
+ // out_byte_counter: bytes written to ret so far
+ // in_byte_counter: bytes of text_in already flushed (copied or replaced)
+ int32_t valid_bytes_to_cpy = 0;
+ int32_t out_byte_counter = 0;
+ int32_t in_byte_counter = 0;
+ int32_t char_len;
+ // scan the text from left to right, one character at a time, looking for
+ // invalid bytes to substitute
+ for (int text_index = 0; text_index < text_len; text_index += char_len) {
+ char_len = utf8_char_length(text_in[text_index]);
+ // valid bytes are only counted; they are copied in bulk once an invalid byte
+ // (or the end of the input) is reached
+ if (char_len == 0 || text_index + char_len > text_len ||
+ !validate_utf8_following_bytes(text_in, char_len, text_index)) {
+ // define char_len = 1 to increase text_index by 1 (as ASCII char fits in 1 byte)
+ char_len = 1;
+ // first copy the valid bytes until now and then replace the invalid character
+ memcpy(ret + out_byte_counter, text_in + in_byte_counter, valid_bytes_to_cpy);
+ // if the replacement char is empty, the invalid char should be ignored
+ if (char_to_replace_len == 0) {
+ out_byte_counter += valid_bytes_to_cpy;
+ } else {
+ ret[out_byte_counter + valid_bytes_to_cpy] = char_to_replace[0];
+ out_byte_counter += valid_bytes_to_cpy + char_len;
+ }
+ // the invalid byte is consumed from the input in both cases
+ in_byte_counter += valid_bytes_to_cpy + char_len;
+ valid_bytes_to_cpy = 0;
+ continue;
+ }
+ valid_bytes_to_cpy += char_len;
+ }
+ // if invalid chars were not found, return the original string
+ if (out_byte_counter == 0 && in_byte_counter == 0) return text_in;
+ // if there are still valid bytes to copy, do it
+ if (valid_bytes_to_cpy != 0) {
+ memcpy(ret + out_byte_counter, text_in + in_byte_counter, valid_bytes_to_cpy);
+ }
+ // the out length will be the out bytes copied + the missing end bytes copied
+ *out_len = valid_bytes_to_cpy + out_byte_counter;
+ return ret;
+}
+
+// Reverses the first `len` bytes of `buf` in place. Nothing is touched when
+// len is less than 2.
+static inline void reverse_char_buf(char* buf, int32_t len) {
+  if (len < 2) {
+    return;
+  }
+  char* head = buf;
+  char* tail = buf + (len - 1);
+  while (head < tail) {
+    const char saved = *head;
+    *head = *tail;
+    *tail = saved;
+    ++head;
+    --tail;
+  }
+}
+
+// Copies the in-memory bytes of a double (host byte order) into a buffer
+// obtained from gdv_fn_context_arena_malloc; *out_len is sizeof(double).
+// On allocation failure an error is set on the context and "" / 0 is returned.
+FORCE_INLINE
+const char* convert_toDOUBLE(int64_t context, double value, int32_t* out_len) {
+  const int32_t num_bytes = static_cast<int32_t>(sizeof(value));
+  char* buffer = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, num_bytes));
+  if (buffer != nullptr) {
+    *out_len = num_bytes;
+    memcpy(buffer, &value, num_bytes);
+    return buffer;
+  }
+
+  gdv_fn_context_set_error_msg(context,
+                               "Could not allocate memory for the output string");
+  *out_len = 0;
+  return "";
+}
+
+// Same as convert_toDOUBLE, but the bytes are reversed when ARROW_LITTLE_ENDIAN
+// is set so that the result is always big endian.
+FORCE_INLINE
+const char* convert_toDOUBLE_be(int64_t context, double value, int32_t* out_len) {
+  const char* native_bytes = convert_toDOUBLE(context, value, out_len);
+  char* bytes = const_cast<char*>(native_bytes);
+
+#if ARROW_LITTLE_ENDIAN
+  reverse_char_buf(bytes, *out_len);
+#endif
+
+  return bytes;
+}
+
+// Copies the in-memory bytes of a float (host byte order) into a buffer
+// obtained from gdv_fn_context_arena_malloc; *out_len is sizeof(float).
+// On allocation failure an error is set on the context and "" / 0 is returned.
+FORCE_INLINE
+const char* convert_toFLOAT(int64_t context, float value, int32_t* out_len) {
+  const int32_t num_bytes = static_cast<int32_t>(sizeof(value));
+  char* buffer = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, num_bytes));
+  if (buffer != nullptr) {
+    *out_len = num_bytes;
+    memcpy(buffer, &value, num_bytes);
+    return buffer;
+  }
+
+  gdv_fn_context_set_error_msg(context,
+                               "Could not allocate memory for the output string");
+  *out_len = 0;
+  return "";
+}
+
+// Same as convert_toFLOAT, but the bytes are reversed when ARROW_LITTLE_ENDIAN
+// is set so that the result is always big endian.
+FORCE_INLINE
+const char* convert_toFLOAT_be(int64_t context, float value, int32_t* out_len) {
+  const char* native_bytes = convert_toFLOAT(context, value, out_len);
+  char* bytes = const_cast<char*>(native_bytes);
+
+#if ARROW_LITTLE_ENDIAN
+  reverse_char_buf(bytes, *out_len);
+#endif
+
+  return bytes;
+}
+
+// Copies the in-memory bytes of a 64-bit integer (host byte order) into a
+// buffer obtained from gdv_fn_context_arena_malloc; *out_len is sizeof(int64_t).
+// On allocation failure an error is set on the context and "" / 0 is returned.
+FORCE_INLINE
+const char* convert_toBIGINT(int64_t context, int64_t value, int32_t* out_len) {
+  const int32_t num_bytes = static_cast<int32_t>(sizeof(value));
+  char* buffer = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, num_bytes));
+  if (buffer != nullptr) {
+    *out_len = num_bytes;
+    memcpy(buffer, &value, num_bytes);
+    return buffer;
+  }
+
+  gdv_fn_context_set_error_msg(context,
+                               "Could not allocate memory for the output string");
+  *out_len = 0;
+  return "";
+}
+
+// Same as convert_toBIGINT, but the bytes are reversed when ARROW_LITTLE_ENDIAN
+// is set so that the result is always big endian.
+FORCE_INLINE
+const char* convert_toBIGINT_be(int64_t context, int64_t value, int32_t* out_len) {
+  const char* native_bytes = convert_toBIGINT(context, value, out_len);
+  char* bytes = const_cast<char*>(native_bytes);
+
+#if ARROW_LITTLE_ENDIAN
+  reverse_char_buf(bytes, *out_len);
+#endif
+
+  return bytes;
+}
+
+// Copies the in-memory bytes of a 32-bit integer (host byte order) into a
+// buffer obtained from gdv_fn_context_arena_malloc; *out_len is sizeof(int32_t).
+// On allocation failure an error is set on the context and "" / 0 is returned.
+FORCE_INLINE
+const char* convert_toINT(int64_t context, int32_t value, int32_t* out_len) {
+  const int32_t num_bytes = static_cast<int32_t>(sizeof(value));
+  char* buffer = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, num_bytes));
+  if (buffer != nullptr) {
+    *out_len = num_bytes;
+    memcpy(buffer, &value, num_bytes);
+    return buffer;
+  }
+
+  gdv_fn_context_set_error_msg(context,
+                               "Could not allocate memory for the output string");
+  *out_len = 0;
+  return "";
+}
+
+// Same as convert_toINT, but the bytes are reversed when ARROW_LITTLE_ENDIAN
+// is set so that the result is always big endian.
+FORCE_INLINE
+const char* convert_toINT_be(int64_t context, int32_t value, int32_t* out_len) {
+  const char* native_bytes = convert_toINT(context, value, out_len);
+  char* bytes = const_cast<char*>(native_bytes);
+
+#if ARROW_LITTLE_ENDIAN
+  reverse_char_buf(bytes, *out_len);
+#endif
+
+  return bytes;
+}
+
+// Copies the in-memory bytes of a bool into a buffer obtained from
+// gdv_fn_context_arena_malloc; *out_len is sizeof(bool).
+// On allocation failure an error is set on the context and "" / 0 is returned.
+FORCE_INLINE
+const char* convert_toBOOLEAN(int64_t context, bool value, int32_t* out_len) {
+  const int32_t num_bytes = static_cast<int32_t>(sizeof(value));
+  char* buffer = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, num_bytes));
+  if (buffer != nullptr) {
+    *out_len = num_bytes;
+    memcpy(buffer, &value, num_bytes);
+    return buffer;
+  }
+
+  gdv_fn_context_set_error_msg(context,
+                               "Could not allocate memory for the output string");
+  *out_len = 0;
+  return "";
+}
+
+// Encodes a 32-bit time value to binary by delegating to convert_toINT.
+FORCE_INLINE
+const char* convert_toTIME_EPOCH(int64_t context, int32_t value, int32_t* out_len) {
+  const char* encoded = convert_toINT(context, value, out_len);
+  return encoded;
+}
+
+// Big-endian counterpart of convert_toTIME_EPOCH; delegates to convert_toINT_be.
+FORCE_INLINE
+const char* convert_toTIME_EPOCH_be(int64_t context, int32_t value, int32_t* out_len) {
+  const char* encoded = convert_toINT_be(context, value, out_len);
+  return encoded;
+}
+
+// Encodes a 64-bit timestamp value to binary by delegating to convert_toBIGINT.
+FORCE_INLINE
+const char* convert_toTIMESTAMP_EPOCH(int64_t context, int64_t timestamp,
+                                      int32_t* out_len) {
+  const char* encoded = convert_toBIGINT(context, timestamp, out_len);
+  return encoded;
+}
+
+// Big-endian counterpart of convert_toTIMESTAMP_EPOCH; delegates to
+// convert_toBIGINT_be.
+FORCE_INLINE
+const char* convert_toTIMESTAMP_EPOCH_be(int64_t context, int64_t timestamp,
+                                         int32_t* out_len) {
+  const char* encoded = convert_toBIGINT_be(context, timestamp, out_len);
+  return encoded;
+}
+
+// Encodes a 64-bit date value to binary by delegating to convert_toBIGINT.
+FORCE_INLINE
+const char* convert_toDATE_EPOCH(int64_t context, int64_t date, int32_t* out_len) {
+  const char* encoded = convert_toBIGINT(context, date, out_len);
+  return encoded;
+}
+
+// Big-endian counterpart of convert_toDATE_EPOCH; delegates to
+// convert_toBIGINT_be.
+FORCE_INLINE
+const char* convert_toDATE_EPOCH_be(int64_t context, int64_t date, int32_t* out_len) {
+  const char* encoded = convert_toBIGINT_be(context, date, out_len);
+  return encoded;
+}
+
+// String-to-binary conversion is a pass-through: the input pointer is returned
+// unchanged and *out_len is set to value_len. The context is not used.
+FORCE_INLINE
+const char* convert_toUTF8(int64_t context, const char* value, int32_t value_len,
+                           int32_t* out_len) {
+  const char* passthrough = value;
+  *out_len = value_len;
+  return passthrough;
+}
+
+// Position of `sub_str` inside `str`, searching from the first character.
+// Equivalent to locate(sub_str, str) with the two string arguments swapped.
+FORCE_INLINE
+gdv_int32 strpos_utf8_utf8(gdv_int64 context, const char* str, gdv_int32 str_len,
+                           const char* sub_str, gdv_int32 sub_str_len) {
+  const gdv_int32 first_char = 1;
+  return locate_utf8_utf8_int32(context, sub_str, sub_str_len, str, str_len,
+                                first_char);
+}
+
+// Position of `sub_str` inside `str`, searching from the first character;
+// delegates to locate_utf8_utf8_int32 with a start position of 1.
+FORCE_INLINE
+gdv_int32 locate_utf8_utf8(gdv_int64 context, const char* sub_str, gdv_int32 sub_str_len,
+                           const char* str, gdv_int32 str_len) {
+  const gdv_int32 first_char = 1;
+  return locate_utf8_utf8_int32(context, sub_str, sub_str_len, str, str_len,
+                                first_char);
+}
+
+// Searches `str` for `sub_str`, starting at character position `start_pos`
+// (1-indexed). Returns utf8_length of the bytes preceding the first match,
+// plus 1; returns 0 when there is no match.
+//  - start_pos < 1: an error is set on the context and 0 is returned.
+//  - either string empty, or start position not inside `str`: returns 0.
+FORCE_INLINE
+gdv_int32 locate_utf8_utf8_int32(gdv_int64 context, const char* sub_str,
+                                 gdv_int32 sub_str_len, const char* str,
+                                 gdv_int32 str_len, gdv_int32 start_pos) {
+  if (start_pos < 1) {
+    gdv_fn_context_set_error_msg(context, "Start position must be greater than 0");
+    return 0;
+  }
+
+  if (str_len == 0 || sub_str_len == 0) {
+    return 0;
+  }
+
+  // translate the character position into a byte offset
+  const gdv_int32 first_byte = utf8_byte_pos(context, str, str_len, start_pos - 1);
+  if (first_byte < 0 || first_byte >= str_len) {
+    return 0;
+  }
+
+  // last byte offset at which sub_str still fits inside str
+  const gdv_int32 last_candidate = str_len - sub_str_len;
+  gdv_int32 offset = first_byte;
+  while (offset <= last_candidate) {
+    if (memcmp(str + offset, sub_str, sub_str_len) == 0) {
+      return utf8_length(context, str, offset) + 1;
+    }
+    ++offset;
+  }
+  return 0;
+}
+
+// Replaces every occurrence of from_str in text with to_str, scanning left to
+// right and resuming after each match (matches do not overlap).
+//
+// - When from_str is empty or longer than text, or when no match is found, the
+//   input pointer is returned unchanged and *out_len is set to text_len.
+// - Otherwise the result is built in a buffer of max_length bytes obtained from
+//   gdv_fn_context_arena_malloc on the first match.
+// - If the result would need more than max_length bytes, or the allocation
+//   fails, an error is recorded on the context, *out_len is set to 0 and ""
+//   is returned.
+FORCE_INLINE
+const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text,
+                                                gdv_int32 text_len, const char* from_str,
+                                                gdv_int32 from_str_len,
+                                                const char* to_str, gdv_int32 to_str_len,
+                                                gdv_int32 max_length,
+                                                gdv_int32* out_len) {
+  // if from_str is empty or its length exceeds that of original string,
+  // return the original string
+  if (from_str_len <= 0 || from_str_len > text_len) {
+    *out_len = text_len;
+    return text;
+  }
+
+  bool found = false;
+  gdv_int32 text_index = 0;
+  // only assigned (and only read) once a first match has been found
+  char* out;
+  gdv_int32 out_index = 0;
+  gdv_int32 last_match_index =
+      0;  // defer copying string from last_match_index till next match is found
+
+  // stop at the last offset where from_str still fits inside text
+  for (; text_index <= text_len - from_str_len;) {
+    if (memcmp(text + text_index, from_str, from_str_len) == 0) {
+      // bytes written so far + pending unmatched bytes + replacement must fit
+      if (out_index + text_index - last_match_index + to_str_len > max_length) {
+        gdv_fn_context_set_error_msg(context, "Buffer overflow for output string");
+        *out_len = 0;
+        return "";
+      }
+      if (!found) {
+        // found match for first time
+        out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, max_length));
+        if (out == nullptr) {
+          gdv_fn_context_set_error_msg(context,
+                                       "Could not allocate memory for output string");
+          *out_len = 0;
+          return "";
+        }
+        found = true;
+      }
+      // first copy the part deferred till now
+      memcpy(out + out_index, text + last_match_index, (text_index - last_match_index));
+      out_index += text_index - last_match_index;
+      // then copy the target string
+      memcpy(out + out_index, to_str, to_str_len);
+      out_index += to_str_len;
+
+      // skip past the matched text so the next search starts after it
+      text_index += from_str_len;
+      last_match_index = text_index;
+    } else {
+      text_index++;
+    }
+  }
+
+  // no match at all: nothing was allocated, hand back the input
+  if (!found) {
+    *out_len = text_len;
+    return text;
+  }
+
+  // the tail after the last match must also fit in the output buffer
+  if (out_index + text_len - last_match_index > max_length) {
+    gdv_fn_context_set_error_msg(context, "Buffer overflow for output string");
+    *out_len = 0;
+    return "";
+  }
+  memcpy(out + out_index, text + last_match_index, text_len - last_match_index);
+  out_index += text_len - last_match_index;
+  *out_len = out_index;
+  return out;
+}
+
+// Replaces every occurrence of from_str in text with to_str, delegating to
+// replace_with_max_len_utf8_utf8_utf8 with a fixed output limit of 65535 bytes.
+FORCE_INLINE
+const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
+                                   gdv_int32 text_len, const char* from_str,
+                                   gdv_int32 from_str_len, const char* to_str,
+                                   gdv_int32 to_str_len, gdv_int32* out_len) {
+  const gdv_int32 max_output_bytes = 65535;
+  return replace_with_max_len_utf8_utf8_utf8(context, text, text_len, from_str,
+                                             from_str_len, to_str, to_str_len,
+                                             max_output_bytes, out_len);
+}
+
+// Left-pads text with repetitions of fill_text until the result holds
+// return_length characters (not bytes).
+//
+// - Empty text or return_length <= 0 yields an empty string.
+// - If text already has return_length characters, or padding is needed but
+//   fill_text is empty, text is returned unchanged.
+// - If text has more than return_length characters, it is truncated to the
+//   first return_length characters (no copy is made).
+// - Otherwise a new buffer is allocated from the context arena, filled with as
+//   many fill_text characters as needed and followed by text.
+//
+// Bytes of fill_text that are not a valid utf8 lead byte count as one-byte
+// characters. On allocation failure, or if the result would not fit in a signed
+// 32-bit length, an error is recorded on the context and "" is returned with
+// *out_len == 0.
+FORCE_INLINE
+const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
+                                 gdv_int32 return_length, const char* fill_text,
+                                 gdv_int32 fill_text_len, gdv_int32* out_len) {
+  // if the text length or the defined return length (number of characters to return)
+  // is <=0, then return an empty string.
+  if (text_len == 0 || return_length <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // count the number of utf8 characters on text, ignoring invalid bytes
+  int text_char_count = utf8_length_ignore_invalid(text, text_len);
+
+  if (return_length == text_char_count ||
+      (return_length > text_char_count && fill_text_len <= 0)) {
+    // case where the return length is same as the text's length, or if it need to
+    // fill into text but "fill_text" is empty, then return text directly.
+    *out_len = text_len;
+    return text;
+  }
+  if (return_length < text_char_count) {
+    // case where it truncates the result on return length.
+    *out_len = utf8_byte_pos(context, text, text_len, return_length);
+    return text;
+  }
+
+  // case (return_length > text_char_count): pad_chars characters of fill are needed
+  const int32_t pad_chars = return_length - text_char_count;
+
+  // Count the characters in fill_text. A multi-byte sequence cut off by the end
+  // of fill_text is clamped so that no byte past fill_text_len is ever read.
+  int32_t fill_char_count = 0;
+  for (int32_t pos = 0; pos < fill_text_len; ++fill_char_count) {
+    int32_t char_len = utf8_char_length(fill_text[pos]);
+    // ignore invalid char on the fill text, considering it as size 1
+    if (char_len == 0) char_len = 1;
+    if (char_len > fill_text_len - pos) char_len = fill_text_len - pos;
+    pos += char_len;
+  }
+
+  // The padding is full_cycles whole copies of fill_text followed by a prefix of
+  // (pad_chars % fill_char_count) characters; measure that prefix in bytes.
+  const int32_t full_cycles = pad_chars / fill_char_count;
+  int32_t tail_bytes = 0;
+  for (int32_t left = pad_chars % fill_char_count; left > 0; --left) {
+    int32_t char_len = utf8_char_length(fill_text[tail_bytes]);
+    if (char_len == 0) char_len = 1;
+    tail_bytes += char_len;
+  }
+
+  // The buffer must be sized in BYTES: multi-byte characters in text or
+  // fill_text make the result longer than return_length bytes.
+  const int64_t max_out_bytes = 2147483647;  // largest value *out_len can hold
+  const int64_t out_bytes =
+      static_cast<int64_t>(full_cycles) * fill_text_len + tail_bytes + text_len;
+  if (out_bytes > max_out_bytes) {
+    gdv_fn_context_set_error_msg(context, "Would overflow maximum output size");
+    *out_len = 0;
+    return "";
+  }
+
+  char* ret = reinterpret_cast<gdv_binary>(
+      gdv_fn_context_arena_malloc(context, static_cast<int32_t>(out_bytes)));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context,
+                                 "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  // write the padding first ...
+  char* cursor = ret;
+  for (int32_t cycle = 0; cycle < full_cycles; ++cycle) {
+    memcpy(cursor, fill_text, fill_text_len);
+    cursor += fill_text_len;
+  }
+  memcpy(cursor, fill_text, tail_bytes);
+  cursor += tail_bytes;
+  // ... then the main string
+  memcpy(cursor, text, text_len);
+
+  *out_len = static_cast<int32_t>(out_bytes);
+  return ret;
+}
+
+// Right-pads text with repetitions of fill_text until the result holds
+// return_length characters (not bytes).
+//
+// - Empty text or return_length <= 0 yields an empty string.
+// - If text already has return_length characters, or padding is needed but
+//   fill_text is empty, text is returned unchanged.
+// - If text has more than return_length characters, it is truncated to the
+//   first return_length characters (no copy is made).
+// - Otherwise a new buffer is allocated from the context arena, holding text
+//   followed by as many fill_text characters as needed.
+//
+// Bytes of fill_text that are not a valid utf8 lead byte count as one-byte
+// characters. On allocation failure, or if the result would not fit in a signed
+// 32-bit length, an error is recorded on the context and "" is returned with
+// *out_len == 0.
+FORCE_INLINE
+const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
+                                 gdv_int32 return_length, const char* fill_text,
+                                 gdv_int32 fill_text_len, gdv_int32* out_len) {
+  // if the text length or the defined return length (number of characters to return)
+  // is <=0, then return an empty string.
+  if (text_len == 0 || return_length <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // count the number of utf8 characters on text, ignoring invalid bytes
+  int text_char_count = utf8_length_ignore_invalid(text, text_len);
+
+  if (return_length == text_char_count ||
+      (return_length > text_char_count && fill_text_len <= 0)) {
+    // case where the return length is same as the text's length, or if it need to
+    // fill into text but "fill_text" is empty, then return text directly.
+    *out_len = text_len;
+    return text;
+  }
+  if (return_length < text_char_count) {
+    // case where it truncates the result on return length.
+    *out_len = utf8_byte_pos(context, text, text_len, return_length);
+    return text;
+  }
+
+  // case (return_length > text_char_count): pad_chars characters of fill are needed
+  const int32_t pad_chars = return_length - text_char_count;
+
+  // Count the characters in fill_text. A multi-byte sequence cut off by the end
+  // of fill_text is clamped so that no byte past fill_text_len is ever read.
+  int32_t fill_char_count = 0;
+  for (int32_t pos = 0; pos < fill_text_len; ++fill_char_count) {
+    int32_t char_len = utf8_char_length(fill_text[pos]);
+    // ignore invalid char on the fill text, considering it as size 1
+    if (char_len == 0) char_len = 1;
+    if (char_len > fill_text_len - pos) char_len = fill_text_len - pos;
+    pos += char_len;
+  }
+
+  // The padding is full_cycles whole copies of fill_text followed by a prefix of
+  // (pad_chars % fill_char_count) characters; measure that prefix in bytes.
+  const int32_t full_cycles = pad_chars / fill_char_count;
+  int32_t tail_bytes = 0;
+  for (int32_t left = pad_chars % fill_char_count; left > 0; --left) {
+    int32_t char_len = utf8_char_length(fill_text[tail_bytes]);
+    if (char_len == 0) char_len = 1;
+    tail_bytes += char_len;
+  }
+
+  // The buffer must be sized in BYTES: multi-byte characters in text or
+  // fill_text make the result longer than return_length bytes.
+  const int64_t max_out_bytes = 2147483647;  // largest value *out_len can hold
+  const int64_t out_bytes =
+      static_cast<int64_t>(full_cycles) * fill_text_len + tail_bytes + text_len;
+  if (out_bytes > max_out_bytes) {
+    gdv_fn_context_set_error_msg(context, "Would overflow maximum output size");
+    *out_len = 0;
+    return "";
+  }
+
+  char* ret = reinterpret_cast<gdv_binary>(
+      gdv_fn_context_arena_malloc(context, static_cast<int32_t>(out_bytes)));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context,
+                                 "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  // write the main string first ...
+  memcpy(ret, text, text_len);
+  // ... then the padding
+  char* cursor = ret + text_len;
+  for (int32_t cycle = 0; cycle < full_cycles; ++cycle) {
+    memcpy(cursor, fill_text, fill_text_len);
+    cursor += fill_text_len;
+  }
+  memcpy(cursor, fill_text, tail_bytes);
+
+  *out_len = static_cast<int32_t>(out_bytes);
+  return ret;
+}
+
+// Two-argument lpad: pads on the left using a single space as fill text.
+FORCE_INLINE
+const char* lpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
+                            gdv_int32 return_length, gdv_int32* out_len) {
+  const char* default_fill = " ";
+  const gdv_int32 default_fill_len = 1;
+  return lpad_utf8_int32_utf8(context, text, text_len, return_length, default_fill,
+                              default_fill_len, out_len);
+}
+
+// Two-argument rpad: pads on the right using a single space as fill text.
+FORCE_INLINE
+const char* rpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
+                            gdv_int32 return_length, gdv_int32* out_len) {
+  const char* default_fill = " ";
+  const gdv_int32 default_fill_len = 1;
+  return rpad_utf8_int32_utf8(context, text, text_len, return_length, default_fill,
+                              default_fill_len, out_len);
+}
+
+// Splits text on delimiter and returns the index-th piece (1-indexed).
+//
+// - index < 1 records an error on the context and returns "" with *out_len == 0.
+// - An empty delimiter or empty text returns text unchanged.
+// - If there are fewer than index pieces, "" is returned with *out_len == 0.
+// - Otherwise the piece is copied into a buffer obtained from
+//   gdv_fn_context_arena_malloc; on allocation failure an error is recorded and
+//   "" is returned.
+//
+// NOTE(review): match_string is defined elsewhere; from its use here it appears
+// to return the byte position just past the next delimiter match at or after
+// the given start, or -1 when there is none - confirm against its definition.
+FORCE_INLINE
+const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len,
+                       const char* delimiter, gdv_int32 delim_len, gdv_int32 index,
+                       gdv_int32* out_len) {
+  // every early "" return below relies on this default
+  *out_len = 0;
+  if (index < 1) {
+    char error_message[100];
+    snprintf(error_message, sizeof(error_message),
+             "Index in split_part must be positive, value provided was %d", index);
+    gdv_fn_context_set_error_msg(context, error_message);
+    return "";
+  }
+
+  if (delim_len == 0 || text_len == 0) {
+    // output will just be text if no delimiter is provided
+    *out_len = text_len;
+    return text;
+  }
+
+  // i is the byte offset where the current piece starts; match_no is its
+  // 1-indexed ordinal
+  int i = 0, match_no = 1;
+
+  while (i < text_len) {
+    // find the position where delimiter matched for the first time
+    int match_pos = match_string(text, text_len, i, delimiter, delim_len);
+    if (match_pos == -1 && match_no != index) {
+      // reached the end without finding a match.
+      return "";
+    } else {
+      // Found a match. If the match number is index then return this match
+      if (match_no == index) {
+        // the piece ends where the delimiter begins
+        int end_pos = match_pos - delim_len;
+
+        if (match_pos == -1) {
+          // end position should be last position of the string as we have the last
+          // delimiter
+          end_pos = text_len;
+        }
+
+        *out_len = end_pos - i;
+        char* out_str =
+            reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
+        if (out_str == nullptr) {
+          gdv_fn_context_set_error_msg(context,
+                                       "Could not allocate memory for output string");
+          *out_len = 0;
+          return "";
+        }
+        memcpy(out_str, text + i, *out_len);
+        return out_str;
+      } else {
+        // not the requested piece yet: continue from just past this delimiter
+        i = match_pos;
+        match_no++;
+      }
+    }
+  }
+
+  // text ended (e.g. on a trailing delimiter) before the index-th piece started
+  return "";
+}
+
+// Returns the x leftmost characters of a given string. Cases:
+//     LEFT("TestString", 10) => "TestString"
+//     LEFT("TestString", 3) => "Tes"
+//     LEFT("TestString", -3) => "TestStr"
+//     LEFT("TestString", -10) => ""
+//
+// A positive number keeps at most that many leading characters. A negative
+// number drops |number| trailing characters, which yields "" when the text has
+// no more than |number| characters. Counting is done in utf8 characters, not
+// bytes. The result always aliases the input buffer (no copy is made).
+//
+// If an invalid utf8 byte is met while scanning, an error is recorded on the
+// context and "" is returned with *out_len == 0. For a positive number only the
+// returned prefix is scanned; for a negative number the whole text is scanned
+// because the total character count is needed.
+FORCE_INLINE
+const char* left_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
+                            gdv_int32 number, gdv_int32* out_len) {
+  // returns the 'number' left most characters of a given text
+  if (text_len == 0 || number == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // iterate over the utf8 string validating each character
+  int char_len;
+  int char_count = 0;
+  int byte_index = 0;
+  for (int i = 0; i < text_len; i += char_len) {
+    char_len = utf8_char_length(text[i]);
+    if (char_len == 0 || i + char_len > text_len) {  // invalid byte or incomplete glyph
+      set_error_for_invalid_utf(context, text[i]);
+      *out_len = 0;
+      return "";
+    }
+    for (int j = 1; j < char_len; ++j) {
+      if ((text[i + j] & 0xC0) != 0x80) {  // bytes following head-byte of glyph
+        set_error_for_invalid_utf(context, text[i + j]);
+        *out_len = 0;
+        return "";
+      }
+    }
+    byte_index += char_len;
+    ++char_count;
+    // case where left('abc', 5) -> 'abc': stop as soon as enough characters
+    // were collected; running off the end keeps the whole text
+    if (number > 0 && char_count == number) break;
+  }
+
+  if (number > 0) {
+    *out_len = byte_index;
+    return text;
+  }
+
+  // number < 0: char_count now holds the total number of characters. The
+  // characters to keep must be derived from that count, not from the byte
+  // length, otherwise multi-byte text is cut at the wrong place.
+  const int keep_chars = char_count + number;
+  if (keep_chars <= 0) {
+    // case where left('abc', -5) ==> ''
+    *out_len = 0;
+    return "";
+  }
+
+  // the text was fully validated above, so every lead byte has a non-zero length
+  int keep_bytes = 0;
+  for (int kept = 0; kept < keep_chars; ++kept) {
+    keep_bytes += utf8_char_length(text[keep_bytes]);
+  }
+
+  *out_len = keep_bytes;
+  return text;
+}
+
+// Returns the x rightmost characters of a given string. Cases:
+//     RIGHT("TestString", 10) => "TestString"
+//     RIGHT("TestString", 3) => "ing"
+//     RIGHT("TestString", -3) => "tString"
+//
+// A positive number keeps at most that many trailing characters. A negative
+// number drops |number| leading characters, which yields "" when the text has
+// no more than |number| characters. Counting is done in utf8 characters, not
+// bytes. The result is copied into a buffer obtained from the context arena.
+//
+// Text containing invalid utf8, or an allocation failure, yields "" with
+// *out_len == 0 (an allocation failure also records an error on the context).
+FORCE_INLINE
+const char* right_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
+                             gdv_int32 number, gdv_int32* out_len) {
+  // returns the 'number' right most characters of a given text
+  if (text_len == 0 || number == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // initially counts the number of utf8 characters in the defined text
+  int32_t char_count = utf8_length(context, text, text_len);
+  // char_count is zero if input has invalid utf8 char
+  if (char_count == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // Number of leading characters to drop. Kept in 64 bits so that negating
+  // the most negative 32-bit value cannot overflow.
+  int64_t skip_chars;
+  if (number > 0) {
+    // case where right('abc', 5) ==> 'abc': nothing is dropped
+    skip_chars = (char_count > number) ? char_count - number : 0;
+  } else {
+    skip_chars = -static_cast<int64_t>(number);
+  }
+
+  // case where right('abc', -5) ==> '': everything is dropped
+  if (skip_chars >= char_count) {
+    *out_len = 0;
+    return "";
+  }
+
+  // The result is the tail of the text starting at this byte, so its byte
+  // length is whatever remains after it. Measuring the first N characters
+  // instead would be wrong whenever characters differ in byte width.
+  int32_t start_byte_pos =
+      utf8_byte_pos(context, text, text_len, static_cast<int32_t>(skip_chars));
+  if (start_byte_pos < 0 || start_byte_pos > text_len) {
+    *out_len = 0;
+    return "";
+  }
+  *out_len = text_len - start_byte_pos;
+
+  // try to allocate memory for the response
+  char* ret =
+      reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, *out_len));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+  memcpy(ret, text + start_byte_pos, *out_len);
+  return ret;
+}
+
+// Decodes "\xHH" / "\XHH" escapes (two hexadecimal digits) in text into the
+// single byte they denote; every other byte, including a backslash that is not
+// followed by 'x' and two hex digits, is copied through unchanged.
+//
+// The result is written to a buffer obtained from the context arena. Empty
+// input yields "" without allocating. On allocation failure an error is
+// recorded on the context and "" is returned with *out_len == 0.
+FORCE_INLINE
+const char* binary_string(gdv_int64 context, const char* text, gdv_int32 text_len,
+                          gdv_int32* out_len) {
+  // handle empty input before allocating, so that a zero-byte allocation can
+  // never be mistaken for an out-of-memory condition
+  if (text_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // the decoded output is never longer than the input
+  gdv_binary ret =
+      reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, text_len));
+
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  // converting hex encoded string to normal string
+  int j = 0;
+  for (int i = 0; i < text_len; i++, j++) {
+    if (text[i] == '\\' && i + 3 < text_len &&
+        (text[i + 1] == 'x' || text[i + 1] == 'X')) {
+      char hd1 = text[i + 2];
+      char hd2 = text[i + 3];
+      // isxdigit is only defined for values representable as unsigned char
+      if (isxdigit(static_cast<unsigned char>(hd1)) &&
+          isxdigit(static_cast<unsigned char>(hd2))) {
+        // [a-fA-F0-9]
+        ret[j] = to_binary_from_hex(hd1) * 16 + to_binary_from_hex(hd2);
+        // skip the remaining three bytes of the escape sequence
+        i += 3;
+      } else {
+        ret[j] = text[i];
+      }
+    } else {
+      ret[j] = text[i];
+    }
+  }
+  *out_len = j;
+  return ret;
+}
+
+// Generates cast<TYPE_NAME>_varbinary, which parses a hexadecimal string with an
+// optional leading '-' into OUT_TYPE.
+//
+// On empty input, a lone '-', a non-hexadecimal character, or a value that does
+// not fit in OUT_TYPE, an error is recorded on the context and -1 is returned.
+//
+// The magnitude is accumulated in an unsigned 64-bit integer and checked against
+// the limit BEFORE each multiplication, so no signed overflow can occur and the
+// most negative value of OUT_TYPE is still accepted.
+#define CAST_INT_BIGINT_VARBINARY(OUT_TYPE, TYPE_NAME) \
+  FORCE_INLINE \
+  OUT_TYPE \
+  cast##TYPE_NAME##_varbinary(gdv_int64 context, const char* in, int32_t in_len) { \
+    if (in_len == 0) { \
+      gdv_fn_context_set_error_msg(context, "Can't cast an empty string."); \
+      return -1; \
+    } \
+ \
+    bool negative = false; \
+    if (in[0] == '-') { \
+      negative = true; \
+      /* Ignores the sign char in the hexadecimal string */ \
+      in++; \
+      in_len--; \
+    } \
+ \
+    if (negative && in_len == 0) { \
+      gdv_fn_context_set_error_msg(context, \
+                                   "Can't cast hexadecimal with only a minus sign."); \
+      return -1; \
+    } \
+ \
+    /* |minimum value| of OUT_TYPE; the maximum value is one less */ \
+    const uint64_t min_magnitude = static_cast<uint64_t>(1) \
+                                   << (sizeof(OUT_TYPE) * 8 - 1); \
+    const uint64_t limit = negative ? min_magnitude : min_magnitude - 1; \
+ \
+    uint64_t magnitude = 0; \
+    for (int32_t read_index = 0; read_index < in_len; ++read_index) { \
+      const char c1 = in[read_index]; \
+      /* isxdigit is only defined for values representable as unsigned char */ \
+      if (!isxdigit(static_cast<unsigned char>(c1))) { \
+        gdv_fn_context_set_error_msg(context, \
+                                     "The hexadecimal given has invalid characters."); \
+        return -1; \
+      } \
+      const uint64_t digit = static_cast<uint64_t>(to_binary_from_hex(c1)); \
+ \
+      /* magnitude * 16 + digit <= limit, tested without overflowing */ \
+      if (magnitude > (limit - digit) / 16) { \
+        gdv_fn_context_set_error_msg(context, "Integer overflow."); \
+        return -1; \
+      } \
+      magnitude = magnitude * 16 + digit; \
+    } \
+ \
+    if (!negative) { \
+      return static_cast<OUT_TYPE>(magnitude); \
+    } \
+    if (magnitude == 0) { \
+      return 0; \
+    } \
+    /* -(magnitude - 1) - 1 reaches the minimum value without overflowing */ \
+    return static_cast<OUT_TYPE>(-static_cast<int64_t>(magnitude - 1) - 1); \
+  }
+
+CAST_INT_BIGINT_VARBINARY(int32_t, INT)
+CAST_INT_BIGINT_VARBINARY(int64_t, BIGINT)
+
+#undef CAST_INT_BIGINT_VARBINARY
+
+// Produces the binary representation of a string y characters long derived by starting
+// at offset 'x' and considering the defined length 'y'. Notice that the offset index
+// may be a negative number (starting from the end of the string), or a positive number
+// starting on index 1. Cases:
+//     BYTE_SUBSTR("TestString", 1, 10) => "TestString"
+//     BYTE_SUBSTR("TestString", 5, 10) => "String"
+//     BYTE_SUBSTR("TestString", -6, 10) => "String"
+//     BYTE_SUBSTR("TestString", -600, 10) => "TestString"
+//     BYTE_SUBSTR("TestString", 11, 10) => ""
+//
+// Offsets and lengths are counted in bytes. An offset of 0, a non-positive
+// length, empty input, or an offset past the end of the input yields "". The
+// result is copied into a buffer obtained from the context arena; on allocation
+// failure an error is recorded on the context and "" is returned with
+// *out_len == 0.
+FORCE_INLINE
+const char* byte_substr_binary_int32_int32(gdv_int64 context, const char* text,
+                                           gdv_int32 text_len, gdv_int32 offset,
+                                           gdv_int32 length, gdv_int32* out_len) {
+  // the first offset position for a string is 1, so not consider offset == 0
+  // also, the length should be always a positive number
+  if (text_len == 0 || offset == 0 || length <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // a negative offset reaching before the start of the input is clamped to 0
+  int32_t start_pos = 0;
+  if (offset > 0) {
+    start_pos = offset - 1;
+  } else if (text_len + offset >= 0) {
+    start_pos = text_len + offset;
+  }
+
+  // an offset past the end selects nothing; without this check the computed
+  // length would be negative and memcpy would be asked for a huge size
+  if (start_pos >= text_len) {
+    *out_len = 0;
+    return "";
+  }
+
+  // truncate the requested length to the bytes actually available; comparing
+  // against the remainder avoids overflowing start_pos + length
+  const int32_t available = text_len - start_pos;
+  const int32_t copy_len = (length < available) ? length : available;
+
+  char* ret =
+      reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, copy_len));
+
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  memcpy(ret, text + start_pos, copy_len);
+  *out_len = copy_len;
+  return ret;
+}
+} // extern "C"
diff --git a/src/arrow/cpp/src/gandiva/precompiled/string_ops_test.cc b/src/arrow/cpp/src/gandiva/precompiled/string_ops_test.cc
new file mode 100644
index 000000000..6221dffb3
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -0,0 +1,1758 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <limits>
+
+#include "gandiva/execution_context.h"
+#include "gandiva/precompiled/types.h"
+
+namespace gandiva {
+
+// mem_compare: equal prefixes compare as 0, a shared prefix is ordered by
+// length, and differing bytes decide the result regardless of the lengths.
+TEST(TestStringOps, TestCompare) {
+  const char* lhs = "abcd789";
+  const char* rhs = "abcd123";
+
+  // identical four-byte prefixes
+  EXPECT_EQ(mem_compare(lhs, 4, rhs, 4), 0);
+
+  // one side is a strict prefix of the other: the longer side is greater
+  EXPECT_GT(mem_compare(lhs, 5, rhs, 4), 0);
+  EXPECT_LT(mem_compare(lhs, 4, rhs, 5), 0);
+
+  // the fifth byte differs ('7' vs '1'), so lengths no longer matter
+  EXPECT_GT(mem_compare(lhs, 5, rhs, 5), 0);
+  EXPECT_GT(mem_compare(lhs, 5, rhs, 7), 0);
+  EXPECT_GT(mem_compare(lhs, 7, rhs, 5), 0);
+}
+
+// ascii_utf8 reports the code of the first character, or 0 for empty input.
+TEST(TestStringOps, TestAscii) {
+  struct AsciiCase {
+    const char* text;
+    int text_len;
+    int expected;
+  };
+
+  const AsciiCase cases[] = {
+      {"ABC", 3, 65},          {"abc", 3, 97}, {"Hello World!", 12, 72},
+      {"This is us", 10, 84},  {"", 0, 0},     {"123", 3, 49},
+      {"999", 3, 57},
+  };
+
+  for (const AsciiCase& c : cases) {
+    EXPECT_EQ(ascii_utf8(c.text, c.text_len), c.expected);
+  }
+}
+
+// Prefix and suffix checks, including an exact match, a too-short text and a
+// text that contains the affix at the wrong end.
+TEST(TestStringOps, TestBeginsEnds) {
+  struct AffixCase {
+    const char* text;
+    int text_len;
+    const char* affix;
+    int affix_len;
+    bool expected;
+  };
+
+  // starts_with
+  const AffixCase prefix_cases[] = {
+      {"hello sir", 9, "hello", 5, true}, {"hellos", 6, "hello", 5, true},
+      {"hello", 5, "hello", 5, true},     {"hell", 4, "hello", 5, false},
+      {"world hello", 11, "hello", 5, false},
+  };
+  for (const AffixCase& c : prefix_cases) {
+    if (c.expected) {
+      EXPECT_TRUE(starts_with_utf8_utf8(c.text, c.text_len, c.affix, c.affix_len));
+    } else {
+      EXPECT_FALSE(starts_with_utf8_utf8(c.text, c.text_len, c.affix, c.affix_len));
+    }
+  }
+
+  // ends_with
+  const AffixCase suffix_cases[] = {
+      {"hello sir", 9, "sir", 3, true}, {"ssir", 4, "sir", 3, true},
+      {"sir", 3, "sir", 3, true},       {"ir", 2, "sir", 3, false},
+      {"hello", 5, "sir", 3, false},
+  };
+  for (const AffixCase& c : suffix_cases) {
+    if (c.expected) {
+      EXPECT_TRUE(ends_with_utf8_utf8(c.text, c.text_len, c.affix, c.affix_len));
+    } else {
+      EXPECT_FALSE(ends_with_utf8_utf8(c.text, c.text_len, c.affix, c.affix_len));
+    }
+  }
+}
+
+// space_int32 / space_int64 return a string made of 'n' spaces, and an empty
+// string for a negative count.
+//
+// The expected values are built with std::string(n, ' ') rather than written as
+// literals, so the number of spaces being asserted is explicit and cannot be
+// altered by whitespace normalisation of the source text.
+TEST(TestStringOps, TestSpace) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  int32_t out_len = 0;
+
+  auto out = space_int32(ctx_ptr, 1, &out_len);
+  EXPECT_EQ(std::string(out, out_len), std::string(1, ' '));
+  out = space_int32(ctx_ptr, 10, &out_len);
+  EXPECT_EQ(std::string(out, out_len), std::string(10, ' '));
+  out = space_int32(ctx_ptr, 5, &out_len);
+  EXPECT_EQ(std::string(out, out_len), std::string(5, ' '));
+  out = space_int32(ctx_ptr, -5, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "");
+
+  out = space_int64(ctx_ptr, 2, &out_len);
+  EXPECT_EQ(std::string(out, out_len), std::string(2, ' '));
+  out = space_int64(ctx_ptr, 9, &out_len);
+  EXPECT_EQ(std::string(out, out_len), std::string(9, ' '));
+  out = space_int64(ctx_ptr, 4, &out_len);
+  EXPECT_EQ(std::string(out, out_len), std::string(4, ' '));
+  out = space_int64(ctx_ptr, -5, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "");
+}
+
+// is_substr_utf8_utf8(text, pattern): matches anywhere in the text, rejects a
+// pattern longer than the text, and accepts the empty pattern.
+TEST(TestStringOps, TestIsSubstr) {
+  struct SubstrCase {
+    const char* text;
+    int text_len;
+    const char* pattern;
+    int pattern_len;
+    bool expected;
+  };
+
+  const SubstrCase cases[] = {
+      {"hello world", 11, "world", 5, true},  {"hello world", 11, "lo wo", 5, true},
+      {"hello world", 11, "adsed", 5, false}, {"hel", 3, "hello", 5, false},
+      {"hello", 5, "hello", 5, true},         {"hello world", 11, "", 0, true},
+  };
+
+  for (const SubstrCase& c : cases) {
+    if (c.expected) {
+      EXPECT_TRUE(is_substr_utf8_utf8(c.text, c.text_len, c.pattern, c.pattern_len));
+    } else {
+      EXPECT_FALSE(is_substr_utf8_utf8(c.text, c.text_len, c.pattern, c.pattern_len));
+    }
+  }
+}
+
+// utf8_length counts characters rather than bytes. For input that is not valid
+// utf8 it is expected to return 0 and record an error on the context naming the
+// offending byte.
+TEST(TestStringOps, TestCharLength) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+
+  // plain ASCII: one character per byte
+  EXPECT_EQ(utf8_length(ctx_ptr, "hello sir", 9), 9);
+
+  // multi-byte characters: the byte length is larger than the character count
+  std::string a("âpple");
+  EXPECT_EQ(utf8_length(ctx_ptr, a.data(), static_cast<int>(a.length())), 5);
+
+  std::string b("मदन");
+  EXPECT_EQ(utf8_length(ctx_ptr, b.data(), static_cast<int>(b.length())), 3);
+
+  // invalid utf8
+  // the error is expected to name the first byte, f8
+  std::string c("\xf8\x28");
+  EXPECT_EQ(utf8_length(ctx_ptr, c.data(), static_cast<int>(c.length())), 0);
+  EXPECT_TRUE(ctx.get_error().find(
+                  "unexpected byte \\f8 encountered while decoding utf8 string") !=
+              std::string::npos)
+      << ctx.get_error();
+  ctx.Reset();
+
+  // input ends right after byte c3; the error is expected to name c3
+  std::string d("aa\xc3");
+  EXPECT_EQ(utf8_length(ctx_ptr, d.data(), static_cast<int>(d.length())), 0);
+  EXPECT_TRUE(ctx.get_error().find(
+                  "unexpected byte \\c3 encountered while decoding utf8 string") !=
+              std::string::npos)
+      << ctx.get_error();
+  ctx.Reset();
+
+  // c3 is followed by 'a' (0x61); the error is expected to name 61
+  std::string e(
+      "a\xc3"
+      "a");
+  EXPECT_EQ(utf8_length(ctx_ptr, e.data(), static_cast<int>(e.length())), 0);
+  EXPECT_TRUE(ctx.get_error().find(
+                  "unexpected byte \\61 encountered while decoding utf8 string") !=
+              std::string::npos)
+      << ctx.get_error();
+  ctx.Reset();
+
+  // c3 is followed by e3; the error is expected to name e3
+  std::string f(
+      "a\xc3\xe3"
+      "a");
+  EXPECT_EQ(utf8_length(ctx_ptr, f.data(), static_cast<int>(f.length())), 0);
+  EXPECT_TRUE(ctx.get_error().find(
+                  "unexpected byte \\e3 encountered while decoding utf8 string") !=
+              std::string::npos)
+      << ctx.get_error();
+  ctx.Reset();
+}
+
+// convert_replace_invalid_fromUTF8_binary substitutes each invalid utf8 byte of
+// the input with the given replacement. The cases below cover a one-byte
+// replacement, an empty replacement (invalid bytes are dropped), fully valid
+// input (returned unchanged) and a replacement longer than one byte (rejected
+// with an error).
+//
+// Each call passes the same variable as input length and as output length
+// pointer, so the variable holds the result length afterwards.
+TEST(TestStringOps, TestConvertReplaceInvalidUtf8Char) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+
+  // invalid utf8 (xf8 is invalid but x28 is not - x28 = '(')
+  std::string a(
+      "ok-\xf8\x28"
+      "-a");
+  auto a_in_out_len = static_cast<int>(a.length());
+  const char* a_str = convert_replace_invalid_fromUTF8_binary(
+      ctx_ptr, a.data(), a_in_out_len, "a", 1, &a_in_out_len);
+  EXPECT_EQ(std::string(a_str, a_in_out_len), "ok-a(-a");
+  EXPECT_FALSE(ctx.has_error());
+
+  // invalid utf8 (xa0 and xa1 are invalid)
+  std::string b("ok-\xa0\xa1-valid");
+  auto b_in_out_len = static_cast<int>(b.length());
+  const char* b_str = convert_replace_invalid_fromUTF8_binary(
+      ctx_ptr, b.data(), b_in_out_len, "b", 1, &b_in_out_len);
+  EXPECT_EQ(std::string(b_str, b_in_out_len), "ok-bb-valid");
+  EXPECT_FALSE(ctx.has_error());
+
+  // full valid utf8
+  std::string c("all-valid");
+  auto c_in_out_len = static_cast<int>(c.length());
+  const char* c_str = convert_replace_invalid_fromUTF8_binary(
+      ctx_ptr, c.data(), c_in_out_len, "c", 1, &c_in_out_len);
+  EXPECT_EQ(std::string(c_str, c_in_out_len), "all-valid");
+  EXPECT_FALSE(ctx.has_error());
+
+  // valid utf8 (महसुस is 4-char string, each char of which is likely a multibyte char)
+  std::string d("ok-महसुस-valid-new");
+  auto d_in_out_len = static_cast<int>(d.length());
+  const char* d_str = convert_replace_invalid_fromUTF8_binary(
+      ctx_ptr, d.data(), d_in_out_len, "d", 1, &d_in_out_len);
+  EXPECT_EQ(std::string(d_str, d_in_out_len), "ok-महसुस-valid-new");
+  EXPECT_FALSE(ctx.has_error());
+
+  // full valid utf8, but invalid replacement char length
+  // a two-byte replacement is expected to fail even though nothing needs replacing
+  std::string e("all-valid");
+  auto e_in_out_len = static_cast<int>(e.length());
+  const char* e_str = convert_replace_invalid_fromUTF8_binary(
+      ctx_ptr, e.data(), e_in_out_len, "ee", 2, &e_in_out_len);
+  EXPECT_EQ(std::string(e_str, e_in_out_len), "");
+  EXPECT_TRUE(ctx.has_error());
+  ctx.Reset();
+
+  // invalid utf8 (xa0 and xa1 are invalid) with empty replacement char length
+  // invalid bytes in the middle of the input are dropped
+  std::string f("ok-\xa0\xa1-valid");
+  auto f_in_out_len = static_cast<int>(f.length());
+  const char* f_str = convert_replace_invalid_fromUTF8_binary(
+      ctx_ptr, f.data(), f_in_out_len, "", 0, &f_in_out_len);
+  EXPECT_EQ(std::string(f_str, f_in_out_len), "ok--valid");
+  EXPECT_FALSE(ctx.has_error());
+  ctx.Reset();
+
+  // invalid utf8 (xa0 and xa1 are invalid) with empty replacement char length
+  // invalid bytes at the start, in the middle and at the end are all dropped
+  std::string g("\xa0\xa1-ok-\xa0\xa1-valid-\xa0\xa1");
+  auto g_in_out_len = static_cast<int>(g.length());
+  const char* g_str = convert_replace_invalid_fromUTF8_binary(
+      ctx_ptr, g.data(), g_in_out_len, "", 0, &g_in_out_len);
+  EXPECT_EQ(std::string(g_str, g_in_out_len), "-ok--valid-");
+  EXPECT_FALSE(ctx.has_error());
+  ctx.Reset();
+
+  // invalid bytes only at the start, empty replacement
+  std::string h("\xa0\xa1-valid");
+  auto h_in_out_len = static_cast<int>(h.length());
+  const char* h_str = convert_replace_invalid_fromUTF8_binary(
+      ctx_ptr, h.data(), h_in_out_len, "", 0, &h_in_out_len);
+  EXPECT_EQ(std::string(h_str, h_in_out_len), "-valid");
+  EXPECT_FALSE(ctx.has_error());
+  ctx.Reset();
+
+  // three separate runs of invalid bytes, empty replacement
+  std::string i("\xa0\xa1-valid-\xa0\xa1-valid-\xa0\xa1");
+  auto i_in_out_len = static_cast<int>(i.length());
+  const char* i_str = convert_replace_invalid_fromUTF8_binary(
+      ctx_ptr, i.data(), i_in_out_len, "", 0, &i_in_out_len);
+  EXPECT_EQ(std::string(i_str, i_in_out_len), "-valid--valid-");
+  EXPECT_FALSE(ctx.has_error());
+  ctx.Reset();
+}
+
+// repeat_utf8_int32 concatenates the input n times; empty input stays empty and
+// a negative repeat count is reported as an error.
+TEST(TestStringOps, TestRepeat) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+
+  // runs the function and materialises the result as a std::string
+  auto repeat = [&](const char* text, gdv_int32 text_len, gdv_int32 times) {
+    const char* out_str = repeat_utf8_int32(ctx_ptr, text, text_len, times, &out_len);
+    return std::string(out_str, out_len);
+  };
+
+  EXPECT_EQ(repeat("abc", 3, 2), "abcabc");
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(repeat("a", 1, 5), "aaaaa");
+  EXPECT_FALSE(ctx.has_error());
+
+  // empty input, however often repeated, stays empty
+  EXPECT_EQ(repeat("", 0, 10), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  // a negative input length is treated like empty input
+  EXPECT_EQ(repeat("", -20, 10), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  // a negative repeat count is an error
+  EXPECT_EQ(repeat("a", 1, -10), "");
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Repeat number can't be negative"));
+  ctx.Reset();
+}
+
+// castVARCHAR_bool_int64 renders a boolean as "true"/"false", truncated to the
+// requested length; a negative length is reported as an error.
+TEST(TestStringOps, TestCastBoolToVarchar) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+
+  // runs the cast and materialises the result as a std::string
+  auto cast_bool = [&](bool value, gdv_int64 max_len) {
+    const char* out_str = castVARCHAR_bool_int64(ctx_ptr, value, max_len, &out_len);
+    return std::string(out_str, out_len);
+  };
+
+  // shorter than the text: truncated
+  EXPECT_EQ(cast_bool(true, 2), "tr");
+  EXPECT_FALSE(ctx.has_error());
+
+  // longer than the text: returned in full
+  EXPECT_EQ(cast_bool(true, 7), "true");
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(cast_bool(false, 4), "fals");
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(cast_bool(false, 5), "false");
+  EXPECT_FALSE(ctx.has_error());
+
+  // negative length: only the recorded error is checked, the result is ignored
+  castVARCHAR_bool_int64(ctx_ptr, true, -3, &out_len);
+  EXPECT_THAT(ctx.get_error(),
+              ::testing::HasSubstr("Output buffer length can't be negative"));
+  ctx.Reset();
+}
+
+// castBIT_utf8 parses "true"/"false"/"1"/"0" case-insensitively, ignoring
+// surrounding spaces; anything else records an error and yields false.
+//
+// Padded inputs are assembled from std::string(n, ' ') so that the buffer is
+// guaranteed to be exactly as long as the length handed to castBIT_utf8. Passing
+// a length larger than the literal would make the function read past the end of
+// the string.
+TEST(TestStringOps, TestCastVarcharToBool) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+
+  // calls castBIT_utf8 with the exact byte length of the given input
+  auto cast_bit = [&](const std::string& in) {
+    return castBIT_utf8(ctx_ptr, in.data(), static_cast<int>(in.length()));
+  };
+
+  const std::string pad2(2, ' ');
+  const std::string pad3(3, ' ');
+  const std::string pad5(5, ' ');
+
+  EXPECT_EQ(cast_bit("true"), true);
+  EXPECT_FALSE(ctx.has_error());
+
+  // 14 bytes: spaces on both sides
+  EXPECT_EQ(cast_bit(pad5 + "true" + pad5), true);
+  EXPECT_FALSE(ctx.has_error());
+
+  // 9 bytes: trailing spaces only
+  EXPECT_EQ(cast_bit("true" + pad5), true);
+  EXPECT_FALSE(ctx.has_error());
+
+  // 9 bytes: leading spaces only
+  EXPECT_EQ(cast_bit(pad5 + "true"), true);
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(cast_bit("TRUE"), true);
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(cast_bit("TrUe"), true);
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(cast_bit("1"), true);
+  EXPECT_FALSE(ctx.has_error());
+
+  // 3 bytes: leading spaces only
+  EXPECT_EQ(cast_bit(pad2 + "1"), true);
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(cast_bit("false"), false);
+  EXPECT_FALSE(ctx.has_error());
+
+  // 10 bytes: trailing spaces only
+  EXPECT_EQ(cast_bit("false" + pad5), false);
+  EXPECT_FALSE(ctx.has_error());
+
+  // 10 bytes: leading spaces only
+  EXPECT_EQ(cast_bit(pad5 + "false"), false);
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(cast_bit("0"), false);
+  EXPECT_FALSE(ctx.has_error());
+
+  // 4 bytes: trailing spaces only
+  EXPECT_EQ(cast_bit("0" + pad3), false);
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(cast_bit("FALSE"), false);
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(cast_bit("FaLsE"), false);
+  EXPECT_FALSE(ctx.has_error());
+
+  // not a boolean literal: error is recorded and false is returned
+  EXPECT_EQ(cast_bit("test"), false);
+  EXPECT_TRUE(ctx.has_error());
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Invalid value for boolean"));
+  ctx.Reset();
+}
+
+TEST(TestStringOps, TestCastVarchar) {
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+
+ // BINARY TESTS
+ const char* out_str = castVARCHAR_binary_int64(ctx_ptr, "asdf", 4, 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "a");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "asdf", 4, 6, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "asdf");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "asdf", 4, 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "asd");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "asdf", 4, 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "asdf");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "asdf", 4, 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "asdf");
+ EXPECT_FALSE(ctx.has_error());
+
+ // do not truncate if output length is 0
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "asdf", 4, 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "asdf");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "", 0, 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "çåå†", 9, 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çåå");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "çåå†", 9, 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çåå†");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "çåå†", 9, 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çåå†");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "çåå†", 9, 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çåå†");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "çåå†", 9, 6, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çåå†");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "abc", 3, -1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Output buffer length can't be negative"));
+ ctx.Reset();
+
+ std::string z("aa\xc3");
+ out_str = castVARCHAR_binary_int64(ctx_ptr, z.data(), static_cast<int>(z.length()), 2,
+ &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "aa");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "1234567812341234", 16, 16, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "1234567812341234");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "1234567812341234", 16, 15, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "123456781234123");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "1234567812341234", 16, 12, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "123456781234");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "1234567812341234", 16, 8, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "12345678");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "1234567812341234", 16, 7, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "1234567");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "1234567812341234", 16, 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "1234");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "1234567812341234", 16, 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "123");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "1234567812çåå†123456", 25, 16, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "1234567812çåå†12");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "123456781234çåå†1234", 25, 15, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "123456781234çåå");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "12çåå†34567812123456", 25, 16, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "12çåå†3456781212");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "çåå†1234567812123456", 25, 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çåå†");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "çåå†1234567812123456", 25, 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çåå");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_binary_int64(ctx_ptr, "123456781234çåå†", 21, 40, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "123456781234çåå†");
+ EXPECT_FALSE(ctx.has_error());
+
+ std::string f("123456781234çåå\xc3");
+ out_str = castVARCHAR_binary_int64(ctx_ptr, f.data(), static_cast<int32_t>(f.length()),
+ 16, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr(
+ "unexpected byte \\c3 encountered while decoding utf8 string"));
+ ctx.Reset();
+
+ // UTF8 TESTS
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "asdf", 4, 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "a");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "asdf", 4, 6, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "asdf");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "asdf", 4, 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "asd");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "asdf", 4, 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "asdf");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "asdf", 4, 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "asdf");
+ EXPECT_FALSE(ctx.has_error());
+
+ // do not truncate if output length is 0
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "asdf", 4, 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "asdf");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "", 0, 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "çåå†", 9, 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çåå");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "çåå†", 9, 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çåå†");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "çåå†", 9, 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çåå†");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "çåå†", 9, 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çåå†");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "çåå†", 9, 6, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çåå†");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "abc", 3, -1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Output buffer length can't be negative"));
+ ctx.Reset();
+
+ std::string d("aa\xc3");
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, d.data(), static_cast<int>(d.length()), 2,
+ &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "aa");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "1234567812341234", 16, 16, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "1234567812341234");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "1234567812341234", 16, 15, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "123456781234123");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "1234567812341234", 16, 12, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "123456781234");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "1234567812341234", 16, 8, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "12345678");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "1234567812341234", 16, 7, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "1234567");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "1234567812341234", 16, 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "1234");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "1234567812341234", 16, 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "123");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "1234567812çåå†123456", 25, 16, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "1234567812çåå†12");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "123456781234çåå†1234", 25, 15, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "123456781234çåå");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "12çåå†34567812123456", 25, 16, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "12çåå†3456781212");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "çåå†1234567812123456", 25, 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çåå†");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "çåå†1234567812123456", 25, 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çåå");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, "123456781234çåå†", 21, 40, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "123456781234çåå†");
+ EXPECT_FALSE(ctx.has_error());
+
+ std::string y("123456781234çåå\xc3");
+ out_str = castVARCHAR_utf8_int64(ctx_ptr, y.data(), static_cast<int32_t>(y.length()),
+ 16, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr(
+ "unexpected byte \\c3 encountered while decoding utf8 string"));
+ ctx.Reset();
+}
+
+TEST(TestStringOps, TestSubstring) {  // substr_utf8_int64_int64 / substr_utf8_int64 cases that must not set an error
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+ // A requested length of 0 is expected to give an empty result.
+ const char* out_str = substr_utf8_int64_int64(ctx_ptr, "asdf", 4, 1, 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+ // Offset 1 with length 2 is expected to select the first two characters.
+ out_str = substr_utf8_int64_int64(ctx_ptr, "asdf", 4, 1, 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "as");
+ EXPECT_FALSE(ctx.has_error());
+ // A length that runs past the end is expected to return the rest of the string.
+ out_str = substr_utf8_int64_int64(ctx_ptr, "asdf", 4, 1, 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "asdf");
+ EXPECT_FALSE(ctx.has_error());
+ // Offset 0 is expected to give the same result as offset 1.
+ out_str = substr_utf8_int64_int64(ctx_ptr, "asdf", 4, 0, 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "asdf");
+ EXPECT_FALSE(ctx.has_error());
+ // Offset -2 is expected to select the last two characters.
+ out_str = substr_utf8_int64_int64(ctx_ptr, "asdf", 4, -2, 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "df");
+ EXPECT_FALSE(ctx.has_error());
+ // Offset -5 on a 4-character string is expected to give an empty result.
+ out_str = substr_utf8_int64_int64(ctx_ptr, "asdf", 4, -5, 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+ // Multi-byte text: the input length (25) is in bytes; offset and length count characters.
+ out_str = substr_utf8_int64_int64(ctx_ptr, "अपाचे एरो", 25, 1, 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "अपाचे");
+ EXPECT_FALSE(ctx.has_error());
+ // Offset 7 with length 4 is expected to return the three characters that remain.
+ out_str = substr_utf8_int64_int64(ctx_ptr, "अपाचे एरो", 25, 7, 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "एरो");
+ EXPECT_FALSE(ctx.has_error());
+ // The four-character literal below occupies 9 bytes (2 + 2 + 2 + 3).
+ out_str = substr_utf8_int64_int64(ctx_ptr, "çåå†", 9, 4, 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "†");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = substr_utf8_int64_int64(ctx_ptr, "çåå†", 9, 2, 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "åå");
+ EXPECT_FALSE(ctx.has_error());
+ // Offset 0 on multi-byte text is expected to start at the first character.
+ out_str = substr_utf8_int64_int64(ctx_ptr, "çåå†", 9, 0, 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "çå");
+ EXPECT_FALSE(ctx.has_error());
+ // Negative length: expect an empty result and no error. NOTE(review): length 4 exceeds the 3 visible characters - confirm.
+ out_str = substr_utf8_int64_int64(ctx_ptr, "afg", 4, 0, -5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+ // Empty input with offset 5 is expected to give an empty result.
+ out_str = substr_utf8_int64_int64(ctx_ptr, "", 0, 5, 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+ // substr_utf8_int64 takes an offset only; the expected result runs to the end of the input.
+ out_str = substr_utf8_int64(ctx_ptr, "abcd", 4, 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "bcd");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = substr_utf8_int64(ctx_ptr, "abcd", 4, 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "abcd");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = substr_utf8_int64(ctx_ptr, "çåå†", 9, 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "åå†");
+ EXPECT_FALSE(ctx.has_error());
+}
+
+TEST(TestStringOps, TestSubstringInvalidInputs) {  // substr on malformed UTF-8 and on an out-of-range offset
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+ // Input starts with 0xA7, which is not a valid UTF-8 lead byte: expect an empty result and an error.
+ char bytes[] = {'\xA7', 'a'};
+ const char* out_str = substr_utf8_int64_int64(ctx_ptr, bytes, 2, 1, 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_TRUE(ctx.has_error());
+ ctx.Reset();
+ // 0xA7 in the middle: an error is expected even though only the first character is requested.
+ char midbytes[] = {'c', '\xA7', 'a'};
+ out_str = substr_utf8_int64_int64(ctx_ptr, midbytes, 3, 1, 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_TRUE(ctx.has_error());
+ ctx.Reset();
+ // 0xC3 (a two-byte lead byte) followed by ASCII instead of a continuation byte.
+ char midbytes2[] = {'\xC3', 'a', 'a'};
+ out_str = substr_utf8_int64_int64(ctx_ptr, midbytes2, 3, 1, 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_TRUE(ctx.has_error());
+ ctx.Reset();
+ // 0xA7 as the final byte, after the requested character.
+ char endbytes[] = {'a', 'a', '\xA7'};
+ out_str = substr_utf8_int64_int64(ctx_ptr, endbytes, 3, 1, 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_TRUE(ctx.has_error());
+ ctx.Reset();
+ // 0xC3 as the final byte, so the sequence it starts is cut off by the end of the input.
+ char endbytes2[] = {'a', 'a', '\xC3'};
+ out_str = substr_utf8_int64_int64(ctx_ptr, endbytes2, 3, 1, 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_TRUE(ctx.has_error());
+ ctx.Reset();
+ // Offset 2147483656 is larger than INT32_MAX: expect an empty result and no error.
+ out_str = substr_utf8_int64_int64(ctx_ptr, "çåå†", 9, 2147483656, 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+}
+
+TEST(TestGdvFnStubs, TestCastVarbinaryUtf8) {  // castVARBINARY_utf8_int64 with zero, small, large and negative output lengths
+ gandiva::ExecutionContext ctx;
+
+ int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+ int32_t out_len = 0;
+ const char* input = "abc";
+ const char* out;
+ // Output length 0 is expected to return the input unchanged.
+ out = castVARBINARY_utf8_int64(ctx_ptr, input, 3, 0, &out_len);
+ EXPECT_EQ(std::string(out, out_len), input);
+ // Output length 1 is expected to cut the result down to "a".
+ out = castVARBINARY_utf8_int64(ctx_ptr, input, 3, 1, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "a");
+ // An output length larger than the input is expected to return the input unchanged.
+ out = castVARBINARY_utf8_int64(ctx_ptr, input, 3, 500, &out_len);
+ EXPECT_EQ(std::string(out, out_len), input);
+ // A negative output length is expected to give an empty result and the error checked below.
+ out = castVARBINARY_utf8_int64(ctx_ptr, input, 3, -10, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "");
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Output buffer length can't be negative"));
+ ctx.Reset();
+}
+
+TEST(TestGdvFnStubs, TestCastVarbinaryBinary) {  // castVARBINARY_binary_int64 with zero, small, large and negative output lengths
+ gandiva::ExecutionContext ctx;
+
+ int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+ int32_t out_len = 0;
+ const char* input = "\\x41\\x42\\x43";
+ const char* out;
+ // The backslashes in input are escaped, so it is 12 characters of text; output length 0 returns all of it.
+ out = castVARBINARY_binary_int64(ctx_ptr, input, 12, 0, &out_len);
+ EXPECT_EQ(std::string(out, out_len), input);
+ // Only the first 8 bytes are passed in, with a matching output length of 8.
+ out = castVARBINARY_binary_int64(ctx_ptr, input, 8, 8, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "\\x41\\x42");
+ // An output length larger than the input is expected to return the input unchanged.
+ out = castVARBINARY_binary_int64(ctx_ptr, input, 12, 500, &out_len);
+ EXPECT_EQ(std::string(out, out_len), input);
+ // A negative output length is expected to give an empty result and the error checked below.
+ out = castVARBINARY_binary_int64(ctx_ptr, input, 12, -10, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "");
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Output buffer length can't be negative"));
+ ctx.Reset();
+}
+
+TEST(TestStringOps, TestConcat) {  // concat_utf8_* (a bool flag follows each string) and concatOperator_utf8_* (no flags), 2 to 10 inputs
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+ // concat_utf8_utf8: the second argument is flagged false and is absent from the expected output.
+ const char* out_str =
+ concat_utf8_utf8(ctx_ptr, "abcd", 4, true, "\npq", 3, false, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "abcd");
+ EXPECT_FALSE(ctx.has_error());
+ // concatOperator_utf8_utf8 takes no flags and is expected to join both inputs.
+ out_str = concatOperator_utf8_utf8(ctx_ptr, "asdf", 4, "jkl", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "asdfjkl");
+ EXPECT_FALSE(ctx.has_error());
+ // Empty right-hand input.
+ out_str = concatOperator_utf8_utf8(ctx_ptr, "asdf", 4, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "asdf");
+ EXPECT_FALSE(ctx.has_error());
+ // Empty left-hand input.
+ out_str = concatOperator_utf8_utf8(ctx_ptr, "", 0, "jkl", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "jkl");
+ EXPECT_FALSE(ctx.has_error());
+ // Both inputs empty.
+ out_str = concatOperator_utf8_utf8(ctx_ptr, "", 0, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+ // An embedded newline is expected to be carried through unchanged.
+ out_str = concatOperator_utf8_utf8(ctx_ptr, "abcd\n", 5, "a", 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "abcd\na");
+ EXPECT_FALSE(ctx.has_error());
+ // Three inputs: the first is flagged false and is absent from the expected output.
+ out_str = concat_utf8_utf8_utf8(ctx_ptr, "abcd", 4, false, "\npq", 3, true, "ard", 3,
+ true, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "\npqard");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str =
+ concatOperator_utf8_utf8_utf8(ctx_ptr, "abcd\n", 5, "a", 1, "bcd", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "abcd\nabcd");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = concatOperator_utf8_utf8_utf8(ctx_ptr, "abcd", 4, "a", 1, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "abcda");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = concatOperator_utf8_utf8_utf8(ctx_ptr, "", 0, "a", 1, "pqrs", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "apqrs");
+ EXPECT_FALSE(ctx.has_error());
+ // Four inputs.
+ out_str = concat_utf8_utf8_utf8_utf8(ctx_ptr, "abcd", 4, false, "\npq", 3, true, "ard",
+ 3, true, "uvw", 3, false, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "\npqard");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = concatOperator_utf8_utf8_utf8_utf8(ctx_ptr, "pqrs", 4, "", 0, "\nabc", 4, "y",
+ 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "pqrs\nabcy");
+ EXPECT_FALSE(ctx.has_error());
+ // Five inputs.
+ out_str = concat_utf8_utf8_utf8_utf8_utf8(ctx_ptr, "abcd", 4, false, "\npq", 3, true,
+ "ard", 3, true, "uvw", 3, false, "abc\n", 4,
+ true, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "\npqardabc\n");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = concatOperator_utf8_utf8_utf8_utf8_utf8(ctx_ptr, "pqrs", 4, "", 0, "\nabc", 4,
+ "y", 1, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "pqrs\nabcy");
+ EXPECT_FALSE(ctx.has_error());
+ // Six inputs.
+ out_str = concat_utf8_utf8_utf8_utf8_utf8_utf8(
+ ctx_ptr, "abcd", 4, false, "\npq", 3, true, "ard", 3, true, "uvw", 3, false,
+ "abc\n", 4, true, "sdfgs", 5, true, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "\npqardabc\nsdfgs");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = concatOperator_utf8_utf8_utf8_utf8_utf8_utf8(
+ ctx_ptr, "pqrs", 4, "", 0, "\nabc", 4, "y", 1, "", 0, "\nbcd", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "pqrs\nabcy\nbcd");
+ EXPECT_FALSE(ctx.has_error());
+ // Seven inputs.
+ out_str = concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ ctx_ptr, "abcd", 4, false, "\npq", 3, true, "ard", 3, true, "uvw", 3, false,
+ "abc\n", 4, true, "sdfgs", 5, true, "wfw", 3, false, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "\npqardabc\nsdfgs");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ ctx_ptr, "", 0, "pqrs", 4, "abc\n", 4, "y", 1, "", 0, "asdf", 4, "jkl", 3,
+ &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "pqrsabc\nyasdfjkl");
+ EXPECT_FALSE(ctx.has_error());
+ // Eight inputs.
+ out_str = concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ ctx_ptr, "abcd", 4, false, "\npq", 3, true, "ard", 3, true, "uvw", 3, false,
+ "abc\n", 4, true, "sdfgs", 5, true, "wfw", 3, false, "", 0, true, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "\npqardabc\nsdfgs");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ ctx_ptr, "", 0, "pqrs", 4, "abc\n", 4, "y", 1, "", 0, "asdf", 4, "jkl", 3, "", 0,
+ &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "pqrsabc\nyasdfjkl");
+ EXPECT_FALSE(ctx.has_error());
+ // Nine inputs.
+ out_str = concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ ctx_ptr, "abcd", 4, false, "\npq", 3, true, "ard", 3, true, "uvw", 3, false,
+ "abc\n", 4, true, "sdfgs", 5, true, "wfw", 3, false, "", 0, true, "qwert|n", 7,
+ true, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "\npqardabc\nsdfgsqwert|n");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ ctx_ptr, "", 0, "pqrs", 4, "abc\n", 4, "y", 1, "", 0, "asdf", 4, "jkl", 3, "", 0,
+ "sfl\n", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "pqrsabc\nyasdfjklsfl\n");
+ EXPECT_FALSE(ctx.has_error());
+ // Ten inputs.
+ out_str = concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ ctx_ptr, "abcd", 4, false, "\npq", 3, true, "ard", 3, true, "uvw", 3, false,
+ "abc\n", 4, true, "sdfgs", 5, true, "wfw", 3, false, "", 0, true, "qwert|n", 7,
+ true, "ewfwe", 5, false, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "\npqardabc\nsdfgsqwert|n");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ ctx_ptr, "", 0, "pqrs", 4, "abc\n", 4, "y", 1, "", 0, "asdf", 4, "", 0, "jkl", 3,
+ "sfl\n", 4, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "pqrsabc\nyasdfjklsfl\n");
+ EXPECT_FALSE(ctx.has_error());
+}
+
+TEST(TestStringOps, TestReverse) {  // reverse_utf8 on ASCII, empty, multi-byte and malformed input
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+
+ const char* out_str;
+ out_str = reverse_utf8(ctx_ptr, "TestString", 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "gnirtStseT");
+ EXPECT_FALSE(ctx.has_error());
+ // Empty input is expected to give an empty result.
+ out_str = reverse_utf8(ctx_ptr, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+ // Multi-byte characters are expected to be reversed as whole characters, not byte by byte.
+ out_str = reverse_utf8(ctx_ptr, "çåå†", 9, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "†ååç");
+ EXPECT_FALSE(ctx.has_error());
+ // A trailing 0xC3 byte is expected to give an empty result and the decoding error checked below.
+ std::string d("aa\xc3");
+ out_str = reverse_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr(
+ "unexpected byte \\c3 encountered while decoding utf8 string"));
+ ctx.Reset();
+}
+
+TEST(TestStringOps, TestLtrim) {  // ltrim_utf8 (text only) and ltrim_utf8_utf8 (text plus trim characters)
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+ const char* out_str;
+ // NOTE(review): lengths 12, 18 and 6 below exceed the visible literals; presumably runs of spaces were collapsed in this rendering - confirm.
+ out_str = ltrim_utf8(ctx_ptr, "TestString ", 12, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString ");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8(ctx_ptr, " TestString ", 18, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString ");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8(ctx_ptr, " Test çåå†bD", 18, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Test çåå†bD");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8(ctx_ptr, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8(ctx_ptr, " ", 6, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+ // ltrim_utf8_utf8 with empty text and a non-empty trim string.
+ out_str = ltrim_utf8_utf8(ctx_ptr, "", 0, "TestString", 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+ // Empty trim string: the text is expected unchanged.
+ out_str = ltrim_utf8_utf8(ctx_ptr, "TestString", 10, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ EXPECT_FALSE(ctx.has_error());
+ // The trim string acts as a set of characters: leading a, b and c are removed in any order.
+ out_str = ltrim_utf8_utf8(ctx_ptr, "abcbbaccabbcdef", 15, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "def");
+ EXPECT_FALSE(ctx.has_error());
+ // Repeated characters in the trim string are expected to give the same result.
+ out_str = ltrim_utf8_utf8(ctx_ptr, "abcbbaccabbcdef", 15, "ababbac", 7, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "def");
+ EXPECT_FALSE(ctx.has_error());
+ // Multi-byte trim characters; trimming is expected to stop at the first character outside the set.
+ out_str = ltrim_utf8_utf8(ctx_ptr, "ååçåå†eç†Dd", 21, "çåå†", 9, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "eç†Dd");
+ EXPECT_FALSE(ctx.has_error());
+ // Every character of the text is in the trim set, so the expected result is empty.
+ out_str = ltrim_utf8_utf8(ctx_ptr, "ç†ååçåå†", 18, "çåå†", 9, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+ // A stray 0xC3 byte after the trimmed prefix: the expected output starts at that byte, with no error.
+ std::string d(
+ "aa\xc3"
+ "bcd");
+ out_str =
+ ltrim_utf8_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), "a", 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len),
+ "\xc3"
+ "bcd");
+ EXPECT_FALSE(ctx.has_error());
+ // Same expectation for an incomplete sequence 0xE0 0xA0 after multi-byte trim characters.
+ std::string e(
+ "åå\xe0\xa0"
+ "bcd");
+ out_str =
+ ltrim_utf8_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), "å", 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len),
+ "\xE0\xa0"
+ "bcd");
+ EXPECT_FALSE(ctx.has_error());
+ // The first character of the text is not in the trim set: the text is expected unchanged.
+ out_str = ltrim_utf8_utf8(ctx_ptr, "TestString", 10, "abcd", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ EXPECT_FALSE(ctx.has_error());
+ // All characters of the text are in the trim set.
+ out_str = ltrim_utf8_utf8(ctx_ptr, "acbabbcabb", 10, "abcbd", 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+}
+
+TEST(TestStringOps, TestLpadString) {  // lpad_utf8_int32_utf8 (explicit fill text) and lpad_utf8_int32 (no fill argument)
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+ const char* out_str;
+
+ // LPAD function tests - with defined fill pad text
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Test");
+ // Target length equal to the text length: the text is expected unchanged.
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ // Text length 0 is expected to give an empty result.
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ // Target length 0 is expected to give an empty result.
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ // A negative target length is expected to give an empty result.
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ // Empty fill text: the text is expected unchanged even though the target length is larger.
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 500, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ // The fill text is repeated on the left to reach the target length.
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "FillFillTestString");
+ // The last repetition of the fill text is cut short when it does not fit exactly.
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "FillFTestString");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "FillFillFiTestString");
+ // Multi-byte text: the lengths 10 and 2 are byte counts, while the target length 7 counts characters.
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "ддабвгд");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "абвгдабвгдабвгдабвгд");
+ // ASCII text padded with a multi-byte fill character.
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "hello", 5, 6, "д", 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "дhello");
+
+ // LPAD function tests - with NO pad text
+ out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Test");
+
+ out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+
+ out_str = lpad_utf8_int32(ctx_ptr, "TestString", 0, 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+
+ out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+
+ out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, -500, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ // NOTE(review): the padded literals below show one leading space; presumably runs of spaces were collapsed in this rendering - confirm.
+ out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 18, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), " TestString");
+
+ out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 15, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), " TestString");
+
+ out_str = lpad_utf8_int32(ctx_ptr, "абвгд", 10, 7, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), " абвгд");
+}
+
+TEST(TestStringOps, TestRpadString) {  // rpad_utf8_int32_utf8 (explicit fill text) and rpad_utf8_int32 (no fill argument)
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+ const char* out_str;
+
+ // RPAD function tests - with defined fill pad text
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Test");
+ // Target length equal to the text length: the text is expected unchanged.
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ // Text length 0 is expected to give an empty result.
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ // Target length 0 is expected to give an empty result.
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ // A negative target length is expected to give an empty result.
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ // Empty fill text: the text is expected unchanged even though the target length is larger.
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 500, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ // The fill text is repeated on the right to reach the target length.
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestStringFillFill");
+ // The last repetition of the fill text is cut short when it does not fit exactly.
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestStringFillF");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestStringFillFillFi");
+ // Multi-byte text: the lengths 10 and 2 are byte counts, while the target length 7 counts characters.
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "абвгддд");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "абвгдабвгдабвгдабвгд");
+ // ASCII text padded with a multi-byte fill character.
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "hello", 5, 6, "д", 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "helloд");
+
+ // RPAD function tests - with NO pad text
+ out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Test");
+
+ out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+
+ out_str = rpad_utf8_int32(ctx_ptr, "TestString", 0, 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+
+ out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+
+ out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, -500, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ // NOTE(review): the padded literals below show one trailing space; presumably runs of spaces were collapsed in this rendering - confirm.
+ out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 18, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString ");
+
+ out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 15, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString ");
+
+ out_str = rpad_utf8_int32(ctx_ptr, "абвгд", 10, 7, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "абвгд ");
+}
+
+TEST(TestStringOps, TestRtrim) {  // rtrim_utf8 (text only) and rtrim_utf8_utf8 (text plus trim characters)
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+ const char* out_str;
+ // NOTE(review): lengths 12, 18, 20 and 6 below exceed the visible literals; presumably runs of spaces were collapsed in this rendering - confirm.
+ out_str = rtrim_utf8(ctx_ptr, " TestString", 12, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), " TestString");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8(ctx_ptr, " TestString ", 18, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), " TestString");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8(ctx_ptr, "Test çåå†bD ", 20, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Test çåå†bD");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8(ctx_ptr, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8(ctx_ptr, " ", 6, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+ // rtrim_utf8_utf8 with empty text and a non-empty trim string.
+ out_str = rtrim_utf8_utf8(ctx_ptr, "", 0, "TestString", 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+ // Empty trim string: the text is expected unchanged.
+ out_str = rtrim_utf8_utf8(ctx_ptr, "TestString", 10, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ EXPECT_FALSE(ctx.has_error());
+ // Trailing g, n, i and r are all in the trim set; trimming is expected to stop at 't'.
+ out_str = rtrim_utf8_utf8(ctx_ptr, "TestString", 10, "ring", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestSt");
+ EXPECT_FALSE(ctx.has_error());
+ // The trim string acts as a set of characters: trailing a, b and c are removed in any order.
+ out_str = rtrim_utf8_utf8(ctx_ptr, "defabcbbaccabbc", 15, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "def");
+ EXPECT_FALSE(ctx.has_error());
+ // Repeated characters in the trim string are expected to give the same result.
+ out_str = rtrim_utf8_utf8(ctx_ptr, "defabcbbaccabbc", 15, "ababbac", 7, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "def");
+ EXPECT_FALSE(ctx.has_error());
+ // Multi-byte trim characters.
+ out_str = rtrim_utf8_utf8(ctx_ptr, "eDdç†ååçåå†", 21, "çåå†", 9, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "eDd");
+ EXPECT_FALSE(ctx.has_error());
+ // Every character of the text is in the trim set, so the expected result is empty.
+ out_str = rtrim_utf8_utf8(ctx_ptr, "ç†ååçåå†", 18, "çåå†", 9, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+ // A stray 0xC3 byte before the trimmed suffix: expect an empty result and a context error.
+ std::string d(
+ "\xc3"
+ "aaa");
+ out_str =
+ rtrim_utf8_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), "a", 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_TRUE(ctx.has_error());
+ ctx.Reset();
+ // Same expectation for an incomplete sequence 0xE0 0xA0 before multi-byte trim characters.
+ std::string e(
+ "\xe0\xa0"
+ "åå");
+ out_str =
+ rtrim_utf8_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), "å", 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_TRUE(ctx.has_error());
+ ctx.Reset();
+ // Trimming is expected to stop at 'e', so the leading character is kept although it is in the trim set.
+ out_str = rtrim_utf8_utf8(ctx_ptr, "åeçå", 7, "çå", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "åe");
+ EXPECT_FALSE(ctx.has_error());
+ // The last character of the text is not in the trim set: the text is expected unchanged.
+ out_str = rtrim_utf8_utf8(ctx_ptr, "TestString", 10, "abcd", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ EXPECT_FALSE(ctx.has_error());
+ // All characters of the text are in the trim set.
+ out_str = rtrim_utf8_utf8(ctx_ptr, "acbabbcabb", 10, "abcbd", 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+}
+
+// Exercises btrim_utf8 (strip spaces from both ends) and btrim_utf8_utf8
+// (strip any character of a given set from both ends), including multi-byte
+// UTF-8 input and malformed byte sequences.
+TEST(TestStringOps, TestBtrim) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+  const char* out_str;
+
+  // --- btrim_utf8: spaces are removed from both ends ---
+  out_str = btrim_utf8(ctx_ptr, "TestString", 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+  EXPECT_FALSE(ctx.has_error());
+
+  // The literal must really contain the 18 bytes announced to the callee
+  // (10 characters plus 8 spaces); otherwise the call reads past the literal.
+  out_str = btrim_utf8(ctx_ptr, "    TestString    ", 18, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+  EXPECT_FALSE(ctx.has_error());
+
+  // 21 bytes: 2 leading spaces + "Test" + 1 space + 9 bytes of "çåå†" +
+  // "bD" + 3 trailing spaces. The interior space must survive.
+  out_str = btrim_utf8(ctx_ptr, "  Test çåå†bD   ", 21, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Test çåå†bD");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8(ctx_ptr, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  // Input made only of spaces (6 bytes) trims down to the empty string.
+  out_str = btrim_utf8(ctx_ptr, "      ", 6, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  // --- btrim_utf8_utf8: second argument is a set of characters to strip ---
+  out_str = btrim_utf8_utf8(ctx_ptr, "", 0, "TestString", 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "Test", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "String");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "String", 6, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Tes");
+  EXPECT_FALSE(ctx.has_error());
+
+  // An empty trim set leaves the input untouched.
+  out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "abcbbadefccabbc", 15, "abc", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "def");
+  EXPECT_FALSE(ctx.has_error());
+
+  // Repeated characters in the trim set do not change the result.
+  out_str = btrim_utf8_utf8(ctx_ptr, "abcbbadefccabbc", 15, "ababbac", 7, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "def");
+  EXPECT_FALSE(ctx.has_error());
+
+  // Multi-byte characters in both the text and the trim set.
+  out_str = btrim_utf8_utf8(ctx_ptr, "ååçåå†Ddeç†", 21, "çåå†", 9, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Dde");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "ç†ååçåå†", 18, "çåå†", 9, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+  ctx.Reset();
+
+  // --- malformed UTF-8 next to trimmed characters: error, empty result ---
+  std::string d(
+      "acd\xc3"
+      "aaa");
+  out_str =
+      btrim_utf8_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), "a", 1, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_TRUE(ctx.has_error());
+  ctx.Reset();
+
+  std::string e(
+      "åbc\xe0\xa0"
+      "åå");
+  out_str =
+      btrim_utf8_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), "å", 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_TRUE(ctx.has_error());
+  ctx.Reset();
+
+  // --- malformed bytes that remain inside the kept part: no error ---
+  std::string f(
+      "aa\xc3"
+      "bcd");
+  out_str =
+      btrim_utf8_utf8(ctx_ptr, f.data(), static_cast<int>(f.length()), "a", 1, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len),
+            "\xc3"
+            "bcd");
+  EXPECT_FALSE(ctx.has_error());
+
+  std::string g(
+      "åå\xe0\xa0"
+      "bcå");
+  out_str =
+      btrim_utf8_utf8(ctx_ptr, g.data(), static_cast<int>(g.length()), "å", 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len),
+            "\xe0\xa0"
+            "bc");
+  // Same expectation as case `f`; checked here so a failure is attributed to
+  // this call instead of surfacing in the next case.
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "åe†çå", 10, "çå", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "e†");
+  EXPECT_FALSE(ctx.has_error());
+
+  // No character of the trim set occurs at either end.
+  out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "abcd", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+  EXPECT_FALSE(ctx.has_error());
+
+  // Every character of the input belongs to the trim set.
+  out_str = btrim_utf8_utf8(ctx_ptr, "acbabbcabb", 10, "abcbd", 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+}
+
+// Exercises locate_utf8_utf8 / locate_utf8_utf8_int32. Argument order is
+// (substring, string[, start position]); the expected values below are
+// 1-based character positions, with 0 meaning "not found".
+TEST(TestStringOps, TestLocate) {
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+
+ int pos;
+
+ pos = locate_utf8_utf8(ctx_ptr, "String", 6, "TestString", 10);
+ EXPECT_EQ(pos, 5);
+ EXPECT_FALSE(ctx.has_error());
+
+ pos = locate_utf8_utf8_int32(ctx_ptr, "String", 6, "TestString", 10, 1);
+ EXPECT_EQ(pos, 5);
+ EXPECT_FALSE(ctx.has_error());
+
+ // Starting at position 2 skips the match at position 1.
+ pos = locate_utf8_utf8_int32(ctx_ptr, "abc", 3, "abcabc", 6, 2);
+ EXPECT_EQ(pos, 4);
+ EXPECT_FALSE(ctx.has_error());
+
+ // Positions count characters, not bytes: the match starts at byte 12 but
+ // is the 7th character.
+ pos = locate_utf8_utf8(ctx_ptr, "çåå", 6, "s†å†emçåå†d", 21);
+ EXPECT_EQ(pos, 7);
+ EXPECT_FALSE(ctx.has_error());
+
+ pos = locate_utf8_utf8_int32(ctx_ptr, "bar", 3, "†barbar", 9, 3);
+ EXPECT_EQ(pos, 5);
+ EXPECT_FALSE(ctx.has_error());
+
+ // Empty string or empty substring: reported as not found, without error.
+ pos = locate_utf8_utf8_int32(ctx_ptr, "sub", 3, "", 0, 1);
+ EXPECT_EQ(pos, 0);
+ EXPECT_FALSE(ctx.has_error());
+
+ pos = locate_utf8_utf8_int32(ctx_ptr, "", 0, "str", 3, 1);
+ EXPECT_EQ(pos, 0);
+ EXPECT_FALSE(ctx.has_error());
+
+ // A start position of 0 is rejected with an error.
+ pos = locate_utf8_utf8_int32(ctx_ptr, "bar", 3, "barbar", 6, 0);
+ EXPECT_EQ(pos, 0);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr("Start position must be greater than 0"));
+ ctx.Reset();
+
+ // A start position past the end of the string is not an error.
+ pos = locate_utf8_utf8_int32(ctx_ptr, "bar", 3, "barbar", 6, 7);
+ EXPECT_EQ(pos, 0);
+ EXPECT_FALSE(ctx.has_error());
+
+ // Invalid UTF-8 byte (0xff) in the searched string is reported as an error.
+ std::string d(
+ "a\xff"
+ "c");
+ pos =
+ locate_utf8_utf8_int32(ctx_ptr, "c", 1, d.data(), static_cast<int>(d.length()), 3);
+ EXPECT_EQ(pos, 0);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr(
+ "unexpected byte \\ff encountered while decoding utf8 string"));
+ ctx.Reset();
+}
+
+// Exercises byte_substr_binary_int32_int32(text, text_len, offset, length).
+// As pinned by the cases below: offsets are 1-based, a negative offset counts
+// from the end, offset 0 yields an empty result, and the requested length is
+// clipped to the bytes available.
+TEST(TestStringOps, TestByteSubstr) {
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+
+ const char* out_str;
+ out_str = byte_substr_binary_int32_int32(ctx_ptr, "TestString", 10, 5, 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "String");
+ EXPECT_FALSE(ctx.has_error());
+
+ // Negative offset: start 6 bytes before the end.
+ out_str = byte_substr_binary_int32_int32(ctx_ptr, "TestString", 10, -6, 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "String");
+ EXPECT_FALSE(ctx.has_error());
+
+ // Offset 0 produces an empty result regardless of the length argument.
+ out_str = byte_substr_binary_int32_int32(ctx_ptr, "TestString", 10, 0, 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = byte_substr_binary_int32_int32(ctx_ptr, "TestString", 10, 0, -500, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = byte_substr_binary_int32_int32(ctx_ptr, "TestString", 10, 1, 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = byte_substr_binary_int32_int32(ctx_ptr, "TestString", 10, 1, 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Test");
+ EXPECT_FALSE(ctx.has_error());
+
+ // Length larger than the input is clipped.
+ out_str = byte_substr_binary_int32_int32(ctx_ptr, "TestString", 10, 1, 1000, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = byte_substr_binary_int32_int32(ctx_ptr, "TestString", 10, 5, 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Str");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = byte_substr_binary_int32_int32(ctx_ptr, "TestString", 10, 5, 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "String");
+ EXPECT_FALSE(ctx.has_error());
+
+ // Negative offset beyond the start of the input yields the whole input.
+ out_str = byte_substr_binary_int32_int32(ctx_ptr, "TestString", 10, -100, 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ EXPECT_FALSE(ctx.has_error());
+}
+
+// Exercises strpos_utf8_utf8(string, substring). Expected values are 1-based
+// character positions of the first match, with 0 meaning "not found".
+TEST(TestStringOps, TestStrPos) {
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+
+ int pos;
+
+ pos = strpos_utf8_utf8(ctx_ptr, "TestString", 10, "String", 6);
+ EXPECT_EQ(pos, 5);
+ EXPECT_FALSE(ctx.has_error());
+
+ // NOTE(review): identical to the previous case; it adds no coverage.
+ pos = strpos_utf8_utf8(ctx_ptr, "TestString", 10, "String", 6);
+ EXPECT_EQ(pos, 5);
+ EXPECT_FALSE(ctx.has_error());
+
+ // The first of several matches is reported.
+ pos = strpos_utf8_utf8(ctx_ptr, "abcabc", 6, "abc", 3);
+ EXPECT_EQ(pos, 1);
+ EXPECT_FALSE(ctx.has_error());
+
+ // Positions count characters, not bytes.
+ pos = strpos_utf8_utf8(ctx_ptr, "s†å†emçåå†d", 21, "çåå", 6);
+ EXPECT_EQ(pos, 7);
+ EXPECT_FALSE(ctx.has_error());
+
+ pos = strpos_utf8_utf8(ctx_ptr, "†barbar", 9, "bar", 3);
+ EXPECT_EQ(pos, 2);
+ EXPECT_FALSE(ctx.has_error());
+
+ // Empty string or empty substring: reported as not found, without error.
+ pos = strpos_utf8_utf8(ctx_ptr, "", 0, "sub", 3);
+ EXPECT_EQ(pos, 0);
+ EXPECT_FALSE(ctx.has_error());
+
+ pos = strpos_utf8_utf8(ctx_ptr, "str", 3, "", 0);
+ EXPECT_EQ(pos, 0);
+ EXPECT_FALSE(ctx.has_error());
+
+ // Invalid UTF-8 byte (0xff) in the searched string is reported as an error.
+ // NOTE(review): the returned position is not asserted in this case.
+ std::string d(
+ "a\xff"
+ "c");
+ pos = strpos_utf8_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), "c", 1);
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr(
+ "unexpected byte \\ff encountered while decoding utf8 string"));
+ ctx.Reset();
+}
+
+// Exercises replace_utf8_utf8_utf8(text, from, to), which replaces every
+// occurrence of `from` with `to`, and replace_with_max_len_utf8_utf8_utf8,
+// which additionally reports an error when the result exceeds a maximum
+// output length.
+TEST(TestStringOps, TestReplace) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+
+  const char* out_str;
+  // Every occurrence is replaced, not only the first one.
+  out_str = replace_utf8_utf8_utf8(ctx_ptr, "TestString1String2", 18, "String", 6,
+                                   "Replace", 7, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestReplace1Replace2");
+  EXPECT_FALSE(ctx.has_error());
+
+  // An empty replacement deletes the matches.
+  out_str =
+      replace_utf8_utf8_utf8(ctx_ptr, "TestString1", 11, "String", 6, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Test1");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = replace_utf8_utf8_utf8(ctx_ptr, "", 0, "test", 4, "rep", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  // Multi-byte input: "Ç††çåå†" is 17 bytes (2 + 3 + 3 + 2 + 2 + 2 + 3) and
+  // contains three "†", matching the three "t" in the expected output.
+  out_str = replace_utf8_utf8_utf8(ctx_ptr, "Ç††çåå†", 17, "†", 3, "t", 1, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Çttçååt");
+  EXPECT_FALSE(ctx.has_error());
+
+  // An empty search string leaves the input unchanged.
+  out_str = replace_utf8_utf8_utf8(ctx_ptr, "TestString", 10, "", 0, "rep", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+  EXPECT_FALSE(ctx.has_error());
+
+  // Search string longer than the input: nothing to replace.
+  out_str =
+      replace_utf8_utf8_utf8(ctx_ptr, "Test", 4, "TestString", 10, "rep", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Test");
+  EXPECT_FALSE(ctx.has_error());
+
+  // The whole input matches and is replaced by nothing.
+  out_str = replace_utf8_utf8_utf8(ctx_ptr, "Test", 4, "Test", 4, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str =
+      replace_utf8_utf8_utf8(ctx_ptr, "TestString", 10, "abc", 3, "xyz", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+  EXPECT_FALSE(ctx.has_error());
+
+  // "Hollow" needs 6 bytes but only 5 are allowed.
+  replace_with_max_len_utf8_utf8_utf8(ctx_ptr, "Hell", 4, "ell", 3, "ollow", 5, 5,
+                                      &out_len);
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer overflow for output string"));
+  ctx.Reset();
+
+  // Four replacements of 4 bytes each need 16 bytes but only 14 are allowed.
+  replace_with_max_len_utf8_utf8_utf8(ctx_ptr, "eeee", 4, "e", 1, "aaaa", 4, 14,
+                                      &out_len);
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer overflow for output string"));
+  ctx.Reset();
+}
+
+// Exercises left_utf8_int32(text, text_len, n): a positive n keeps the first
+// n characters, a negative n drops the last |n| characters.
+TEST(TestStringOps, TestLeftString) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+  const char* out_str;
+
+  // n equal to the string length returns the whole string.
+  out_str = left_utf8_int32(ctx_ptr, "TestString", 10, 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+
+  // Empty input stays empty whatever n is.
+  out_str = left_utf8_int32(ctx_ptr, "", 0, 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = left_utf8_int32(ctx_ptr, "", 0, 500, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = left_utf8_int32(ctx_ptr, "TestString", 10, 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Tes");
+
+  // Negative n: everything except the last three characters.
+  out_str = left_utf8_int32(ctx_ptr, "TestString", 10, -3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestStr");
+
+  // Five two-byte characters, i.e. 10 bytes; n counts characters, not bytes.
+  out_str = left_utf8_int32(ctx_ptr, "абвгд", 10, 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "абв");
+}
+
+// Exercises right_utf8_int32(text, text_len, n): a positive n keeps the last
+// n characters, a negative n drops the first |n| characters.
+TEST(TestStringOps, TestRightString) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+  const char* out_str;
+
+  // n equal to the string length returns the whole string.
+  out_str = right_utf8_int32(ctx_ptr, "TestString", 10, 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+
+  // Empty input stays empty whatever n is.
+  out_str = right_utf8_int32(ctx_ptr, "", 0, 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = right_utf8_int32(ctx_ptr, "", 0, 500, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = right_utf8_int32(ctx_ptr, "TestString", 10, 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "ing");
+
+  // Negative n: everything except the first three characters.
+  out_str = right_utf8_int32(ctx_ptr, "TestString", 10, -3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "tString");
+
+  // Five two-byte characters, i.e. 10 bytes; n counts characters, not bytes.
+  out_str = right_utf8_int32(ctx_ptr, "абвгд", 10, 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "вгд");
+}
+
+// Exercises binary_string(text, text_len): plain text passes through and
+// textual "\xNN" escapes (a literal backslash followed by x and two hex
+// digits) are decoded to the corresponding byte.
+TEST(TestStringOps, TestBinaryString) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+  const char* out_str;
+
+  // Input without escapes is returned unchanged.
+  out_str = binary_string(ctx_ptr, "TestString", 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+
+  out_str = binary_string(ctx_ptr, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = binary_string(ctx_ptr, "T", 1, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "T");
+
+  // Each 4-byte escape collapses into one output byte.
+  out_str = binary_string(ctx_ptr, "\\x41\\x42\\x43", 12, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "ABC");
+
+  out_str = binary_string(ctx_ptr, "\\x41", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "A");
+
+  // Hex digits are accepted in lower and upper case.
+  out_str = binary_string(ctx_ptr, "\\x6d\\x6D", 8, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "mm");
+
+  out_str = binary_string(ctx_ptr, "\\x6f\\x6d", 8, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "om");
+
+  out_str = binary_string(ctx_ptr, "\\x4f\\x4D", 8, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "OM");
+}
+
+// Exercises split_part(text, text_len, delimiter, delimiter_len, index): the
+// text is split on the delimiter and the 1-based index selects one piece.
+TEST(TestStringOps, TestSplitPart) {
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+ const char* out_str;
+
+ // Index 0 is rejected with an error and an empty result.
+ // Note: the error recorded here is not cleared; the remaining cases only
+ // check the returned string.
+ out_str = split_part(ctx_ptr, "A,B,C", 5, ",", 1, 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_THAT(
+ ctx.get_error(),
+ ::testing::HasSubstr("Index in split_part must be positive, value provided was 0"));
+
+ out_str = split_part(ctx_ptr, "A,B,C", 5, ",", 1, 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "A");
+
+ out_str = split_part(ctx_ptr, "A,B,C", 5, ",", 1, 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "B");
+
+ out_str = split_part(ctx_ptr, "A,B,C", 5, ",", 1, 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "C");
+
+ // Multi-character delimiter.
+ out_str = split_part(ctx_ptr, "abc~@~def~@~ghi", 15, "~@~", 3, 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "abc");
+
+ out_str = split_part(ctx_ptr, "abc~@~def~@~ghi", 15, "~@~", 3, 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "def");
+
+ out_str = split_part(ctx_ptr, "abc~@~def~@~ghi", 15, "~@~", 3, 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "ghi");
+
+ // Result must be empty when the index is > no of elements
+ out_str = split_part(ctx_ptr, "123|456|789", 11, "|", 1, 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+
+ // Delimiter at the very end or very start of the text.
+ out_str = split_part(ctx_ptr, "123|", 4, "|", 1, 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "123");
+
+ out_str = split_part(ctx_ptr, "|123", 4, "|", 1, 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+
+ // Multi-byte UTF-8 text and delimiters.
+ out_str = split_part(ctx_ptr, "ç†ååçåå†", 18, "å", 2, 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "ç†");
+
+ out_str = split_part(ctx_ptr, "ç†ååçåå†", 18, "†åå", 6, 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "ç");
+
+ out_str = split_part(ctx_ptr, "ç†ååçåå†", 18, "†", 3, 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "ååçåå");
+}
+
+// Exercises the convert_to* functions: each result must have the size of the
+// input value and be byte-for-byte equal to the value's in-memory
+// representation (checked with memcmp against the value itself).
+TEST(TestStringOps, TestConvertTo) {
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+ const char* out_str;
+
+ // memcmp returns 0 when the compared ranges are identical.
+ const int32_t ALL_BYTES_MATCH = 0;
+
+ int32_t integer_value = std::numeric_limits<int32_t>::max();
+ out_str = convert_toINT(ctx_ptr, integer_value, &out_len);
+ EXPECT_EQ(out_len, sizeof(integer_value));
+ EXPECT_EQ(ALL_BYTES_MATCH, memcmp(out_str, &integer_value, out_len));
+
+ int64_t big_integer_value = std::numeric_limits<int64_t>::max();
+ out_str = convert_toBIGINT(ctx_ptr, big_integer_value, &out_len);
+ EXPECT_EQ(out_len, sizeof(big_integer_value));
+ EXPECT_EQ(ALL_BYTES_MATCH, memcmp(out_str, &big_integer_value, out_len));
+
+ float float_value = std::numeric_limits<float>::max();
+ out_str = convert_toFLOAT(ctx_ptr, float_value, &out_len);
+ EXPECT_EQ(out_len, sizeof(float_value));
+ EXPECT_EQ(ALL_BYTES_MATCH, memcmp(out_str, &float_value, out_len));
+
+ double double_value = std::numeric_limits<double>::max();
+ out_str = convert_toDOUBLE(ctx_ptr, double_value, &out_len);
+ EXPECT_EQ(out_len, sizeof(double_value));
+ EXPECT_EQ(ALL_BYTES_MATCH, memcmp(out_str, &double_value, out_len));
+
+ // String input: the output has the same length and the same bytes.
+ const char* test_string = "test string";
+ int32_t str_len = 11;
+ out_str = convert_toUTF8(ctx_ptr, test_string, str_len, &out_len);
+ EXPECT_EQ(out_len, str_len);
+ EXPECT_EQ(ALL_BYTES_MATCH, memcmp(out_str, test_string, out_len));
+}
+
+// Compares the *_be conversions with their plain counterparts: lengths must
+// be equal, and the bytes must be reversed when ARROW_LITTLE_ENDIAN is set or
+// identical otherwise.
+TEST(TestStringOps, TestConvertToBigEndian) {
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+ gdv_int32 out_len_big_endian = 0;
+ const char* out_str;
+ const char* out_str_big_endian;
+
+ // 64-bit integer.
+ int64_t big_integer_value = std::numeric_limits<int64_t>::max();
+ out_str = convert_toBIGINT(ctx_ptr, big_integer_value, &out_len);
+ out_str_big_endian =
+ convert_toBIGINT_be(ctx_ptr, big_integer_value, &out_len_big_endian);
+ EXPECT_EQ(out_len_big_endian, sizeof(big_integer_value));
+ EXPECT_EQ(out_len_big_endian, out_len);
+
+#if ARROW_LITTLE_ENDIAN
+ // Checks that bytes are in reverse order
+ for (auto i = 0; i < out_len; i++) {
+ EXPECT_EQ(out_str[i], out_str_big_endian[out_len - (i + 1)]);
+ }
+#else
+ // Both outputs are expected to hold the same byte sequence.
+ for (auto i = 0; i < out_len; i++) {
+ EXPECT_EQ(out_str[i], out_str_big_endian[i]);
+ }
+#endif
+
+ // Double-precision float, same checks.
+ double double_value = std::numeric_limits<double>::max();
+ out_str = convert_toDOUBLE(ctx_ptr, double_value, &out_len);
+ out_str_big_endian = convert_toDOUBLE_be(ctx_ptr, double_value, &out_len_big_endian);
+ EXPECT_EQ(out_len_big_endian, sizeof(double_value));
+ EXPECT_EQ(out_len_big_endian, out_len);
+
+#if ARROW_LITTLE_ENDIAN
+ // Checks that bytes are in reverse order
+ for (auto i = 0; i < out_len; i++) {
+ EXPECT_EQ(out_str[i], out_str_big_endian[out_len - (i + 1)]);
+ }
+#else
+ // Both outputs are expected to hold the same byte sequence.
+ for (auto i = 0; i < out_len; i++) {
+ EXPECT_EQ(out_str[i], out_str_big_endian[i]);
+ }
+#endif
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/precompiled/testing.h b/src/arrow/cpp/src/gandiva/precompiled/testing.h
new file mode 100644
index 000000000..c41bc5471
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/testing.h
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <ctime>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "arrow/util/logging.h"
+#include "arrow/util/value_parsing.h"
+
+#include "gandiva/date_utils.h"
+#include "gandiva/precompiled/types.h"
+
+namespace gandiva {
+
+// Test helper: parses a "%Y-%m-%d %H:%M:%S" string into whole seconds and
+// returns that value multiplied by 1000. A parse failure is only caught by
+// DCHECK; the function then returns 0 * 1000.
+static inline gdv_timestamp StringToTimestamp(const std::string& s) {
+  int64_t parsed_seconds = 0;
+  bool success = ::arrow::internal::ParseTimestampStrptime(
+      s.c_str(), s.length(), "%Y-%m-%d %H:%M:%S", /*ignore_time_in_day=*/false,
+      /*allow_trailing_chars=*/false, ::arrow::TimeUnit::SECOND, &parsed_seconds);
+  DCHECK(success);
+  // Keeps builds without DCHECK free of an unused-variable warning.
+  ARROW_UNUSED(success);
+  return parsed_seconds * 1000;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/precompiled/time.cc b/src/arrow/cpp/src/gandiva/precompiled/time.cc
new file mode 100644
index 000000000..336f69226
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/time.cc
@@ -0,0 +1,894 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./epoch_time_point.h"
+
+extern "C" {
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "./time_constants.h"
+#include "./time_fields.h"
+#include "./types.h"
+
+// Calendar conversion factors.
+#define MINS_IN_HOUR 60
+#define SECONDS_IN_MINUTE 60
+// Fully parenthesized so the product stays one operand wherever the macro is
+// used (e.g. `x / SECONDS_IN_HOUR` must divide by 3600, not by 60 and then
+// multiply by 60).
+#define SECONDS_IN_HOUR ((SECONDS_IN_MINUTE) * (MINS_IN_HOUR))
+
+#define HOURS_IN_DAY 24
+
+// Expand inner macro for all date types.
+// INNER is invoked once per type name (date64, timestamp); the definitions in
+// this file paste that name into function names and gdv_<name> types.
+#define DATE_TYPES(INNER) \
+ INNER(date64) \
+ INNER(timestamp)
+
+// Expand inner macro for all base numeric types.
+// INNER is invoked once per listed type name, signed and unsigned integers
+// first, then the two floating-point names.
+#define NUMERIC_TYPES(INNER) \
+ INNER(int8) \
+ INNER(int16) \
+ INNER(int32) \
+ INNER(int64) \
+ INNER(uint8) \
+ INNER(uint16) \
+ INNER(uint32) \
+ INNER(uint64) \
+ INNER(float32) \
+ INNER(float64)
+
+// Calendar-field extractors. Each macro defines extract<Field>_<TYPE>(millis)
+// and is instantiated for every date type via DATE_TYPES. The fields are read
+// from an EpochTimePoint built from `millis`; the 1900 offset added to
+// TmYear() suggests struct-tm style years (presumably years since 1900).
+
+// Millennium: (year - 1) / 1000 + 1, so years 1001..2000 give 2.
+#define EXTRACT_MILLENNIUM(TYPE)                                      \
+  FORCE_INLINE                                                        \
+  gdv_int64 extractMillennium##_##TYPE(gdv_##TYPE millis) {           \
+    return (1900 + EpochTimePoint(millis).TmYear() - 1) / 1000 + 1;   \
+  }
+
+DATE_TYPES(EXTRACT_MILLENNIUM)
+
+// Century: (year - 1) / 100 + 1, so years 1901..2000 give 20.
+#define EXTRACT_CENTURY(TYPE)                                         \
+  FORCE_INLINE                                                        \
+  gdv_int64 extractCentury##_##TYPE(gdv_##TYPE millis) {              \
+    return (1900 + EpochTimePoint(millis).TmYear() - 1) / 100 + 1;    \
+  }
+
+DATE_TYPES(EXTRACT_CENTURY)
+
+// Decade: year / 10.
+#define EXTRACT_DECADE(TYPE)                                          \
+  FORCE_INLINE                                                        \
+  gdv_int64 extractDecade##_##TYPE(gdv_##TYPE millis) {               \
+    return (1900 + EpochTimePoint(millis).TmYear()) / 10;             \
+  }
+
+DATE_TYPES(EXTRACT_DECADE)
+
+// Calendar year.
+#define EXTRACT_YEAR(TYPE)                                            \
+  FORCE_INLINE                                                        \
+  gdv_int64 extractYear##_##TYPE(gdv_##TYPE millis) {                 \
+    return 1900 + EpochTimePoint(millis).TmYear();                    \
+  }
+
+DATE_TYPES(EXTRACT_YEAR)
+
+// Day of year, shifted by one from TmYday().
+#define EXTRACT_DOY(TYPE)                                             \
+  FORCE_INLINE                                                        \
+  gdv_int64 extractDoy##_##TYPE(gdv_##TYPE millis) {                  \
+    return 1 + EpochTimePoint(millis).TmYday();                       \
+  }
+
+DATE_TYPES(EXTRACT_DOY)
+
+// Quarter: TmMon() / 3 + 1.
+#define EXTRACT_QUARTER(TYPE)                                         \
+  FORCE_INLINE                                                        \
+  gdv_int64 extractQuarter##_##TYPE(gdv_##TYPE millis) {              \
+    return EpochTimePoint(millis).TmMon() / 3 + 1;                    \
+  }
+
+DATE_TYPES(EXTRACT_QUARTER)
+
+// Month, shifted by one from TmMon().
+#define EXTRACT_MONTH(TYPE)                                           \
+  FORCE_INLINE                                                        \
+  gdv_int64 extractMonth##_##TYPE(gdv_##TYPE millis) {                \
+    return 1 + EpochTimePoint(millis).TmMon();                        \
+  }
+
+DATE_TYPES(EXTRACT_MONTH)
+
+// Weekday (0 = Sunday .. 6 = Saturday, as used by the callers in this file)
+// of January 1st of the year `tp` falls in: the day-of-year offset is
+// subtracted from tp's weekday modulo 7. The argument is parenthesized so any
+// expression can be passed safely.
+#define JAN1_WDAY(tp) (((tp).TmWday() - ((tp).TmYday() % 7) + 7) % 7)
+
+// Gregorian leap-year rule: a year is a leap year when it is divisible by 4,
+// unless it is a century year that is not divisible by 400.
+bool IsLeapYear(int yy) {
+  const bool divisible_by_4 = (yy % 4) == 0;
+  const bool divisible_by_100 = (yy % 100) == 0;
+  const bool divisible_by_400 = (yy % 400) == 0;
+
+  return divisible_by_4 && (divisible_by_400 || !divisible_by_100);
+}
+
+// Week number for a day that belongs to a week of its own calendar year.
+// TmYday() is 0 for Jan 1. The week is (TmYday + shift) / 7 + 1, where the
+// shift depends on the weekday of Jan 1:
+//
+//   Mon..Thu (1..4): shift = jan1_wday - 1   (0, +1, +2, +3)
+//     Jan 1 already lies in week 1, e.g. Jan 1 on Tue: Jan 7 -> week 2.
+//   Fri, Sat (5, 6): shift = jan1_wday - 8   (-3, -2)
+//     the first days belong to the previous year, e.g. Jan 1 on Fri:
+//     Jan 4 -> week 1, Jan 11 -> week 2.
+//   Sun (0):         shift = -1
+//     Jan 2 -> week 1, Jan 9 -> week 2.
+int weekOfCurrentYear(const EpochTimePoint& tp) {
+  const int jan1_wday = JAN1_WDAY(tp);
+
+  int shift;
+  if (jan1_wday >= 1 && jan1_wday <= 4) {
+    // Monday .. Thursday
+    shift = jan1_wday - 1;
+  } else if (jan1_wday == 5 || jan1_wday == 6) {
+    // Friday, Saturday
+    shift = jan1_wday - 8;
+  } else if (jan1_wday == 0) {
+    // Sunday
+    shift = -1;
+  } else {
+    // cannot reach here: jan1_wday is always within 0..6
+    return 0;
+  }
+
+  return (tp.TmYday() + shift) / 7 + 1;
+}
+
+// Jan 1-3
+// If Jan 1 is one of Mon, Tue, Wed, Thu - belongs to week of current year
+// If Jan 1 is Fri/Sat/Sun - belongs to previous year
+//
+// Returns the week number for a day in Jan 1-3: 1 when the day lies in the
+// first week of the current year, otherwise the number of the last week
+// (52 or 53) of the previous year. The caller in this file (weekOfYear) only
+// invokes it when TmYday() < 3.
+int getJanWeekOfYear(const EpochTimePoint& tp) {
+ int jan1_wday = JAN1_WDAY(tp);
+
+ if ((jan1_wday >= 1) && (jan1_wday <= 4)) {
+ // Jan 1-3 with the week belonging to this year
+ return 1;
+ }
+
+ if (jan1_wday == 5) {
+ // Jan 1 is a Fri
+ // Jan 1-3 belong to previous year. Dec 31 of previous year same week # as Jan 1-3
+ // previous year is a leap year:
+ // Prev Jan 1 is a Wed. Jan 6th is Mon
+ // Dec 31 - Jan 6 = 366 - 5 = 361
+ // week from Jan 6 = (361 - 1) / 7 + 1 = 52
+ // week # in previous year = 52 + 1 = 53
+ //
+ // previous year is not a leap year. Jan 1 is Thu. Jan 5th is Mon
+ // Dec 31 - Jan 5 = 365 - 4 = 361
+ // week from Jan 5 = (361 - 1) / 7 + 1 = 52
+ // week # in previous year = 52 + 1 = 53
+ return 53;
+ }
+
+ if (jan1_wday == 0) {
+ // Jan 1 is a Sun
+ if (tp.TmMday() > 1) {
+ // Jan 2 and 3 belong to current year
+ return 1;
+ }
+
+ // day belongs to previous year. Same as Dec 31
+ // Same as the case where Jan 1 is a Fri, except that previous year
+ // does not have an extra week
+ // Hence, return 52
+ return 52;
+ }
+
+ // Remaining case: jan1_wday == 6
+ // Jan 1 is a Sat
+ // Jan 1-2 belong to previous year
+ if (tp.TmMday() == 3) {
+ // Jan 3, return 1
+ return 1;
+ }
+
+ // prev Jan 1 is leap year
+ // prev Jan 1 is a Thu
+ // return 53 (extra week)
+ if (IsLeapYear(1900 + tp.TmYear() - 1)) {
+ return 53;
+ }
+
+ // prev Jan 1 is not a leap year
+ // prev Jan 1 is a Fri
+ // return 52 (no extra week)
+ return 52;
+}
+
+// Dec 29-31
+// Decides whether the day already falls into week 1 of the next year. That
+// depends on the weekday of the upcoming Jan 1:
+//   Thu: Dec 29, 30 and 31 all belong to week 1 of next year
+//   Wed: Dec 30 and 31 belong to next year, Dec 29 to the current year
+//   Tue: only Dec 31 belongs to next year
+//   Mon, Fri, Sat, Sun: no day of this year belongs to next year's week 1
+// Days staying in the current year are numbered by weekOfCurrentYear.
+int getDecWeekOfYear(const EpochTimePoint& tp) {
+  const int next_jan1_wday = (tp.TmWday() + (31 - tp.TmMday()) + 1) % 7;
+
+  bool in_first_week_of_next_year;
+  switch (next_jan1_wday) {
+    case 4:
+      // next Jan 1 is a Thu
+      in_first_week_of_next_year = true;
+      break;
+    case 3:
+      // next Jan 1 is a Wed
+      in_first_week_of_next_year = (tp.TmMday() != 29);
+      break;
+    case 2:
+      // next Jan 1 is a Tue
+      in_first_week_of_next_year = (tp.TmMday() == 31);
+      break;
+    default:
+      // next Jan 1 is a Mon, Fri, Sat or Sun
+      in_first_week_of_next_year = false;
+      break;
+  }
+
+  if (in_first_week_of_next_year) {
+    return 1;
+  }
+  return weekOfCurrentYear(tp);
+}
+
+// Week of year is determined by ISO 8601 standard
+// Take a look at: https://en.wikipedia.org/wiki/ISO_week_date
+//
+// Important points to note:
+// Week starts with a Monday and ends with a Sunday
+// A week can have some days in this year and some days in the previous/next year
+// This is true for the first and last weeks
+//
+// The first week of the year should have at-least 4 days in the current year
+// The last week of the year should have at-least 4 days in the current year
+//
+// A given day might belong to the first week of the next year - e.g Dec 29, 30 and 31
+// A given day might belong to the last week of the previous year - e.g. Jan 1, 2 and 3
+//
+// Algorithm:
+// If day belongs to week in current year, weekOfCurrentYear
+//
+// If day is Jan 1-3, see getJanWeekOfYear
+// If day is Dec 29-31, see getDecWeekOfYear
+//
+// Dispatches to the helper that handles the part of the year `tp` falls in:
+// the first three days of January, the last three days of December, or any
+// other day.
+gdv_int64 weekOfYear(const EpochTimePoint& tp) {
+  // Jan 1-3 (TmYday() is 0 on Jan 1)
+  if (tp.TmYday() < 3) {
+    return getJanWeekOfYear(tp);
+  }
+
+  // Dec 29-31 (the month compared against 11 is December)
+  const bool is_end_of_december = (tp.TmMon() == 11) && (tp.TmMday() >= 29);
+  if (is_end_of_december) {
+    return getDecWeekOfYear(tp);
+  }
+
+  return weekOfCurrentYear(tp);
+}
+
+// Week of year, computed by weekOfYear().
+#define EXTRACT_WEEK(TYPE)                                     \
+  FORCE_INLINE                                                 \
+  gdv_int64 extractWeek##_##TYPE(gdv_##TYPE millis) {          \
+    return weekOfYear(EpochTimePoint(millis));                 \
+  }
+
+DATE_TYPES(EXTRACT_WEEK)
+
+// Day of week, shifted by one from TmWday().
+#define EXTRACT_DOW(TYPE)                                      \
+  FORCE_INLINE                                                 \
+  gdv_int64 extractDow##_##TYPE(gdv_##TYPE millis) {           \
+    return 1 + EpochTimePoint(millis).TmWday();                \
+  }
+
+DATE_TYPES(EXTRACT_DOW)
+
+// Day of month, as reported by TmMday().
+#define EXTRACT_DAY(TYPE)                                      \
+  FORCE_INLINE                                                 \
+  gdv_int64 extractDay##_##TYPE(gdv_##TYPE millis) {           \
+    return EpochTimePoint(millis).TmMday();                    \
+  }
+
+DATE_TYPES(EXTRACT_DAY)
+
+// Hour of day, as reported by TmHour().
+#define EXTRACT_HOUR(TYPE)                                     \
+  FORCE_INLINE                                                 \
+  gdv_int64 extractHour##_##TYPE(gdv_##TYPE millis) {          \
+    return EpochTimePoint(millis).TmHour();                    \
+  }
+
+DATE_TYPES(EXTRACT_HOUR)
+
+// Minute of hour, as reported by TmMin().
+#define EXTRACT_MINUTE(TYPE)                                   \
+  FORCE_INLINE                                                 \
+  gdv_int64 extractMinute##_##TYPE(gdv_##TYPE millis) {        \
+    return EpochTimePoint(millis).TmMin();                     \
+  }
+
+DATE_TYPES(EXTRACT_MINUTE)
+
+// Second of minute, as reported by TmSec().
+#define EXTRACT_SECOND(TYPE)                                   \
+  FORCE_INLINE                                                 \
+  gdv_int64 extractSecond##_##TYPE(gdv_##TYPE millis) {        \
+    return EpochTimePoint(millis).TmSec();                     \
+  }
+
+DATE_TYPES(EXTRACT_SECOND)
+
+// The input converted with MILLIS_TO_SEC.
+#define EXTRACT_EPOCH(TYPE)                                    \
+  FORCE_INLINE                                                 \
+  gdv_int64 extractEpoch##_##TYPE(gdv_##TYPE millis) {         \
+    return MILLIS_TO_SEC(millis);                              \
+  }
+
+DATE_TYPES(EXTRACT_EPOCH)
+
+// Functions that work on millis in a day (instantiated for time32 only).
+
+// Second within the minute: MILLIS_TO_SEC(millis) modulo SECONDS_IN_MINUTE.
+#define EXTRACT_SECOND_TIME(TYPE)                              \
+  FORCE_INLINE                                                 \
+  gdv_int64 extractSecond##_##TYPE(gdv_##TYPE millis) {        \
+    gdv_int64 total_seconds = MILLIS_TO_SEC(millis);           \
+    gdv_int64 second_of_minute =                               \
+        total_seconds % SECONDS_IN_MINUTE;                     \
+    return second_of_minute;                                   \
+  }
+
+EXTRACT_SECOND_TIME(time32)
+
+// Minute within the hour: MILLIS_TO_MINS(millis) modulo MINS_IN_HOUR.
+#define EXTRACT_MINUTE_TIME(TYPE)                              \
+  FORCE_INLINE                                                 \
+  gdv_int64 extractMinute##_##TYPE(gdv_##TYPE millis) {        \
+    gdv_##TYPE total_minutes = MILLIS_TO_MINS(millis);         \
+    return (total_minutes % (MINS_IN_HOUR));                   \
+  }
+
+EXTRACT_MINUTE_TIME(time32)
+
+// The input converted with MILLIS_TO_HOUR.
+#define EXTRACT_HOUR_TIME(TYPE)                                \
+  FORCE_INLINE                                                 \
+  gdv_int64 extractHour##_##TYPE(gdv_##TYPE millis) {          \
+    return MILLIS_TO_HOUR(millis);                             \
+  }
+
+EXTRACT_HOUR_TIME(time32)
+
+#define DATE_TRUNC_FIXED_UNIT(NAME, TYPE, NMILLIS_IN_UNIT) \
+ FORCE_INLINE \
+ gdv_##TYPE NAME##_##TYPE(gdv_##TYPE millis) { \
+ return ((millis / NMILLIS_IN_UNIT) * NMILLIS_IN_UNIT); \
+ }
+
+#define DATE_TRUNC_WEEK(TYPE) \
+ FORCE_INLINE \
+ gdv_##TYPE date_trunc_Week_##TYPE(gdv_##TYPE millis) { \
+ EpochTimePoint tp(millis); \
+ int ndays_to_trunc = 0; \
+ if (tp.TmWday() == 0) { \
+ /* Sunday */ \
+ ndays_to_trunc = 6; \
+ } else { \
+ /* All other days */ \
+ ndays_to_trunc = tp.TmWday() - 1; \
+ } \
+ return tp.AddDays(-ndays_to_trunc).ClearTimeOfDay().MillisSinceEpoch(); \
+ }
+
+#define DATE_TRUNC_MONTH_UNITS(NAME, TYPE, NMONTHS_IN_UNIT) \
+ FORCE_INLINE \
+ gdv_##TYPE NAME##_##TYPE(gdv_##TYPE millis) { \
+ EpochTimePoint tp(millis); \
+ int ndays_to_trunc = tp.TmMday() - 1; \
+ int nmonths_to_trunc = \
+ tp.TmMon() - ((tp.TmMon() / NMONTHS_IN_UNIT) * NMONTHS_IN_UNIT); \
+ return tp.AddDays(-ndays_to_trunc) \
+ .AddMonths(-nmonths_to_trunc) \
+ .ClearTimeOfDay() \
+ .MillisSinceEpoch(); \
+ }
+
+// Defines NAME_<TYPE>(millis): midnight of January 1st of the first year of the
+// NYEARS_IN_UNIT-year period containing the instant. OFF_BY shifts where the
+// periods start (e.g. 1 makes a 100-year period begin in a year ending in 01).
+#define DATE_TRUNC_YEAR_UNITS(NAME, TYPE, NYEARS_IN_UNIT, OFF_BY) \
+  FORCE_INLINE \
+  gdv_##TYPE NAME##_##TYPE(gdv_##TYPE millis) { \
+    EpochTimePoint tp(millis); \
+    /* TmYear() counts from 1900; work in calendar years. */ \
+    const int calendar_year = 1900 + tp.TmYear(); \
+    const int unit_start_year = \
+        ((calendar_year - OFF_BY) / NYEARS_IN_UNIT) * NYEARS_IN_UNIT + OFF_BY; \
+    const int years_back = calendar_year - unit_start_year; \
+    const int months_back = tp.TmMon(); \
+    const int days_back = tp.TmMday() - 1; \
+    return tp.AddDays(-days_back) \
+        .AddMonths(-months_back) \
+        .AddYears(-years_back) \
+        .ClearTimeOfDay() \
+        .MillisSinceEpoch(); \
+  }
+
+#define DATE_TRUNC_FUNCTIONS(TYPE) /* emits every date_trunc_<Unit>_<TYPE> function for one type */ \
+ DATE_TRUNC_FIXED_UNIT(date_trunc_Second, TYPE, MILLIS_IN_SEC) \
+ DATE_TRUNC_FIXED_UNIT(date_trunc_Minute, TYPE, MILLIS_IN_MIN) \
+ DATE_TRUNC_FIXED_UNIT(date_trunc_Hour, TYPE, MILLIS_IN_HOUR) \
+ DATE_TRUNC_FIXED_UNIT(date_trunc_Day, TYPE, MILLIS_IN_DAY) \
+ DATE_TRUNC_WEEK(TYPE) \
+ DATE_TRUNC_MONTH_UNITS(date_trunc_Month, TYPE, 1) \
+ DATE_TRUNC_MONTH_UNITS(date_trunc_Quarter, TYPE, 3) \
+ DATE_TRUNC_MONTH_UNITS(date_trunc_Year, TYPE, 12) /* a year is handled as a 12-month unit */ \
+ DATE_TRUNC_YEAR_UNITS(date_trunc_Decade, TYPE, 10, 0) \
+ DATE_TRUNC_YEAR_UNITS(date_trunc_Century, TYPE, 100, 1) /* OFF_BY 1: units start in years ending in 01 */ \
+ DATE_TRUNC_YEAR_UNITS(date_trunc_Millennium, TYPE, 1000, 1)
+
+DATE_TRUNC_FUNCTIONS(date64)
+DATE_TRUNC_FUNCTIONS(timestamp)
+
+// Defines last_day_from_<TYPE>(millis): midnight of the last day of the month
+// containing the given instant. Found by going to the first day of the month,
+// adding one month and stepping back one day.
+#define LAST_DAY_FUNC(TYPE) \
+  FORCE_INLINE \
+  gdv_date64 last_day_from_##TYPE(gdv_date64 millis) { \
+    EpochTimePoint input_tp(millis); \
+    auto midnight = input_tp.ClearTimeOfDay(); \
+    auto month_start = midnight.AddDays(1 - midnight.TmMday()); \
+    return month_start.AddMonths(1).AddDays(-1).MillisSinceEpoch(); \
+  }
+
+DATE_TYPES(LAST_DAY_FUNC)
+
+// Reinterprets a 64-bit integer as a date64 value; the number is passed through.
+FORCE_INLINE
+gdv_date64 castDATE_int64(gdv_int64 in) {
+  const gdv_date64 as_date = in;
+  return as_date;
+}
+
+// Reinterprets a 32-bit integer as a date32 value; the number is passed through.
+FORCE_INLINE
+gdv_date32 castDATE_int32(gdv_int32 in) {
+  const gdv_date32 as_date = in;
+  return as_date;
+}
+
+// Converts a day count (date32) to a millisecond count (date64) by multiplying
+// by the number of milliseconds in a day.
+FORCE_INLINE
+gdv_date64 castDATE_date32(gdv_date32 days) {
+  const gdv_date64 millis_per_day = static_cast<gdv_date64>(MILLIS_IN_DAY);
+  return days * millis_per_day;
+}
+
+// Days per month for a non-leap year, indexed by TmMon() (0 = January).
+static int days_in_month[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
+
+// Returns true when tp falls on the final calendar day of its month.
+//
+// February (TmMon() == 1) is handled separately: the 29th is always the last
+// day, the 28th only in a non-leap year. TmYear() counts years since 1900, so it
+// is converted to a calendar year before the leap-year test; passing the raw
+// offset misclassifies century years such as 1900 and 2000.
+bool IsLastDayOfMonth(const EpochTimePoint& tp) {
+  const int month = tp.TmMon();
+  const int day = tp.TmMday();
+
+  if (month != 1) {
+    // not February. Don't worry about leap year
+    return day == days_in_month[month];
+  }
+
+  if (day < 28) {
+    return false;
+  }
+
+  if (day == 29) {
+    // Feb 29th
+    return true;
+  }
+
+  // Feb 28th: last day only when the calendar year is not a leap year
+  return !IsLeapYear(1900 + tp.TmYear());
+}
+
+// Returns true when the three components form a valid time of day:
+// hours in [0, 24), minutes in [0, 60), seconds in [0, 60).
+FORCE_INLINE
+bool is_valid_time(const int hours, const int minutes, const int seconds) {
+  if (hours < 0 || hours >= 24) {
+    return false;
+  }
+  if (minutes < 0 || minutes >= 60) {
+    return false;
+  }
+  return !(seconds < 0 || seconds >= 60);
+}
+
+// MONTHS_BETWEEN returns the number of months between the dates date1 and date2.
+// The result is positive when date1 is later than date2 and negative when it is
+// earlier.
+// When both dates fall on the same day of the month, or both are the last day of
+// their month, the result is a whole number. Otherwise a fractional part is
+// added, computed on the basis of a 31-day month and taking the difference
+// between the time components of date1 and date2 into account (this mirrors the
+// Oracle Database definition).
+#define MONTHS_BETWEEN(TYPE) \
+  FORCE_INLINE \
+  double months_between##_##TYPE##_##TYPE(uint64_t endEpoch, uint64_t startEpoch) { \
+    EpochTimePoint endTime(endEpoch); \
+    EpochTimePoint startTime(startEpoch); \
+    const int wholeMonths = (endTime.TmYear() - startTime.TmYear()) * 12 + \
+                            (endTime.TmMon() - startTime.TmMon()); \
+    const int dayDelta = endTime.TmMday() - startTime.TmMday(); \
+    if (dayDelta == 0 || \
+        (IsLastDayOfMonth(endTime) && IsLastDayOfMonth(startTime))) { \
+      return static_cast<double>(wholeMonths); \
+    } \
+    const double dayFraction = \
+        static_cast<double>(dayDelta) / static_cast<double>(31); \
+    const double hourDelta = \
+        static_cast<double>(endTime.TmHour() - startTime.TmHour()) + \
+        static_cast<double>(endTime.TmMin() - startTime.TmMin()) / \
+            static_cast<double>(MINS_IN_HOUR) + \
+        static_cast<double>(endTime.TmSec() - startTime.TmSec()) / \
+            static_cast<double>(SECONDS_IN_HOUR); \
+    return static_cast<double>(wholeMonths) + dayFraction + \
+           hourDelta / static_cast<double>(HOURS_IN_DAY * 31); \
+  }
+
+DATE_TYPES(MONTHS_BETWEEN)
+
+// Reports a date/timestamp parse failure on the execution context. The message
+// is `msg` followed by the first `length` bytes of `input`.
+//
+// `input` is a length-delimited buffer that need not be NUL-terminated, so it is
+// formatted with an explicit precision (%.*s); a bare %s would keep reading past
+// `length` looking for a terminator.
+FORCE_INLINE
+void set_error_for_date(gdv_int32 length, const char* input, const char* msg,
+                        int64_t execution_context) {
+  const int input_len = (length > 0) ? length : 0;
+  const int size = input_len + static_cast<int>(strlen(msg)) + 1;
+  char* error = reinterpret_cast<char*>(malloc(size));
+  if (error == nullptr) {
+    // Allocation failed: still flag the error, just without echoing the input.
+    gdv_fn_context_set_error_msg(execution_context, msg);
+    return;
+  }
+  snprintf(error, size, "%s%.*s", msg, input_len, input);
+  gdv_fn_context_set_error_msg(execution_context, error);
+  free(error);
+}
+
+// Parses the leading "year<sep>month<sep>day" part of a string into a date64
+// value (milliseconds since the epoch at the start of that day).
+//
+// Any non-digit byte ends the current field; scanning stops once three fields
+// have been read, so trailing text is ignored. Years written with fewer than
+// four digits and a value below 100 are expanded: 00-69 -> 2000-2069,
+// 70-99 -> 1970-1999.
+//
+// On failure (fewer than three fields, or an invalid calendar date) an error is
+// set on the execution context and 0 is returned.
+gdv_date64 castDATE_utf8(int64_t context, const char* input, gdv_int32 length) {
+  using arrow_vendored::date::day;
+  using arrow_vendored::date::month;
+  using arrow_vendored::date::sys_days;
+  using arrow_vendored::date::year;
+  using arrow_vendored::date::year_month_day;
+  using gandiva::TimeFields;
+  // format : 0 is year, 1 is month and 2 is day.
+  int dateFields[3] = {0, 0, 0};
+  int dateIndex = 0, index = 0, value = 0;
+  int year_str_len = 0;
+  while (dateIndex < 3 && index < length) {
+    // The character classifiers require a value representable as unsigned char;
+    // a raw non-ASCII UTF-8 byte may be negative as plain char.
+    if (!isdigit(static_cast<unsigned char>(input[index]))) {
+      dateFields[dateIndex++] = value;
+      value = 0;
+    } else {
+      value = (value * 10) + (input[index] - '0');
+      if (dateIndex == TimeFields::kYear) {
+        year_str_len++;
+      }
+    }
+    index++;
+  }
+
+  if (dateIndex < 3) {
+    // If we reached the end of input, we would have not encountered a separator
+    // store the last value
+    dateFields[dateIndex++] = value;
+  }
+  const char* msg = "Not a valid date value ";
+  if (dateIndex != 3) {
+    set_error_for_date(length, input, msg, context);
+    return 0;
+  }
+
+  /* Handle two digit years
+   * If range of two digits is between 70 - 99 then year = 1970 - 1999
+   * Else if two digits is between 00 - 69 = 2000 - 2069
+   */
+  if (dateFields[TimeFields::kYear] < 100 && year_str_len < 4) {
+    if (dateFields[TimeFields::kYear] < 70) {
+      dateFields[TimeFields::kYear] += 2000;
+    } else {
+      dateFields[TimeFields::kYear] += 1900;
+    }
+  }
+  year_month_day date = year(dateFields[TimeFields::kYear]) /
+                        month(dateFields[TimeFields::kMonth]) /
+                        day(dateFields[TimeFields::kDay]);
+  if (!date.ok()) {
+    set_error_for_date(length, input, msg, context);
+    return 0;
+  }
+  return std::chrono::time_point_cast<std::chrono::milliseconds>(sys_days(date))
+      .time_since_epoch()
+      .count();
+}
+
+/*
+ * Parses a timestamp string into milliseconds since the epoch.
+ *
+ * Input consists of mandatory and optional fields.
+ * Mandatory fields are year, month and day.
+ * Optional fields are time, displacement and zone.
+ * Format is <year-month-day>[ hours:minutes:seconds][.millis][ displacement|zone]
+ *
+ * Years written with fewer than four digits and a value below 100 are expanded
+ * (00-69 -> 2000-2069, 70-99 -> 1970-1999). A fractional part of one to three
+ * digits is scaled to milliseconds; more than three digits is an error.
+ * If a non-digit, non-separator character is met, the rest of the input is
+ * handed to gdv_fn_time_with_zone as a zone name.
+ *
+ * On any failure an error is set on the execution context and 0 is returned.
+ */
+gdv_timestamp castTIMESTAMP_utf8(int64_t context, const char* input, gdv_int32 length) {
+  using arrow_vendored::date::day;
+  using arrow_vendored::date::month;
+  using arrow_vendored::date::sys_days;
+  using arrow_vendored::date::year;
+  using arrow_vendored::date::year_month_day;
+  using gandiva::TimeFields;
+  using std::chrono::hours;
+  using std::chrono::milliseconds;
+  using std::chrono::minutes;
+  using std::chrono::seconds;
+
+  int ts_fields[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
+  gdv_boolean add_displacement = true;
+  gdv_boolean encountered_zone = false;
+  int year_str_len = 0, sub_seconds_len = 0;
+  int ts_field_index = TimeFields::kYear, index = 0, value = 0;
+  while (ts_field_index < TimeFields::kMax && index < length) {
+    // The character classifiers require a value representable as unsigned char;
+    // a raw non-ASCII UTF-8 byte may be negative as plain char.
+    if (isdigit(static_cast<unsigned char>(input[index]))) {
+      value = (value * 10) + (input[index] - '0');
+      if (ts_field_index == TimeFields::kYear) {
+        year_str_len++;
+      }
+      if (ts_field_index == TimeFields::kSubSeconds) {
+        sub_seconds_len++;
+      }
+    } else {
+      ts_fields[ts_field_index] = value;
+      value = 0;
+
+      switch (input[index]) {
+        case '.':
+        case ':':
+        case ' ':
+          ts_field_index++;
+          break;
+        case '+':
+          // +08:00, means time zone is 8 hours ahead. Need to subtract.
+          add_displacement = false;
+          ts_field_index = TimeFields::kDisplacementHours;
+          break;
+        case '-':
+          // Overloaded as date separator and negative displacement: while still
+          // inside the date part it advances to the next date field.
+          ts_field_index = (ts_field_index < TimeFields::kHours)
+                               ? (ts_field_index + 1)
+                               : TimeFields::kDisplacementHours;
+          break;
+        default:
+          encountered_zone = true;
+          break;
+      }
+    }
+    if (encountered_zone) {
+      break;
+    }
+    index++;
+  }
+
+  // Store the last value
+  if (ts_field_index < TimeFields::kMax) {
+    ts_fields[ts_field_index++] = value;
+  }
+
+  // adjust the year
+  if (ts_fields[TimeFields::kYear] < 100 && year_str_len < 4) {
+    if (ts_fields[TimeFields::kYear] < 70) {
+      ts_fields[TimeFields::kYear] += 2000;
+    } else {
+      ts_fields[TimeFields::kYear] += 1900;
+    }
+  }
+
+  // adjust the milliseconds
+  if (sub_seconds_len > 0) {
+    if (sub_seconds_len > 3) {
+      const char* msg = "Invalid millis for timestamp value ";
+      set_error_for_date(length, input, msg, context);
+      return 0;
+    }
+    // ".1" means 100 ms and ".10" means 100 ms: pad to three digits
+    while (sub_seconds_len < 3) {
+      ts_fields[TimeFields::kSubSeconds] *= 10;
+      sub_seconds_len++;
+    }
+  }
+  // handle timezone
+  if (encountered_zone) {
+    int err = 0;
+    gdv_timestamp ret_time = 0;
+    err = gdv_fn_time_with_zone(&ts_fields[0], (input + index), (length - index),
+                                &ret_time);
+    if (err) {
+      const char* msg = "Invalid timestamp or unknown zone for timestamp value ";
+      set_error_for_date(length, input, msg, context);
+      return 0;
+    }
+    return ret_time;
+  }
+
+  year_month_day date = year(ts_fields[TimeFields::kYear]) /
+                        month(ts_fields[TimeFields::kMonth]) /
+                        day(ts_fields[TimeFields::kDay]);
+  if (!date.ok()) {
+    const char* msg = "Not a valid day for timestamp value ";
+    set_error_for_date(length, input, msg, context);
+    return 0;
+  }
+
+  if (!is_valid_time(ts_fields[TimeFields::kHours], ts_fields[TimeFields::kMinutes],
+                     ts_fields[TimeFields::kSeconds])) {
+    const char* msg = "Not a valid time for timestamp value ";
+    set_error_for_date(length, input, msg, context);
+    return 0;
+  }
+
+  auto date_time = sys_days(date) + hours(ts_fields[TimeFields::kHours]) +
+                   minutes(ts_fields[TimeFields::kMinutes]) +
+                   seconds(ts_fields[TimeFields::kSeconds]) +
+                   milliseconds(ts_fields[TimeFields::kSubSeconds]);
+  if (ts_fields[TimeFields::kDisplacementHours] ||
+      ts_fields[TimeFields::kDisplacementMinutes]) {
+    auto displacement_time = hours(ts_fields[TimeFields::kDisplacementHours]) +
+                             minutes(ts_fields[TimeFields::kDisplacementMinutes]);
+    date_time = (add_displacement) ? (date_time + displacement_time)
+                                   : (date_time - displacement_time);
+  }
+  return std::chrono::time_point_cast<milliseconds>(date_time).time_since_epoch().count();
+}
+
+// A date64 value is already a millisecond count, so it is returned unchanged.
+gdv_timestamp castTIMESTAMP_date64(gdv_date64 date_in_millis) {
+  const gdv_timestamp as_timestamp = date_in_millis;
+  return as_timestamp;
+}
+
+// Reinterprets a 64-bit integer as a timestamp; the number is passed through.
+gdv_timestamp castTIMESTAMP_int64(gdv_int64 in) {
+  const gdv_timestamp as_timestamp = in;
+  return as_timestamp;
+}
+
+// Drops the time-of-day part of a timestamp, yielding the millisecond value of
+// the start of the same day.
+gdv_date64 castDATE_timestamp(gdv_timestamp timestamp_in_millis) {
+  EpochTimePoint instant(timestamp_in_millis);
+  auto start_of_day = instant.ClearTimeOfDay();
+  return start_of_day.MillisSinceEpoch();
+}
+
+// Returns the time-of-day part of a timestamp: the number of milliseconds
+// elapsed since midnight of the same day.
+gdv_time32 castTIME_timestamp(gdv_timestamp timestamp_in_millis) {
+  EpochTimePoint instant(timestamp_in_millis);
+
+  const int64_t midnight_millis = instant.ClearTimeOfDay().MillisSinceEpoch();
+  const int64_t elapsed_today = instant.MillisSinceEpoch() - midnight_millis;
+
+  return static_cast<int32_t>(elapsed_today);
+}
+
+// Formats a timestamp as "yyyy-MM-dd hh:mm:ss.sss" and returns at most `length`
+// characters of that text.
+//
+// The result is allocated from the execution context's arena and is NOT
+// NUL-terminated; *out_len receives the number of bytes returned. A zero
+// `length` yields an empty string, a negative `length` additionally sets an
+// error on the context. On any failure *out_len is 0 and "" is returned.
+const char* castVARCHAR_timestamp_int64(gdv_int64 context, gdv_timestamp in,
+                                        gdv_int64 length, gdv_int32* out_len) {
+  gdv_int64 year = extractYear_timestamp(in);
+  gdv_int64 month = extractMonth_timestamp(in);
+  gdv_int64 day = extractDay_timestamp(in);
+  gdv_int64 hour = extractHour_timestamp(in);
+  gdv_int64 minute = extractMinute_timestamp(in);
+  gdv_int64 second = extractSecond_timestamp(in);
+  gdv_int64 millis = in % MILLIS_IN_SEC;
+  if (millis < 0) {
+    // '%' keeps the sign of the dividend; move pre-epoch remainders into
+    // [0, 999] so the fraction is never printed with a minus sign.
+    millis += MILLIS_IN_SEC;
+  }
+
+  static const int kTimeStampStringLen = 23;
+  const int char_buffer_length = kTimeStampStringLen + 1;  // snprintf adds \0
+  char char_buffer[char_buffer_length];
+
+  // yyyy-MM-dd hh:mm:ss.sss
+  int res = snprintf(char_buffer, char_buffer_length,
+                     "%04" PRId64 "-%02" PRId64 "-%02" PRId64 " %02" PRId64 ":%02" PRId64
+                     ":%02" PRId64 ".%03" PRId64,
+                     year, month, day, hour, minute, second, millis);
+  if (res < 0) {
+    gdv_fn_context_set_error_msg(context, "Could not format the timestamp");
+    *out_len = 0;
+    return "";
+  }
+
+  // Clamp in 64 bits first so that a huge requested length cannot wrap around
+  // when it is narrowed to the 32-bit output length.
+  const gdv_int64 capped_len =
+      (length > kTimeStampStringLen) ? kTimeStampStringLen : length;
+
+  if (capped_len <= 0) {
+    if (capped_len < 0) {
+      gdv_fn_context_set_error_msg(context, "Length of output string cannot be negative");
+    }
+    *out_len = 0;
+    return "";
+  }
+  *out_len = static_cast<gdv_int32>(capped_len);
+
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  memcpy(ret, char_buffer, *out_len);
+  return ret;
+}
+
+// Returns the day component of a day-time interval, which is stored as a signed
+// 32-bit value in the low 32 bits.
+FORCE_INLINE
+gdv_int64 extractDay_daytimeinterval(gdv_day_time_interval in) {
+  const gdv_int32 low_word = static_cast<gdv_int32>(in & 0x00000000FFFFFFFF);
+  return low_word;
+}
+
+// Returns the millisecond component of a day-time interval, which is stored as
+// a signed 32-bit value in the high 32 bits.
+FORCE_INLINE
+gdv_int64 extractMillis_daytimeinterval(gdv_day_time_interval in) {
+  const gdv_int32 high_word = static_cast<gdv_int32>((in & 0xFFFFFFFF00000000) >> 32);
+  return high_word;
+}
+
+// Flattens a day-time interval into a single millisecond count:
+// millis component + day component * milliseconds-per-day.
+FORCE_INLINE
+gdv_int64 castBIGINT_daytimeinterval(gdv_day_time_interval in) {
+  const gdv_int64 leftover_millis = extractMillis_daytimeinterval(in);
+  const gdv_int64 whole_days = extractDay_daytimeinterval(in);
+  return leftover_millis + whole_days * MILLIS_IN_DAY;
+}
+
+// Convert the seconds since epoch argument to timestamp (milliseconds).
+//
+// The scale factor is widened to 64 bits before multiplying: with a plain int
+// factor, integer arguments of 32 bits or fewer would be multiplied in their
+// own (promoted) width and overflow for ordinary present-day epoch values.
+// Floating-point arguments are still multiplied in floating point, so
+// fractional seconds are preserved.
+#define TO_TIMESTAMP(TYPE) \
+  FORCE_INLINE \
+  gdv_timestamp to_timestamp##_##TYPE(gdv_##TYPE seconds) { \
+    return static_cast<gdv_timestamp>(seconds * \
+                                      static_cast<gdv_int64>(MILLIS_IN_SEC)); \
+  }
+
+NUMERIC_TYPES(TO_TIMESTAMP)
+
+// Convert the seconds since epoch argument to time (milliseconds within the day).
+//
+// As in TO_TIMESTAMP, the scale factor is widened to 64 bits before multiplying
+// so that narrow integer arguments do not overflow; floating-point arguments
+// are still multiplied in floating point.
+#define TO_TIME(TYPE) \
+  FORCE_INLINE \
+  gdv_time32 to_time##_##TYPE(gdv_##TYPE seconds) { \
+    EpochTimePoint tp( \
+        static_cast<int64_t>(seconds * static_cast<gdv_int64>(MILLIS_IN_SEC))); \
+    return static_cast<gdv_time32>(tp.TimeOfDay().to_duration().count()); \
+  }
+
+NUMERIC_TYPES(TO_TIME)
+
+// Defines <TYPE>_year_interval(in): converts a month interval to a number of
+// years (months / 12) and converts that to the integer OUT_TYPE.
+#define CAST_INT_YEAR_INTERVAL(TYPE, OUT_TYPE) \
+  FORCE_INLINE \
+  gdv_##OUT_TYPE TYPE##_year_interval(gdv_month_interval in) { \
+    const double years = in / 12.0; \
+    return static_cast<gdv_##OUT_TYPE>(years); \
+  }
+
+CAST_INT_YEAR_INTERVAL(castBIGINT, int64)
+CAST_INT_YEAR_INTERVAL(castINT, int32)
+
+// Defines castNULLABLEINTERVALDAY_<TYPE>(in): converts an integer to the
+// day-time interval representation with a plain value cast.
+#define CAST_NULLABLE_INTERVAL_DAY(TYPE) \
+  FORCE_INLINE \
+  gdv_day_time_interval castNULLABLEINTERVALDAY_##TYPE(gdv_##TYPE in) { \
+    const gdv_day_time_interval interval = static_cast<gdv_day_time_interval>(in); \
+    return interval; \
+  }
+
+CAST_NULLABLE_INTERVAL_DAY(int32)
+CAST_NULLABLE_INTERVAL_DAY(int64)
+
+// Defines castNULLABLEINTERVALYEAR_<TYPE>(context, in): converts an integer to
+// the month interval representation. If the value does not survive the
+// conversion unchanged, "Integer overflow" is reported on the context; the
+// converted value is returned either way.
+#define CAST_NULLABLE_INTERVAL_YEAR(TYPE) \
+  FORCE_INLINE \
+  gdv_month_interval castNULLABLEINTERVALYEAR_##TYPE(int64_t context, gdv_##TYPE in) { \
+    const gdv_month_interval narrowed = static_cast<gdv_month_interval>(in); \
+    if (narrowed == in) { \
+      return narrowed; \
+    } \
+    gdv_fn_context_set_error_msg(context, "Integer overflow"); \
+    return narrowed; \
+  }
+
+CAST_NULLABLE_INTERVAL_YEAR(int32)
+CAST_NULLABLE_INTERVAL_YEAR(int64)
+
+} // extern "C"
diff --git a/src/arrow/cpp/src/gandiva/precompiled/time_constants.h b/src/arrow/cpp/src/gandiva/precompiled/time_constants.h
new file mode 100644
index 000000000..015ef4bf9
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/time_constants.h
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#define MILLIS_IN_SEC (1000)  // milliseconds per second
+#define MILLIS_IN_MIN (60 * MILLIS_IN_SEC)  // milliseconds per minute
+#define MILLIS_IN_HOUR (60 * MILLIS_IN_MIN)  // milliseconds per hour
+#define MILLIS_IN_DAY (24 * MILLIS_IN_HOUR)  // milliseconds per day
+#define MILLIS_IN_WEEK (7 * MILLIS_IN_DAY)  // milliseconds per week
+
+#define MILLIS_TO_SEC(millis) ((millis) / MILLIS_IN_SEC)  // divides a millisecond count by the unit size
+#define MILLIS_TO_MINS(millis) ((millis) / MILLIS_IN_MIN)  // same, in minutes
+#define MILLIS_TO_HOUR(millis) ((millis) / MILLIS_IN_HOUR)  // same, in hours
+#define MILLIS_TO_DAY(millis) ((millis) / MILLIS_IN_DAY)  // same, in days
+#define MILLIS_TO_WEEK(millis) ((millis) / MILLIS_IN_WEEK)  // same, in weeks
diff --git a/src/arrow/cpp/src/gandiva/precompiled/time_fields.h b/src/arrow/cpp/src/gandiva/precompiled/time_fields.h
new file mode 100644
index 000000000..d5277e743
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/time_fields.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace gandiva {
+
+enum TimeFields {  // names for the components of a parsed date/timestamp, in order
+ kYear,
+ kMonth,
+ kDay,
+ kHours,
+ kMinutes,
+ kSeconds,
+ kSubSeconds,  // fractional-second component
+ kDisplacementHours,  // hours part of a displacement
+ kDisplacementMinutes,  // minutes part of a displacement
+ kMax  // one past the last field, i.e. the number of fields
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/precompiled/time_test.cc b/src/arrow/cpp/src/gandiva/precompiled/time_test.cc
new file mode 100644
index 000000000..332ffa332
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/time_test.cc
@@ -0,0 +1,953 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include <time.h>
+
+#include "../execution_context.h"
+#include "gandiva/precompiled/testing.h"
+#include "gandiva/precompiled/types.h"
+
+namespace gandiva {
+
+TEST(TestTime, TestCastDate) {  // castDATE_utf8 parsing, year expansion and error reporting
+ ExecutionContext context;
+ int64_t context_ptr = reinterpret_cast<int64_t>(&context);
+
+ EXPECT_EQ(castDATE_utf8(context_ptr, "1967-12-1", 9), -65836800000);
+ EXPECT_EQ(castDATE_utf8(context_ptr, "2067-12-1", 9), 3089923200000);
+
+ EXPECT_EQ(castDATE_utf8(context_ptr, "7-12-1", 6), 1196467200000);  // short year below 70 -> 20xx
+ EXPECT_EQ(castDATE_utf8(context_ptr, "67-12-1", 7), 3089923200000);  // same value as "2067-12-1"
+ EXPECT_EQ(castDATE_utf8(context_ptr, "067-12-1", 8), 3089923200000);  // three digits still count as a short year
+ EXPECT_EQ(castDATE_utf8(context_ptr, "0067-12-1", 9), -60023980800000);  // four digits: taken literally
+ EXPECT_EQ(castDATE_utf8(context_ptr, "00067-12-1", 10), -60023980800000);
+ EXPECT_EQ(castDATE_utf8(context_ptr, "167-12-1", 8), -56868307200000);  // value >= 100: taken literally
+
+ EXPECT_EQ(castDATE_utf8(context_ptr, "1972-12-1", 9), 92016000000);
+ EXPECT_EQ(castDATE_utf8(context_ptr, "72-12-1", 7), 92016000000);  // short year 70-99 -> 19xx
+
+ EXPECT_EQ(castDATE_utf8(context_ptr, "1972222222", 10), 0);  // no separators: only one field
+ EXPECT_EQ(context.get_error(), "Not a valid date value 1972222222");
+ context.Reset();
+
+ EXPECT_EQ(castDATE_utf8(context_ptr, "blahblah", 8), 0);
+ EXPECT_EQ(castDATE_utf8(context_ptr, "1967-12-1bb", 11), -65836800000);  // text after the third field is ignored
+
+ EXPECT_EQ(castDATE_utf8(context_ptr, "67-12-1", 7), 3089923200000);
+ EXPECT_EQ(castDATE_utf8(context_ptr, "67-1-1", 6), 3061065600000);
+ EXPECT_EQ(castDATE_utf8(context_ptr, "71-1-1", 6), 31536000000);
+ EXPECT_EQ(castDATE_utf8(context_ptr, "71-45-1", 7), 0);  // month out of range
+ EXPECT_EQ(castDATE_utf8(context_ptr, "71-12-XX", 8), 0);  // no digits in the day field
+
+ EXPECT_EQ(castDATE_date32(1), 86400000);  // one day in milliseconds
+}
+
+// Exercises castTIMESTAMP_utf8: date-only input, short-year expansion, time and
+// fractional seconds, numeric displacements, and the error paths for invalid
+// time components.
+TEST(TestTime, TestCastTimestamp) {
+  ExecutionContext context;
+  int64_t context_ptr = reinterpret_cast<int64_t>(&context);
+
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "1967-12-1", 9), -65836800000);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2067-12-1", 9), 3089923200000);
+
+  // years written with fewer than four digits
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "7-12-1", 6), 1196467200000);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "67-12-1", 7), 3089923200000);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "067-12-1", 8), 3089923200000);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "0067-12-1", 9), -60023980800000);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "00067-12-1", 10), -60023980800000);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "167-12-1", 8), -56868307200000);
+
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "1972-12-1", 9), 92016000000);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "72-12-1", 7), 92016000000);
+
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "1972-12-1", 9), 92016000000);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "67-12-1", 7), 3089923200000);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "67-1-1", 6), 3061065600000);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "71-1-1", 6), 31536000000);
+
+  // time of day, with and without milliseconds
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-09-23 9:45:30", 18), 969702330000);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-09-23 9:45:30.920", 22), 969702330920);
+
+  // numeric displacements
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-09-23 9:45:30.920 +08:00", 29),
+            969673530920);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-09-23 9:45:30.920 -11:45", 29),
+            969744630920);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "65-03-04 00:20:40.920 +00:30", 28),
+            3003349840920);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "1932-05-18 11:30:00.920 +11:30", 30),
+            -1187308799080);
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "1857-02-11 20:31:40.920 -05:30", 30),
+            -3562264699080);
+  EXPECT_EQ(castTIMESTAMP_date64(
+                castDATE_utf8(context_ptr, "2000-09-23 9:45:30.920 +08:00", 29)),
+            castTIMESTAMP_utf8(context_ptr, "2000-09-23 0:00:00.000 +00:00", 29));
+
+  // one, two and three fractional digits all mean 100 ms; each length covers
+  // the whole literal so the trailing digits are really parsed
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-09-23 9:45:30.1", 20),
+            castTIMESTAMP_utf8(context_ptr, "2000-09-23 9:45:30", 18) + 100);
+
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-09-23 9:45:30.10", 21),
+            castTIMESTAMP_utf8(context_ptr, "2000-09-23 9:45:30", 18) + 100);
+
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-09-23 9:45:30.100", 22),
+            castTIMESTAMP_utf8(context_ptr, "2000-09-23 9:45:30", 18) + 100);
+
+  // error cases
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-01-01 24:00:00", 19), 0);
+  EXPECT_EQ(context.get_error(),
+            "Not a valid time for timestamp value 2000-01-01 24:00:00");
+  context.Reset();
+
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:60:00", 19), 0);
+  EXPECT_EQ(context.get_error(),
+            "Not a valid time for timestamp value 2000-01-01 00:60:00");
+  context.Reset();
+
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:00:100", 20), 0);
+  EXPECT_EQ(context.get_error(),
+            "Not a valid time for timestamp value 2000-01-01 00:00:100");
+  context.Reset();
+
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:00:00.0001", 24), 0);
+  EXPECT_EQ(context.get_error(),
+            "Invalid millis for timestamp value 2000-01-01 00:00:00.0001");
+  context.Reset();
+
+  EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:00:00.1000", 24), 0);
+  EXPECT_EQ(context.get_error(),
+            "Invalid millis for timestamp value 2000-01-01 00:00:00.1000");
+  context.Reset();
+}
+
+#ifndef _WIN32
+
+// TODO(wesm): ARROW-4495. Need to address TZ database issues on Windows
+
+TEST(TestTime, TestCastTimestampWithTZ) {  // inputs end in a zone name instead of a numeric displacement
+ ExecutionContext context;
+ int64_t context_ptr = reinterpret_cast<int64_t>(&context);
+
+ EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-09-23 9:45:30.920 Canada/Pacific", 37),
+ 969727530920);
+ EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2012-02-28 23:30:59 Asia/Kolkata", 32),
+ 1330452059000);
+ EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "1923-10-07 03:03:03 America/New_York", 36),
+ -1459094217000);
+}
+
+TEST(TestTime, TestCastTimestampErrors) {  // each failure returns 0 and leaves a message on the context
+ ExecutionContext context;
+ int64_t context_ptr = reinterpret_cast<int64_t>(&context);
+
+ // error cases
+ EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "20000923", 8), 0);  // no separators: everything lands in the year field
+ EXPECT_EQ(context.get_error(), "Not a valid day for timestamp value 20000923");
+ context.Reset();
+
+ EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-09-2b", 10), 0);  // the letter starts zone parsing
+ EXPECT_EQ(context.get_error(),
+ "Invalid timestamp or unknown zone for timestamp value 2000-09-2b");
+ context.Reset();
+
+ EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-09-23 9:45:30.920 Unknown/Zone", 35),
+ 0);
+ EXPECT_EQ(context.get_error(),
+ "Invalid timestamp or unknown zone for timestamp value 2000-09-23 "
+ "9:45:30.920 Unknown/Zone");
+ context.Reset();
+}
+
+#endif
+
+// Checks the hour / minute / second extractors for time32 values.
+TEST(TestTime, TestExtractTime) {
+  // 10:20:33 expressed as milliseconds since midnight (37233000)
+  const gdv_int32 ten_twenty_thirty_three = ((10 * 60 + 20) * 60 + 33) * 1000;
+
+  EXPECT_EQ(extractHour_time32(ten_twenty_thirty_three), 10);
+  EXPECT_EQ(extractMinute_time32(ten_twenty_thirty_three), 20);
+  EXPECT_EQ(extractSecond_time32(ten_twenty_thirty_three), 33);
+}
+
+// Checks timestampdiffMonth_timestamp_timestamp against a fixed first argument
+// (2019-06-30) and a range of second arguments before and after it. The
+// expected values are negative when the second argument is earlier.
+TEST(TestTime, TestTimestampDiffMonth) {
+  const gdv_timestamp june_30 = StringToTimestamp("2019-06-30 00:00:00");
+
+  const gdv_timestamp may_31 = StringToTimestamp("2019-05-31 00:00:00");
+  EXPECT_EQ(timestampdiffMonth_timestamp_timestamp(june_30, may_31), -1);
+
+  const gdv_timestamp feb_28 = StringToTimestamp("2019-02-28 00:00:00");
+  EXPECT_EQ(timestampdiffMonth_timestamp_timestamp(june_30, feb_28), -4);
+
+  const gdv_timestamp mar_31 = StringToTimestamp("2019-03-31 00:00:00");
+  EXPECT_EQ(timestampdiffMonth_timestamp_timestamp(june_30, mar_31), -3);
+
+  const gdv_timestamp same_day = StringToTimestamp("2019-06-30 00:00:00");
+  EXPECT_EQ(timestampdiffMonth_timestamp_timestamp(june_30, same_day), 0);
+
+  const gdv_timestamp jul_31 = StringToTimestamp("2019-07-31 00:00:00");
+  EXPECT_EQ(timestampdiffMonth_timestamp_timestamp(june_30, jul_31), 1);
+
+  const gdv_timestamp jul_30 = StringToTimestamp("2019-07-30 00:00:00");
+  EXPECT_EQ(timestampdiffMonth_timestamp_timestamp(june_30, jul_30), 1);
+
+  const gdv_timestamp jul_29 = StringToTimestamp("2019-07-29 00:00:00");
+  EXPECT_EQ(timestampdiffMonth_timestamp_timestamp(june_30, jul_29), 0);
+}
+
+// Checks every extract*_timestamp function against one known instant.
+TEST(TestTime, TestExtractTimestamp) {
+  const gdv_timestamp sample = StringToTimestamp("1970-05-02 10:20:33");
+
+  // calendar components, from the coarsest to the finest
+  EXPECT_EQ(extractMillennium_timestamp(sample), 2);
+  EXPECT_EQ(extractCentury_timestamp(sample), 20);
+  EXPECT_EQ(extractDecade_timestamp(sample), 197);
+  EXPECT_EQ(extractYear_timestamp(sample), 1970);
+  EXPECT_EQ(extractDoy_timestamp(sample), 122);
+  EXPECT_EQ(extractMonth_timestamp(sample), 5);
+  EXPECT_EQ(extractDow_timestamp(sample), 7);
+  EXPECT_EQ(extractDay_timestamp(sample), 2);
+
+  // time-of-day components
+  EXPECT_EQ(extractHour_timestamp(sample), 10);
+  EXPECT_EQ(extractMinute_timestamp(sample), 20);
+  EXPECT_EQ(extractSecond_timestamp(sample), 33);
+}
+
+TEST(TestTime, TimeStampTrunc) {  // one check per date_trunc unit, then week truncation across boundaries
+ EXPECT_EQ(date_trunc_Second_date64(StringToTimestamp("2015-05-05 10:20:34")),
+ StringToTimestamp("2015-05-05 10:20:34"));
+ EXPECT_EQ(date_trunc_Minute_date64(StringToTimestamp("2015-05-05 10:20:34")),
+ StringToTimestamp("2015-05-05 10:20:00"));
+ EXPECT_EQ(date_trunc_Hour_date64(StringToTimestamp("2015-05-05 10:20:34")),
+ StringToTimestamp("2015-05-05 10:00:00"));
+ EXPECT_EQ(date_trunc_Day_date64(StringToTimestamp("2015-05-05 10:20:34")),
+ StringToTimestamp("2015-05-05 00:00:00"));
+ EXPECT_EQ(date_trunc_Month_date64(StringToTimestamp("2015-05-05 10:20:34")),
+ StringToTimestamp("2015-05-01 00:00:00"));
+ EXPECT_EQ(date_trunc_Quarter_date64(StringToTimestamp("2015-05-05 10:20:34")),
+ StringToTimestamp("2015-04-01 00:00:00"));
+ EXPECT_EQ(date_trunc_Year_date64(StringToTimestamp("2015-05-05 10:20:34")),
+ StringToTimestamp("2015-01-01 00:00:00"));
+ EXPECT_EQ(date_trunc_Decade_date64(StringToTimestamp("2015-05-05 10:20:34")),
+ StringToTimestamp("2010-01-01 00:00:00"));
+ EXPECT_EQ(date_trunc_Century_date64(StringToTimestamp("2115-05-05 10:20:34")),
+ StringToTimestamp("2101-01-01 00:00:00"));  // centuries start in years ending in 01
+ EXPECT_EQ(date_trunc_Millennium_date64(StringToTimestamp("2115-05-05 10:20:34")),
+ StringToTimestamp("2001-01-01 00:00:00"));  // millennia start in years ending in 001
+
+ // truncate week going to previous year
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2011-01-01 10:10:10")),
+ StringToTimestamp("2010-12-27 00:00:00"));
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2011-01-02 10:10:10")),
+ StringToTimestamp("2010-12-27 00:00:00"));
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2011-01-03 10:10:10")),
+ StringToTimestamp("2011-01-03 00:00:00"));  // first day of the next week
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2011-01-04 10:10:10")),
+ StringToTimestamp("2011-01-03 00:00:00"));
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2011-01-05 10:10:10")),
+ StringToTimestamp("2011-01-03 00:00:00"));
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2011-01-06 10:10:10")),
+ StringToTimestamp("2011-01-03 00:00:00"));
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2011-01-07 10:10:10")),
+ StringToTimestamp("2011-01-03 00:00:00"));
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2011-01-08 10:10:10")),
+ StringToTimestamp("2011-01-03 00:00:00"));
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2011-01-09 10:10:10")),
+ StringToTimestamp("2011-01-03 00:00:00"));
+
+ // truncate week for Feb in a leap year
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2000-02-28 10:10:10")),
+ StringToTimestamp("2000-02-28 00:00:00"));
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2000-02-29 10:10:10")),
+ StringToTimestamp("2000-02-28 00:00:00"));
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2000-03-01 10:10:10")),
+ StringToTimestamp("2000-02-28 00:00:00"));
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2000-03-02 10:10:10")),
+ StringToTimestamp("2000-02-28 00:00:00"));
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2000-03-03 10:10:10")),
+ StringToTimestamp("2000-02-28 00:00:00"));
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2000-03-04 10:10:10")),
+ StringToTimestamp("2000-02-28 00:00:00"));
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2000-03-05 10:10:10")),
+ StringToTimestamp("2000-02-28 00:00:00"));
+ EXPECT_EQ(date_trunc_Week_timestamp(StringToTimestamp("2000-03-06 10:10:10")),
+ StringToTimestamp("2000-03-06 00:00:00"));  // start of the following week
+}
+
+// Exercises the timestampadd<Unit>, date_add/add and date_sub/subtract/date_diff
+// helpers against hand-computed calendar results.
+TEST(TestTime, TimeStampAdd) {
+  // Reference instants shared by most assertions below.
+  const gdv_timestamp ts = StringToTimestamp("2000-05-01 10:20:34");
+  const gdv_timestamp midnight = StringToTimestamp("2000-05-01 00:00:00");
+
+  // Fixed-length units, checked in both argument orders.
+  EXPECT_EQ(timestampaddSecond_int32_timestamp(30, ts),
+            StringToTimestamp("2000-05-01 10:21:04"));
+  EXPECT_EQ(timestampaddSecond_timestamp_int32(ts, 30),
+            StringToTimestamp("2000-05-01 10:21:04"));
+
+  EXPECT_EQ(timestampaddMinute_int64_timestamp(-30, ts),
+            StringToTimestamp("2000-05-01 09:50:34"));
+  EXPECT_EQ(timestampaddMinute_timestamp_int64(ts, -30),
+            StringToTimestamp("2000-05-01 09:50:34"));
+
+  EXPECT_EQ(timestampaddHour_int32_timestamp(20, ts),
+            StringToTimestamp("2000-05-02 06:20:34"));
+  EXPECT_EQ(timestampaddHour_timestamp_int32(ts, 20),
+            StringToTimestamp("2000-05-02 06:20:34"));
+
+  EXPECT_EQ(timestampaddDay_int64_timestamp(-35, ts),
+            StringToTimestamp("2000-03-27 10:20:34"));
+  EXPECT_EQ(timestampaddDay_timestamp_int64(ts, -35),
+            StringToTimestamp("2000-03-27 10:20:34"));
+
+  EXPECT_EQ(timestampaddWeek_int32_timestamp(4, ts),
+            StringToTimestamp("2000-05-29 10:20:34"));
+  EXPECT_EQ(timestampaddWeek_timestamp_int32(ts, 4),
+            StringToTimestamp("2000-05-29 10:20:34"));
+
+  // Month-based units. The expectations below show the day of month being
+  // clamped to the last valid day of the target month (e.g. Jan 31 + 1 month).
+  EXPECT_EQ(timestampaddMonth_int64_timestamp(10, ts),
+            StringToTimestamp("2001-03-01 10:20:34"));
+
+  EXPECT_EQ(
+      timestampaddMonth_int64_timestamp(1, StringToTimestamp("2000-01-31 10:20:34")),
+      StringToTimestamp("2000-2-29 10:20:34"));
+  EXPECT_EQ(
+      timestampaddMonth_int64_timestamp(13, StringToTimestamp("2001-01-31 10:20:34")),
+      StringToTimestamp("2002-02-28 10:20:34"));
+
+  EXPECT_EQ(
+      timestampaddMonth_int64_timestamp(11, StringToTimestamp("2000-05-31 10:20:34")),
+      StringToTimestamp("2001-04-30 10:20:34"));
+  EXPECT_EQ(
+      timestampaddMonth_timestamp_int64(StringToTimestamp("2000-05-31 10:20:34"), 11),
+      StringToTimestamp("2001-04-30 10:20:34"));
+
+  EXPECT_EQ(timestampaddQuarter_int32_timestamp(-2, ts),
+            StringToTimestamp("1999-11-01 10:20:34"));
+
+  EXPECT_EQ(timestampaddYear_int64_timestamp(2, ts),
+            StringToTimestamp("2002-05-01 10:20:34"));
+
+  EXPECT_EQ(timestampaddQuarter_int32_timestamp(-5, ts),
+            StringToTimestamp("1999-02-01 10:20:34"));
+  EXPECT_EQ(timestampaddQuarter_int32_timestamp(-6, ts),
+            StringToTimestamp("1998-11-01 10:20:34"));
+
+  // date_add / add: the integer argument is a number of days.
+  EXPECT_EQ(date_add_int32_timestamp(7, midnight),
+            StringToTimestamp("2000-05-08 00:00:00"));
+  EXPECT_EQ(add_int32_timestamp(4, midnight), StringToTimestamp("2000-05-05 00:00:00"));
+  EXPECT_EQ(add_int64_timestamp(7, midnight), StringToTimestamp("2000-05-08 00:00:00"));
+  EXPECT_EQ(date_add_int64_timestamp(4, midnight),
+            StringToTimestamp("2000-05-05 00:00:00"));
+
+  // Crossing Feb 29 of a leap year.
+  EXPECT_EQ(date_add_int64_timestamp(4, StringToTimestamp("2000-02-27 00:00:00")),
+            StringToTimestamp("2000-03-02 00:00:00"));
+  EXPECT_EQ(add_date64_int64(StringToTimestamp("2000-02-27 00:00:00"), 4),
+            StringToTimestamp("2000-03-02 00:00:00"));
+
+  // date_sub / subtract / date_diff: the integer argument is a number of days
+  // to move backwards (a negative count moves forwards).
+  EXPECT_EQ(date_sub_timestamp_int32(midnight, 7),
+            StringToTimestamp("2000-04-24 00:00:00"));
+  EXPECT_EQ(subtract_timestamp_int32(midnight, -7),
+            StringToTimestamp("2000-05-08 00:00:00"));
+
+  EXPECT_EQ(date_diff_timestamp_int64(midnight, 365),
+            StringToTimestamp("1999-05-02 00:00:00"));
+  EXPECT_EQ(date_diff_timestamp_int64(StringToTimestamp("2000-03-01 00:00:00"), 1),
+            StringToTimestamp("2000-02-29 00:00:00"));
+  EXPECT_EQ(date_diff_timestamp_int64(StringToTimestamp("2000-02-29 00:00:00"), 365),
+            StringToTimestamp("1999-03-01 00:00:00"));
+}
+
+// test cases from http://www.staff.science.uu.nl/~gent0113/calendar/isocalendar.htm
+//
+// Each year type (named after its dominical letter(s) on the page above) gets
+// eight samples: Jan 1-3, a Monday and the following Sunday from mid-year, and
+// Dec 29-31.
+TEST(TestTime, TestExtractWeek) {
+  struct WeekCase {
+    const char* timestamp;
+    gdv_int64 week;
+  };
+
+  const WeekCase cases[] = {
+      // A type
+      {"2006-01-01 10:10:10", 52},
+      {"2006-01-02 10:10:10", 1},
+      {"2006-01-03 10:10:10", 1},
+      {"2006-04-24 10:10:10", 17},
+      {"2006-04-30 10:10:10", 17},
+      {"2006-12-29 10:10:10", 52},
+      {"2006-12-30 10:10:10", 52},
+      {"2006-12-31 10:10:10", 52},
+      // B(C) type
+      {"2011-01-01 10:10:10", 52},
+      {"2011-01-02 10:10:10", 52},
+      {"2011-01-03 10:10:10", 1},
+      {"2011-07-18 10:10:10", 29},
+      {"2011-07-24 10:10:10", 29},
+      {"2011-12-29 10:10:10", 52},
+      {"2011-12-30 10:10:10", 52},
+      {"2011-12-31 10:10:10", 52},
+      // B(DC) type
+      {"2005-01-01 10:10:10", 53},
+      {"2005-01-02 10:10:10", 53},
+      {"2005-01-03 10:10:10", 1},
+      {"2005-11-07 10:10:10", 45},
+      {"2005-11-13 10:10:10", 45},
+      {"2005-12-29 10:10:10", 52},
+      {"2005-12-30 10:10:10", 52},
+      {"2005-12-31 10:10:10", 52},
+      // C type
+      {"2010-01-01 10:10:10", 53},
+      {"2010-01-02 10:10:10", 53},
+      {"2010-01-03 10:10:10", 53},
+      {"2010-09-13 10:10:10", 37},
+      {"2010-09-19 10:10:10", 37},
+      {"2010-12-29 10:10:10", 52},
+      {"2010-12-30 10:10:10", 52},
+      {"2010-12-31 10:10:10", 52},
+      // D type
+      {"2037-01-01 10:10:10", 1},
+      {"2037-01-02 10:10:10", 1},
+      {"2037-01-03 10:10:10", 1},
+      {"2037-08-17 10:10:10", 34},
+      {"2037-08-23 10:10:10", 34},
+      {"2037-12-29 10:10:10", 53},
+      {"2037-12-30 10:10:10", 53},
+      {"2037-12-31 10:10:10", 53},
+      // E type
+      {"2014-01-01 10:10:10", 1},
+      {"2014-01-02 10:10:10", 1},
+      {"2014-01-03 10:10:10", 1},
+      {"2014-01-13 10:10:10", 3},
+      {"2014-01-19 10:10:10", 3},
+      {"2014-12-29 10:10:10", 1},
+      {"2014-12-30 10:10:10", 1},
+      {"2014-12-31 10:10:10", 1},
+      // F type
+      {"2019-01-01 10:10:10", 1},
+      {"2019-01-02 10:10:10", 1},
+      {"2019-01-03 10:10:10", 1},
+      {"2019-02-11 10:10:10", 7},
+      {"2019-02-17 10:10:10", 7},
+      {"2019-12-29 10:10:10", 52},
+      {"2019-12-30 10:10:10", 1},
+      {"2019-12-31 10:10:10", 1},
+      // G type
+      {"2001-01-01 10:10:10", 1},
+      {"2001-01-02 10:10:10", 1},
+      {"2001-01-03 10:10:10", 1},
+      {"2001-03-19 10:10:10", 12},
+      {"2001-03-25 10:10:10", 12},
+      {"2001-12-29 10:10:10", 52},
+      {"2001-12-30 10:10:10", 52},
+      {"2001-12-31 10:10:10", 1},
+      // AG type
+      {"2012-01-01 10:10:10", 52},
+      {"2012-01-02 10:10:10", 1},
+      {"2012-01-03 10:10:10", 1},
+      {"2012-04-02 10:10:10", 14},
+      {"2012-04-08 10:10:10", 14},
+      {"2012-12-29 10:10:10", 52},
+      {"2012-12-30 10:10:10", 52},
+      {"2012-12-31 10:10:10", 1},
+      // BA type
+      {"2000-01-01 10:10:10", 52},
+      {"2000-01-02 10:10:10", 52},
+      {"2000-01-03 10:10:10", 1},
+      {"2000-05-22 10:10:10", 21},
+      {"2000-05-28 10:10:10", 21},
+      {"2000-12-29 10:10:10", 52},
+      {"2000-12-30 10:10:10", 52},
+      {"2000-12-31 10:10:10", 52},
+      // CB type
+      {"2016-01-01 10:10:10", 53},
+      {"2016-01-02 10:10:10", 53},
+      {"2016-01-03 10:10:10", 53},
+      {"2016-06-20 10:10:10", 25},
+      {"2016-06-26 10:10:10", 25},
+      {"2016-12-29 10:10:10", 52},
+      {"2016-12-30 10:10:10", 52},
+      {"2016-12-31 10:10:10", 52},
+      // DC type
+      {"2004-01-01 10:10:10", 1},
+      {"2004-01-02 10:10:10", 1},
+      {"2004-01-03 10:10:10", 1},
+      {"2004-07-19 10:10:10", 30},
+      {"2004-07-25 10:10:10", 30},
+      {"2004-12-29 10:10:10", 53},
+      {"2004-12-30 10:10:10", 53},
+      {"2004-12-31 10:10:10", 53},
+      // ED type
+      {"2020-01-01 10:10:10", 1},
+      {"2020-01-02 10:10:10", 1},
+      {"2020-01-03 10:10:10", 1},
+      {"2020-08-17 10:10:10", 34},
+      {"2020-08-23 10:10:10", 34},
+      {"2020-12-29 10:10:10", 53},
+      {"2020-12-30 10:10:10", 53},
+      {"2020-12-31 10:10:10", 53},
+      // FE type
+      {"2008-01-01 10:10:10", 1},
+      {"2008-01-02 10:10:10", 1},
+      {"2008-01-03 10:10:10", 1},
+      {"2008-09-15 10:10:10", 38},
+      {"2008-09-21 10:10:10", 38},
+      {"2008-12-29 10:10:10", 1},
+      {"2008-12-30 10:10:10", 1},
+      {"2008-12-31 10:10:10", 1},
+      // GF type
+      {"2024-01-01 10:10:10", 1},
+      {"2024-01-02 10:10:10", 1},
+      {"2024-01-03 10:10:10", 1},
+      {"2024-10-07 10:10:10", 41},
+      {"2024-10-13 10:10:10", 41},
+      {"2024-12-29 10:10:10", 52},
+      {"2024-12-30 10:10:10", 1},
+      {"2024-12-31 10:10:10", 1},
+  };
+
+  for (const WeekCase& sample : cases) {
+    const gdv_timestamp ts = StringToTimestamp(sample.timestamp);
+    EXPECT_EQ(extractWeek_timestamp(ts), sample.week);
+  }
+}
+
+// Checks months_between(end, start) against reference values, allowing an
+// absolute error below 0.001.
+TEST(TestTime, TestMonthsBetween) {
+  // Flat list of (end timestamp, start timestamp, expected result) triples.
+  const std::vector<std::string> triples = {
+      "1995-03-02 00:00:00", "1995-02-02 00:00:00", "1.0",
+      "1995-02-02 00:00:00", "1995-03-02 00:00:00", "-1.0",
+      "1995-03-31 00:00:00", "1995-02-28 00:00:00", "1.0",
+      "1996-03-31 00:00:00", "1996-02-28 00:00:00", "1.09677418",
+      "1996-03-31 00:00:00", "1996-02-29 00:00:00", "1.0",
+      "1996-05-31 00:00:00", "1996-04-30 00:00:00", "1.0",
+      "1996-05-31 00:00:00", "1996-03-31 00:00:00", "2.0",
+      "1996-05-31 00:00:00", "1996-03-30 00:00:00", "2.03225806",
+      "1996-03-15 00:00:00", "1996-02-14 00:00:00", "1.03225806",
+      "1995-02-02 00:00:00", "1995-01-01 00:00:00", "1.03225806",
+      "1995-02-02 10:00:00", "1995-01-01 11:00:00", "1.03091397"};
+
+  for (uint32_t base = 0; base < triples.size(); base += 3) {
+    const gdv_timestamp end_ts = StringToTimestamp(triples[base].c_str());
+    const gdv_timestamp start_ts = StringToTimestamp(triples[base + 1].c_str());
+    const double expected = atof(triples[base + 2].c_str());
+
+    const double actual = months_between_timestamp_timestamp(end_ts, start_ts);
+
+    // Absolute error, computed by hand.
+    const double signed_error = actual - expected;
+    const double abs_error = (signed_error < 0) ? (expected - actual) : signed_error;
+
+    EXPECT_TRUE(abs_error < 0.001);
+  }
+}
+
+// castVARCHAR on a timestamp: the requested length caps the formatted output,
+// as the expectations below show (full text, cut to 19 characters, empty).
+TEST(TestTime, castVarcharTimestamp) {
+  ExecutionContext context;
+  const int64_t context_ptr = reinterpret_cast<int64_t>(&context);
+
+  // Runs the cast and copies the returned (pointer, length) pair into a string.
+  auto render = [context_ptr](gdv_timestamp value, gdv_int64 max_len) {
+    gdv_int32 produced_len;
+    const char* text =
+        castVARCHAR_timestamp_int64(context_ptr, value, max_len, &produced_len);
+    return std::string(text, produced_len);
+  };
+
+  const gdv_timestamp may_first = StringToTimestamp("2000-05-01 10:20:34");
+  EXPECT_EQ(render(may_first, 30L), "2000-05-01 10:20:34.000");
+  EXPECT_EQ(render(may_first, 19L), "2000-05-01 10:20:34");
+  EXPECT_EQ(render(may_first, 0L), "");
+
+  // Year with fewer than four digits is zero-padded in the output.
+  const gdv_timestamp year_two = StringToTimestamp("2-5-1 00:00:04");
+  EXPECT_EQ(render(year_two, 24L), "0002-05-01 00:00:04.000");
+}
+
+// castDATE on a timestamp is expected to yield midnight of the same day.
+TEST(TestTime, TestCastTimestampToDate) {
+  const gdv_timestamp with_time_of_day = StringToTimestamp("2000-05-01 10:20:34");
+  const auto as_date = castDATE_timestamp(with_time_of_day);
+  const auto same_day_midnight = StringToTimestamp("2000-05-01 00:00:00");
+  EXPECT_EQ(same_day_midnight, as_date);
+}
+
+// castTIME on a timestamp is expected to equal the offset of the instant from
+// midnight of its own day.
+TEST(TestTime, TestCastTimestampToTime) {
+  struct TimeCase {
+    const char* instant;
+    const char* midnight_of_day;
+  };
+  const TimeCase cases[] = {
+      {"2000-05-01 10:20:34", "2000-05-01 00:00:00"},
+      // The instant is itself midnight, so the returned value must be 0.
+      {"1998-12-01 00:00:00", "1998-12-01 00:00:00"},
+      {"2015-09-16 23:59:59", "2015-09-16 00:00:00"},
+  };
+
+  for (const TimeCase& sample : cases) {
+    const gdv_timestamp instant = StringToTimestamp(sample.instant);
+    const auto expected_offset =
+        static_cast<int32_t>(instant - StringToTimestamp(sample.midnight_of_day));
+    const auto actual_offset = castTIME_timestamp(instant);
+    EXPECT_EQ(expected_offset, actual_offset);
+  }
+}
+
+// last_day_from_timestamp is expected to return midnight of the last day of
+// the input's month.
+TEST(TestTime, TestLastDay) {
+  struct LastDayCase {
+    const char* input;
+    const char* last_day;
+  };
+  const LastDayCase cases[] = {
+      // leap year
+      {"2016-02-11 03:20:34", "2016-02-29 00:00:00"},
+      {"2016-02-29 23:59:59", "2016-02-29 00:00:00"},
+      {"2016-01-30 23:59:00", "2016-01-31 00:00:00"},
+      // normal year
+      {"2017-02-03 23:59:59", "2017-02-28 00:00:00"},
+      // december
+      {"2015-12-03 03:12:59", "2015-12-31 00:00:00"},
+  };
+
+  for (const LastDayCase& sample : cases) {
+    const gdv_timestamp input = StringToTimestamp(sample.input);
+    const auto actual = last_day_from_timestamp(input);
+    EXPECT_EQ(StringToTimestamp(sample.last_day), actual);
+  }
+}
+
+// to_timestamp_<type> takes seconds since the epoch; the expectations are
+// built from StringToTimestamp, with fractional seconds added as milliseconds.
+TEST(TestTime, TestToTimestamp) {
+  struct WholeSecondCase {
+    const char* expected;
+    int seconds;
+  };
+  const WholeSecondCase whole_seconds[] = {
+      {"1970-01-01 00:00:00", 0},    {"1970-01-01 00:00:01", 1},
+      {"1970-01-01 00:01:00", 60},   {"1970-01-01 01:00:00", 3600},
+      {"1970-01-02 00:00:00", 86400},
+  };
+
+  for (const WholeSecondCase& sample : whole_seconds) {
+    const auto expected = StringToTimestamp(sample.expected);
+    EXPECT_EQ(expected, to_timestamp_int32(sample.seconds));
+    EXPECT_EQ(expected, to_timestamp_int64(sample.seconds));
+    EXPECT_EQ(expected, to_timestamp_float32(static_cast<float>(sample.seconds)));
+    EXPECT_EQ(expected, to_timestamp_float64(static_cast<double>(sample.seconds)));
+  }
+
+  // tests with fractional part
+  const auto one_and_a_half = StringToTimestamp("1970-01-01 00:00:01") + 500;
+  EXPECT_EQ(one_and_a_half, to_timestamp_float32(1.500f));
+  EXPECT_EQ(one_and_a_half, to_timestamp_float64(1.500));
+
+  const auto minute_plus = StringToTimestamp("1970-01-01 00:01:01") + 600;
+  EXPECT_EQ(minute_plus, to_timestamp_float32(61.600f));
+  EXPECT_EQ(minute_plus, to_timestamp_float64(61.600));
+
+  const auto hour_plus = StringToTimestamp("1970-01-01 01:00:01") + 400;
+  EXPECT_EQ(hour_plus, to_timestamp_float32(3601.400f));
+  EXPECT_EQ(hour_plus, to_timestamp_float64(3601.400));
+}
+
+// to_time_<type> takes seconds since the epoch and is expected to return the
+// time of day in milliseconds.
+TEST(TestTime, TestToTimeNumeric) {
+  struct WholeSecondCase {
+    int seconds;
+    int64_t expected_millis;
+  };
+  const WholeSecondCase whole_seconds[] = {
+      {0, 0},              // 1970-01-01 00:00:00
+      {1, 1000},           // 1970-01-01 00:00:01
+      {3600, 3600000},     // 1970-01-01 01:00:00
+      {86399, 86399000},   // 1970-01-01 23:59:59
+  };
+
+  for (const WholeSecondCase& sample : whole_seconds) {
+    EXPECT_EQ(sample.expected_millis, to_time_int32(sample.seconds));
+    EXPECT_EQ(sample.expected_millis, to_time_int64(sample.seconds));
+    EXPECT_EQ(sample.expected_millis,
+              to_time_float32(static_cast<float>(sample.seconds)));
+    EXPECT_EQ(sample.expected_millis,
+              to_time_float64(static_cast<double>(sample.seconds)));
+  }
+
+  // input timestamp in seconds: 2020-01-01 00:00:01 -> only the time of day remains
+  int64_t expected_output = 1000;  // 1 second
+  EXPECT_EQ(expected_output, to_time_int64(1577836801));
+  EXPECT_EQ(expected_output, to_time_float64(1577836801.000));
+
+  // tests with fractional part
+  // input timestamp in seconds: 1970-01-01 00:00:01.500
+  expected_output = 1500;  // 1.5 seconds
+  EXPECT_EQ(expected_output, to_time_float32(1.500f));
+  EXPECT_EQ(expected_output, to_time_float64(1.500));
+
+  // input timestamp in seconds: 1970-01-01 00:01:01.500
+  expected_output = 61500;  // 61.5 seconds
+  EXPECT_EQ(expected_output, to_time_float32(61.500f));
+  EXPECT_EQ(expected_output, to_time_float64(61.500));
+
+  // input timestamp in seconds: 1970-01-01 01:00:01.500
+  expected_output = 3601500;  // 3601.5 seconds
+  EXPECT_EQ(expected_output, to_time_float32(3601.500f));
+  EXPECT_EQ(expected_output, to_time_float64(3601.500));
+}
+
+// castBIGINT on a day-time interval value.
+// NOTE(review): the expectation for -100 ends in ...0001 rather than being an
+// exact multiple of the 10 -> 864000000 case; presumably this reflects how the
+// packed interval is split into fields for negative input - confirm against
+// the castBIGINT_daytimeinterval implementation.
+TEST(TestTime, TestCastIntDayInterval) {
+  const auto from_ten = castBIGINT_daytimeinterval(10);
+  const auto from_minus_hundred = castBIGINT_daytimeinterval(-100);
+  const auto from_zero = castBIGINT_daytimeinterval(-0);
+
+  EXPECT_EQ(from_ten, 864000000);
+  EXPECT_EQ(from_minus_hundred, -8640000001);
+  EXPECT_EQ(from_zero, 0);
+}
+
+// castINT / castBIGINT on a year interval: both variants must agree on every
+// sample. The samples show truncation toward zero (-23 -> -1).
+TEST(TestTime, TestCastIntYearInterval) {
+  struct YearIntervalCase {
+    int input;
+    int expected;
+  };
+  const YearIntervalCase cases[] = {{24, 2}, {-24, -2}, {-23, -1}};
+
+  for (const YearIntervalCase& sample : cases) {
+    EXPECT_EQ(castINT_year_interval(sample.input), sample.expected);
+  }
+
+  for (const YearIntervalCase& sample : cases) {
+    EXPECT_EQ(castBIGINT_year_interval(sample.input), sample.expected);
+  }
+}
+
+// castNULLABLEINTERVALDAY / castNULLABLEINTERVALYEAR pass small integers
+// through unchanged; the int64 year variant reports an error on the context
+// when the input does not fit.
+TEST(TestTime, TestCastNullableInterval) {
+  ExecutionContext context;
+  auto context_ptr = reinterpret_cast<int64_t>(&context);
+
+  // Test castNULLABLEINTERVALDAY for int and bigint
+  const int day_values[] = {1, 12, -55, -1201};
+  for (int value : day_values) {
+    EXPECT_EQ(castNULLABLEINTERVALDAY_int32(value), value);
+  }
+  for (int value : day_values) {
+    EXPECT_EQ(castNULLABLEINTERVALDAY_int64(value), value);
+  }
+
+  // Test castNULLABLEINTERVALYEAR for int and bigint
+  const int year_values[] = {1, 12, 55, 1201};
+  for (int value : year_values) {
+    EXPECT_EQ(castNULLABLEINTERVALYEAR_int32(context_ptr, value), value);
+  }
+  for (int value : year_values) {
+    EXPECT_EQ(castNULLABLEINTERVALYEAR_int64(context_ptr, value), value);
+  }
+
+  // validate overflow error when using bigint as input
+  castNULLABLEINTERVALYEAR_int64(context_ptr, INT64_MAX);
+  EXPECT_EQ(context.get_error(), "Integer overflow");
+  context.Reset();
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/precompiled/timestamp_arithmetic.cc b/src/arrow/cpp/src/gandiva/precompiled/timestamp_arithmetic.cc
new file mode 100644
index 000000000..695605b3c
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/timestamp_arithmetic.cc
@@ -0,0 +1,283 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./epoch_time_point.h"
+
+// Number of days in each month, indexed as [is_leap_year][month], where month
+// is zero-based (0 = January). The first row is for non-leap years, the second
+// for leap years. Read-only lookup table, hence const.
+static const int days_in_a_month[2][12] = {
+    {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
+    {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
+
+// Returns true when `yy` is a leap year under the Gregorian rule: divisible
+// by 4, and either not divisible by 100 or divisible by 400.
+bool is_leap_year(int yy) {
+  const bool divisible_by_4 = (yy % 4) == 0;
+  const bool divisible_by_100 = (yy % 100) == 0;
+  const bool divisible_by_400 = (yy % 400) == 0;
+
+  return divisible_by_4 && (divisible_by_400 || !divisible_by_100);
+}
+
+// Returns true when the day-of-month of `tp` equals the length of its month.
+// TmMon() is used directly as a zero-based index into days_in_a_month.
+// NOTE(review): TmYear() is passed straight to is_leap_year(). If TmYear()
+// follows the struct tm convention (years since 1900) the leap test would be
+// wrong for years such as 2000 - confirm against EpochTimePoint.
+bool is_last_day_of_month(const EpochTimePoint& tp) {
+  const int* month_lengths =
+      is_leap_year(tp.TmYear()) ? days_in_a_month[1] : days_in_a_month[0];
+
+  return month_lengths[tp.TmMon()] == tp.TmMday();
+}
+
+// Returns true when the day field of `ymd` is larger than the number of days
+// in its month (ymd.month() is one-based, hence the -1 when indexing).
+bool did_days_overflow(arrow_vendored::date::year_month_day ymd) {
+  const int year_number = static_cast<int>(ymd.year());
+  const int month_number = static_cast<unsigned int>(ymd.month());
+  const int day_number = static_cast<unsigned int>(ymd.day());
+
+  const int* month_lengths = days_in_a_month[is_leap_year(year_number) ? 1 : 0];
+
+  return month_lengths[month_number - 1] < day_number;
+}
+
+// Returns the number of days in `month` (one-based, 1 = January) of `year`.
+// `month` is not range-checked.
+int last_possible_day_in_month(int year, int month) {
+  if (is_leap_year(year)) {
+    return days_in_a_month[1][month - 1];
+  }
+  return days_in_a_month[0][month - 1];
+}
+
+extern "C" {
+
+#include <time.h>
+
+#include "./time_constants.h"
+#include "./types.h"
+
+// Defines NAME_TYPE_TYPE(start_millis, end_millis), which returns
+// (end_millis - start_millis) converted by FROM_MILLIS and narrowed to 32 bits.
+// The result is negative when end_millis precedes start_millis.
+#define TIMESTAMP_DIFF_FIXED_UNITS(TYPE, NAME, FROM_MILLIS) \
+ FORCE_INLINE \
+ gdv_int32 NAME##_##TYPE##_##TYPE(gdv_##TYPE start_millis, gdv_##TYPE end_millis) { \
+ return static_cast<int32_t>(FROM_MILLIS(end_millis - start_millis)); \
+ }
+
+// Negates `diff` when the operands were swapped (is_positive == false).
+#define SIGN_ADJUST_DIFF(is_positive, diff) ((is_positive) ? (diff) : -(diff))
+// Converts a month count into units of `num_months` months (integer division).
+// The whole expansion is parenthesized so it composes safely inside larger
+// expressions.
+#define MONTHS_TO_TIMEUNIT(diff, num_months) ((diff) / (num_months))
+
+// Defines NAME_TYPE_TYPE(start_millis, end_millis): the number of whole
+// N_MONTHS-month units between the two instants.
+//
+// Assuming end_millis > start_millis, the algorithm to find the diff in months is:
+//   diff_in_months = year_diff * 12 + month_diff
+// This is approximately correct, except when the last month has not fully elapsed
+//
+// a) If end_day > start_day, return diff_in_months
+//    e.g. diff(2015-09-10, 2017-03-31)
+// b) If end_day < start_day, return diff_in_months - 1
+//    e.g. diff(2015-09-30, 2017-03-10),
+//    unless end is the last day of its month, in which case the last month
+//    counts as elapsed and diff_in_months is returned
+// c) If end_day = start_day, compare the time of day
+//    e.g. diff(2015-03-10, 2017-03-10)
+//    c1) If end time-of-day >= start time-of-day, return diff_in_months
+//    c2) else return diff_in_months - 1
+//
+// If end_millis <= start_millis the operands are swapped and the result is
+// negated. The time-of-day comparison uses hours, minutes and seconds only.
+#define TIMESTAMP_DIFF_MONTH_UNITS(TYPE, NAME, N_MONTHS) \
+  FORCE_INLINE \
+  gdv_int32 NAME##_##TYPE##_##TYPE(gdv_##TYPE start_millis, gdv_##TYPE end_millis) { \
+    gdv_int32 diff; \
+    bool is_positive = (end_millis > start_millis); \
+    if (!is_positive) { \
+      /* if end_millis < start_millis, swap and multiply by -1 at the end */ \
+      gdv_##TYPE tmp = start_millis; \
+      start_millis = end_millis; \
+      end_millis = tmp; \
+    } \
+    EpochTimePoint start_tm(start_millis); \
+    EpochTimePoint end_tm(end_millis); \
+    gdv_int32 months_diff; \
+    months_diff = static_cast<gdv_int32>(12 * (end_tm.TmYear() - start_tm.TmYear()) + \
+                                         (end_tm.TmMon() - start_tm.TmMon())); \
+    if (end_tm.TmMday() > start_tm.TmMday()) { \
+      /* case a */ \
+      diff = MONTHS_TO_TIMEUNIT(months_diff, N_MONTHS); \
+      return SIGN_ADJUST_DIFF(is_positive, diff); \
+    } \
+    if (end_tm.TmMday() < start_tm.TmMday()) { \
+      /* case b: the +1 cancels the -1 below when end is a month's last day */ \
+      months_diff += (is_last_day_of_month(end_tm) ? 1 : 0); \
+      diff = MONTHS_TO_TIMEUNIT(months_diff - 1, N_MONTHS); \
+      return SIGN_ADJUST_DIFF(is_positive, diff); \
+    } \
+    /* case c: same day of month, so compare time of day. Seconds are scaled */ \
+    /* to milliseconds so all three terms share one unit. */ \
+    gdv_int32 end_day_millis = static_cast<gdv_int32>( \
+        end_tm.TmHour() * MILLIS_IN_HOUR + end_tm.TmMin() * MILLIS_IN_MIN + \
+        end_tm.TmSec() * MILLIS_IN_SEC); \
+    gdv_int32 start_day_millis = static_cast<gdv_int32>( \
+        start_tm.TmHour() * MILLIS_IN_HOUR + start_tm.TmMin() * MILLIS_IN_MIN + \
+        start_tm.TmSec() * MILLIS_IN_SEC); \
+    if (end_day_millis >= start_day_millis) { \
+      /* case c1 */ \
+      diff = MONTHS_TO_TIMEUNIT(months_diff, N_MONTHS); \
+      return SIGN_ADJUST_DIFF(is_positive, diff); \
+    } \
+    /* case c2 */ \
+    diff = MONTHS_TO_TIMEUNIT(months_diff - 1, N_MONTHS); \
+    return SIGN_ADJUST_DIFF(is_positive, diff); \
+  }
+
+// Instantiates the whole timestampdiff<Unit>_TYPE_TYPE family for TYPE:
+// second/minute/hour/day/week via fixed millisecond conversions, and
+// month/quarter/year via the calendar-aware month difference (1, 3 and 12
+// months per unit respectively).
+#define TIMESTAMP_DIFF(TYPE) \
+ TIMESTAMP_DIFF_FIXED_UNITS(TYPE, timestampdiffSecond, MILLIS_TO_SEC) \
+ TIMESTAMP_DIFF_FIXED_UNITS(TYPE, timestampdiffMinute, MILLIS_TO_MINS) \
+ TIMESTAMP_DIFF_FIXED_UNITS(TYPE, timestampdiffHour, MILLIS_TO_HOUR) \
+ TIMESTAMP_DIFF_FIXED_UNITS(TYPE, timestampdiffDay, MILLIS_TO_DAY) \
+ TIMESTAMP_DIFF_FIXED_UNITS(TYPE, timestampdiffWeek, MILLIS_TO_WEEK) \
+ TIMESTAMP_DIFF_MONTH_UNITS(TYPE, timestampdiffMonth, 1) \
+ TIMESTAMP_DIFF_MONTH_UNITS(TYPE, timestampdiffQuarter, 3) \
+ TIMESTAMP_DIFF_MONTH_UNITS(TYPE, timestampdiffYear, 12)
+
+// Only the timestamp type gets the diff functions.
+TIMESTAMP_DIFF(timestamp)
+
+// Defines NAME_int32_TYPE(count, millis): returns `millis` advanced by `count`
+// units, where TO_MILLIS is the length of one unit in milliseconds.
+// TO_MILLIS is parenthesized because callers pass expressions such as
+// `-1 * MILLIS_IN_DAY`.
+#define ADD_INT32_TO_TIMESTAMP_FIXED_UNITS(TYPE, NAME, TO_MILLIS) \
+  FORCE_INLINE \
+  gdv_##TYPE NAME##_int32_##TYPE(gdv_int32 count, gdv_##TYPE millis) { \
+    return millis + (TO_MILLIS) * static_cast<gdv_##TYPE>(count); \
+  }
+
+// Defines NAME_int32_TYPE(count, millis): returns `millis` advanced by
+// `count` units of N_MONTHS months each.
+//
+// The calendar arithmetic (month/year carry and day-of-month handling) is
+// delegated to EpochTimePoint::AddMonths; the input millis are since epoch.
+// The product is formed in 64 bits so a large `count` cannot overflow signed
+// 32-bit arithmetic before being narrowed to the int that AddMonths receives.
+#define ADD_INT32_TO_TIMESTAMP_MONTH_UNITS(TYPE, NAME, N_MONTHS) \
+  FORCE_INLINE \
+  gdv_##TYPE NAME##_int32_##TYPE(gdv_int32 count, gdv_##TYPE millis) { \
+    EpochTimePoint tp(millis); \
+    return tp \
+        .AddMonths(static_cast<int>(static_cast<gdv_int64>(count) * (N_MONTHS))) \
+        .MillisSinceEpoch(); \
+  }
+
+// Defines NAME_int64_TYPE(count, millis): returns `millis` advanced by `count`
+// units, where TO_MILLIS is the length of one unit in milliseconds.
+// TO_MILLIS is parenthesized because callers may pass compound expressions.
+// TODO: Handle overflow while converting gdv_int64 to millis
+#define ADD_INT64_TO_TIMESTAMP_FIXED_UNITS(TYPE, NAME, TO_MILLIS) \
+  FORCE_INLINE \
+  gdv_##TYPE NAME##_int64_##TYPE(gdv_int64 count, gdv_##TYPE millis) { \
+    return millis + (TO_MILLIS) * static_cast<gdv_##TYPE>(count); \
+  }
+
+// Defines NAME_int64_TYPE(count, millis): returns `millis` advanced by
+// `count` units of N_MONTHS months each, via EpochTimePoint::AddMonths.
+// Note that the month total is narrowed to int before the call, so counts
+// outside the int range are not represented faithfully.
+#define ADD_INT64_TO_TIMESTAMP_MONTH_UNITS(TYPE, NAME, N_MONTHS) \
+  FORCE_INLINE \
+  gdv_##TYPE NAME##_int64_##TYPE(gdv_int64 count, gdv_##TYPE millis) { \
+    EpochTimePoint tp(millis); \
+    return tp.AddMonths(static_cast<int>(count * (N_MONTHS))).MillisSinceEpoch(); \
+  }
+
+// Defines NAME_TYPE_int32(millis, count): same as the (count, millis) variant
+// with the arguments reversed. Returns `millis` advanced by `count` units of
+// TO_MILLIS milliseconds each. TO_MILLIS is parenthesized because callers pass
+// `-1 * MILLIS_IN_DAY` for the subtracting variants.
+#define ADD_TIMESTAMP_TO_INT32_FIXED_UNITS(TYPE, NAME, TO_MILLIS) \
+  FORCE_INLINE \
+  gdv_##TYPE NAME##_##TYPE##_int32(gdv_##TYPE millis, gdv_int32 count) { \
+    return millis + (TO_MILLIS) * static_cast<gdv_##TYPE>(count); \
+  }
+
+// Defines NAME_TYPE_int64(millis, count): returns `millis` advanced by `count`
+// units of TO_MILLIS milliseconds each. TO_MILLIS is parenthesized because
+// callers pass `-1 * MILLIS_IN_DAY` for the subtracting variants.
+#define ADD_TIMESTAMP_TO_INT64_FIXED_UNITS(TYPE, NAME, TO_MILLIS) \
+  FORCE_INLINE \
+  gdv_##TYPE NAME##_##TYPE##_int64(gdv_##TYPE millis, gdv_int64 count) { \
+    return millis + (TO_MILLIS) * static_cast<gdv_##TYPE>(count); \
+  }
+
+// Defines NAME_TYPE_int32(millis, count): returns `millis` advanced by
+// `count` units of N_MONTHS months each, via EpochTimePoint::AddMonths.
+// The product is formed in 64 bits so a large `count` cannot overflow signed
+// 32-bit arithmetic before being narrowed to the int that AddMonths receives.
+#define ADD_TIMESTAMP_TO_INT32_MONTH_UNITS(TYPE, NAME, N_MONTHS) \
+  FORCE_INLINE \
+  gdv_##TYPE NAME##_##TYPE##_int32(gdv_##TYPE millis, gdv_int32 count) { \
+    EpochTimePoint tp(millis); \
+    return tp \
+        .AddMonths(static_cast<int>(static_cast<gdv_int64>(count) * (N_MONTHS))) \
+        .MillisSinceEpoch(); \
+  }
+
+// Defines NAME_TYPE_int64(millis, count): returns `millis` advanced by
+// `count` units of N_MONTHS months each, via EpochTimePoint::AddMonths.
+// Note that the month total is narrowed to int before the call, so counts
+// outside the int range are not represented faithfully.
+#define ADD_TIMESTAMP_TO_INT64_MONTH_UNITS(TYPE, NAME, N_MONTHS) \
+  FORCE_INLINE \
+  gdv_##TYPE NAME##_##TYPE##_int64(gdv_##TYPE millis, gdv_int64 count) { \
+    EpochTimePoint tp(millis); \
+    return tp.AddMonths(static_cast<int>(count * (N_MONTHS))).MillisSinceEpoch(); \
+  }
+
+// The helpers below only combine the generator macros above; they emit each
+// function in both argument orders, (count, millis) and (millis, count).
+
+// int32 count, fixed-length unit, both argument orders.
+#define ADD_TIMESTAMP_INT32_FIXEDUNITS(TYPE, NAME, TO_MILLIS) \
+ ADD_INT32_TO_TIMESTAMP_FIXED_UNITS(TYPE, NAME, TO_MILLIS) \
+ ADD_TIMESTAMP_TO_INT32_FIXED_UNITS(TYPE, NAME, TO_MILLIS)
+
+// int32 count, month-based unit, both argument orders.
+#define ADD_TIMESTAMP_INT32_MONTHUNITS(TYPE, NAME, N_MONTHS) \
+ ADD_INT32_TO_TIMESTAMP_MONTH_UNITS(TYPE, NAME, N_MONTHS) \
+ ADD_TIMESTAMP_TO_INT32_MONTH_UNITS(TYPE, NAME, N_MONTHS)
+
+// Every timestampadd<Unit> function taking an int32 count for TYPE.
+#define TIMESTAMP_ADD_INT32(TYPE) \
+ ADD_TIMESTAMP_INT32_FIXEDUNITS(TYPE, timestampaddSecond, MILLIS_IN_SEC) \
+ ADD_TIMESTAMP_INT32_FIXEDUNITS(TYPE, timestampaddMinute, MILLIS_IN_MIN) \
+ ADD_TIMESTAMP_INT32_FIXEDUNITS(TYPE, timestampaddHour, MILLIS_IN_HOUR) \
+ ADD_TIMESTAMP_INT32_FIXEDUNITS(TYPE, timestampaddDay, MILLIS_IN_DAY) \
+ ADD_TIMESTAMP_INT32_FIXEDUNITS(TYPE, timestampaddWeek, MILLIS_IN_WEEK) \
+ ADD_TIMESTAMP_INT32_MONTHUNITS(TYPE, timestampaddMonth, 1) \
+ ADD_TIMESTAMP_INT32_MONTHUNITS(TYPE, timestampaddQuarter, 3) \
+ ADD_TIMESTAMP_INT32_MONTHUNITS(TYPE, timestampaddYear, 12)
+
+// int64 count, fixed-length unit, both argument orders.
+#define ADD_TIMESTAMP_INT64_FIXEDUNITS(TYPE, NAME, TO_MILLIS) \
+ ADD_INT64_TO_TIMESTAMP_FIXED_UNITS(TYPE, NAME, TO_MILLIS) \
+ ADD_TIMESTAMP_TO_INT64_FIXED_UNITS(TYPE, NAME, TO_MILLIS)
+
+// int64 count, month-based unit, both argument orders.
+#define ADD_TIMESTAMP_INT64_MONTHUNITS(TYPE, NAME, N_MONTHS) \
+ ADD_INT64_TO_TIMESTAMP_MONTH_UNITS(TYPE, NAME, N_MONTHS) \
+ ADD_TIMESTAMP_TO_INT64_MONTH_UNITS(TYPE, NAME, N_MONTHS)
+
+// Every timestampadd<Unit> function taking an int64 count for TYPE.
+#define TIMESTAMP_ADD_INT64(TYPE) \
+ ADD_TIMESTAMP_INT64_FIXEDUNITS(TYPE, timestampaddSecond, MILLIS_IN_SEC) \
+ ADD_TIMESTAMP_INT64_FIXEDUNITS(TYPE, timestampaddMinute, MILLIS_IN_MIN) \
+ ADD_TIMESTAMP_INT64_FIXEDUNITS(TYPE, timestampaddHour, MILLIS_IN_HOUR) \
+ ADD_TIMESTAMP_INT64_FIXEDUNITS(TYPE, timestampaddDay, MILLIS_IN_DAY) \
+ ADD_TIMESTAMP_INT64_FIXEDUNITS(TYPE, timestampaddWeek, MILLIS_IN_WEEK) \
+ ADD_TIMESTAMP_INT64_MONTHUNITS(TYPE, timestampaddMonth, 1) \
+ ADD_TIMESTAMP_INT64_MONTHUNITS(TYPE, timestampaddQuarter, 3) \
+ ADD_TIMESTAMP_INT64_MONTHUNITS(TYPE, timestampaddYear, 12)
+
+// The full timestampadd family (int32 and int64 counts) for TYPE.
+#define TIMESTAMP_ADD_INT(TYPE) \
+ TIMESTAMP_ADD_INT32(TYPE) \
+ TIMESTAMP_ADD_INT64(TYPE)
+
+// timestampadd<Unit> for date64 and timestamp: int32 and int64 counts, both
+// argument orders.
+TIMESTAMP_ADD_INT(date64)
+TIMESTAMP_ADD_INT(timestamp)
+
+// date_add / add with (gdv_int32 days, millis) arguments: adds whole days.
+ADD_INT32_TO_TIMESTAMP_FIXED_UNITS(date64, date_add, MILLIS_IN_DAY)
+ADD_INT32_TO_TIMESTAMP_FIXED_UNITS(date64, add, MILLIS_IN_DAY)
+ADD_INT32_TO_TIMESTAMP_FIXED_UNITS(timestamp, date_add, MILLIS_IN_DAY)
+ADD_INT32_TO_TIMESTAMP_FIXED_UNITS(timestamp, add, MILLIS_IN_DAY)
+
+// date_add / add with (gdv_int64 days, millis) arguments: adds whole days.
+ADD_INT64_TO_TIMESTAMP_FIXED_UNITS(date64, date_add, MILLIS_IN_DAY)
+ADD_INT64_TO_TIMESTAMP_FIXED_UNITS(date64, add, MILLIS_IN_DAY)
+ADD_INT64_TO_TIMESTAMP_FIXED_UNITS(timestamp, date_add, MILLIS_IN_DAY)
+ADD_INT64_TO_TIMESTAMP_FIXED_UNITS(timestamp, add, MILLIS_IN_DAY)
+
+// date_sub, subtract, date_diff with (millis, gdv_int32 days) arguments.
+// All three are generated identically: the negative unit length makes them
+// move `millis` back by the given number of days.
+ADD_TIMESTAMP_TO_INT32_FIXED_UNITS(date64, date_sub, -1 * MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT32_FIXED_UNITS(date64, subtract, -1 * MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT32_FIXED_UNITS(date64, date_diff, -1 * MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT32_FIXED_UNITS(timestamp, date_sub, -1 * MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT32_FIXED_UNITS(timestamp, subtract, -1 * MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT32_FIXED_UNITS(timestamp, date_diff, -1 * MILLIS_IN_DAY)
+
+// date_sub, subtract, date_diff with (millis, gdv_int64 days) arguments.
+ADD_TIMESTAMP_TO_INT64_FIXED_UNITS(date64, date_sub, -1 * MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT64_FIXED_UNITS(date64, subtract, -1 * MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT64_FIXED_UNITS(date64, date_diff, -1 * MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT64_FIXED_UNITS(timestamp, date_sub, -1 * MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT64_FIXED_UNITS(timestamp, subtract, -1 * MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT64_FIXED_UNITS(timestamp, date_diff, -1 * MILLIS_IN_DAY)
+
+// date_add / add with (millis, gdv_int32 days) arguments: adds whole days.
+ADD_TIMESTAMP_TO_INT32_FIXED_UNITS(date64, date_add, MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT32_FIXED_UNITS(date64, add, MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT32_FIXED_UNITS(timestamp, date_add, MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT32_FIXED_UNITS(timestamp, add, MILLIS_IN_DAY)
+
+// date_add / add with (millis, gdv_int64 days) arguments: adds whole days.
+ADD_TIMESTAMP_TO_INT64_FIXED_UNITS(date64, date_add, MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT64_FIXED_UNITS(date64, add, MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT64_FIXED_UNITS(timestamp, date_add, MILLIS_IN_DAY)
+ADD_TIMESTAMP_TO_INT64_FIXED_UNITS(timestamp, add, MILLIS_IN_DAY)
+
+} // extern "C"
diff --git a/src/arrow/cpp/src/gandiva/precompiled/types.h b/src/arrow/cpp/src/gandiva/precompiled/types.h
new file mode 100644
index 000000000..987ee2c6d
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled/types.h
@@ -0,0 +1,592 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "gandiva/gdv_function_stubs.h"
+
+// Aliases named after the arrow data types; makes it easy to write pre-processor macros.
+using gdv_boolean = bool;
+using gdv_int8 = int8_t;
+using gdv_int16 = int16_t;
+using gdv_int32 = int32_t;
+using gdv_int64 = int64_t;
+using gdv_uint8 = uint8_t;
+using gdv_uint16 = uint16_t;
+using gdv_uint32 = uint32_t;
+using gdv_uint64 = uint64_t;
+using gdv_float32 = float;
+using gdv_float64 = double;
+using gdv_date64 = int64_t;  // plain alias: the same C++ type as gdv_int64
+using gdv_date32 = int32_t;  // plain alias: the same C++ type as gdv_int32
+using gdv_time32 = int32_t;  // plain alias: the same C++ type as gdv_int32
+using gdv_timestamp = int64_t;  // int64_t; the extract*_timestamp parameters name it "millis"
+using gdv_utf8 = char*;
+using gdv_binary = char*;  // same underlying type as gdv_utf8
+using gdv_day_time_interval = int64_t;  // plain alias: the same C++ type as gdv_int64
+
+#ifdef GANDIVA_UNIT_TEST
+// Unit tests may be compiled without -O2, so inlining may not happen: expand to nothing.
+#define FORCE_INLINE
+#else  // !GANDIVA_UNIT_TEST
+#define FORCE_INLINE __attribute__((always_inline))
+#endif  // GANDIVA_UNIT_TEST
+
+extern "C" {
+
+bool bitMapGetBit(const unsigned char* bmap, int64_t position);
+void bitMapSetBit(unsigned char* bmap, int64_t position, bool value);
+void bitMapClearBitIfFalse(unsigned char* bmap, int64_t position, bool value);
+
+gdv_int64 extractMillennium_timestamp(gdv_timestamp millis);
+gdv_int64 extractCentury_timestamp(gdv_timestamp millis);
+gdv_int64 extractDecade_timestamp(gdv_timestamp millis);
+gdv_int64 extractYear_timestamp(gdv_timestamp millis);
+gdv_int64 extractDoy_timestamp(gdv_timestamp millis);
+gdv_int64 extractQuarter_timestamp(gdv_timestamp millis);
+gdv_int64 extractMonth_timestamp(gdv_timestamp millis);
+gdv_int64 extractWeek_timestamp(gdv_timestamp millis);
+gdv_int64 extractDow_timestamp(gdv_timestamp millis);
+gdv_int64 extractDay_timestamp(gdv_timestamp millis);
+gdv_int64 extractHour_timestamp(gdv_timestamp millis);
+gdv_int64 extractMinute_timestamp(gdv_timestamp millis);
+gdv_int64 extractSecond_timestamp(gdv_timestamp millis);
+gdv_int64 extractHour_time32(gdv_int32 millis_in_day);
+gdv_int64 extractMinute_time32(gdv_int32 millis_in_day);
+gdv_int64 extractSecond_time32(gdv_int32 millis_in_day);
+
+gdv_int32 hash32(double val, gdv_int32 seed);
+gdv_int32 hash32_buf(const gdv_uint8* buf, int len, gdv_int32 seed);
+gdv_int64 hash64(double val, gdv_int64 seed);
+gdv_int64 hash64_buf(const gdv_uint8* buf, int len, gdv_int64 seed);
+
+gdv_int32 timestampdiffMonth_timestamp_timestamp(gdv_timestamp, gdv_timestamp);
+
+gdv_int64 timestampaddSecond_int32_timestamp(gdv_int32, gdv_timestamp);
+gdv_int64 timestampaddMinute_int32_timestamp(gdv_int32, gdv_timestamp);
+gdv_int64 timestampaddHour_int32_timestamp(gdv_int32, gdv_timestamp);
+gdv_int64 timestampaddDay_int32_timestamp(gdv_int32, gdv_timestamp);
+gdv_int64 timestampaddWeek_int32_timestamp(gdv_int32, gdv_timestamp);
+gdv_int64 timestampaddMonth_int32_timestamp(gdv_int32, gdv_timestamp);
+gdv_int64 timestampaddQuarter_int32_timestamp(gdv_int32, gdv_timestamp);
+gdv_int64 timestampaddYear_int32_timestamp(gdv_int32, gdv_timestamp);
+
+gdv_int64 timestampaddSecond_timestamp_int32(gdv_timestamp, gdv_int32);
+gdv_int64 timestampaddMinute_timestamp_int32(gdv_timestamp, gdv_int32);
+gdv_int64 timestampaddHour_timestamp_int32(gdv_timestamp, gdv_int32);
+gdv_int64 timestampaddDay_timestamp_int32(gdv_timestamp, gdv_int32);
+gdv_int64 timestampaddWeek_timestamp_int32(gdv_timestamp, gdv_int32);
+gdv_int64 timestampaddMonth_timestamp_int32(gdv_timestamp, gdv_int32);
+gdv_int64 timestampaddQuarter_timestamp_int32(gdv_timestamp, gdv_int32);
+gdv_int64 timestampaddYear_timestamp_int32(gdv_timestamp, gdv_int32);
+
+gdv_int64 timestampaddSecond_int64_timestamp(gdv_int64, gdv_timestamp);
+gdv_int64 timestampaddMinute_int64_timestamp(gdv_int64, gdv_timestamp);
+gdv_int64 timestampaddHour_int64_timestamp(gdv_int64, gdv_timestamp);
+gdv_int64 timestampaddDay_int64_timestamp(gdv_int64, gdv_timestamp);
+gdv_int64 timestampaddWeek_int64_timestamp(gdv_int64, gdv_timestamp);
+gdv_int64 timestampaddMonth_int64_timestamp(gdv_int64, gdv_timestamp);
+gdv_int64 timestampaddQuarter_int64_timestamp(gdv_int64, gdv_timestamp);
+gdv_int64 timestampaddYear_int64_timestamp(gdv_int64, gdv_timestamp);
+
+gdv_int64 timestampaddSecond_timestamp_int64(gdv_timestamp, gdv_int64);
+gdv_int64 timestampaddMinute_timestamp_int64(gdv_timestamp, gdv_int64);
+gdv_int64 timestampaddHour_timestamp_int64(gdv_timestamp, gdv_int64);
+gdv_int64 timestampaddDay_timestamp_int64(gdv_timestamp, gdv_int64);
+gdv_int64 timestampaddWeek_timestamp_int64(gdv_timestamp, gdv_int64);
+gdv_int64 timestampaddMonth_timestamp_int64(gdv_timestamp, gdv_int64);
+gdv_int64 timestampaddQuarter_timestamp_int64(gdv_timestamp, gdv_int64);
+gdv_int64 timestampaddYear_timestamp_int64(gdv_timestamp, gdv_int64);
+
+gdv_int64 date_add_int32_timestamp(gdv_int32, gdv_timestamp);
+gdv_int64 add_int64_timestamp(gdv_int64, gdv_timestamp);
+gdv_int64 add_int32_timestamp(gdv_int32, gdv_timestamp);
+gdv_int64 date_add_int64_timestamp(gdv_int64, gdv_timestamp);
+gdv_timestamp add_date64_int64(gdv_date64, gdv_int64);
+
+gdv_timestamp to_timestamp_int32(gdv_int32);
+gdv_timestamp to_timestamp_int64(gdv_int64);
+gdv_timestamp to_timestamp_float32(gdv_float32);
+gdv_timestamp to_timestamp_float64(gdv_float64);
+
+gdv_time32 to_time_int32(gdv_int32);
+gdv_time32 to_time_int64(gdv_int64);
+gdv_time32 to_time_float32(gdv_float32);
+gdv_time32 to_time_float64(gdv_float64);
+
+gdv_int64 date_sub_timestamp_int32(gdv_timestamp, gdv_int32);
+gdv_int64 subtract_timestamp_int32(gdv_timestamp, gdv_int32);
+gdv_int64 date_diff_timestamp_int64(gdv_timestamp, gdv_int64);
+
+gdv_boolean castBIT_utf8(gdv_int64 context, const char* data, gdv_int32 data_len);
+
+bool is_distinct_from_timestamp_timestamp(gdv_int64, bool, gdv_int64, bool);
+bool is_not_distinct_from_int32_int32(gdv_int32, bool, gdv_int32, bool);
+
+gdv_int64 date_trunc_Second_date64(gdv_date64);
+gdv_int64 date_trunc_Minute_date64(gdv_date64);
+gdv_int64 date_trunc_Hour_date64(gdv_date64);
+gdv_int64 date_trunc_Day_date64(gdv_date64);
+gdv_int64 date_trunc_Month_date64(gdv_date64);
+gdv_int64 date_trunc_Quarter_date64(gdv_date64);
+gdv_int64 date_trunc_Year_date64(gdv_date64);
+gdv_int64 date_trunc_Decade_date64(gdv_date64);
+gdv_int64 date_trunc_Century_date64(gdv_date64);
+gdv_int64 date_trunc_Millennium_date64(gdv_date64);
+// NOTE(review): months_between_* below takes gdv_uint64, not gdv_timestamp (int64_t).
+gdv_int64 date_trunc_Week_timestamp(gdv_timestamp);
+double months_between_timestamp_timestamp(gdv_uint64, gdv_uint64);
+
+gdv_int32 mem_compare(const char* left, gdv_int32 left_len, const char* right,
+ gdv_int32 right_len);
+
+gdv_int32 mod_int64_int32(gdv_int64 left, gdv_int32 right);
+gdv_float64 mod_float64_float64(gdv_int64 context, gdv_float64 left, gdv_float64 right);
+
+gdv_int64 divide_int64_int64(gdv_int64 context, gdv_int64 in1, gdv_int64 in2);
+
+gdv_int64 div_int64_int64(gdv_int64 context, gdv_int64 in1, gdv_int64 in2);
+gdv_float32 div_float32_float32(gdv_int64 context, gdv_float32 in1, gdv_float32 in2);
+gdv_float64 div_float64_float64(gdv_int64 context, gdv_float64 in1, gdv_float64 in2);
+
+gdv_float32 round_float32(gdv_float32);
+gdv_float64 round_float64(gdv_float64);
+gdv_float32 round_float32_int32(gdv_float32 number, gdv_int32 out_scale);
+gdv_float64 round_float64_int32(gdv_float64 number, gdv_int32 out_scale);
+gdv_float64 get_scale_multiplier(gdv_int32);
+gdv_int32 round_int32_int32(gdv_int32 number, gdv_int32 precision);
+gdv_int64 round_int64_int32(gdv_int64 number, gdv_int32 precision);
+gdv_int32 round_int32(gdv_int32);
+gdv_int64 round_int64(gdv_int64);
+gdv_int64 get_power_of_10(gdv_int32);
+
+const char* bin_int32(int64_t context, gdv_int32 value, int32_t* out_len);
+const char* bin_int64(int64_t context, gdv_int64 value, int32_t* out_len);
+
+gdv_float64 cbrt_int32(gdv_int32);
+gdv_float64 cbrt_int64(gdv_int64);
+gdv_float64 cbrt_float32(gdv_float32);
+gdv_float64 cbrt_float64(gdv_float64);
+
+gdv_float64 exp_int32(gdv_int32);
+gdv_float64 exp_int64(gdv_int64);
+gdv_float64 exp_float32(gdv_float32);
+gdv_float64 exp_float64(gdv_float64);
+
+gdv_float64 log_int32(gdv_int32);
+gdv_float64 log_int64(gdv_int64);
+gdv_float64 log_float32(gdv_float32);
+gdv_float64 log_float64(gdv_float64);
+
+gdv_float64 log10_int32(gdv_int32);
+gdv_float64 log10_int64(gdv_int64);
+gdv_float64 log10_float32(gdv_float32);
+gdv_float64 log10_float64(gdv_float64);
+
+gdv_float64 sin_int32(gdv_int32);
+gdv_float64 sin_int64(gdv_int64);
+gdv_float64 sin_float32(gdv_float32);
+gdv_float64 sin_float64(gdv_float64);
+gdv_float64 cos_int32(gdv_int32);
+gdv_float64 cos_int64(gdv_int64);
+gdv_float64 cos_float32(gdv_float32);
+gdv_float64 cos_float64(gdv_float64);
+gdv_float64 asin_int32(gdv_int32);
+gdv_float64 asin_int64(gdv_int64);
+gdv_float64 asin_float32(gdv_float32);
+gdv_float64 asin_float64(gdv_float64);
+gdv_float64 acos_int32(gdv_int32);
+gdv_float64 acos_int64(gdv_int64);
+gdv_float64 acos_float32(gdv_float32);
+gdv_float64 acos_float64(gdv_float64);
+gdv_float64 tan_int32(gdv_int32);
+gdv_float64 tan_int64(gdv_int64);
+gdv_float64 tan_float32(gdv_float32);
+gdv_float64 tan_float64(gdv_float64);
+gdv_float64 atan_int32(gdv_int32);
+gdv_float64 atan_int64(gdv_int64);
+gdv_float64 atan_float32(gdv_float32);
+gdv_float64 atan_float64(gdv_float64);
+gdv_float64 sinh_int32(gdv_int32);
+gdv_float64 sinh_int64(gdv_int64);
+gdv_float64 sinh_float32(gdv_float32);
+gdv_float64 sinh_float64(gdv_float64);
+gdv_float64 cosh_int32(gdv_int32);
+gdv_float64 cosh_int64(gdv_int64);
+gdv_float64 cosh_float32(gdv_float32);
+gdv_float64 cosh_float64(gdv_float64);
+gdv_float64 tanh_int32(gdv_int32);
+gdv_float64 tanh_int64(gdv_int64);
+gdv_float64 tanh_float32(gdv_float32);
+gdv_float64 tanh_float64(gdv_float64);
+gdv_float64 atan2_int32_int32(gdv_int32 in1, gdv_int32 in2);
+gdv_float64 atan2_int64_int64(gdv_int64 in1, gdv_int64 in2);
+gdv_float64 atan2_float32_float32(gdv_float32 in1, gdv_float32 in2);
+gdv_float64 atan2_float64_float64(gdv_float64 in1, gdv_float64 in2);
+gdv_float64 cot_float32(gdv_float32);
+gdv_float64 cot_float64(gdv_float64);
+gdv_float64 radians_int32(gdv_int32);
+gdv_float64 radians_int64(gdv_int64);
+gdv_float64 radians_float32(gdv_float32);
+gdv_float64 radians_float64(gdv_float64);
+gdv_float64 degrees_int32(gdv_int32);
+gdv_float64 degrees_int64(gdv_int64);
+gdv_float64 degrees_float32(gdv_float32);
+gdv_float64 degrees_float64(gdv_float64);
+
+gdv_int32 bitwise_and_int32_int32(gdv_int32 in1, gdv_int32 in2);
+gdv_int64 bitwise_and_int64_int64(gdv_int64 in1, gdv_int64 in2);
+gdv_int32 bitwise_or_int32_int32(gdv_int32 in1, gdv_int32 in2);
+gdv_int64 bitwise_or_int64_int64(gdv_int64 in1, gdv_int64 in2);
+gdv_int32 bitwise_xor_int32_int32(gdv_int32 in1, gdv_int32 in2);
+gdv_int64 bitwise_xor_int64_int64(gdv_int64 in1, gdv_int64 in2);
+gdv_int32 bitwise_not_int32(gdv_int32);
+gdv_int64 bitwise_not_int64(gdv_int64);
+
+gdv_float64 power_float64_float64(gdv_float64, gdv_float64);
+
+gdv_float64 log_int32_int32(gdv_int64 context, gdv_int32 base, gdv_int32 value);
+
+bool starts_with_utf8_utf8(const char* data, gdv_int32 data_len, const char* prefix,
+ gdv_int32 prefix_len);
+bool ends_with_utf8_utf8(const char* data, gdv_int32 data_len, const char* suffix,
+ gdv_int32 suffix_len);
+bool is_substr_utf8_utf8(const char* data, gdv_int32 data_len, const char* substr,
+ gdv_int32 substr_len);
+
+gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len);
+
+gdv_int32 utf8_last_char_pos(gdv_int64 context, const char* data, gdv_int32 data_len);
+
+gdv_date64 castDATE_utf8(int64_t execution_context, const char* input, gdv_int32 length);
+
+gdv_date64 castDATE_int64(gdv_int64 date);
+
+gdv_date64 castDATE_date32(gdv_date32 date);
+// Unlike the castDATE_* declarations above, this one returns gdv_date32, not gdv_date64.
+gdv_date32 castDATE_int32(gdv_int32 date);
+
+gdv_timestamp castTIMESTAMP_utf8(int64_t execution_context, const char* input,
+ gdv_int32 length);
+gdv_timestamp castTIMESTAMP_date64(gdv_date64);
+gdv_timestamp castTIMESTAMP_int64(gdv_int64);
+gdv_date64 castDATE_timestamp(gdv_timestamp);
+gdv_time32 castTIME_timestamp(gdv_timestamp timestamp_in_millis);
+const char* castVARCHAR_timestamp_int64(int64_t, gdv_timestamp, gdv_int64, gdv_int32*);
+gdv_date64 last_day_from_timestamp(gdv_date64 millis);
+
+gdv_int64 truncate_int64_int32(gdv_int64 in, gdv_int32 out_scale);
+
+const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_len,
+ gdv_int32 repeat_times, gdv_int32* out_len);
+
+const char* substr_utf8_int64_int64(gdv_int64 context, const char* input,
+ gdv_int32 in_len, gdv_int64 offset64,
+ gdv_int64 length, gdv_int32* out_len);
+const char* substr_utf8_int64(gdv_int64 context, const char* input, gdv_int32 in_len,
+ gdv_int64 offset64, gdv_int32* out_len);
+
+const char* concat_utf8_utf8(gdv_int64 context, const char* left, gdv_int32 left_len,
+ bool left_validity, const char* right, gdv_int32 right_len,
+ bool right_validity, gdv_int32* out_len);
+const char* concat_utf8_utf8_utf8(gdv_int64 context, const char* in1, gdv_int32 in1_len,
+ bool in1_validity, const char* in2, gdv_int32 in2_len,
+ bool in2_validity, const char* in3, gdv_int32 in3_len,
+ bool in3_validity, gdv_int32* out_len);
+const char* concat_utf8_utf8_utf8_utf8(gdv_int64 context, const char* in1,
+ gdv_int32 in1_len, bool in1_validity,
+ const char* in2, gdv_int32 in2_len,
+ bool in2_validity, const char* in3,
+ gdv_int32 in3_len, bool in3_validity,
+ const char* in4, gdv_int32 in4_len,
+ bool in4_validity, gdv_int32* out_len);
+const char* space_int32(gdv_int64 ctx, gdv_int32 n, int32_t* out_len);
+const char* space_int64(gdv_int64 ctx, gdv_int64 n, int32_t* out_len);
+const char* concat_utf8_utf8_utf8_utf8_utf8(
+ gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
+ const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
+ gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
+ bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
+ gdv_int32* out_len);
+const char* concat_utf8_utf8_utf8_utf8_utf8_utf8(
+ gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
+ const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
+ gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
+ bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
+ const char* in6, gdv_int32 in6_len, bool in6_validity, gdv_int32* out_len);
+const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
+ const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
+ gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
+ bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
+ const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
+ gdv_int32 in7_len, bool in7_validity, gdv_int32* out_len);
+const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
+ const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
+ gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
+ bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
+ const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
+ gdv_int32 in7_len, bool in7_validity, const char* in8, gdv_int32 in8_len,
+ bool in8_validity, gdv_int32* out_len);
+const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
+ const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
+ gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
+ bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
+ const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
+ gdv_int32 in7_len, bool in7_validity, const char* in8, gdv_int32 in8_len,
+ bool in8_validity, const char* in9, gdv_int32 in9_len, bool in9_validity,
+ gdv_int32* out_len);
+const char* concat_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ gdv_int64 context, const char* in1, gdv_int32 in1_len, bool in1_validity,
+ const char* in2, gdv_int32 in2_len, bool in2_validity, const char* in3,
+ gdv_int32 in3_len, bool in3_validity, const char* in4, gdv_int32 in4_len,
+ bool in4_validity, const char* in5, gdv_int32 in5_len, bool in5_validity,
+ const char* in6, gdv_int32 in6_len, bool in6_validity, const char* in7,
+ gdv_int32 in7_len, bool in7_validity, const char* in8, gdv_int32 in8_len,
+ bool in8_validity, const char* in9, gdv_int32 in9_len, bool in9_validity,
+ const char* in10, gdv_int32 in10_len, bool in10_validity, gdv_int32* out_len);
+
+const char* concatOperator_utf8_utf8(gdv_int64 context, const char* left,
+ gdv_int32 left_len, const char* right,
+ gdv_int32 right_len, gdv_int32* out_len);
+const char* concatOperator_utf8_utf8_utf8(gdv_int64 context, const char* in1,
+ gdv_int32 in1_len, const char* in2,
+ gdv_int32 in2_len, const char* in3,
+ gdv_int32 in3_len, gdv_int32* out_len);
+const char* concatOperator_utf8_utf8_utf8_utf8(gdv_int64 context, const char* in1,
+ gdv_int32 in1_len, const char* in2,
+ gdv_int32 in2_len, const char* in3,
+ gdv_int32 in3_len, const char* in4,
+ gdv_int32 in4_len, gdv_int32* out_len);
+const char* concatOperator_utf8_utf8_utf8_utf8_utf8(
+ gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
+ gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
+ gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, gdv_int32* out_len);
+const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8(
+ gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
+ gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
+ gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
+ gdv_int32 in6_len, gdv_int32* out_len);
+const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
+ gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
+ gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
+ gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, gdv_int32* out_len);
+const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
+ gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
+ gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
+ gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, const char* in8,
+ gdv_int32 in8_len, gdv_int32* out_len);
+const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
+ gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
+ gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
+ gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, const char* in8,
+ gdv_int32 in8_len, const char* in9, gdv_int32 in9_len, gdv_int32* out_len);
+const char* concatOperator_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8_utf8(
+ gdv_int64 context, const char* in1, gdv_int32 in1_len, const char* in2,
+ gdv_int32 in2_len, const char* in3, gdv_int32 in3_len, const char* in4,
+ gdv_int32 in4_len, const char* in5, gdv_int32 in5_len, const char* in6,
+ gdv_int32 in6_len, const char* in7, gdv_int32 in7_len, const char* in8,
+ gdv_int32 in8_len, const char* in9, gdv_int32 in9_len, const char* in10,
+ gdv_int32 in10_len, gdv_int32* out_len);
+
+const char* castVARCHAR_binary_int64(gdv_int64 context, const char* data,
+ gdv_int32 data_len, int64_t out_len,
+ int32_t* out_length);
+// NOTE: in these four casts out_len is a by-value int64_t; the int32_t* is out_length.
+const char* castVARCHAR_utf8_int64(gdv_int64 context, const char* data,
+ gdv_int32 data_len, int64_t out_len,
+ int32_t* out_length);
+
+const char* castVARBINARY_utf8_int64(gdv_int64 context, const char* data,
+ gdv_int32 data_len, int64_t out_len,
+ int32_t* out_length);
+
+const char* castVARBINARY_binary_int64(gdv_int64 context, const char* data,
+ gdv_int32 data_len, int64_t out_len,
+ int32_t* out_length);
+
+const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+ int32_t* out_len);
+
+const char* ltrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+ int32_t* out_len);
+
+const char* rtrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+ int32_t* out_len);
+
+const char* btrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+ int32_t* out_len);
+
+const char* ltrim_utf8_utf8(gdv_int64 context, const char* basetext,
+ gdv_int32 basetext_len, const char* trimtext,
+ gdv_int32 trimtext_len, int32_t* out_len);
+
+const char* rtrim_utf8_utf8(gdv_int64 context, const char* basetext,
+ gdv_int32 basetext_len, const char* trimtext,
+ gdv_int32 trimtext_len, int32_t* out_len);
+
+const char* btrim_utf8_utf8(gdv_int64 context, const char* basetext,
+ gdv_int32 basetext_len, const char* trimtext,
+ gdv_int32 trimtext_len, int32_t* out_len);
+
+gdv_int32 ascii_utf8(const char* data, gdv_int32 data_len);
+
+gdv_int32 locate_utf8_utf8(gdv_int64 context, const char* sub_str, gdv_int32 sub_str_len,
+ const char* str, gdv_int32 str_len);
+
+gdv_int32 strpos_utf8_utf8(gdv_int64 context, const char* str, gdv_int32 str_len,
+ const char* sub_str, gdv_int32 sub_str_len);
+
+gdv_int32 locate_utf8_utf8_int32(gdv_int64 context, const char* sub_str,
+ gdv_int32 sub_str_len, const char* str,
+ gdv_int32 str_len, gdv_int32 start_pos);
+
+const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
+ gdv_int32 return_length, const char* fill_text,
+ gdv_int32 fill_text_len, gdv_int32* out_len);
+
+const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
+ gdv_int32 return_length, const char* fill_text,
+ gdv_int32 fill_text_len, gdv_int32* out_len);
+
+const char* lpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
+ gdv_int32 return_length, gdv_int32* out_len);
+
+const char* rpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
+ gdv_int32 return_length, gdv_int32* out_len);
+
+const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text,
+ gdv_int32 text_len, const char* from_str,
+ gdv_int32 from_str_len,
+ const char* to_str, gdv_int32 to_str_len,
+ gdv_int32 max_length, gdv_int32* out_len);
+
+const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
+ gdv_int32 text_len, const char* from_str,
+ gdv_int32 from_str_len, const char* to_str,
+ gdv_int32 to_str_len, gdv_int32* out_len);
+
+const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const char* text_in,
+ int32_t text_len,
+ const char* char_to_replace,
+ int32_t char_to_replace_len,
+ int32_t* out_len);
+
+const char* convert_toDOUBLE(int64_t context, double value, int32_t* out_len);
+
+const char* convert_toDOUBLE_be(int64_t context, double value, int32_t* out_len);
+
+const char* convert_toFLOAT(int64_t context, float value, int32_t* out_len);
+
+const char* convert_toFLOAT_be(int64_t context, float value, int32_t* out_len);
+
+const char* convert_toBIGINT(int64_t context, int64_t value, int32_t* out_len);
+
+const char* convert_toBIGINT_be(int64_t context, int64_t value, int32_t* out_len);
+
+const char* convert_toINT(int64_t context, int32_t value, int32_t* out_len);
+
+const char* convert_toINT_be(int64_t context, int32_t value, int32_t* out_len);
+
+const char* convert_toBOOLEAN(int64_t context, bool value, int32_t* out_len);
+
+const char* convert_toTIME_EPOCH(int64_t context, int32_t value, int32_t* out_len);
+
+const char* convert_toTIME_EPOCH_be(int64_t context, int32_t value, int32_t* out_len);
+
+const char* convert_toTIMESTAMP_EPOCH(int64_t context, int64_t timestamp,
+ int32_t* out_len);
+const char* convert_toTIMESTAMP_EPOCH_be(int64_t context, int64_t timestamp,
+ int32_t* out_len);
+
+const char* convert_toDATE_EPOCH(int64_t context, int64_t date, int32_t* out_len);
+
+const char* convert_toDATE_EPOCH_be(int64_t context, int64_t date, int32_t* out_len);
+
+const char* convert_toUTF8(int64_t context, const char* value, int32_t value_len,
+ int32_t* out_len);
+
+const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len,
+ const char* splitter, gdv_int32 split_len, gdv_int32 index,
+ gdv_int32* out_len);
+
+const char* byte_substr_binary_int32_int32(gdv_int64 context, const char* text,
+ gdv_int32 text_len, gdv_int32 offset,
+ gdv_int32 length, gdv_int32* out_len);
+
+const char* castVARCHAR_bool_int64(gdv_int64 context, gdv_boolean value,
+ gdv_int64 out_len, gdv_int32* out_length);
+
+const char* castVARCHAR_int32_int64(int64_t context, int32_t value, int64_t len,
+ int32_t* out_len);
+
+const char* castVARCHAR_int64_int64(int64_t context, int64_t value, int64_t len,
+ int32_t* out_len);
+
+const char* castVARCHAR_float32_int64(int64_t context, float value, int64_t len,
+ int32_t* out_len);
+
+const char* castVARCHAR_float64_int64(int64_t context, double value, int64_t len,
+ int32_t* out_len);
+
+const char* left_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
+ gdv_int32 number, gdv_int32* out_len);
+
+const char* right_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
+ gdv_int32 number, gdv_int32* out_len);
+
+const char* binary_string(gdv_int64 context, const char* text, gdv_int32 text_len,
+ gdv_int32* out_len);
+
+int32_t castINT_utf8(int64_t context, const char* data, int32_t len);
+
+int64_t castBIGINT_utf8(int64_t context, const char* data, int32_t len);
+
+float castFLOAT4_utf8(int64_t context, const char* data, int32_t len);
+
+double castFLOAT8_utf8(int64_t context, const char* data, int32_t len);
+
+int32_t castINT_float32(gdv_float32 value);
+
+int32_t castINT_float64(gdv_float64 value);
+
+int64_t castBIGINT_float32(gdv_float32 value);
+
+int64_t castBIGINT_float64(gdv_float64 value);
+
+int64_t castBIGINT_daytimeinterval(gdv_day_time_interval in);
+// NOTE(review): gdv_month_interval is not aliased in this header; confirm its source.
+int32_t castINT_year_interval(gdv_month_interval in);
+
+int64_t castBIGINT_year_interval(gdv_month_interval in);
+
+gdv_day_time_interval castNULLABLEINTERVALDAY_int32(gdv_int32 in);
+
+gdv_day_time_interval castNULLABLEINTERVALDAY_int64(gdv_int64 in);
+
+gdv_month_interval castNULLABLEINTERVALYEAR_int32(int64_t context, gdv_int32 in);
+
+gdv_month_interval castNULLABLEINTERVALYEAR_int64(int64_t context, gdv_int64 in);
+
+} // extern "C"
diff --git a/src/arrow/cpp/src/gandiva/precompiled_bitcode.cc.in b/src/arrow/cpp/src/gandiva/precompiled_bitcode.cc.in
new file mode 100644
index 000000000..9c382961d
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/precompiled_bitcode.cc.in
@@ -0,0 +1,26 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <string>
+
+namespace gandiva {
+
+// Content of precompiled bitcode file; <DATA_CHARS> is a placeholder in this .in template.
+extern const unsigned char kPrecompiledBitcode[] = { <DATA_CHARS> };
+extern const size_t kPrecompiledBitcodeSize = sizeof(kPrecompiledBitcode);  // size in bytes
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/projector.cc b/src/arrow/cpp/src/gandiva/projector.cc
new file mode 100644
index 000000000..ff167538f
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/projector.cc
@@ -0,0 +1,369 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/projector.h"
+
+#include <memory>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "arrow/util/hash_util.h"
+#include "arrow/util/logging.h"
+
+#include "gandiva/cache.h"
+#include "gandiva/expr_validator.h"
+#include "gandiva/llvm_generator.h"
+
+namespace gandiva {
+
+// Cache key for built projectors. Two keys are equal when their schema,
+// configuration, expression strings, selection vector mode and uniqifier all
+// match; the hash is computed once, in the constructor.
+class ProjectorCacheKey {
+ public:
+  ProjectorCacheKey(SchemaPtr schema, std::shared_ptr<Configuration> configuration,
+                    ExpressionVector expression_vector, SelectionVector::Mode mode)
+      : schema_(schema), configuration_(configuration), mode_(mode), uniqifier_(0) {
+    static const int kSeedValue = 4;
+    size_t seed = kSeedValue;
+
+    // Fold every expression (in order) into the hash and remember its text.
+    for (auto& expression : expression_vector) {
+      const std::string text = expression->ToString();
+      expressions_as_strings_.push_back(text);
+      arrow::internal::hash_combine(seed, text);
+      UpdateUniqifier(text);
+    }
+
+    // Then the remaining key components, in a fixed order.
+    arrow::internal::hash_combine(seed, static_cast<size_t>(mode));
+    arrow::internal::hash_combine(seed, configuration->Hash());
+    arrow::internal::hash_combine(seed, schema_->ToString());
+    arrow::internal::hash_combine(seed, uniqifier_);
+    hash_code_ = seed;
+  }
+
+  std::size_t Hash() const { return hash_code_; }
+
+  bool operator==(const ProjectorCacheKey& other) const {
+    // arrow schema does not overload equality operators, so go through Equals().
+    const bool same_schema = schema_->Equals(*other.schema_, true);
+    return same_schema && !(*configuration_ != *other.configuration_) &&
+           !(expressions_as_strings_ != other.expressions_as_strings_) &&
+           !(mode_ != other.mode_) && !(uniqifier_ != other.uniqifier_);
+  }
+
+  bool operator!=(const ProjectorCacheKey& other) const { return !(*this == other); }
+
+  SchemaPtr schema() const { return schema_; }
+
+  // Pretty-printed schema followed by the comma-separated expression strings.
+  std::string ToString() const {
+    std::stringstream out;
+    // indent, window, indent_size, null_rep and skip new lines.
+    arrow::PrettyPrintOptions options{0, 10, 2, "null", true};
+    DCHECK_OK(PrettyPrint(*schema_.get(), options, &out));
+
+    out << "Expressions: [";
+    const char* separator = "";
+    for (auto& text : expressions_as_strings_) {
+      out << separator << text;
+      separator = ", ";
+    }
+    out << "]";
+    return out.str();
+  }
+
+ private:
+  // Sets the uniqifier (once) for expressions that contain a like() call.
+  void UpdateUniqifier(const std::string& expr) {
+    if (uniqifier_ != 0) {
+      return;
+    }
+    // caching of expressions with re2 patterns causes lock contention. So, use
+    // multiple instances to reduce contention.
+    if (expr.find(" like(") == std::string::npos) {
+      return;
+    }
+    uniqifier_ = std::hash<std::thread::id>()(std::this_thread::get_id()) % 16;
+  }
+
+  const SchemaPtr schema_;
+  const std::shared_ptr<Configuration> configuration_;
+  SelectionVector::Mode mode_;
+  std::vector<std::string> expressions_as_strings_;
+  size_t hash_code_;
+  uint32_t uniqifier_;
+};
+
+// Takes ownership of the built LLVM generator and records the schema, output
+// fields and configuration this projector was created with.
+Projector::Projector(std::unique_ptr<LLVMGenerator> llvm_generator, SchemaPtr schema,
+                     const FieldVector& output_fields,
+                     std::shared_ptr<Configuration> configuration)
+    : llvm_generator_(std::move(llvm_generator)),
+      // 'schema' and 'configuration' are passed by value, so move them into the
+      // members rather than copying them a second time.
+      schema_(std::move(schema)),
+      output_fields_(output_fields),
+      configuration_(std::move(configuration)) {}
+
+// Defined out of line in this file, which includes llvm_generator.h, so the
+// unique_ptr<LLVMGenerator> member is destroyed where the type is complete.
+Projector::~Projector() = default;
+
+// Convenience overload: no selection vector, default configuration.
+Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs,
+                       std::shared_ptr<Projector>* projector) {
+  const auto no_selection = SelectionVector::Mode::MODE_NONE;
+  return Projector::Make(schema, exprs, no_selection,
+                         ConfigurationBuilder::DefaultConfiguration(), projector);
+}
+
+// Convenience overload: no selection vector, caller-supplied configuration.
+Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs,
+                       std::shared_ptr<Configuration> configuration,
+                       std::shared_ptr<Projector>* projector) {
+  const auto no_selection = SelectionVector::Mode::MODE_NONE;
+  return Projector::Make(schema, exprs, no_selection, configuration, projector);
+}
+
+// Returns a projector for (schema, exprs, mode, configuration). An equivalent
+// projector found in the function-local static cache is reused; otherwise the
+// expressions are validated, compiled, and the new projector is cached together
+// with its build time in milliseconds.
+Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs,
+                       SelectionVector::Mode selection_vector_mode,
+                       std::shared_ptr<Configuration> configuration,
+                       std::shared_ptr<Projector>* projector) {
+  ARROW_RETURN_IF(schema == nullptr, Status::Invalid("Schema cannot be null"));
+  ARROW_RETURN_IF(exprs.empty(), Status::Invalid("Expressions cannot be empty"));
+  ARROW_RETURN_IF(configuration == nullptr,
+                  Status::Invalid("Configuration cannot be null"));
+
+  // Reuse an equivalent projector if one was already built.
+  static Cache<ProjectorCacheKey, std::shared_ptr<Projector>> cache;
+  ProjectorCacheKey key(schema, configuration, exprs, selection_vector_mode);
+  std::shared_ptr<Projector> hit = cache.GetModule(key);
+  if (hit != nullptr) {
+    *projector = hit;
+    return Status::OK();
+  }
+
+  // Create the LLVM generator that will hold the compiled expressions.
+  std::unique_ptr<LLVMGenerator> generator;
+  ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, &generator));
+
+  // Validate every expression first; a single invalid expression aborts the
+  // build since nothing further can be processed.
+  ExprValidator validator(generator->types(), schema);
+  for (auto& expression : exprs) {
+    ARROW_RETURN_NOT_OK(validator.Validate(expression));
+  }
+
+  // Compile, measuring how long the build takes.
+  auto build_start = std::chrono::high_resolution_clock::now();
+  ARROW_RETURN_NOT_OK(generator->Build(exprs, selection_vector_mode));
+  auto build_end = std::chrono::high_resolution_clock::now();
+  auto build_millis =
+      std::chrono::duration_cast<std::chrono::milliseconds>(build_end - build_start)
+          .count();
+
+  // Remember the result field of each expression; used for validation at
+  // Evaluate() time.
+  std::vector<FieldPtr> result_fields;
+  result_fields.reserve(exprs.size());
+  for (auto& expression : exprs) {
+    result_fields.push_back(expression->result());
+  }
+
+  // Hand the fully built generator to the new projector and cache it.
+  projector->reset(
+      new Projector(std::move(generator), schema, result_fields, configuration));
+  ValueCacheObject<std::shared_ptr<Projector>> cache_entry(*projector, build_millis);
+  cache.PutModule(key, cache_entry);
+
+  return Status::OK();
+}
+
+// Evaluate into caller-allocated outputs without a selection vector.
+Status Projector::Evaluate(const arrow::RecordBatch& batch,
+                           const ArrayDataVector& output_data_vecs) {
+  const SelectionVector* no_selection_vector = nullptr;
+  return Evaluate(batch, no_selection_vector, output_data_vecs);
+}
+
+// Evaluate into caller-allocated outputs, optionally restricted by a selection
+// vector. Checks the batch, the number of output vectors and the capacity of
+// each output before executing the compiled expressions.
+Status Projector::Evaluate(const arrow::RecordBatch& batch,
+                           const SelectionVector* selection_vector,
+                           const ArrayDataVector& output_data_vecs) {
+  ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch));
+
+  if (output_data_vecs.size() != output_fields_.size()) {
+    std::stringstream ss;
+    ss << "number of buffers for output_data_vecs is " << output_data_vecs.size()
+       << ", expected " << output_fields_.size();
+    return Status::Invalid(ss.str());
+  }
+
+  // The row count does not depend on the output being checked, so compute it
+  // once instead of on every loop iteration.
+  const auto num_rows =
+      selection_vector == nullptr ? batch.num_rows() : selection_vector->GetNumSlots();
+
+  int idx = 0;
+  for (auto& array_data : output_data_vecs) {
+    if (array_data == nullptr) {
+      std::stringstream ss;
+      // Leading space before "is null." keeps the field name readable.
+      ss << "array for output field " << output_fields_[idx]->name() << " is null.";
+      return Status::Invalid(ss.str());
+    }
+
+    ARROW_RETURN_NOT_OK(
+        ValidateArrayDataCapacity(*array_data, *(output_fields_[idx]), num_rows));
+    ++idx;
+  }
+  return llvm_generator_->Execute(batch, selection_vector, output_data_vecs);
+}
+
+// Evaluate into pool-allocated outputs without a selection vector.
+Status Projector::Evaluate(const arrow::RecordBatch& batch, arrow::MemoryPool* pool,
+                           arrow::ArrayVector* output) {
+  const SelectionVector* no_selection_vector = nullptr;
+  return Evaluate(batch, no_selection_vector, pool, output);
+}
+
+// Allocate one output per projected field from 'pool', execute the compiled
+// expressions, and replace the contents of 'output' with the resulting arrays.
+Status Projector::Evaluate(const arrow::RecordBatch& batch,
+                           const SelectionVector* selection_vector,
+                           arrow::MemoryPool* pool, arrow::ArrayVector* output) {
+  ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch));
+  ARROW_RETURN_IF(output == nullptr, Status::Invalid("Output must be non-null."));
+  ARROW_RETURN_IF(pool == nullptr, Status::Invalid("Memory pool must be non-null."));
+
+  // Without a selection vector every row of the batch is produced; with one,
+  // only as many rows as it has slots.
+  auto row_count =
+      selection_vector == nullptr ? batch.num_rows() : selection_vector->GetNumSlots();
+
+  // One ArrayData per output field.
+  ArrayDataVector allocated;
+  for (auto& out_field : output_fields_) {
+    ArrayDataPtr data;
+    ARROW_RETURN_NOT_OK(AllocArrayData(out_field->type(), row_count, pool, &data));
+    allocated.push_back(data);
+  }
+
+  // Run the expression(s).
+  ARROW_RETURN_NOT_OK(llvm_generator_->Execute(batch, selection_vector, allocated));
+
+  // Wrap each ArrayData in an Array and hand them back to the caller.
+  output->clear();
+  for (auto& data : allocated) {
+    output->push_back(arrow::MakeArray(data));
+  }
+  return Status::OK();
+}
+
+// TODO : handle complex vectors (list/map/..)
+// Allocates the buffers of an output ArrayData for 'num_records' records:
+// a validity bitmap, an offsets buffer for binary-like types, and a resizable
+// data buffer (sized for fixed-width types, empty for binary-like types).
+// Returns Status::Invalid for any other type.
+Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records,
+                                 arrow::MemoryPool* pool, ArrayDataPtr* array_data) {
+  std::vector<std::shared_ptr<arrow::Buffer>> buffers;
+
+  // The output vector always has a null bitmap.
+  int64_t size = arrow::BitUtil::BytesForBits(num_records);
+  ARROW_ASSIGN_OR_RAISE(auto bitmap_buffer, arrow::AllocateBuffer(size, pool));
+  buffers.push_back(std::move(bitmap_buffer));
+
+  // String/Binary vectors have an offsets array (32 bits per entry, one more
+  // entry than there are records).
+  auto type_id = type->id();
+  if (arrow::is_binary_like(type_id)) {
+    auto offsets_len = arrow::BitUtil::BytesForBits((num_records + 1) * 32);
+
+    ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, arrow::AllocateBuffer(offsets_len, pool));
+    buffers.push_back(std::move(offsets_buffer));
+  }
+
+  // The output vector always has a data array.
+  int64_t data_len;
+  if (arrow::is_primitive(type_id) || type_id == arrow::Type::DECIMAL) {
+    const auto& fw_type = dynamic_cast<const arrow::FixedWidthType&>(*type);
+    data_len = arrow::BitUtil::BytesForBits(num_records * fw_type.bit_width());
+  } else if (arrow::is_binary_like(type_id)) {
+    // we don't know the expected size for varlen output vectors.
+    data_len = 0;
+  } else {
+    return Status::Invalid("Unsupported output data type " + type->ToString());
+  }
+  ARROW_ASSIGN_OR_RAISE(auto data_buffer, arrow::AllocateResizableBuffer(data_len, pool));
+
+  // This is not strictly required but valgrind gets confused and detects this
+  // as uninitialized memory access. See arrow::util::SetBitTo().
+  if (type_id == arrow::Type::BOOL) {
+    memset(data_buffer->mutable_data(), 0, data_len);
+  }
+  buffers.push_back(std::move(data_buffer));
+
+  *array_data = arrow::ArrayData::Make(type, num_records, std::move(buffers));
+  return Status::OK();
+}
+
+// Checks shared by every Evaluate() overload: the batch must use the schema
+// the projector was built with and must contain at least one row.
+Status Projector::ValidateEvaluateArgsCommon(const arrow::RecordBatch& batch) {
+  const bool schema_matches = batch.schema()->Equals(*schema_);
+  ARROW_RETURN_IF(!schema_matches,
+                  Status::Invalid("Schema in RecordBatch must match schema in Make()"));
+
+  const bool batch_is_empty = batch.num_rows() == 0;
+  ARROW_RETURN_IF(batch_is_empty, Status::Invalid("RecordBatch must be non-empty."));
+
+  return Status::OK();
+}
+
+// Checks that a caller-supplied ArrayData can hold 'num_records' values of
+// 'field': the bitmap capacity first, then either the offsets capacity plus a
+// resizable data buffer (binary-like types) or the data buffer capacity
+// (primitive and decimal types). Any other type is rejected.
+Status Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data,
+                                            const arrow::Field& field,
+                                            int64_t num_records) {
+  ARROW_RETURN_IF(array_data.buffers.size() < 2,
+                  Status::Invalid("ArrayData must have at least 2 buffers"));
+
+  int64_t min_bitmap_len = arrow::BitUtil::BytesForBits(num_records);
+  int64_t bitmap_len = array_data.buffers[0]->capacity();
+  ARROW_RETURN_IF(
+      bitmap_len < min_bitmap_len,
+      Status::Invalid("Bitmap buffer too small for ", field.name(), " expected minimum ",
+                      min_bitmap_len, " actual size ", bitmap_len));
+
+  auto type_id = field.type()->id();
+  if (arrow::is_binary_like(type_id)) {
+    // Varlen outputs use three buffers (bitmap, offsets, data). The check at
+    // the top only guarantees two, so verify the third exists before
+    // buffers[2] is read below.
+    ARROW_RETURN_IF(
+        array_data.buffers.size() < 3,
+        Status::Invalid("ArrayData for varlen output vectors must have at least 3 "
+                        "buffers"));
+
+    // validate size of offsets buffer.
+    int64_t min_offsets_len = arrow::BitUtil::BytesForBits((num_records + 1) * 32);
+    int64_t offsets_len = array_data.buffers[1]->capacity();
+    ARROW_RETURN_IF(
+        offsets_len < min_offsets_len,
+        Status::Invalid("offsets buffer too small for ", field.name(),
+                        " minimum required ", min_offsets_len, " actual ", offsets_len));
+
+    // check that it's resizable.
+    auto resizable = dynamic_cast<arrow::ResizableBuffer*>(array_data.buffers[2].get());
+    ARROW_RETURN_IF(
+        resizable == nullptr,
+        Status::Invalid("data buffer for varlen output vectors must be resizable"));
+  } else if (arrow::is_primitive(type_id) || type_id == arrow::Type::DECIMAL) {
+    // verify size of data buffer.
+    const auto& fw_type = dynamic_cast<const arrow::FixedWidthType&>(*field.type());
+    int64_t min_data_len =
+        arrow::BitUtil::BytesForBits(num_records * fw_type.bit_width());
+    int64_t data_len = array_data.buffers[1]->capacity();
+    ARROW_RETURN_IF(data_len < min_data_len,
+                    Status::Invalid("Data buffer too small for ", field.name()));
+  } else {
+    return Status::Invalid("Unsupported output data type " + field.type()->ToString());
+  }
+
+  return Status::OK();
+}
+
+// Forwards to the LLVM generator's IR dump.
+std::string Projector::DumpIR() {
+  std::string ir_text = llvm_generator_->DumpIR();
+  return ir_text;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/projector.h b/src/arrow/cpp/src/gandiva/projector.h
new file mode 100644
index 000000000..20b36c9d8
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/projector.h
@@ -0,0 +1,143 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/status.h"
+
+#include "gandiva/arrow.h"
+#include "gandiva/configuration.h"
+#include "gandiva/expression.h"
+#include "gandiva/selection_vector.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+class LLVMGenerator;
+
+/// \brief projection using expressions.
+///
+/// A projector is built for a specific schema and vector of expressions.
+/// Once the projector is built, it can be used to evaluate many row batches.
+class GANDIVA_EXPORT Projector {
+ public:
+ // Inline dtor will attempt to resolve the destructor for
+ // LLVMGenerator on MSVC, so we compile the dtor in the object code
+ ~Projector();
+
+ /// Build a default projector for the given schema to evaluate
+ /// the vector of expressions.
+ ///
+ /// \param[in] schema schema for the record batches, and the expressions.
+ /// \param[in] exprs vector of expressions.
+ /// \param[out] projector the returned projector object
+ static Status Make(SchemaPtr schema, const ExpressionVector& exprs,
+ std::shared_ptr<Projector>* projector);
+
+ /// Build a projector for the given schema to evaluate the vector of expressions.
+ /// Customize the projector with runtime configuration.
+ ///
+ /// \param[in] schema schema for the record batches, and the expressions.
+ /// \param[in] exprs vector of expressions.
+ /// \param[in] configuration run time configuration.
+ /// \param[out] projector the returned projector object
+ static Status Make(SchemaPtr schema, const ExpressionVector& exprs,
+ std::shared_ptr<Configuration> configuration,
+ std::shared_ptr<Projector>* projector);
+
+ /// Build a projector for the given schema to evaluate the vector of expressions.
+ /// Customize the projector with a selection vector mode and runtime configuration.
+ ///
+ /// \param[in] schema schema for the record batches, and the expressions.
+ /// \param[in] exprs vector of expressions.
+ /// \param[in] selection_vector_mode mode of selection vector
+ /// \param[in] configuration run time configuration.
+ /// \param[out] projector the returned projector object
+ static Status Make(SchemaPtr schema, const ExpressionVector& exprs,
+ SelectionVector::Mode selection_vector_mode,
+ std::shared_ptr<Configuration> configuration,
+ std::shared_ptr<Projector>* projector);
+
+ /// Evaluate the specified record batch, and return the allocated and populated output
+ /// arrays. The output arrays will be allocated from the memory pool 'pool', and added
+ /// to the vector 'output'.
+ ///
+ /// \param[in] batch the record batch. schema should be the same as the one in 'Make'
+ /// \param[in] pool memory pool used to allocate output arrays (if required).
+ /// \param[out] output the vector of allocated/populated arrays.
+ Status Evaluate(const arrow::RecordBatch& batch, arrow::MemoryPool* pool,
+ arrow::ArrayVector* output);
+
+ /// Evaluate the specified record batch, and populate the output arrays. The output
+ /// arrays of sufficient capacity must be allocated by the caller.
+ ///
+ /// \param[in] batch the record batch. schema should be the same as the one in 'Make'
+ /// \param[in,out] output vector of arrays, the arrays are allocated by the caller and
+ /// populated by Evaluate.
+ Status Evaluate(const arrow::RecordBatch& batch, const ArrayDataVector& output);
+
+ /// Evaluate the specified record batch using the given selection vector, and return the
+ /// allocated and populated output arrays. The output arrays will be allocated from the
+ /// memory pool 'pool', and added to the vector 'output'.
+ ///
+ /// \param[in] batch the record batch. schema should be the same as the one in 'Make'
+ /// \param[in] selection_vector selection vector which has filtered row positions.
+ /// \param[in] pool memory pool used to allocate output arrays (if required).
+ /// \param[out] output the vector of allocated/populated arrays.
+ Status Evaluate(const arrow::RecordBatch& batch,
+ const SelectionVector* selection_vector, arrow::MemoryPool* pool,
+ arrow::ArrayVector* output);
+
+ /// Evaluate the specified record batch, and populate the output arrays at the filtered
+ /// positions. The output arrays of sufficient capacity must be allocated by the caller.
+ ///
+ /// \param[in] batch the record batch. schema should be the same as the one in 'Make'
+ /// \param[in] selection_vector selection vector which has the filtered row positions
+ /// \param[in,out] output vector of arrays, the arrays are allocated by the caller and
+ /// populated by Evaluate.
+ Status Evaluate(const arrow::RecordBatch& batch,
+ const SelectionVector* selection_vector, const ArrayDataVector& output);
+
+ std::string DumpIR();  // returns the IR dump produced by the underlying LLVMGenerator
+
+ private:
+ Projector(std::unique_ptr<LLVMGenerator> llvm_generator, SchemaPtr schema,
+ const FieldVector& output_fields, std::shared_ptr<Configuration>);  // last argument: the configuration
+
+ /// Allocate an ArrayData of type 'type' with buffers for 'num_records' records from 'pool'.
+ Status AllocArrayData(const DataTypePtr& type, int64_t num_records,
+ arrow::MemoryPool* pool, ArrayDataPtr* array_data);
+
+ /// Validate that the ArrayData has sufficient capacity to accommodate 'num_records'.
+ Status ValidateArrayDataCapacity(const arrow::ArrayData& array_data,
+ const arrow::Field& field, int64_t num_records);
+
+ /// Validate the common args for Evaluate() APIs.
+ Status ValidateEvaluateArgsCommon(const arrow::RecordBatch& batch);
+
+ std::unique_ptr<LLVMGenerator> llvm_generator_;  // compiled expressions; executes them on a batch
+ SchemaPtr schema_;  // schema given to Make(); record batches must match it
+ FieldVector output_fields_;  // result field of each expression, in expression order
+ std::shared_ptr<Configuration> configuration_;  // configuration given to Make()
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/proto/Types.proto b/src/arrow/cpp/src/gandiva/proto/Types.proto
new file mode 100644
index 000000000..eb0d996b9
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/proto/Types.proto
@@ -0,0 +1,255 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+syntax = "proto2";
+package types;
+
+option java_package = "org.apache.arrow.gandiva.ipc";
+option java_outer_classname = "GandivaTypes";
+option optimize_for = SPEED;
+
+enum GandivaType {
+ NONE = 0; // arrow::Type::NA
+ BOOL = 1; // arrow::Type::BOOL
+ UINT8 = 2; // arrow::Type::UINT8
+ INT8 = 3; // arrow::Type::INT8
+ UINT16 = 4; // represents arrow::Type fields in src/arrow/type.h
+ INT16 = 5;
+ UINT32 = 6;
+ INT32 = 7;
+ UINT64 = 8;
+ INT64 = 9;
+ HALF_FLOAT = 10;
+ FLOAT = 11;
+ DOUBLE = 12;
+ UTF8 = 13;
+ BINARY = 14;
+ FIXED_SIZE_BINARY = 15;
+ DATE32 = 16;
+ DATE64 = 17;
+ TIMESTAMP = 18;
+ TIME32 = 19;
+ TIME64 = 20;
+ INTERVAL = 21;
+ DECIMAL = 22;
+ LIST = 23;
+ STRUCT = 24;
+ UNION = 25;
+ DICTIONARY = 26;
+ MAP = 27;
+}
+
+enum DateUnit {
+ DAY = 0;
+ MILLI = 1;
+}
+
+enum TimeUnit {
+ SEC = 0;
+ MILLISEC = 1;
+ MICROSEC = 2;
+ NANOSEC = 3;
+}
+
+enum IntervalType {
+ YEAR_MONTH = 0;
+ DAY_TIME = 1;
+}
+
+enum SelectionVectorType {
+ SV_NONE = 0;
+ SV_INT16 = 1;
+ SV_INT32 = 2;
+}
+
+message ExtGandivaType {  // a GandivaType plus the extra attributes that some types carry
+ optional GandivaType type = 1;
+ optional uint32 width = 2; // used by FIXED_SIZE_BINARY
+ optional int32 precision = 3; // used by DECIMAL
+ optional int32 scale = 4; // used by DECIMAL
+ optional DateUnit dateUnit = 5; // used by DATE32/DATE64
+ optional TimeUnit timeUnit = 6; // used by TIME32/TIME64
+ optional string timeZone = 7; // used by TIMESTAMP
+ optional IntervalType intervalType = 8; // used by INTERVAL
+}
+
+message Field {
+ // name of the field
+ optional string name = 1;
+ optional ExtGandivaType type = 2;
+ optional bool nullable = 3;
+ // for complex data types like structs, unions
+ repeated Field children = 4;
+}
+
+message FieldNode {
+ optional Field field = 1;
+}
+
+message FunctionNode {
+ optional string functionName = 1;
+ repeated TreeNode inArgs = 2;
+ optional ExtGandivaType returnType = 3;
+}
+
+message IfNode {
+ optional TreeNode cond = 1;
+ optional TreeNode thenNode = 2;
+ optional TreeNode elseNode = 3;
+ optional ExtGandivaType returnType = 4;
+}
+
+message AndNode {
+ repeated TreeNode args = 1;
+}
+
+message OrNode {
+ repeated TreeNode args = 1;
+}
+
+message NullNode {
+ optional ExtGandivaType type = 1;
+}
+
+message IntNode {
+ optional int32 value = 1;
+}
+
+message FloatNode {
+ optional float value = 1;
+}
+
+message DoubleNode {
+ optional double value = 1;
+}
+
+message BooleanNode {
+ optional bool value = 1;
+}
+
+message LongNode {
+ optional int64 value = 1;
+}
+
+message StringNode {
+ optional bytes value = 1;
+}
+
+message BinaryNode {
+ optional bytes value = 1;
+}
+
+message DecimalNode {
+ optional string value = 1;
+ optional int32 precision = 2;
+ optional int32 scale = 3;
+}
+
+
+message TreeNode {  // one expression-tree node; each node kind has its own optional field (field numbers are not contiguous)
+ optional FieldNode fieldNode = 1;
+ optional FunctionNode fnNode = 2;
+
+ // control expressions (if / and / or)
+ optional IfNode ifNode = 6;
+ optional AndNode andNode = 7;
+ optional OrNode orNode = 8;
+
+ // literals (null, numeric, boolean, string, binary, decimal)
+ optional NullNode nullNode = 11;
+ optional IntNode intNode = 12;
+ optional FloatNode floatNode = 13;
+ optional LongNode longNode = 14;
+ optional BooleanNode booleanNode = 15;
+ optional DoubleNode doubleNode = 16;
+ optional StringNode stringNode = 17;
+ optional BinaryNode binaryNode = 18;
+ optional DecimalNode decimalNode = 19;
+
+ // in expr (a node together with lists of constants, see InNode)
+ optional InNode inNode = 21;
+}
+
+message ExpressionRoot {
+ optional TreeNode root = 1;
+ optional Field resultType = 2;
+}
+
+message ExpressionList {  // a list of expression roots
+ repeated ExpressionRoot exprs = 2;  // NOTE: numbered 2; this message declares no field 1
+}
+
+message Condition {
+ optional TreeNode root = 1;
+}
+
+message Schema {
+ repeated Field columns = 1;
+}
+
+message GandivaDataTypes {
+ repeated ExtGandivaType dataType = 1;
+}
+
+message GandivaFunctions {
+ repeated FunctionSignature function = 1;
+}
+
+message FunctionSignature {
+ optional string name = 1;
+ optional ExtGandivaType returnType = 2;
+ repeated ExtGandivaType paramTypes = 3;
+}
+
+message InNode {
+ optional TreeNode node = 1;
+ optional IntConstants intValues = 2;
+ optional LongConstants longValues = 3;
+ optional StringConstants stringValues = 4;
+ optional BinaryConstants binaryValues = 5;
+ optional DecimalConstants decimalValues = 6;
+ optional FloatConstants floatValues = 7;
+ optional DoubleConstants doubleValues = 8;
+}
+
+message IntConstants {
+ repeated IntNode intValues = 1;
+}
+
+message LongConstants {
+ repeated LongNode longValues = 1;
+}
+
+message DecimalConstants {
+ repeated DecimalNode decimalValues = 1;
+}
+
+message FloatConstants {
+ repeated FloatNode floatValues = 1;
+}
+
+message DoubleConstants {
+ repeated DoubleNode doubleValues = 1;
+}
+
+message StringConstants {
+ repeated StringNode stringValues = 1;
+}
+
+message BinaryConstants {
+ repeated BinaryNode binaryValues = 1;
+}
diff --git a/src/arrow/cpp/src/gandiva/random_generator_holder.cc b/src/arrow/cpp/src/gandiva/random_generator_holder.cc
new file mode 100644
index 000000000..3471c87d9
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/random_generator_holder.cc
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/random_generator_holder.h"
+#include "gandiva/node.h"
+
+namespace gandiva {
+// Creates the holder for a 'random' call. With no argument the holder is
+// default-constructed; with one argument it must be an int32 literal, which is
+// used as the seed (a null literal seeds with 0).
+Status RandomGeneratorHolder::Make(const FunctionNode& node,
+                                   std::shared_ptr<RandomGeneratorHolder>* holder) {
+  const auto& args = node.children();
+  ARROW_RETURN_IF(args.size() > 1,
+                  Status::Invalid("'random' function requires at most one parameter"));
+
+  // No seed supplied.
+  if (args.empty()) {
+    holder->reset(new RandomGeneratorHolder());
+    return Status::OK();
+  }
+
+  // The single argument has to be a literal node ...
+  auto seed_literal = dynamic_cast<LiteralNode*>(args.at(0).get());
+  ARROW_RETURN_IF(seed_literal == nullptr,
+                  Status::Invalid("'random' function requires a literal as parameter"));
+
+  // ... of type int32.
+  const bool is_int32 = seed_literal->return_type()->id() == arrow::Type::INT32;
+  ARROW_RETURN_IF(
+      !is_int32,
+      Status::Invalid("'random' function requires an int32 literal as parameter"));
+
+  int seed = 0;
+  if (!seed_literal->is_null()) {
+    seed = arrow::util::get<int32_t>(seed_literal->holder());
+  }
+  holder->reset(new RandomGeneratorHolder(seed));
+  return Status::OK();
+}
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/random_generator_holder.h b/src/arrow/cpp/src/gandiva/random_generator_holder.h
new file mode 100644
index 000000000..65b6607e8
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/random_generator_holder.h
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <random>
+
+#include "arrow/status.h"
+#include "arrow/util/io_util.h"
+
+#include "gandiva/function_holder.h"
+#include "gandiva/node.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// Function Holder for 'random'
+/// Function Holder for 'random': owns a 64-bit Mersenne Twister engine and a
+/// uniform real distribution constructed with bounds (0, 1).
+class GANDIVA_EXPORT RandomGeneratorHolder : public FunctionHolder {
+ public:
+  ~RandomGeneratorHolder() override = default;
+
+  /// Create a holder from the function node; instances are only created here
+  /// because the constructors are private.
+  static Status Make(const FunctionNode& node,
+                     std::shared_ptr<RandomGeneratorHolder>* holder);
+
+  /// Draw the next value from the distribution.
+  double operator()() { return distribution_(generator_); }
+
+ private:
+  /// Seeded variant: the seed is widened to 64 bits, XOR-ed with a fixed
+  /// constant and truncated to its low 48 bits before seeding the engine.
+  explicit RandomGeneratorHolder(int seed) : distribution_(0, 1) {
+    const int64_t kXorConstant = 0x00000005DEECE66D;
+    const int64_t kLow48BitsMask = 0x0000ffffffffffff;
+    const int64_t scrambled = (static_cast<int64_t>(seed) ^ kXorConstant) & kLow48BitsMask;
+    generator_.seed(static_cast<uint64_t>(scrambled));
+  }
+
+  /// Unseeded variant: the engine is seeded from arrow's GetRandomSeed().
+  RandomGeneratorHolder() : distribution_(0, 1) {
+    generator_.seed(::arrow::internal::GetRandomSeed());
+  }
+
+  std::mt19937_64 generator_;
+  std::uniform_real_distribution<> distribution_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/random_generator_holder_test.cc b/src/arrow/cpp/src/gandiva/random_generator_holder_test.cc
new file mode 100644
index 000000000..4b16c1b7d
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/random_generator_holder_test.cc
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/random_generator_holder.h"
+
+#include <memory>
+
+#include <gtest/gtest.h>
+
+namespace gandiva {
+
+class TestRandGenHolder : public ::testing::Test {
+ public:
+ FunctionNode BuildRandFunc() { return FunctionNode("random", {}, arrow::float64()); }
+
+ FunctionNode BuildRandWithSeedFunc(int32_t seed, bool seed_is_null) {  // one int32 literal arg
+ auto seed_node =
+ std::make_shared<LiteralNode>(arrow::int32(), LiteralHolder(seed), seed_is_null);
+ return FunctionNode("rand", {seed_node}, arrow::float64());  // named "rand", not "random"
+ }
+};
+
+TEST_F(TestRandGenHolder, NoSeed) {
+ // An unseeded holder must build successfully and yield differing consecutive draws.
+ std::shared_ptr<RandomGeneratorHolder> holder;
+ auto make_status = RandomGeneratorHolder::Make(BuildRandFunc(), &holder);
+ EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+ auto& next_random = *holder;
+ EXPECT_NE(next_random(), next_random());
+}
+
+TEST_F(TestRandGenHolder, WithValidEqualSeeds) {
+ // Two holders built from the same seed must replay the same sequence.
+ std::shared_ptr<RandomGeneratorHolder> first_holder;
+ std::shared_ptr<RandomGeneratorHolder> second_holder;
+ auto status = RandomGeneratorHolder::Make(BuildRandWithSeedFunc(12, false), &first_holder);
+ EXPECT_TRUE(status.ok()) << status.message();
+ status = RandomGeneratorHolder::Make(BuildRandWithSeedFunc(12, false), &second_holder);
+ EXPECT_TRUE(status.ok()) << status.message();
+
+ // Draws taken at the same position of both sequences must agree.
+ auto& first = *first_holder;
+ auto& second = *second_holder;
+ EXPECT_EQ(first(), second());
+ EXPECT_EQ(first(), second());
+ EXPECT_GT(first(), 0);
+ EXPECT_NE(first(), second());  // `first` is one draw ahead of `second` here
+ EXPECT_LT(second(), 1);  // `second` catches up
+ EXPECT_EQ(first(), second());
+}
+
+TEST_F(TestRandGenHolder, WithValidSeeds) {
+ // Holders built from different seeds (11, 12, -12) must not produce equal draws.
+ std::shared_ptr<RandomGeneratorHolder> holder_11;
+ std::shared_ptr<RandomGeneratorHolder> holder_12;
+ std::shared_ptr<RandomGeneratorHolder> holder_neg12;
+ auto status = RandomGeneratorHolder::Make(BuildRandWithSeedFunc(11, false), &holder_11);
+ EXPECT_TRUE(status.ok()) << status.message();
+ status = RandomGeneratorHolder::Make(BuildRandWithSeedFunc(12, false), &holder_12);
+ EXPECT_TRUE(status.ok()) << status.message();
+ status = RandomGeneratorHolder::Make(BuildRandWithSeedFunc(-12, false), &holder_neg12);
+ EXPECT_TRUE(status.ok()) << status.message();
+
+ auto& gen_11 = *holder_11;
+ auto& gen_12 = *holder_12;
+ auto& gen_neg12 = *holder_neg12;
+ // gen_12 is drawn twice: its first value is compared against gen_neg12 and its
+ // second value against gen_11.
+ EXPECT_NE(gen_12(), gen_neg12());
+ EXPECT_NE(gen_11(), gen_12());
+}
+
+TEST_F(TestRandGenHolder, WithInValidSeed) {
+ // A holder built from a null seed literal is expected to give the same first draw
+ // as a holder built from an explicit seed of 0.
+ std::shared_ptr<RandomGeneratorHolder> null_seed_holder;
+ std::shared_ptr<RandomGeneratorHolder> zero_seed_holder;
+ auto status = RandomGeneratorHolder::Make(BuildRandWithSeedFunc(12, true), &null_seed_holder);
+ EXPECT_TRUE(status.ok()) << status.message();
+ status = RandomGeneratorHolder::Make(BuildRandWithSeedFunc(0, false), &zero_seed_holder);
+ EXPECT_TRUE(status.ok()) << status.message();
+
+ auto& null_seeded = *null_seed_holder;
+ auto& zero_seeded = *zero_seed_holder;
+ EXPECT_EQ(null_seeded(), zero_seeded());
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/regex_util.cc b/src/arrow/cpp/src/gandiva/regex_util.cc
new file mode 100644
index 000000000..abdd579d1
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/regex_util.cc
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/regex_util.h"
+
+namespace gandiva {
+
+const std::set<char> RegexUtil::pcre_regex_specials_ = {  // chars emitted with a backslash prefix
+ '[', ']', '(', ')', '|', '^', '-', '+', '*', '?', '{', '}', '$', '\\', '.'};
+
+Status RegexUtil::SqlLikePatternToPcre(const std::string& sql_pattern, char escape_char,
+ std::string& pcre_pattern) {
+ // Translate one character at a time: '_' -> '.', '%' -> ".*", escaped chars literally.
+ pcre_pattern.clear();
+ for (size_t pos = 0; pos < sql_pattern.size(); ++pos) {
+ char ch = sql_pattern.at(pos);
+
+ // Characters that are special for pcre regex need to be prefixed with a backslash.
+ if (pcre_regex_specials_.count(ch) > 0) pcre_pattern.push_back('\\');
+
+ if (ch != escape_char) {
+ // Ordinary character: map the SQL wildcards, copy everything else verbatim.
+ if (ch == '_') {
+ pcre_pattern.push_back('.');
+ } else if (ch == '%') {
+ pcre_pattern.append(".*");
+ } else {
+ pcre_pattern.push_back(ch);
+ }
+ continue;
+ }
+
+ // The escape char must be followed by '_', '%' or the escape char itself.
+ ++pos;
+ ARROW_RETURN_IF(
+ pos == sql_pattern.size(),
+ Status::Invalid("Unexpected escape char at the end of pattern ", sql_pattern));
+
+ ch = sql_pattern.at(pos);
+ if (ch != '_' && ch != '%' && ch != escape_char) {
+ return Status::Invalid("Invalid escape sequence in pattern ", sql_pattern,
+ " at offset ", pos);
+ }
+ pcre_pattern.push_back(ch);
+ }
+ return Status::OK();
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/regex_util.h b/src/arrow/cpp/src/gandiva/regex_util.h
new file mode 100644
index 000000000..cf0002b8c
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/regex_util.h
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <set>
+#include <sstream>
+#include <string>
+
+#include "gandiva/arrow.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// \brief Utility class for converting sql patterns to pcre patterns.
+class GANDIVA_EXPORT RegexUtil {
+ public:
+ // Convert an sql LIKE pattern to a pcre pattern; the result is written to pcre_pattern
+ static Status SqlLikePatternToPcre(const std::string& like_pattern, char escape_char,
+ std::string& pcre_pattern);
+
+ static Status SqlLikePatternToPcre(const std::string& like_pattern,  // escape_char = 0
+ std::string& pcre_pattern) {
+ return SqlLikePatternToPcre(like_pattern, 0 /*escape_char*/, pcre_pattern);
+ }
+
+ private:
+ static const std::set<char> pcre_regex_specials_;  // defined in regex_util.cc
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/replace_holder.cc b/src/arrow/cpp/src/gandiva/replace_holder.cc
new file mode 100644
index 000000000..8b42b585f
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/replace_holder.cc
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/replace_holder.h"
+
+#include "gandiva/node.h"
+#include "gandiva/regex_util.h"
+
+namespace gandiva {
+
+static bool IsArrowStringLiteral(arrow::Type::type type) {  // true for STRING and BINARY ids
+ return type == arrow::Type::STRING || type == arrow::Type::BINARY;
+}
+
+Status ReplaceHolder::Make(const FunctionNode& node,
+ std::shared_ptr<ReplaceHolder>* holder) {
+ // Expect exactly three arguments, the second of which is a string/binary literal.
+ ARROW_RETURN_IF(node.children().size() != 3,
+ Status::Invalid("'replace' function requires three parameters"));
+
+ auto* pattern_literal = dynamic_cast<LiteralNode*>(node.children().at(1).get());
+ ARROW_RETURN_IF(
+ pattern_literal == nullptr,
+ Status::Invalid("'replace' function requires a literal as the second parameter"));
+
+ ARROW_RETURN_IF(
+ !IsArrowStringLiteral(pattern_literal->return_type()->id()),
+ Status::Invalid(
+ "'replace' function requires a string literal as the second parameter"));
+
+ return Make(arrow::util::get<std::string>(pattern_literal->holder()), holder);
+}
+
+Status ReplaceHolder::Make(const std::string& sql_pattern,
+ std::shared_ptr<ReplaceHolder>* holder) {
+ // Build the holder first, then reject it if the regex did not compile.
+ std::shared_ptr<ReplaceHolder> candidate(new ReplaceHolder(sql_pattern));
+ ARROW_RETURN_IF(!candidate->regex_.ok(),
+ Status::Invalid("Building RE2 pattern '", sql_pattern, "' failed"));
+ *holder = candidate;
+ return Status::OK();
+}
+
+void ReplaceHolder::return_error(ExecutionContext* context, std::string& data,
+ std::string& replace_string) {
+ std::string message = "Error replacing '" + replace_string;  // assembled in two steps
+ message.append("' on the given string '" + data + "' for the given pattern: " + pattern_);
+ context->set_error_msg(message.c_str());
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/replace_holder.h b/src/arrow/cpp/src/gandiva/replace_holder.h
new file mode 100644
index 000000000..79150d7aa
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/replace_holder.h
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <re2/re2.h>
+
+#include <memory>
+#include <string>
+
+#include "arrow/status.h"
+#include "gandiva/execution_context.h"
+#include "gandiva/function_holder.h"
+#include "gandiva/node.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// Function Holder for 'replace': owns the compiled regex and applies it to inputs.
+class GANDIVA_EXPORT ReplaceHolder : public FunctionHolder {
+ public:
+ ~ReplaceHolder() override = default;
+
+ static Status Make(const FunctionNode& node, std::shared_ptr<ReplaceHolder>* holder);
+
+ static Status Make(const std::string& sql_pattern,
+ std::shared_ptr<ReplaceHolder>* holder);
+
+ /// Return user_input with every match of the regex replaced by replace_input;
+ /// the length of the returned data is written to out_length.
+ const char* operator()(ExecutionContext* ctx, const char* user_input,
+ int32_t user_input_len, const char* replace_input,
+ int32_t replace_input_len, int32_t* out_length) {
+ std::string user_input_as_str(user_input, user_input_len);
+ std::string replace_input_as_str(replace_input, replace_input_len);
+
+ int32_t total_replaces =  // user_input_as_str is rewritten in place
+ RE2::GlobalReplace(&user_input_as_str, regex_, replace_input_as_str);
+
+ if (total_replaces < 0) {  // NOTE(review): GlobalReplace returns a count; looks unreachable
+ return_error(ctx, user_input_as_str, replace_input_as_str);
+ *out_length = 0;
+ return "";
+ }
+
+ if (total_replaces == 0) {  // nothing replaced: hand back the caller's own buffer
+ *out_length = user_input_len;
+ return user_input;
+ }
+
+ *out_length = static_cast<int32_t>(user_input_as_str.size());
+
+ // This condition treats the case where the whole string is replaced by an empty
+ // string
+ if (*out_length == 0) {
+ return "";
+ }
+
+ char* result_buffer = reinterpret_cast<char*>(ctx->arena()->Allocate(*out_length));
+
+ if (result_buffer == NULLPTR) {
+ ctx->set_error_msg("Could not allocate memory for result");
+ *out_length = 0;
+ return "";
+ }
+
+ memcpy(result_buffer, user_input_as_str.data(), *out_length);  // no NUL terminator copied
+
+ return result_buffer;
+ }
+
+ private:
+ explicit ReplaceHolder(const std::string& pattern)
+ : pattern_(pattern), regex_(pattern) {}
+
+ void return_error(ExecutionContext* context, std::string& data,  // sets the context error msg
+ std::string& replace_string);
+
+ std::string pattern_; // posix pattern string, to help debugging
+ RE2 regex_; // compiled regex for the pattern
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/replace_holder_test.cc b/src/arrow/cpp/src/gandiva/replace_holder_test.cc
new file mode 100644
index 000000000..b0830d4f0
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/replace_holder_test.cc
@@ -0,0 +1,129 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/replace_holder.h"
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <vector>
+
+namespace gandiva {
+
+class TestReplaceHolder : public ::testing::Test {
+ protected:
+ ExecutionContext execution_context_;  // passed to every replace call in the tests below
+};
+
+TEST_F(TestReplaceHolder, TestMultipleReplace) {
+ std::shared_ptr<ReplaceHolder> holder;
+
+ auto make_status = ReplaceHolder::Make("ana", &holder);
+ EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+ std::string subject = "banana";
+ std::string replacement;  // empty: every match of "ana" is removed
+ int32_t result_len = 0;
+
+ auto& do_replace = *holder;
+ const char* result =
+ do_replace(&execution_context_, subject.c_str(),
+ static_cast<int32_t>(subject.size()), replacement.c_str(),
+ static_cast<int32_t>(replacement.size()), &result_len);
+ std::string result_str(result, result_len);
+ EXPECT_EQ(result_len, 3);
+ EXPECT_EQ(result_str, "bna");
+
+ subject = "bananaana";
+
+ result = do_replace(&execution_context_, subject.c_str(),
+ static_cast<int32_t>(subject.size()), replacement.c_str(),
+ static_cast<int32_t>(replacement.size()), &result_len);
+ result_str = std::string(result, result_len);
+ EXPECT_EQ(result_len, 3);
+ EXPECT_EQ(result_str, "bna");
+
+ subject = "bananana";
+
+ result = do_replace(&execution_context_, subject.c_str(),
+ static_cast<int32_t>(subject.size()), replacement.c_str(),
+ static_cast<int32_t>(replacement.size()), &result_len);
+ result_str = std::string(result, result_len);
+ EXPECT_EQ(result_len, 2);
+ EXPECT_EQ(result_str, "bn");
+
+ subject = "anaana";  // the whole input is consumed by matches
+
+ result = do_replace(&execution_context_, subject.c_str(),
+ static_cast<int32_t>(subject.size()), replacement.c_str(),
+ static_cast<int32_t>(replacement.size()), &result_len);
+ result_str = std::string(result, result_len);
+ EXPECT_EQ(result_len, 0);
+ EXPECT_FALSE(execution_context_.has_error());
+ EXPECT_EQ(result_str, "");
+}
+
+TEST_F(TestReplaceHolder, TestNoMatchPattern) {
+ std::shared_ptr<ReplaceHolder> holder;
+
+ auto make_status = ReplaceHolder::Make("ana", &holder);
+ EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+ std::string subject = "apple";  // contains no "ana"
+ std::string replacement;
+ int32_t result_len = 0;
+
+ auto& do_replace = *holder;
+ const char* result =
+ do_replace(&execution_context_, subject.c_str(),
+ static_cast<int32_t>(subject.size()), replacement.c_str(),
+ static_cast<int32_t>(replacement.size()), &result_len);
+ std::string result_str(result, result_len);
+ EXPECT_EQ(result_len, 5);
+ EXPECT_EQ(result_str, "apple");
+}
+
+TEST_F(TestReplaceHolder, TestReplaceSameSize) {
+ std::shared_ptr<ReplaceHolder> holder;
+
+ auto make_status = ReplaceHolder::Make("a", &holder);
+ EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+ std::string subject = "ananindeua";
+ std::string replacement = "b";  // same length as the pattern, so the size is unchanged
+ int32_t result_len = 0;
+
+ auto& do_replace = *holder;
+ const char* result =
+ do_replace(&execution_context_, subject.c_str(),
+ static_cast<int32_t>(subject.size()), replacement.c_str(),
+ static_cast<int32_t>(replacement.size()), &result_len);
+ std::string result_str(result, result_len);
+ EXPECT_EQ(result_len, 10);
+ EXPECT_EQ(result_str, "bnbnindeub");
+}
+
+TEST_F(TestReplaceHolder, TestReplaceInvalidPattern) {
+ std::shared_ptr<ReplaceHolder> holder;
+ // A lone "+" is passed as the pattern and Make is expected to report a failure.
+ auto make_status = ReplaceHolder::Make("+", &holder);
+ EXPECT_FALSE(make_status.ok()) << make_status.message();
+
+ execution_context_.Reset();
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/selection_vector.cc b/src/arrow/cpp/src/gandiva/selection_vector.cc
new file mode 100644
index 000000000..a30bba686
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/selection_vector.cc
@@ -0,0 +1,179 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/selection_vector.h"
+
+#include <memory>
+#include <sstream>
+#include <utility>
+#include <vector>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/endian.h"
+
+#include "gandiva/selection_vector_impl.h"
+
+namespace gandiva {
+
+constexpr SelectionVector::Mode SelectionVector::kAllModes[kNumModes];  // out-of-class definition
+
+Status SelectionVector::PopulateFromBitMap(const uint8_t* bitmap, int64_t bitmap_size,
+ int64_t max_bitmap_index) {
+ const uint64_t max_idx = static_cast<uint64_t>(max_bitmap_index);  // used after the sign check
+ ARROW_RETURN_IF(bitmap_size % 8, Status::Invalid("Bitmap size ", bitmap_size,
+ " must be aligned to 64-bit size"));
+ ARROW_RETURN_IF(max_bitmap_index < 0,  // zero is accepted; only negatives are rejected
+ Status::Invalid("Max bitmap index must be positive"));
+ ARROW_RETURN_IF(
+ max_idx > GetMaxSupportedValue(),
+ Status::Invalid("max_bitmap_index ", max_idx, " must be <= maxSupportedValue ",
+ GetMaxSupportedValue(), " in selection vector"));
+
+ int64_t max_slots = GetMaxSlots();
+
+ // Walk the bitmap 8 bytes at a time and append the index of every set bit to
+ // the selection vector.
+ int64_t selection_idx = 0;
+ const uint64_t* bitmap_64 = reinterpret_cast<const uint64_t*>(bitmap);  // NOTE(review): assumes 8-byte alignment - confirm
+ for (int64_t bitmap_idx = 0; bitmap_idx < bitmap_size / 8; ++bitmap_idx) {
+ uint64_t current_word = arrow::BitUtil::ToLittleEndian(bitmap_64[bitmap_idx]);
+
+ while (current_word != 0) {
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4146)
+#endif
+ // MSVC warns about negating an unsigned type. We suppress it for now
+ uint64_t highest_only = current_word & -current_word;  // despite the name: lowest set bit
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+ int pos_in_word = arrow::BitUtil::CountTrailingZeros(highest_only);
+
+ int64_t pos_in_bitmap = bitmap_idx * 64 + pos_in_word;
+ if (pos_in_bitmap > max_bitmap_index) {
+ // the bitmap may be slightly larger for alignment/padding.
+ break;
+ }
+
+ ARROW_RETURN_IF(selection_idx >= max_slots,
+ Status::Invalid("selection vector has no remaining slots"));
+
+ SetIndex(selection_idx, pos_in_bitmap);
+ ++selection_idx;
+
+ current_word ^= highest_only;  // clear the bit that was just recorded
+ }
+ }
+
+ SetNumSlots(selection_idx);  // number of indices written above
+ return Status::OK();
+}
+
+Status SelectionVector::MakeInt16(int64_t max_slots,
+ std::shared_ptr<arrow::Buffer> buffer,
+ std::shared_ptr<SelectionVector>* selection_vector) {
+ ARROW_RETURN_NOT_OK(SelectionVectorInt16::ValidateBuffer(max_slots, buffer));  // caller's buffer
+ *selection_vector = std::make_shared<SelectionVectorInt16>(max_slots, buffer);
+ return Status::OK();
+}
+
+Status SelectionVector::MakeInt16(int64_t max_slots, arrow::MemoryPool* pool,
+ std::shared_ptr<SelectionVector>* selection_vector) {
+ std::shared_ptr<arrow::Buffer> buffer;  // filled by AllocateBuffer from `pool`
+ ARROW_RETURN_NOT_OK(SelectionVectorInt16::AllocateBuffer(max_slots, pool, &buffer));
+ *selection_vector = std::make_shared<SelectionVectorInt16>(max_slots, buffer);
+ return Status::OK();
+}
+
+Status SelectionVector::MakeImmutableInt16(
+ int64_t num_slots, std::shared_ptr<arrow::Buffer> buffer,
+ std::shared_ptr<SelectionVector>* selection_vector) {
+ *selection_vector =  // num_slots is passed as both capacity and size; no buffer validation
+ std::make_shared<SelectionVectorInt16>(num_slots, num_slots, buffer);
+ return Status::OK();
+}
+
+Status SelectionVector::MakeInt32(int64_t max_slots,
+ std::shared_ptr<arrow::Buffer> buffer,
+ std::shared_ptr<SelectionVector>* selection_vector) {
+ ARROW_RETURN_NOT_OK(SelectionVectorInt32::ValidateBuffer(max_slots, buffer));  // caller's buffer
+ *selection_vector = std::make_shared<SelectionVectorInt32>(max_slots, buffer);
+
+ return Status::OK();
+}
+
+Status SelectionVector::MakeInt32(int64_t max_slots, arrow::MemoryPool* pool,
+ std::shared_ptr<SelectionVector>* selection_vector) {
+ std::shared_ptr<arrow::Buffer> buffer;  // filled by AllocateBuffer from `pool`
+ ARROW_RETURN_NOT_OK(SelectionVectorInt32::AllocateBuffer(max_slots, pool, &buffer));
+ *selection_vector = std::make_shared<SelectionVectorInt32>(max_slots, buffer);
+
+ return Status::OK();
+}
+
+Status SelectionVector::MakeImmutableInt32(
+ int64_t num_slots, std::shared_ptr<arrow::Buffer> buffer,
+ std::shared_ptr<SelectionVector>* selection_vector) {
+ *selection_vector =  // num_slots is passed as both capacity and size; no buffer validation
+ std::make_shared<SelectionVectorInt32>(num_slots, num_slots, buffer);
+ return Status::OK();
+}
+
+Status SelectionVector::MakeInt64(int64_t max_slots,
+ std::shared_ptr<arrow::Buffer> buffer,
+ std::shared_ptr<SelectionVector>* selection_vector) {
+ ARROW_RETURN_NOT_OK(SelectionVectorInt64::ValidateBuffer(max_slots, buffer));  // caller's buffer
+ *selection_vector = std::make_shared<SelectionVectorInt64>(max_slots, buffer);
+
+ return Status::OK();
+}
+
+Status SelectionVector::MakeInt64(int64_t max_slots, arrow::MemoryPool* pool,
+ std::shared_ptr<SelectionVector>* selection_vector) {
+ std::shared_ptr<arrow::Buffer> buffer;  // filled by AllocateBuffer from `pool`
+ ARROW_RETURN_NOT_OK(SelectionVectorInt64::AllocateBuffer(max_slots, pool, &buffer));
+ *selection_vector = std::make_shared<SelectionVectorInt64>(max_slots, buffer);
+
+ return Status::OK();
+}
+
+template <typename C_TYPE, typename A_TYPE, SelectionVector::Mode mode>
+Status SelectionVectorImpl<C_TYPE, A_TYPE, mode>::AllocateBuffer(
+ int64_t max_slots, arrow::MemoryPool* pool, std::shared_ptr<arrow::Buffer>* buffer) {
+ auto buffer_len = max_slots * sizeof(C_TYPE);  // bytes needed for max_slots entries
+ ARROW_ASSIGN_OR_RAISE(*buffer, arrow::AllocateBuffer(buffer_len, pool));
+
+ return Status::OK();
+}
+
+template <typename C_TYPE, typename A_TYPE, SelectionVector::Mode mode>
+Status SelectionVectorImpl<C_TYPE, A_TYPE, mode>::ValidateBuffer(
+ int64_t max_slots, std::shared_ptr<arrow::Buffer> buffer) {
+ ARROW_RETURN_IF(buffer == nullptr || !buffer->is_mutable(),
+ Status::Invalid("buffer for selection vector must be mutable"));
+ // A null buffer was rejected above; now require room for max_slots entries of C_TYPE.
+ const int64_t min_len = max_slots * sizeof(C_TYPE);
+ ARROW_RETURN_IF(buffer->size() < min_len,
+ Status::Invalid("Buffer for selection vector is too small"));
+
+ return Status::OK();
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/selection_vector.h b/src/arrow/cpp/src/gandiva/selection_vector.h
new file mode 100644
index 000000000..1c0fef1c5
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/selection_vector.h
@@ -0,0 +1,151 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/status.h"
+
+#include "arrow/util/logging.h"
+#include "gandiva/arrow.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// \brief Selection Vector : vector of indices in a row-batch for a selection,
+/// backed by an arrow-array.
+class GANDIVA_EXPORT SelectionVector {
+ public:
+ virtual ~SelectionVector() = default;
+
+ enum Mode : int {
+ MODE_NONE,
+ MODE_UINT16,
+ MODE_UINT32,
+ MODE_UINT64,
+ MODE_MAX = MODE_UINT64, // alias of the last mode; kNumModes is derived from it
+ };
+ static constexpr int kNumModes = static_cast<int>(MODE_MAX) + 1;
+ static constexpr Mode kAllModes[kNumModes] = {MODE_NONE, MODE_UINT16, MODE_UINT32,
+ MODE_UINT64};
+
+ /// Get the value at a given index.
+ virtual uint64_t GetIndex(int64_t index) const = 0;
+
+ /// Set the value at a given index.
+ virtual void SetIndex(int64_t index, uint64_t value) = 0;
+
+ /// Get the max supported value in the selection vector.
+ virtual uint64_t GetMaxSupportedValue() const = 0;
+
+ /// The maximum slots (capacity) of the selection vector.
+ virtual int64_t GetMaxSlots() const = 0;
+
+ /// The number of slots (size) of the selection vector.
+ virtual int64_t GetNumSlots() const = 0;
+
+ /// Set the number of slots in the selection vector.
+ virtual void SetNumSlots(int64_t num_slots) = 0;
+
+ /// Convert to arrow-array.
+ virtual ArrayPtr ToArray() const = 0;
+
+ /// Get the underlying arrow buffer.
+ virtual arrow::Buffer& GetBuffer() const = 0;
+
+ /// Mode of SelectionVector
+ virtual Mode GetMode() const = 0;
+
+ /// \brief populate selection vector for all the set bits in the bitmap.
+ ///
+ /// \param[in] bitmap the bitmap
+ /// \param[in] bitmap_size size of the bitmap in bytes (must be a multiple of 8)
+ /// \param[in] max_bitmap_index max valid index in bitmap (can be less than
+ /// capacity in the bitmap, due to alignment/padding).
+ Status PopulateFromBitMap(const uint8_t* bitmap, int64_t bitmap_size,
+ int64_t max_bitmap_index);
+
+ /// \brief make selection vector with int16 type records.
+ ///
+ /// \param[in] max_slots max number of slots
+ /// \param[in] buffer buffer sized to accommodate max_slots
+ /// \param[out] selection_vector selection vector backed by 'buffer'
+ static Status MakeInt16(int64_t max_slots, std::shared_ptr<arrow::Buffer> buffer,
+ std::shared_ptr<SelectionVector>* selection_vector);
+
+ /// \param[in] max_slots max number of slots
+ /// \param[in] pool memory pool to allocate buffer
+ /// \param[out] selection_vector selection vector backed by a buffer allocated from the
+ /// pool.
+ static Status MakeInt16(int64_t max_slots, arrow::MemoryPool* pool,
+ std::shared_ptr<SelectionVector>* selection_vector);
+
+ /// \brief creates a selection vector with pre populated buffer.
+ ///
+ /// \param[in] num_slots size of the selection vector
+ /// \param[in] buffer pre-populated buffer
+ /// \param[out] selection_vector selection vector backed by 'buffer'
+ static Status MakeImmutableInt16(int64_t num_slots,
+ std::shared_ptr<arrow::Buffer> buffer,
+ std::shared_ptr<SelectionVector>* selection_vector);
+
+ /// \brief make selection vector with int32 type records.
+ ///
+ /// \param[in] max_slots max number of slots
+ /// \param[in] buffer buffer sized to accommodate max_slots
+ /// \param[out] selection_vector selection vector backed by 'buffer'
+ static Status MakeInt32(int64_t max_slots, std::shared_ptr<arrow::Buffer> buffer,
+ std::shared_ptr<SelectionVector>* selection_vector);
+
+ /// \brief make selection vector with int32 type records.
+ ///
+ /// \param[in] max_slots max number of slots
+ /// \param[in] pool memory pool to allocate buffer
+ /// \param[out] selection_vector selection vector backed by a buffer allocated from the
+ /// pool.
+ static Status MakeInt32(int64_t max_slots, arrow::MemoryPool* pool,
+ std::shared_ptr<SelectionVector>* selection_vector);
+
+ /// \brief creates a selection vector with pre populated buffer.
+ ///
+ /// \param[in] num_slots size of the selection vector
+ /// \param[in] buffer pre-populated buffer
+ /// \param[out] selection_vector selection vector backed by 'buffer'
+ static Status MakeImmutableInt32(int64_t num_slots,
+ std::shared_ptr<arrow::Buffer> buffer,
+ std::shared_ptr<SelectionVector>* selection_vector);
+
+ /// \brief make selection vector with int64 type records.
+ ///
+ /// \param[in] max_slots max number of slots
+ /// \param[in] buffer buffer sized to accommodate max_slots
+ /// \param[out] selection_vector selection vector backed by 'buffer'
+ static Status MakeInt64(int64_t max_slots, std::shared_ptr<arrow::Buffer> buffer,
+ std::shared_ptr<SelectionVector>* selection_vector);
+
+ /// \brief make selection vector with int64 type records.
+ ///
+ /// \param[in] max_slots max number of slots
+ /// \param[in] pool memory pool to allocate buffer
+ /// \param[out] selection_vector selection vector backed by a buffer allocated from the
+ /// pool.
+ static Status MakeInt64(int64_t max_slots, arrow::MemoryPool* pool,
+ std::shared_ptr<SelectionVector>* selection_vector);
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/selection_vector_impl.h b/src/arrow/cpp/src/gandiva/selection_vector_impl.h
new file mode 100644
index 000000000..dc9724ca8
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/selection_vector_impl.h
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <limits>
+#include <memory>
+
+#include "arrow/status.h"
+#include "arrow/util/macros.h"
+
+#include "arrow/util/logging.h"
+#include "gandiva/arrow.h"
+#include "gandiva/selection_vector.h"
+
+namespace gandiva {
+
+/// \brief Template implementation of a selection vector for one index width.
+///
+/// C_TYPE is the in-memory element type of the backing buffer, A_TYPE is the arrow
+/// type used when exporting through ToArray(), and 'mode' is the value reported by
+/// GetMode().
+template <typename C_TYPE, typename A_TYPE, SelectionVector::Mode mode>
+class SelectionVectorImpl : public SelectionVector {
+ public:
+  /// \brief Create a vector with zero used slots on top of a writable buffer.
+  ///
+  /// 'buffer' must be non-null; it is dereferenced unconditionally.
+  SelectionVectorImpl(int64_t max_slots, std::shared_ptr<arrow::Buffer> buffer)
+      : max_slots_(max_slots),
+        num_slots_(0),
+        buffer_(buffer),
+        raw_data_(NULLPTR),
+        mode_(mode) {
+    raw_data_ = reinterpret_cast<C_TYPE*>(buffer_->mutable_data());
+  }
+
+  /// \brief Create a vector whose first 'num_slots' entries are already populated.
+  ///
+  /// The buffer is read through data() and the constness is cast away. 'buffer' may
+  /// be null, in which case raw_data_ stays null (it used to be left uninitialized)
+  /// and GetIndex/SetIndex/GetBuffer must not be called.
+  SelectionVectorImpl(int64_t max_slots, int64_t num_slots,
+                      std::shared_ptr<arrow::Buffer> buffer)
+      : max_slots_(max_slots),
+        num_slots_(num_slots),
+        buffer_(buffer),
+        raw_data_(NULLPTR),
+        mode_(mode) {
+    if (buffer_) {
+      raw_data_ = const_cast<C_TYPE*>(reinterpret_cast<const C_TYPE*>(buffer_->data()));
+    }
+  }
+
+  /// Read the entry at 'index'. No bounds checking is performed.
+  uint64_t GetIndex(int64_t index) const override { return raw_data_[index]; }
+
+  /// Store 'value' at 'index'. The value is narrowed to C_TYPE with a static_cast and
+  /// no bounds checking is performed.
+  void SetIndex(int64_t index, uint64_t value) override {
+    raw_data_[index] = static_cast<C_TYPE>(value);
+  }
+
+  /// Export the first num_slots_ entries as an arrow array of A_TYPE.
+  ArrayPtr ToArray() const override;
+
+  int64_t GetMaxSlots() const override { return max_slots_; }
+
+  int64_t GetNumSlots() const override { return num_slots_; }
+
+  /// Set the number of used slots; only checked against max_slots_ via DCHECK.
+  void SetNumSlots(int64_t num_slots) override {
+    DCHECK_LE(num_slots, max_slots_);
+    num_slots_ = num_slots;
+  }
+
+  /// Largest value representable by C_TYPE.
+  uint64_t GetMaxSupportedValue() const override {
+    return std::numeric_limits<C_TYPE>::max();
+  }
+
+  Mode GetMode() const override { return mode_; }
+
+  /// Returns the backing buffer; buffer_ must be non-null.
+  arrow::Buffer& GetBuffer() const override { return *buffer_; }
+
+  static Status AllocateBuffer(int64_t max_slots, arrow::MemoryPool* pool,
+                               std::shared_ptr<arrow::Buffer>* buffer);
+
+  static Status ValidateBuffer(int64_t max_slots, std::shared_ptr<arrow::Buffer> buffer);
+
+ protected:
+  /// maximum slots in the vector
+  int64_t max_slots_;
+
+  /// number of slots in the vector
+  int64_t num_slots_;
+
+  /// backing storage; kept alive for as long as this object exists
+  std::shared_ptr<arrow::Buffer> buffer_;
+
+  /// typed view of buffer_'s bytes, or null when no buffer was supplied
+  C_TYPE* raw_data_;
+
+  /// SelectionVector mode
+  Mode mode_;
+};
+
+template <typename C_TYPE, typename A_TYPE, SelectionVector::Mode mode>
+ArrayPtr SelectionVectorImpl<C_TYPE, A_TYPE, mode>::ToArray() const {
+  // Build an array of A_TYPE with num_slots_ elements. The first buffer slot is left
+  // null and buffer_ is passed as the second one, so no copy of the indices is made.
+  return arrow::MakeArray(arrow::ArrayData::Make(
+      arrow::TypeTraits<A_TYPE>::type_singleton(), num_slots_, {NULLPTR, buffer_}));
+}
+
+// Concrete instantiations, one per supported index width. Note that although the
+// alias names say "Int", the element types and arrow types are all unsigned.
+using SelectionVectorInt16 =
+ SelectionVectorImpl<uint16_t, arrow::UInt16Type, SelectionVector::MODE_UINT16>;
+using SelectionVectorInt32 =
+ SelectionVectorImpl<uint32_t, arrow::UInt32Type, SelectionVector::MODE_UINT32>;
+using SelectionVectorInt64 =
+ SelectionVectorImpl<uint64_t, arrow::UInt64Type, SelectionVector::MODE_UINT64>;
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/selection_vector_test.cc b/src/arrow/cpp/src/gandiva/selection_vector_test.cc
new file mode 100644
index 000000000..686892901
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/selection_vector_test.cc
@@ -0,0 +1,270 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/selection_vector.h"
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/testing/gtest_util.h"
+
+namespace gandiva {
+
+// Fixture shared by the selection vector tests: provides the default memory pool.
+class TestSelectionVector : public ::testing::Test {
+ protected:
+  void SetUp() override { pool_ = arrow::default_memory_pool(); }
+
+  // Pool used for every allocation made by the tests; set in SetUp().
+  arrow::MemoryPool* pool_;
+};
+
+// Number of 64-bit words needed to hold 'value' bits, i.e. ceil(value / 64).
+// Computed without adding to 'value' first, so inputs close to UINT32_MAX do not
+// wrap around to a tiny result.
+static inline uint32_t RoundUpNumi64(uint32_t value) {
+  return (value >> 6) + ((value & 63u) != 0u ? 1u : 0u);
+}
+
+// MakeInt16 must succeed both with a pool-allocated and a caller-supplied buffer.
+TEST_F(TestSelectionVector, TestInt16Make) {
+  int max_slots = 10;
+
+  // Test with pool allocation
+  std::shared_ptr<SelectionVector> selection;
+  auto status = SelectionVector::MakeInt16(max_slots, pool_, &selection);
+  // Fatal on failure: 'selection' is dereferenced right below.
+  ASSERT_TRUE(status.ok()) << status.message();
+  EXPECT_EQ(selection->GetMaxSlots(), max_slots);
+  EXPECT_EQ(selection->GetNumSlots(), 0);
+
+  // Test with pre-alloced buffer
+  std::shared_ptr<SelectionVector> selection2;
+  auto buffer_len = max_slots * sizeof(int16_t);
+  ASSERT_OK_AND_ASSIGN(auto buffer, arrow::AllocateBuffer(buffer_len, pool_));
+
+  status = SelectionVector::MakeInt16(max_slots, std::move(buffer), &selection2);
+  // Fatal on failure: 'selection2' is dereferenced right below.
+  ASSERT_TRUE(status.ok()) << status.message();
+  EXPECT_EQ(selection2->GetMaxSlots(), max_slots);
+  EXPECT_EQ(selection2->GetNumSlots(), 0);
+}
+
+// MakeInt16 must reject a caller-supplied buffer that is too small for max_slots.
+TEST_F(TestSelectionVector, TestInt16MakeNegative) {
+  int max_slots = 10;
+  auto required_bytes = max_slots * sizeof(int16_t);
+
+  // Deliberately allocate 16 bytes fewer than required.
+  ASSERT_OK_AND_ASSIGN(auto small_buffer,
+                       arrow::AllocateBuffer(required_bytes - 16, pool_));
+
+  std::shared_ptr<SelectionVector> selection;
+  auto status =
+      SelectionVector::MakeInt16(max_slots, std::move(small_buffer), &selection);
+  EXPECT_TRUE(status.IsInvalid());
+}
+
+// Values written with SetIndex must be readable via GetIndex and via ToArray().
+TEST_F(TestSelectionVector, TestInt16Set) {
+  int max_slots = 10;
+
+  std::shared_ptr<SelectionVector> selection;
+  auto status = SelectionVector::MakeInt16(max_slots, pool_, &selection);
+  // Fatal on failure: 'selection' is dereferenced throughout the rest of the test.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  selection->SetIndex(0, 100);
+  EXPECT_EQ(selection->GetIndex(0), 100);
+
+  selection->SetIndex(1, 200);
+  EXPECT_EQ(selection->GetIndex(1), 200);
+
+  selection->SetNumSlots(2);
+  EXPECT_EQ(selection->GetNumSlots(), 2);
+
+  // ToArray() should return an array with 100,200
+  auto array_raw = selection->ToArray();
+  const auto& array = dynamic_cast<const arrow::UInt16Array&>(*array_raw);
+  EXPECT_EQ(array.length(), 2) << array_raw->ToString();
+  EXPECT_EQ(array.Value(0), 100) << array_raw->ToString();
+  EXPECT_EQ(array.Value(1), 200) << array_raw->ToString();
+}
+
+// PopulateFromBitMap must record the positions of the set bits, in order.
+TEST_F(TestSelectionVector, TestInt16PopulateFromBitMap) {
+  int max_slots = 200;
+
+  std::shared_ptr<SelectionVector> selection;
+  auto status = SelectionVector::MakeInt16(max_slots, pool_, &selection);
+  // Fatal on failure: 'selection' is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Bitmap size in bytes, rounded up to a whole number of 64-bit words.
+  int bitmap_size = RoundUpNumi64(max_slots) * 8;
+  std::vector<uint8_t> bitmap(bitmap_size);
+
+  arrow::BitUtil::SetBit(&bitmap[0], 0);
+  arrow::BitUtil::SetBit(&bitmap[0], 5);
+  arrow::BitUtil::SetBit(&bitmap[0], 121);
+  // Bit 220 is above the limit passed below (max_slots - 1); the expectations
+  // require it not to show up in the result.
+  arrow::BitUtil::SetBit(&bitmap[0], 220);
+
+  status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, max_slots - 1);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  EXPECT_EQ(selection->GetNumSlots(), 3);
+  EXPECT_EQ(selection->GetIndex(0), 0);
+  EXPECT_EQ(selection->GetIndex(1), 5);
+  EXPECT_EQ(selection->GetIndex(2), 121);
+}
+
+// PopulateFromBitMap must fail when the bitmap has more set bits than there are slots.
+TEST_F(TestSelectionVector, TestInt16PopulateFromBitMapNegative) {
+  int max_slots = 2;
+
+  std::shared_ptr<SelectionVector> selection;
+  auto status = SelectionVector::MakeInt16(max_slots, pool_, &selection);
+  // Fatal on failure: 'selection' is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  int bitmap_size = 16;
+  std::vector<uint8_t> bitmap(bitmap_size);
+
+  arrow::BitUtil::SetBit(&bitmap[0], 0);
+  arrow::BitUtil::SetBit(&bitmap[0], 1);
+  arrow::BitUtil::SetBit(&bitmap[0], 2);
+
+  // The bitmap has three set bits, whereas the selection vector has capacity for only 2.
+  status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, 2);
+  EXPECT_TRUE(status.IsInvalid());
+}
+
+// Values written with SetIndex must be readable via GetIndex and via ToArray().
+TEST_F(TestSelectionVector, TestInt32Set) {
+  int max_slots = 10;
+
+  std::shared_ptr<SelectionVector> selection;
+  auto status = SelectionVector::MakeInt32(max_slots, pool_, &selection);
+  // Fatal on failure: 'selection' is dereferenced throughout the rest of the test.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  selection->SetIndex(0, 100);
+  EXPECT_EQ(selection->GetIndex(0), 100);
+
+  selection->SetIndex(1, 200);
+  EXPECT_EQ(selection->GetIndex(1), 200);
+
+  // 100000 does not fit in 16 bits, so this exercises the wider element type.
+  selection->SetIndex(2, 100000);
+  EXPECT_EQ(selection->GetIndex(2), 100000);
+
+  selection->SetNumSlots(3);
+  EXPECT_EQ(selection->GetNumSlots(), 3);
+
+  // ToArray() should return an array with 100,200,100000
+  auto array_raw = selection->ToArray();
+  const auto& array = dynamic_cast<const arrow::UInt32Array&>(*array_raw);
+  EXPECT_EQ(array.length(), 3) << array_raw->ToString();
+  EXPECT_EQ(array.Value(0), 100) << array_raw->ToString();
+  EXPECT_EQ(array.Value(1), 200) << array_raw->ToString();
+  EXPECT_EQ(array.Value(2), 100000) << array_raw->ToString();
+}
+
+// PopulateFromBitMap must record the positions of the set bits, in order.
+TEST_F(TestSelectionVector, TestInt32PopulateFromBitMap) {
+  int max_slots = 200;
+
+  std::shared_ptr<SelectionVector> selection;
+  auto status = SelectionVector::MakeInt32(max_slots, pool_, &selection);
+  // Fatal on failure: 'selection' is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Bitmap size in bytes, rounded up to a whole number of 64-bit words.
+  int bitmap_size = RoundUpNumi64(max_slots) * 8;
+  std::vector<uint8_t> bitmap(bitmap_size);
+
+  arrow::BitUtil::SetBit(&bitmap[0], 0);
+  arrow::BitUtil::SetBit(&bitmap[0], 5);
+  arrow::BitUtil::SetBit(&bitmap[0], 121);
+  // Bit 220 is above the limit passed below (max_slots - 1); the expectations
+  // require it not to show up in the result.
+  arrow::BitUtil::SetBit(&bitmap[0], 220);
+
+  status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, max_slots - 1);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  EXPECT_EQ(selection->GetNumSlots(), 3);
+  EXPECT_EQ(selection->GetIndex(0), 0);
+  EXPECT_EQ(selection->GetIndex(1), 5);
+  EXPECT_EQ(selection->GetIndex(2), 121);
+}
+
+// MakeInt32 must reject a caller-supplied buffer that is too small for max_slots.
+TEST_F(TestSelectionVector, TestInt32MakeNegative) {
+  int max_slots = 10;
+  auto required_bytes = max_slots * sizeof(int32_t);
+
+  // Deliberately allocate one byte fewer than required.
+  ASSERT_OK_AND_ASSIGN(auto small_buffer,
+                       arrow::AllocateBuffer(required_bytes - 1, pool_));
+
+  std::shared_ptr<SelectionVector> selection;
+  auto status =
+      SelectionVector::MakeInt32(max_slots, std::move(small_buffer), &selection);
+  EXPECT_TRUE(status.IsInvalid());
+}
+
+// Values written with SetIndex must be readable via GetIndex and via ToArray().
+TEST_F(TestSelectionVector, TestInt64Set) {
+  int max_slots = 10;
+
+  std::shared_ptr<SelectionVector> selection;
+  auto status = SelectionVector::MakeInt64(max_slots, pool_, &selection);
+  // Fatal on failure: 'selection' is dereferenced throughout the rest of the test.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  selection->SetIndex(0, 100);
+  EXPECT_EQ(selection->GetIndex(0), 100);
+
+  selection->SetIndex(1, 200);
+  EXPECT_EQ(selection->GetIndex(1), 200);
+
+  selection->SetIndex(2, 100000);
+  EXPECT_EQ(selection->GetIndex(2), 100000);
+
+  selection->SetNumSlots(3);
+  EXPECT_EQ(selection->GetNumSlots(), 3);
+
+  // ToArray() should return an array with 100,200,100000
+  auto array_raw = selection->ToArray();
+  const auto& array = dynamic_cast<const arrow::UInt64Array&>(*array_raw);
+  EXPECT_EQ(array.length(), 3) << array_raw->ToString();
+  EXPECT_EQ(array.Value(0), 100) << array_raw->ToString();
+  EXPECT_EQ(array.Value(1), 200) << array_raw->ToString();
+  EXPECT_EQ(array.Value(2), 100000) << array_raw->ToString();
+}
+
+// PopulateFromBitMap must record the positions of the set bits, in order.
+TEST_F(TestSelectionVector, TestInt64PopulateFromBitMap) {
+  int max_slots = 200;
+
+  std::shared_ptr<SelectionVector> selection;
+  auto status = SelectionVector::MakeInt64(max_slots, pool_, &selection);
+  // Fatal on failure: 'selection' is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Bitmap size in bytes, rounded up to a whole number of 64-bit words.
+  int bitmap_size = RoundUpNumi64(max_slots) * 8;
+  std::vector<uint8_t> bitmap(bitmap_size);
+
+  arrow::BitUtil::SetBit(&bitmap[0], 0);
+  arrow::BitUtil::SetBit(&bitmap[0], 5);
+  arrow::BitUtil::SetBit(&bitmap[0], 121);
+  // Bit 220 is above the limit passed below (max_slots - 1); the expectations
+  // require it not to show up in the result.
+  arrow::BitUtil::SetBit(&bitmap[0], 220);
+
+  status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, max_slots - 1);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  EXPECT_EQ(selection->GetNumSlots(), 3);
+  EXPECT_EQ(selection->GetIndex(0), 0);
+  EXPECT_EQ(selection->GetIndex(1), 5);
+  EXPECT_EQ(selection->GetIndex(2), 121);
+}
+
+// MakeInt64 must reject a caller-supplied buffer that is too small for max_slots.
+TEST_F(TestSelectionVector, TestInt64MakeNegative) {
+  int max_slots = 10;
+  auto required_bytes = max_slots * sizeof(int64_t);
+
+  // Deliberately allocate one byte fewer than required.
+  ASSERT_OK_AND_ASSIGN(auto small_buffer,
+                       arrow::AllocateBuffer(required_bytes - 1, pool_));
+
+  std::shared_ptr<SelectionVector> selection;
+  auto status =
+      SelectionVector::MakeInt64(max_slots, std::move(small_buffer), &selection);
+  EXPECT_TRUE(status.IsInvalid());
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/simple_arena.h b/src/arrow/cpp/src/gandiva/simple_arena.h
new file mode 100644
index 000000000..da00b3397
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/simple_arena.h
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "gandiva/arrow.h"
+
+namespace gandiva {
+
+/// \brief Simple arena allocator.
+///
+/// Memory is allocated from system in units of chunk-size, and dished out in the
+/// requested sizes. If the requested size > chunk-size, allocate directly from the
+/// system.
+///
+/// The allocated memory gets released only when the arena is destroyed, or on
+/// Reset.
+///
+/// This code is not multi-thread safe, and avoids all locking for efficiency.
+///
+class SimpleArena {
+ public:
+  /// \param[in] pool memory pool that all chunks are allocated from and freed to
+  /// \param[in] min_chunk_size smallest chunk requested from the pool
+  explicit SimpleArena(arrow::MemoryPool* pool, int64_t min_chunk_size = 4096);
+
+  ~SimpleArena();
+
+  // The arena holds raw chunk pointers and frees them in its destructor, so a
+  // member-wise copy would free every chunk twice. Copying is therefore disabled.
+  SimpleArena(const SimpleArena&) = delete;
+  SimpleArena& operator=(const SimpleArena&) = delete;
+
+  // Allocate buffer of requested size. Returns NULL if a new chunk was needed and
+  // could not be allocated.
+  uint8_t* Allocate(int64_t size);
+
+  // Reset arena state: keep only the first chunk (if any) and make it fully
+  // available again.
+  void Reset();
+
+  // total bytes allocated from system.
+  int64_t total_bytes() const { return total_bytes_; }
+
+  // total bytes available for allocations.
+  int64_t avail_bytes() const { return avail_bytes_; }
+
+ private:
+  // One allocation obtained from the pool: start address and size in bytes.
+  struct Chunk {
+    Chunk(uint8_t* buf, int64_t size) : buf_(buf), size_(size) {}
+
+    uint8_t* buf_;
+    int64_t size_;
+  };
+
+  // Allocate new chunk.
+  arrow::Status AllocateChunk(int64_t size);
+
+  // release memory from buffers.
+  void ReleaseChunks(bool retain_first);
+
+  // Memory pool used for allocs.
+  arrow::MemoryPool* pool_;
+
+  // The chunk-size used for allocations from system.
+  int64_t min_chunk_size_;
+
+  // Total bytes allocated from system.
+  int64_t total_bytes_;
+
+  // Bytes available from allocated chunk.
+  int64_t avail_bytes_;
+
+  // buffer from current chunk.
+  uint8_t* avail_buf_;
+
+  // List of allocated chunks.
+  std::vector<Chunk> chunks_;
+};
+
+// Start with no chunks at all: nothing is requested from the pool until the first
+// call to Allocate().
+inline SimpleArena::SimpleArena(arrow::MemoryPool* pool, int64_t min_chunk_size)
+    : pool_(pool),
+      min_chunk_size_(min_chunk_size),
+      total_bytes_(0),
+      avail_bytes_(0),
+      avail_buf_(nullptr) {}
+
+// Return every chunk, including the first one, to the pool.
+inline SimpleArena::~SimpleArena() {
+  const bool retain_first = false;
+  ReleaseChunks(retain_first);
+}
+
+// Hand out 'size' bytes from the current chunk, first switching to a new chunk of
+// max(size, min_chunk_size_) bytes when the current one cannot satisfy the request.
+// Returns NULL if that chunk allocation fails. 'size' itself is not validated.
+inline uint8_t* SimpleArena::Allocate(int64_t size) {
+  const bool fits_in_current = size <= avail_bytes_;
+  if (!fits_in_current && !AllocateChunk(std::max(size, min_chunk_size_)).ok()) {
+    return NULL;
+  }
+
+  // Bump-allocate: the caller gets the current position, then the position advances.
+  uint8_t* const start = avail_buf_;
+  avail_bytes_ -= size;
+  avail_buf_ = start + size;
+  return start;
+}
+
+// Obtain a chunk of 'size' bytes from the pool, record it, and make it the current
+// chunk. On pool failure the error status is returned and the arena is unchanged.
+inline arrow::Status SimpleArena::AllocateChunk(int64_t size) {
+  uint8_t* chunk_buf = NULL;
+  ARROW_RETURN_NOT_OK(pool_->Allocate(size, &chunk_buf));
+
+  chunks_.emplace_back(chunk_buf, size);
+  total_bytes_ += size;
+
+  // Whatever was left over in the previous chunk is abandoned from here on.
+  avail_bytes_ = size;
+  avail_buf_ = chunk_buf;
+  return arrow::Status::OK();
+}
+
+// The usual pattern is that the first record of a batch triggers a chunk allocation
+// and the remaining records can be served from that same chunk. Keeping the first
+// chunk across Reset() therefore brings the number of pool allocations down to one
+// per batch rather than one per record.
+inline void SimpleArena::Reset() {
+  // Nothing was ever allocated: there is no state to rewind.
+  if (chunks_.empty()) {
+    return;
+  }
+
+  // Free and forget every chunk after the first.
+  if (chunks_.size() > 1) {
+    ReleaseChunks(true /*retain_first*/);
+    chunks_.erase(chunks_.begin() + 1, chunks_.end());
+  }
+
+  // The surviving chunk becomes entirely available again.
+  const Chunk& first = chunks_.front();
+  avail_buf_ = first.buf_;
+  total_bytes_ = first.size_;
+  avail_bytes_ = first.size_;
+}
+
+// Give chunk memory back to the pool, optionally sparing the first chunk. Only the
+// memory is freed here; the entries themselves stay in chunks_.
+inline void SimpleArena::ReleaseChunks(bool retain_first) {
+  const size_t first_to_free = retain_first ? 1 : 0;
+  for (size_t i = first_to_free; i < chunks_.size(); ++i) {
+    pool_->Free(chunks_[i].buf_, chunks_[i].size_);
+  }
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/simple_arena_test.cc b/src/arrow/cpp/src/gandiva/simple_arena_test.cc
new file mode 100644
index 000000000..60831280c
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/simple_arena_test.cc
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/simple_arena.h"
+
+#include <gtest/gtest.h>
+
+#include "arrow/memory_pool.h"
+
+namespace gandiva {
+
+// Fixture for the SimpleArena tests; it adds no state or setup of its own.
+class TestSimpleArena : public ::testing::Test {};
+
+// Checks the byte accounting for small allocations that share one chunk and for a
+// large allocation that needs a chunk of its own.
+TEST_F(TestSimpleArena, TestAlloc) {
+  const int64_t chunk_size = 4096;
+  const int64_t small_size = 100;
+  SimpleArena arena(arrow::default_memory_pool(), chunk_size);
+
+  // Twenty 100-byte requests all fit in the first 4096-byte chunk.
+  for (int64_t done = 1; done <= 20; ++done) {
+    EXPECT_NE(arena.Allocate(small_size), nullptr);
+
+    EXPECT_EQ(arena.total_bytes(), chunk_size);
+    EXPECT_EQ(arena.avail_bytes(), chunk_size - done * small_size);
+  }
+
+  // A request bigger than the chunk size gets a separate, fully consumed chunk.
+  const int64_t large_size = 100 * chunk_size;
+  EXPECT_NE(arena.Allocate(large_size), nullptr);
+  EXPECT_EQ(arena.total_bytes(), chunk_size + large_size);
+  EXPECT_EQ(arena.avail_bytes(), 0);
+}
+
+// Small allocation first, then a big one, then Reset(): the small (first) chunk is
+// the one that survives.
+TEST_F(TestSimpleArena, TestReset1) {
+  const int64_t chunk_size = 4096;
+  const int64_t small_size = 100;
+  const int64_t large_size = 100 * chunk_size;
+  SimpleArena arena(arrow::default_memory_pool(), chunk_size);
+
+  EXPECT_NE(arena.Allocate(small_size), nullptr);
+  EXPECT_NE(arena.Allocate(large_size), nullptr);
+
+  EXPECT_EQ(arena.total_bytes(), chunk_size + large_size);
+  EXPECT_EQ(arena.avail_bytes(), 0);
+
+  arena.Reset();
+  EXPECT_EQ(arena.total_bytes(), chunk_size);
+  EXPECT_EQ(arena.avail_bytes(), chunk_size);
+
+  // The retained chunk must serve the next request without growing the arena.
+  EXPECT_NE(arena.Allocate(small_size), nullptr);
+  EXPECT_EQ(arena.total_bytes(), chunk_size);
+  EXPECT_EQ(arena.avail_bytes(), chunk_size - small_size);
+}
+
+// Big allocation first, then a small one, then Reset(): the big (first) chunk is the
+// one that survives.
+TEST_F(TestSimpleArena, TestReset2) {
+  const int64_t chunk_size = 4096;
+  const int64_t small_size = 100;
+  const int64_t large_size = 100 * chunk_size;
+  SimpleArena arena(arrow::default_memory_pool(), chunk_size);
+
+  EXPECT_NE(arena.Allocate(large_size), nullptr);
+  EXPECT_NE(arena.Allocate(small_size), nullptr);
+
+  EXPECT_EQ(arena.total_bytes(), chunk_size + large_size);
+  EXPECT_EQ(arena.avail_bytes(), chunk_size - small_size);
+
+  arena.Reset();
+  EXPECT_EQ(arena.total_bytes(), large_size);
+  EXPECT_EQ(arena.avail_bytes(), large_size);
+
+  // The retained chunk must serve the next request without growing the arena.
+  EXPECT_NE(arena.Allocate(small_size), nullptr);
+  EXPECT_EQ(arena.total_bytes(), large_size);
+  EXPECT_EQ(arena.avail_bytes(), large_size - small_size);
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/symbols.map b/src/arrow/cpp/src/gandiva/symbols.map
new file mode 100644
index 000000000..77f000106
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/symbols.map
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Anonymous version node. It declares only a 'local:' section (there is no
+# 'global:' section in this file), so it lists symbols to hide and nothing else.
+{
+ # Symbols marked as 'local' are not exported by the DSO and thus may not
+ # be used by client applications.
+ local:
+ # devtoolset / static-libstdc++ symbols
+ __cxa_*;
+ __once_proxy;
+
+ # The patterns inside this block are matched against demangled C++ names.
+ extern "C++" {
+ # devtoolset or -static-libstdc++ - the Red Hat devtoolset statically
+ # links c++11 symbols into binaries so that the result may be executed on
+ # a system with an older libstdc++ which doesn't include the necessary
+ # c++11 symbols.
+ std::*;
+ *std::__once_call*;
+ };
+};
+
diff --git a/src/arrow/cpp/src/gandiva/tests/CMakeLists.txt b/src/arrow/cpp/src/gandiva/tests/CMakeLists.txt
new file mode 100644
index 000000000..5fa2da16c
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/CMakeLists.txt
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Register the gandiva integration tests, one call per test.
+# NOTE(review): add_gandiva_test is defined outside this file; presumably each name
+# maps to a like-named .cc source in this directory - confirm against its definition.
+add_gandiva_test(filter_test)
+add_gandiva_test(projector_test)
+add_gandiva_test(projector_build_validation_test)
+add_gandiva_test(if_expr_test)
+add_gandiva_test(literal_test)
+add_gandiva_test(boolean_expr_test)
+add_gandiva_test(binary_test)
+add_gandiva_test(date_time_test)
+add_gandiva_test(to_string_test)
+add_gandiva_test(utf8_test)
+add_gandiva_test(hash_test)
+add_gandiva_test(in_expr_test)
+add_gandiva_test(null_validity_test)
+add_gandiva_test(decimal_test)
+add_gandiva_test(decimal_single_test)
+add_gandiva_test(filter_project_test)
+
+# Only when static libraries are built: a second build of projector_test.cc that
+# uses static linking, and the micro benchmarks linked against gandiva_static.
+if(ARROW_BUILD_STATIC)
+ add_gandiva_test(projector_test_static SOURCES projector_test.cc USE_STATIC_LINKING)
+ add_arrow_benchmark(micro_benchmarks
+ PREFIX
+ "gandiva"
+ EXTRA_LINK_LIBS
+ gandiva_static)
+endif()
diff --git a/src/arrow/cpp/src/gandiva/tests/binary_test.cc b/src/arrow/cpp/src/gandiva/tests/binary_test.cc
new file mode 100644
index 000000000..591c5befc
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/binary_test.cc
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "gandiva/node.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::binary;
+using arrow::boolean;
+using arrow::int32;
+
+// Fixture for the binary-type projector tests: provides the default memory pool.
+class TestBinary : public ::testing::Test {
+ public:
+  void SetUp() override { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+  // Pool handed to Projector::Evaluate for output allocation; set in SetUp().
+  arrow::MemoryPool* pool_;
+};
+
+// Projects "a > b ? octet_length(a) : octet_length(b)" over two binary columns.
+TEST_F(TestBinary, TestSimple) {
+  // schema for input fields
+  auto field_a = field("a", binary());
+  auto field_b = field("b", binary());
+  auto schema = arrow::schema({field_a, field_b});
+
+  // output fields
+  auto res = field("res", int32());
+
+  // build expressions.
+  // a > b ? octet_length(a) : octet_length(b)
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto node_b = TreeExprBuilder::MakeField(field_b);
+  auto octet_len_a = TreeExprBuilder::MakeFunction("octet_length", {node_a}, int32());
+  auto octet_len_b = TreeExprBuilder::MakeFunction("octet_length", {node_b}, int32());
+
+  auto is_greater =
+      TreeExprBuilder::MakeFunction("greater_than", {node_a, node_b}, boolean());
+  auto if_greater =
+      TreeExprBuilder::MakeIf(is_greater, octet_len_a, octet_len_b, int32());
+  auto expr = TreeExprBuilder::MakeExpression(if_greater, res);
+
+  // Build a projector for the expressions.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  // Fatal on failure: 'projector' is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+  auto array_a =
+      MakeArrowArrayBinary({"foo", "hello", "hi", "bye"}, {true, true, true, false});
+  auto array_b =
+      MakeArrowArrayBinary({"fo", "hellos", "hi", "bye"}, {true, true, true, true});
+
+  // expected output
+  auto exp = MakeArrowArrayInt32({3, 6, 2, 3}, {true, true, true, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  // Fatal on failure: outputs.at(0) is read below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+// Projects "isnotnull(f0) ? f0 : f1" over two binary columns.
+TEST_F(TestBinary, TestIfElse) {
+  // schema for input fields
+  auto field0 = field("f0", arrow::binary());
+  auto field1 = field("f1", arrow::binary());
+
+  auto schema = arrow::schema({field0, field1});
+
+  auto f0 = TreeExprBuilder::MakeField(field0);
+  auto f1 = TreeExprBuilder::MakeField(field1);
+
+  // output fields
+  auto field_result = field("out", arrow::binary());
+
+  // Build expression
+  auto cond = TreeExprBuilder::MakeFunction("isnotnull", {f0}, arrow::boolean());
+  auto ifexpr = TreeExprBuilder::MakeIf(cond, f0, f1, arrow::binary());
+  auto expr = TreeExprBuilder::MakeExpression(ifexpr, field_result);
+
+  // Build a projector for the expressions.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  // Fatal on failure: 'projector' is dereferenced below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+  auto array_f0 =
+      MakeArrowArrayBinary({"foo", "hello", "hi", "bye"}, {true, true, true, false});
+  auto array_f1 =
+      MakeArrowArrayBinary({"fe", "fi", "fo", "fum"}, {true, true, true, true});
+
+  // expected output: the last row of f0 is null, so f1's value is taken there.
+  auto exp =
+      MakeArrowArrayBinary({"foo", "hello", "hi", "fum"}, {true, true, true, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_f0, array_f1});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  // Fatal on failure: outputs.at(0) is read below.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/boolean_expr_test.cc b/src/arrow/cpp/src/gandiva/tests/boolean_expr_test.cc
new file mode 100644
index 000000000..9226f3571
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/boolean_expr_test.cc
@@ -0,0 +1,388 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::int32;
+
+// Fixture for the boolean-expression tests; it supplies the memory pool that
+// the tests pass to Projector::Evaluate.
+class TestBooleanExpr : public ::testing::Test {
+ public:
+ // `override` lets the compiler verify that this really replaces
+ // ::testing::Test::SetUp().
+ void SetUp() override { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+ // Assigned in SetUp(); null until then.
+ arrow::MemoryPool* pool_ = nullptr;
+};
+
+// Evaluates `(a > 0) && (b > 0)` for every combination of a true/false,
+// valid/null left operand against a fixed set of right operands.
+TEST_F(TestBooleanExpr, SimpleAnd) {
+ // schema for input fields
+ auto fielda = field("a", int32());
+ auto fieldb = field("b", int32());
+ auto schema = arrow::schema({fielda, fieldb});
+
+ // output fields
+ auto field_result = field("res", boolean());
+
+ // build expression.
+ // (a > 0) && (b > 0)
+ auto node_a = TreeExprBuilder::MakeField(fielda);
+ auto node_b = TreeExprBuilder::MakeField(fieldb);
+ auto literal_0 = TreeExprBuilder::MakeLiteral((int32_t)0);
+ auto a_gt_0 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, literal_0}, boolean());
+ auto b_gt_0 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_b, literal_0}, boolean());
+
+ auto node_and = TreeExprBuilder::MakeAnd({a_gt_0, b_gt_0});
+ auto expr = TreeExprBuilder::MakeExpression(node_and, field_result);
+
+ // Build a projector for the expressions.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ // Fatal assertion: `projector` is dereferenced below.
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // FALSE_VALID && ? => FALSE_VALID
+ int num_records = 4;
+ auto arraya = MakeArrowArrayInt32({-2, -2, -2, -2}, {true, true, true, true});
+ auto arrayb = MakeArrowArrayInt32({-2, -2, 2, 2}, {true, false, true, false});
+ auto exp = MakeArrowArrayBool({false, false, false, false}, {true, true, true, true});
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {arraya, arrayb});
+
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+
+ // FALSE_INVALID && ? : only the row where b is a valid false yields a valid
+ // (false) result
+ num_records = 4;
+ arraya = MakeArrowArrayInt32({-2, -2, -2, -2}, {false, false, false, false});
+ arrayb = MakeArrowArrayInt32({-2, -2, 2, 2}, {true, false, true, false});
+ exp = MakeArrowArrayBool({false, false, false, false}, {true, false, false, false});
+ in_batch = arrow::RecordBatch::Make(schema, num_records, {arraya, arrayb});
+ outputs.clear();
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+
+ // TRUE_VALID && ? : the result mirrors (b > 0), including its validity
+ num_records = 4;
+ arraya = MakeArrowArrayInt32({2, 2, 2, 2}, {true, true, true, true});
+ arrayb = MakeArrowArrayInt32({-2, -2, 2, 2}, {true, false, true, false});
+ exp = MakeArrowArrayBool({false, false, true, false}, {true, false, true, false});
+ in_batch = arrow::RecordBatch::Make(schema, num_records, {arraya, arrayb});
+ outputs.clear();
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+
+ // TRUE_INVALID && ? : same expectation as the FALSE_INVALID case
+ num_records = 4;
+ arraya = MakeArrowArrayInt32({2, 2, 2, 2}, {false, false, false, false});
+ arrayb = MakeArrowArrayInt32({-2, -2, 2, 2}, {true, false, true, false});
+ exp = MakeArrowArrayBool({false, false, false, false}, {true, false, false, false});
+ in_batch = arrow::RecordBatch::Make(schema, num_records, {arraya, arrayb});
+ outputs.clear();
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+// Evaluates `(a > 0) || (b > 0)` for every combination of a true/false,
+// valid/null left operand against a fixed set of right operands.
+TEST_F(TestBooleanExpr, SimpleOr) {
+ // schema for input fields
+ auto fielda = field("a", int32());
+ auto fieldb = field("b", int32());
+ auto schema = arrow::schema({fielda, fieldb});
+
+ // output fields
+ auto field_result = field("res", boolean());
+
+ // build expression.
+ // (a > 0) || (b > 0)
+ auto node_a = TreeExprBuilder::MakeField(fielda);
+ auto node_b = TreeExprBuilder::MakeField(fieldb);
+ auto literal_0 = TreeExprBuilder::MakeLiteral((int32_t)0);
+ auto a_gt_0 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, literal_0}, boolean());
+ auto b_gt_0 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_b, literal_0}, boolean());
+
+ auto node_or = TreeExprBuilder::MakeOr({a_gt_0, b_gt_0});
+ auto expr = TreeExprBuilder::MakeExpression(node_or, field_result);
+
+ // Build a projector for the expressions.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ // Fatal assertion: `projector` is dereferenced below.
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // TRUE_VALID || ? => TRUE_VALID
+ int num_records = 4;
+ auto arraya = MakeArrowArrayInt32({2, 2, 2, 2}, {true, true, true, true});
+ auto arrayb = MakeArrowArrayInt32({-2, -2, 2, 2}, {true, false, true, false});
+ auto exp = MakeArrowArrayBool({true, true, true, true}, {true, true, true, true});
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {arraya, arrayb});
+
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+
+ // TRUE_INVALID || ? : only the row where b is a valid true yields a valid
+ // (true) result
+ num_records = 4;
+ arraya = MakeArrowArrayInt32({2, 2, 2, 2}, {false, false, false, false});
+ arrayb = MakeArrowArrayInt32({-2, -2, 2, 2}, {true, false, true, false});
+ exp = MakeArrowArrayBool({false, false, true, false}, {false, false, true, false});
+ in_batch = arrow::RecordBatch::Make(schema, num_records, {arraya, arrayb});
+ outputs.clear();
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+
+ // FALSE_VALID || ? : the result mirrors (b > 0), including its validity
+ num_records = 4;
+ arraya = MakeArrowArrayInt32({-2, -2, -2, -2}, {true, true, true, true});
+ arrayb = MakeArrowArrayInt32({-2, -2, 2, 2}, {true, false, true, false});
+ exp = MakeArrowArrayBool({false, false, true, false}, {true, false, true, false});
+ in_batch = arrow::RecordBatch::Make(schema, num_records, {arraya, arrayb});
+ outputs.clear();
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+
+ // FALSE_INVALID || ? : same expectation as the TRUE_INVALID case
+ num_records = 4;
+ arraya = MakeArrowArrayInt32({-2, -2, -2, -2}, {false, false, false, false});
+ arrayb = MakeArrowArrayInt32({-2, -2, 2, 2}, {true, false, true, false});
+ exp = MakeArrowArrayBool({false, false, true, false}, {false, false, true, false});
+ in_batch = arrow::RecordBatch::Make(schema, num_records, {arraya, arrayb});
+ outputs.clear();
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+// Evaluates a three-operand AND, `(a > 0) && (b > 0) && (c > 0)`, over all
+// eight true/false combinations of fully valid inputs.
+TEST_F(TestBooleanExpr, AndThree) {
+ // schema for input fields
+ auto fielda = field("a", int32());
+ auto fieldb = field("b", int32());
+ auto fieldc = field("c", int32());
+ auto schema = arrow::schema({fielda, fieldb, fieldc});
+
+ // output fields
+ auto field_result = field("res", boolean());
+
+ // build expression.
+ // (a > 0) && (b > 0) && (c > 0)
+ auto node_a = TreeExprBuilder::MakeField(fielda);
+ auto node_b = TreeExprBuilder::MakeField(fieldb);
+ auto node_c = TreeExprBuilder::MakeField(fieldc);
+ auto literal_0 = TreeExprBuilder::MakeLiteral((int32_t)0);
+ auto a_gt_0 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, literal_0}, boolean());
+ auto b_gt_0 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_b, literal_0}, boolean());
+ auto c_gt_0 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_c, literal_0}, boolean());
+
+ auto node_and = TreeExprBuilder::MakeAnd({a_gt_0, b_gt_0, c_gt_0});
+ auto expr = TreeExprBuilder::MakeExpression(node_and, field_result);
+
+ // Build a projector for the expressions.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ // Fatal assertion: `projector` is dereferenced below.
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ int num_records = 8;
+ std::vector<bool> validity({true, true, true, true, true, true, true, true});
+ auto arraya = MakeArrowArrayInt32({2, 2, 2, 0, 2, 0, 0, 0}, validity);
+ auto arrayb = MakeArrowArrayInt32({2, 2, 0, 2, 0, 2, 0, 0}, validity);
+ auto arrayc = MakeArrowArrayInt32({2, 0, 2, 2, 0, 0, 2, 0}, validity);
+ // Only the first row has all three operands positive.
+ auto exp = MakeArrowArrayBool({true, false, false, false, false, false, false, false},
+ validity);
+
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {arraya, arrayb, arrayc});
+
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ // Fatal assertion: outputs.at(0) below needs a populated result vector.
+ ASSERT_TRUE(status.ok()) << status.message();
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+// Evaluates a three-operand OR, `(a > 0) || (b > 0) || (c > 0)`, over all
+// eight true/false combinations of fully valid inputs.
+TEST_F(TestBooleanExpr, OrThree) {
+ // schema for input fields
+ auto fielda = field("a", int32());
+ auto fieldb = field("b", int32());
+ auto fieldc = field("c", int32());
+ auto schema = arrow::schema({fielda, fieldb, fieldc});
+
+ // output fields
+ auto field_result = field("res", boolean());
+
+ // build expression.
+ // (a > 0) || (b > 0) || (c > 0)
+ auto node_a = TreeExprBuilder::MakeField(fielda);
+ auto node_b = TreeExprBuilder::MakeField(fieldb);
+ auto node_c = TreeExprBuilder::MakeField(fieldc);
+ auto literal_0 = TreeExprBuilder::MakeLiteral((int32_t)0);
+ auto a_gt_0 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, literal_0}, boolean());
+ auto b_gt_0 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_b, literal_0}, boolean());
+ auto c_gt_0 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_c, literal_0}, boolean());
+
+ auto node_or = TreeExprBuilder::MakeOr({a_gt_0, b_gt_0, c_gt_0});
+ auto expr = TreeExprBuilder::MakeExpression(node_or, field_result);
+
+ // Build a projector for the expressions.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ // Fatal assertion: `projector` is dereferenced below.
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ int num_records = 8;
+ std::vector<bool> validity({true, true, true, true, true, true, true, true});
+ auto arraya = MakeArrowArrayInt32({2, 2, 2, 0, 2, 0, 0, 0}, validity);
+ auto arrayb = MakeArrowArrayInt32({2, 2, 0, 2, 0, 2, 0, 0}, validity);
+ auto arrayc = MakeArrowArrayInt32({2, 0, 2, 2, 0, 0, 2, 0}, validity);
+ // Only the last row has no positive operand.
+ auto exp =
+ MakeArrowArrayBool({true, true, true, true, true, true, true, false}, validity);
+
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {arraya, arrayb, arrayc});
+
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ // Fatal assertion: outputs.at(0) below needs a populated result vector.
+ ASSERT_TRUE(status.ok()) << status.message();
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+// Checks an if-expression whose condition and both branches are themselves
+// AND expressions.
+TEST_F(TestBooleanExpr, BooleanAndInsideIf) {
+ // schema for input fields
+ auto fielda = field("a", int32());
+ auto fieldb = field("b", int32());
+ auto schema = arrow::schema({fielda, fieldb});
+
+ // output fields
+ auto field_result = field("res", boolean());
+
+ // build expression.
+ // if (a > 2 && b > 2)
+ // a > 3 && b > 3
+ // else
+ // a > 1 && b > 1
+ auto node_a = TreeExprBuilder::MakeField(fielda);
+ auto node_b = TreeExprBuilder::MakeField(fieldb);
+ auto literal_1 = TreeExprBuilder::MakeLiteral((int32_t)1);
+ auto literal_2 = TreeExprBuilder::MakeLiteral((int32_t)2);
+ auto literal_3 = TreeExprBuilder::MakeLiteral((int32_t)3);
+ auto a_gt_1 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, literal_1}, boolean());
+ auto a_gt_2 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, literal_2}, boolean());
+ auto a_gt_3 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, literal_3}, boolean());
+ auto b_gt_1 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_b, literal_1}, boolean());
+ auto b_gt_2 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_b, literal_2}, boolean());
+ auto b_gt_3 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_b, literal_3}, boolean());
+
+ auto and_1 = TreeExprBuilder::MakeAnd({a_gt_1, b_gt_1});
+ auto and_2 = TreeExprBuilder::MakeAnd({a_gt_2, b_gt_2});
+ auto and_3 = TreeExprBuilder::MakeAnd({a_gt_3, b_gt_3});
+
+ // and_2 is the condition, and_3 the then-branch, and_1 the else-branch.
+ auto node_if = TreeExprBuilder::MakeIf(and_2, and_3, and_1, arrow::boolean());
+ auto expr = TreeExprBuilder::MakeExpression(node_if, field_result);
+
+ // Build a projector for the expressions.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ // Fatal assertion: `projector` is dereferenced below.
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ int num_records = 4;
+ std::vector<bool> validity({true, true, true, true});
+ auto arraya = MakeArrowArrayInt32({4, 4, 2, 1}, validity);
+ auto arrayb = MakeArrowArrayInt32({5, 3, 3, 1}, validity);
+ auto exp = MakeArrowArrayBool({true, false, true, false}, validity);
+
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {arraya, arrayb});
+
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ // Fatal assertion: outputs.at(0) below needs a populated result vector.
+ ASSERT_TRUE(status.ok()) << status.message();
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+// Checks an AND expression whose two operands are if-expressions sharing the
+// same condition.
+TEST_F(TestBooleanExpr, IfInsideBooleanAnd) {
+ // schema for input fields
+ auto fielda = field("a", int32());
+ auto fieldb = field("b", int32());
+ auto schema = arrow::schema({fielda, fieldb});
+
+ // output fields
+ auto field_result = field("res", boolean());
+
+ // build expression.
+ // (if (a > b) a > 3 else b > 3) && (if (a > b) a > 2 else b > 2)
+
+ auto node_a = TreeExprBuilder::MakeField(fielda);
+ auto node_b = TreeExprBuilder::MakeField(fieldb);
+ auto literal_2 = TreeExprBuilder::MakeLiteral((int32_t)2);
+ auto literal_3 = TreeExprBuilder::MakeLiteral((int32_t)3);
+ auto a_gt_b =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, node_b}, boolean());
+ auto a_gt_2 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, literal_2}, boolean());
+ auto a_gt_3 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, literal_3}, boolean());
+ auto b_gt_2 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_b, literal_2}, boolean());
+ auto b_gt_3 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_b, literal_3}, boolean());
+
+ auto if_3 = TreeExprBuilder::MakeIf(a_gt_b, a_gt_3, b_gt_3, arrow::boolean());
+ auto if_2 = TreeExprBuilder::MakeIf(a_gt_b, a_gt_2, b_gt_2, arrow::boolean());
+ auto node_and = TreeExprBuilder::MakeAnd({if_3, if_2});
+ auto expr = TreeExprBuilder::MakeExpression(node_and, field_result);
+
+ // Build a projector for the expressions.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ // Fatal assertion: `projector` is dereferenced below.
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ int num_records = 4;
+ std::vector<bool> validity({true, true, true, true});
+ auto arraya = MakeArrowArrayInt32({4, 3, 3, 2}, validity);
+ auto arrayb = MakeArrowArrayInt32({3, 4, 2, 3}, validity);
+ // The larger of a and b exceeds 3 only in the first two rows.
+ auto exp = MakeArrowArrayBool({true, true, false, false}, validity);
+
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {arraya, arrayb});
+
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ // Fatal assertion: outputs.at(0) below needs a populated result vector.
+ ASSERT_TRUE(status.ok()) << status.message();
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/date_time_test.cc b/src/arrow/cpp/src/gandiva/tests/date_time_test.cc
new file mode 100644
index 000000000..77139125f
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/date_time_test.cc
@@ -0,0 +1,602 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include <math.h>
+#include <time.h>
+
+#include "arrow/memory_pool.h"
+#include "gandiva/precompiled/time_constants.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::date32;
+using arrow::date64;
+using arrow::float32;
+using arrow::int32;
+using arrow::int64;
+using arrow::timestamp;
+
+// Fixture for the date/time projector tests; it supplies the memory pool that
+// the tests pass to Projector::Evaluate.
+class TestProjector : public ::testing::Test {
+ public:
+ // `override` lets the compiler verify that this really replaces
+ // ::testing::Test::SetUp().
+ void SetUp() override { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+ // Assigned in SetUp(); null until then.
+ arrow::MemoryPool* pool_ = nullptr;
+};
+
+// Returns mktime() of local 1970-01-02 00:00:00 minus 24 hours; the tests use
+// the result as the base line for MillisSince() and DaysSince().
+time_t Epoch() {
+ // HACK: MSVC mktime() fails on UTC times before 1970-01-01 00:00:00, and
+ // mktime() converts its argument from local time to UTC first. Asking for
+ // 1970-01-02 keeps the call valid in timezones ahead of UTC.
+ struct tm day_after_epoch = {};  // all fields start at zero
+ day_after_epoch.tm_year = 70;
+ day_after_epoch.tm_mday = 2;
+
+ const time_t shifted = mktime(&day_after_epoch);
+ if (static_cast<time_t>(-1) == shifted) {
+ ARROW_LOG(FATAL) << "mktime() failed";
+ }
+ // Undo the 24h offset introduced above.
+ return shifted - 24 * 3600;
+}
+
+// Converts a time of day given as hours, minutes, seconds and milliseconds
+// into a single millisecond count.
+int32_t MillisInDay(int32_t hh, int32_t mm, int32_t ss, int32_t millis) {
+ const int32_t total_secs = (hh * 60 + mm) * 60 + ss;
+ return total_secs * 1000 + millis;
+}
+
+// Returns the number of milliseconds between `base_line` and the given
+// calendar date and time, as converted by mktime(), plus `millis`.
+int64_t MillisSince(time_t base_line, int32_t yy, int32_t mm, int32_t dd, int32_t hr,
+ int32_t min, int32_t sec, int32_t millis) {
+ struct tm broken_down = {};  // all fields start at zero
+ broken_down.tm_sec = sec;
+ broken_down.tm_min = min;
+ broken_down.tm_hour = hr;
+ broken_down.tm_mday = dd;
+ broken_down.tm_mon = mm - 1;      // tm_mon counts from 0
+ broken_down.tm_year = yy - 1900;  // tm_year counts from 1900
+
+ const time_t target = mktime(&broken_down);
+ if (static_cast<time_t>(-1) == target) {
+ ARROW_LOG(FATAL) << "mktime() failed";
+ }
+ // time_t is an arithmetic type on both POSIX and Windows, so a plain
+ // subtraction gives the elapsed seconds.
+ const int64_t elapsed_secs = static_cast<int64_t>(target - base_line);
+ return elapsed_secs * 1000 + millis;
+}
+
+// Returns the number of whole days (MILLIS_IN_DAY units) between `base_line`
+// and the given calendar date and time, as converted by mktime().
+// The division truncates toward zero.
+int32_t DaysSince(time_t base_line, int32_t yy, int32_t mm, int32_t dd, int32_t hr,
+ int32_t min, int32_t sec, int32_t millis) {
+ struct tm given_ts;
+ memset(&given_ts, 0, sizeof(struct tm));
+ given_ts.tm_year = (yy - 1900);
+ given_ts.tm_mon = (mm - 1);
+ given_ts.tm_mday = dd;
+ given_ts.tm_hour = hr;
+ given_ts.tm_min = min;
+ given_ts.tm_sec = sec;
+
+ time_t ts = mktime(&given_ts);
+ if (ts == static_cast<time_t>(-1)) {
+ ARROW_LOG(FATAL) << "mktime() failed";
+ }
+ // time_t is an arithmetic type on both POSIX and Windows, we can simply
+ // subtract to get a duration in seconds. Widen to 64 bits before scaling to
+ // milliseconds (as MillisSince does) so the product cannot overflow where
+ // time_t is only 32 bits wide.
+ const int64_t elapsed_millis = static_cast<int64_t>(ts - base_line) * 1000 + millis;
+ return static_cast<int32_t>(elapsed_millis / MILLIS_IN_DAY);
+}
+
+// Checks `isnull` on a date64 column and `isnotnull` on a time32 column that
+// share the same validity pattern.
+TEST_F(TestProjector, TestIsNull) {
+ auto d0 = field("d0", date64());
+ auto t0 = field("t0", time32(arrow::TimeUnit::MILLI));
+ auto schema = arrow::schema({d0, t0});
+
+ // output fields (the same result field is used for both expressions)
+ auto b0 = field("isnull", boolean());
+
+ // isnull(d0) and isnotnull(t0)
+ auto isnull_expr = TreeExprBuilder::MakeExpression("isnull", {d0}, b0);
+ auto isnotnull_expr = TreeExprBuilder::MakeExpression("isnotnull", {t0}, b0);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {isnull_expr, isnotnull_expr},
+ TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok());
+
+ // Both columns are null in rows 0 and 2.
+ int num_records = 4;
+ std::vector<int64_t> d0_data = {0, 100, 0, 1000};
+ auto t0_data = {0, 100, 0, 1000};
+ auto validity = {false, true, false, true};
+ auto d0_array =
+ MakeArrowTypeArray<arrow::Date64Type, int64_t>(date64(), d0_data, validity);
+ auto t0_array = MakeArrowTypeArray<arrow::Time32Type, int32_t>(
+ time32(arrow::TimeUnit::MILLI), t0_data, validity);
+
+ // expected output: isnull is the inverse of the validity pattern, isnotnull
+ // reuses the validity pattern as its values; both outputs are fully valid
+ auto exp_isnull =
+ MakeArrowArrayBool({true, false, true, false}, {true, true, true, true});
+ auto exp_isnotnull = MakeArrowArrayBool(validity, {true, true, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {d0_array, t0_array});
+
+ // Evaluate expression
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ EXPECT_TRUE(status.ok());
+
+ // Validate results (outputs follow the order of the expressions above)
+ EXPECT_ARROW_ARRAY_EQUALS(exp_isnull, outputs.at(0));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_isnotnull, outputs.at(1));
+}
+
+// Checks `isnull` on a date32 column.
+TEST_F(TestProjector, TestDate32IsNull) {
+ auto d0 = field("d0", date32());
+ auto schema = arrow::schema({d0});
+
+ // output fields
+ auto b0 = field("isnull", boolean());
+
+ // isnull(d0)
+ auto isnull_expr = TreeExprBuilder::MakeExpression("isnull", {d0}, b0);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {isnull_expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok());
+
+ // Rows 0 and 2 are null.
+ int num_records = 4;
+ std::vector<int32_t> d0_data = {0, 100, 0, 1000};
+ auto validity = {false, true, false, true};
+ auto d0_array =
+ MakeArrowTypeArray<arrow::Date32Type, int32_t>(date32(), d0_data, validity);
+
+ // expected output: the inverse of the validity pattern, fully valid
+ auto exp_isnull =
+ MakeArrowArrayBool({true, false, true, false}, {true, true, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {d0_array});
+
+ // Evaluate expression
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ EXPECT_TRUE(status.ok());
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_isnull, outputs.at(0));
+}
+
+// Extracts year/month/day components from date64, date32 (via castDATE) and
+// millisecond-timestamp columns and compares them with the calendar values
+// used to build the inputs.
+TEST_F(TestProjector, TestDateTime) {
+ auto field0 = field("f0", date64());
+ auto field1 = field("f1", date32());
+ auto field2 = field("f2", timestamp(arrow::TimeUnit::MILLI));
+ auto schema = arrow::schema({field0, field1, field2});
+
+ // output fields
+ auto field_year = field("yy", int64());
+ auto field_month = field("mm", int64());
+ auto field_day = field("dd", int64());
+
+ // extract year and month from date
+ auto date2year_expr =
+ TreeExprBuilder::MakeExpression("extractYear", {field0}, field_year);
+ auto date2month_expr =
+ TreeExprBuilder::MakeExpression("extractMonth", {field0}, field_month);
+
+ // extract year and month from date32, cast to date64 first
+ auto node_f1 = TreeExprBuilder::MakeField(field1);
+ auto date32_to_date64_func =
+ TreeExprBuilder::MakeFunction("castDATE", {node_f1}, date64());
+
+ auto date64_2year_func =
+ TreeExprBuilder::MakeFunction("extractYear", {date32_to_date64_func}, int64());
+ auto date64_2year_expr = TreeExprBuilder::MakeExpression(date64_2year_func, field_year);
+
+ auto date64_2month_func =
+ TreeExprBuilder::MakeFunction("extractMonth", {date32_to_date64_func}, int64());
+ auto date64_2month_expr =
+ TreeExprBuilder::MakeExpression(date64_2month_func, field_month);
+
+ // extract month and day from timestamp
+ auto ts2month_expr =
+ TreeExprBuilder::MakeExpression("extractMonth", {field2}, field_month);
+ auto ts2day_expr = TreeExprBuilder::MakeExpression("extractDay", {field2}, field_day);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema,
+ {date2year_expr, date2month_expr, date64_2year_expr,
+ date64_2month_expr, ts2month_expr, ts2day_expr},
+ TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ time_t epoch = Epoch();
+ int num_records = 4;
+ auto validity = {true, true, true, true};
+ std::vector<int64_t> field0_data = {MillisSince(epoch, 2000, 1, 1, 5, 0, 0, 0),
+ MillisSince(epoch, 1999, 12, 31, 5, 0, 0, 0),
+ MillisSince(epoch, 2015, 6, 30, 20, 0, 0, 0),
+ MillisSince(epoch, 2015, 7, 1, 20, 0, 0, 0)};
+ auto array0 =
+ MakeArrowTypeArray<arrow::Date64Type, int64_t>(date64(), field0_data, validity);
+
+ std::vector<int32_t> field1_data = {DaysSince(epoch, 2000, 1, 1, 5, 0, 0, 0),
+ DaysSince(epoch, 1999, 12, 31, 5, 0, 0, 0),
+ DaysSince(epoch, 2015, 6, 30, 20, 0, 0, 0),
+ DaysSince(epoch, 2015, 7, 1, 20, 0, 0, 0)};
+ auto array1 =
+ MakeArrowTypeArray<arrow::Date32Type, int32_t>(date32(), field1_data, validity);
+
+ std::vector<int64_t> field2_data = {MillisSince(epoch, 1999, 12, 31, 5, 0, 0, 0),
+ MillisSince(epoch, 2000, 1, 2, 5, 0, 0, 0),
+ MillisSince(epoch, 2015, 7, 1, 1, 0, 0, 0),
+ MillisSince(epoch, 2015, 6, 29, 23, 0, 0, 0)};
+
+ auto array2 = MakeArrowTypeArray<arrow::TimestampType, int64_t>(
+ arrow::timestamp(arrow::TimeUnit::MILLI), field2_data, validity);
+
+ // expected output
+ // date 2 year and date 2 month for date64
+ auto exp_yy_from_date64 = MakeArrowArrayInt64({2000, 1999, 2015, 2015}, validity);
+ auto exp_mm_from_date64 = MakeArrowArrayInt64({1, 12, 6, 7}, validity);
+
+ // date 2 year and date 2 month for date32
+ auto exp_yy_from_date32 = MakeArrowArrayInt64({2000, 1999, 2015, 2015}, validity);
+ auto exp_mm_from_date32 = MakeArrowArrayInt64({1, 12, 6, 7}, validity);
+
+ // ts 2 month and ts 2 day
+ auto exp_mm_from_ts = MakeArrowArrayInt64({12, 1, 7, 6}, validity);
+ auto exp_dd_from_ts = MakeArrowArrayInt64({31, 2, 1, 29}, validity);
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2});
+
+ // Evaluate expression
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ // Fatal assertion: outputs.at(i) below needs a populated result vector.
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results (outputs follow the order passed to Projector::Make)
+ EXPECT_ARROW_ARRAY_EQUALS(exp_yy_from_date64, outputs.at(0));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_mm_from_date64, outputs.at(1));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_yy_from_date32, outputs.at(2));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_mm_from_date32, outputs.at(3));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_mm_from_ts, outputs.at(4));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_dd_from_ts, outputs.at(5));
+}
+
+// Extracts the minute and hour components from a time32 (millisecond) column.
+TEST_F(TestProjector, TestTime) {
+ auto field0 = field("f0", time32(arrow::TimeUnit::MILLI));
+ auto schema = arrow::schema({field0});
+
+ auto field_min = field("mm", int64());
+ auto field_hour = field("hh", int64());
+
+ // extract minute and hour from time32
+ auto time2min_expr =
+ TreeExprBuilder::MakeExpression("extractMinute", {field0}, field_min);
+ auto time2hour_expr =
+ TreeExprBuilder::MakeExpression("extractHour", {field0}, field_hour);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {time2min_expr, time2hour_expr},
+ TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok());
+
+ // create input data
+ int num_records = 4;
+ auto validity = {true, true, true, true};
+ std::vector<int32_t> field_data = {
+ MillisInDay(5, 35, 25, 0), // 5:35:25
+ MillisInDay(0, 59, 0, 0), // 0:59:0
+ MillisInDay(12, 30, 0, 0), // 12:30:0
+ MillisInDay(23, 0, 0, 0) // 23:0:0
+ };
+ auto array = MakeArrowTypeArray<arrow::Time32Type, int32_t>(
+ time32(arrow::TimeUnit::MILLI), field_data, validity);
+
+ // expected output: the minute and hour arguments passed to MillisInDay above
+ auto exp_min = MakeArrowArrayInt64({35, 59, 30, 0}, validity);
+ auto exp_hour = MakeArrowArrayInt64({5, 0, 12, 23}, validity);
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array});
+
+ // Evaluate expression
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ EXPECT_TRUE(status.ok());
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_min, outputs.at(0));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_hour, outputs.at(1));
+}
+
+// Checks the timestampdiff* family (and datediff) on pairs of millisecond
+// timestamps, for every unit from seconds to years.
+TEST_F(TestProjector, TestTimestampDiff) {
+ auto f0 = field("f0", timestamp(arrow::TimeUnit::MILLI));
+ auto f1 = field("f1", timestamp(arrow::TimeUnit::MILLI));
+ auto schema = arrow::schema({f0, f1});
+
+ // output fields (despite its name, this one int32 field is reused as the
+ // result field for every unit below)
+ auto diff_seconds = field("ss", int32());
+
+ // get diff
+ auto diff_secs_expr =
+ TreeExprBuilder::MakeExpression("timestampdiffSecond", {f0, f1}, diff_seconds);
+
+ auto diff_mins_expr =
+ TreeExprBuilder::MakeExpression("timestampdiffMinute", {f0, f1}, diff_seconds);
+
+ auto diff_hours_expr =
+ TreeExprBuilder::MakeExpression("timestampdiffHour", {f0, f1}, diff_seconds);
+
+ auto diff_days_expr =
+ TreeExprBuilder::MakeExpression("timestampdiffDay", {f0, f1}, diff_seconds);
+
+ auto diff_days_expr_with_datediff_fn =
+ TreeExprBuilder::MakeExpression("datediff", {f0, f1}, diff_seconds);
+
+ auto diff_weeks_expr =
+ TreeExprBuilder::MakeExpression("timestampdiffWeek", {f0, f1}, diff_seconds);
+
+ auto diff_months_expr =
+ TreeExprBuilder::MakeExpression("timestampdiffMonth", {f0, f1}, diff_seconds);
+
+ auto diff_quarters_expr =
+ TreeExprBuilder::MakeExpression("timestampdiffQuarter", {f0, f1}, diff_seconds);
+
+ auto diff_years_expr =
+ TreeExprBuilder::MakeExpression("timestampdiffYear", {f0, f1}, diff_seconds);
+
+ // The order here must match the order of exp_output below.
+ std::shared_ptr<Projector> projector;
+ auto exprs = {diff_secs_expr,
+ diff_mins_expr,
+ diff_hours_expr,
+ diff_days_expr,
+ diff_days_expr_with_datediff_fn,
+ diff_weeks_expr,
+ diff_months_expr,
+ diff_quarters_expr,
+ diff_years_expr};
+ auto status = Projector::Make(schema, exprs, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok());
+
+ time_t epoch = Epoch();
+
+ // 2015-09-10T20:49:42.000
+ auto start_millis = MillisSince(epoch, 2015, 9, 10, 20, 49, 42, 0);
+ // 2017-03-30T22:50:59.050
+ auto end_millis = MillisSince(epoch, 2017, 3, 30, 22, 50, 59, 50);
+ std::vector<int64_t> f0_data = {start_millis, end_millis,
+ // 2015-09-10T20:49:42.999
+ start_millis + 999,
+ // 2015-09-10T20:49:42.999
+ MillisSince(epoch, 2015, 9, 10, 20, 49, 42, 999)};
+ std::vector<int64_t> f1_data = {end_millis, start_millis,
+ // 2015-09-10T20:49:42.999
+ start_millis + 999,
+ // 2015-09-09T21:49:42.999 (23 hours behind)
+ MillisSince(epoch, 2015, 9, 9, 21, 49, 42, 999)};
+
+ int64_t num_records = f0_data.size();
+ std::vector<bool> validity(num_records, true);
+ auto array0 = MakeArrowTypeArray<arrow::TimestampType, int64_t>(
+ arrow::timestamp(arrow::TimeUnit::MILLI), f0_data, validity);
+ auto array1 = MakeArrowTypeArray<arrow::TimestampType, int64_t>(
+ arrow::timestamp(arrow::TimeUnit::MILLI), f1_data, validity);
+
+ // expected output, one array per expression in `exprs` order. The sign
+ // follows f1 - f0: positive when f1 is later, negative when it is earlier.
+ std::vector<ArrayPtr> exp_output;
+ exp_output.push_back(
+ MakeArrowArrayInt32({48996077, -48996077, 0, -23 * 3600}, validity));
+ exp_output.push_back(MakeArrowArrayInt32({816601, -816601, 0, -23 * 60}, validity));
+ exp_output.push_back(MakeArrowArrayInt32({13610, -13610, 0, -23}, validity));
+ exp_output.push_back(MakeArrowArrayInt32({567, -567, 0, 0}, validity));
+ exp_output.push_back(MakeArrowArrayInt32({567, -567, 0, 0}, validity));
+ exp_output.push_back(MakeArrowArrayInt32({81, -81, 0, 0}, validity));
+ exp_output.push_back(MakeArrowArrayInt32({18, -18, 0, 0}, validity));
+ exp_output.push_back(MakeArrowArrayInt32({6, -6, 0, 0}, validity));
+ exp_output.push_back(MakeArrowArrayInt32({1, -1, 0, 0}, validity));
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate expression
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ EXPECT_TRUE(status.ok());
+
+ // Validate results
+ for (uint32_t i = 0; i < exp_output.size(); i++) {
+ EXPECT_ARROW_ARRAY_EQUALS(exp_output.at(i), outputs.at(i));
+ }
+}
+
+// Checks timestampdiffMonth on date pairs that sit on or near month ends,
+// including February in a leap year (2020) and a non-leap year (2019).
+TEST_F(TestProjector, TestTimestampDiffMonth) {
+ auto f0 = field("f0", timestamp(arrow::TimeUnit::MILLI));
+ auto f1 = field("f1", timestamp(arrow::TimeUnit::MILLI));
+ auto schema = arrow::schema({f0, f1});
+
+ // output fields
+ auto diff_seconds = field("ss", int32());
+
+ auto diff_months_expr =
+ TreeExprBuilder::MakeExpression("timestampdiffMonth", {f0, f1}, diff_seconds);
+
+ std::shared_ptr<Projector> projector;
+ auto status =
+ Projector::Make(schema, {diff_months_expr}, TestConfiguration(), &projector);
+ // Report the status message only on failure instead of printing it
+ // unconditionally to stdout.
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ time_t epoch = Epoch();
+
+ // Create a row-batch with some sample data
+ std::vector<int64_t> f0_data = {MillisSince(epoch, 2019, 1, 31, 0, 0, 0, 0),
+ MillisSince(epoch, 2020, 1, 31, 0, 0, 0, 0),
+ MillisSince(epoch, 2020, 1, 31, 0, 0, 0, 0),
+ MillisSince(epoch, 2019, 3, 31, 0, 0, 0, 0),
+ MillisSince(epoch, 2020, 3, 30, 0, 0, 0, 0),
+ MillisSince(epoch, 2020, 5, 31, 0, 0, 0, 0)};
+ std::vector<int64_t> f1_data = {MillisSince(epoch, 2019, 2, 28, 0, 0, 0, 0),
+ MillisSince(epoch, 2020, 2, 28, 0, 0, 0, 0),
+ MillisSince(epoch, 2020, 2, 29, 0, 0, 0, 0),
+ MillisSince(epoch, 2019, 4, 30, 0, 0, 0, 0),
+ MillisSince(epoch, 2020, 2, 29, 0, 0, 0, 0),
+ MillisSince(epoch, 2020, 9, 30, 0, 0, 0, 0)};
+ int64_t num_records = f0_data.size();
+ std::vector<bool> validity(num_records, true);
+
+ auto array0 = MakeArrowTypeArray<arrow::TimestampType, int64_t>(
+ arrow::timestamp(arrow::TimeUnit::MILLI), f0_data, validity);
+ auto array1 = MakeArrowTypeArray<arrow::TimestampType, int64_t>(
+ arrow::timestamp(arrow::TimeUnit::MILLI), f1_data, validity);
+
+ // expected output: Jan 31 -> Feb 28 counts as one month in 2019 but as zero
+ // in 2020, where Feb 29 is needed to complete the month
+ std::vector<ArrayPtr> exp_output;
+ exp_output.push_back(MakeArrowArrayInt32({1, 0, 1, 1, -1, 4}, validity));
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate expression
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ // Fatal assertion: outputs.at(i) below needs a populated result vector.
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ for (uint32_t i = 0; i < exp_output.size(); i++) {
+ EXPECT_ARROW_ARRAY_EQUALS(exp_output.at(i), outputs.at(i));
+ }
+}
+
+TEST_F(TestProjector, TestMonthsBetween) {
+ auto f0 = field("f0", arrow::date64());
+ auto f1 = field("f1", arrow::date64());
+ auto schema = arrow::schema({f0, f1});
+
+ // output fields
+ auto output = field("out", arrow::float64());
+
+ auto months_between_expr =
+ TreeExprBuilder::MakeExpression("months_between", {f0, f1}, output);
+
+ std::shared_ptr<Projector> projector;
+ auto status =
+ Projector::Make(schema, {months_between_expr}, TestConfiguration(), &projector);
+ // Report the status message only if building the projector fails.
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ time_t epoch = Epoch();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+ auto validity = {true, true, true, true};
+ std::vector<int64_t> f0_data = {MillisSince(epoch, 1995, 3, 2, 0, 0, 0, 0),
+ MillisSince(epoch, 1995, 2, 2, 0, 0, 0, 0),
+ MillisSince(epoch, 1995, 3, 31, 0, 0, 0, 0),
+ MillisSince(epoch, 1996, 3, 31, 0, 0, 0, 0)};
+
+ auto array0 =
+ MakeArrowTypeArray<arrow::Date64Type, int64_t>(date64(), f0_data, validity);
+
+ std::vector<int64_t> f1_data = {MillisSince(epoch, 1995, 2, 2, 0, 0, 0, 0),
+ MillisSince(epoch, 1995, 3, 2, 0, 0, 0, 0),
+ MillisSince(epoch, 1995, 2, 28, 0, 0, 0, 0),
+ MillisSince(epoch, 1996, 2, 29, 0, 0, 0, 0)};
+
+ auto array1 =
+ MakeArrowTypeArray<arrow::Date64Type, int64_t>(date64(), f1_data, validity);
+
+ // expected output: positive when f0 is later than f1; month-end pairs count as 1.0
+ auto exp_output = MakeArrowArrayFloat64({1.0, -1.0, 1.0, 1.0}, validity);
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate expression; stop here on failure since `outputs` may then be empty
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_output, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestLastDay) {
+ auto f0 = field("f0", arrow::date64());
+ auto schema = arrow::schema({f0});
+
+ // output fields
+ auto output = field("out", arrow::date64());
+
+ auto last_day_expr = TreeExprBuilder::MakeExpression("last_day", {f0}, output);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {last_day_expr}, TestConfiguration(), &projector);
+ // Report the status message only if building the projector fails.
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ time_t epoch = Epoch();
+
+ // Create a row-batch with some sample data
+ // 2016 is a leap year (Feb 29 expected); 2017 covers the non-leap February.
+ int num_records = 5;
+ auto validity = {true, true, true, true, true};
+ std::vector<int64_t> f0_data = {MillisSince(epoch, 2016, 2, 3, 8, 20, 10, 34),
+ MillisSince(epoch, 2016, 2, 29, 23, 59, 59, 59),
+ MillisSince(epoch, 2016, 1, 30, 1, 15, 20, 0),
+ MillisSince(epoch, 2017, 2, 3, 23, 15, 20, 0),
+ MillisSince(epoch, 2015, 12, 30, 22, 50, 11, 0)};
+
+ auto array0 =
+ MakeArrowTypeArray<arrow::Date64Type, int64_t>(date64(), f0_data, validity);
+
+ std::vector<int64_t> f0_output_data = {MillisSince(epoch, 2016, 2, 29, 0, 0, 0, 0),
+ MillisSince(epoch, 2016, 2, 29, 0, 0, 0, 0),
+ MillisSince(epoch, 2016, 1, 31, 0, 0, 0, 0),
+ MillisSince(epoch, 2017, 2, 28, 0, 0, 0, 0),
+ MillisSince(epoch, 2015, 12, 31, 0, 0, 0, 0)};
+
+ // expected output: midnight of the last day of each input's month
+ auto exp_output = MakeArrowArrayDate64(f0_output_data, validity);
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ // Evaluate expression; stop here on failure since `outputs` may then be empty
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_output, outputs.at(0));
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/decimal_single_test.cc b/src/arrow/cpp/src/gandiva/tests/decimal_single_test.cc
new file mode 100644
index 000000000..666ee4a68
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/decimal_single_test.cc
@@ -0,0 +1,305 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <sstream>
+
+#include <gtest/gtest.h>
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+
+#include "gandiva/decimal_scalar.h"
+#include "gandiva/decimal_type_util.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+using arrow::Decimal128;
+
+namespace gandiva {
+
+// Reports op, operands, expected and actual on mismatch; the caller supplies the ';'.
+#define EXPECT_DECIMAL_RESULT(op, x, y, expected, actual) \
+ EXPECT_EQ(expected, actual) << op << " (" << (x).ToString() << "),(" << (y).ToString() \
+ << ") expected : " << (expected).ToString() \
+ << " actual : " << (actual).ToString()
+
+DecimalScalar128 decimal_literal(const char* value, int precision, int scale) {
+ const std::string digits(value);  // unscaled digits, e.g. "201" with scale 3 is 0.201
+ return DecimalScalar128(digits, precision, scale);
+}
+
+class TestDecimalOps : public ::testing::Test {
+ public:
+ void SetUp() override { pool_ = arrow::default_memory_pool(); }
+ // Returns a one-element decimal array holding `in`.
+ ArrayPtr MakeDecimalVector(const DecimalScalar128& in);
+ // Evaluates `function`(x, y) through a Projector and compares against `expected`.
+ void Verify(DecimalTypeUtil::Op, const std::string& function, const DecimalScalar128& x,
+ const DecimalScalar128& y, const DecimalScalar128& expected);
+ // Per-operation shorthands: each forwards to Verify() with its Op and function name.
+ void AddAndVerify(const DecimalScalar128& x, const DecimalScalar128& y,
+ const DecimalScalar128& expected) {
+ Verify(DecimalTypeUtil::kOpAdd, "add", x, y, expected);
+ }
+
+ void SubtractAndVerify(const DecimalScalar128& x, const DecimalScalar128& y,
+ const DecimalScalar128& expected) {
+ Verify(DecimalTypeUtil::kOpSubtract, "subtract", x, y, expected);
+ }
+
+ void MultiplyAndVerify(const DecimalScalar128& x, const DecimalScalar128& y,
+ const DecimalScalar128& expected) {
+ Verify(DecimalTypeUtil::kOpMultiply, "multiply", x, y, expected);
+ }
+
+ void DivideAndVerify(const DecimalScalar128& x, const DecimalScalar128& y,
+ const DecimalScalar128& expected) {
+ Verify(DecimalTypeUtil::kOpDivide, "divide", x, y, expected);
+ }
+
+ void ModAndVerify(const DecimalScalar128& x, const DecimalScalar128& y,
+ const DecimalScalar128& expected) {
+ Verify(DecimalTypeUtil::kOpMod, "mod", x, y, expected);
+ }
+
+ protected:
+ arrow::MemoryPool* pool_;
+};
+
+ArrayPtr TestDecimalOps::MakeDecimalVector(const DecimalScalar128& in) {
+ // Wraps the scalar's value in a single-element, valid decimal array whose type
+ // carries the scalar's own precision and scale.
+ Decimal128 decimal_value = in.value();
+
+ auto decimal_type = std::make_shared<arrow::Decimal128Type>(in.precision(), in.scale());
+ return MakeArrowArrayDecimal(decimal_type, {decimal_value}, {true});
+}
+
+void TestDecimalOps::Verify(DecimalTypeUtil::Op op, const std::string& function,
+ const DecimalScalar128& x, const DecimalScalar128& y,
+ const DecimalScalar128& expected) {
+ // Runs `function`(x, y) on a one-row batch and compares the result with `expected`.
+ auto x_type = std::make_shared<arrow::Decimal128Type>(x.precision(), x.scale());
+ auto y_type = std::make_shared<arrow::Decimal128Type>(y.precision(), y.scale());
+ auto field_x = field("x", x_type);
+ auto field_y = field("y", y_type);
+ auto schema = arrow::schema({field_x, field_y});
+ Decimal128TypePtr output_type;
+ auto status = DecimalTypeUtil::GetResultType(op, {x_type, y_type}, &output_type);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // output fields
+ auto res = field("res", output_type);
+
+ // build expression : x op y
+ auto expr = TreeExprBuilder::MakeExpression(function, {field_x, field_y}, res);
+
+ // Build a projector for the expression; fatal on failure as it is dereferenced below.
+ std::shared_ptr<Projector> projector;
+ status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ auto array_a = MakeDecimalVector(x);
+ auto array_b = MakeDecimalVector(y);
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, 1 /*num_records*/, {array_a, array_b});
+
+ // Evaluate expression; fatal on failure because outputs[0] is read next.
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results: rebuild a DecimalScalar128 from the single output value.
+ auto out_array = dynamic_cast<arrow::Decimal128Array*>(outputs[0].get());
+ ASSERT_TRUE(out_array != nullptr);
+ const Decimal128 out_value(out_array->GetValue(0));
+ auto dtype = dynamic_cast<arrow::Decimal128Type*>(out_array->type().get());
+ std::string value_string = out_value.ToString(0);
+ DecimalScalar128 actual{value_string, dtype->precision(), dtype->scale()};
+
+ EXPECT_DECIMAL_RESULT(function, x, y, expected, actual);
+}
+
+TEST_F(TestDecimalOps, TestAdd) {
+ // fast-path: precision-30 operands; the expected result precision is 31 or 32
+ AddAndVerify(decimal_literal("201", 30, 3), // x
+ decimal_literal("301", 30, 3), // y
+ decimal_literal("502", 31, 3)); // expected
+
+ AddAndVerify(decimal_literal("201", 30, 3), // x
+ decimal_literal("301", 30, 2), // y
+ decimal_literal("3211", 32, 3)); // expected
+
+ AddAndVerify(decimal_literal("201", 30, 3), // x
+ decimal_literal("301", 30, 4), // y
+ decimal_literal("2311", 32, 4)); // expected
+
+ // operands declared at max precision (38), but the values do not overflow
+ AddAndVerify(decimal_literal("201", 38, 3), // x
+ decimal_literal("301", 38, 3), // y
+ decimal_literal("502", 38, 3)); // expected
+
+ AddAndVerify(decimal_literal("201", 38, 3), // x
+ decimal_literal("301", 38, 2), // y
+ decimal_literal("3211", 38, 3)); // expected
+
+ AddAndVerify(decimal_literal("201", 38, 3), // x
+ decimal_literal("301", 38, 4), // y
+ decimal_literal("2311", 38, 4)); // expected
+
+ AddAndVerify(decimal_literal("201", 38, 3), // x
+ decimal_literal("301", 38, 7), // y
+ decimal_literal("201030", 38, 6)); // expected
+
+ AddAndVerify(decimal_literal("1201", 38, 3), // x
+ decimal_literal("1801", 38, 3), // y
+ decimal_literal("3002", 38, 3)); // carry-over from fractional
+
+ // max precision: 38-digit operands, every sign combination; expected scale is 6
+ AddAndVerify(decimal_literal("09999999999999999999999999999999000000", 38, 5), // x
+ decimal_literal("100", 38, 7), // y
+ decimal_literal("99999999999999999999999999999990000010", 38, 6));
+
+ AddAndVerify(decimal_literal("-09999999999999999999999999999999000000", 38, 5), // x
+ decimal_literal("100", 38, 7), // y
+ decimal_literal("-99999999999999999999999999999989999990", 38, 6));
+
+ AddAndVerify(decimal_literal("09999999999999999999999999999999000000", 38, 5), // x
+ decimal_literal("-100", 38, 7), // y
+ decimal_literal("99999999999999999999999999999989999990", 38, 6));
+
+ AddAndVerify(decimal_literal("-09999999999999999999999999999999000000", 38, 5), // x
+ decimal_literal("-100", 38, 7), // y
+ decimal_literal("-99999999999999999999999999999990000010", 38, 6));
+
+ AddAndVerify(decimal_literal("09999999999999999999999999999999999999", 38, 6), // x
+ decimal_literal("89999999999999999999999999999999999999", 38, 7), // y
+ decimal_literal("18999999999999999999999999999999999999", 38, 6));
+
+ // both operands negative
+ AddAndVerify(decimal_literal("-201", 30, 3), // x
+ decimal_literal("-301", 30, 2), // y
+ decimal_literal("-3211", 32, 3)); // expected
+
+ AddAndVerify(decimal_literal("-201", 38, 3), // x
+ decimal_literal("-301", 38, 4), // y
+ decimal_literal("-2311", 38, 4)); // expected
+
+ // one positive and one negative operand
+ AddAndVerify(decimal_literal("-201", 30, 3), // x
+ decimal_literal("301", 30, 2), // y
+ decimal_literal("2809", 32, 3)); // expected
+
+ AddAndVerify(decimal_literal("-201", 38, 3), // x
+ decimal_literal("301", 38, 4), // y
+ decimal_literal("-1709", 38, 4)); // expected
+
+ AddAndVerify(decimal_literal("201", 38, 3), // x
+ decimal_literal("-301", 38, 7), // y
+ decimal_literal("200970", 38, 6)); // expected
+
+ AddAndVerify(decimal_literal("-1901", 38, 4), // x
+ decimal_literal("1801", 38, 4), // y
+ decimal_literal("-100", 38, 4)); // expected
+
+ AddAndVerify(decimal_literal("1801", 38, 4), // x
+ decimal_literal("-1901", 38, 4), // y
+ decimal_literal("-100", 38, 4)); // expected
+
+ // rounding, positive: scale-7 digit dropped to reach scale 6 (9 and 5 round up, 2 down)
+ AddAndVerify(decimal_literal("1000999", 38, 6), // x
+ decimal_literal("10000999", 38, 7), // y
+ decimal_literal("2001099", 38, 6));
+
+ AddAndVerify(decimal_literal("1000999", 38, 6), // x
+ decimal_literal("10000995", 38, 7), // y
+ decimal_literal("2001099", 38, 6));
+
+ AddAndVerify(decimal_literal("1000999", 38, 6), // x
+ decimal_literal("10000992", 38, 7), // y
+ decimal_literal("2001098", 38, 6));
+
+ // rounding, negative: same cases mirrored, so 9 and 5 round away from zero
+ AddAndVerify(decimal_literal("-1000999", 38, 6), // x
+ decimal_literal("-10000999", 38, 7), // y
+ decimal_literal("-2001099", 38, 6));
+
+ AddAndVerify(decimal_literal("-1000999", 38, 6), // x
+ decimal_literal("-10000995", 38, 7), // y
+ decimal_literal("-2001099", 38, 6));
+
+ AddAndVerify(decimal_literal("-1000999", 38, 6), // x
+ decimal_literal("-10000992", 38, 7), // y
+ decimal_literal("-2001098", 38, 6));
+}
+
+// Subtract is a wrapper over add, so a minimal set of cases is sufficient.
+TEST_F(TestDecimalOps, TestSubtract) {
+ // fast-path: precision-30 operands, expected result precision 31
+ SubtractAndVerify(decimal_literal("201", 30, 3), // x
+ decimal_literal("301", 30, 3), // y
+ decimal_literal("-100", 31, 3)); // expected
+
+ // max precision: 38-digit x; expected result at scale 6
+ SubtractAndVerify(
+ decimal_literal("09999999999999999999999999999999000000", 38, 5), // x
+ decimal_literal("100", 38, 7), // y
+ decimal_literal("99999999999999999999999999999989999990", 38, 6));
+
+ // negative x minus positive y
+ SubtractAndVerify(decimal_literal("-201", 30, 3), // x
+ decimal_literal("301", 30, 2), // y
+ decimal_literal("-3211", 32, 3)); // expected
+}
+
+// Multiply/divide/mod have many unit tests in decimal_ops_test.cc, so these are kept
+// basic.
+TEST_F(TestDecimalOps, TestMultiply) {
+ // fast-path: precision-10 operands, expected result type (21, 5)
+ MultiplyAndVerify(decimal_literal("201", 10, 3), // x
+ decimal_literal("301", 10, 2), // y
+ decimal_literal("60501", 21, 5)); // expected
+
+ // max precision: 35- and 36-digit operands at (38, 20); expected result at scale 6
+ MultiplyAndVerify(DecimalScalar128(std::string(35, '9'), 38, 20), // x
+ DecimalScalar128(std::string(36, '9'), 38, 20), // y
+ DecimalScalar128("9999999999999999999999999999999999890", 38, 6));
+}
+
+TEST_F(TestDecimalOps, TestDivide) {
+ DivideAndVerify(decimal_literal("201", 10, 3), // x
+ decimal_literal("301", 10, 2), // y
+ decimal_literal("6677740863787", 23, 14)); // expected
+
+ DivideAndVerify(DecimalScalar128(std::string(38, '9'), 38, 20), // x
+ DecimalScalar128(std::string(35, '9'), 38, 20), // y
+ DecimalScalar128("1000000000", 38, 6)); // expected
+}
+
+TEST_F(TestDecimalOps, TestMod) {
+ ModAndVerify(decimal_literal("201", 20, 2), // x
+ decimal_literal("301", 20, 3), // y
+ decimal_literal("204", 20, 3)); // expected
+
+ ModAndVerify(DecimalScalar128(std::string(38, '9'), 38, 20), // x
+ DecimalScalar128(std::string(35, '9'), 38, 21), // y
+ DecimalScalar128("9990", 38, 21)); // expected
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/decimal_test.cc b/src/arrow/cpp/src/gandiva/tests/decimal_test.cc
new file mode 100644
index 000000000..31f2dedf5
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/decimal_test.cc
@@ -0,0 +1,1194 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <sstream>
+
+#include <gtest/gtest.h>
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/decimal.h"
+
+#include "gandiva/decimal_type_util.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+using arrow::boolean;
+using arrow::Decimal128;
+using arrow::utf8;
+
+namespace gandiva {
+
+class TestDecimal : public ::testing::Test {
+ public:
+ void SetUp() override { pool_ = arrow::default_memory_pool(); }
+ // Parses each string as a decimal and rescales it to `scale`.
+ std::vector<Decimal128> MakeDecimalVector(std::vector<std::string> values,
+ int32_t scale);
+
+ protected:
+ arrow::MemoryPool* pool_;
+};
+
+std::vector<Decimal128> TestDecimal::MakeDecimalVector(std::vector<std::string> values,
+ int32_t scale) {
+ // Parses every string in `values` and returns the values rescaled to `scale`.
+ std::vector<arrow::Decimal128> ret;
+ for (const auto& str : values) {  // by reference: avoids copying each string
+ Decimal128 str_value;
+ int32_t str_precision;
+ int32_t str_scale;
+ DCHECK_OK(Decimal128::FromString(str, &str_value, &str_precision, &str_scale));
+
+ Decimal128 scaled_value;
+ if (str_scale == scale) {
+ scaled_value = str_value;
+ } else {
+ scaled_value = str_value.Rescale(str_scale, scale).ValueOrDie();
+ }
+ ret.push_back(scaled_value);
+ }
+ return ret;
+}
+
+TEST_F(TestDecimal, TestSimple) {
+ // schema for input fields: three decimal(36, 18) columns
+ constexpr int32_t precision = 36;
+ constexpr int32_t scale = 18;
+ auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+ auto field_a = field("a", decimal_type);
+ auto field_b = field("b", decimal_type);
+ auto field_c = field("c", decimal_type);
+ auto schema = arrow::schema({field_a, field_b, field_c});
+
+ Decimal128TypePtr add2_type;
+ auto status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd,
+ {decimal_type, decimal_type}, &add2_type);
+ ASSERT_TRUE(status.ok()) << status.message();
+ Decimal128TypePtr output_type;
+ status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd,
+ {add2_type, decimal_type}, &output_type);
+ ASSERT_TRUE(status.ok()) << status.message();
+ // output fields
+ auto res = field("res0", output_type);
+
+ // build expression : a + b + c
+ auto node_a = TreeExprBuilder::MakeField(field_a);
+ auto node_b = TreeExprBuilder::MakeField(field_b);
+ auto node_c = TreeExprBuilder::MakeField(field_c);
+ auto add2 = TreeExprBuilder::MakeFunction("add", {node_a, node_b}, add2_type);
+ auto add3 = TreeExprBuilder::MakeFunction("add", {add2, node_c}, output_type);
+ auto expr = TreeExprBuilder::MakeExpression(add3, res);
+
+ // Build a projector for the expression; fatal on failure as it is dereferenced below.
+ std::shared_ptr<Projector> projector;
+ status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+ auto array_a =
+ MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"1", "2", "3", "4"}, scale),
+ {false, true, true, true});
+ auto array_b =
+ MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"2", "3", "4", "5"}, scale),
+ {false, true, true, true});
+ auto array_c =
+ MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"3", "4", "5", "6"}, scale),
+ {true, true, true, true});
+
+ // prepare input record batch
+ auto in_batch =
+ arrow::RecordBatch::Make(schema, num_records, {array_a, array_b, array_c});
+ // expected output: row 0 is null because a and b are null in that row
+ auto expected =
+ MakeArrowArrayDecimal(output_type, MakeDecimalVector({"6", "9", "12", "15"}, scale),
+ {false, true, true, true});
+
+ // Evaluate expression; fatal on failure because outputs[0] is read next.
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(expected, outputs[0]);
+}
+
+TEST_F(TestDecimal, TestLiteral) {
+ // schema for input fields: one decimal(36, 18) column
+ constexpr int32_t precision = 36;
+ constexpr int32_t scale = 18;
+ auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+ auto field_a = field("a", decimal_type);
+ auto schema = arrow::schema({
+ field_a,
+ });
+
+ Decimal128TypePtr add2_type;
+ auto status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd,
+ {decimal_type, decimal_type}, &add2_type);
+ ASSERT_TRUE(status.ok()) << status.message();
+ // output fields
+ auto res = field("res0", add2_type);
+
+ // build expression : a + 0.6 (the literal is "6" with precision 2, scale 1)
+ auto node_a = TreeExprBuilder::MakeField(field_a);
+ static std::string decimal_point_six = "6";
+ DecimalScalar128 literal(decimal_point_six, 2, 1);
+ auto node_b = TreeExprBuilder::MakeDecimalLiteral(literal);
+ auto add2 = TreeExprBuilder::MakeFunction("add", {node_a, node_b}, add2_type);
+ auto expr = TreeExprBuilder::MakeExpression(add2, res);
+
+ // Build a projector for the expression; fatal on failure as it is dereferenced below.
+ std::shared_ptr<Projector> projector;
+ status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+ auto array_a =
+ MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"1", "2", "3", "4"}, scale),
+ {false, true, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+ auto expected = MakeArrowArrayDecimal(
+ add2_type, MakeDecimalVector({"1.6", "2.6", "3.6", "4.6"}, scale),
+ {false, true, true, true});
+
+ // Evaluate expression; fatal on failure because outputs[0] is read next.
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(expected, outputs[0]);
+}
+
+TEST_F(TestDecimal, TestIfElse) {
+ // schema for input fields: two decimal(36, 18) columns and a boolean selector
+ constexpr int32_t precision = 36;
+ constexpr int32_t scale = 18;
+ auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+ auto field_a = field("a", decimal_type);
+ auto field_b = field("b", decimal_type);
+ auto field_c = field("c", arrow::boolean());
+ auto schema = arrow::schema({field_a, field_b, field_c});
+
+ // output fields
+ auto field_result = field("res", decimal_type);
+
+ // build expression.
+ // if (c)
+ // a
+ // else
+ // b
+ auto node_a = TreeExprBuilder::MakeField(field_a);
+ auto node_b = TreeExprBuilder::MakeField(field_b);
+ auto node_c = TreeExprBuilder::MakeField(field_c);
+ auto if_node = TreeExprBuilder::MakeIf(node_c, node_a, node_b, decimal_type);
+
+ auto expr = TreeExprBuilder::MakeExpression(if_node, field_result);
+
+ // Build a projector for the expressions; fatal on failure as it is dereferenced below.
+ std::shared_ptr<Projector> projector;
+ Status status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+ auto array_a =
+ MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"1", "2", "3", "4"}, scale),
+ {false, true, true, true});
+ auto array_b =
+ MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"2", "3", "4", "5"}, scale),
+ {true, true, true, true});
+
+ auto array_c = MakeArrowArrayBool({true, false, true, false}, {true, true, true, true});
+
+ // expected output: row 0 selects a, which is null there, so the result is null
+ auto exp =
+ MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"0", "3", "3", "5"}, scale),
+ {false, true, true, true});
+
+ // prepare input record batch
+ auto in_batch =
+ arrow::RecordBatch::Make(schema, num_records, {array_a, array_b, array_c});
+
+ // Evaluate expression; fatal on failure because outputs.at(0) is read next.
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+TEST_F(TestDecimal, TestCompare) {
+ // schema for input fields: two decimal(36, 18) columns
+ constexpr int32_t precision = 36;
+ constexpr int32_t scale = 18;
+ auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+ auto field_a = field("a", decimal_type);
+ auto field_b = field("b", decimal_type);
+ auto schema = arrow::schema({field_a, field_b});
+
+ // build expressions: one boolean output per comparison operator
+ auto exprs = std::vector<ExpressionPtr>{
+ TreeExprBuilder::MakeExpression("equal", {field_a, field_b},
+ field("res_eq", boolean())),
+ TreeExprBuilder::MakeExpression("not_equal", {field_a, field_b},
+ field("res_ne", boolean())),
+ TreeExprBuilder::MakeExpression("less_than", {field_a, field_b},
+ field("res_lt", boolean())),
+ TreeExprBuilder::MakeExpression("less_than_or_equal_to", {field_a, field_b},
+ field("res_le", boolean())),
+ TreeExprBuilder::MakeExpression("greater_than", {field_a, field_b},
+ field("res_gt", boolean())),
+ TreeExprBuilder::MakeExpression("greater_than_or_equal_to", {field_a, field_b},
+ field("res_ge", boolean())),
+ };
+
+ // Build a projector for the expressions; fatal on failure as it is dereferenced below.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, exprs, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+ auto array_a =
+ MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"1", "2", "3", "-4"}, scale),
+ {true, true, true, true});
+ auto array_b =
+ MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"1", "3", "2", "-3"}, scale),
+ {true, true, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b});
+
+ // Evaluate expression; fatal on failure because outputs[0..5] are read next.
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results (outputs are in the same order as `exprs`)
+ EXPECT_ARROW_ARRAY_EQUALS(MakeArrowArrayBool({true, false, false, false}),
+ outputs[0]); // equal
+ EXPECT_ARROW_ARRAY_EQUALS(MakeArrowArrayBool({false, true, true, true}),
+ outputs[1]); // not_equal
+ EXPECT_ARROW_ARRAY_EQUALS(MakeArrowArrayBool({false, true, false, true}),
+ outputs[2]); // less_than
+ EXPECT_ARROW_ARRAY_EQUALS(MakeArrowArrayBool({true, true, false, true}),
+ outputs[3]); // less_than_or_equal_to
+ EXPECT_ARROW_ARRAY_EQUALS(MakeArrowArrayBool({false, false, true, false}),
+ outputs[4]); // greater_than
+ EXPECT_ARROW_ARRAY_EQUALS(MakeArrowArrayBool({true, false, true, false}),
+ outputs[5]); // greater_than_or_equal_to
+}
+
+// ARROW-9092: This test is conditionally disabled when building with LLVM 9
+// because it hangs.
+#if GANDIVA_LLVM_VERSION != 9
+
+TEST_F(TestDecimal, TestRoundFunctions) {
+ // schema for input fields: a single decimal(38, 2) column
+ constexpr int32_t precision = 38;
+ constexpr int32_t scale = 2;
+ auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+ auto field_a = field("a", decimal_type);
+ auto schema = arrow::schema({field_a});
+
+ auto scale_1 = TreeExprBuilder::MakeLiteral(1);
+
+ // build expressions: five one-argument forms, then round/truncate to scale 1
+ auto exprs = std::vector<ExpressionPtr>{
+ TreeExprBuilder::MakeExpression("abs", {field_a}, field("res_abs", decimal_type)),
+ TreeExprBuilder::MakeExpression("ceil", {field_a},
+ field("res_ceil", arrow::decimal(precision, 0))),
+ TreeExprBuilder::MakeExpression("floor", {field_a},
+ field("res_floor", arrow::decimal(precision, 0))),
+ TreeExprBuilder::MakeExpression("round", {field_a},
+ field("res_round", arrow::decimal(precision, 0))),
+ TreeExprBuilder::MakeExpression(
+ "truncate", {field_a}, field("res_truncate", arrow::decimal(precision, 0))),
+
+ TreeExprBuilder::MakeExpression(
+ TreeExprBuilder::MakeFunction("round",
+ {TreeExprBuilder::MakeField(field_a), scale_1},
+ arrow::decimal(precision, 1)),
+ field("res_round_3", arrow::decimal(precision, 1))),
+
+ TreeExprBuilder::MakeExpression(
+ TreeExprBuilder::MakeFunction("truncate",
+ {TreeExprBuilder::MakeField(field_a), scale_1},
+ arrow::decimal(precision, 1)),
+ field("res_truncate_3", arrow::decimal(precision, 1))),
+ };
+
+ // Build a projector for the expressions; fatal on failure as it is dereferenced below.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, exprs, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+ auto validity = {true, true, true, true};
+ auto array_a = MakeArrowArrayDecimal(
+ decimal_type, MakeDecimalVector({"1.23", "1.58", "-1.23", "-1.58"}, scale),
+ validity);
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+ // Evaluate expression; fatal on failure because outputs[0..6] are read next.
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results (outputs are in the same order as `exprs`)
+
+ // abs(x)
+ EXPECT_ARROW_ARRAY_EQUALS(
+ MakeArrowArrayDecimal(decimal_type,
+ MakeDecimalVector({"1.23", "1.58", "1.23", "1.58"}, scale),
+ validity),
+ outputs[0]);
+
+ // ceil(x)
+ EXPECT_ARROW_ARRAY_EQUALS(
+ MakeArrowArrayDecimal(arrow::decimal(precision, 0),
+ MakeDecimalVector({"2", "2", "-1", "-1"}, 0), validity),
+ outputs[1]);
+
+ // floor(x)
+ EXPECT_ARROW_ARRAY_EQUALS(
+ MakeArrowArrayDecimal(arrow::decimal(precision, 0),
+ MakeDecimalVector({"1", "1", "-2", "-2"}, 0), validity),
+ outputs[2]);
+
+ // round(x)
+ EXPECT_ARROW_ARRAY_EQUALS(
+ MakeArrowArrayDecimal(arrow::decimal(precision, 0),
+ MakeDecimalVector({"1", "2", "-1", "-2"}, 0), validity),
+ outputs[3]);
+
+ // truncate(x)
+ EXPECT_ARROW_ARRAY_EQUALS(
+ MakeArrowArrayDecimal(arrow::decimal(precision, 0),
+ MakeDecimalVector({"1", "1", "-1", "-1"}, 0), validity),
+ outputs[4]);
+
+ // round(x, 1)
+ EXPECT_ARROW_ARRAY_EQUALS(
+ MakeArrowArrayDecimal(arrow::decimal(precision, 1),
+ MakeDecimalVector({"1.2", "1.6", "-1.2", "-1.6"}, 1),
+ validity),
+ outputs[5]);
+
+ // truncate(x, 1)
+ EXPECT_ARROW_ARRAY_EQUALS(
+ MakeArrowArrayDecimal(arrow::decimal(precision, 1),
+ MakeDecimalVector({"1.2", "1.5", "-1.2", "-1.5"}, 1),
+ validity),
+ outputs[6]);
+}
+
+#endif // GANDIVA_LLVM_VERSION != 9
+
+// Exercises the decimal cast functions: castDECIMAL from int32, int64, float32,
+// float64 and decimal, plus castBIGINT and castFLOAT8 from decimal.
+TEST_F(TestDecimal, TestCastFunctions) {
+  // schema for input fields
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale = 2;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto decimal_type_scale_1 = std::make_shared<arrow::Decimal128Type>(precision, 1);
+  auto field_int32 = field("int32", arrow::int32());
+  auto field_int64 = field("int64", arrow::int64());
+  auto field_float32 = field("float32", arrow::float32());
+  auto field_float64 = field("float64", arrow::float64());
+  auto field_dec = field("dec", decimal_type);
+  auto schema =
+      arrow::schema({field_int32, field_int64, field_float32, field_float64, field_dec});
+
+  // build expressions
+  auto exprs = std::vector<ExpressionPtr>{
+      TreeExprBuilder::MakeExpression("castDECIMAL", {field_int32},
+                                      field("int32_to_dec", decimal_type)),
+      TreeExprBuilder::MakeExpression("castDECIMAL", {field_int64},
+                                      field("int64_to_dec", decimal_type)),
+      TreeExprBuilder::MakeExpression("castDECIMAL", {field_float32},
+                                      field("float32_to_dec", decimal_type)),
+      TreeExprBuilder::MakeExpression("castDECIMAL", {field_float64},
+                                      field("float64_to_dec", decimal_type)),
+      TreeExprBuilder::MakeExpression("castDECIMAL", {field_dec},
+                                      field("dec_to_dec", decimal_type_scale_1)),
+      TreeExprBuilder::MakeExpression("castBIGINT", {field_dec},
+                                      field("dec_to_int64", arrow::int64())),
+      TreeExprBuilder::MakeExpression("castFLOAT8", {field_dec},
+                                      field("dec_to_float64", arrow::float64())),
+  };
+
+  // Build a projector for the expression.
+  // ASSERT_OK (not DCHECK_OK) so that a failure stops the test in every build
+  // type instead of dereferencing a null projector.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, exprs, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+  auto validity = {true, true, true, true};
+
+  auto array_int32 = MakeArrowArrayInt32({123, 158, -123, -158});
+  auto array_int64 = MakeArrowArrayInt64({123, 158, -123, -158});
+  auto array_float32 = MakeArrowArrayFloat32({1.23f, 1.58f, -1.23f, -1.58f});
+  auto array_float64 = MakeArrowArrayFloat64({1.23, 1.58, -1.23, -1.58});
+  auto array_dec = MakeArrowArrayDecimal(
+      decimal_type, MakeDecimalVector({"1.23", "1.58", "-1.23", "-1.58"}, scale),
+      validity);
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(
+      schema, num_records,
+      {array_int32, array_int64, array_float32, array_float64, array_dec});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Validate results
+  auto expected_int_dec = MakeArrowArrayDecimal(
+      decimal_type, MakeDecimalVector({"123", "158", "-123", "-158"}, scale), validity);
+
+  // castDECIMAL(int32)
+  EXPECT_ARROW_ARRAY_EQUALS(expected_int_dec, outputs[0]);
+
+  // castDECIMAL(int64)
+  EXPECT_ARROW_ARRAY_EQUALS(expected_int_dec, outputs[1]);
+
+  // castDECIMAL(float32)
+  EXPECT_ARROW_ARRAY_EQUALS(array_dec, outputs[2]);
+
+  // castDECIMAL(float64)
+  EXPECT_ARROW_ARRAY_EQUALS(array_dec, outputs[3]);
+
+  // castDECIMAL(decimal): scale 2 -> scale 1
+  EXPECT_ARROW_ARRAY_EQUALS(
+      MakeArrowArrayDecimal(arrow::decimal(precision, 1),
+                            MakeDecimalVector({"1.2", "1.6", "-1.2", "-1.6"}, 1),
+                            validity),
+      outputs[4]);
+
+  // castBIGINT(decimal)
+  EXPECT_ARROW_ARRAY_EQUALS(MakeArrowArrayInt64({1, 2, -1, -2}), outputs[5]);
+
+  // castFLOAT8(decimal)
+  EXPECT_ARROW_ARRAY_EQUALS(array_float64, outputs[6]);
+}
+
+// isnull, isnotnull, isnumeric on a decimal column with null first and last rows.
+TEST_F(TestDecimal, TestIsNullNumericFunctions) {
+  // schema for input fields
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale = 2;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto field_dec = field("dec", decimal_type);
+  auto schema = arrow::schema({field_dec});
+
+  // build expressions
+  auto exprs = std::vector<ExpressionPtr>{
+      TreeExprBuilder::MakeExpression("isnull", {field_dec},
+                                      field("isnull", arrow::boolean())),
+
+      TreeExprBuilder::MakeExpression("isnotnull", {field_dec},
+                                      field("isnotnull", arrow::boolean())),
+      TreeExprBuilder::MakeExpression("isnumeric", {field_dec},
+                                      field("isnumeric", arrow::boolean()))};
+
+  // Build a projector for the expression.
+  // ASSERT_OK (not DCHECK_OK) so that a failure stops the test in every build
+  // type instead of dereferencing a null projector.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, exprs, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create a row-batch with some sample data
+  int num_records = 5;
+  auto validity = {false, true, true, true, false};
+
+  auto array_dec = MakeArrowArrayDecimal(
+      decimal_type, MakeDecimalVector({"1.51", "1.23", "1.23", "-1.23", "-1.24"}, scale),
+      validity);
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_dec});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Validate results (at() bounds-checks the number of outputs)
+  auto is_null = outputs.at(0);
+  auto is_not_null = outputs.at(1);
+  auto is_numeric = outputs.at(2);
+
+  // isnull
+  EXPECT_ARROW_ARRAY_EQUALS(MakeArrowArrayBool({true, false, false, false, true}),
+                            is_null);
+
+  // isnotnull
+  EXPECT_ARROW_ARRAY_EQUALS(MakeArrowArrayBool(validity), is_not_null);
+
+  // isnumeric
+  EXPECT_ARROW_ARRAY_EQUALS(MakeArrowArrayBool(validity), is_numeric);
+}
+
+// is_distinct_from / is_not_distinct_from between two decimal columns of
+// different scale, including null-vs-null and null-vs-value rows.
+TEST_F(TestDecimal, TestIsDistinct) {
+  // schema for input fields
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale_1 = 2;
+  auto decimal_type_1 = std::make_shared<arrow::Decimal128Type>(precision, scale_1);
+  auto field_dec_1 = field("dec_1", decimal_type_1);
+  constexpr int32_t scale_2 = 1;
+  auto decimal_type_2 = std::make_shared<arrow::Decimal128Type>(precision, scale_2);
+  auto field_dec_2 = field("dec_2", decimal_type_2);
+
+  auto schema = arrow::schema({field_dec_1, field_dec_2});
+
+  // build expressions
+  auto exprs = std::vector<ExpressionPtr>{
+      TreeExprBuilder::MakeExpression("is_distinct_from", {field_dec_1, field_dec_2},
+                                      field("isdistinct", arrow::boolean())),
+
+      TreeExprBuilder::MakeExpression("is_not_distinct_from", {field_dec_1, field_dec_2},
+                                      field("isnotdistinct", arrow::boolean()))};
+
+  // Build a projector for the expression.
+  // ASSERT_OK (not DCHECK_OK) so that a failure stops the test in every build
+  // type instead of dereferencing a null projector.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, exprs, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+
+  auto validity_1 = {true, false, true, true};
+  auto array_dec_1 = MakeArrowArrayDecimal(
+      decimal_type_1, MakeDecimalVector({"1.51", "1.23", "1.20", "-1.20"}, scale_1),
+      validity_1);
+
+  auto validity_2 = {true, false, false, true};
+  auto array_dec_2 = MakeArrowArrayDecimal(
+      decimal_type_2, MakeDecimalVector({"1.5", "1.2", "1.2", "-1.2"}, scale_2),
+      validity_2);
+
+  // prepare input record batch
+  auto in_batch =
+      arrow::RecordBatch::Make(schema, num_records, {array_dec_1, array_dec_2});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Validate results (at() bounds-checks the number of outputs)
+  auto is_distinct = outputs.at(0);
+  auto is_not_distinct = outputs.at(1);
+
+  // isdistinct
+  EXPECT_ARROW_ARRAY_EQUALS(MakeArrowArrayBool({true, false, true, false}), is_distinct);
+
+  // isnotdistinct
+  EXPECT_ARROW_ARRAY_EQUALS(MakeArrowArrayBool({false, true, false, true}),
+                            is_not_distinct);
+}
+
+// decimal hashes without seed
+//
+// Builds hash, hash64, hash32AsDouble and hash64AsDouble over a decimal column.
+// Only the first two outputs have their values asserted; the *AsDouble
+// expressions are only required to build and evaluate.
+TEST_F(TestDecimal, TestHashFunctions) {
+  // schema for input fields
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale = 2;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto field_dec = field("dec", decimal_type);
+  auto schema = arrow::schema({field_dec});
+
+  // build expressions
+  auto exprs = std::vector<ExpressionPtr>{
+      TreeExprBuilder::MakeExpression("hash", {field_dec},
+                                      field("hash_of_dec", arrow::int32())),
+
+      TreeExprBuilder::MakeExpression("hash64", {field_dec},
+                                      field("hash64_of_dec", arrow::int64())),
+
+      TreeExprBuilder::MakeExpression("hash32AsDouble", {field_dec},
+                                      field("hash32_as_double", arrow::int32())),
+
+      TreeExprBuilder::MakeExpression("hash64AsDouble", {field_dec},
+                                      field("hash64_as_double", arrow::int64()))};
+
+  // Build a projector for the expression.
+  // ASSERT_OK (not DCHECK_OK) so that a failure stops the test in every build
+  // type instead of dereferencing a null projector.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, exprs, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create a row-batch with some sample data
+  int num_records = 5;
+  auto validity = {false, true, true, true, true};
+
+  auto array_dec = MakeArrowArrayDecimal(
+      decimal_type, MakeDecimalVector({"1.51", "1.23", "1.23", "-1.23", "-1.24"}, scale),
+      validity);
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_dec});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Validate results
+  // Row 0 is null and hashes to 0; rows 1 and 2 hold the same value; rows 3 and
+  // 4 differ from their predecessors.
+  auto int32_arr = std::dynamic_pointer_cast<arrow::Int32Array>(outputs.at(0));
+  ASSERT_TRUE(int32_arr != nullptr);
+  EXPECT_EQ(int32_arr->null_count(), 0);
+  EXPECT_EQ(int32_arr->Value(0), 0);
+  EXPECT_EQ(int32_arr->Value(1), int32_arr->Value(2));
+  EXPECT_NE(int32_arr->Value(2), int32_arr->Value(3));
+  EXPECT_NE(int32_arr->Value(3), int32_arr->Value(4));
+
+  auto int64_arr = std::dynamic_pointer_cast<arrow::Int64Array>(outputs.at(1));
+  ASSERT_TRUE(int64_arr != nullptr);
+  EXPECT_EQ(int64_arr->null_count(), 0);
+  EXPECT_EQ(int64_arr->Value(0), 0);
+  EXPECT_EQ(int64_arr->Value(1), int64_arr->Value(2));
+  EXPECT_NE(int64_arr->Value(2), int64_arr->Value(3));
+  EXPECT_NE(int64_arr->Value(3), int64_arr->Value(4));
+}
+
+// hash32(dec1, seed) where the seed is hash32(dec2); compared against the
+// unseeded hash32(dec2) for different null/non-null combinations.
+TEST_F(TestDecimal, TestHash32WithSeed) {
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale = 2;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto field_dec_1 = field("dec1", decimal_type);
+  auto field_dec_2 = field("dec2", decimal_type);
+  auto schema = arrow::schema({field_dec_1, field_dec_2});
+
+  auto res = field("hash32_with_seed", arrow::int32());
+
+  auto field_1_nodePtr = TreeExprBuilder::MakeField(field_dec_1);
+  auto field_2_nodePtr = TreeExprBuilder::MakeField(field_dec_2);
+
+  auto hash32 =
+      TreeExprBuilder::MakeFunction("hash32", {field_2_nodePtr}, arrow::int32());
+  auto hash32_with_seed =
+      TreeExprBuilder::MakeFunction("hash32", {field_1_nodePtr, hash32}, arrow::int32());
+  auto expr = TreeExprBuilder::MakeExpression(hash32, field("hash32", arrow::int32()));
+  auto exprWS = TreeExprBuilder::MakeExpression(hash32_with_seed, res);
+
+  auto exprs = std::vector<ExpressionPtr>{expr, exprWS};
+
+  // Build a projector for the expression.
+  // ASSERT_OK (not DCHECK_OK) so that a failure stops the test in every build
+  // type instead of dereferencing a null projector.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, exprs, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create a row-batch with some sample data
+  int num_records = 5;
+  auto validity_1 = {false, false, true, true, true};
+
+  auto array_dec_1 = MakeArrowArrayDecimal(
+      decimal_type, MakeDecimalVector({"1.51", "1.23", "1.23", "-1.23", "-1.24"}, scale),
+      validity_1);
+
+  auto validity_2 = {false, true, false, true, true};
+
+  auto array_dec_2 = MakeArrowArrayDecimal(
+      decimal_type, MakeDecimalVector({"1.51", "1.23", "1.23", "-1.23", "-1.24"}, scale),
+      validity_2);
+
+  // prepare input record batch
+  auto in_batch =
+      arrow::RecordBatch::Make(schema, num_records, {array_dec_1, array_dec_2});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Validate results
+  auto int32_arr = std::dynamic_pointer_cast<arrow::Int32Array>(outputs.at(0));
+  auto int32_arr_WS = std::dynamic_pointer_cast<arrow::Int32Array>(outputs.at(1));
+  ASSERT_TRUE(int32_arr != nullptr);
+  ASSERT_TRUE(int32_arr_WS != nullptr);
+  EXPECT_EQ(int32_arr->null_count(), 0);
+  // seed 0, null decimal
+  EXPECT_EQ(int32_arr_WS->Value(0), 0);
+  // null decimal => hash = seed
+  EXPECT_EQ(int32_arr_WS->Value(1), int32_arr->Value(1));
+  // seed = 0 => hash = hash without seed (dec1[2] holds the same value as dec2[1])
+  EXPECT_EQ(int32_arr_WS->Value(2), int32_arr->Value(1));
+  // different inputs => different outputs
+  EXPECT_NE(int32_arr_WS->Value(3), int32_arr_WS->Value(4));
+  // hash with, without seed are not equal
+  EXPECT_NE(int32_arr_WS->Value(4), int32_arr->Value(4));
+}
+
+// hash64(dec1, seed) where the seed is hash64(dec2); compared against the
+// unseeded hash64(dec2) for different null/non-null combinations.
+TEST_F(TestDecimal, TestHash64WithSeed) {
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale = 2;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto field_dec_1 = field("dec1", decimal_type);
+  auto field_dec_2 = field("dec2", decimal_type);
+  auto schema = arrow::schema({field_dec_1, field_dec_2});
+
+  auto res = field("hash64_with_seed", arrow::int64());
+
+  auto field_1_nodePtr = TreeExprBuilder::MakeField(field_dec_1);
+  auto field_2_nodePtr = TreeExprBuilder::MakeField(field_dec_2);
+
+  auto hash64 =
+      TreeExprBuilder::MakeFunction("hash64", {field_2_nodePtr}, arrow::int64());
+  auto hash64_with_seed =
+      TreeExprBuilder::MakeFunction("hash64", {field_1_nodePtr, hash64}, arrow::int64());
+  auto expr = TreeExprBuilder::MakeExpression(hash64, field("hash64", arrow::int64()));
+  auto exprWS = TreeExprBuilder::MakeExpression(hash64_with_seed, res);
+
+  auto exprs = std::vector<ExpressionPtr>{expr, exprWS};
+
+  // Build a projector for the expression.
+  // ASSERT_OK (not DCHECK_OK) so that a failure stops the test in every build
+  // type instead of dereferencing a null projector.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, exprs, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create a row-batch with some sample data
+  int num_records = 5;
+  auto validity_1 = {false, false, true, true, true};
+
+  auto array_dec_1 = MakeArrowArrayDecimal(
+      decimal_type, MakeDecimalVector({"1.51", "1.23", "1.23", "-1.23", "-1.24"}, scale),
+      validity_1);
+
+  auto validity_2 = {false, true, false, true, true};
+
+  auto array_dec_2 = MakeArrowArrayDecimal(
+      decimal_type, MakeDecimalVector({"1.51", "1.23", "1.23", "-1.23", "-1.24"}, scale),
+      validity_2);
+
+  // prepare input record batch
+  auto in_batch =
+      arrow::RecordBatch::Make(schema, num_records, {array_dec_1, array_dec_2});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Validate results
+  auto int64_arr = std::dynamic_pointer_cast<arrow::Int64Array>(outputs.at(0));
+  auto int64_arr_WS = std::dynamic_pointer_cast<arrow::Int64Array>(outputs.at(1));
+  ASSERT_TRUE(int64_arr != nullptr);
+  ASSERT_TRUE(int64_arr_WS != nullptr);
+  EXPECT_EQ(int64_arr->null_count(), 0);
+  // seed 0, null decimal
+  EXPECT_EQ(int64_arr_WS->Value(0), 0);
+  // null decimal => hash = seed
+  EXPECT_EQ(int64_arr_WS->Value(1), int64_arr->Value(1));
+  // seed = 0 => hash = hash without seed (dec1[2] holds the same value as dec2[1])
+  EXPECT_EQ(int64_arr_WS->Value(2), int64_arr->Value(1));
+  // different inputs => different outputs
+  EXPECT_NE(int64_arr_WS->Value(3), int64_arr_WS->Value(4));
+  // hash with, without seed are not equal
+  EXPECT_NE(int64_arr_WS->Value(4), int64_arr->Value(4));
+}
+
+// An if/else whose "then" branch is a null decimal literal: rows where the
+// condition is true must come out null, the others must pass `b` through.
+TEST_F(TestDecimal, TestNullDecimalConstant) {
+  // schema for input fields
+  constexpr int32_t precision = 36;
+  constexpr int32_t scale = 18;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto field_b = field("b", decimal_type);
+  auto field_c = field("c", arrow::boolean());
+  auto schema = arrow::schema({field_b, field_c});
+
+  // output fields
+  auto field_result = field("res", decimal_type);
+
+  // build expression.
+  // if (c)
+  //   null
+  // else
+  //   b
+  auto node_a = TreeExprBuilder::MakeNull(decimal_type);
+  auto node_b = TreeExprBuilder::MakeField(field_b);
+  auto node_c = TreeExprBuilder::MakeField(field_c);
+  auto if_node = TreeExprBuilder::MakeIf(node_c, node_a, node_b, decimal_type);
+
+  auto expr = TreeExprBuilder::MakeExpression(if_node, field_result);
+
+  // Build a projector for the expressions.
+  // ASSERT_OK (not DCHECK_OK) so that a failure stops the test in every build
+  // type instead of dereferencing a null projector.
+  std::shared_ptr<Projector> projector;
+  Status status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+
+  auto array_b =
+      MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"2", "3", "4", "5"}, scale),
+                            {true, true, true, true});
+
+  auto array_c = MakeArrowArrayBool({true, false, true, false}, {true, true, true, true});
+
+  // expected output: rows 0 and 2 are null
+  auto exp =
+      MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"0", "3", "3", "5"}, scale),
+                            {false, true, false, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_b, array_c});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+// castVARCHAR(decimal, len) with length limits 5 and 1, compared (via "equal")
+// against the expected strings supplied as input columns.
+TEST_F(TestDecimal, TestCastVarCharDecimal) {
+  // schema for input fields
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale = 2;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+
+  auto field_dec = field("dec", decimal_type);
+  auto field_res_str = field("res_str", utf8());
+  auto field_res_str_1 = field("res_str_1", utf8());
+  auto schema = arrow::schema({field_dec, field_res_str, field_res_str_1});
+
+  // output fields
+  auto equals_res_bool = field("equals_res", boolean());
+
+  // build expressions.
+  auto node_dec = TreeExprBuilder::MakeField(field_dec);
+  auto node_res_str = TreeExprBuilder::MakeField(field_res_str);
+  auto node_res_str_1 = TreeExprBuilder::MakeField(field_res_str_1);
+  // limits decimal string to input length
+  auto str_len_limit = TreeExprBuilder::MakeLiteral(static_cast<int64_t>(5));
+  auto str_len_limit_1 = TreeExprBuilder::MakeLiteral(static_cast<int64_t>(1));
+  auto cast_varchar =
+      TreeExprBuilder::MakeFunction("castVARCHAR", {node_dec, str_len_limit}, utf8());
+  auto cast_varchar_1 =
+      TreeExprBuilder::MakeFunction("castVARCHAR", {node_dec, str_len_limit_1}, utf8());
+  auto equals =
+      TreeExprBuilder::MakeFunction("equal", {cast_varchar, node_res_str}, boolean());
+  auto equals_1 =
+      TreeExprBuilder::MakeFunction("equal", {cast_varchar_1, node_res_str_1}, boolean());
+  auto expr = TreeExprBuilder::MakeExpression(equals, equals_res_bool);
+  auto expr_1 = TreeExprBuilder::MakeExpression(equals_1, equals_res_bool);
+
+  // Build a projector for the expressions.
+  // Fatal assertion: the projector is dereferenced below.
+  std::shared_ptr<Projector> projector;
+
+  auto status = Projector::Make(schema, {expr, expr_1}, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create a row-batch with some sample data
+  int num_records = 5;
+  auto array_dec = MakeArrowArrayDecimal(
+      decimal_type,
+      MakeDecimalVector({"10.51", "1.23", "100.23", "-1000.23", "-0000.10"}, scale),
+      {true, false, true, true, true});
+  auto array_str_res = MakeArrowArrayUtf8({"10.51", "-null-", "100.2", "-1000", "-0.10"},
+                                          {true, false, true, true, true});
+  auto array_str_res_1 =
+      MakeArrowArrayUtf8({"1", "-null-", "1", "-", "-"}, {true, false, true, true, true});
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records,
+                                           {array_dec, array_str_res, array_str_res_1});
+
+  // Evaluate expression
+  // Fatal assertion: outputs is indexed below.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  auto exp = MakeArrowArrayBool({true, false, true, true, true},
+                                {true, false, true, true, true});
+  auto exp_1 = MakeArrowArrayBool({true, false, true, true, true},
+                                  {true, false, true, true, true});
+  // Validate results: each output is checked against its own expectation.
+  EXPECT_ARROW_ARRAY_EQUALS(exp, outputs[0]);
+  EXPECT_ARROW_ARRAY_EQUALS(exp_1, outputs[1]);
+}
+
+// castDECIMAL(utf8) into decimal(4, 2). The expected array below pins the
+// results for extra fractional digits and for "-1000", which does not fit
+// precision 4.
+TEST_F(TestDecimal, TestCastDecimalVarChar) {
+  // schema for input fields
+  constexpr int32_t precision = 4;
+  constexpr int32_t scale = 2;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+
+  auto field_str = field("in_str", utf8());
+  auto schema = arrow::schema({field_str});
+
+  // output fields
+  auto res_dec = field("res_dec", decimal_type);
+
+  // build expressions.
+  auto node_str = TreeExprBuilder::MakeField(field_str);
+  auto cast_decimal =
+      TreeExprBuilder::MakeFunction("castDECIMAL", {node_str}, decimal_type);
+  auto expr = TreeExprBuilder::MakeExpression(cast_decimal, res_dec);
+
+  // Build a projector for the expressions.
+  // Fatal assertion: the projector is dereferenced below.
+  std::shared_ptr<Projector> projector;
+
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create a row-batch with some sample data
+  int num_records = 5;
+
+  auto array_str = MakeArrowArrayUtf8({"10.5134", "-0.0", "-0.1", "10.516", "-1000"},
+                                      {true, false, true, true, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_str});
+
+  // Evaluate expression
+  // Fatal assertion: outputs is indexed below.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Row 1 is null, so its placeholder value ("1.23") is not compared.
+  auto array_dec = MakeArrowArrayDecimal(
+      decimal_type, MakeDecimalVector({"10.51", "1.23", "-0.10", "10.52", "0.00"}, scale),
+      {true, false, true, true, true});
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(array_dec, outputs[0]);
+}
+
+// castDECIMAL(utf8) must fail evaluation, with a descriptive message, when a
+// row is not a valid decimal string.
+TEST_F(TestDecimal, TestCastDecimalVarCharInvalidInput) {
+  // schema for input fields
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale = 0;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+
+  auto field_str = field("in_str", utf8());
+  auto schema = arrow::schema({field_str});
+
+  // output fields
+  auto res_dec = field("res_dec", decimal_type);
+
+  // build expressions.
+  auto node_str = TreeExprBuilder::MakeField(field_str);
+  auto cast_decimal =
+      TreeExprBuilder::MakeFunction("castDECIMAL", {node_str}, decimal_type);
+  auto expr = TreeExprBuilder::MakeExpression(cast_decimal, res_dec);
+
+  // Build a projector for the expressions.
+  // Fatal assertion: the projector is dereferenced below.
+  std::shared_ptr<Projector> projector;
+
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create a row-batch with some sample data
+  int num_records = 5;
+
+  // invalid input (first row starts with a letter)
+  auto invalid_in = MakeArrowArrayUtf8({"a10.5134", "-0.0", "-0.1", "10.516", "-1000"},
+                                       {true, false, true, true, true});
+
+  // prepare input record batch
+  auto in_batch_1 = arrow::RecordBatch::Make(schema, num_records, {invalid_in});
+
+  // Evaluate expression: expected to fail
+  arrow::ArrayVector outputs_1;
+  status = projector->Evaluate(*in_batch_1, pool_, &outputs_1);
+  EXPECT_FALSE(status.ok()) << status.message();
+  EXPECT_NE(status.message().find("not a valid decimal128 number"), std::string::npos);
+}
+
+// Round trip castDECIMAL(castVARCHAR(dec, 5)): values whose string form is
+// longer than 5 characters are truncated before being parsed back.
+TEST_F(TestDecimal, TestVarCharDecimalNestedCast) {
+  // schema for input fields
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale = 2;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+
+  auto field_dec = field("dec", decimal_type);
+  auto schema = arrow::schema({field_dec});
+
+  // output fields
+  auto field_dec_res = field("dec_res", decimal_type);
+
+  // build expressions.
+  auto node_dec = TreeExprBuilder::MakeField(field_dec);
+
+  // limits decimal string to input length
+  auto str_len_limit = TreeExprBuilder::MakeLiteral(static_cast<int64_t>(5));
+  auto cast_varchar =
+      TreeExprBuilder::MakeFunction("castVARCHAR", {node_dec, str_len_limit}, utf8());
+  auto cast_decimal =
+      TreeExprBuilder::MakeFunction("castDECIMAL", {cast_varchar}, decimal_type);
+
+  auto expr = TreeExprBuilder::MakeExpression(cast_decimal, field_dec_res);
+
+  // Build a projector for the expressions.
+  // Fatal assertion: the projector is dereferenced below.
+  std::shared_ptr<Projector> projector;
+
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create a row-batch with some sample data
+  int num_records = 5;
+  auto array_dec = MakeArrowArrayDecimal(
+      decimal_type,
+      MakeDecimalVector({"10.51", "1.23", "100.23", "-1000.23", "-0000.10"}, scale),
+      {true, false, true, true, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_dec});
+
+  // Evaluate expression
+  // Fatal assertion: outputs is indexed below.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Validate results
+  auto array_dec_res = MakeArrowArrayDecimal(
+      decimal_type,
+      MakeDecimalVector({"10.51", "1.23", "100.20", "-1000.00", "-0.10"}, scale),
+      {true, false, true, true, true});
+  EXPECT_ARROW_ARRAY_EQUALS(array_dec_res, outputs[0]);
+}
+
+// Casting decimal(5, 2) to decimal(3, 1) with a value that does not fit:
+// castDECIMAL is expected to yield 0.0, castDECIMALNullOnOverflow a null.
+TEST_F(TestDecimal, TestCastDecimalOverflow) {
+  // schema for input fields
+  constexpr int32_t precision_in = 5;
+  constexpr int32_t scale_in = 2;
+  constexpr int32_t precision_out = 3;
+  constexpr int32_t scale_out = 1;
+  auto decimal_5_2 = std::make_shared<arrow::Decimal128Type>(precision_in, scale_in);
+  auto decimal_3_1 = std::make_shared<arrow::Decimal128Type>(precision_out, scale_out);
+
+  auto field_dec = field("dec", decimal_5_2);
+  auto schema = arrow::schema({field_dec});
+
+  // build expressions
+  auto exprs = std::vector<ExpressionPtr>{
+      TreeExprBuilder::MakeExpression("castDECIMAL", {field_dec},
+                                      field("dec_to_dec", decimal_3_1)),
+      TreeExprBuilder::MakeExpression("castDECIMALNullOnOverflow", {field_dec},
+                                      field("dec_to_dec_null_overflow", decimal_3_1)),
+  };
+
+  // Build a projector for the expression.
+  // ASSERT_OK (not DCHECK_OK) so that a failure stops the test in every build
+  // type instead of dereferencing a null projector.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, exprs, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+  auto validity = {true, true, true, true};
+
+  auto array_dec = MakeArrowArrayDecimal(
+      decimal_5_2, MakeDecimalVector({"1.23", "671.58", "-1.23", "-1.58"}, scale_in),
+      validity);
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_dec});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Validate results
+  // castDECIMAL(decimal)
+  EXPECT_ARROW_ARRAY_EQUALS(
+      MakeArrowArrayDecimal(arrow::decimal(precision_out, 1),
+                            MakeDecimalVector({"1.2", "0.0", "-1.2", "-1.6"}, 1),
+                            validity),
+      outputs[0]);
+
+  // castDECIMALNullOnOverflow(decimal): row 1 is null, its value is a placeholder
+  EXPECT_ARROW_ARRAY_EQUALS(
+      MakeArrowArrayDecimal(arrow::decimal(precision_out, 1),
+                            MakeDecimalVector({"1.2", "1.6", "-1.2", "-1.6"}, 1),
+                            {true, false, true, true}),
+      outputs[1]);
+}
+
+// hashSHA1 / hashSHA256 over a decimal(5, 2) column whose first row is null.
+// Each output must have no nulls and a non-empty first entry; every later row
+// must have the expected string length and differ from the previous row.
+TEST_F(TestDecimal, TestSha) {
+  // schema for input fields
+  const std::shared_ptr<arrow::DataType>& decimal_5_2 = arrow::decimal128(5, 2);
+  auto field_a = field("a", decimal_5_2);
+  auto schema = arrow::schema({field_a});
+
+  // output fields
+  auto res_0 = field("res0", utf8());
+  auto res_1 = field("res1", utf8());
+
+  // build expressions.
+  // hashSHA1(a)
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto hashSha1 = TreeExprBuilder::MakeFunction("hashSHA1", {node_a}, utf8());
+  auto expr_0 = TreeExprBuilder::MakeExpression(hashSha1, res_0);
+
+  // hashSHA256(a)
+  auto hashSha256 = TreeExprBuilder::MakeFunction("hashSHA256", {node_a}, utf8());
+  auto expr_1 = TreeExprBuilder::MakeExpression(hashSha256, res_1);
+
+  // Build a projector for the expressions.
+  std::shared_ptr<Projector> projector;
+  auto status =
+      Projector::Make(schema, {expr_0, expr_1}, TestConfiguration(), &projector);
+  ASSERT_OK(status) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 3;
+  auto validity_array = {false, true, true};
+
+  auto array_dec = MakeArrowArrayDecimal(
+      decimal_5_2, MakeDecimalVector({"3.45", "0", "0.01"}, 2), validity_array);
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_dec});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Shared validation for both digests. The expected size is a size_t so the
+  // comparison with std::string::size() does not mix signed and unsigned.
+  auto check_hashes = [num_records](const std::shared_ptr<arrow::Array>& response,
+                                    size_t expected_hash_size) {
+    EXPECT_EQ(response->null_count(), 0);
+    EXPECT_NE(response->GetScalar(0).ValueOrDie()->ToString(), "");
+
+    // Checks if the hash size in response is correct
+    for (int i = 1; i < num_records; ++i) {
+      const auto value_at_position = response->GetScalar(i).ValueOrDie()->ToString();
+
+      EXPECT_EQ(value_at_position.size(), expected_hash_size);
+      EXPECT_NE(value_at_position, response->GetScalar(i - 1).ValueOrDie()->ToString());
+    }
+  };
+
+  const size_t sha1_hash_size = 40;
+  check_hashes(outputs.at(0), sha1_hash_size);
+
+  const size_t sha256_hash_size = 64;
+  check_hashes(outputs.at(1), sha256_hash_size);
+}
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/filter_project_test.cc b/src/arrow/cpp/src/gandiva/tests/filter_project_test.cc
new file mode 100644
index 000000000..0607feaef
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/filter_project_test.cc
@@ -0,0 +1,276 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include "arrow/memory_pool.h"
+#include "gandiva/filter.h"
+#include "gandiva/projector.h"
+#include "gandiva/selection_vector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::float32;
+using arrow::int32;
+
+ // Fixture for the combined filter + project tests. It only supplies the
+ // memory pool that the tests hand to SelectionVector and Projector.
+ class TestFilterProject : public ::testing::Test {
+  public:
+   // `override` lets the compiler confirm this really replaces
+   // ::testing::Test::SetUp; a misspelt name would otherwise leave pool_ unset.
+   void SetUp() override { pool_ = arrow::default_memory_pool(); }
+
+  protected:
+   // Pool used for selection-vector and projector output allocations.
+   arrow::MemoryPool* pool_;
+ };
+
+ // Filters rows with f0 < f1 into a 16-bit selection vector, then projects
+ // f1 + f2 over the selected rows only.
+ TEST_F(TestFilterProject, TestSimple16) {
+   // schema for input fields
+   auto field0 = field("f0", int32());
+   auto field1 = field("f1", int32());
+   auto field2 = field("f2", int32());
+   auto resultField = field("result", int32());
+   auto schema = arrow::schema({field0, field1, field2});
+
+   // Build condition f0 < f1
+   auto node_f0 = TreeExprBuilder::MakeField(field0);
+   auto node_f1 = TreeExprBuilder::MakeField(field1);
+   auto less_than_function =
+       TreeExprBuilder::MakeFunction("less_than", {node_f0, node_f1}, arrow::boolean());
+   auto condition = TreeExprBuilder::MakeCondition(less_than_function);
+   // Build projection f1 + f2
+   auto sum_expr = TreeExprBuilder::MakeExpression("add", {field1, field2}, resultField);
+
+   auto configuration = TestConfiguration();
+
+   std::shared_ptr<Filter> filter;
+   std::shared_ptr<Projector> projector;
+
+   // Fatal assertions: a failed Make leaves the pointer null, and the test
+   // dereferences it further down.
+   auto status = Filter::Make(schema, condition, configuration, &filter);
+   ASSERT_OK(status);
+
+   status = Projector::Make(schema, {sum_expr}, SelectionVector::MODE_UINT16,
+                            configuration, &projector);
+   ASSERT_OK(status);
+
+   // Create a row-batch with some sample data
+   int num_records = 5;
+   auto array0 = MakeArrowArrayInt32({1, 2, 6, 40, 3}, {true, true, true, true, true});
+   auto array1 = MakeArrowArrayInt32({5, 9, 3, 17, 6}, {true, true, true, true, true});
+   auto array2 = MakeArrowArrayInt32({1, 2, 6, 40, 3}, {true, true, true, true, false});
+   // expected output: rows 0, 1 and 4 satisfy f0 < f1; row 4 has a null f2,
+   // so its sum is null
+   auto result = MakeArrowArrayInt32({6, 11, 0}, {true, true, false});
+   // prepare input record batch
+   auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2});
+
+   std::shared_ptr<SelectionVector> selection_vector;
+   status = SelectionVector::MakeInt16(num_records, pool_, &selection_vector);
+   ASSERT_OK(status);
+
+   // Evaluate filter
+   status = filter->Evaluate(*in_batch, selection_vector);
+   ASSERT_OK(status);
+
+   // Evaluate projection over the selected rows
+   arrow::ArrayVector outputs;
+   status = projector->Evaluate(*in_batch, selection_vector.get(), pool_, &outputs);
+   // Fatal: outputs.at(0) below throws if nothing was produced.
+   ASSERT_OK(status);
+
+   // Validate results
+   EXPECT_ARROW_ARRAY_EQUALS(result, outputs.at(0));
+ }
+
+ // Same scenario as the 16-bit variant, but the filter writes a 32-bit
+ // selection vector and the projector is built for MODE_UINT32.
+ TEST_F(TestFilterProject, TestSimple32) {
+   // schema for input fields
+   auto field0 = field("f0", int32());
+   auto field1 = field("f1", int32());
+   auto field2 = field("f2", int32());
+   auto resultField = field("result", int32());
+   auto schema = arrow::schema({field0, field1, field2});
+
+   // Build condition f0 < f1
+   auto node_f0 = TreeExprBuilder::MakeField(field0);
+   auto node_f1 = TreeExprBuilder::MakeField(field1);
+   auto less_than_function =
+       TreeExprBuilder::MakeFunction("less_than", {node_f0, node_f1}, arrow::boolean());
+   auto condition = TreeExprBuilder::MakeCondition(less_than_function);
+   // Build projection f1 + f2
+   auto sum_expr = TreeExprBuilder::MakeExpression("add", {field1, field2}, resultField);
+
+   auto configuration = TestConfiguration();
+
+   std::shared_ptr<Filter> filter;
+   std::shared_ptr<Projector> projector;
+
+   // Fatal assertions: a failed Make leaves the pointer null, and the test
+   // dereferences it further down.
+   auto status = Filter::Make(schema, condition, configuration, &filter);
+   ASSERT_OK(status);
+
+   status = Projector::Make(schema, {sum_expr}, SelectionVector::MODE_UINT32,
+                            configuration, &projector);
+   ASSERT_OK(status);
+
+   // Create a row-batch with some sample data
+   int num_records = 5;
+   auto array0 = MakeArrowArrayInt32({1, 2, 6, 40, 3}, {true, true, true, true, true});
+   auto array1 = MakeArrowArrayInt32({5, 9, 3, 17, 6}, {true, true, true, true, true});
+   auto array2 = MakeArrowArrayInt32({1, 2, 6, 40, 3}, {true, true, true, true, false});
+   // expected output: rows 0, 1 and 4 satisfy f0 < f1; row 4 has a null f2,
+   // so its sum is null
+   auto result = MakeArrowArrayInt32({6, 11, 0}, {true, true, false});
+   // prepare input record batch
+   auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2});
+
+   std::shared_ptr<SelectionVector> selection_vector;
+   status = SelectionVector::MakeInt32(num_records, pool_, &selection_vector);
+   ASSERT_OK(status);
+
+   // Evaluate filter
+   status = filter->Evaluate(*in_batch, selection_vector);
+   ASSERT_OK(status);
+
+   // Evaluate projection over the selected rows
+   arrow::ArrayVector outputs;
+   status = projector->Evaluate(*in_batch, selection_vector.get(), pool_, &outputs);
+   ASSERT_OK(status);
+
+   // Validate results
+   EXPECT_ARROW_ARRAY_EQUALS(result, outputs.at(0));
+ }
+
+ // Same scenario as the 16-bit variant, but the filter writes a 64-bit
+ // selection vector and the projector is built for MODE_UINT64.
+ TEST_F(TestFilterProject, TestSimple64) {
+   // schema for input fields
+   auto field0 = field("f0", int32());
+   auto field1 = field("f1", int32());
+   auto field2 = field("f2", int32());
+   auto resultField = field("result", int32());
+   auto schema = arrow::schema({field0, field1, field2});
+
+   // Build condition f0 < f1
+   auto node_f0 = TreeExprBuilder::MakeField(field0);
+   auto node_f1 = TreeExprBuilder::MakeField(field1);
+   auto less_than_function =
+       TreeExprBuilder::MakeFunction("less_than", {node_f0, node_f1}, arrow::boolean());
+   auto condition = TreeExprBuilder::MakeCondition(less_than_function);
+   // Build projection f1 + f2
+   auto sum_expr = TreeExprBuilder::MakeExpression("add", {field1, field2}, resultField);
+
+   auto configuration = TestConfiguration();
+
+   std::shared_ptr<Filter> filter;
+   std::shared_ptr<Projector> projector;
+
+   // Fatal assertions: a failed Make leaves the pointer null, and the test
+   // dereferences it further down.
+   auto status = Filter::Make(schema, condition, configuration, &filter);
+   ASSERT_OK(status);
+
+   status = Projector::Make(schema, {sum_expr}, SelectionVector::MODE_UINT64,
+                            configuration, &projector);
+   ASSERT_OK(status);
+
+   // Create a row-batch with some sample data
+   int num_records = 5;
+   auto array0 = MakeArrowArrayInt32({1, 2, 6, 40, 3}, {true, true, true, true, true});
+   auto array1 = MakeArrowArrayInt32({5, 9, 3, 17, 6}, {true, true, true, true, true});
+   auto array2 = MakeArrowArrayInt32({1, 2, 6, 40, 3}, {true, true, true, true, false});
+   // expected output: rows 0, 1 and 4 satisfy f0 < f1; row 4 has a null f2,
+   // so its sum is null
+   auto result = MakeArrowArrayInt32({6, 11, 0}, {true, true, false});
+   // prepare input record batch
+   auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2});
+
+   std::shared_ptr<SelectionVector> selection_vector;
+   status = SelectionVector::MakeInt64(num_records, pool_, &selection_vector);
+   ASSERT_OK(status);
+
+   // Evaluate filter
+   status = filter->Evaluate(*in_batch, selection_vector);
+   ASSERT_OK(status);
+
+   // Evaluate projection over the selected rows
+   arrow::ArrayVector outputs;
+   status = projector->Evaluate(*in_batch, selection_vector.get(), pool_, &outputs);
+   // Fatal: outputs.at(0) below throws if nothing was produced.
+   ASSERT_OK(status);
+
+   // Validate results
+   EXPECT_ARROW_ARRAY_EQUALS(result, outputs.at(0));
+ }
+
+ // Filters rows with a > b, then projects `if (b < c) b else c` over the
+ // selected rows using a 32-bit selection vector.
+ TEST_F(TestFilterProject, TestSimpleIf) {
+   // schema for input fields
+   auto fielda = field("a", int32());
+   auto fieldb = field("b", int32());
+   auto fieldc = field("c", int32());
+   auto schema = arrow::schema({fielda, fieldb, fieldc});
+
+   // output fields
+   auto field_result = field("res", int32());
+
+   auto node_a = TreeExprBuilder::MakeField(fielda);
+   auto node_b = TreeExprBuilder::MakeField(fieldb);
+   auto node_c = TreeExprBuilder::MakeField(fieldc);
+
+   // filter condition: a > b
+   auto greater_than_function =
+       TreeExprBuilder::MakeFunction("greater_than", {node_a, node_b}, boolean());
+   auto filter_condition = TreeExprBuilder::MakeCondition(greater_than_function);
+
+   // projection: if (b < c) b else c
+   auto project_condition =
+       TreeExprBuilder::MakeFunction("less_than", {node_b, node_c}, boolean());
+   auto if_node = TreeExprBuilder::MakeIf(project_condition, node_b, node_c, int32());
+
+   auto expr = TreeExprBuilder::MakeExpression(if_node, field_result);
+   auto configuration = TestConfiguration();
+
+   // Build a filter for the expressions.
+   // Fatal assertion: `filter` is dereferenced below and is null on failure.
+   std::shared_ptr<Filter> filter;
+   auto status = Filter::Make(schema, filter_condition, configuration, &filter);
+   ASSERT_OK(status);
+
+   // Build a projector for the expressions.
+   std::shared_ptr<Projector> projector;
+   status = Projector::Make(schema, {expr}, SelectionVector::MODE_UINT32, configuration,
+                            &projector);
+   ASSERT_OK(status);
+
+   // Create a row-batch with some sample data
+   int num_records = 6;
+   auto array0 =
+       MakeArrowArrayInt32({10, 12, -20, 5, 21, 29}, {true, true, true, true, true, true});
+   auto array1 =
+       MakeArrowArrayInt32({5, 15, 15, 17, 12, 3}, {true, true, true, true, true, true});
+   auto array2 = MakeArrowArrayInt32({1, 25, 11, 30, -21, 30},
+                                     {true, true, true, true, true, false});
+
+   // Create a selection vector
+   std::shared_ptr<SelectionVector> selection_vector;
+   status = SelectionVector::MakeInt32(num_records, pool_, &selection_vector);
+   ASSERT_OK(status);
+
+   // expected output: rows 0, 4 and 5 satisfy a > b; in each of them b < c is
+   // not true, so c is taken (null for row 5)
+   auto exp = MakeArrowArrayInt32({1, -21, 0}, {true, true, false});
+
+   // prepare input record batch
+   auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2});
+
+   // Evaluate filter
+   status = filter->Evaluate(*in_batch, selection_vector);
+   ASSERT_OK(status);
+
+   // Evaluate project
+   arrow::ArrayVector outputs;
+   status = projector->Evaluate(*in_batch, selection_vector.get(), pool_, &outputs);
+   // Fatal: outputs.at(0) below throws if nothing was produced.
+   ASSERT_OK(status);
+
+   // Validate results
+   EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+ }
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/filter_test.cc b/src/arrow/cpp/src/gandiva/tests/filter_test.cc
new file mode 100644
index 000000000..d4433f11e
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/filter_test.cc
@@ -0,0 +1,340 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/filter.h"
+#include <gtest/gtest.h>
+#include "arrow/memory_pool.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::float32;
+using arrow::int32;
+
+ // Fixture for the Filter tests. It only supplies the memory pool the tests
+ // pass when allocating selection vectors.
+ class TestFilter : public ::testing::Test {
+  public:
+   // `override` lets the compiler confirm this really replaces
+   // ::testing::Test::SetUp; a misspelt name would otherwise leave pool_ unset.
+   void SetUp() override { pool_ = arrow::default_memory_pool(); }
+
+  protected:
+   // Pool used for selection-vector allocations.
+   arrow::MemoryPool* pool_;
+ };
+
+ // Checks that Filter::Make hands back the same Filter object for an
+ // identical (schema, condition) pair and a different object when either the
+ // schema or the condition changes.
+ TEST_F(TestFilter, TestFilterCache) {
+   // input schema: two int32 columns
+   auto field0 = field("f0", int32());
+   auto field1 = field("f1", int32());
+   auto base_schema = arrow::schema({field0, field1});
+
+   // shared sub-expressions: (f0 + f1) and the literal 10
+   auto lhs = TreeExprBuilder::MakeField(field0);
+   auto rhs = TreeExprBuilder::MakeField(field1);
+   auto sum_node = TreeExprBuilder::MakeFunction("add", {lhs, rhs}, arrow::int32());
+   auto ten = TreeExprBuilder::MakeLiteral((int32_t)10);
+
+   // base condition: f0 + f1 < 10
+   auto lt_node =
+       TreeExprBuilder::MakeFunction("less_than", {sum_node, ten}, arrow::boolean());
+   auto lt_condition = TreeExprBuilder::MakeCondition(lt_node);
+   auto config = TestConfiguration();
+
+   std::shared_ptr<Filter> first;
+   auto status = Filter::Make(base_schema, lt_condition, config, &first);
+   EXPECT_TRUE(status.ok());
+
+   // identical schema and condition: the very same filter must come back
+   std::shared_ptr<Filter> second;
+   status = Filter::Make(base_schema, lt_condition, config, &second);
+   EXPECT_TRUE(status.ok());
+   EXPECT_TRUE(second.get() == first.get());
+
+   // a wider schema must produce a new filter
+   auto field2 = field("f2", int32());
+   auto wider_schema = arrow::schema({field0, field1, field2});
+   std::shared_ptr<Filter> for_wider_schema;
+   status = Filter::Make(wider_schema, lt_condition, config, &for_wider_schema);
+   EXPECT_TRUE(status.ok());
+   EXPECT_TRUE(second.get() != for_wider_schema.get());
+
+   // a different condition (f0 + f1 > 10) must produce a new filter
+   auto gt_node =
+       TreeExprBuilder::MakeFunction("greater_than", {sum_node, ten}, arrow::boolean());
+   auto gt_condition = TreeExprBuilder::MakeCondition(gt_node);
+   std::shared_ptr<Filter> for_gt_condition;
+   status = Filter::Make(base_schema, gt_condition, config, &for_gt_condition);
+   EXPECT_TRUE(status.ok());
+   EXPECT_TRUE(second.get() != for_gt_condition.get());
+ }
+
+ // Evaluates f0 + f1 < 10 over five rows with a 16-bit selection vector and
+ // checks which row indices are selected.
+ TEST_F(TestFilter, TestSimple) {
+   // schema for input fields
+   auto field0 = field("f0", int32());
+   auto field1 = field("f1", int32());
+   auto schema = arrow::schema({field0, field1});
+
+   // Build condition f0 + f1 < 10
+   auto node_f0 = TreeExprBuilder::MakeField(field0);
+   auto node_f1 = TreeExprBuilder::MakeField(field1);
+   auto sum_func =
+       TreeExprBuilder::MakeFunction("add", {node_f0, node_f1}, arrow::int32());
+   auto literal_10 = TreeExprBuilder::MakeLiteral((int32_t)10);
+   auto less_than_10 = TreeExprBuilder::MakeFunction("less_than", {sum_func, literal_10},
+                                                     arrow::boolean());
+   auto condition = TreeExprBuilder::MakeCondition(less_than_10);
+
+   // Fatal assertion: `filter` is dereferenced below and is null on failure.
+   std::shared_ptr<Filter> filter;
+   auto status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+   ASSERT_OK(status);
+
+   // Create a row-batch with some sample data
+   int num_records = 5;
+   auto array0 = MakeArrowArrayInt32({1, 2, 3, 4, 6}, {true, true, true, false, true});
+   auto array1 = MakeArrowArrayInt32({5, 9, 6, 17, 3}, {true, true, false, true, true});
+   // expected output (indices for which condition matches); rows 2 and 3
+   // each have a null operand
+   auto exp = MakeArrowArrayUint16({0, 4});
+
+   // prepare input record batch
+   auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+   std::shared_ptr<SelectionVector> selection_vector;
+   status = SelectionVector::MakeInt16(num_records, pool_, &selection_vector);
+   ASSERT_OK(status);
+
+   // Evaluate expression
+   status = filter->Evaluate(*in_batch, selection_vector);
+   ASSERT_OK(status);
+
+   // Validate results
+   EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
+ }
+
+ // Builds a filter for f0 != f1 using a Configuration produced by
+ // ConfigurationBuilder rather than the shared TestConfiguration().
+ TEST_F(TestFilter, TestSimpleCustomConfig) {
+   // schema for input fields
+   auto field0 = field("f0", int32());
+   auto field1 = field("f1", int32());
+   auto schema = arrow::schema({field0, field1});
+
+   // Build condition f0 != f1
+   auto condition = TreeExprBuilder::MakeCondition("not_equal", {field0, field1});
+
+   ConfigurationBuilder config_builder;
+   std::shared_ptr<Configuration> config = config_builder.build();
+
+   // Pass the custom configuration built above; it was previously built but
+   // never used, so the test did not exercise a custom config at all.
+   // Fatal assertion: `filter` is dereferenced below and is null on failure.
+   std::shared_ptr<Filter> filter;
+   auto status = Filter::Make(schema, condition, config, &filter);
+   ASSERT_OK(status);
+
+   // Create a row-batch with some sample data
+   int num_records = 4;
+   auto array0 = MakeArrowArrayInt32({1, 2, 3, 4}, {true, true, true, false});
+   auto array1 = MakeArrowArrayInt32({11, 2, 3, 17}, {true, true, false, true});
+   // expected output: only row 0 has two non-null, unequal values
+   auto exp = MakeArrowArrayUint16({0});
+
+   // prepare input record batch
+   auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+   std::shared_ptr<SelectionVector> selection_vector;
+   status = SelectionVector::MakeInt16(num_records, pool_, &selection_vector);
+   ASSERT_OK(status);
+
+   // Evaluate expression
+   status = filter->Evaluate(*in_batch, selection_vector);
+   ASSERT_OK(status);
+
+   // Validate results
+   EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
+ }
+
+ // Runs an isnotnull filter whose selection vector is backed by a buffer the
+ // test allocates itself instead of one taken from the memory pool.
+ TEST_F(TestFilter, TestZeroCopy) {
+   // schema for input fields
+   auto field0 = field("f0", int32());
+   auto schema = arrow::schema({field0});
+
+   // Build condition
+   auto condition = TreeExprBuilder::MakeCondition("isnotnull", {field0});
+
+   // Fatal assertion: `filter` is dereferenced below and is null on failure.
+   std::shared_ptr<Filter> filter;
+   auto status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+   ASSERT_OK(status);
+
+   // Create a row-batch with some sample data
+   int num_records = 4;
+   auto array0 = MakeArrowArrayInt32({1, 2, 3, 4}, {true, true, true, false});
+   auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+   // expected output: the first three rows are non-null
+   auto exp = MakeArrowArrayUint16({0, 1, 2});
+
+   // allocate selection buffers; `data` owns the bytes and must outlive
+   // `data_buf`, which only wraps the raw pointer
+   int64_t data_sz = sizeof(int16_t) * num_records;
+   std::unique_ptr<uint8_t[]> data(new uint8_t[data_sz]);
+   std::shared_ptr<arrow::MutableBuffer> data_buf =
+       std::make_shared<arrow::MutableBuffer>(data.get(), data_sz);
+
+   std::shared_ptr<SelectionVector> selection_vector;
+   status = SelectionVector::MakeInt16(num_records, data_buf, &selection_vector);
+   ASSERT_OK(status);
+
+   // Evaluate expression
+   status = filter->Evaluate(*in_batch, selection_vector);
+   ASSERT_OK(status);
+
+   // Validate results
+   EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
+ }
+
+ // Checks that Filter::Evaluate reports StatusCode::Invalid for an empty
+ // batch, a null selection vector, and a selection vector that is too small.
+ TEST_F(TestFilter, TestZeroCopyNegative) {
+   // schema for input fields
+   auto field0 = field("f0", int32());
+   auto schema = arrow::schema({field0});
+
+   // Build expression
+   auto condition = TreeExprBuilder::MakeCondition("isnotnull", {field0});
+
+   // Fatal assertion: `filter` is dereferenced below and is null on failure.
+   std::shared_ptr<Filter> filter;
+   auto status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+   ASSERT_OK(status);
+
+   // Create a row-batch with some sample data
+   int num_records = 4;
+   auto array0 = MakeArrowArrayInt32({1, 2, 3, 4}, {true, true, true, false});
+   auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+   // allocate output buffers; `data` owns the bytes and must outlive
+   // `data_buf`, which only wraps the raw pointer
+   int64_t data_sz = sizeof(int16_t) * num_records;
+   std::unique_ptr<uint8_t[]> data(new uint8_t[data_sz]);
+   std::shared_ptr<arrow::MutableBuffer> data_buf =
+       std::make_shared<arrow::MutableBuffer>(data.get(), data_sz);
+
+   std::shared_ptr<SelectionVector> selection_vector;
+   status = SelectionVector::MakeInt16(num_records, data_buf, &selection_vector);
+   ASSERT_OK(status);
+
+   // the batch can't be empty.
+   auto bad_batch = arrow::RecordBatch::Make(schema, 0 /*num_records*/, {array0});
+   status = filter->Evaluate(*bad_batch, selection_vector);
+   EXPECT_EQ(status.code(), StatusCode::Invalid);
+
+   // the selection_vector can't be null.
+   std::shared_ptr<SelectionVector> null_selection;
+   status = filter->Evaluate(*in_batch, null_selection);
+   EXPECT_EQ(status.code(), StatusCode::Invalid);
+
+   // the selection vector must be suitably sized.
+   std::shared_ptr<SelectionVector> bad_selection;
+   status = SelectionVector::MakeInt16(num_records - 1, data_buf, &bad_selection);
+   ASSERT_OK(status);
+
+   status = filter->Evaluate(*in_batch, bad_selection);
+   EXPECT_EQ(status.code(), StatusCode::Invalid);
+ }
+
+ // Evaluates f0 + f1 < 10 over five rows with a 32-bit selection vector and
+ // checks which row indices are selected.
+ TEST_F(TestFilter, TestSimpleSVInt32) {
+   // schema for input fields
+   auto field0 = field("f0", int32());
+   auto field1 = field("f1", int32());
+   auto schema = arrow::schema({field0, field1});
+
+   // Build condition f0 + f1 < 10
+   auto node_f0 = TreeExprBuilder::MakeField(field0);
+   auto node_f1 = TreeExprBuilder::MakeField(field1);
+   auto sum_func =
+       TreeExprBuilder::MakeFunction("add", {node_f0, node_f1}, arrow::int32());
+   auto literal_10 = TreeExprBuilder::MakeLiteral((int32_t)10);
+   auto less_than_10 = TreeExprBuilder::MakeFunction("less_than", {sum_func, literal_10},
+                                                     arrow::boolean());
+   auto condition = TreeExprBuilder::MakeCondition(less_than_10);
+
+   // Fatal assertion: `filter` is dereferenced below and is null on failure.
+   std::shared_ptr<Filter> filter;
+   auto status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+   ASSERT_OK(status);
+
+   // Create a row-batch with some sample data
+   int num_records = 5;
+   auto array0 = MakeArrowArrayInt32({1, 2, 3, 4, 6}, {true, true, true, false, true});
+   auto array1 = MakeArrowArrayInt32({5, 9, 6, 17, 3}, {true, true, false, true, true});
+   // expected output (indices for which condition matches); rows 2 and 3
+   // each have a null operand
+   auto exp = MakeArrowArrayUint32({0, 4});
+
+   // prepare input record batch
+   auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+   std::shared_ptr<SelectionVector> selection_vector;
+   status = SelectionVector::MakeInt32(num_records, pool_, &selection_vector);
+   ASSERT_OK(status);
+
+   // Evaluate expression
+   status = filter->Evaluate(*in_batch, selection_vector);
+   ASSERT_OK(status);
+
+   // Validate results
+   EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
+ }
+
+ // Evaluates f0 + f1 < 10 on inputs that carry a non-zero offset: one column
+ // is a slice of a longer array and the batch itself is sliced by one row.
+ TEST_F(TestFilter, TestOffset) {
+   // schema for input fields
+   auto field0 = field("f0", int32());
+   auto field1 = field("f1", int32());
+   auto schema = arrow::schema({field0, field1});
+
+   // Build condition f0 + f1 < 10
+   auto node_f0 = TreeExprBuilder::MakeField(field0);
+   auto node_f1 = TreeExprBuilder::MakeField(field1);
+   auto sum_func =
+       TreeExprBuilder::MakeFunction("add", {node_f0, node_f1}, arrow::int32());
+   auto literal_10 = TreeExprBuilder::MakeLiteral((int32_t)10);
+   auto less_than_10 = TreeExprBuilder::MakeFunction("less_than", {sum_func, literal_10},
+                                                     arrow::boolean());
+   auto condition = TreeExprBuilder::MakeCondition(less_than_10);
+
+   // Fatal assertion: `filter` is dereferenced below and is null on failure.
+   std::shared_ptr<Filter> filter;
+   auto status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+   ASSERT_OK(status);
+
+   // Create a row-batch with some sample data
+   int num_records = 5;
+   // six values, sliced by one so the column starts at the second element
+   auto array0 =
+       MakeArrowArrayInt32({0, 1, 2, 3, 4, 6}, {true, true, true, true, false, true});
+   array0 = array0->Slice(1);
+   auto array1 = MakeArrowArrayInt32({5, 9, 6, 17, 3}, {true, true, false, true, true});
+   // expected output (indices for which condition matches), relative to the
+   // sliced batch: only its last row (6 + 3) is non-null and below 10
+   auto exp = MakeArrowArrayUint16({3});
+
+   // prepare input record batch, then drop its first row
+   auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+   in_batch = in_batch->Slice(1);
+
+   std::shared_ptr<SelectionVector> selection_vector;
+   status = SelectionVector::MakeInt16(num_records, pool_, &selection_vector);
+   ASSERT_OK(status);
+
+   // Evaluate expression
+   status = filter->Evaluate(*in_batch, selection_vector);
+   ASSERT_OK(status);
+
+   // Validate results
+   EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
+ }
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/generate_data.h b/src/arrow/cpp/src/gandiva/tests/generate_data.h
new file mode 100644
index 000000000..9fb0e4eae
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/generate_data.h
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <stdlib.h>
+#include <random>
+#include <string>
+
+#include "arrow/util/decimal.h"
+#include "arrow/util/io_util.h"
+
+#pragma once
+
+namespace gandiva {
+
+ // Abstract source of test data: every call to GenerateData() produces one
+ // value of type C_TYPE.
+ template <class C_TYPE>
+ class DataGenerator {
+  public:
+   // Virtual so generators can be destroyed through a base pointer.
+   virtual ~DataGenerator() = default;
+
+   // Produces the next value; implemented by each concrete generator.
+   virtual C_TYPE GenerateData() = 0;
+ };
+
+ // Small wrapper around std::default_random_engine that returns the engine's
+ // output converted to int32_t.
+ class Random {
+  public:
+   // Seeds the engine from ::arrow::internal::GetRandomSeed().
+   Random() : gen_(::arrow::internal::GetRandomSeed()) {}
+
+   // Seeds the engine with a caller-supplied value.
+   explicit Random(uint64_t seed) : gen_(seed) {}
+
+   // Advances the engine and returns its output converted to int32_t.
+   int32_t next() {
+     const auto raw = gen_();
+     return static_cast<int32_t>(raw);
+   }
+
+  private:
+   std::default_random_engine gen_;
+ };
+
+ // Generates int32 values by forwarding each draw from Random::next().
+ class Int32DataGenerator : public DataGenerator<int32_t> {
+  public:
+   Int32DataGenerator() {}
+
+   // `override` makes the compiler check that this matches the base-class
+   // signature.
+   int32_t GenerateData() override { return random_.next(); }
+
+  protected:
+   // Protected so derived generators can draw from the same source.
+   Random random_;
+ };
+
+ // Generates int32 values reduced modulo a caller-supplied upper bound.
+ class BoundedInt32DataGenerator : public Int32DataGenerator {
+  public:
+   // upperBound must be non-zero: GenerateData() uses it as a modulus.
+   explicit BoundedInt32DataGenerator(uint32_t upperBound)
+       : Int32DataGenerator(), upperBound_(upperBound) {}
+
+   // Returns random_.next() % upperBound_. `override` makes the compiler check
+   // that this matches the base-class signature.
+   int32_t GenerateData() override {
+     int32_t value = (random_.next() % upperBound_);
+     return value;
+   }
+
+  protected:
+   uint32_t upperBound_;
+ };
+
+ // Generates int64 values. Each value is a single Random::next() draw (an
+ // int32_t) widened to int64_t.
+ class Int64DataGenerator : public DataGenerator<int64_t> {
+  public:
+   Int64DataGenerator() {}
+
+   // `override` makes the compiler check that this matches the base-class
+   // signature.
+   int64_t GenerateData() override { return random_.next(); }
+
+  protected:
+   Random random_;
+ };
+
+ // Generates arrow::Decimal128 values from two random draws, one for the low
+ // word and one for the high word.
+ class Decimal128DataGenerator : public DataGenerator<arrow::Decimal128> {
+  public:
+   // When `large` is true, 2^62 is added to the high word of every value.
+   explicit Decimal128DataGenerator(bool large) : large_(large) {}
+
+   // `override` makes the compiler check that this matches the base-class
+   // signature.
+   arrow::Decimal128 GenerateData() override {
+     // The low word is drawn first, then the high word.
+     uint64_t low = random_.next();
+     int64_t high = random_.next();
+     if (large_) {
+       high += (1ull << 62);
+     }
+     return arrow::Decimal128(high, low);
+   }
+
+  protected:
+   bool large_;
+   Random random_;
+ };
+
+ // Generates lowercase ASCII strings of random length below max_len. The
+ // characters come from a cursor that cycles through 'a'..'z' and carries
+ // over from one generated string to the next.
+ class FastUtf8DataGenerator : public DataGenerator<std::string> {
+  public:
+   // max_len is the exclusive upper bound on the generated length.
+   explicit FastUtf8DataGenerator(int max_len) : max_len_(max_len), cur_char_('a') {}
+
+   // Returns a string of random_.next() % max_len_ characters. `override`
+   // makes the compiler check that this matches the base-class signature.
+   std::string GenerateData() override {
+     std::string generated_str;
+
+     // A zero bound leaves no valid non-empty length, and using it as a
+     // modulus would be undefined behaviour, so return the empty string.
+     if (max_len_ == 0) {
+       return generated_str;
+     }
+
+     int slen = random_.next() % max_len_;
+     for (int i = 0; i < slen; ++i) {
+       generated_str += generate_next_char();
+     }
+     return generated_str;
+   }
+
+  private:
+   // Advances the cursor before returning it, wrapping from 'z' back to 'a';
+   // the first character ever produced is therefore 'b'.
+   char generate_next_char() {
+     ++cur_char_;
+     if (cur_char_ > 'z') {
+       cur_char_ = 'a';
+     }
+     return cur_char_;
+   }
+
+   Random random_;
+   unsigned int max_len_;
+   char cur_char_;
+ };
+
+ // Generates the decimal string form of a random int32 draw.
+ class Utf8IntDataGenerator : public DataGenerator<std::string> {
+  public:
+   Utf8IntDataGenerator() {}
+
+   // `override` makes the compiler check that this matches the base-class
+   // signature.
+   std::string GenerateData() override { return std::to_string(random_.next()); }
+
+  private:
+   Random random_;
+ };
+
+ // Generates the string form of a random float.
+ class Utf8FloatDataGenerator : public DataGenerator<std::string> {
+  public:
+   Utf8FloatDataGenerator() {}
+
+   // `override` makes the compiler check that this matches the base-class
+   // signature.
+   std::string GenerateData() override {
+     // Random::next() returns an int32_t, so scale by the int32 maximum.
+     // RAND_MAX bounds rand(), not this engine, and is far smaller on some
+     // platforms, which would push results well beyond 100.
+     const float scale = static_cast<float>(INT32_MAX / 100);
+     // random float between 0.0 to 100.0 for non-negative draws
+     return std::to_string(static_cast<float>(random_.next()) / scale);
+   }
+
+  private:
+   Random random_;
+ };
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/hash_test.cc b/src/arrow/cpp/src/gandiva/tests/hash_test.cc
new file mode 100644
index 000000000..40ebc50a2
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/hash_test.cc
@@ -0,0 +1,615 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <sstream>
+
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::float32;
+using arrow::float64;
+using arrow::int32;
+using arrow::int64;
+using arrow::utf8;
+
+// Fixture for the hash function tests; it provides the memory pool that is
+// handed to Projector::Evaluate.
+class TestHash : public ::testing::Test {
+ public:
+  void SetUp() {
+    // Every test case evaluates against Arrow's default memory pool.
+    pool_ = arrow::default_memory_pool();
+  }
+
+ protected:
+  arrow::MemoryPool* pool_;
+};
+
+// Projects hash32(a, 10) and hash64(a) over an int32 column whose first row is
+// null. Checks that no output is null, that the null row yields 10 for
+// hash32(a, 10) and 0 for hash64(a), and that neighbouring rows hash to
+// different values.
+TEST_F(TestHash, TestSimple) {
+  // schema for input fields
+  auto field_a = field("a", int32());
+  auto schema = arrow::schema({field_a});
+
+  // output fields
+  auto res_0 = field("res0", int32());
+  auto res_1 = field("res1", int64());
+
+  // build expression.
+  // hash32(a, 10)
+  // hash64(a)
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto literal_10 = TreeExprBuilder::MakeLiteral(static_cast<int32_t>(10));
+  auto hash32 = TreeExprBuilder::MakeFunction("hash32", {node_a, literal_10}, int32());
+  auto hash64 = TreeExprBuilder::MakeFunction("hash64", {node_a}, int64());
+  auto expr_0 = TreeExprBuilder::MakeExpression(hash32, res_0);
+  auto expr_1 = TreeExprBuilder::MakeExpression(hash64, res_1);
+
+  // Build a projector for the expression. The check is fatal because the
+  // projector is dereferenced below.
+  std::shared_ptr<Projector> projector;
+  auto status =
+      Projector::Make(schema, {expr_0, expr_1}, TestConfiguration(), &projector);
+  ASSERT_OK(status) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+  auto array_a = MakeArrowArrayInt32({1, 2, 3, 4}, {false, true, true, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Evaluate expression. The check is fatal because outputs is indexed below.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Validate results
+  auto int32_arr = std::dynamic_pointer_cast<arrow::Int32Array>(outputs.at(0));
+  EXPECT_EQ(int32_arr->null_count(), 0);
+  EXPECT_EQ(int32_arr->Value(0), 10);
+  for (int i = 1; i < num_records; ++i) {
+    EXPECT_NE(int32_arr->Value(i), int32_arr->Value(i - 1));
+  }
+
+  auto int64_arr = std::dynamic_pointer_cast<arrow::Int64Array>(outputs.at(1));
+  EXPECT_EQ(int64_arr->null_count(), 0);
+  EXPECT_EQ(int64_arr->Value(0), 0);
+  for (int i = 1; i < num_records; ++i) {
+    EXPECT_NE(int64_arr->Value(i), int64_arr->Value(i - 1));
+  }
+}
+
+// Projects hash32(a) and hash64(a, 10) over a utf8 column whose first row is
+// null. Checks that no output is null, that the null row yields 0 for hash32(a)
+// and 10 for hash64(a, 10), and that neighbouring rows hash to different values.
+TEST_F(TestHash, TestBuf) {
+  // Input schema: a single utf8 column.
+  auto in_field = field("a", utf8());
+  auto schema = arrow::schema({in_field});
+
+  // Output columns.
+  auto out_hash32 = field("res0", int32());
+  auto out_hash64 = field("res1", int64());
+
+  // Expressions: hash32(a) and hash64(a, 10).
+  auto in_node = TreeExprBuilder::MakeField(in_field);
+  auto ten = TreeExprBuilder::MakeLiteral(static_cast<int64_t>(10));
+  auto expr_hash32 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("hash32", {in_node}, int32()), out_hash32);
+  auto expr_hash64 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("hash64", {in_node, ten}, int64()), out_hash64);
+
+  // Compile both expressions into one projector.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr_hash32, expr_hash64},
+                                TestConfiguration(), &projector);
+  ASSERT_OK(status) << status.message();
+
+  // Sample data; the first row is marked invalid.
+  int num_records = 4;
+  auto strings =
+      MakeArrowArrayUtf8({"foo", "hello", "bye", "hi"}, {false, true, true, true});
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {strings});
+
+  // Run the projector.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // hash32 output.
+  auto hashes32 = std::dynamic_pointer_cast<arrow::Int32Array>(outputs.at(0));
+  EXPECT_EQ(hashes32->null_count(), 0);
+  EXPECT_EQ(hashes32->Value(0), 0);
+  for (int row = 1; row < num_records; ++row) {
+    EXPECT_NE(hashes32->Value(row), hashes32->Value(row - 1));
+  }
+
+  // hash64 output.
+  auto hashes64 = std::dynamic_pointer_cast<arrow::Int64Array>(outputs.at(1));
+  EXPECT_EQ(hashes64->null_count(), 0);
+  EXPECT_EQ(hashes64->Value(0), 10);
+  for (int row = 1; row < num_records; ++row) {
+    EXPECT_NE(hashes64->Value(row), hashes64->Value(row - 1));
+  }
+}
+
+// Applies hashSHA256 to int32, int64, float32 and float64 columns holding the
+// same two rows (a null followed by zero). Checks that all four outputs are
+// equal, that the non-null hash has 64 characters and that it differs from the
+// hash of the null row.
+TEST_F(TestHash, TestSha256Simple) {
+  // Input schema: one column per numeric type.
+  auto field_a = field("a", int32());
+  auto field_b = field("b", int64());
+  auto field_c = field("c", float32());
+  auto field_d = field("d", float64());
+  auto schema = arrow::schema({field_a, field_b, field_c, field_d});
+
+  // Output columns.
+  auto res_0 = field("res0", utf8());
+  auto res_1 = field("res1", utf8());
+  auto res_2 = field("res2", utf8());
+  auto res_3 = field("res3", utf8());
+
+  // Field nodes for the four inputs.
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto node_b = TreeExprBuilder::MakeField(field_b);
+  auto node_c = TreeExprBuilder::MakeField(field_c);
+  auto node_d = TreeExprBuilder::MakeField(field_d);
+
+  // One hashSHA256(<column>) expression per input.
+  auto expr_0 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("hashSHA256", {node_a}, utf8()), res_0);
+  auto expr_1 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("hashSHA256", {node_b}, utf8()), res_1);
+  auto expr_2 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("hashSHA256", {node_c}, utf8()), res_2);
+  auto expr_3 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("hashSHA256", {node_d}, utf8()), res_3);
+
+  // Compile the expressions.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr_0, expr_1, expr_2, expr_3},
+                                TestConfiguration(), &projector);
+  ASSERT_OK(status) << status.message();
+
+  // Sample data: row 0 is null, row 1 is zero in every column.
+  int num_records = 2;
+  auto validity = {false, true};
+  auto ints32 = MakeArrowArrayInt32({1, 0}, validity);
+  auto ints64 = MakeArrowArrayInt64({1, 0}, validity);
+  auto floats32 = MakeArrowArrayFloat32({1.0, 0.0}, validity);
+  auto floats64 = MakeArrowArrayFloat64({1.0, 0.0}, validity);
+
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records,
+                                           {ints32, ints64, floats32, floats64});
+
+  // Run the projector.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  auto hashed_int32 = outputs.at(0);
+  auto hashed_int64 = outputs.at(1);
+  auto hashed_float32 = outputs.at(2);
+  auto hashed_float64 = outputs.at(3);
+
+  // Null and zero must hash identically whatever the numeric type is.
+  EXPECT_ARROW_ARRAY_EQUALS(hashed_int32, hashed_int64);
+  EXPECT_ARROW_ARRAY_EQUALS(hashed_int64, hashed_float32);
+  EXPECT_ARROW_ARRAY_EQUALS(hashed_float32, hashed_float64);
+
+  const int sha256_hash_size = 64;
+
+  // Each hash after the first has the expected length and differs from the
+  // hash of the preceding row.
+  for (int row = 1; row < num_records; ++row) {
+    const std::string current = hashed_int32->GetScalar(row).ValueOrDie()->ToString();
+    const std::string previous =
+        hashed_int32->GetScalar(row - 1).ValueOrDie()->ToString();
+
+    EXPECT_EQ(current.size(), sha256_hash_size);
+    EXPECT_NE(current, previous);
+  }
+}
+
+// Applies hashSHA256 to a utf8 column with one null row and two long
+// multi-byte strings. Checks that no output is null, that every hash has 64
+// characters and that neighbouring rows hash to different values.
+TEST_F(TestHash, TestSha256Varlen) {
+  // schema for input fields
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+
+  // output fields
+  auto res_0 = field("res0", utf8());
+
+  // build expressions.
+  // hashSHA256(a)
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto hashSha256 = TreeExprBuilder::MakeFunction("hashSHA256", {node_a}, utf8());
+  auto expr_0 = TreeExprBuilder::MakeExpression(hashSha256, res_0);
+
+  // Build a projector for the expressions. The check is fatal because the
+  // projector is dereferenced below.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr_0}, TestConfiguration(), &projector);
+  ASSERT_OK(status) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 3;
+
+  std::string first_string =
+      "ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn\nY "
+      "[ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]";
+  std::string second_string =
+      "ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeın\nY "
+      "[ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ] コンニチハ";
+
+  auto array_a =
+      MakeArrowArrayUtf8({"foo", first_string, second_string}, {false, true, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  auto response = outputs.at(0);
+  const int sha256_hash_size = 64;
+
+  EXPECT_EQ(response->null_count(), 0);
+
+  // Checks that the null value was hashed
+  EXPECT_NE(response->GetScalar(0).ValueOrDie()->ToString(), "");
+  EXPECT_EQ(response->GetScalar(0).ValueOrDie()->ToString().size(), sha256_hash_size);
+
+  // Check that each hash differs from the hash of the preceding row
+  for (int i = 1; i < num_records; ++i) {
+    const auto& value_at_position = response->GetScalar(i).ValueOrDie()->ToString();
+
+    EXPECT_EQ(value_at_position.size(), sha256_hash_size);
+    EXPECT_NE(value_at_position, response->GetScalar(i - 1).ValueOrDie()->ToString());
+  }
+}
+
+// Applies hashSHA1 to int32, int64, float32 and float64 columns holding the
+// same two rows (a null followed by zero). Checks that all four outputs are
+// equal, that the non-null hash has 40 characters and that it differs from the
+// hash of the null row.
+TEST_F(TestHash, TestSha1Simple) {
+  // schema for input fields
+  auto field_a = field("a", int32());
+  auto field_b = field("b", int64());
+  auto field_c = field("c", float32());
+  auto field_d = field("d", float64());
+  auto schema = arrow::schema({field_a, field_b, field_c, field_d});
+
+  // output fields
+  auto res_0 = field("res0", utf8());
+  auto res_1 = field("res1", utf8());
+  auto res_2 = field("res2", utf8());
+  auto res_3 = field("res3", utf8());
+
+  // build expressions.
+  // hashSHA1(a)
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto hashSha1_1 = TreeExprBuilder::MakeFunction("hashSHA1", {node_a}, utf8());
+  auto expr_0 = TreeExprBuilder::MakeExpression(hashSha1_1, res_0);
+
+  // hashSHA1(b)
+  auto node_b = TreeExprBuilder::MakeField(field_b);
+  auto hashSha1_2 = TreeExprBuilder::MakeFunction("hashSHA1", {node_b}, utf8());
+  auto expr_1 = TreeExprBuilder::MakeExpression(hashSha1_2, res_1);
+
+  // hashSHA1(c)
+  auto node_c = TreeExprBuilder::MakeField(field_c);
+  auto hashSha1_3 = TreeExprBuilder::MakeFunction("hashSHA1", {node_c}, utf8());
+  auto expr_2 = TreeExprBuilder::MakeExpression(hashSha1_3, res_2);
+
+  // hashSHA1(d)
+  auto node_d = TreeExprBuilder::MakeField(field_d);
+  auto hashSha1_4 = TreeExprBuilder::MakeFunction("hashSHA1", {node_d}, utf8());
+  auto expr_3 = TreeExprBuilder::MakeExpression(hashSha1_4, res_3);
+
+  // Build a projector for the expressions. The check is fatal because the
+  // projector is dereferenced below.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr_0, expr_1, expr_2, expr_3},
+                                TestConfiguration(), &projector);
+  ASSERT_OK(status) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 2;
+  auto validity_array = {false, true};
+
+  auto array_int32 = MakeArrowArrayInt32({1, 0}, validity_array);
+
+  auto array_int64 = MakeArrowArrayInt64({1, 0}, validity_array);
+
+  auto array_float32 = MakeArrowArrayFloat32({1.0, 0.0}, validity_array);
+
+  auto array_float64 = MakeArrowArrayFloat64({1.0, 0.0}, validity_array);
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(
+      schema, num_records, {array_int32, array_int64, array_float32, array_float64});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  auto response_int32 = outputs.at(0);
+  auto response_int64 = outputs.at(1);
+  auto response_float32 = outputs.at(2);
+  auto response_float64 = outputs.at(3);
+
+  // Checks if the null and zero representation for numeric values
+  // are consistent between the types
+  EXPECT_ARROW_ARRAY_EQUALS(response_int32, response_int64);
+  EXPECT_ARROW_ARRAY_EQUALS(response_int64, response_float32);
+  EXPECT_ARROW_ARRAY_EQUALS(response_float32, response_float64);
+
+  const int sha1_hash_size = 40;
+
+  // Checks if the hash size in response is correct
+  for (int i = 1; i < num_records; ++i) {
+    const auto& value_at_position = response_int32->GetScalar(i).ValueOrDie()->ToString();
+
+    EXPECT_EQ(value_at_position.size(), sha1_hash_size);
+    EXPECT_NE(value_at_position,
+              response_int32->GetScalar(i - 1).ValueOrDie()->ToString());
+  }
+}
+
+// Applies hashSHA1 to a utf8 column with one null row and two long multi-byte
+// strings. Checks that no output is null, that every hash has 40 characters
+// and that neighbouring rows hash to different values.
+TEST_F(TestHash, TestSha1Varlen) {
+  // Input schema: a single utf8 column.
+  auto in_field = field("a", utf8());
+  auto schema = arrow::schema({in_field});
+
+  // Output column.
+  auto out_field = field("res0", utf8());
+
+  // Expression: hashSHA1(a).
+  auto in_node = TreeExprBuilder::MakeField(in_field);
+  auto sha1_expr = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("hashSHA1", {in_node}, utf8()), out_field);
+
+  // Compile the expression.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {sha1_expr}, TestConfiguration(), &projector);
+  ASSERT_OK(status) << status.message();
+
+  // Sample data; the first row is marked invalid.
+  int num_records = 3;
+
+  std::string first_string =
+      "ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn\nY [ˈʏpsilɔn], "
+      "Yen [jɛn], Yoga [ˈjoːgɑ]";
+  std::string second_string =
+      "ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeın\nY [ˈʏpsilɔn], "
+      "Yen [jɛn], Yoga [ˈjoːgɑ] コンニチハ";
+
+  auto strings =
+      MakeArrowArrayUtf8({"", first_string, second_string}, {false, true, true});
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {strings});
+
+  // Run the projector.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  auto hashes = outputs.at(0);
+  const int sha1_hash_size = 40;
+
+  EXPECT_EQ(hashes->null_count(), 0);
+
+  // The null row must still produce a full-length hash.
+  EXPECT_NE(hashes->GetScalar(0).ValueOrDie()->ToString(), "");
+  EXPECT_EQ(hashes->GetScalar(0).ValueOrDie()->ToString().size(), sha1_hash_size);
+
+  // Every later hash has the expected length and differs from its predecessor.
+  for (int row = 1; row < num_records; ++row) {
+    const std::string current = hashes->GetScalar(row).ValueOrDie()->ToString();
+
+    EXPECT_EQ(current.size(), sha1_hash_size);
+    EXPECT_NE(current, hashes->GetScalar(row - 1).ValueOrDie()->ToString());
+  }
+}
+
+// Checks that the function names hashSHA1, sha1 and sha produce identical
+// outputs for a utf8, an int64 and a float64 column.
+TEST_F(TestHash, TestSha1FunctionsAlias) {
+  // Input schema.
+  auto field_a = field("a", utf8());
+  auto field_b = field("c", int64());
+  auto field_c = field("e", float64());
+  auto schema = arrow::schema({field_a, field_b, field_c});
+
+  // Output columns: one per (input column, function name) pair.
+  auto res_0 = field("res0", utf8());
+  auto res_0_sha1 = field("res0sha1", utf8());
+  auto res_0_sha = field("res0sha", utf8());
+
+  auto res_1 = field("res1", utf8());
+  auto res_1_sha1 = field("res1sha1", utf8());
+  auto res_1_sha = field("res1sha", utf8());
+
+  auto res_2 = field("res2", utf8());
+  auto res_2_sha1 = field("res2_sha1", utf8());
+  auto res_2_sha = field("res2_sha", utf8());
+
+  // Field nodes for the three inputs.
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto node_b = TreeExprBuilder::MakeField(field_b);
+  auto node_c = TreeExprBuilder::MakeField(field_c);
+
+  // hashSHA1 / sha1 / sha over the utf8 column.
+  auto expr_0 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("hashSHA1", {node_a}, utf8()), res_0);
+  auto expr_0_sha1 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("sha1", {node_a}, utf8()), res_0_sha1);
+  auto expr_0_sha = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("sha", {node_a}, utf8()), res_0_sha);
+
+  // hashSHA1 / sha1 / sha over the int64 column.
+  auto expr_1 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("hashSHA1", {node_b}, utf8()), res_1);
+  auto expr_1_sha1 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("sha1", {node_b}, utf8()), res_1_sha1);
+  auto expr_1_sha = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("sha", {node_b}, utf8()), res_1_sha);
+
+  // hashSHA1 / sha1 / sha over the float64 column.
+  auto expr_2 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("hashSHA1", {node_c}, utf8()), res_2);
+  auto expr_2_sha1 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("sha1", {node_c}, utf8()), res_2_sha1);
+  auto expr_2_sha = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("sha", {node_c}, utf8()), res_2_sha);
+
+  // Compile the expressions; per column the output order is hashSHA1, sha, sha1.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema,
+                                {expr_0, expr_0_sha, expr_0_sha1, expr_1, expr_1_sha,
+                                 expr_1_sha1, expr_2, expr_2_sha, expr_2_sha1},
+                                TestConfiguration(), &projector);
+  ASSERT_OK(status) << status.message();
+
+  // Sample data; the first row of every column is marked invalid.
+  int32_t num_records = 3;
+
+  std::string first_string =
+      "ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn\nY [ˈʏpsilɔn], "
+      "Yen [jɛn], Yoga [ˈjoːgɑ]";
+  std::string second_string =
+      "ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeın\nY [ˈʏpsilɔn], "
+      "Yen [jɛn], Yoga [ˈjoːgɑ] コンニチハ";
+
+  auto strings =
+      MakeArrowArrayUtf8({"", first_string, second_string}, {false, true, true});
+
+  auto validity = {false, true, true};
+  auto longs = MakeArrowArrayInt64({1, 0, 32423}, validity);
+  auto doubles = MakeArrowArrayFloat64({1.0, 0.0, 324893.3849}, validity);
+
+  auto in_batch =
+      arrow::RecordBatch::Make(schema, num_records, {strings, longs, doubles});
+
+  // Run the projector.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // utf8 column (first field): hashSHA1, sha and sha1 must agree.
+  EXPECT_ARROW_ARRAY_EQUALS(outputs.at(0), outputs.at(1));  // hashSHA1 vs sha
+  EXPECT_ARROW_ARRAY_EQUALS(outputs.at(1), outputs.at(2));  // sha vs sha1
+
+  // int64 column (second field): hashSHA1, sha and sha1 must agree.
+  EXPECT_ARROW_ARRAY_EQUALS(outputs.at(3), outputs.at(4));  // hashSHA1 vs sha
+  EXPECT_ARROW_ARRAY_EQUALS(outputs.at(4), outputs.at(5));  // sha vs sha1
+
+  // float64 column (third field): hashSHA1, sha and sha1 must agree.
+  EXPECT_ARROW_ARRAY_EQUALS(outputs.at(6), outputs.at(7));  // hashSHA1 vs sha
+  EXPECT_ARROW_ARRAY_EQUALS(outputs.at(7), outputs.at(8));  // sha vs sha1
+}
+
+// Checks that the function names hashSHA256 and sha256 produce identical
+// outputs for a utf8, an int64 and a float64 column.
+TEST_F(TestHash, TestSha256FunctionsAlias) {
+  // Input schema.
+  auto field_a = field("a", utf8());
+  auto field_b = field("c", int64());
+  auto field_c = field("e", float64());
+  auto schema = arrow::schema({field_a, field_b, field_c});
+
+  // Output columns: one per (input column, function name) pair.
+  auto res_0 = field("res0", utf8());
+  auto res_0_sha256 = field("res0sha256", utf8());
+
+  auto res_1 = field("res1", utf8());
+  auto res_1_sha256 = field("res1sha256", utf8());
+
+  auto res_2 = field("res2", utf8());
+  auto res_2_sha256 = field("res2_sha256", utf8());
+
+  // Field nodes for the three inputs.
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto node_b = TreeExprBuilder::MakeField(field_b);
+  auto node_c = TreeExprBuilder::MakeField(field_c);
+
+  // hashSHA256 / sha256 over the utf8 column.
+  auto expr_0 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("hashSHA256", {node_a}, utf8()), res_0);
+  auto expr_0_sha256 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("sha256", {node_a}, utf8()), res_0_sha256);
+
+  // hashSHA256 / sha256 over the int64 column.
+  auto expr_1 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("hashSHA256", {node_b}, utf8()), res_1);
+  auto expr_1_sha256 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("sha256", {node_b}, utf8()), res_1_sha256);
+
+  // hashSHA256 / sha256 over the float64 column.
+  auto expr_2 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("hashSHA256", {node_c}, utf8()), res_2);
+  auto expr_2_sha256 = TreeExprBuilder::MakeExpression(
+      TreeExprBuilder::MakeFunction("sha256", {node_c}, utf8()), res_2_sha256);
+
+  // Compile the expressions; per column the output order is hashSHA256, sha256.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(
+      schema, {expr_0, expr_0_sha256, expr_1, expr_1_sha256, expr_2, expr_2_sha256},
+      TestConfiguration(), &projector);
+  ASSERT_OK(status) << status.message();
+
+  // Sample data; the first row of every column is marked invalid.
+  int32_t num_records = 3;
+
+  std::string first_string =
+      "ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn\nY [ˈʏpsilɔn], "
+      "Yen [jɛn], Yoga [ˈjoːgɑ]";
+  std::string second_string =
+      "ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeın\nY [ˈʏpsilɔn], "
+      "Yen [jɛn], Yoga [ˈjoːgɑ] コンニチハ";
+
+  auto strings =
+      MakeArrowArrayUtf8({"", first_string, second_string}, {false, true, true});
+
+  auto validity = {false, true, true};
+  auto longs = MakeArrowArrayInt64({1, 0, 32423}, validity);
+  auto doubles = MakeArrowArrayFloat64({1.0, 0.0, 324893.3849}, validity);
+
+  auto in_batch =
+      arrow::RecordBatch::Make(schema, num_records, {strings, longs, doubles});
+
+  // Run the projector.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // utf8 column (first field): hashSHA256 and sha256 must agree.
+  EXPECT_ARROW_ARRAY_EQUALS(outputs.at(0), outputs.at(1));
+
+  // int64 column (second field): hashSHA256 and sha256 must agree.
+  EXPECT_ARROW_ARRAY_EQUALS(outputs.at(2), outputs.at(3));
+
+  // float64 column (third field): hashSHA256 and sha256 must agree.
+  EXPECT_ARROW_ARRAY_EQUALS(outputs.at(4), outputs.at(5));
+}
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/huge_table_test.cc b/src/arrow/cpp/src/gandiva/tests/huge_table_test.cc
new file mode 100644
index 000000000..46f814b47
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/huge_table_test.cc
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include "arrow/memory_pool.h"
+#include "gandiva/filter.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::float32;
+using arrow::int32;
+
+// Fixture for the huge-batch projector test; it provides the memory pool that
+// is handed to Projector::Evaluate.
+class LARGE_MEMORY_TEST(TestHugeProjector) : public ::testing::Test {
+ public:
+  void SetUp() {
+    // Evaluate against Arrow's default memory pool.
+    pool_ = arrow::default_memory_pool();
+  }
+
+ protected:
+  arrow::MemoryPool* pool_;
+};
+
+// Fixture for the huge-batch filter test; it provides the memory pool that is
+// used to allocate the selection vector.
+class LARGE_MEMORY_TEST(TestHugeFilter) : public ::testing::Test {
+ public:
+  void SetUp() {
+    // Allocate from Arrow's default memory pool.
+    pool_ = arrow::default_memory_pool();
+  }
+
+ protected:
+  arrow::MemoryPool* pool_;
+};
+
+// Projects add(f0, f1) over a batch with more than INT32_MAX rows and compares
+// the output with sums computed on the host.
+TEST_F(LARGE_MEMORY_TEST(TestHugeProjector), SimpleTestSumHuge) {
+  auto atype = arrow::TypeTraits<arrow::Int32Type>::type_singleton();
+
+  // schema for input fields
+  auto field0 = field("f0", atype);
+  auto field1 = field("f1", atype);
+  auto schema = arrow::schema({field0, field1});
+
+  // output fields
+  auto field_sum = field("add", atype);
+
+  // Build expression. The check is fatal because the projector is
+  // dereferenced below.
+  auto sum_expr = TreeExprBuilder::MakeExpression("add", {field0, field1}, field_sum);
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {sum_expr}, TestConfiguration(), &projector);
+  ASSERT_OK(status) << status.message();
+
+  // Create a row-batch with some sample data
+  // Use more rows than an int32_t row index can address.
+  int64_t num_records = static_cast<int64_t>(INT32_MAX) + 3;
+  std::vector<int32_t> input0 = {2, 29, 5, 37, 11, 59, 17, 19};
+  std::vector<int32_t> input1 = {23, 3, 31, 7, 41, 47, 13};
+  std::vector<bool> validity;
+
+  std::vector<int32_t> arr1;
+  std::vector<int32_t> arr2;
+  // expected output
+  std::vector<int32_t> sum1;
+
+  // The final size is known, so allocate once instead of growing the
+  // multi-gigabyte vectors by repeated reallocation.
+  const size_t capacity = static_cast<size_t>(num_records);
+  arr1.reserve(capacity);
+  arr2.reserve(capacity);
+  sum1.reserve(capacity);
+  validity.reserve(capacity);
+
+  // The two inputs cycle with periods 8 and 7.
+  for (int64_t i = 0; i < num_records; i++) {
+    arr1.push_back(input0[i % 8]);
+    arr2.push_back(input1[i % 7]);
+    sum1.push_back(input0[i % 8] + input1[i % 7]);
+    validity.push_back(true);
+  }
+
+  auto exp_sum = MakeArrowArray<arrow::Int32Type, int32_t>(sum1, validity);
+  auto array0 = MakeArrowArray<arrow::Int32Type, int32_t>(arr1, validity);
+  auto array1 = MakeArrowArray<arrow::Int32Type, int32_t>(arr2, validity);
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+  // Evaluate expression. The check is fatal because outputs is indexed below.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp_sum, outputs.at(0));
+}
+
+// Filters a batch with more than INT32_MAX rows on f0 + f1 < 50 and compares
+// the resulting selection vector with row indices computed on the host.
+TEST_F(LARGE_MEMORY_TEST(TestHugeFilter), TestSimpleHugeFilter) {
+  // Create a row-batch with some sample data
+  // Use more rows than an int32_t row index can address.
+  int64_t num_records = static_cast<int64_t>(INT32_MAX) + 3;
+  std::vector<int32_t> input0 = {2, 29, 5, 37, 11, 59, 17, 19};
+  std::vector<int32_t> input1 = {23, 3, 31, 7, 41, 47, 13};
+  std::vector<bool> validity;
+
+  std::vector<int32_t> arr1;
+  std::vector<int32_t> arr2;
+  // expected output: indices of the rows that satisfy the filter condition
+  std::vector<uint64_t> sel;
+
+  // The final size of the inputs is known, so allocate once up front.
+  const size_t capacity = static_cast<size_t>(num_records);
+  arr1.reserve(capacity);
+  arr2.reserve(capacity);
+  validity.reserve(capacity);
+
+  for (int64_t i = 0; i < num_records; i++) {
+    arr1.push_back(input0[i % 8]);
+    arr2.push_back(input1[i % 7]);
+    // Must mirror the less_than(add(f0, f1), 50) condition built below.
+    if (input0[i % 8] + input1[i % 7] < 50) {
+      sel.push_back(i);
+    }
+    validity.push_back(true);
+  }
+
+  auto exp = MakeArrowArrayUint64(sel);
+
+  // RecordBatch::Make needs Arrow arrays, not the raw vectors.
+  auto array0 = MakeArrowArray<arrow::Int32Type, int32_t>(arr1, validity);
+  auto array1 = MakeArrowArray<arrow::Int32Type, int32_t>(arr2, validity);
+
+  // schema for input fields
+  auto field0 = field("f0", int32());
+  auto field1 = field("f1", int32());
+  auto schema = arrow::schema({field0, field1});
+
+  // Build condition f0 + f1 < 50
+  auto node_f0 = TreeExprBuilder::MakeField(field0);
+  auto node_f1 = TreeExprBuilder::MakeField(field1);
+  auto sum_func =
+      TreeExprBuilder::MakeFunction("add", {node_f0, node_f1}, arrow::int32());
+  auto literal_50 = TreeExprBuilder::MakeLiteral(static_cast<int32_t>(50));
+  auto less_than_50 = TreeExprBuilder::MakeFunction("less_than", {sum_func, literal_50},
+                                                    arrow::boolean());
+  auto condition = TreeExprBuilder::MakeCondition(less_than_50);
+
+  // The check is fatal because the filter is dereferenced below.
+  std::shared_ptr<Filter> filter;
+  auto status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+  ASSERT_OK(status) << status.message();
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+  // The check is fatal because the selection vector is dereferenced below.
+  std::shared_ptr<SelectionVector> selection_vector;
+  status = SelectionVector::MakeInt64(num_records, pool_, &selection_vector);
+  ASSERT_OK(status);
+
+  // Evaluate expression
+  status = filter->Evaluate(*in_batch, selection_vector);
+  ASSERT_OK(status);
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/if_expr_test.cc b/src/arrow/cpp/src/gandiva/tests/if_expr_test.cc
new file mode 100644
index 000000000..54b6d43b4
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/if_expr_test.cc
@@ -0,0 +1,378 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::float32;
+using arrow::int32;
+
+class TestIfExpr : public ::testing::Test {  // fixture shared by the if/else tests
+ public:
+ void SetUp() override { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+ arrow::MemoryPool* pool_;  // assigned in SetUp(); handed to Projector::Evaluate
+};
+
+TEST_F(TestIfExpr, TestSimple) {
+ // Projects `if (a > b) a else b` over two int32 input columns.
+ auto fielda = field("a", int32());
+ auto fieldb = field("b", int32());
+ auto schema = arrow::schema({fielda, fieldb});
+
+ // output fields
+ auto field_result = field("res", int32());
+
+ // build expression.
+ // if (a > b)
+ // a
+ // else
+ // b
+ auto node_a = TreeExprBuilder::MakeField(fielda);
+ auto node_b = TreeExprBuilder::MakeField(fieldb);
+ auto condition =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, node_b}, boolean());
+ auto if_node = TreeExprBuilder::MakeIf(condition, node_a, node_b, int32());
+
+ auto expr = TreeExprBuilder::MakeExpression(if_node, field_result);
+
+ // Build a projector; a failed build must stop the test before projector is used.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+ auto array0 = MakeArrowArrayInt32({10, 12, -20, 5}, {true, true, true, false});
+ auto array1 = MakeArrowArrayInt32({5, 15, 15, 17}, {true, true, true, true});
+
+ // expected output: row 3 has a null `a`, and the else value b=17 is expected
+ auto exp = MakeArrowArrayInt32({10, 15, 15, 17}, {true, true, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate expression; fatal on error so outputs.at(0) is never hit when empty
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+TEST_F(TestIfExpr, TestSimpleArithmetic) {
+ // Projects `if (a > b) a + b else a - b` over two int32 input columns.
+ auto fielda = field("a", int32());
+ auto fieldb = field("b", int32());
+ auto schema = arrow::schema({fielda, fieldb});
+
+ // output fields
+ auto field_result = field("res", int32());
+
+ // build expression.
+ // if (a > b)
+ // a + b
+ // else
+ // a - b
+ auto node_a = TreeExprBuilder::MakeField(fielda);
+ auto node_b = TreeExprBuilder::MakeField(fieldb);
+ auto condition =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, node_b}, boolean());
+ auto sum = TreeExprBuilder::MakeFunction("add", {node_a, node_b}, int32());
+ auto sub = TreeExprBuilder::MakeFunction("subtract", {node_a, node_b}, int32());
+ auto if_node = TreeExprBuilder::MakeIf(condition, sum, sub, int32());
+
+ auto expr = TreeExprBuilder::MakeExpression(if_node, field_result);
+
+ // Build a projector; a failed build must stop the test before projector is used.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+ auto array0 = MakeArrowArrayInt32({10, 12, -20, 5}, {true, true, true, false});
+ auto array1 = MakeArrowArrayInt32({5, 15, 15, 17}, {true, true, true, true});
+
+ // expected output: row 3 has a null `a`, so a - b is expected to be null
+ auto exp = MakeArrowArrayInt32({15, -3, -35, 0}, {true, true, true, false});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate expression; fatal on error so outputs.at(0) is never hit when empty
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+TEST_F(TestIfExpr, TestNested) {
+ // Projects an if / else-if / else chain (add, subtract, multiply) over a and b.
+ auto fielda = field("a", int32());
+ auto fieldb = field("b", int32());
+ auto schema = arrow::schema({fielda, fieldb});
+
+ // output fields
+ auto field_result = field("res", int32());
+
+ // build expression.
+ // if (a > b)
+ // a + b
+ // else if (a < b)
+ // a - b
+ // else
+ // a * b
+ auto node_a = TreeExprBuilder::MakeField(fielda);
+ auto node_b = TreeExprBuilder::MakeField(fieldb);
+ auto condition_gt =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, node_b}, boolean());
+ auto condition_lt =
+ TreeExprBuilder::MakeFunction("less_than", {node_a, node_b}, boolean());
+ auto sum = TreeExprBuilder::MakeFunction("add", {node_a, node_b}, int32());
+ auto sub = TreeExprBuilder::MakeFunction("subtract", {node_a, node_b}, int32());
+ auto mult = TreeExprBuilder::MakeFunction("multiply", {node_a, node_b}, int32());
+ auto else_node = TreeExprBuilder::MakeIf(condition_lt, sub, mult, int32());
+ auto if_node = TreeExprBuilder::MakeIf(condition_gt, sum, else_node, int32());
+
+ auto expr = TreeExprBuilder::MakeExpression(if_node, field_result);
+
+ // Build a projector; a failed build must stop the test before projector is used.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+ auto array0 = MakeArrowArrayInt32({10, 12, 15, 5}, {true, true, true, false});
+ auto array1 = MakeArrowArrayInt32({5, 15, 15, 17}, {true, true, true, true});
+
+ // expected output: row 2 (a == b) takes the multiply branch; row 3 is null
+ auto exp = MakeArrowArrayInt32({15, -3, 225, 0}, {true, true, true, false});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate expression; fatal on error so outputs.at(0) is never hit when empty
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+TEST_F(TestIfExpr, TestNestedInIf) {
+ // Projects an if whose then-branch is itself an if, over three int32 columns.
+ auto fielda = field("a", int32());
+ auto fieldb = field("b", int32());
+ auto fieldc = field("c", int32());
+ auto schema = arrow::schema({fielda, fieldb, fieldc});
+
+ // output fields
+ auto field_result = field("res", int32());
+
+ // build expression.
+ // if (a > 10)
+ // if (a < 20)
+ // a + b
+ // else
+ // b + c
+ // else
+ // a + c
+ auto node_a = TreeExprBuilder::MakeField(fielda);
+ auto node_b = TreeExprBuilder::MakeField(fieldb);
+ auto node_c = TreeExprBuilder::MakeField(fieldc);
+
+ auto literal_10 = TreeExprBuilder::MakeLiteral(10);
+ auto literal_20 = TreeExprBuilder::MakeLiteral(20);
+
+ auto gt_10 =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, literal_10}, boolean());
+ auto lt_20 =
+ TreeExprBuilder::MakeFunction("less_than", {node_a, literal_20}, boolean());
+ auto sum_ab = TreeExprBuilder::MakeFunction("add", {node_a, node_b}, int32());
+ auto sum_bc = TreeExprBuilder::MakeFunction("add", {node_b, node_c}, int32());
+ auto sum_ac = TreeExprBuilder::MakeFunction("add", {node_a, node_c}, int32());
+
+ auto if_lt_20 = TreeExprBuilder::MakeIf(lt_20, sum_ab, sum_bc, int32());
+ auto if_gt_10 = TreeExprBuilder::MakeIf(gt_10, if_lt_20, sum_ac, int32());
+
+ auto expr = TreeExprBuilder::MakeExpression(if_gt_10, field_result);
+
+ // Build a projector; a failed build must stop the test before projector is used.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 6;
+ auto array_a =
+ MakeArrowArrayInt32({21, 15, 5, 22, 15, 5}, {true, true, true, true, true, true});
+ auto array_b = MakeArrowArrayInt32({20, 18, 19, 20, 18, 19},
+ {true, true, true, false, false, false});
+ auto array_c = MakeArrowArrayInt32({35, 45, 55, 35, 45, 55},
+ {true, true, true, false, false, false});
+
+ // expected output: rows 3-5 have null b and c, so every branch sum is null
+ auto exp =
+ MakeArrowArrayInt32({55, 33, 60, 0, 0, 0}, {true, true, true, false, false, false});
+
+ // prepare input record batch
+ auto in_batch =
+ arrow::RecordBatch::Make(schema, num_records, {array_a, array_b, array_c});
+
+ // Evaluate expression; fatal on error so outputs.at(0) is never hit when empty
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+TEST_F(TestIfExpr, TestNestedInCondition) {
+ // Uses a nullable boolean if-expression as the condition of an outer if chain.
+ auto fielda = field("a", int32());
+ auto fieldb = field("b", int32());
+ auto schema = arrow::schema({fielda, fieldb});
+
+ // output fields
+ auto field_result = field("res", int32());
+
+ // build expression.
+ // if (if (a > b) then true else if (a < b) false else null)
+ // 1
+ // else if !(if (a > b) then true else if (a < b) false else null)
+ // 2
+ // else
+ // 3
+ auto node_a = TreeExprBuilder::MakeField(fielda);
+ auto node_b = TreeExprBuilder::MakeField(fieldb);
+ auto literal_1 = TreeExprBuilder::MakeLiteral(1);
+ auto literal_2 = TreeExprBuilder::MakeLiteral(2);
+ auto literal_3 = TreeExprBuilder::MakeLiteral(3);
+ auto literal_true = TreeExprBuilder::MakeLiteral(true);
+ auto literal_false = TreeExprBuilder::MakeLiteral(false);
+ auto literal_null = TreeExprBuilder::MakeNull(boolean());
+
+ auto a_gt_b =
+ TreeExprBuilder::MakeFunction("greater_than", {node_a, node_b}, boolean());
+ auto a_lt_b = TreeExprBuilder::MakeFunction("less_than", {node_a, node_b}, boolean());
+ auto cond_else =
+ TreeExprBuilder::MakeIf(a_lt_b, literal_false, literal_null, boolean());
+ auto cond_if = TreeExprBuilder::MakeIf(a_gt_b, literal_true, cond_else, boolean());
+ auto not_cond_if = TreeExprBuilder::MakeFunction("not", {cond_if}, boolean());
+
+ auto outer_else = TreeExprBuilder::MakeIf(not_cond_if, literal_2, literal_3, int32());
+ auto outer_if = TreeExprBuilder::MakeIf(cond_if, literal_1, outer_else, int32());
+ auto expr = TreeExprBuilder::MakeExpression(outer_if, field_result);
+
+ // Build a projector; a failed build must stop the test before projector is used.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 6;
+ auto array_a =
+ MakeArrowArrayInt32({21, 15, 5, 22, 15, 5}, {true, true, true, true, true, true});
+ auto array_b = MakeArrowArrayInt32({20, 18, 19, 20, 18, 19},
+ {true, true, true, false, false, false});
+ // expected output: rows 3-5 have a null b and are expected to reach the final else
+ auto exp =
+ MakeArrowArrayInt32({1, 2, 2, 3, 3, 3}, {true, true, true, true, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b});
+
+ // Evaluate expression; fatal on error so outputs.at(0) is never hit when empty
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+TEST_F(TestIfExpr, TestBigNested) {
+ // Projects a 19-level else-if ladder that buckets `a` by thresholds 10..190.
+ auto fielda = field("a", int32());
+ auto schema = arrow::schema({fielda});
+
+ // output fields
+ auto field_result = field("res", int32());
+
+ // build expression.
+ // if (a < 10)
+ // 10
+ // else if (a < 20)
+ // 20
+ // ..
+ // ..
+ // else if (a < 190)
+ // 190
+ // else
+ // 200
+ auto node_a = TreeExprBuilder::MakeField(fielda);
+ auto top_node = TreeExprBuilder::MakeLiteral(200);
+ for (int thresh = 190; thresh > 0; thresh -= 10) {  // innermost else first
+ auto literal = TreeExprBuilder::MakeLiteral(thresh);
+ auto condition =
+ TreeExprBuilder::MakeFunction("less_than", {node_a, literal}, boolean());
+ auto if_node = TreeExprBuilder::MakeIf(condition, literal, top_node, int32());
+ top_node = if_node;
+ }
+ auto expr = TreeExprBuilder::MakeExpression(top_node, field_result);
+
+ // Build a projector; a failed build must stop the test before projector is used.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+ auto array0 = MakeArrowArrayInt32({10, 102, 158, 302}, {true, true, true, true});
+
+ // expected output: next threshold above each value, 200 once past the ladder
+ auto exp = MakeArrowArrayInt32({20, 110, 160, 200}, {true, true, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ // Evaluate expression; fatal on error so outputs.at(0) is never hit when empty
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/in_expr_test.cc b/src/arrow/cpp/src/gandiva/tests/in_expr_test.cc
new file mode 100644
index 000000000..fc1a8a71b
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/in_expr_test.cc
@@ -0,0 +1,278 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include <cmath>
+
+#include "arrow/memory_pool.h"
+#include "gandiva/filter.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::float32;
+using arrow::float64;
+using arrow::int32;
+
+class TestIn : public ::testing::Test {  // fixture shared by the IN-expression tests
+ public:
+ void SetUp() override { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+ arrow::MemoryPool* pool_;  // assigned in SetUp(); used to build selection vectors
+};
+std::vector<Decimal128> MakeDecimalVector(std::vector<std::string> values) {
+ // Parses each string in `values` into a Decimal128, keeping the input order.
+ std::vector<arrow::Decimal128> ret;
+ ret.reserve(values.size());
+ for (const auto& str : values) {  // by reference: no per-element string copy
+ Decimal128 decimal_value;
+ int32_t decimal_precision;  // required out-param of FromString; not used here
+ int32_t decimal_scale;  // required out-param of FromString; not used here
+ DCHECK_OK(
+ Decimal128::FromString(str, &decimal_value, &decimal_precision, &decimal_scale));
+ ret.push_back(decimal_value);
+ }
+ return ret;
+}
+
+TEST_F(TestIn, TestInSimple) {
+ // Filters rows where the int32 sum f0 + f1 is a member of {6, 11}.
+ auto field0 = field("f0", int32());
+ auto field1 = field("f1", int32());
+ auto schema = arrow::schema({field0, field1});
+
+ // Build In f0 + f1 in (6, 11)
+ auto node_f0 = TreeExprBuilder::MakeField(field0);
+ auto node_f1 = TreeExprBuilder::MakeField(field1);
+ auto sum_func =
+ TreeExprBuilder::MakeFunction("add", {node_f0, node_f1}, arrow::int32());
+ std::unordered_set<int32_t> in_constants({6, 11});
+ auto in_expr = TreeExprBuilder::MakeInExpressionInt32(sum_func, in_constants);
+ auto condition = TreeExprBuilder::MakeCondition(in_expr);
+
+ std::shared_ptr<Filter> filter;
+ auto status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+ ASSERT_TRUE(status.ok()) << status.message();  // filter is dereferenced below
+
+ // Create a row-batch with some sample data
+ int num_records = 5;
+ auto array0 = MakeArrowArrayInt32({1, 2, 3, 4, 6}, {true, true, true, false, true});
+ auto array1 = MakeArrowArrayInt32({5, 9, 6, 17, 5}, {true, true, false, true, false});
+ // expected output (indices for which condition matches)
+ auto exp = MakeArrowArrayUint16({0, 1});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ std::shared_ptr<SelectionVector> selection_vector;
+ status = SelectionVector::MakeInt16(num_records, pool_, &selection_vector);
+ ASSERT_TRUE(status.ok()) << status.message();  // selection_vector is used below
+
+ // Evaluate expression
+ status = filter->Evaluate(*in_batch, selection_vector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
+}
+
+TEST_F(TestIn, TestInFloat) {
+ // Filters rows where the float32 column f0 is a member of {6.5, 12.0, 11.5}.
+ auto field0 = field("f0", float32());
+ auto schema = arrow::schema({field0});
+
+ // Build f0 in (6.5, 12.0, 11.5)
+ auto node_f0 = TreeExprBuilder::MakeField(field0);
+
+ std::unordered_set<float> in_constants({6.5f, 12.0f, 11.5f});
+ auto in_expr = TreeExprBuilder::MakeInExpressionFloat(node_f0, in_constants);
+ auto condition = TreeExprBuilder::MakeCondition(in_expr);
+
+ std::shared_ptr<Filter> filter;
+ auto status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+ ASSERT_TRUE(status.ok()) << status.message();  // filter is dereferenced below
+
+ // Create a row-batch with some sample data
+ int num_records = 5;
+ auto array0 =
+ MakeArrowArrayFloat32({6.5f, 11.5f, 4, 3.15f, 6}, {true, true, false, true, true});
+ // expected output (indices for which condition matches)
+ auto exp = MakeArrowArrayUint16({0, 1});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ std::shared_ptr<SelectionVector> selection_vector;
+ status = SelectionVector::MakeInt16(num_records, pool_, &selection_vector);
+ ASSERT_TRUE(status.ok()) << status.message();  // selection_vector is used below
+
+ // Evaluate expression
+ status = filter->Evaluate(*in_batch, selection_vector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
+}
+
+TEST_F(TestIn, TestInDouble) {
+ // Filters rows where the float64 sum double0 + double1 is in the constant set.
+ auto field0 = field("double0", float64());
+ auto field1 = field("double1", float64());
+ auto schema = arrow::schema({field0, field1});
+
+ auto node_f0 = TreeExprBuilder::MakeField(field0);
+ auto node_f1 = TreeExprBuilder::MakeField(field1);
+ auto sum_func =
+ TreeExprBuilder::MakeFunction("add", {node_f0, node_f1}, arrow::float64());
+ std::unordered_set<double> in_constants({3.14159265359, 15.5555555});
+ auto in_expr = TreeExprBuilder::MakeInExpressionDouble(sum_func, in_constants);
+ auto condition = TreeExprBuilder::MakeCondition(in_expr);
+
+ std::shared_ptr<Filter> filter;
+ auto status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+ ASSERT_TRUE(status.ok()) << status.message();  // filter is dereferenced below
+
+ // Create a row-batch with some sample data
+ int num_records = 5;
+ auto array0 = MakeArrowArrayFloat64({1, 2, 3, 4, 11}, {true, true, true, false, false});
+ auto array1 = MakeArrowArrayFloat64({5, 9, 0.14159265359, 17, 4.5555555},
+ {true, true, true, true, true});
+
+ // expected output (indices for which condition matches)
+ auto exp = MakeArrowArrayUint16({2});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ std::shared_ptr<SelectionVector> selection_vector;
+ status = SelectionVector::MakeInt16(num_records, pool_, &selection_vector);
+ ASSERT_TRUE(status.ok()) << status.message();  // selection_vector is used below
+
+ // Evaluate expression
+ status = filter->Evaluate(*in_batch, selection_vector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
+}
+
+TEST_F(TestIn, TestInDecimal) {
+ int32_t precision = 38;
+ int32_t scale = 5;
+ auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+
+ // Filters rows where the decimal(38, 5) column f0 is in the constant set.
+ auto field0 = field("f0", arrow::decimal(precision, scale));
+ auto schema = arrow::schema({field0});
+
+ // Build f0 in (6, 12, 11), constants built with the column's precision/scale
+ auto node_f0 = TreeExprBuilder::MakeField(field0);
+
+ gandiva::DecimalScalar128 d0("6", precision, scale);
+ gandiva::DecimalScalar128 d1("12", precision, scale);
+ gandiva::DecimalScalar128 d2("11", precision, scale);
+ std::unordered_set<gandiva::DecimalScalar128> in_constants({d0, d1, d2});
+ auto in_expr = TreeExprBuilder::MakeInExpressionDecimal(node_f0, in_constants);
+ auto condition = TreeExprBuilder::MakeCondition(in_expr);
+
+ std::shared_ptr<Filter> filter;
+ auto status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+ ASSERT_TRUE(status.ok()) << status.message();  // filter is dereferenced below
+
+ // Create a row-batch with some sample data
+ int num_records = 5;
+ auto values0 = MakeDecimalVector({"1", "2", "0", "-6", "6"});
+ auto array0 =
+ MakeArrowArrayDecimal(decimal_type, values0, {true, true, true, false, true});
+ // expected output (indices for which condition matches)
+ auto exp = MakeArrowArrayUint16({4});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ std::shared_ptr<SelectionVector> selection_vector;
+ status = SelectionVector::MakeInt16(num_records, pool_, &selection_vector);
+ ASSERT_TRUE(status.ok()) << status.message();  // selection_vector is used below
+
+ // Evaluate expression
+ status = filter->Evaluate(*in_batch, selection_vector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
+}
+
+TEST_F(TestIn, TestInString) {
+ // Filters rows where the utf8 column f0 is a member of {"test", "me"}.
+ auto field0 = field("f0", arrow::utf8());
+ auto schema = arrow::schema({field0});
+
+ // Build f0 in ("test" ,"me")
+ auto node_f0 = TreeExprBuilder::MakeField(field0);
+ std::unordered_set<std::string> in_constants({"test", "me"});
+ auto in_expr = TreeExprBuilder::MakeInExpressionString(node_f0, in_constants);
+
+ auto condition = TreeExprBuilder::MakeCondition(in_expr);
+
+ std::shared_ptr<Filter> filter;
+ auto status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+ ASSERT_TRUE(status.ok()) << status.message();  // filter is dereferenced below
+
+ // Create a row-batch with some sample data
+ int num_records = 5;
+ auto array_a = MakeArrowArrayUtf8({"test", "lol", "me", "arrow", "test"},
+ {true, true, true, true, false});
+ // expected output: the last "test" is null, so only rows 0 and 2 are expected
+ auto exp = MakeArrowArrayUint16({0, 2});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+ std::shared_ptr<SelectionVector> selection_vector;
+ status = SelectionVector::MakeInt16(num_records, pool_, &selection_vector);
+ ASSERT_TRUE(status.ok()) << status.message();  // selection_vector is used below
+
+ // Evaluate expression
+ status = filter->Evaluate(*in_batch, selection_vector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
+}
+
+TEST_F(TestIn, TestInStringValidationError) {
+ // A string IN-list applied to an int32 column must be rejected by Filter::Make.
+ auto int_field = field("f0", arrow::int32());
+ auto schema = arrow::schema({int_field});
+
+ // Build f0 in ("test" ,"me") even though f0 is declared as int32
+ std::unordered_set<std::string> string_constants({"test", "me"});
+ auto in_expr = TreeExprBuilder::MakeInExpressionString(
+ TreeExprBuilder::MakeField(int_field), string_constants);
+ auto condition = TreeExprBuilder::MakeCondition(in_expr);
+
+ std::shared_ptr<Filter> filter;
+ auto make_status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+
+ EXPECT_TRUE(make_status.IsExpressionValidationError());
+ const std::string expected_fragment = "Evaluation expression for IN clause returns ";
+ EXPECT_NE(make_status.message().find(expected_fragment), std::string::npos);
+}
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/literal_test.cc b/src/arrow/cpp/src/gandiva/tests/literal_test.cc
new file mode 100644
index 000000000..b5ffff031
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/literal_test.cc
@@ -0,0 +1,232 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::float32;
+using arrow::float64;
+using arrow::int32;
+using arrow::int64;
+
+class TestLiteral : public ::testing::Test {  // fixture shared by the literal tests
+ public:
+ void SetUp() override { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+ arrow::MemoryPool* pool_;  // assigned in SetUp(); handed to Projector::Evaluate
+};
+
+TEST_F(TestLiteral, TestSimpleArithmetic) {
+ // Combines one literal of each primitive type with a matching input column.
+ auto field_a = field("a", boolean());
+ auto field_b = field("b", int32());
+ auto field_c = field("c", int64());
+ auto field_d = field("d", float32());
+ auto field_e = field("e", float64());
+ auto schema = arrow::schema({field_a, field_b, field_c, field_d, field_e});
+
+ // output fields
+ auto res_a = field("a+1", boolean());
+ auto res_b = field("b+1", int32());
+ auto res_c = field("c+1", int64());
+ auto res_d = field("d+1", float32());
+ auto res_e = field("e+1", float64());
+
+ // build expressions.
+ // a == true
+ // b + 1
+ // c + 1
+ // d + 1
+ // e + 1
+ auto node_a = TreeExprBuilder::MakeField(field_a);
+ auto literal_a = TreeExprBuilder::MakeLiteral(true);
+ auto func_a = TreeExprBuilder::MakeFunction("equal", {node_a, literal_a}, boolean());
+ auto expr_a = TreeExprBuilder::MakeExpression(func_a, res_a);
+
+ auto node_b = TreeExprBuilder::MakeField(field_b);
+ auto literal_b = TreeExprBuilder::MakeLiteral((int32_t)1);
+ auto func_b = TreeExprBuilder::MakeFunction("add", {node_b, literal_b}, int32());
+ auto expr_b = TreeExprBuilder::MakeExpression(func_b, res_b);
+
+ auto node_c = TreeExprBuilder::MakeField(field_c);
+ auto literal_c = TreeExprBuilder::MakeLiteral((int64_t)1);
+ auto func_c = TreeExprBuilder::MakeFunction("add", {node_c, literal_c}, int64());
+ auto expr_c = TreeExprBuilder::MakeExpression(func_c, res_c);
+
+ auto node_d = TreeExprBuilder::MakeField(field_d);
+ auto literal_d = TreeExprBuilder::MakeLiteral(static_cast<float>(1));
+ auto func_d = TreeExprBuilder::MakeFunction("add", {node_d, literal_d}, float32());
+ auto expr_d = TreeExprBuilder::MakeExpression(func_d, res_d);
+
+ auto node_e = TreeExprBuilder::MakeField(field_e);
+ auto literal_e = TreeExprBuilder::MakeLiteral(static_cast<double>(1));
+ auto func_e = TreeExprBuilder::MakeFunction("add", {node_e, literal_e}, float64());
+ auto expr_e = TreeExprBuilder::MakeExpression(func_e, res_e);
+
+ // Build a projector; a failed build must stop the test before projector is used.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr_a, expr_b, expr_c, expr_d, expr_e},
+ TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+ auto array_a = MakeArrowArrayBool({true, true, false, true}, {true, true, true, false});
+ auto array_b = MakeArrowArrayInt32({5, 15, -15, 17}, {true, true, true, false});
+ auto array_c = MakeArrowArrayInt64({5, 15, -15, 17}, {true, true, true, false});
+ auto array_d = MakeArrowArrayFloat32({5.2f, 15, -15.6f, 17}, {true, true, true, false});
+ auto array_e = MakeArrowArrayFloat64({5.6f, 15, -15.9f, 17}, {true, true, true, false});
+
+ // expected output: the last row is null in every input, so null in every result
+ auto exp_a = MakeArrowArrayBool({true, true, false, false}, {true, true, true, false});
+ auto exp_b = MakeArrowArrayInt32({6, 16, -14, 0}, {true, true, true, false});
+ auto exp_c = MakeArrowArrayInt64({6, 16, -14, 0}, {true, true, true, false});
+ auto exp_d = MakeArrowArrayFloat32({6.2f, 16, -14.6f, 0}, {true, true, true, false});
+ auto exp_e = MakeArrowArrayFloat64({6.6f, 16, -14.9f, 0}, {true, true, true, false});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records,
+ {array_a, array_b, array_c, array_d, array_e});
+
+ // Evaluate expression; fatal on error so outputs.at(i) is never hit when empty
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_a, outputs.at(0));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_b, outputs.at(1));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_c, outputs.at(2));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_d, outputs.at(3));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_e, outputs.at(4));
+}
+
+TEST_F(TestLiteral, TestLiteralHash) {
+ // Literal 2 as int32 and as int64 must yield two distinct projector objects.
+ auto schema = arrow::schema({});
+
+ // First projector: the literal typed as int32.
+ auto narrow_expr = TreeExprBuilder::MakeExpression(
+ TreeExprBuilder::MakeLiteral((int32_t)2), field("a", int32()));
+ std::shared_ptr<Projector> narrow_projector;
+ auto status =
+ Projector::Make(schema, {narrow_expr}, TestConfiguration(), &narrow_projector);
+ EXPECT_TRUE(status.ok()) << status.message();
+
+ // Second projector: same value and output name, but typed as int64.
+ auto wide_expr = TreeExprBuilder::MakeExpression(
+ TreeExprBuilder::MakeLiteral((int64_t)2), field("a", int64()));
+ std::shared_ptr<Projector> wide_projector;
+ status = Projector::Make(schema, {wide_expr}, TestConfiguration(), &wide_projector);
+ EXPECT_TRUE(status.ok()) << status.message();
+
+ // NOTE(review): presumably guards against a projector-cache collision - confirm.
+ EXPECT_NE(narrow_projector.get(), wide_projector.get());
+}
+
+TEST_F(TestLiteral, TestNullLiteral) {
+ // Adding a null int32 literal to a + b is expected to make every row null.
+ auto field_a = field("a", int32());
+ auto field_b = field("b", int32());
+ auto schema = arrow::schema({field_a, field_b});
+
+ // output fields
+ auto res = field("a+b+null", int32());
+
+ auto node_a = TreeExprBuilder::MakeField(field_a);
+ auto node_b = TreeExprBuilder::MakeField(field_b);
+ auto literal_c = TreeExprBuilder::MakeNull(arrow::int32());
+ auto add_a_b = TreeExprBuilder::MakeFunction("add", {node_a, node_b}, int32());
+ auto add_a_b_c = TreeExprBuilder::MakeFunction("add", {add_a_b, literal_c}, int32());
+ auto expr = TreeExprBuilder::MakeExpression(add_a_b_c, res);
+
+ // Build a projector; a failed build must stop the test before projector is used.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+ auto array_a = MakeArrowArrayInt32({5, 15, -15, 17}, {true, true, true, false});
+ auto array_b = MakeArrowArrayInt32({5, 15, -15, 17}, {true, true, true, false});
+
+ // expected output
+ auto exp = MakeArrowArrayInt32({0, 0, 0, 0}, {false, false, false, false});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b});
+
+ // Evaluate expression; fatal on error so outputs.at(0) is never hit when empty
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+TEST_F(TestLiteral, TestNullLiteralInIf) {
+ // Projects `if (a > 5.0) 5.0 else null` over one float64 input column.
+ auto field_a = field("a", float64());
+ auto schema = arrow::schema({field_a});
+
+ // output fields
+ auto res = field("res", float64());
+
+ auto node_a = TreeExprBuilder::MakeField(field_a);
+ auto literal_5 = TreeExprBuilder::MakeLiteral(5.0);
+ auto a_gt_5 = TreeExprBuilder::MakeFunction("greater_than", {node_a, literal_5},
+ arrow::boolean());
+ auto literal_null = TreeExprBuilder::MakeNull(arrow::float64());
+ auto if_node =
+ TreeExprBuilder::MakeIf(a_gt_5, literal_5, literal_null, arrow::float64());
+ auto expr = TreeExprBuilder::MakeExpression(if_node, res);
+
+ // Build a projector; a failed build must stop the test before projector is used.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+ auto array_a = MakeArrowArrayFloat64({6, 15, -15, 17}, {true, true, true, false});
+
+ // expected output: rows 2 and 3 fall to the null else branch
+ auto exp = MakeArrowArrayFloat64({5, 5, 0, 0}, {true, true, false, false});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+ // Evaluate expression; fatal on error so outputs.at(0) is never hit when empty
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/micro_benchmarks.cc b/src/arrow/cpp/src/gandiva/tests/micro_benchmarks.cc
new file mode 100644
index 000000000..35c77e3dd
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/micro_benchmarks.cc
@@ -0,0 +1,456 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <stdlib.h>
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "benchmark/benchmark.h"
+#include "gandiva/decimal_type_util.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tests/timed_evaluate.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::int32;
+using arrow::int64;
+using arrow::utf8;
+
+// Benchmarks a projector that computes add(f0, add(f1, f2)) over three int64
+// input columns.
+static void TimedTestAdd3(benchmark::State& state) {
+  // Input columns.
+  auto f0 = field("f0", int64());
+  auto f1 = field("f1", int64());
+  auto f2 = field("f2", int64());
+  auto schema = arrow::schema({f0, f1, f2});
+  auto pool = arrow::default_memory_pool();
+
+  // Output column.
+  auto out_field = field("add", int64());
+
+  // add(f0, add(f1, f2))
+  auto node_f1 = TreeExprBuilder::MakeField(f1);
+  auto node_f2 = TreeExprBuilder::MakeField(f2);
+  auto inner_add = TreeExprBuilder::MakeFunction("add", {node_f1, node_f2}, int64());
+  auto node_f0 = TreeExprBuilder::MakeField(f0);
+  auto outer_add = TreeExprBuilder::MakeFunction("add", {node_f0, inner_add}, int64());
+  auto add_expr = TreeExprBuilder::MakeExpression(outer_add, out_field);
+
+  std::shared_ptr<Projector> projector;
+  ASSERT_OK(Projector::Make(schema, {add_expr}, TestConfiguration(), &projector));
+
+  Int64DataGenerator generator;
+  ProjectEvaluator evaluator(projector);
+
+  // The result is captured in a variable first: the comma in the template
+  // argument list would otherwise split the ASSERT_OK macro argument.
+  Status status = TimedEvaluate<arrow::Int64Type, int64_t>(
+      schema, evaluator, generator, pool, 1 * MILLION, 16 * THOUSAND, state);
+  ASSERT_OK(status);
+}
+
+// Benchmarks a projector that evaluates a deeply nested if/else chain over a
+// single int32 column.
+static void TimedTestBigNested(benchmark::State& state) {
+  // schema for input fields
+  auto fielda = field("a", int32());
+  auto schema = arrow::schema({fielda});
+  auto pool_ = arrow::default_memory_pool();
+
+  // output fields
+  auto field_result = field("res", int32());
+
+  // build expression.
+  //   if (a < 10)
+  //     10
+  //   else if (a < 20)
+  //     20
+  //   ..
+  //   ..
+  //   else if (a < 190)
+  //     190
+  //   else
+  //     200
+  // The chain is assembled inside-out: start from the final else value and wrap it
+  // in one more if-node per threshold, from the largest threshold to the smallest.
+  auto node_a = TreeExprBuilder::MakeField(fielda);
+  auto top_node = TreeExprBuilder::MakeLiteral(200);
+  for (int thresh = 190; thresh > 0; thresh -= 10) {
+    auto literal = TreeExprBuilder::MakeLiteral(thresh);
+    auto condition =
+        TreeExprBuilder::MakeFunction("less_than", {node_a, literal}, boolean());
+    auto if_node = TreeExprBuilder::MakeIf(condition, literal, top_node, int32());
+    top_node = if_node;
+  }
+  auto expr = TreeExprBuilder::MakeExpression(top_node, field_result);
+
+  // Build a projector for the expressions.
+  std::shared_ptr<Projector> projector;
+  ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector));
+
+  BoundedInt32DataGenerator data_generator(250);
+  ProjectEvaluator evaluator(projector);
+
+  Status status = TimedEvaluate<arrow::Int32Type, int32_t>(
+      schema, evaluator, data_generator, pool_, 1 * MILLION, 16 * THOUSAND, state);
+  // ASSERT_OK (as in TimedTestAdd3) reports the Status message on failure.
+  ASSERT_OK(status);
+}
+
+// Benchmarks a projector that applies extractYear to a date64 column and writes
+// an int64 result.
+static void TimedTestExtractYear(benchmark::State& state) {
+  // schema for input fields
+  auto field0 = field("f0", arrow::date64());
+  auto schema = arrow::schema({field0});
+  auto pool_ = arrow::default_memory_pool();
+
+  // output field
+  auto field_res = field("res", int64());
+
+  // Build expression
+  auto expr = TreeExprBuilder::MakeExpression("extractYear", {field0}, field_res);
+
+  std::shared_ptr<Projector> projector;
+  ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector));
+
+  Int64DataGenerator data_generator;
+  ProjectEvaluator evaluator(projector);
+
+  Status status = TimedEvaluate<arrow::Date64Type, int64_t>(
+      schema, evaluator, data_generator, pool_, 1 * MILLION, 16 * THOUSAND, state);
+  // ASSERT_OK (as in TimedTestAdd3) reports the Status message on failure.
+  ASSERT_OK(status);
+}
+
+// Benchmarks a filter with the condition add(f1, f0) < f2 over three int64
+// columns.
+static void TimedTestFilterAdd2(benchmark::State& state) {
+  // schema for input fields
+  auto field0 = field("f0", int64());
+  auto field1 = field("f1", int64());
+  auto field2 = field("f2", int64());
+  auto schema = arrow::schema({field0, field1, field2});
+  auto pool_ = arrow::default_memory_pool();
+
+  // Build expression
+  auto sum = TreeExprBuilder::MakeFunction(
+      "add", {TreeExprBuilder::MakeField(field1), TreeExprBuilder::MakeField(field0)},
+      int64());
+  auto less_than = TreeExprBuilder::MakeFunction(
+      "less_than", {sum, TreeExprBuilder::MakeField(field2)}, boolean());
+  auto condition = TreeExprBuilder::MakeCondition(less_than);
+
+  std::shared_ptr<Filter> filter;
+  ASSERT_OK(Filter::Make(schema, condition, TestConfiguration(), &filter));
+
+  Int64DataGenerator data_generator;
+  FilterEvaluator evaluator(filter);
+
+  Status status = TimedEvaluate<arrow::Int64Type, int64_t>(
+      schema, evaluator, data_generator, pool_, 1 * MILLION, 16 * THOUSAND, state);
+  // ASSERT_OK (as in TimedTestAdd3) reports the Status message on failure.
+  ASSERT_OK(status);
+}
+
+// Benchmarks a filter with the condition like(a, "%yellow%") over a utf8 column.
+static void TimedTestFilterLike(benchmark::State& state) {
+  // schema for input fields
+  auto fielda = field("a", utf8());
+  auto schema = arrow::schema({fielda});
+  auto pool_ = arrow::default_memory_pool();
+
+  // build expression.
+  auto node_a = TreeExprBuilder::MakeField(fielda);
+  auto pattern_node = TreeExprBuilder::MakeStringLiteral("%yellow%");
+  auto like_yellow =
+      TreeExprBuilder::MakeFunction("like", {node_a, pattern_node}, arrow::boolean());
+  auto condition = TreeExprBuilder::MakeCondition(like_yellow);
+
+  std::shared_ptr<Filter> filter;
+  ASSERT_OK(Filter::Make(schema, condition, TestConfiguration(), &filter));
+
+  FastUtf8DataGenerator data_generator(32);
+  FilterEvaluator evaluator(filter);
+
+  Status status = TimedEvaluate<arrow::StringType, std::string>(
+      schema, evaluator, data_generator, pool_, 1 * MILLION, 16 * THOUSAND, state);
+  // ASSERT_OK (as in TimedTestAdd3) reports the Status message on failure.
+  ASSERT_OK(status);
+}
+
+// Benchmarks a projector that applies castFLOAT8 to a utf8 column and writes a
+// float64 result.
+static void TimedTestCastFloatFromString(benchmark::State& state) {
+  // schema for input fields
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+  auto pool = arrow::default_memory_pool();
+
+  // output field
+  auto field_result = field("res", arrow::float64());
+
+  // res = castFLOAT8(a)
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto fn = TreeExprBuilder::MakeFunction("castFLOAT8", {node_a}, arrow::float64());
+  auto expr = TreeExprBuilder::MakeExpression(fn, field_result);
+
+  std::shared_ptr<Projector> projector;
+  ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector));
+
+  Utf8FloatDataGenerator data_generator;
+  ProjectEvaluator evaluator(projector);
+
+  Status status = TimedEvaluate<arrow::StringType, std::string>(
+      schema, evaluator, data_generator, pool, 1 * MILLION, 16 * THOUSAND, state);
+  // ASSERT_OK (as in TimedTestAdd3) reports the Status message on failure.
+  ASSERT_OK(status);
+}
+
+// Benchmarks a projector that applies castINT to a utf8 column and writes an
+// int32 result.
+static void TimedTestCastIntFromString(benchmark::State& state) {
+  // schema for input fields
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+  auto pool = arrow::default_memory_pool();
+
+  // output field
+  auto field_result = field("res", int32());
+
+  // res = castINT(a)
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto fn = TreeExprBuilder::MakeFunction("castINT", {node_a}, int32());
+  auto expr = TreeExprBuilder::MakeExpression(fn, field_result);
+
+  std::shared_ptr<Projector> projector;
+  ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector));
+
+  Utf8IntDataGenerator data_generator;
+  ProjectEvaluator evaluator(projector);
+
+  Status status = TimedEvaluate<arrow::StringType, std::string>(
+      schema, evaluator, data_generator, pool, 1 * MILLION, 16 * THOUSAND, state);
+  // ASSERT_OK (as in TimedTestAdd3) reports the Status message on failure.
+  ASSERT_OK(status);
+}
+
+// Benchmarks a projector that computes octet_length(upper(a)) over a utf8
+// column; the intermediate upper() result is a string value.
+static void TimedTestAllocs(benchmark::State& state) {
+  // schema for input fields
+  auto field_a = field("a", arrow::utf8());
+  auto schema = arrow::schema({field_a});
+  auto pool_ = arrow::default_memory_pool();
+
+  // output field
+  auto field_res = field("res", int32());
+
+  // Build expression
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto upper = TreeExprBuilder::MakeFunction("upper", {node_a}, utf8());
+  auto length = TreeExprBuilder::MakeFunction("octet_length", {upper}, int32());
+  auto expr = TreeExprBuilder::MakeExpression(length, field_res);
+
+  std::shared_ptr<Projector> projector;
+  ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector));
+
+  FastUtf8DataGenerator data_generator(64);
+  ProjectEvaluator evaluator(projector);
+
+  Status status = TimedEvaluate<arrow::StringType, std::string>(
+      schema, evaluator, data_generator, pool_, 1 * MILLION, 16 * THOUSAND, state);
+  // ASSERT_OK (as in TimedTestAdd3) reports the Status message on failure.
+  ASSERT_OK(status);
+}
+// The following two benchmarks compare a chain of OR-ed equality checks with the
+// equivalent IN expression. Follow-up PRs will use them to optimize IN expressions.
+
+// Benchmarks a projector that ORs together 32 `a = <generated string>` equality
+// checks over a utf8 column.
+static void TimedTestMultiOr(benchmark::State& state) {
+  // Input column.
+  auto fielda = field("a", utf8());
+  auto schema = arrow::schema({fielda});
+  auto pool = arrow::default_memory_pool();
+
+  // Output column.
+  auto field_result = field("res", boolean());
+
+  // Expression: or(a = string1, a = string2, ..)
+  auto node_a = TreeExprBuilder::MakeField(fielda);
+
+  NodeVector equality_checks;
+  FastUtf8DataGenerator value_generator(250);
+  for (int count = 0; count < 32; ++count) {
+    auto candidate = TreeExprBuilder::MakeStringLiteral(value_generator.GenerateData());
+    equality_checks.push_back(
+        TreeExprBuilder::MakeFunction("equal", {node_a, candidate}, boolean()));
+  }
+
+  auto or_node = TreeExprBuilder::MakeOr(equality_checks);
+  auto or_expr = TreeExprBuilder::MakeExpression(or_node, field_result);
+
+  // Build a projector for the expression.
+  std::shared_ptr<Projector> projector;
+  ASSERT_OK(Projector::Make(schema, {or_expr}, TestConfiguration(), &projector));
+
+  // A separate generator instance supplies the benchmark input data.
+  FastUtf8DataGenerator input_generator(250);
+  ProjectEvaluator evaluator(projector);
+  Status status = TimedEvaluate<arrow::StringType, std::string>(
+      schema, evaluator, input_generator, pool, 100 * THOUSAND, 16 * THOUSAND, state);
+  ASSERT_OK(status);
+}
+
+// Benchmarks a projector that evaluates `a IN (string1, string2, ..)` over a
+// utf8 column, with the candidate set filled from 32 generated strings.
+static void TimedTestInExpr(benchmark::State& state) {
+  // Input column.
+  auto fielda = field("a", utf8());
+  auto schema = arrow::schema({fielda});
+  auto pool = arrow::default_memory_pool();
+
+  // Output column.
+  auto field_result = field("res", boolean());
+
+  // Expression: a in (string1, string2, ..)
+  auto node_a = TreeExprBuilder::MakeField(fielda);
+
+  std::unordered_set<std::string> in_values;
+  FastUtf8DataGenerator value_generator(250);
+  for (int count = 0; count < 32; ++count) {
+    in_values.insert(value_generator.GenerateData());
+  }
+  auto in_node = TreeExprBuilder::MakeInExpressionString(node_a, in_values);
+  auto in_expr = TreeExprBuilder::MakeExpression(in_node, field_result);
+
+  // Build a projector for the expression.
+  std::shared_ptr<Projector> projector;
+  ASSERT_OK(Projector::Make(schema, {in_expr}, TestConfiguration(), &projector));
+
+  // A separate generator instance supplies the benchmark input data.
+  FastUtf8DataGenerator input_generator(250);
+  ProjectEvaluator evaluator(projector);
+
+  Status status = TimedEvaluate<arrow::StringType, std::string>(
+      schema, evaluator, input_generator, pool, 100 * THOUSAND, 16 * THOUSAND, state);
+  ASSERT_OK(status);
+}
+
+// Benchmarks add(f0, add(f1, f2)) over three Decimal128 columns.
+//
+// \param state      benchmark state forwarded to TimedEvaluate
+// \param precision  precision of the three input decimal columns
+// \param scale      scale of the three input decimal columns
+// \param large      forwarded to the Decimal128DataGenerator constructor
+static void DoDecimalAdd3(benchmark::State& state, int32_t precision, int32_t scale,
+                          bool large = false) {
+  // schema for input fields
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto field0 = field("f0", decimal_type);
+  auto field1 = field("f1", decimal_type);
+  auto field2 = field("f2", decimal_type);
+  auto schema = arrow::schema({field0, field1, field2});
+
+  // Result type of the inner add. Checked so a failure is reported here instead of
+  // being overwritten by the next assignment to `status`.
+  Decimal128TypePtr add2_type;
+  auto status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd,
+                                               {decimal_type, decimal_type}, &add2_type);
+  ASSERT_OK(status);
+
+  // Result type of the outer add, which consumes the inner add's type.
+  Decimal128TypePtr output_type;
+  status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd,
+                                          {add2_type, decimal_type}, &output_type);
+  ASSERT_OK(status);
+
+  // output field
+  auto field_sum = field("add", output_type);
+
+  // Build expression
+  auto part_sum = TreeExprBuilder::MakeFunction(
+      "add", {TreeExprBuilder::MakeField(field1), TreeExprBuilder::MakeField(field2)},
+      add2_type);
+  auto sum = TreeExprBuilder::MakeFunction(
+      "add", {TreeExprBuilder::MakeField(field0), part_sum}, output_type);
+
+  auto sum_expr = TreeExprBuilder::MakeExpression(sum, field_sum);
+
+  // The check is fatal: the evaluator below must not be given a null projector.
+  std::shared_ptr<Projector> projector;
+  status = Projector::Make(schema, {sum_expr}, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  Decimal128DataGenerator data_generator(large);
+  ProjectEvaluator evaluator(projector);
+
+  status = TimedEvaluate<arrow::Decimal128Type, arrow::Decimal128>(
+      schema, evaluator, data_generator, arrow::default_memory_pool(), 1 * MILLION,
+      16 * THOUSAND, state);
+  ASSERT_OK(status);
+}
+
+// Benchmarks add(f0, f1) over two Decimal128 columns.
+//
+// \param state      benchmark state forwarded to TimedEvaluate
+// \param precision  precision of the two input decimal columns
+// \param scale      scale of the two input decimal columns
+// \param large      forwarded to the Decimal128DataGenerator constructor
+static void DoDecimalAdd2(benchmark::State& state, int32_t precision, int32_t scale,
+                          bool large = false) {
+  // schema for input fields
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto field0 = field("f0", decimal_type);
+  auto field1 = field("f1", decimal_type);
+  auto schema = arrow::schema({field0, field1});
+
+  // Result type of the add. Checked so a failure is reported here instead of
+  // being overwritten by the next assignment to `status`.
+  Decimal128TypePtr output_type;
+  auto status = DecimalTypeUtil::GetResultType(
+      DecimalTypeUtil::kOpAdd, {decimal_type, decimal_type}, &output_type);
+  ASSERT_OK(status);
+
+  // output field
+  auto field_sum = field("add", output_type);
+
+  // Build expression
+  auto sum = TreeExprBuilder::MakeExpression("add", {field0, field1}, field_sum);
+
+  // The check is fatal: the evaluator below must not be given a null projector.
+  std::shared_ptr<Projector> projector;
+  status = Projector::Make(schema, {sum}, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  Decimal128DataGenerator data_generator(large);
+  ProjectEvaluator evaluator(projector);
+
+  status = TimedEvaluate<arrow::Decimal128Type, arrow::Decimal128>(
+      schema, evaluator, data_generator, arrow::default_memory_pool(), 1 * MILLION,
+      16 * THOUSAND, state);
+  ASSERT_OK(status);
+}
+
+// Two-operand decimal add at precision kMaxPrecision - 6, scale 18. The reduced
+// precision is meant to exercise the fast path.
+static void DecimalAdd2Fast(benchmark::State& state) {
+  const int32_t precision = DecimalTypeUtil::kMaxPrecision - 6;
+  const int32_t scale = 18;
+  DoDecimalAdd2(state, precision, scale);
+}
+
+// Two-operand decimal add at precision kMaxPrecision, scale 6. The maximum
+// precision is meant to exercise the large-integer path.
+static void DecimalAdd2LeadingZeroes(benchmark::State& state) {
+  const int32_t precision = DecimalTypeUtil::kMaxPrecision;
+  const int32_t scale = 6;
+  DoDecimalAdd2(state, precision, scale);
+}
+
+// Two-operand decimal add at precision kMaxPrecision, scale 18. The maximum
+// precision is meant to exercise the large-integer path.
+static void DecimalAdd2LeadingZeroesWithDiv(benchmark::State& state) {
+  const int32_t precision = DecimalTypeUtil::kMaxPrecision;
+  const int32_t scale = 18;
+  DoDecimalAdd2(state, precision, scale);
+}
+
+// Two-operand decimal add at precision kMaxPrecision, scale 18, with the data
+// generator's `large` flag set. Meant to exercise the large-integer path.
+static void DecimalAdd2Large(benchmark::State& state) {
+  const int32_t precision = DecimalTypeUtil::kMaxPrecision;
+  const int32_t scale = 18;
+  DoDecimalAdd2(state, precision, scale, /*large=*/true);
+}
+
+// Three-operand decimal add at precision kMaxPrecision - 6, scale 18. The
+// reduced precision is meant to exercise the fast path.
+static void DecimalAdd3Fast(benchmark::State& state) {
+  const int32_t precision = DecimalTypeUtil::kMaxPrecision - 6;
+  const int32_t scale = 18;
+  DoDecimalAdd3(state, precision, scale);
+}
+
+// Three-operand decimal add at precision kMaxPrecision, scale 6. The maximum
+// precision is meant to exercise the large-integer path.
+static void DecimalAdd3LeadingZeroes(benchmark::State& state) {
+  const int32_t precision = DecimalTypeUtil::kMaxPrecision;
+  const int32_t scale = 6;
+  DoDecimalAdd3(state, precision, scale);
+}
+
+// Three-operand decimal add at precision kMaxPrecision, scale 18. The maximum
+// precision is meant to exercise the large-integer path.
+static void DecimalAdd3LeadingZeroesWithDiv(benchmark::State& state) {
+  const int32_t precision = DecimalTypeUtil::kMaxPrecision;
+  const int32_t scale = 18;
+  DoDecimalAdd3(state, precision, scale);
+}
+
+// Three-operand decimal add at precision kMaxPrecision, scale 18, with the data
+// generator's `large` flag set. Meant to exercise the large-integer path.
+static void DecimalAdd3Large(benchmark::State& state) {
+  const int32_t precision = DecimalTypeUtil::kMaxPrecision;
+  const int32_t scale = 18;
+  DoDecimalAdd3(state, precision, scale, /*large=*/true);
+}
+
+// Register every benchmark defined above with the benchmark library. All of them
+// share the same settings: MinTime(1.0) and results reported in microseconds.
+BENCHMARK(TimedTestAdd3)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(TimedTestBigNested)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(TimedTestExtractYear)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(TimedTestFilterAdd2)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(TimedTestFilterLike)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(TimedTestCastFloatFromString)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(TimedTestCastIntFromString)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(TimedTestAllocs)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+// OR-chain versus IN-expression comparison pair.
+BENCHMARK(TimedTestMultiOr)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(TimedTestInExpr)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+// Decimal128 addition: two-operand variants, then three-operand variants.
+BENCHMARK(DecimalAdd2Fast)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(DecimalAdd2LeadingZeroes)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(DecimalAdd2LeadingZeroesWithDiv)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(DecimalAdd2Large)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(DecimalAdd3Fast)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(DecimalAdd3LeadingZeroes)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(DecimalAdd3LeadingZeroesWithDiv)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(DecimalAdd3Large)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/null_validity_test.cc b/src/arrow/cpp/src/gandiva/tests/null_validity_test.cc
new file mode 100644
index 000000000..0374b68d4
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/null_validity_test.cc
@@ -0,0 +1,175 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include "arrow/memory_pool.h"
+#include "gandiva/filter.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::int32;
+using arrow::utf8;
+
+// Fixture for tests that feed arrays without a validity buffer to Gandiva.
+class TestNullValidity : public ::testing::Test {
+ public:
+  // Runs before each test body; `override` makes the compiler verify that this
+  // really overrides ::testing::Test::SetUp.
+  void SetUp() override { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+  // Memory pool passed to the evaluation calls in the tests.
+  arrow::MemoryPool* pool_;
+};
+
+// Create an array without a validity buffer.
+// Builds an Int32Array over `in_data` that has no validity buffer: buffers[1] of a
+// regular array is reused and nullptr is passed where the validity buffer would go.
+ArrayPtr MakeArrowArrayInt32WithNullValidity(std::vector<int32_t> in_data) {
+  auto with_validity = MakeArrowArrayInt32(in_data);
+  auto values_buffer = with_validity->data()->buffers[1];
+  return std::make_shared<arrow::Int32Array>(in_data.size(), values_buffer, nullptr, 0);
+}
+
+// Filters on `f0 + f1 < 10` where f0 has no validity buffer and f1 has one null.
+TEST_F(TestNullValidity, TestFunc) {
+  // schema for input fields
+  auto field0 = field("f0", int32());
+  auto field1 = field("f1", int32());
+  auto schema = arrow::schema({field0, field1});
+
+  // Build condition f0 + f1 < 10
+  auto node_f0 = TreeExprBuilder::MakeField(field0);
+  auto node_f1 = TreeExprBuilder::MakeField(field1);
+  auto sum_func =
+      TreeExprBuilder::MakeFunction("add", {node_f0, node_f1}, arrow::int32());
+  auto literal_10 = TreeExprBuilder::MakeLiteral((int32_t)10);
+  auto less_than_10 = TreeExprBuilder::MakeFunction("less_than", {sum_func, literal_10},
+                                                    arrow::boolean());
+  auto condition = TreeExprBuilder::MakeCondition(less_than_10);
+
+  // The check is fatal: `filter` is dereferenced below and would be null on failure.
+  std::shared_ptr<Filter> filter;
+  auto status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 5;
+
+  // Create an array without a validity buffer.
+  auto array0 = MakeArrowArrayInt32WithNullValidity({1, 2, 3, 4, 6});
+  auto array1 = MakeArrowArrayInt32({5, 9, 6, 17, 3}, {true, true, false, true, true});
+  // expected output (indices for which condition matches)
+  auto exp = MakeArrowArrayUint16({0, 4});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+  // The check is fatal: `selection_vector` is passed to Evaluate and read afterwards.
+  std::shared_ptr<SelectionVector> selection_vector;
+  status = SelectionVector::MakeInt16(num_records, pool_, &selection_vector);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Evaluate expression
+  status = filter->Evaluate(*in_batch, selection_vector);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
+}
+
+// Projects `if (a > b) a else b` where column a has no validity buffer.
+TEST_F(TestNullValidity, TestIfElse) {
+  // schema for input fields
+  auto fielda = field("a", int32());
+  auto fieldb = field("b", int32());
+  auto schema = arrow::schema({fielda, fieldb});
+
+  // output fields
+  auto field_result = field("res", int32());
+
+  // build expression.
+  // if (a > b)
+  //   a
+  // else
+  //   b
+  auto node_a = TreeExprBuilder::MakeField(fielda);
+  auto node_b = TreeExprBuilder::MakeField(fieldb);
+  auto condition =
+      TreeExprBuilder::MakeFunction("greater_than", {node_a, node_b}, boolean());
+  auto if_node = TreeExprBuilder::MakeIf(condition, node_a, node_b, int32());
+
+  auto expr = TreeExprBuilder::MakeExpression(if_node, field_result);
+
+  // Build a projector for the expressions. The check is fatal because `projector`
+  // is dereferenced below and would still be null after a failed build.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+  auto array0 = MakeArrowArrayInt32WithNullValidity({10, 12, -20, 5});
+  auto array1 = MakeArrowArrayInt32({5, 15, 15, 17});
+
+  // expected output
+  auto exp = MakeArrowArrayInt32({10, 15, 15, 17}, {true, true, true, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+  // Evaluate expression. The check is fatal so that `outputs.at(0)` is never
+  // reached when evaluation failed and produced no output.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+// Projects `length(a)` over a utf8 column rebuilt without a validity buffer.
+TEST_F(TestNullValidity, TestUtf8) {
+  // schema for input fields
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+
+  // output fields
+  auto res = field("res1", int32());
+
+  // build expressions.
+  // length(a)
+  auto expr = TreeExprBuilder::MakeExpression("length", {field_a}, res);
+
+  // Build a projector for the expressions. The check is fatal because `projector`
+  // is dereferenced below and would still be null after a failed build.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data. `array_a` reuses buffers[1] and
+  // buffers[2] of `array_v` and is constructed without a validity buffer.
+  int num_records = 5;
+  auto array_v = MakeArrowArrayUtf8({"foo", "hello", "bye", "hi", "मदन"});
+  auto array_a = std::make_shared<arrow::StringArray>(
+      num_records, array_v->data()->buffers[1], array_v->data()->buffers[2]);
+
+  // expected output
+  auto exp = MakeArrowArrayInt32({3, 5, 3, 2, 3}, {true, true, true, true, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Evaluate expression. The check is fatal so that `outputs.at(0)` is never
+  // reached when evaluation failed and produced no output.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/projector_build_validation_test.cc b/src/arrow/cpp/src/gandiva/tests/projector_build_validation_test.cc
new file mode 100644
index 000000000..5b86844f9
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/projector_build_validation_test.cc
@@ -0,0 +1,287 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include "arrow/memory_pool.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::float32;
+using arrow::int32;
+
+// Fixture for the projector build-validation tests.
+class TestProjector : public ::testing::Test {
+ public:
+  // Runs before each test body; `override` makes the compiler verify that this
+  // really overrides ::testing::Test::SetUp.
+  void SetUp() override { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+  // Arrow's default memory pool, made available to the tests.
+  arrow::MemoryPool* pool_;
+};
+
+// A call to a function name with no matching signature must be rejected when the
+// projector is built.
+TEST_F(TestProjector, TestNonexistentFunction) {
+  // Two float32 input columns.
+  auto lhs = field("f0", float32());
+  auto rhs = field("f2", float32());
+  auto schema = arrow::schema({lhs, rhs});
+
+  // Boolean output column.
+  auto out_field = field("res", boolean());
+
+  // res = nonexistent_function(f0, f2)
+  auto bad_expr =
+      TreeExprBuilder::MakeExpression("nonexistent_function", {lhs, rhs}, out_field);
+
+  // Building the projector is expected to fail validation.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {bad_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.IsExpressionValidationError());
+
+  const std::string expected_error =
+      "Function bool nonexistent_function(float, float) not supported yet.";
+  EXPECT_TRUE(status.message().find(expected_error) != std::string::npos);
+}
+
+// The root node's type (float) must match the declared result field type (bool).
+TEST_F(TestProjector, TestNotMatchingDataType) {
+  // Single float32 input column.
+  auto input = field("f0", float32());
+  auto schema = arrow::schema({input});
+
+  // Output column declared as boolean, which the float32 root does not match.
+  auto out_field = field("res", boolean());
+
+  // res = f0
+  auto root = TreeExprBuilder::MakeField(input);
+  auto mismatched_expr = TreeExprBuilder::MakeExpression(root, out_field);
+
+  // Building the projector is expected to fail validation.
+  std::shared_ptr<Projector> projector;
+  auto status =
+      Projector::Make(schema, {mismatched_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.IsExpressionValidationError());
+
+  const std::string expected_error =
+      "Return type of root node float does not match that of expression bool";
+  EXPECT_TRUE(status.message().find(expected_error) != std::string::npos);
+}
+
+// A field whose data type is list<int32> must be rejected as unsupported.
+TEST_F(TestProjector, TestNotSupportedDataType) {
+  // Single list<int32> input column.
+  auto list_input = field("f0", list(int32()));
+  auto schema = arrow::schema({list_input});
+
+  // Output column of the same list type.
+  auto out_field = field("res", list(int32()));
+
+  // res = f0
+  auto root = TreeExprBuilder::MakeField(list_input);
+  auto list_expr = TreeExprBuilder::MakeExpression(root, out_field);
+
+  // Building the projector is expected to fail validation.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {list_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.IsExpressionValidationError());
+
+  const std::string expected_error = "Field f0 has unsupported data type list";
+  EXPECT_TRUE(status.message().find(expected_error) != std::string::npos);
+}
+
+// An expression that references a field absent from the schema must be rejected.
+TEST_F(TestProjector, TestIncorrectSchemaMissingField) {
+  // "f2" is used by the expression but left out of the schema, which lists "f0"
+  // twice instead.
+  auto present = field("f0", float32());
+  auto missing = field("f2", float32());
+  auto schema = arrow::schema({present, present});
+
+  // Boolean output column.
+  auto out_field = field("res", boolean());
+
+  // res = less_than(f0, f2)
+  auto lt_expr =
+      TreeExprBuilder::MakeExpression("less_than", {present, missing}, out_field);
+
+  // Building the projector is expected to fail validation.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.IsExpressionValidationError());
+
+  const std::string expected_error = "Field f2 not in schema";
+  EXPECT_TRUE(status.message().find(expected_error) != std::string::npos);
+}
+
+// A field whose type in the expression differs from its type in the schema must
+// be rejected.
+TEST_F(TestProjector, TestIncorrectSchemaTypeNotMatching) {
+  // "f2" is float32 in the expression but int32 in the schema.
+  auto f0 = field("f0", float32());
+  auto f2_in_expr = field("f2", float32());
+  auto f2_in_schema = field("f2", int32());
+  auto schema = arrow::schema({f0, f2_in_schema});
+
+  // Boolean output column.
+  auto out_field = field("res", boolean());
+
+  // res = less_than(f0, f2)
+  auto lt_expr =
+      TreeExprBuilder::MakeExpression("less_than", {f0, f2_in_expr}, out_field);
+
+  // Building the projector is expected to fail validation.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.IsExpressionValidationError());
+
+  const std::string expected_error =
+      "Field definition in schema f2: int32 different from field in expression f2: float";
+  EXPECT_TRUE(status.message().find(expected_error) != std::string::npos);
+}
+
+// An if-node whose condition calls an unsupported function must be rejected.
+TEST_F(TestProjector, TestIfNotSupportedFunction) {
+  // Two int32 input columns.
+  auto fielda = field("a", int32());
+  auto fieldb = field("b", int32());
+  auto schema = arrow::schema({fielda, fieldb});
+
+  // int32 output column.
+  auto out_field = field("res", int32());
+
+  // Expression:
+  //   if (nonexistent_function(a, b))
+  //     a
+  //   else
+  //     b
+  auto node_a = TreeExprBuilder::MakeField(fielda);
+  auto node_b = TreeExprBuilder::MakeField(fieldb);
+  auto bad_condition =
+      TreeExprBuilder::MakeFunction("nonexistent_function", {node_a, node_b}, boolean());
+  auto if_node = TreeExprBuilder::MakeIf(bad_condition, node_a, node_b, int32());
+  auto if_expr = TreeExprBuilder::MakeExpression(if_node, out_field);
+
+  // Building the projector is expected to fail validation.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {if_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.IsExpressionValidationError());
+}
+
+// An if-node declared as boolean whose branches are int32 must be rejected.
+TEST_F(TestProjector, TestIfNotMatchingReturnType) {
+  // Two int32 input columns.
+  auto fielda = field("a", int32());
+  auto fieldb = field("b", int32());
+  auto schema = arrow::schema({fielda, fieldb});
+
+  // int32 output column.
+  auto out_field = field("res", int32());
+
+  // if (a < b) a else b, with the if-node itself typed as boolean.
+  auto node_a = TreeExprBuilder::MakeField(fielda);
+  auto node_b = TreeExprBuilder::MakeField(fieldb);
+  auto a_lt_b = TreeExprBuilder::MakeFunction("less_than", {node_a, node_b}, boolean());
+  auto if_node = TreeExprBuilder::MakeIf(a_lt_b, node_a, node_b, boolean());
+  auto if_expr = TreeExprBuilder::MakeExpression(if_node, out_field);
+
+  // Building the projector is expected to fail validation.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {if_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.IsExpressionValidationError());
+}
+
+// An int32 if-node whose else branch is a boolean field must be rejected.
+TEST_F(TestProjector, TestElseNotMatchingReturnType) {
+  // Two int32 columns plus one boolean column.
+  auto fielda = field("a", int32());
+  auto fieldb = field("b", int32());
+  auto fieldc = field("c", boolean());
+  auto schema = arrow::schema({fielda, fieldb, fieldc});
+
+  // int32 output column.
+  auto out_field = field("res", int32());
+
+  // if (a < b) a else c, where c is boolean.
+  auto node_a = TreeExprBuilder::MakeField(fielda);
+  auto node_b = TreeExprBuilder::MakeField(fieldb);
+  auto node_c = TreeExprBuilder::MakeField(fieldc);
+  auto a_lt_b = TreeExprBuilder::MakeFunction("less_than", {node_a, node_b}, boolean());
+  auto if_node = TreeExprBuilder::MakeIf(a_lt_b, node_a, node_c, int32());
+  auto if_expr = TreeExprBuilder::MakeExpression(if_node, out_field);
+
+  // Building the projector is expected to fail validation.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {if_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.IsExpressionValidationError());
+}
+
+// An if-node whose else branch is a list<int32> field must be rejected.
+TEST_F(TestProjector, TestElseNotSupportedType) {
+  // Two int32 columns in the schema; the list column "c" is only used by the
+  // expression and is not part of the schema.
+  auto fielda = field("a", int32());
+  auto fieldb = field("b", int32());
+  auto fieldc = field("c", list(int32()));
+  auto schema = arrow::schema({fielda, fieldb});
+
+  // int32 output column.
+  auto out_field = field("res", int32());
+
+  // if (a < b) a else c, where c is list<int32>.
+  auto node_a = TreeExprBuilder::MakeField(fielda);
+  auto node_b = TreeExprBuilder::MakeField(fieldb);
+  auto node_c = TreeExprBuilder::MakeField(fieldc);
+  auto a_lt_b = TreeExprBuilder::MakeFunction("less_than", {node_a, node_b}, boolean());
+  auto if_node = TreeExprBuilder::MakeIf(a_lt_b, node_a, node_c, int32());
+  auto if_expr = TreeExprBuilder::MakeExpression(if_node, out_field);
+
+  // Building the projector is expected to fail validation; the failure is checked
+  // through both the predicate and the raw status code.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {if_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.IsExpressionValidationError());
+  EXPECT_EQ(status.code(), StatusCode::ExpressionValidationError);
+}
+
+// An AND node built from a single child must be rejected.
+TEST_F(TestProjector, TestAndMinChildren) {
+  // Single boolean input column.
+  auto fielda = field("a", boolean());
+  auto schema = arrow::schema({fielda});
+
+  // Boolean output column.
+  auto out_field = field("res", boolean());
+
+  // and(a): only one operand.
+  auto only_child = TreeExprBuilder::MakeField(fielda);
+  auto and_node = TreeExprBuilder::MakeAnd({only_child});
+  auto and_expr = TreeExprBuilder::MakeExpression(and_node, out_field);
+
+  // Building the projector is expected to fail validation.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {and_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.IsExpressionValidationError());
+}
+
+// An AND node with an int32 operand must be rejected.
+TEST_F(TestProjector, TestAndBooleanArgType) {
+  // One boolean column and one int32 column.
+  auto fielda = field("a", boolean());
+  auto fieldb = field("b", int32());
+  auto schema = arrow::schema({fielda, fieldb});
+
+  // int32 output column.
+  auto out_field = field("res", int32());
+
+  // and(a, b), where b is int32.
+  auto bool_child = TreeExprBuilder::MakeField(fielda);
+  auto int_child = TreeExprBuilder::MakeField(fieldb);
+  auto and_node = TreeExprBuilder::MakeAnd({bool_child, int_child});
+  auto and_expr = TreeExprBuilder::MakeExpression(and_node, out_field);
+
+  // Building the projector is expected to fail validation.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {and_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.IsExpressionValidationError());
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/projector_test.cc b/src/arrow/cpp/src/gandiva/tests/projector_test.cc
new file mode 100644
index 000000000..120207773
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/projector_test.cc
@@ -0,0 +1,1609 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+#include "gandiva/projector.h"
+
+#include <gtest/gtest.h>
+
+#include <cmath>
+
+#include "arrow/memory_pool.h"
+#include "gandiva/literal_holder.h"
+#include "gandiva/node.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::float32;
+using arrow::int32;
+
+class TestProjector : public ::testing::Test {  // Fixture shared by the projector tests.
+ public:
+ void SetUp() override { pool_ = arrow::default_memory_pool(); }  // set before each test
+
+ protected:
+ arrow::MemoryPool* pool_ = nullptr;  // pool the tests hand to Projector::Evaluate
+};
+
+TEST_F(TestProjector, TestProjectCache) {  // Checks when Projector::Make reuses a projector.
+ // Input schema: two int32 columns.
+ auto field0 = field("f0", int32());
+ auto field1 = field("f2", int32());  // NOTE(review): named "f2", same as field2 below - confirm
+ auto schema = arrow::schema({field0, field1});
+
+ // Output fields, one per expression.
+ auto field_sum = field("add", int32());
+ auto field_sub = field("subtract", int32());
+
+ // Build the add and subtract expressions over both input columns.
+ auto sum_expr = TreeExprBuilder::MakeExpression("add", {field0, field1}, field_sum);
+ auto sub_expr =
+ TreeExprBuilder::MakeExpression("subtract", {field0, field1}, field_sub);
+
+ auto configuration = TestConfiguration();
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {sum_expr, sub_expr}, configuration, &projector);
+ ASSERT_OK(status);
+
+ // Everything is the same (equal schema, same expressions): expect the same projector.
+ auto schema_same = arrow::schema({field0, field1});
+ std::shared_ptr<Projector> cached_projector;
+ status = Projector::Make(schema_same, {sum_expr, sub_expr}, configuration,
+ &cached_projector);
+ ASSERT_OK(status);
+ EXPECT_EQ(cached_projector, projector);
+
+ // The schema is different (extra column): expect a new projector.
+ auto field2 = field("f2", int32());
+ auto different_schema = arrow::schema({field0, field1, field2});
+ std::shared_ptr<Projector> should_be_new_projector;
+ status = Projector::Make(different_schema, {sum_expr, sub_expr}, configuration,
+ &should_be_new_projector);
+ ASSERT_OK(status);
+ EXPECT_NE(cached_projector, should_be_new_projector);
+
+ // The expression list is different (add only): expect a new projector.
+ std::shared_ptr<Projector> should_be_new_projector1;
+ status = Projector::Make(schema, {sum_expr}, configuration, &should_be_new_projector1);
+ ASSERT_OK(status);
+ EXPECT_NE(cached_projector, should_be_new_projector1);
+
+ // A separate TestConfiguration() instance should still yield the same projector.
+ status = Projector::Make(schema, {sum_expr, sub_expr}, TestConfiguration(),
+ &cached_projector);
+ ASSERT_OK(status);
+ EXPECT_EQ(cached_projector, projector);
+}
+
+TEST_F(TestProjector, TestProjectCacheFieldNames) {  // Same function, different input fields.
+ // Input schema: three int32 columns.
+ auto field0 = field("f0", int32());
+ auto field1 = field("f1", int32());
+ auto field2 = field("f2", int32());
+ auto schema = arrow::schema({field0, field1, field2});
+
+ // Output fields, one per projector.
+ auto sum_01 = field("sum_01", int32());
+ auto sum_12 = field("sum_12", int32());
+
+ auto sum_expr_01 = TreeExprBuilder::MakeExpression("add", {field0, field1}, sum_01);
+ std::shared_ptr<Projector> projector_01;
+ auto status =
+ Projector::Make(schema, {sum_expr_01}, TestConfiguration(), &projector_01);
+ EXPECT_TRUE(status.ok());
+
+ auto sum_expr_12 = TreeExprBuilder::MakeExpression("add", {field1, field2}, sum_12);
+ std::shared_ptr<Projector> projector_12;
+ status = Projector::Make(schema, {sum_expr_12}, TestConfiguration(), &projector_12);
+ EXPECT_TRUE(status.ok());
+
+ // add(f0, f1) != add(f1, f2): the two Make calls must yield distinct projectors.
+ EXPECT_TRUE(projector_01.get() != projector_12.get());
+}
+
+TEST_F(TestProjector, TestProjectCacheDouble) {  // Close double literals get distinct projectors.
+ auto schema = arrow::schema({});  // no input columns; the expressions are literals only
+ auto res = field("result", arrow::float64());
+
+ double d0 = 1.23456788912345677E18;  // the two literals differ only from the 9th digit on
+ double d1 = 1.23456789012345677E18;
+
+ auto literal0 = TreeExprBuilder::MakeLiteral(d0);
+ auto expr0 = TreeExprBuilder::MakeExpression(literal0, res);
+ auto configuration = TestConfiguration();
+
+ std::shared_ptr<Projector> projector0;
+ auto status = Projector::Make(schema, {expr0}, configuration, &projector0);
+ EXPECT_TRUE(status.ok()) << status.message();
+
+ auto literal1 = TreeExprBuilder::MakeLiteral(d1);
+ auto expr1 = TreeExprBuilder::MakeExpression(literal1, res);
+ std::shared_ptr<Projector> projector1;
+ status = Projector::Make(schema, {expr1}, configuration, &projector1);
+ EXPECT_TRUE(status.ok()) << status.message();
+
+ EXPECT_TRUE(projector0.get() != projector1.get());  // different literal, different projector
+}
+
+TEST_F(TestProjector, TestProjectCacheFloat) {  // Close float literals get distinct projectors.
+ auto schema = arrow::schema({});  // no input columns; the expressions are literals only
+ auto res = field("result", arrow::float32());
+
+ float f0 = static_cast<float>(12345678891.000000);
+ float f1 = f0 - 1000;  // second literal is derived from the first by subtracting 1000
+
+ auto literal0 = TreeExprBuilder::MakeLiteral(f0);
+ auto expr0 = TreeExprBuilder::MakeExpression(literal0, res);
+ std::shared_ptr<Projector> projector0;
+ auto status = Projector::Make(schema, {expr0}, TestConfiguration(), &projector0);
+ EXPECT_TRUE(status.ok()) << status.message();
+
+ auto literal1 = TreeExprBuilder::MakeLiteral(f1);
+ auto expr1 = TreeExprBuilder::MakeExpression(literal1, res);
+ std::shared_ptr<Projector> projector1;
+ status = Projector::Make(schema, {expr1}, TestConfiguration(), &projector1);
+ EXPECT_TRUE(status.ok()) << status.message();
+
+ EXPECT_TRUE(projector0.get() != projector1.get());  // different literal, different projector
+}
+
+TEST_F(TestProjector, TestProjectCacheLiteral) {  // Distinct decimal literals, distinct projectors.
+ auto schema = arrow::schema({});  // no input columns; the expressions are literals only
+ auto res = field("result", arrow::decimal(38, 5));
+
+ DecimalScalar128 d0("12345678", 38, 5);  // both literals share precision 38 and scale 5
+ DecimalScalar128 d1("98756432", 38, 5);
+
+ auto literal0 = TreeExprBuilder::MakeDecimalLiteral(d0);
+ auto expr0 = TreeExprBuilder::MakeExpression(literal0, res);
+ std::shared_ptr<Projector> projector0;
+ ASSERT_OK(Projector::Make(schema, {expr0}, TestConfiguration(), &projector0));
+
+ auto literal1 = TreeExprBuilder::MakeDecimalLiteral(d1);
+ auto expr1 = TreeExprBuilder::MakeExpression(literal1, res);
+ std::shared_ptr<Projector> projector1;
+ ASSERT_OK(Projector::Make(schema, {expr1}, TestConfiguration(), &projector1));
+
+ EXPECT_NE(projector0.get(), projector1.get());  // different value, different projector
+}
+
+TEST_F(TestProjector, TestProjectCacheDecimalCast) {  // Cache must respect the output decimal type.
+ auto field_float64 = field("float64", arrow::float64());
+ auto schema = arrow::schema({field_float64});
+
+ auto res_31_13 = field("result", arrow::decimal(31, 13));
+ auto expr0 = TreeExprBuilder::MakeExpression("castDECIMAL", {field_float64}, res_31_13);
+ std::shared_ptr<Projector> projector0;
+ ASSERT_OK(Projector::Make(schema, {expr0}, TestConfiguration(), &projector0));
+
+ // If the output scale is different (14 instead of 13), the cache can't be used.
+ auto res_31_14 = field("result", arrow::decimal(31, 14));
+ auto expr1 = TreeExprBuilder::MakeExpression("castDECIMAL", {field_float64}, res_31_14);
+ std::shared_ptr<Projector> projector1;
+ ASSERT_OK(Projector::Make(schema, {expr1}, TestConfiguration(), &projector1));
+ EXPECT_NE(projector0.get(), projector1.get());
+
+ // If the output scale and precision are the same, expect a cache hit.
+ auto res_31_13_alt = field("result", arrow::decimal(31, 13));
+ auto expr2 =
+ TreeExprBuilder::MakeExpression("castDECIMAL", {field_float64}, res_31_13_alt);
+ std::shared_ptr<Projector> projector2;
+ ASSERT_OK(Projector::Make(schema, {expr2}, TestConfiguration(), &projector2));
+ EXPECT_EQ(projector0.get(), projector2.get());
+}
+
+TEST_F(TestProjector, TestIntSumSub) {  // add/subtract over nullable int32 columns.
+ // Input schema: two int32 columns.
+ auto field0 = field("f0", int32());
+ auto field1 = field("f2", int32());
+ auto schema = arrow::schema({field0, field1});
+
+ // Output fields, one per expression.
+ auto field_sum = field("add", int32());
+ auto field_sub = field("subtract", int32());
+
+ // Build the add and subtract expressions over both input columns.
+ auto sum_expr = TreeExprBuilder::MakeExpression("add", {field0, field1}, field_sum);
+ auto sub_expr =
+ TreeExprBuilder::MakeExpression("subtract", {field0, field1}, field_sub);
+
+ std::shared_ptr<Projector> projector;
+ auto status =
+ Projector::Make(schema, {sum_expr, sub_expr}, TestConfiguration(), &projector);
+ ASSERT_OK(status);  // fatal: projector is dereferenced below
+
+ // Sample data: each column has one null, in different rows.
+ int num_records = 4;
+ auto array0 = MakeArrowArrayInt32({1, 2, 3, 4}, {true, true, true, false});
+ auto array1 = MakeArrowArrayInt32({11, 13, 15, 17}, {true, true, false, true});
+ // Expected output: a row is null when either input is null.
+ auto exp_sum = MakeArrowArrayInt32({12, 15, 0, 0}, {true, true, false, false});
+ auto exp_sub = MakeArrowArrayInt32({-10, -11, 0, 0}, {true, true, false, false});
+
+ // Prepare the input record batch.
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate the expressions.
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_OK(status);  // fatal: outputs.at() below needs populated outputs
+
+ // Validate results (output order follows the expression order given to Make).
+ EXPECT_ARROW_ARRAY_EQUALS(exp_sum, outputs.at(0));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_sub, outputs.at(1));
+}
+
+template <typename TYPE, typename C_TYPE>  // TYPE: arrow type class; C_TYPE: its C value type
+static void TestArithmeticOpsForType(arrow::MemoryPool* pool) {  // checks six binary ops
+ auto atype = arrow::TypeTraits<TYPE>::type_singleton();
+
+ // Input schema: two columns of the type under test.
+ auto field0 = field("f0", atype);
+ auto field1 = field("f1", atype);
+ auto schema = arrow::schema({field0, field1});
+
+ // Output fields: arithmetic results keep the input type, comparisons are boolean.
+ auto field_sum = field("add", atype);
+ auto field_sub = field("subtract", atype);
+ auto field_mul = field("multiply", atype);
+ auto field_div = field("divide", atype);
+ auto field_eq = field("equal", arrow::boolean());
+ auto field_lt = field("less_than", arrow::boolean());
+
+ // Build one expression per operation.
+ auto sum_expr = TreeExprBuilder::MakeExpression("add", {field0, field1}, field_sum);
+ auto sub_expr =
+ TreeExprBuilder::MakeExpression("subtract", {field0, field1}, field_sub);
+ auto mul_expr =
+ TreeExprBuilder::MakeExpression("multiply", {field0, field1}, field_mul);
+ auto div_expr = TreeExprBuilder::MakeExpression("divide", {field0, field1}, field_div);
+ auto eq_expr = TreeExprBuilder::MakeExpression("equal", {field0, field1}, field_eq);
+ auto lt_expr = TreeExprBuilder::MakeExpression("less_than", {field0, field1}, field_lt);
+
+ std::shared_ptr<Projector> projector;
+ auto status =
+ Projector::Make(schema, {sum_expr, sub_expr, mul_expr, div_expr, eq_expr, lt_expr},
+ TestConfiguration(), &projector);
+ ASSERT_OK(status);  // fatal: projector is dereferenced below
+
+ // Sample data: 12 rows, all valid; input1 contains no zero, so divide is defined.
+ int num_records = 12;
+ std::vector<C_TYPE> input0 = {1, 2, 53, 84, 5, 15, 0, 1, 52, 83, 4, 120};
+ std::vector<C_TYPE> input1 = {10, 15, 23, 84, 4, 51, 68, 9, 16, 18, 19, 37};
+ std::vector<bool> validity = {true, true, true, true, true, true,
+ true, true, true, true, true, true};
+
+ auto array0 = MakeArrowArray<TYPE, C_TYPE>(input0, validity);
+ auto array1 = MakeArrowArray<TYPE, C_TYPE>(input1, validity);
+
+ // Expected output, computed row by row with the native C++ operators.
+ std::vector<C_TYPE> sum;
+ std::vector<C_TYPE> sub;
+ std::vector<C_TYPE> mul;
+ std::vector<C_TYPE> div;
+ std::vector<bool> eq;
+ std::vector<bool> lt;
+ for (int i = 0; i < num_records; i++) {
+ sum.push_back(static_cast<C_TYPE>(input0[i] + input1[i]));  // cast back to C_TYPE
+ sub.push_back(static_cast<C_TYPE>(input0[i] - input1[i]));
+ mul.push_back(static_cast<C_TYPE>(input0[i] * input1[i]));
+ div.push_back(static_cast<C_TYPE>(input0[i] / input1[i]));
+ eq.push_back(input0[i] == input1[i]);
+ lt.push_back(input0[i] < input1[i]);
+ }
+ auto exp_sum = MakeArrowArray<TYPE, C_TYPE>(sum, validity);
+ auto exp_sub = MakeArrowArray<TYPE, C_TYPE>(sub, validity);
+ auto exp_mul = MakeArrowArray<TYPE, C_TYPE>(mul, validity);
+ auto exp_div = MakeArrowArray<TYPE, C_TYPE>(div, validity);
+ auto exp_eq = MakeArrowArray<arrow::BooleanType, bool>(eq, validity);
+ auto exp_lt = MakeArrowArray<arrow::BooleanType, bool>(lt, validity);
+
+ // Prepare the input record batch.
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate the expressions.
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool, &outputs);
+ ASSERT_OK(status);  // fatal: outputs.at() below needs populated outputs
+
+ // Validate results (output order follows the expression order given to Make).
+ EXPECT_ARROW_ARRAY_EQUALS(exp_sum, outputs.at(0));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_sub, outputs.at(1));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_mul, outputs.at(2));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_div, outputs.at(3));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_eq, outputs.at(4));
+ EXPECT_ARROW_ARRAY_EQUALS(exp_lt, outputs.at(5));
+}
+
+TEST_F(TestProjector, TestAllIntTypes) {  // Runs the arithmetic helper for every integer width.
+ TestArithmeticOpsForType<arrow::UInt8Type, uint8_t>(pool_);  // unsigned types first
+ TestArithmeticOpsForType<arrow::UInt16Type, uint16_t>(pool_);
+ TestArithmeticOpsForType<arrow::UInt32Type, uint32_t>(pool_);
+ TestArithmeticOpsForType<arrow::UInt64Type, uint64_t>(pool_);
+ TestArithmeticOpsForType<arrow::Int8Type, int8_t>(pool_);  // then the signed types
+ TestArithmeticOpsForType<arrow::Int16Type, int16_t>(pool_);
+ TestArithmeticOpsForType<arrow::Int32Type, int32_t>(pool_);
+ TestArithmeticOpsForType<arrow::Int64Type, int64_t>(pool_);
+}
+
+TEST_F(TestProjector, TestExtendedMath) {  // Compares float64 math functions against <cmath>.
+ // Input schema: two float64 columns.
+ auto field0 = arrow::field("f0", arrow::float64());
+ auto field1 = arrow::field("f1", arrow::float64());
+ auto schema = arrow::schema({field0, field1});
+
+ // Output fields: one float64 result per function.
+ auto field_cbrt = arrow::field("cbrt", arrow::float64());
+ auto field_exp = arrow::field("exp", arrow::float64());
+ auto field_log = arrow::field("log", arrow::float64());
+ auto field_log10 = arrow::field("log10", arrow::float64());
+ auto field_logb = arrow::field("logb", arrow::float64());
+ auto field_power = arrow::field("power", arrow::float64());
+ auto field_sin = arrow::field("sin", arrow::float64());
+ auto field_cos = arrow::field("cos", arrow::float64());
+ auto field_asin = arrow::field("asin", arrow::float64());
+ auto field_acos = arrow::field("acos", arrow::float64());
+ auto field_tan = arrow::field("tan", arrow::float64());
+ auto field_atan = arrow::field("atan", arrow::float64());
+ auto field_sinh = arrow::field("sinh", arrow::float64());
+ auto field_cosh = arrow::field("cosh", arrow::float64());
+ auto field_tanh = arrow::field("tanh", arrow::float64());
+ auto field_atan2 = arrow::field("atan2", arrow::float64());
+ auto field_cot = arrow::field("cot", arrow::float64());
+ auto field_radians = arrow::field("radians", arrow::float64());
+ auto field_degrees = arrow::field("degrees", arrow::float64());
+
+ // Build one expression per function; "log" is used in both 1- and 2-argument forms.
+ auto cbrt_expr = TreeExprBuilder::MakeExpression("cbrt", {field0}, field_cbrt);
+ auto exp_expr = TreeExprBuilder::MakeExpression("exp", {field0}, field_exp);
+ auto log_expr = TreeExprBuilder::MakeExpression("log", {field0}, field_log);
+ auto log10_expr = TreeExprBuilder::MakeExpression("log10", {field0}, field_log10);
+ auto logb_expr = TreeExprBuilder::MakeExpression("log", {field0, field1}, field_logb);
+ auto power_expr =
+ TreeExprBuilder::MakeExpression("power", {field0, field1}, field_power);
+ auto sin_expr = TreeExprBuilder::MakeExpression("sin", {field0}, field_sin);
+ auto cos_expr = TreeExprBuilder::MakeExpression("cos", {field0}, field_cos);
+ auto asin_expr = TreeExprBuilder::MakeExpression("asin", {field0}, field_asin);
+ auto acos_expr = TreeExprBuilder::MakeExpression("acos", {field0}, field_acos);
+ auto tan_expr = TreeExprBuilder::MakeExpression("tan", {field0}, field_tan);
+ auto atan_expr = TreeExprBuilder::MakeExpression("atan", {field0}, field_atan);
+ auto sinh_expr = TreeExprBuilder::MakeExpression("sinh", {field0}, field_sinh);
+ auto cosh_expr = TreeExprBuilder::MakeExpression("cosh", {field0}, field_cosh);
+ auto tanh_expr = TreeExprBuilder::MakeExpression("tanh", {field0}, field_tanh);
+ auto atan2_expr =
+ TreeExprBuilder::MakeExpression("atan2", {field0, field1}, field_atan2);
+ auto cot_expr = TreeExprBuilder::MakeExpression("cot", {field0}, field_cot);
+ auto radians_expr = TreeExprBuilder::MakeExpression("radians", {field0}, field_radians);
+ auto degrees_expr = TreeExprBuilder::MakeExpression("degrees", {field0}, field_degrees);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(
+ schema,
+ {cbrt_expr, exp_expr, log_expr, log10_expr, logb_expr, power_expr, sin_expr,
+ cos_expr, asin_expr, acos_expr, tan_expr, atan_expr, sinh_expr, cosh_expr,
+ tanh_expr, atan2_expr, cot_expr, radians_expr, degrees_expr},
+ TestConfiguration(), &projector);
+ ASSERT_OK(status);  // fatal: projector is dereferenced below
+
+ // Sample data: 4 rows, all valid.
+ int num_records = 4;
+ std::vector<double> input0 = {16, 10, -14, 8.3};  // NOTE(review): outside log/asin/acos domain
+ std::vector<double> input1 = {2, 3, 5, 7};
+ std::vector<bool> validity = {true, true, true, true};
+
+ auto array0 = MakeArrowArray<arrow::DoubleType, double>(input0, validity);
+ auto array1 = MakeArrowArray<arrow::DoubleType, double>(input1, validity);
+
+ // Expected output, computed row by row with the C math library.
+ std::vector<double> cbrt_vals;
+ std::vector<double> exp_vals;
+ std::vector<double> log_vals;
+ std::vector<double> log10_vals;
+ std::vector<double> logb_vals;
+ std::vector<double> power_vals;
+ std::vector<double> sin_vals;
+ std::vector<double> cos_vals;
+ std::vector<double> asin_vals;
+ std::vector<double> acos_vals;
+ std::vector<double> tan_vals;
+ std::vector<double> atan_vals;
+ std::vector<double> sinh_vals;
+ std::vector<double> cosh_vals;
+ std::vector<double> tanh_vals;
+ std::vector<double> atan2_vals;
+ std::vector<double> cot_vals;
+ std::vector<double> radians_vals;
+ std::vector<double> degrees_vals;
+ for (int i = 0; i < num_records; i++) {
+ cbrt_vals.push_back(static_cast<double>(cbrtl(input0[i])));
+ exp_vals.push_back(static_cast<double>(expl(input0[i])));
+ log_vals.push_back(static_cast<double>(logl(input0[i])));
+ log10_vals.push_back(static_cast<double>(log10l(input0[i])));
+ logb_vals.push_back(static_cast<double>(logl(input1[i]) / logl(input0[i])));  // base = f0
+ power_vals.push_back(static_cast<double>(powl(input0[i], input1[i])));
+ sin_vals.push_back(static_cast<double>(sin(input0[i])));
+ cos_vals.push_back(static_cast<double>(cos(input0[i])));
+ asin_vals.push_back(static_cast<double>(asin(input0[i])));
+ acos_vals.push_back(static_cast<double>(acos(input0[i])));
+ tan_vals.push_back(static_cast<double>(tan(input0[i])));
+ atan_vals.push_back(static_cast<double>(atan(input0[i])));
+ sinh_vals.push_back(static_cast<double>(sinh(input0[i])));
+ cosh_vals.push_back(static_cast<double>(cosh(input0[i])));
+ tanh_vals.push_back(static_cast<double>(tanh(input0[i])));
+ atan2_vals.push_back(static_cast<double>(atan2(input0[i], input1[i])));
+ cot_vals.push_back(static_cast<double>(tan(M_PI / 2 - input0[i])));  // cot(x) = tan(pi/2 - x)
+ radians_vals.push_back(static_cast<double>(input0[i] * M_PI / 180.0));
+ degrees_vals.push_back(static_cast<double>(input0[i] * 180.0 / M_PI));
+ }
+ auto expected_cbrt = MakeArrowArray<arrow::DoubleType, double>(cbrt_vals, validity);
+ auto expected_exp = MakeArrowArray<arrow::DoubleType, double>(exp_vals, validity);
+ auto expected_log = MakeArrowArray<arrow::DoubleType, double>(log_vals, validity);
+ auto expected_log10 = MakeArrowArray<arrow::DoubleType, double>(log10_vals, validity);
+ auto expected_logb = MakeArrowArray<arrow::DoubleType, double>(logb_vals, validity);
+ auto expected_power = MakeArrowArray<arrow::DoubleType, double>(power_vals, validity);
+ auto expected_sin = MakeArrowArray<arrow::DoubleType, double>(sin_vals, validity);
+ auto expected_cos = MakeArrowArray<arrow::DoubleType, double>(cos_vals, validity);
+ auto expected_asin = MakeArrowArray<arrow::DoubleType, double>(asin_vals, validity);
+ auto expected_acos = MakeArrowArray<arrow::DoubleType, double>(acos_vals, validity);
+ auto expected_tan = MakeArrowArray<arrow::DoubleType, double>(tan_vals, validity);
+ auto expected_atan = MakeArrowArray<arrow::DoubleType, double>(atan_vals, validity);
+ auto expected_sinh = MakeArrowArray<arrow::DoubleType, double>(sinh_vals, validity);
+ auto expected_cosh = MakeArrowArray<arrow::DoubleType, double>(cosh_vals, validity);
+ auto expected_tanh = MakeArrowArray<arrow::DoubleType, double>(tanh_vals, validity);
+ auto expected_atan2 = MakeArrowArray<arrow::DoubleType, double>(atan2_vals, validity);
+ auto expected_cot = MakeArrowArray<arrow::DoubleType, double>(cot_vals, validity);
+ auto expected_radians =
+ MakeArrowArray<arrow::DoubleType, double>(radians_vals, validity);
+ auto expected_degrees =
+ MakeArrowArray<arrow::DoubleType, double>(degrees_vals, validity);
+ // Prepare the input record batch.
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate the expressions.
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_OK(status);  // fatal: outputs.at() below needs populated outputs
+
+ // Validate results; indices follow the expression order given to Make.
+ double epsilon = 1E-13;
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_cbrt, outputs.at(0), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_exp, outputs.at(1), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_log, outputs.at(2), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_log10, outputs.at(3), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_logb, outputs.at(4), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_power, outputs.at(5), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_sin, outputs.at(6), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_cos, outputs.at(7), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_asin, outputs.at(8), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_acos, outputs.at(9), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_tan, outputs.at(10), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_atan, outputs.at(11), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_sinh, outputs.at(12), 1E-08);  // looser tolerance
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_cosh, outputs.at(13), 1E-08);  // looser tolerance
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_tanh, outputs.at(14), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_atan2, outputs.at(15), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_cot, outputs.at(16), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_radians, outputs.at(17), epsilon);
+ EXPECT_ARROW_ARRAY_APPROX_EQUALS(expected_degrees, outputs.at(18), epsilon);
+}
+
+TEST_F(TestProjector, TestFloatLessThan) {  // less_than over nullable float32 columns.
+ // Input schema: two float32 columns.
+ auto field0 = field("f0", float32());
+ auto field1 = field("f2", float32());
+ auto schema = arrow::schema({field0, field1});
+
+ // Output field: boolean "res".
+ auto field_result = field("res", boolean());
+
+ // Build the less_than(f0, f2) expression.
+ auto lt_expr =
+ TreeExprBuilder::MakeExpression("less_than", {field0, field1}, field_result);
+
+ // Build a projector for the expression.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector);
+ ASSERT_OK(status);  // fatal: projector is dereferenced below
+
+ // Sample data: the last row of the first column is null.
+ int num_records = 3;
+ auto array0 = MakeArrowArrayFloat32({1.0f, 8.9f, 3.0f}, {true, true, false});
+ auto array1 = MakeArrowArrayFloat32({4.0f, 3.4f, 6.8f}, {true, true, true});
+ // Expected output: the row with a null input is null.
+ auto exp = MakeArrowArrayBool({true, false, false}, {true, true, false});
+
+ // Prepare the input record batch.
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate the expression.
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_OK(status);  // fatal: outputs.at() below needs populated outputs
+
+ // Validate results.
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestIsNotNull) {  // isnotnull yields a valid boolean for every row.
+ // Input schema: one float32 column.
+ auto field0 = field("f0", float32());
+ auto schema = arrow::schema({field0});
+
+ // Output field: boolean "res".
+ auto field_result = field("res", boolean());
+
+ // Build the isnotnull(f0) expression.
+ auto myexpr = TreeExprBuilder::MakeExpression("isnotnull", {field0}, field_result);
+
+ // Build a projector for the expression.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {myexpr}, TestConfiguration(), &projector);
+ ASSERT_OK(status);  // fatal: projector is dereferenced below
+
+ // Sample data: the last row is null.
+ int num_records = 3;
+ auto array0 = MakeArrowArrayFloat32({1.0f, 8.9f, 3.0f}, {true, true, false});
+ // Expected output: all rows valid; the value is false only for the null input row.
+ auto exp = MakeArrowArrayBool({true, true, false}, {true, true, true});
+
+ // Prepare the input record batch.
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ // Evaluate the expression.
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_OK(status);  // fatal: outputs.at() below needs populated outputs
+
+ // Validate results.
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestZeroCopy) {  // Evaluate into caller-provided output buffers.
+ // Input schema: one int32 column.
+ auto field0 = field("f0", int32());
+ auto schema = arrow::schema({field0});
+
+ // Output field: float32 "res".
+ auto res = field("res", float32());
+
+ // Build the castFLOAT4(f0) expression.
+ auto cast_expr = TreeExprBuilder::MakeExpression("castFLOAT4", {field0}, res);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {cast_expr}, TestConfiguration(), &projector);
+ ASSERT_OK(status);  // fatal: projector is dereferenced below
+
+ // Sample data: the last row is null.
+ int num_records = 4;
+ auto array0 = MakeArrowArrayInt32({1, 2, 3, 4}, {true, true, true, false});
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ // Expected output: same validity as the input, values converted to float.
+ auto exp = MakeArrowArrayFloat32({1, 2, 3, 0}, {true, true, true, false});
+
+ // Allocate the output validity bitmap, padded to a multiple of 64 bytes.
+ int64_t bitmap_sz = arrow::BitUtil::BytesForBits(num_records);
+ int64_t bitmap_capacity = arrow::BitUtil::RoundUpToMultipleOf64(bitmap_sz);
+ std::vector<uint8_t> bitmap(bitmap_capacity);
+ std::shared_ptr<arrow::MutableBuffer> bitmap_buf =
+ std::make_shared<arrow::MutableBuffer>(&bitmap[0], bitmap_capacity);
+
+ int64_t data_sz = sizeof(float) * num_records;
+ std::vector<uint8_t> data(arrow::BitUtil::RoundUpToMultipleOf64(data_sz));  // was bitmap_capacity
+ std::shared_ptr<arrow::MutableBuffer> data_buf =
+ std::make_shared<arrow::MutableBuffer>(&data[0], data_sz);
+
+ auto array_data =
+ arrow::ArrayData::Make(float32(), num_records, {bitmap_buf, data_buf});
+
+ // Evaluate the expression directly into array_data.
+ status = projector->Evaluate(*in_batch, {array_data});
+ ASSERT_OK(status);  // fatal: the output is only meaningful after a successful Evaluate
+
+ // Validate results by wrapping the filled buffers in an array.
+ auto output = arrow::MakeArray(array_data);
+ EXPECT_ARROW_ARRAY_EQUALS(exp, output);
+}
+
+TEST_F(TestProjector, TestZeroCopyNegative) {  // Invalid inputs to the buffer-based Evaluate.
+ // Input schema: one int32 column.
+ auto field0 = field("f0", int32());
+ auto schema = arrow::schema({field0});
+
+ // Output field: float32 "res".
+ auto res = field("res", float32());
+
+ // Build the castFLOAT4(f0) expression.
+ auto cast_expr = TreeExprBuilder::MakeExpression("castFLOAT4", {field0}, res);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {cast_expr}, TestConfiguration(), &projector);
+ ASSERT_OK(status);  // fatal: projector is dereferenced below
+
+ // Sample data: the last row is null.
+ int num_records = 4;
+ auto array0 = MakeArrowArrayInt32({1, 2, 3, 4}, {true, true, true, false});
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ // Expected output of a successful evaluation (not compared in this test).
+ auto exp = MakeArrowArrayFloat32({1, 2, 3, 0}, {true, true, true, false});
+
+ // Allocate exactly-sized output buffers (no padding).
+ int64_t bitmap_sz = arrow::BitUtil::BytesForBits(num_records);
+ std::unique_ptr<uint8_t[]> bitmap(new uint8_t[bitmap_sz]);
+ std::shared_ptr<arrow::MutableBuffer> bitmap_buf =
+ std::make_shared<arrow::MutableBuffer>(bitmap.get(), bitmap_sz);
+
+ int64_t data_sz = sizeof(float) * num_records;
+ std::unique_ptr<uint8_t[]> data(new uint8_t[data_sz]);
+ std::shared_ptr<arrow::MutableBuffer> data_buf =
+ std::make_shared<arrow::MutableBuffer>(data.get(), data_sz);
+
+ auto array_data =
+ arrow::ArrayData::Make(float32(), num_records, {bitmap_buf, data_buf});
+
+ // The batch can't be empty.
+ auto bad_batch = arrow::RecordBatch::Make(schema, 0 /*num_records*/, {array0});
+ status = projector->Evaluate(*bad_batch, {array_data});
+ EXPECT_EQ(status.code(), StatusCode::Invalid);
+
+ // The output array can't be null.
+ std::shared_ptr<arrow::ArrayData> null_array_data;
+ status = projector->Evaluate(*in_batch, {null_array_data});
+ EXPECT_EQ(status.code(), StatusCode::Invalid);
+
+ // The output array must have at least two buffers (here only the bitmap is given).
+ auto bad_array_data = arrow::ArrayData::Make(float32(), num_records, {bitmap_buf});
+ status = projector->Evaluate(*in_batch, {bad_array_data});
+ EXPECT_EQ(status.code(), StatusCode::Invalid);
+
+ // The data buffer must be large enough (here it is one byte short).
+ std::shared_ptr<arrow::MutableBuffer> bad_data_buf =
+ std::make_shared<arrow::MutableBuffer>(data.get(), data_sz - 1);
+ auto bad_array_data2 =
+ arrow::ArrayData::Make(float32(), num_records, {bitmap_buf, bad_data_buf});
+ status = projector->Evaluate(*in_batch, {bad_array_data2});
+ EXPECT_EQ(status.code(), StatusCode::Invalid);
+
+ // The bitmap buffer must be large enough (here it is one byte short).
+ std::shared_ptr<arrow::MutableBuffer> bad_bitmap_buf =
+ std::make_shared<arrow::MutableBuffer>(bitmap.get(), bitmap_sz - 1);
+ auto bad_array_data3 =
+ arrow::ArrayData::Make(float32(), num_records, {bad_bitmap_buf, data_buf});
+ status = projector->Evaluate(*in_batch, {bad_array_data3});
+ EXPECT_EQ(status.code(), StatusCode::Invalid);
+}
+
+TEST_F(TestProjector, TestDivideZero) {
+ // schema for input fields
+ auto field0 = field("f0", int32());
+ auto field1 = field("f2", int32());
+ auto schema = arrow::schema({field0, field1});
+
+ // output fields
+ auto field_div = field("divide", int32());
+
+ // Build expression
+ auto div_expr = TreeExprBuilder::MakeExpression("divide", {field0, field1}, field_div);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {div_expr}, TestConfiguration(), &projector);
+ EXPECT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 5;
+ auto array0 = MakeArrowArrayInt32({2, 3, 4, 5, 6}, {true, true, true, true, true});
+ auto array1 = MakeArrowArrayInt32({1, 2, 2, 0, 0}, {true, true, false, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate expression
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ EXPECT_EQ(status.code(), StatusCode::ExecutionError);
+ std::string expected_error = "divide by zero error";
+ EXPECT_TRUE(status.message().find(expected_error) != std::string::npos);
+
+ // Testing for second batch that has no error should succeed.
+ num_records = 5;
+ array0 = MakeArrowArrayInt32({2, 3, 4, 5, 6}, {true, true, true, true, true});
+ array1 = MakeArrowArrayInt32({1, 2, 2, 1, 1}, {true, true, false, true, true});
+
+ // prepare input record batch
+ in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+ // expected output
+ auto exp = MakeArrowArrayInt32({2, 1, 2, 5, 6}, {true, true, false, true, true});
+
+ // Evaluate expression
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ EXPECT_TRUE(status.ok());
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestModZero) {  // mod(int64, int32) with a zero divisor does not fail.
+ // Input schema: an int64 dividend column and an int32 divisor column.
+ auto field0 = field("f0", arrow::int64());
+ auto field1 = field("f2", int32());
+ auto schema = arrow::schema({field0, field1});
+
+ // Output field: int32 "mod".
+ auto field_div = field("mod", int32());
+
+ // Build the mod(f0, f2) expression.
+ auto mod_expr = TreeExprBuilder::MakeExpression("mod", {field0, field1}, field_div);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {mod_expr}, TestConfiguration(), &projector);
+ ASSERT_OK(status);  // fatal: projector is dereferenced below
+
+ // Sample data: row 2 has a null divisor, row 3 has a valid zero divisor.
+ int num_records = 4;
+ auto array0 = MakeArrowArrayInt64({2, 3, 4, 5}, {true, true, true, true});
+ auto array1 = MakeArrowArrayInt32({1, 2, 2, 0}, {true, true, false, true});
+ // Expected output: the zero-divisor row yields the dividend (5) and stays valid.
+ auto exp_mod = MakeArrowArrayInt32({0, 1, 0, 5}, {true, true, false, true});
+
+ // Prepare the input record batch.
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate the expression.
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_OK(status);  // fatal: outputs.at() below needs populated outputs
+
+ // Validate results.
+ EXPECT_ARROW_ARRAY_EQUALS(exp_mod, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestConcat) {
+ // schema for input fields
+ auto field0 = field("f0", arrow::utf8());
+ auto field1 = field("f1", arrow::utf8());
+ auto schema = arrow::schema({field0, field1});
+
+ // output fields
+ auto field_concat = field("concat", arrow::utf8());
+
+ // Build expression
+ auto concat_expr =
+ TreeExprBuilder::MakeExpression("concat", {field0, field1}, field_concat);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {concat_expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data (empty strings and null inputs included)
+ int num_records = 6;
+ auto array0 = MakeArrowArrayUtf8({"ab", "", "ab", "invalid", "valid", "invalid"},
+ {true, true, true, false, true, false});
+ auto array1 = MakeArrowArrayUtf8({"cd", "cd", "", "valid", "invalid", "invalid"},
+ {true, true, true, true, false, false});
+ // expected output: every row is valid; a null input contributes no characters
+ auto exp_concat = MakeArrowArrayUtf8({"abcd", "cd", "ab", "valid", "valid", ""},
+ {true, true, true, true, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_concat, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestBase64) {
+ // schema for input fields
+ auto field0 = field("f0", arrow::binary());
+ auto schema = arrow::schema({field0});
+
+ // output fields
+ auto field_base = field("base64", arrow::utf8());
+
+ // Build expression
+ auto base_expr = TreeExprBuilder::MakeExpression("base64", {field0}, field_base);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {base_expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data (includes an empty binary value)
+ int num_records = 4;
+ auto array0 =
+ MakeArrowArrayBinary({"hello", "", "test", "hive"}, {true, true, true, true});
+ // expected output: base64 text of each input, empty input stays empty
+ auto exp_base = MakeArrowArrayUtf8({"aGVsbG8=", "", "dGVzdA==", "aGl2ZQ=="},
+ {true, true, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_base, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestUnbase64) {
+ // schema for input fields
+ auto field0 = field("f0", arrow::utf8());
+ auto schema = arrow::schema({field0});
+
+ // output fields
+ auto field_unbase = field("base64", arrow::binary());
+
+ // Build expression
+ auto unbase_expr = TreeExprBuilder::MakeExpression("unbase64", {field0}, field_unbase);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {unbase_expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data (includes an empty string)
+ int num_records = 4;
+ auto array0 = MakeArrowArrayUtf8({"aGVsbG8=", "", "dGVzdA==", "aGl2ZQ=="},
+ {true, true, true, true});
+ // expected output: the decoded bytes of each base64 input
+ auto exp_unbase =
+ MakeArrowArrayBinary({"hello", "", "test", "hive"}, {true, true, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_unbase, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestLeftString) {
+ // schema for input fields
+ auto field0 = field("f0", arrow::utf8());
+ auto field1 = field("f1", arrow::int32());
+ auto schema = arrow::schema({field0, field1});
+
+ // output fields
+ auto field_left = field("left", arrow::utf8());
+
+ // Build expression
+ auto left_expr =
+ TreeExprBuilder::MakeExpression("left", {field0, field1}, field_left);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {left_expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data (oversized, negative and zero lengths)
+ int num_records = 6;
+ auto array0 = MakeArrowArrayUtf8({"ab", "", "ab", "invalid", "valid", "invalid"},
+ {true, true, true, true, true, true});
+ auto array1 =
+ MakeArrowArrayInt32({1, 500, 2, -5, 5, 0}, {true, true, true, true, true, true});
+ // expected output: e.g. length -5 on "invalid" is expected to keep the leading "in"
+ auto exp_left = MakeArrowArrayUtf8({"a", "", "ab", "in", "valid", ""},
+ {true, true, true, true, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_left, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestRightString) {
+ // schema for input fields
+ auto field0 = field("f0", arrow::utf8());
+ auto field1 = field("f1", arrow::int32());
+ auto schema = arrow::schema({field0, field1});
+
+ // output fields
+ auto field_right = field("right", arrow::utf8());
+
+ // Build expression
+ auto right_expr =
+ TreeExprBuilder::MakeExpression("right", {field0, field1}, field_right);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {right_expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data (oversized, negative and zero lengths)
+ int num_records = 6;
+ auto array0 = MakeArrowArrayUtf8({"ab", "", "ab", "invalid", "valid", "invalid"},
+ {true, true, true, true, true, true});
+ auto array1 =
+ MakeArrowArrayInt32({1, 500, 2, -5, 5, 0}, {true, true, true, true, true, true});
+ // expected output: e.g. length -5 on "invalid" is expected to keep the trailing "id"
+ auto exp_right = MakeArrowArrayUtf8({"b", "", "ab", "id", "valid", ""},
+ {true, true, true, true, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_right, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestOffset) {
+ // schema for input fields
+ auto field0 = field("f0", arrow::int32());
+ auto field1 = field("f1", arrow::int32());
+ auto schema = arrow::schema({field0, field1});
+
+ // output fields
+ auto field_sum = field("sum", arrow::int32());
+
+ // Build expression
+ auto sum_expr = TreeExprBuilder::MakeExpression("add", {field0, field1}, field_sum);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {sum_expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data; array0 is sliced so it carries an offset
+ int num_records = 4;
+ auto array0 = MakeArrowArrayInt32({1, 2, 3, 4, 5}, {true, true, true, true, false});
+ array0 = array0->Slice(1);
+ auto array1 = MakeArrowArrayInt32({5, 6, 7, 8}, {true, false, true, true});
+ // expected output for the 3 rows that remain once the batch itself is sliced below
+ auto exp_sum = MakeArrowArrayInt32({9, 11, 13}, {false, true, false});
+
+ // prepare input record batch, then drop its first row
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+ in_batch = in_batch->Slice(1);
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_sum, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestByteSubString) {
+ // schema for input fields: the binary value, then two int32 arguments
+ auto field0 = field("f0", arrow::binary());
+ auto field1 = field("f1", arrow::int32());
+ auto field2 = field("f2", arrow::int32());
+ auto schema = arrow::schema({field0, field1, field2});
+
+ // output fields
+ auto field_byte_substr = field("bytesubstring", arrow::binary());
+
+ // Build expression
+ auto byte_substr_expr = TreeExprBuilder::MakeExpression(
+ "bytesubstring", {field0, field1, field2}, field_byte_substr);
+
+ std::shared_ptr<Projector> projector;
+ auto status =
+ Projector::Make(schema, {byte_substr_expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 6;
+ auto array0 = MakeArrowArrayBinary({"ab", "", "ab", "invalid", "valid", "invalid"},
+ {true, true, true, true, true, true});
+ auto array1 =
+ MakeArrowArrayInt32({0, 1, 1, 1, 3, 3}, {true, true, true, true, true, true});
+ auto array2 =
+ MakeArrowArrayInt32({0, 1, 1, 2, 3, 3}, {true, true, true, true, true, true});
+ // expected output
+ auto exp_byte_substr = MakeArrowArrayBinary({"", "", "a", "in", "lid", "val"},
+ {true, true, true, true, true, true});
+
+ // prepare input record batch
+ auto in = arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2});
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_byte_substr, outputs.at(0));
+}
+
+// Test to ensure behaviour of cast functions when the validity is false for an input. The
+// function should not run for that input.
+TEST_F(TestProjector, TestCastFunction) {
+ auto field0 = field("f0", arrow::utf8());
+ auto schema = arrow::schema({field0});
+
+ // output fields
+ auto res_float4 = field("res_float4", arrow::float32());
+ auto res_float8 = field("res_float8", arrow::float64());
+ auto res_int4 = field("castINT", arrow::int32());
+ auto res_int8 = field("castBIGINT", arrow::int64());
+
+ // Build expression
+ auto cast_expr_float4 =
+ TreeExprBuilder::MakeExpression("castFLOAT4", {field0}, res_float4);
+ auto cast_expr_float8 =
+ TreeExprBuilder::MakeExpression("castFLOAT8", {field0}, res_float8);
+ auto cast_expr_int4 = TreeExprBuilder::MakeExpression("castINT", {field0}, res_int4);
+ auto cast_expr_int8 = TreeExprBuilder::MakeExpression("castBIGINT", {field0}, res_int8);
+
+ std::shared_ptr<Projector> projector;
+
+ // {cast_expr_float4, cast_expr_float8, cast_expr_int4, cast_expr_int8}
+ auto status = Projector::Make(
+ schema, {cast_expr_float4, cast_expr_float8, cast_expr_int4, cast_expr_int8},
+ TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+
+ // Last validity is false and the cast functions throw error when input is empty. Should
+ // not be evaluated due to addition of NativeFunction::kCanReturnErrors
+ auto array0 = MakeArrowArrayUtf8({"1", "2", "3", ""}, {true, true, true, false});
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ auto out_float4 = MakeArrowArrayFloat32({1, 2, 3, 0}, {true, true, true, false});
+ auto out_float8 = MakeArrowArrayFloat64({1, 2, 3, 0}, {true, true, true, false});
+ auto out_int4 = MakeArrowArrayInt32({1, 2, 3, 0}, {true, true, true, false});
+ auto out_int8 = MakeArrowArrayInt64({1, 2, 3, 0}, {true, true, true, false});
+
+ arrow::ArrayVector outputs;
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ EXPECT_ARROW_ARRAY_EQUALS(out_float4, outputs.at(0));
+ EXPECT_ARROW_ARRAY_EQUALS(out_float8, outputs.at(1));
+ EXPECT_ARROW_ARRAY_EQUALS(out_int4, outputs.at(2));
+ EXPECT_ARROW_ARRAY_EQUALS(out_int8, outputs.at(3));
+}
+
+TEST_F(TestProjector, TestCastBitFunction) {
+ auto field0 = field("f0", arrow::utf8());
+ auto schema = arrow::schema({field0});
+
+ // output fields
+ auto res_bit = field("res_bit", arrow::boolean());
+
+ // Build expression
+ auto cast_bit = TreeExprBuilder::MakeExpression("castBIT", {field0}, res_bit);
+
+ std::shared_ptr<Projector> projector;
+
+ auto status = Projector::Make(schema, {cast_bit}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data: numeric and textual boolean spellings
+ int num_records = 4;
+ auto arr = MakeArrowArrayUtf8({"1", "true", "false", "0"}, {true, true, true, true});
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {arr});
+
+ auto out = MakeArrowArrayBool({true, true, false, false}, {true, true, true, true});
+
+ arrow::ArrayVector outputs;
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ EXPECT_ARROW_ARRAY_EQUALS(out, outputs.at(0));
+}
+
+// Test to ensure behaviour of cast functions when the validity is false for an input. The
+// function should not run for that input.
+TEST_F(TestProjector, TestCastVarbinaryFunction) {
+ auto field0 = field("f0", arrow::binary());
+ auto schema = arrow::schema({field0});
+
+ // output fields
+ auto res_int4 = field("res_int4", arrow::int32());
+ auto res_int8 = field("res_int8", arrow::int64());
+ auto res_float4 = field("res_float4", arrow::float32());
+ auto res_float8 = field("res_float8", arrow::float64());
+
+ // Build expression
+ auto cast_expr_int4 = TreeExprBuilder::MakeExpression("castINT", {field0}, res_int4);
+ auto cast_expr_int8 = TreeExprBuilder::MakeExpression("castBIGINT", {field0}, res_int8);
+ auto cast_expr_float4 =
+ TreeExprBuilder::MakeExpression("castFLOAT4", {field0}, res_float4);
+ auto cast_expr_float8 =
+ TreeExprBuilder::MakeExpression("castFLOAT8", {field0}, res_float8);
+
+ std::shared_ptr<Projector> projector;
+
+ // {cast_expr_int4, cast_expr_int8, cast_expr_float4, cast_expr_float8}
+ auto status = Projector::Make(
+ schema, {cast_expr_int4, cast_expr_int8, cast_expr_float4, cast_expr_float8},
+ TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+
+ // Last validity is false, so the last row ("4") is expected to come out as null in
+ // every output rather than being cast.
+ auto array0 =
+ MakeArrowArrayBinary({"37", "-99999", "99999", "4"}, {true, true, true, false});
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ auto out_int4 = MakeArrowArrayInt32({37, -99999, 99999, 0}, {true, true, true, false});
+ auto out_int8 = MakeArrowArrayInt64({37, -99999, 99999, 0}, {true, true, true, false});
+ auto out_float4 =
+ MakeArrowArrayFloat32({37, -99999, 99999, 0}, {true, true, true, false});
+ auto out_float8 =
+ MakeArrowArrayFloat64({37, -99999, 99999, 0}, {true, true, true, false});
+
+ arrow::ArrayVector outputs;
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ EXPECT_ARROW_ARRAY_EQUALS(out_int4, outputs.at(0));
+ EXPECT_ARROW_ARRAY_EQUALS(out_int8, outputs.at(1));
+ EXPECT_ARROW_ARRAY_EQUALS(out_float4, outputs.at(2));
+ EXPECT_ARROW_ARRAY_EQUALS(out_float8, outputs.at(3));
+}
+
+TEST_F(TestProjector, TestToDate) {
+ // schema for input fields
+ auto field0 = field("f0", arrow::utf8());
+ auto field_node = std::make_shared<FieldNode>(field0);
+ auto schema = arrow::schema({field0});
+
+ // output fields
+ auto field_result = field("res", arrow::date64());
+
+ auto pattern_node = std::make_shared<LiteralNode>(
+ arrow::utf8(), LiteralHolder(std::string("YYYY-MM-DD")), false);
+
+ // Build expression: to_date(f0, "YYYY-MM-DD")
+ auto fn_node = TreeExprBuilder::MakeFunction("to_date", {field_node, pattern_node},
+ arrow::date64());
+ auto expr = TreeExprBuilder::MakeExpression(fn_node, field_result);
+
+ // Build a projector for the expressions.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data; the unparseable string is marked null
+ int num_records = 3;
+ auto array0 =
+ MakeArrowArrayUtf8({"1986-12-01", "2012-12-01", "invalid"}, {true, true, false});
+ // expected output
+ auto exp = MakeArrowArrayDate64({533779200000, 1354320000000, 0}, {true, true, false});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+// ARROW-11617
+TEST_F(TestProjector, TestIfElseOpt) {
+ // schema for input
+ auto field0 = field("f0", int32());
+ auto field1 = field("f1", int32());
+ auto field2 = field("f2", int32());
+ auto schema = arrow::schema({field0, field1, field2});
+
+ auto f0 = std::make_shared<FieldNode>(field0);
+ auto f1 = std::make_shared<FieldNode>(field1);
+ auto f2 = std::make_shared<FieldNode>(field2);
+
+ // output fields
+ auto field_result = field("out", int32());
+
+ // Expr - (f0, f1 - null; f2 non null)
+ //
+ // if (is not null(f0))
+ // then f1
+ // else add((
+ // if (is not null (f1))
+ // then f1
+ // else f2
+ // ), f1)
+
+ auto cond_node_inner = TreeExprBuilder::MakeFunction("isnotnull", {f1}, boolean());
+ auto if_node_inner = TreeExprBuilder::MakeIf(cond_node_inner, f1, f2, int32());
+
+ auto cond_node_outer = TreeExprBuilder::MakeFunction("isnotnull", {f0}, boolean());
+ auto else_node_outer =
+ TreeExprBuilder::MakeFunction("add", {if_node_inner, f1}, int32());
+
+ auto if_node_outer =
+ TreeExprBuilder::MakeIf(cond_node_outer, f1, else_node_outer, int32());
+ auto expr = TreeExprBuilder::MakeExpression(if_node_outer, field_result);
+
+ // Build a projector for the expressions.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data: f0 and f1 are null, f2 is 99
+ int num_records = 1;
+ auto array0 = MakeArrowArrayInt32({0}, {false});
+ auto array1 = MakeArrowArrayInt32({0}, {false});
+ auto array2 = MakeArrowArrayInt32({99}, {true});
+ // expected output: null
+ auto exp = MakeArrowArrayInt32({0}, {false});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2});
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestRepeat) {
+ // schema for input fields: the string and the repeat count
+ auto field0 = field("f0", arrow::utf8());
+ auto field1 = field("f1", arrow::int32());
+ auto schema = arrow::schema({field0, field1});
+
+ // output fields
+ auto field_repeat = field("repeat", arrow::utf8());
+
+ // Build expression
+ auto repeat_expr =
+ TreeExprBuilder::MakeExpression("repeat", {field0, field1}, field_repeat);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {repeat_expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data (includes an empty string repeated 10x)
+ int num_records = 5;
+ auto array0 =
+ MakeArrowArrayUtf8({"ab", "a", "car", "valid", ""}, {true, true, true, true, true});
+ auto array1 = MakeArrowArrayInt32({2, 1, 3, 2, 10}, {true, true, true, true, true});
+ // expected output
+ auto exp_repeat = MakeArrowArrayUtf8({"abab", "a", "carcarcar", "validvalid", ""},
+ {true, true, true, true, true});
+
+ // prepare input record batch
+ auto in = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_repeat, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestLpad) {
+ // schema for input fields: the string, the target length and the fill text
+ auto field0 = field("f0", arrow::utf8());
+ auto field1 = field("f1", arrow::int32());
+ auto field2 = field("f2", arrow::utf8());
+ auto schema = arrow::schema({field0, field1, field2});
+
+ // output fields
+ auto field_lpad = field("lpad", arrow::utf8());
+
+ // Build expression
+ auto lpad_expr =
+ TreeExprBuilder::MakeExpression("lpad", {field0, field1, field2}, field_lpad);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {lpad_expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 7;
+ auto array0 = MakeArrowArrayUtf8({"ab", "a", "ab", "invalid", "valid", "invalid", ""},
+ {true, true, true, true, true, true, true});
+ auto array1 = MakeArrowArrayInt32({1, 5, 3, 12, 0, 2, 10},
+ {true, true, true, true, true, true, true});
+ auto array2 = MakeArrowArrayUtf8({"z", "z", "c", "valid", "invalid", "invalid", ""},
+ {true, true, true, true, true, true, true});
+ // expected output: fill text is prepended; shorter target lengths truncate the input
+ auto exp_lpad = MakeArrowArrayUtf8({"a", "zzzza", "cab", "validinvalid", "", "in", ""},
+ {true, true, true, true, true, true, true});
+
+ // prepare input record batch
+ auto in = arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2});
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_lpad, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestRpad) {
+ // schema for input fields: the string, the target length and the fill text
+ auto field0 = field("f0", arrow::utf8());
+ auto field1 = field("f1", arrow::int32());
+ auto field2 = field("f2", arrow::utf8());
+ auto schema = arrow::schema({field0, field1, field2});
+
+ // output fields
+ auto field_rpad = field("rpad", arrow::utf8());
+
+ // Build expression
+ auto rpad_expr =
+ TreeExprBuilder::MakeExpression("rpad", {field0, field1, field2}, field_rpad);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {rpad_expr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 7;
+ auto array0 = MakeArrowArrayUtf8({"ab", "a", "ab", "invalid", "valid", "invalid", ""},
+ {true, true, true, true, true, true, true});
+ auto array1 = MakeArrowArrayInt32({1, 5, 3, 12, 0, 2, 10},
+ {true, true, true, true, true, true, true});
+ auto array2 = MakeArrowArrayUtf8({"z", "z", "c", "valid", "invalid", "invalid", ""},
+ {true, true, true, true, true, true, true});
+ // expected output: fill text is appended; shorter target lengths truncate the input
+ auto exp_rpad = MakeArrowArrayUtf8({"a", "azzzz", "abc", "invalidvalid", "", "in", ""},
+ {true, true, true, true, true, true, true});
+
+ // prepare input record batch
+ auto in = arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2});
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_rpad, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestBinRepresentation) {
+ // schema for input fields
+ auto field0 = field("f0", arrow::int64());
+ auto schema = arrow::schema({field0});
+
+ // output fields
+ auto field_result = field("bin", arrow::utf8());
+
+ // Build expression
+ auto myexpr = TreeExprBuilder::MakeExpression("bin", {field0}, field_result);
+
+ // Build a projector for the expressions.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {myexpr}, TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data (includes a negative value)
+ int num_records = 3;
+ auto array0 = MakeArrowArrayInt64({7, -28550, 58117}, {true, true, true});
+ // expected output: binary digits as text; the negative value uses all 64 bits
+ auto exp = MakeArrowArrayUtf8(
+ {"111", "1111111111111111111111111111111111111111111111111001000001111010",
+ "1110001100000101"},
+ {true, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestBigIntCastFunction) {
+ // input fields
+ auto field0 = field("f0", arrow::float32());
+ auto field1 = field("f1", arrow::float64());
+ auto field2 = field("f2", arrow::day_time_interval());
+ auto field3 = field("f3", arrow::month_interval());
+ auto schema = arrow::schema({field0, field1, field2, field3});
+
+ // output fields
+ auto res_int64 = field("res", arrow::int64());
+
+ // Build expression
+ auto cast_expr_float4 =
+ TreeExprBuilder::MakeExpression("castBIGINT", {field0}, res_int64);
+ auto cast_expr_float8 =
+ TreeExprBuilder::MakeExpression("castBIGINT", {field1}, res_int64);
+ auto cast_expr_day_interval =
+ TreeExprBuilder::MakeExpression("castBIGINT", {field2}, res_int64);
+ auto cast_expr_year_interval =
+ TreeExprBuilder::MakeExpression("castBIGINT", {field3}, res_int64);
+
+ std::shared_ptr<Projector> projector;
+
+ // {cast_expr_float4, cast_expr_float8, cast_expr_day_interval,
+ // cast_expr_year_interval}
+ auto status = Projector::Make(schema,
+ {cast_expr_float4, cast_expr_float8,
+ cast_expr_day_interval, cast_expr_year_interval},
+ TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+
+ // Last validity is false for every input column, so the last row of every output is
+ // expected to be null as well.
+ auto array0 =
+ MakeArrowArrayFloat32({6.6f, -6.6f, 9.999999f, 0}, {true, true, true, false});
+ auto array1 =
+ MakeArrowArrayFloat64({6.6, -6.6, 9.99999999999, 0}, {true, true, true, false});
+ auto array2 = MakeArrowArrayInt64({100, 25, -0, 0}, {true, true, true, false});
+ auto array3 = MakeArrowArrayInt32({25, -25, -0, 0}, {true, true, true, false});
+ auto in_batch =
+ arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2, array3});
+
+ auto out_float4 = MakeArrowArrayInt64({7, -7, 10, 0}, {true, true, true, false});
+ auto out_float8 = MakeArrowArrayInt64({7, -7, 10, 0}, {true, true, true, false});
+ auto out_days_interval =
+ MakeArrowArrayInt64({8640000000, 2160000000, 0, 0}, {true, true, true, false});
+ auto out_year_interval = MakeArrowArrayInt64({2, -2, 0, 0}, {true, true, true, false});
+
+ arrow::ArrayVector outputs;
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ EXPECT_ARROW_ARRAY_EQUALS(out_float4, outputs.at(0));
+ EXPECT_ARROW_ARRAY_EQUALS(out_float8, outputs.at(1));
+ EXPECT_ARROW_ARRAY_EQUALS(out_days_interval, outputs.at(2));
+ EXPECT_ARROW_ARRAY_EQUALS(out_year_interval, outputs.at(3));
+}
+
+TEST_F(TestProjector, TestIntCastFunction) {
+ // input fields
+ auto field0 = field("f0", arrow::float32());
+ auto field1 = field("f1", arrow::float64());
+ auto field2 = field("f2", arrow::month_interval());
+ auto schema = arrow::schema({field0, field1, field2});
+
+ // output fields
+ auto res_int32 = field("res", arrow::int32());
+
+ // Build expression
+ auto cast_expr_float4 = TreeExprBuilder::MakeExpression("castINT", {field0}, res_int32);
+ auto cast_expr_float8 = TreeExprBuilder::MakeExpression("castINT", {field1}, res_int32);
+ auto cast_expr_year_interval =
+ TreeExprBuilder::MakeExpression("castINT", {field2}, res_int32);
+
+ std::shared_ptr<Projector> projector;
+
+ // {cast_expr_float4, cast_expr_float8, cast_expr_year_interval}
+ // -- the outputs below are checked in this same order
+ auto status = Projector::Make(
+ schema, {cast_expr_float4, cast_expr_float8, cast_expr_year_interval},
+ TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+
+ // Last validity is false for every input column, so the last row of every output is
+ // expected to be null as well.
+ auto array0 =
+ MakeArrowArrayFloat32({6.6f, -6.6f, 9.999999f, 0}, {true, true, true, false});
+ auto array1 =
+ MakeArrowArrayFloat64({6.6, -6.6, 9.99999999999, 0}, {true, true, true, false});
+ auto array2 = MakeArrowArrayInt32({25, -25, -0, 0}, {true, true, true, false});
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2});
+
+ auto out_float4 = MakeArrowArrayInt32({7, -7, 10, 0}, {true, true, true, false});
+ auto out_float8 = MakeArrowArrayInt32({7, -7, 10, 0}, {true, true, true, false});
+ auto out_year_interval = MakeArrowArrayInt32({2, -2, 0, 0}, {true, true, true, false});
+
+ arrow::ArrayVector outputs;
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ EXPECT_ARROW_ARRAY_EQUALS(out_float4, outputs.at(0));
+ EXPECT_ARROW_ARRAY_EQUALS(out_float8, outputs.at(1));
+ EXPECT_ARROW_ARRAY_EQUALS(out_year_interval, outputs.at(2));
+}
+
+TEST_F(TestProjector, TestCastNullableIntYearInterval) {
+ // input fields
+ auto field1 = field("f1", arrow::month_interval());
+ auto schema = arrow::schema({field1});
+
+ // output fields
+ auto res_int32 = field("res", arrow::int32());
+ auto res_int64 = field("res", arrow::int64());
+
+ // Build expression
+ auto cast_expr_int32 =
+ TreeExprBuilder::MakeExpression("castNULLABLEINT", {field1}, res_int32);
+ auto cast_expr_int64 =
+ TreeExprBuilder::MakeExpression("castNULLABLEBIGINT", {field1}, res_int64);
+
+ std::shared_ptr<Projector> projector;
+
+ // {cast_expr_int32, cast_expr_int64}
+ // -- the outputs below are checked in this same order
+ auto status = Projector::Make(schema, {cast_expr_int32, cast_expr_int64},
+ TestConfiguration(), &projector);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+
+ // Last validity is false, so the last row of both outputs is expected to be null as
+ // well.
+ auto array0 = MakeArrowArrayInt32({12, -24, -0, 0}, {true, true, true, false});
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ auto out_int32 = MakeArrowArrayInt32({1, -2, -0, 0}, {true, true, true, false});
+ auto out_int64 = MakeArrowArrayInt64({1, -2, -0, 0}, {true, true, true, false});
+
+ arrow::ArrayVector outputs;
+
+ // Evaluate expression; must succeed (fatal on failure: outputs is read below)
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ ASSERT_TRUE(status.ok()) << status.message();
+
+ EXPECT_ARROW_ARRAY_EQUALS(out_int32, outputs.at(0));
+ EXPECT_ARROW_ARRAY_EQUALS(out_int64, outputs.at(1));
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/test_util.h b/src/arrow/cpp/src/gandiva/tests/test_util.h
new file mode 100644
index 000000000..54270436c
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/test_util.h
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <chrono>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/testing/gtest_util.h"
+#include "gandiva/arrow.h"
+#include "gandiva/configuration.h"
+
+#pragma once
+
+namespace gandiva {
+
+// Builds an Arrow array of type TYPE from parallel vectors of values and
+// validity flags, delegating to arrow::ArrayFromVector (one of the helpers in
+// arrow/testing/gtest_util.h).
+//
+// The vectors are taken by const reference so lvalue arguments are not copied.
+// Braced initializer lists still bind because callers name C_TYPE explicitly
+// (usually through the MakeArrowArray* macros).
+template <typename TYPE, typename C_TYPE>
+static inline ArrayPtr MakeArrowArray(const std::vector<C_TYPE>& values,
+                                      const std::vector<bool>& validity) {
+  ArrayPtr out;
+  arrow::ArrayFromVector<TYPE, C_TYPE>(validity, values, &out);
+  return out;
+}
+
+// Builds an Arrow array of type TYPE from `values` alone; no validity vector is
+// passed to arrow::ArrayFromVector.
+//
+// `values` is taken by const reference to avoid copying the caller's vector.
+template <typename TYPE, typename C_TYPE>
+static inline ArrayPtr MakeArrowArray(const std::vector<C_TYPE>& values) {
+  ArrayPtr out;
+  arrow::ArrayFromVector<TYPE, C_TYPE>(values, &out);
+  return out;
+}
+
+// Builds an Arrow array with an explicitly supplied data type `type` from
+// parallel vectors of values and validity flags.
+//
+// The vectors are taken by const reference (matching MakeArrowTypeArray) so
+// that callers passing named vectors do not pay for a copy on every call.
+template <typename TYPE, typename C_TYPE>
+static inline ArrayPtr MakeArrowArray(const std::shared_ptr<arrow::DataType>& type,
+                                      const std::vector<C_TYPE>& values,
+                                      const std::vector<bool>& validity) {
+  ArrayPtr out;
+  arrow::ArrayFromVector<TYPE, C_TYPE>(type, validity, values, &out);
+  return out;
+}
+
+// Creates an Arrow array of the given `type` from `values`, with `validity`
+// giving the per-slot validity flags. Forwards to arrow::ArrayFromVector.
+template <typename TYPE, typename C_TYPE>
+static inline ArrayPtr MakeArrowTypeArray(const std::shared_ptr<arrow::DataType>& type,
+                                          const std::vector<C_TYPE>& values,
+                                          const std::vector<bool>& validity) {
+  ArrayPtr result;
+  arrow::ArrayFromVector<TYPE, C_TYPE>(type, validity, values, &result);
+  return result;
+}
+
+// Convenience aliases that bind MakeArrowArray to a concrete Arrow type and its
+// C++ value type, e.g. MakeArrowArrayInt32({1, 2}, {true, false}).
+// Each macro expands to the template-id only, so it can be used with any of
+// the MakeArrowArray overloads.
+#define MakeArrowArrayBool MakeArrowArray<arrow::BooleanType, bool>
+#define MakeArrowArrayInt8 MakeArrowArray<arrow::Int8Type, int8_t>
+#define MakeArrowArrayInt16 MakeArrowArray<arrow::Int16Type, int16_t>
+#define MakeArrowArrayInt32 MakeArrowArray<arrow::Int32Type, int32_t>
+#define MakeArrowArrayInt64 MakeArrowArray<arrow::Int64Type, int64_t>
+#define MakeArrowArrayUint8 MakeArrowArray<arrow::UInt8Type, uint8_t>
+#define MakeArrowArrayUint16 MakeArrowArray<arrow::UInt16Type, uint16_t>
+#define MakeArrowArrayUint32 MakeArrowArray<arrow::UInt32Type, uint32_t>
+#define MakeArrowArrayUint64 MakeArrowArray<arrow::UInt64Type, uint64_t>
+#define MakeArrowArrayFloat32 MakeArrowArray<arrow::FloatType, float>
+#define MakeArrowArrayFloat64 MakeArrowArray<arrow::DoubleType, double>
+#define MakeArrowArrayDate64 MakeArrowArray<arrow::Date64Type, int64_t>
+#define MakeArrowArrayUtf8 MakeArrowArray<arrow::StringType, std::string>
+#define MakeArrowArrayBinary MakeArrowArray<arrow::BinaryType, std::string>
+#define MakeArrowArrayDecimal MakeArrowArray<arrow::Decimal128Type, arrow::Decimal128>
+
+// Expects array `a` (expected) to equal array `b` (actual), with NaNs treated
+// as equal to each other. On failure both arrays are printed.
+#define EXPECT_ARROW_ARRAY_EQUALS(a, b) \
+ EXPECT_TRUE((a)->Equals(b, arrow::EqualOptions().nans_equal(true))) \
+ << "expected array: " << (a)->ToString() << " actual array: " << (b)->ToString()
+
+// Like EXPECT_ARROW_ARRAY_EQUALS but compares with ApproxEquals, passing
+// `epsilon` to EqualOptions::atol (the absolute tolerance).
+#define EXPECT_ARROW_ARRAY_APPROX_EQUALS(a, b, epsilon) \
+ EXPECT_TRUE( \
+ (a)->ApproxEquals(b, arrow::EqualOptions().atol(epsilon).nans_equal(true))) \
+ << "expected array: " << (a)->ToString() << " actual array: " << (b)->ToString()
+
+// Expects type `a` (expected) to equal type `b` (actual). On failure both
+// types are printed.
+#define EXPECT_ARROW_TYPE_EQUALS(a, b) \
+ EXPECT_TRUE((a)->Equals(b)) << "expected type: " << (a)->ToString() \
+ << " actual type: " << (b)->ToString()
+
+// Returns the configuration used by the tests: whatever
+// ConfigurationBuilder::DefaultConfiguration() produces.
+static inline std::shared_ptr<Configuration> TestConfiguration() {
+  ConfigurationBuilder config_builder;
+  std::shared_ptr<Configuration> config = config_builder.DefaultConfiguration();
+  return config;
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/timed_evaluate.h b/src/arrow/cpp/src/gandiva/tests/timed_evaluate.h
new file mode 100644
index 000000000..eba0f5eb9
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/timed_evaluate.h
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <vector>
+#include "benchmark/benchmark.h"
+#include "gandiva/arrow.h"
+#include "gandiva/filter.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/generate_data.h"
+
+#pragma once
+
+// Size constants for the benchmarks. Note that THOUSAND and MILLION are the
+// binary values 1024 and 1024 * 1024, not powers of ten.
+#define THOUSAND (1024)
+#define MILLION (1024 * 1024)
+// Number of pre-generated record batches that TimedEvaluate cycles through.
+#define NUM_BATCHES 16
+
+namespace gandiva {
+
+// Draws `num_records` values from `data_generator` (one GenerateData() call per
+// value, in order) and returns them as a vector. A non-positive `num_records`
+// yields an empty vector.
+template <typename C_TYPE>
+std::vector<C_TYPE> GenerateData(int num_records, DataGenerator<C_TYPE>& data_generator) {
+  std::vector<C_TYPE> data;
+  if (num_records > 0) {
+    // The final size is known, so allocate once instead of regrowing the
+    // vector on every push_back.
+    data.reserve(static_cast<size_t>(num_records));
+  }
+
+  for (int i = 0; i < num_records; i++) {
+    data.push_back(data_generator.GenerateData());
+  }
+
+  return data;
+}
+
+// Interface implemented by the benchmark evaluators below: one operation that
+// evaluates a record batch using the supplied memory pool.
+class BaseEvaluator {
+ public:
+  // Evaluates `batch`; the returned Status reports success or failure.
+  virtual Status Evaluate(arrow::RecordBatch& batch, arrow::MemoryPool* pool) = 0;
+
+  // Virtual so that derived evaluators can be destroyed through this interface.
+  virtual ~BaseEvaluator() = default;
+};
+
+// BaseEvaluator that runs a Projector over each batch and discards the
+// projected arrays.
+class ProjectEvaluator : public BaseEvaluator {
+ public:
+  explicit ProjectEvaluator(std::shared_ptr<Projector> projector)
+      : projector_{projector} {}
+
+  // Projects `batch` into a scratch output vector and returns the projector's
+  // status; the outputs themselves are dropped when the call returns.
+  Status Evaluate(arrow::RecordBatch& batch, arrow::MemoryPool* pool) override {
+    arrow::ArrayVector discarded_outputs;
+    Status result = projector_->Evaluate(batch, pool, &discarded_outputs);
+    return result;
+  }
+
+ private:
+  std::shared_ptr<Projector> projector_;
+};
+
+// BaseEvaluator that runs a Filter over each batch, reusing one selection
+// vector across calls whenever it is large enough.
+class FilterEvaluator : public BaseEvaluator {
+ public:
+  explicit FilterEvaluator(std::shared_ptr<Filter> filter) : filter_{filter} {}
+
+  // Filters `batch` into the cached selection vector. A new int16 selection
+  // vector sized to the batch is allocated from `pool` when none exists yet or
+  // when the cached one has fewer slots than the batch has rows.
+  Status Evaluate(arrow::RecordBatch& batch, arrow::MemoryPool* pool) override {
+    const bool can_reuse =
+        selection_ != nullptr && selection_->GetMaxSlots() >= batch.num_rows();
+    if (!can_reuse) {
+      auto alloc_status = SelectionVector::MakeInt16(batch.num_rows(), pool, &selection_);
+      if (!alloc_status.ok()) {
+        return alloc_status;
+      }
+    }
+    return filter_->Evaluate(batch, selection_);
+  }
+
+ private:
+  std::shared_ptr<Filter> filter_;
+  std::shared_ptr<SelectionVector> selection_;
+};
+
+// Benchmark driver. Pre-builds NUM_BATCHES record batches of `batch_size` rows
+// (every column filled from `data_generator`, all slots valid) and then, for
+// each benchmark iteration, calls `evaluator` repeatedly until `num_records`
+// rows have been accounted for.
+//
+// Every Evaluate() call receives a full pre-built batch. For a trailing partial
+// batch only the row accounting (`num_in_batch`) shrinks, not the data passed.
+//
+// Returns Status::Invalid when `batch_size` is not positive, the first failing
+// Evaluate() status (after flagging the benchmark with SkipWithError), and
+// Status::OK() otherwise.
+template <typename TYPE, typename C_TYPE>
+Status TimedEvaluate(SchemaPtr schema, BaseEvaluator& evaluator,
+                     DataGenerator<C_TYPE>& data_generator, arrow::MemoryPool* pool,
+                     int num_records, int batch_size, benchmark::State& state) {
+  // With batch_size == 0 the drain loop below never makes progress, and a
+  // negative value would be converted into a huge vector length.
+  if (batch_size <= 0) {
+    state.SkipWithError("batch_size must be positive");
+    return Status::Invalid("batch_size must be positive, got ", batch_size);
+  }
+
+  int num_remaining = num_records;
+  int num_fields = schema->num_fields();
+  int num_calls = 0;
+
+  // Generate batches of data
+  std::shared_ptr<arrow::RecordBatch> batches[NUM_BATCHES];
+  for (int i = 0; i < NUM_BATCHES; i++) {
+    // generate data for all columns in the schema
+    std::vector<ArrayPtr> columns;
+    for (int col = 0; col < num_fields; col++) {
+      std::vector<C_TYPE> data = GenerateData<C_TYPE>(batch_size, data_generator);
+      std::vector<bool> validity(batch_size, true);
+      ArrayPtr col_data =
+          MakeArrowArray<TYPE, C_TYPE>(schema->field(col)->type(), data, validity);
+
+      columns.push_back(col_data);
+    }
+
+    // make the record batch
+    std::shared_ptr<arrow::RecordBatch> batch =
+        arrow::RecordBatch::Make(schema, batch_size, columns);
+    batches[i] = batch;
+  }
+
+  for (auto _ : state) {
+    int num_in_batch = batch_size;
+    num_remaining = num_records;
+    while (num_remaining > 0) {
+      if (batch_size > num_remaining) {
+        num_in_batch = num_remaining;
+      }
+
+      // Cycle through the pre-built batches round-robin.
+      Status status = evaluator.Evaluate(*(batches[num_calls % NUM_BATCHES]), pool);
+      if (!status.ok()) {
+        state.SkipWithError("Evaluation of the batch failed");
+        return status;
+      }
+
+      num_calls++;
+      num_remaining -= num_in_batch;
+    }
+  }
+
+  return Status::OK();
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/to_string_test.cc b/src/arrow/cpp/src/gandiva/tests/to_string_test.cc
new file mode 100644
index 000000000..55db6e92b
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/to_string_test.cc
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include <math.h>
+#include <time.h>
+#include "arrow/memory_pool.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::float64;
+using arrow::int32;
+using arrow::int64;
+
+// Fixture for the expression ToString() tests; provides the default Arrow
+// memory pool as `pool_`.
+class TestToString : public ::testing::Test {
+ public:
+  // Marked `override` so that a signature mismatch with ::testing::Test::SetUp
+  // is a compile error rather than a silently skipped setup step.
+  void SetUp() override { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+  arrow::MemoryPool* pool_;
+};
+
+// Expects that expression `e` renders, via ToString(), to the C string `str`.
+// `e` is parenthesized so that any pointer-valued expression can be passed.
+#define CHECK_EXPR_TO_STRING(e, str) EXPECT_STREQ((e)->ToString().c_str(), str)
+
+// Checks the ToString() rendering of a literal, a field, a function call, an
+// if/else expression and a boolean AND.
+TEST_F(TestToString, TestAll) {
+  // Literal: uint64 constant 100.
+  auto hundred = TreeExprBuilder::MakeLiteral(static_cast<uint64_t>(100));
+  auto hundred_expr =
+      TreeExprBuilder::MakeExpression(hundred, arrow::field("r", int64()));
+  CHECK_EXPR_TO_STRING(hundred_expr, "(const uint64) 100");
+
+  // Field reference: float64 column f0.
+  auto f0 = arrow::field("f0", float64());
+  auto f0_node = TreeExprBuilder::MakeField(f0);
+  auto f0_expr = TreeExprBuilder::MakeExpression(f0_node, f0);
+  CHECK_EXPR_TO_STRING(f0_expr, "(double) f0");
+
+  // Function call: add(f1, f2) over two int64 columns.
+  auto f1 = arrow::field("f1", int64());
+  auto f2 = arrow::field("f2", int64());
+  auto f1_node = TreeExprBuilder::MakeField(f1);
+  auto f2_node = TreeExprBuilder::MakeField(f2);
+  auto sum_node = TreeExprBuilder::MakeFunction("add", {f1_node, f2_node}, int64());
+  auto sum_expr = TreeExprBuilder::MakeExpression(sum_node, f1);
+  CHECK_EXPR_TO_STRING(sum_expr, "int64 add((int64) f1, (int64) f2)");
+
+  // If/else: if (f0 < 0.0f) f1 else f2.
+  auto zero_float = TreeExprBuilder::MakeLiteral(static_cast<float>(0));
+  auto condition =
+      TreeExprBuilder::MakeFunction("lesser_than", {f0_node, zero_float}, boolean());
+  auto when_true = TreeExprBuilder::MakeField(f1);
+  auto when_false = TreeExprBuilder::MakeField(f2);
+  auto branch_node = TreeExprBuilder::MakeIf(condition, when_true, when_false, int64());
+  auto branch_expr = TreeExprBuilder::MakeExpression(branch_node, f1);
+  CHECK_EXPR_TO_STRING(branch_expr,
+                       "if (bool lesser_than((double) f0, (const float) 0 raw(0))) { "
+                       "(int64) f1 } else { (int64) f2 }");
+
+  // Boolean AND: (f1 > 100) && (f2 == 100), sharing the literal node.
+  auto lhs =
+      TreeExprBuilder::MakeFunction("greater_than", {f1_node, hundred}, boolean());
+  auto rhs = TreeExprBuilder::MakeFunction("equals", {f2_node, hundred}, boolean());
+  auto conjunction = TreeExprBuilder::MakeAnd({lhs, rhs});
+  auto conjunction_expr =
+      TreeExprBuilder::MakeExpression(conjunction, arrow::field("f0", boolean()));
+  CHECK_EXPR_TO_STRING(conjunction_expr,
+                       "bool greater_than((int64) f1, (const uint64) 100) && bool "
+                       "equals((int64) f2, (const uint64) 100)");
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tests/utf8_test.cc b/src/arrow/cpp/src/gandiva/tests/utf8_test.cc
new file mode 100644
index 000000000..e19d6712d
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tests/utf8_test.cc
@@ -0,0 +1,751 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::date64;
+using arrow::int32;
+using arrow::int64;
+using arrow::utf8;
+
+// Fixture for the utf8 function tests; provides the default Arrow memory pool
+// as `pool_`.
+class TestUtf8 : public ::testing::Test {
+ public:
+  // Marked `override` so that a signature mismatch with ::testing::Test::SetUp
+  // is a compile error rather than a silently skipped setup step.
+  void SetUp() override { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+  arrow::MemoryPool* pool_;
+};
+
+// Projects octet_length(a), octet_length(a) == bit_length(a) / 8 and length(a)
+// over a utf8 column that includes a null and a multi-byte string.
+TEST_F(TestUtf8, TestSimple) {
+  // Single utf8 input column.
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+
+  // One output column per expression.
+  auto res_octets = field("res1", int32());
+  auto res_consistent = field("res2", boolean());
+  auto res_chars = field("res3", int32());
+
+  // Expression 1: octet_length(a)
+  auto octets_expr =
+      TreeExprBuilder::MakeExpression("octet_length", {field_a}, res_octets);
+
+  // Expression 2: octet_length(a) == bit_length(a) / 8
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto octets_node = TreeExprBuilder::MakeFunction("octet_length", {node_a}, int32());
+  auto eight = TreeExprBuilder::MakeLiteral((int32_t)8);
+  auto bits_node = TreeExprBuilder::MakeFunction("bit_length", {node_a}, int32());
+  auto bits_over_8 =
+      TreeExprBuilder::MakeFunction("divide", {bits_node, eight}, int32());
+  auto sizes_agree =
+      TreeExprBuilder::MakeFunction("equal", {octets_node, bits_over_8}, boolean());
+  auto consistent_expr = TreeExprBuilder::MakeExpression(sizes_agree, res_consistent);
+
+  // Expression 3: length(a)
+  auto chars_expr = TreeExprBuilder::MakeExpression("length", {field_a}, res_chars);
+
+  // Compile all three expressions into one projector.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {octets_expr, consistent_expr, chars_expr},
+                                TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Five sample rows; the third is null and the last is a multi-byte string.
+  int num_records = 5;
+  auto array_a = MakeArrowArrayUtf8({"foo", "hello", "bye", "hi", "मदन"},
+                                    {true, true, false, true, true});
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Expected results: the last string is 9 octets but 3 characters long.
+  auto expected_octets =
+      MakeArrowArrayInt32({3, 5, 0, 2, 9}, {true, true, false, true, true});
+  auto expected_consistent = MakeArrowArrayBool({true, true, false, true, true},
+                                                {true, true, false, true, true});
+  auto expected_chars =
+      MakeArrowArrayInt32({3, 5, 0, 2, 3}, {true, true, false, true, true});
+
+  // Run the projection.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  EXPECT_TRUE(status.ok());
+
+  // Compare each output column with its expectation.
+  EXPECT_ARROW_ARRAY_EQUALS(expected_octets, outputs.at(0));
+  EXPECT_ARROW_ARRAY_EQUALS(expected_consistent, outputs.at(1));
+  EXPECT_ARROW_ARRAY_EQUALS(expected_chars, outputs.at(2));
+}
+
+// Compares a utf8 column against the string literal "hello" with equal().
+TEST_F(TestUtf8, TestLiteral) {
+  // Single utf8 input column and a boolean result column.
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+  auto result_field = field("res", boolean());
+
+  // Expression: a == "hello"
+  auto column_node = TreeExprBuilder::MakeField(field_a);
+  auto hello_literal = TreeExprBuilder::MakeStringLiteral("hello");
+  auto equals_hello =
+      TreeExprBuilder::MakeFunction("equal", {column_node, hello_literal}, boolean());
+  auto expr = TreeExprBuilder::MakeExpression(equals_hello, result_field);
+
+  // Compile the expression.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Four sample rows; the last one is null.
+  int num_records = 4;
+  auto array_a =
+      MakeArrowArrayUtf8({"foo", "hello", "bye", "hi"}, {true, true, true, false});
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Only the second row matches; the null row stays null.
+  auto expected =
+      MakeArrowArrayBool({false, true, false, false}, {true, true, true, false});
+
+  // Run the projection.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  EXPECT_TRUE(status.ok());
+
+  EXPECT_ARROW_ARRAY_EQUALS(expected, outputs.at(0));
+}
+
+// Compares a utf8 column against a null utf8 literal; every result is expected
+// to be null.
+TEST_F(TestUtf8, TestNullLiteral) {
+  // Single utf8 input column and a boolean result column.
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+  auto result_field = field("res", boolean());
+
+  // Expression: a == null
+  auto column_node = TreeExprBuilder::MakeField(field_a);
+  auto null_literal = TreeExprBuilder::MakeNull(arrow::utf8());
+  auto equals_null =
+      TreeExprBuilder::MakeFunction("equal", {column_node, null_literal}, boolean());
+  auto expr = TreeExprBuilder::MakeExpression(equals_null, result_field);
+
+  // Compile the expression.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Four sample rows; the last one is null.
+  int num_records = 4;
+  auto array_a =
+      MakeArrowArrayUtf8({"foo", "hello", "bye", "hi"}, {true, true, true, false});
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Every output slot is expected to be null.
+  auto expected =
+      MakeArrowArrayBool({false, false, false, false}, {false, false, false, false});
+
+  // Run the projection.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  EXPECT_TRUE(status.ok());
+
+  EXPECT_ARROW_ARRAY_EQUALS(expected, outputs.at(0));
+}
+
+// Matches a utf8 column against the pattern "%spark%" with like().
+TEST_F(TestUtf8, TestLike) {
+  // Single utf8 input column and a boolean result column.
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+  auto result_field = field("res", boolean());
+
+  // Expression: like(a, "%spark%")
+  auto column_node = TreeExprBuilder::MakeField(field_a);
+  auto pattern = TreeExprBuilder::MakeStringLiteral("%spark%");
+  auto like_node =
+      TreeExprBuilder::MakeFunction("like", {column_node, pattern}, boolean());
+  auto expr = TreeExprBuilder::MakeExpression(like_node, result_field);
+
+  // Compile the expression.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Four non-null sample rows.
+  int num_records = 4;
+  auto array_a = MakeArrowArrayUtf8({"park", "sparkle", "bright spark and fire", "spark"},
+                                    {true, true, true, true});
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Every row except "park" is expected to match.
+  auto expected =
+      MakeArrowArrayBool({false, true, true, true}, {true, true, true, true});
+
+  // Run the projection.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  EXPECT_ARROW_ARRAY_EQUALS(expected, outputs.at(0));
+}
+
+// Matches a utf8 column with the three-argument like(), using backslash as the
+// escape character so the pattern contains a literal '%'.
+TEST_F(TestUtf8, TestLikeWithEscape) {
+  // Single utf8 input column and a boolean result column.
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+  auto result_field = field("res", boolean());
+
+  // Expression: like(a, "%pa\%rk%", "\")
+  auto column_node = TreeExprBuilder::MakeField(field_a);
+  auto pattern = TreeExprBuilder::MakeStringLiteral("%pa\\%rk%");
+  auto escape = TreeExprBuilder::MakeStringLiteral("\\");
+  auto like_node =
+      TreeExprBuilder::MakeFunction("like", {column_node, pattern, escape}, boolean());
+  auto expr = TreeExprBuilder::MakeExpression(like_node, result_field);
+
+  // Compile the expression.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Four non-null sample rows; two of them contain the text "pa%rk".
+  int num_records = 4;
+  auto array_a = MakeArrowArrayUtf8(
+      {"park", "spa%rkle", "bright spa%rk and fire", "spark"}, {true, true, true, true});
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Only the rows containing a literal '%' between "pa" and "rk" should match.
+  auto expected =
+      MakeArrowArrayBool({false, true, true, false}, {true, true, true, true});
+
+  // Run the projection.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  EXPECT_ARROW_ARRAY_EQUALS(expected, outputs.at(0));
+}
+
+// Exercises like() with a prefix pattern ("spark%") and a suffix pattern
+// ("%spark") over the same utf8 column.
+TEST_F(TestUtf8, TestBeginsEnds) {
+  // Single utf8 input column.
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+
+  // One boolean output per pattern.
+  auto res_prefix = field("res1", boolean());
+  auto res_suffix = field("res2", boolean());
+
+  auto column_node = TreeExprBuilder::MakeField(field_a);
+
+  // Expression 1: like(a, "spark%")
+  auto prefix_pattern = TreeExprBuilder::MakeStringLiteral("spark%");
+  auto prefix_like =
+      TreeExprBuilder::MakeFunction("like", {column_node, prefix_pattern}, boolean());
+  auto prefix_expr = TreeExprBuilder::MakeExpression(prefix_like, res_prefix);
+
+  // Expression 2: like(a, "%spark")
+  auto suffix_pattern = TreeExprBuilder::MakeStringLiteral("%spark");
+  auto suffix_like =
+      TreeExprBuilder::MakeFunction("like", {column_node, suffix_pattern}, boolean());
+  auto suffix_expr = TreeExprBuilder::MakeExpression(suffix_like, res_suffix);
+
+  // Compile both expressions into one projector.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {prefix_expr, suffix_expr}, TestConfiguration(),
+                                &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Four non-null sample rows.
+  int num_records = 4;
+  auto array_a =
+      MakeArrowArrayUtf8({"park", "sparkle", "bright spark and fire", "fiery spark"},
+                         {true, true, true, true});
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Only "sparkle" starts with "spark"; only "fiery spark" ends with it.
+  auto expected_prefix =
+      MakeArrowArrayBool({false, true, false, false}, {true, true, true, true});
+  auto expected_suffix =
+      MakeArrowArrayBool({false, false, false, true}, {true, true, true, true});
+
+  // Run the projection.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  EXPECT_ARROW_ARRAY_EQUALS(expected_prefix, outputs.at(0));
+  EXPECT_ARROW_ARRAY_EQUALS(expected_suffix, outputs.at(1));
+}
+
+// Feeds the utf8 result of upper(a) into like(), so the intermediate string is
+// produced inside the expression rather than read from an input column.
+TEST_F(TestUtf8, TestInternalAllocs) {
+  // Single utf8 input column and a boolean result column.
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+  auto result_field = field("res", boolean());
+
+  // Expression: like(upper(a), "%SPARK%")
+  auto column_node = TreeExprBuilder::MakeField(field_a);
+  auto upper_node = TreeExprBuilder::MakeFunction("upper", {column_node}, utf8());
+  auto pattern = TreeExprBuilder::MakeStringLiteral("%SPARK%");
+  auto like_node =
+      TreeExprBuilder::MakeFunction("like", {upper_node, pattern}, boolean());
+  auto expr = TreeExprBuilder::MakeExpression(like_node, result_field);
+
+  // Compile the expression.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Five sample rows; the third is null and the last is a multi-byte string.
+  int num_records = 5;
+  auto array_a = MakeArrowArrayUtf8(
+      {"park", "Sparkle", "bright spark and fire", "fiery SPARK", "मदन"},
+      {true, true, false, true, true});
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Rows two and four match once upper-cased; the null row stays null.
+  auto expected = MakeArrowArrayBool({false, true, false, true, false},
+                                     {true, true, false, true, true});
+
+  // Run the projection.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  EXPECT_ARROW_ARRAY_EQUALS(expected, outputs.at(0));
+}
+
+// Checks extractYear(castDATE(a)) on a utf8 column: a batch containing an
+// invalid, non-null date must fail with an ExecutionError, and a batch of
+// valid dates must produce the expected years.
+TEST_F(TestUtf8, TestCastDate) {
+  // schema for input fields
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+
+  // output fields
+  auto res_1 = field("res1", int64());
+
+  // build expressions.
+  // extractYear(castDATE(a))
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto cast_function = TreeExprBuilder::MakeFunction("castDATE", {node_a}, date64());
+  auto extract_year =
+      TreeExprBuilder::MakeFunction("extractYear", {cast_function}, int64());
+  auto expr = TreeExprBuilder::MakeExpression(extract_year, res_1);
+
+  // Build a projector for the expressions.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // First batch: the non-null "67-45-11" is not a valid date, so evaluation is
+  // expected to fail as a whole. No output array is checked for this batch.
+  int num_records = 4;
+  auto array_a = MakeArrowArrayUtf8({"1967-12-1", "67-12-01", "incorrect", "67-45-11"},
+                                    {true, true, false, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  EXPECT_EQ(status.code(), StatusCode::ExecutionError);
+  std::string expected_error = "Not a valid date value ";
+  EXPECT_TRUE(status.message().find(expected_error) != std::string::npos);
+
+  // Second batch: only valid dates. The two-digit years "67" and "91" are
+  // expected to come out as 2067 and 1991.
+  auto array_a_2 = MakeArrowArrayUtf8({"1967-12-1", "67-12-01", "67-1-1", "91-1-1"},
+                                      {true, true, true, true});
+  auto exp_2 = MakeArrowArrayInt64({1967, 2067, 2067, 1991}, {true, true, true, true});
+  auto in_batch_2 = arrow::RecordBatch::Make(schema, num_records, {array_a_2});
+  arrow::ArrayVector outputs2;
+  status = projector->Evaluate(*in_batch_2, pool_, &outputs2);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp_2, outputs2.at(0));
+}
+
+// Checks extractYear(to_date(a, "YYYY-MM-DD", 1)): with the third argument set
+// to 1 an unparsable value is expected to yield null instead of failing the
+// batch (presumably a suppress-errors flag; confirm against to_date's docs).
+TEST_F(TestUtf8, TestToDateNoError) {
+  // Single utf8 input column and an int64 result column.
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+  auto year_field = field("res1", int64());
+
+  // Expression: extractYear(to_date(a, "YYYY-MM-DD", 1))
+  auto column_node = TreeExprBuilder::MakeField(field_a);
+  auto format_node = TreeExprBuilder::MakeStringLiteral("YYYY-MM-DD");
+  auto flag_node = TreeExprBuilder::MakeLiteral(1);
+  auto to_date_node = TreeExprBuilder::MakeFunction(
+      "to_date", {column_node, format_node, flag_node}, date64());
+  auto year_node =
+      TreeExprBuilder::MakeFunction("extractYear", {to_date_node}, int64());
+  auto expr = TreeExprBuilder::MakeExpression(year_node, year_field);
+
+  // Compile the expression.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // First batch: one null row and one non-null value that cannot be parsed.
+  int num_records = 4;
+  auto first_input = MakeArrowArrayUtf8(
+      {"1967-12-1", "67-12-01", "incorrect", "67-45-11"}, {true, true, false, true});
+  auto first_batch = arrow::RecordBatch::Make(schema, num_records, {first_input});
+
+  // Both the null row and the unparsable row are expected to come out null.
+  auto first_expected =
+      MakeArrowArrayInt64({1967, 67, 0, 0}, {true, true, false, false});
+
+  arrow::ArrayVector first_outputs;
+  status = projector->Evaluate(*first_batch, pool_, &first_outputs);
+  EXPECT_TRUE(status.ok()) << status.message();
+  EXPECT_ARROW_ARRAY_EQUALS(first_expected, first_outputs.at(0));
+
+  // Second batch: only valid dates with four-digit years.
+  auto second_input = MakeArrowArrayUtf8(
+      {"1967-12-1", "1967-12-01", "1967-11-11", "1991-11-11"}, {true, true, true, true});
+  auto second_batch = arrow::RecordBatch::Make(schema, num_records, {second_input});
+  auto second_expected =
+      MakeArrowArrayInt64({1967, 1967, 1967, 1991}, {true, true, true, true});
+
+  arrow::ArrayVector second_outputs;
+  status = projector->Evaluate(*second_batch, pool_, &second_outputs);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  EXPECT_ARROW_ARRAY_EQUALS(second_expected, second_outputs.at(0));
+}
+
+// Checks extractYear(to_date(a, "YYYY-MM-DD", 0)): with the third argument set
+// to 0 an unparsable, non-null value must fail the whole evaluation with an
+// ExecutionError that names the offending value.
+TEST_F(TestUtf8, TestToDateError) {
+  // schema for input fields
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+
+  // output fields
+  auto res_1 = field("res1", int64());
+
+  // build expressions.
+  // extractYear(to_date(a, "YYYY-MM-DD", 0))
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto node_b = TreeExprBuilder::MakeStringLiteral("YYYY-MM-DD");
+  auto node_c = TreeExprBuilder::MakeLiteral(0);
+
+  auto cast_function =
+      TreeExprBuilder::MakeFunction("to_date", {node_a, node_b, node_c}, date64());
+  auto extract_year =
+      TreeExprBuilder::MakeFunction("extractYear", {cast_function}, int64());
+  auto expr = TreeExprBuilder::MakeExpression(extract_year, res_1);
+
+  // Build a projector for the expressions.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Sample data: the non-null "67-45-11" cannot be parsed with the format.
+  // Evaluation is expected to fail, so no output array is checked.
+  int num_records = 4;
+  auto array_a = MakeArrowArrayUtf8({"1967-12-1", "67-12-01", "incorrect", "67-45-11"},
+                                    {true, true, false, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  EXPECT_EQ(status.code(), StatusCode::ExecutionError);
+  std::string expected_error = "Error parsing value 67-45-11 for given format";
+  EXPECT_TRUE(status.message().find(expected_error) != std::string::npos)
+      << status.message();
+}
+
+// Checks isnull(a) and isnotnull(a) on a utf8 column containing one null.
+TEST_F(TestUtf8, TestIsNull) {
+  // schema for input fields
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+
+  // build expressions
+  auto exprs = std::vector<ExpressionPtr>{
+      TreeExprBuilder::MakeExpression("isnull", {field_a}, field("is_null", boolean())),
+      TreeExprBuilder::MakeExpression("isnotnull", {field_a},
+                                      field("is_not_null", boolean())),
+  };
+
+  // Build a projector for the expressions. ASSERT_OK stops the test here if
+  // the build fails, so `projector` is never dereferenced while unset.
+  std::shared_ptr<Projector> projector;
+  ASSERT_OK(Projector::Make(schema, exprs, TestConfiguration(), &projector));
+
+  // Create a row-batch with some sample data; the third row is null.
+  int num_records = 4;
+  auto array_a = MakeArrowArrayUtf8({"hello", "world", "incorrect", "universe"},
+                                    {true, true, false, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Evaluate expression. The status must be checked before `outputs` is read:
+  // on failure the vector may not hold the expected arrays.
+  arrow::ArrayVector outputs;
+  ASSERT_OK(projector->Evaluate(*in_batch, pool_, &outputs));
+
+  // validate results
+  EXPECT_ARROW_ARRAY_EQUALS(MakeArrowArrayBool({false, false, true, false}),
+                            outputs.at(0));  // isnull
+  EXPECT_ARROW_ARRAY_EQUALS(MakeArrowArrayBool({true, true, false, true}),
+                            outputs.at(1));  // isnotnull
+}
+
+// Projects an if/else whose branches are string literals, so the output column
+// is a variable-length utf8 array.
+TEST_F(TestUtf8, TestVarlenOutput) {
+  // Single boolean input column.
+  auto field_a = field("a", boolean());
+  auto schema = arrow::schema({field_a});
+
+  // Expression: if (a) "hi" else "bye"
+  auto condition = TreeExprBuilder::MakeField(field_a);
+  auto hi_literal = TreeExprBuilder::MakeStringLiteral("hi");
+  auto bye_literal = TreeExprBuilder::MakeStringLiteral("bye");
+  auto choose_node = TreeExprBuilder::MakeIf(condition, hi_literal, bye_literal, utf8());
+  auto expr = TreeExprBuilder::MakeExpression(choose_node, field("res", utf8()));
+
+  // Building the projector must succeed.
+  std::shared_ptr<Projector> projector;
+  ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector));
+
+  // Four sample rows; the last condition value is null.
+  int num_records = 4;
+  auto conditions =
+      MakeArrowArrayBool({true, false, false, false}, {true, true, true, false});
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {conditions});
+
+  // Run the projection.
+  arrow::ArrayVector outputs;
+  ASSERT_OK(projector->Evaluate(*in_batch, pool_, &outputs));
+
+  // Every output slot is expected to be valid; the null condition row is
+  // expected to produce "bye".
+  auto expected =
+      MakeArrowArrayUtf8({"hi", "bye", "bye", "bye"}, {true, true, true, true});
+
+  EXPECT_ARROW_ARRAY_EQUALS(expected, outputs.at(0));
+}
+
+TEST_F(TestUtf8, TestConvertUtf8) {
+  // Compares convert_replaceUTF8(a, "z") against column c, which carries the
+  // text expected for each row.
+  auto binary_field = field("a", arrow::binary());
+  auto expected_field = field("c", utf8());
+  auto input_schema = arrow::schema({binary_field, expected_field});
+
+  // output field
+  auto result_field = field("res", boolean());
+
+  // Replacement character handed to convert_replaceUTF8.
+  auto replacement = TreeExprBuilder::MakeStringLiteral("z");
+
+  // equal(convert_replaceUTF8(a, "z"), c)
+  auto input_node = TreeExprBuilder::MakeField(binary_field);
+  auto expected_node = TreeExprBuilder::MakeField(expected_field);
+  auto converted = TreeExprBuilder::MakeFunction("convert_replaceUTF8",
+                                                 {input_node, replacement}, utf8());
+  auto is_equal =
+      TreeExprBuilder::MakeFunction("equal", {converted, expected_node}, boolean());
+  auto expr = TreeExprBuilder::MakeExpression(is_equal, result_field);
+
+  // Build a projector for the expression.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(input_schema, {expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Rows 0 and 2 contain bytes written as hex escapes; row 1 is plain ASCII.
+  int row_count = 3;
+  auto raw_input = MakeArrowArrayUtf8({"ok-\xf8\x28"
+                                       "-a",
+                                       "all-valid", "ok-\xa0\xa1-valid"},
+                                      {true, true, true});
+
+  auto expected_text =
+      MakeArrowArrayUtf8({"ok-z(-a", "all-valid", "ok-zz-valid"}, {true, true, true});
+
+  // prepare input record batch
+  auto batch =
+      arrow::RecordBatch::Make(input_schema, row_count, {raw_input, expected_text});
+
+  // Evaluate expression
+  arrow::ArrayVector results;
+  status = projector->Evaluate(*batch, pool_, &results);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Every comparison is expected to be true.
+  auto all_true = MakeArrowArrayBool({true, true, true}, {true, true, true});
+  EXPECT_ARROW_ARRAY_EQUALS(all_true, results[0]);
+}
+
+TEST_F(TestUtf8, TestCastVarChar) {
+  // Compares castVARCHAR(a, 10) against column c, which carries the text
+  // expected for each row.
+  auto text_field = field("a", utf8());
+  auto expected_field = field("c", utf8());
+  auto input_schema = arrow::schema({text_field, expected_field});
+
+  // output field
+  auto result_field = field("res", boolean());
+
+  // Length the string is truncated to.
+  auto max_length = TreeExprBuilder::MakeLiteral(static_cast<int64_t>(10));
+
+  // equal(castVARCHAR(a, 10), c)
+  auto text_node = TreeExprBuilder::MakeField(text_field);
+  auto expected_node = TreeExprBuilder::MakeField(expected_field);
+  auto truncated =
+      TreeExprBuilder::MakeFunction("castVARCHAR", {text_node, max_length}, utf8());
+  auto is_equal =
+      TreeExprBuilder::MakeFunction("equal", {truncated, expected_node}, boolean());
+  auto expr = TreeExprBuilder::MakeExpression(is_equal, result_field);
+
+  // Build a projector for the expression.
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(input_schema, {expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Five rows; the third input value is null.
+  int row_count = 5;
+  auto input_text = MakeArrowArrayUtf8(
+      {"park", "Sparkle", "bright spark and fire", "fiery SPARK", "मदन"},
+      {true, true, false, true, true});
+
+  auto expected_text =
+      MakeArrowArrayUtf8({"park", "Sparkle", "bright spar", "fiery SPAR", "मदन"},
+                         {true, true, true, true, true});
+
+  // prepare input record batch
+  auto batch =
+      arrow::RecordBatch::Make(input_schema, row_count, {input_text, expected_text});
+
+  // Evaluate expression
+  arrow::ArrayVector results;
+  status = projector->Evaluate(*batch, pool_, &results);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // The null input row gives a null comparison result; all other rows match.
+  auto expected = MakeArrowArrayBool({true, true, false, true, true},
+                                     {true, true, false, true, true});
+  EXPECT_ARROW_ARRAY_EQUALS(expected, results[0]);
+}
+
+TEST_F(TestUtf8, TestAscii) {
+  // ascii(f0): expected values are the code of each string's first character,
+  // and 0 for the empty string.
+  auto text_field = field("f0", arrow::utf8());
+  auto input_schema = arrow::schema({text_field});
+
+  // output field
+  auto ascii_field = field("ascii", arrow::int32());
+
+  // Build expression and projector
+  auto ascii_expr = TreeExprBuilder::MakeExpression("ascii", {text_field}, ascii_field);
+
+  std::shared_ptr<Projector> projector;
+  auto status =
+      Projector::Make(input_schema, {ascii_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Sample data and the matching expected output
+  int row_count = 6;
+  auto input_text = MakeArrowArrayUtf8({"ABC", "", "abc", "Hello World", "123", "999"},
+                                       {true, true, true, true, true, true});
+  auto expected =
+      MakeArrowArrayInt32({65, 0, 97, 72, 49, 57}, {true, true, true, true, true, true});
+
+  // prepare input record batch
+  auto batch = arrow::RecordBatch::Make(input_schema, row_count, {input_text});
+
+  // Evaluate expression
+  arrow::ArrayVector results;
+  status = projector->Evaluate(*batch, pool_, &results);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(expected, results.at(0));
+}
+
+TEST_F(TestUtf8, TestSpace) {
+  // space(n) is expected to produce a string of n blanks, and an empty string
+  // for zero or negative n.
+
+  // schema for input fields
+  auto field0 = field("f0", arrow::int64());
+  auto schema = arrow::schema({field0});
+
+  // output fields
+  auto field_space = field("space", arrow::utf8());
+
+  // Build expression
+  auto space_expr = TreeExprBuilder::MakeExpression("space", {field0}, field_space);
+
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {space_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+  auto array0 = MakeArrowArrayInt64({1, 0, -5, 2}, {true, true, true, true});
+  // expected output: one blank for 1, nothing for 0 and -5, two blanks for 2.
+  // The last literal is built with std::string(2, ' ') so the blank count stays
+  // explicit even if whitespace in this file gets normalized.
+  auto exp_space = MakeArrowArrayUtf8({" ", "", "", std::string(2, ' ')},
+                                      {true, true, true, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp_space, outputs.at(0));
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/to_date_holder.cc b/src/arrow/cpp/src/gandiva/to_date_holder.cc
new file mode 100644
index 000000000..1b7e2864f
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/to_date_holder.cc
@@ -0,0 +1,116 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/to_date_holder.h"
+
+#include <algorithm>
+#include <string>
+
+#include "arrow/util/value_parsing.h"
+#include "arrow/vendored/datetime.h"
+#include "gandiva/date_utils.h"
+#include "gandiva/execution_context.h"
+#include "gandiva/node.h"
+
+namespace gandiva {
+
+// Builds a ToDateHolder from a 'to_date' function node.
+//
+// The node must have two or three children. The second child must be a string
+// (or binary) literal holding the SQL date pattern. The optional third child
+// must be an int32 literal selecting whether parse errors are suppressed.
+// Returns Status::Invalid when the arity or either literal is not as required;
+// otherwise forwards to the (pattern, suppress_errors) overload.
+Status ToDateHolder::Make(const FunctionNode& node,
+                          std::shared_ptr<ToDateHolder>* holder) {
+  if (node.children().size() != 2 && node.children().size() != 3) {
+    return Status::Invalid("'to_date' function requires two or three parameters");
+  }
+
+  auto literal_pattern = dynamic_cast<LiteralNode*>(node.children().at(1).get());
+  if (literal_pattern == nullptr) {
+    return Status::Invalid(
+        "'to_date' function requires a literal as the second parameter");
+  }
+
+  auto literal_type = literal_pattern->return_type()->id();
+  if (literal_type != arrow::Type::STRING && literal_type != arrow::Type::BINARY) {
+    return Status::Invalid(
+        "'to_date' function requires a string literal as the second parameter");
+  }
+  auto pattern = arrow::util::get<std::string>(literal_pattern->holder());
+
+  int suppress_errors = 0;
+  if (node.children().size() == 3) {
+    auto literal_suppress_errors =
+        dynamic_cast<LiteralNode*>(node.children().at(2).get());
+    // Check the third argument itself (not the pattern literal again): a
+    // non-literal third child would otherwise be dereferenced as null below.
+    if (literal_suppress_errors == nullptr) {
+      return Status::Invalid(
+          "The (optional) third parameter to 'to_date' function needs to an integer "
+          "literal to indicate whether to suppress the error");
+    }
+
+    literal_type = literal_suppress_errors->return_type()->id();
+    if (literal_type != arrow::Type::INT32) {
+      return Status::Invalid(
+          "The (optional) third parameter to 'to_date' function needs to an integer "
+          "literal to indicate whether to suppress the error");
+    }
+    suppress_errors = arrow::util::get<int>(literal_suppress_errors->holder());
+  }
+
+  return Make(pattern, suppress_errors, holder);
+}
+
+// Creates a holder for `sql_pattern`. The pattern is first converted with
+// DateUtils::ToInternalFormat; a failure of that conversion is returned as-is.
+Status ToDateHolder::Make(const std::string& sql_pattern, int32_t suppress_errors,
+                          std::shared_ptr<ToDateHolder>* holder) {
+  std::shared_ptr<std::string> internal_format;
+  ARROW_RETURN_NOT_OK(DateUtils::ToInternalFormat(sql_pattern, &internal_format));
+
+  // The constructor is private, so the object is created with `new` here.
+  holder->reset(new ToDateHolder(*internal_format, suppress_errors));
+  return Status::OK();
+}
+
+// Parses `data` (length `data_len`) with the holder's pattern.
+//
+// Returns milliseconds since the epoch and sets *out_valid to true on success.
+// Returns 0 with *out_valid left false when the input is null (in_valid is
+// false) or when parsing fails; a parse failure is also passed to return_error.
+int64_t ToDateHolder::operator()(ExecutionContext* context, const char* data,
+                                 int data_len, bool in_valid, bool* out_valid) {
+  *out_valid = false;
+  if (in_valid) {
+    // Known issues:
+    // 1. processes dates that do not match the format.
+    // 2. does not process time in format +08:00 (or) id.
+    int64_t epoch_seconds = 0;
+    const bool parsed = ::arrow::internal::ParseTimestampStrptime(
+        data, data_len, pattern_.c_str(),
+        /*ignore_time_in_day=*/true, /*allow_trailing_chars=*/true,
+        ::arrow::TimeUnit::SECOND, &epoch_seconds);
+    if (parsed) {
+      *out_valid = true;
+      return epoch_seconds * 1000;
+    }
+    return_error(context, data, data_len);
+  }
+  return 0;
+}
+
+// Records a parse failure for `data` on the execution context, unless errors
+// are suppressed (suppress_errors_ == 1).
+void ToDateHolder::return_error(ExecutionContext* context, const char* data,
+                                int data_len) {
+  if (suppress_errors_ != 1) {
+    const std::string bad_value(data, data_len);
+    const std::string message =
+        "Error parsing value " + bad_value + " for given format.";
+    context->set_error_msg(message.c_str());
+  }
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/to_date_holder.h b/src/arrow/cpp/src/gandiva/to_date_holder.h
new file mode 100644
index 000000000..1211b6a30
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/to_date_holder.h
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "arrow/status.h"
+
+#include "gandiva/execution_context.h"
+#include "gandiva/function_holder.h"
+#include "gandiva/node.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// Function Holder for SQL 'to_date'
+class GANDIVA_EXPORT ToDateHolder : public FunctionHolder {
+ public:
+ ~ToDateHolder() override = default;
+
+ /// Build a holder from a 'to_date' function node. The date pattern and the
+ /// optional suppress-errors flag are read from the node's literal children.
+ static Status Make(const FunctionNode& node, std::shared_ptr<ToDateHolder>* holder);
+
+ /// Build a holder from a SQL date pattern and a suppress-errors flag.
+ static Status Make(const std::string& sql_pattern, int32_t suppress_errors,
+ std::shared_ptr<ToDateHolder>* holder);
+
+ /// Parse `data` (length `data_len`) using the holder's pattern.
+ /// Returns milliseconds since the epoch and sets *out_valid to true on success.
+ /// Returns 0 with *out_valid set to false when in_valid is false or when
+ /// parsing fails.
+ int64_t operator()(ExecutionContext* context, const char* data, int data_len,
+ bool in_valid, bool* out_valid);
+
+ private:
+ ToDateHolder(const std::string& pattern, int32_t suppress_errors)
+ : pattern_(pattern), suppress_errors_(suppress_errors) {}
+
+ /// Set a parse-error message on the context unless errors are suppressed.
+ void return_error(ExecutionContext* context, const char* data, int data_len);
+
+ std::string pattern_; // date format string
+
+ int32_t suppress_errors_; // 1: parse errors are not reported to the context
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/to_date_holder_test.cc b/src/arrow/cpp/src/gandiva/to_date_holder_test.cc
new file mode 100644
index 000000000..a420774bf
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/to_date_holder_test.cc
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <vector>
+
+#include "arrow/testing/gtest_util.h"
+
+#include "gandiva/execution_context.h"
+#include "gandiva/to_date_holder.h"
+#include "precompiled/epoch_time_point.h"
+
+#include <gtest/gtest.h>
+
+namespace gandiva {
+
+// Fixture for the ToDateHolder tests; owns the execution context handed to the
+// holder on every call.
+class TestToDateHolder : public ::testing::Test {
+ public:
+  // Builds a three-argument to_date function node: a utf8 field named "in",
+  // the given pattern as a utf8 literal, and the int32 literal 0.
+  FunctionNode BuildToDate(std::string pattern) {
+    auto input_node = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
+    auto pattern_literal =
+        std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false);
+    auto suppress_literal =
+        std::make_shared<LiteralNode>(arrow::int32(), LiteralHolder(0), false);
+    return FunctionNode("to_date_utf8_utf8_int32",
+                        {input_node, pattern_literal, suppress_literal},
+                        arrow::int64());
+  }
+
+ protected:
+  ExecutionContext execution_context_;
+};
+
+TEST_F(TestToDateHolder, TestSimpleDateTime) {
+  // Date-time pattern with errors suppressed. Every expected value is the
+  // millisecond timestamp of the parsed day at midnight.
+  std::shared_ptr<ToDateHolder> holder;
+  ASSERT_OK(ToDateHolder::Make("YYYY-MM-DD HH:MI:SS", 1, &holder));
+
+  bool out_valid;
+  // Runs the holder on a valid (non-null) input string.
+  auto parse = [&](const std::string& text) -> int64_t {
+    return (*holder)(&execution_context_, text.data(), (int)text.length(), true,
+                     &out_valid);
+  };
+
+  EXPECT_EQ(parse("1986-12-01 01:01:01"), 533779200000);
+
+  // trailing fractional seconds
+  EXPECT_EQ(parse("1986-12-01 01:01:01.11"), 533779200000);
+
+  // trailing zone offset
+  EXPECT_EQ(parse("1986-12-01 01:01:01 +0800"), 533779200000);
+
+#if 0
+  // TODO : this fails parsing with date::parse and strptime on linux
+  int64_t pre_epoch_millis = parse("1886-12-01 00:00:00");
+  EXPECT_EQ(out_valid, true);
+  EXPECT_EQ(pre_epoch_millis, -2621894400000);
+#endif
+
+  // date before the epoch
+  EXPECT_EQ(parse("1886-12-01 01:01:01"), -2621894400000);
+
+  EXPECT_EQ(parse("1986-12-11 01:30:00"), 534643200000);
+}
+
+TEST_F(TestToDateHolder, TestSimpleDate) {
+  // Date-only pattern with errors suppressed.
+  std::shared_ptr<ToDateHolder> holder;
+  ASSERT_OK(ToDateHolder::Make("YYYY-MM-DD", 1, &holder));
+
+  bool out_valid;
+  // Runs the holder on a valid (non-null) input string.
+  auto parse = [&](const std::string& text) -> int64_t {
+    return (*holder)(&execution_context_, text.data(), (int)text.length(), true,
+                     &out_valid);
+  };
+
+  EXPECT_EQ(parse("1986-12-01"), 533779200000);
+
+  // same input again
+  EXPECT_EQ(parse("1986-12-01"), 533779200000);
+
+  // single-digit day, before the epoch
+  EXPECT_EQ(parse("1886-12-1"), -2621894400000);
+
+  // single-digit day, after the epoch
+  EXPECT_EQ(parse("2012-12-1"), 1354320000000);
+
+  // wrong month. should return 0 since we are suppressing errors.
+  EXPECT_EQ(parse("1986-21-01 01:01:01 +0800"), 0);
+}
+
+TEST_F(TestToDateHolder, TestSimpleDateTimeError) {
+  // With errors not suppressed, a parse failure returns 0 and leaves a message
+  // on the execution context; a null input returns 0 without any error.
+  std::shared_ptr<ToDateHolder> to_date_holder;
+
+  auto status = ToDateHolder::Make("YYYY-MM-DD HH:MI:SS", 0, &to_date_holder);
+  EXPECT_EQ(status.ok(), true) << status.message();
+  auto& to_date = *to_date_holder;
+  bool out_valid;
+
+  std::string s("1986-01-40 01:01:01 +0800");
+  int64_t millis_since_epoch =
+      to_date(&execution_context_, s.data(), (int)s.length(), true, &out_valid);
+  EXPECT_EQ(0, millis_since_epoch);
+  std::string expected_error =
+      "Error parsing value 1986-01-40 01:01:01 +0800 for given format";
+  // On failure, print the error text that was actually searched; `status`
+  // belongs to the successful Make() call above and carries no message.
+  EXPECT_TRUE(execution_context_.get_error().find(expected_error) != std::string::npos)
+      << execution_context_.get_error();
+
+  // not valid should not return error
+  execution_context_.Reset();
+  millis_since_epoch = to_date(&execution_context_, "nullptr", 7, false, &out_valid);
+  EXPECT_EQ(millis_since_epoch, 0);
+  EXPECT_TRUE(execution_context_.has_error() == false);
+}
+
+TEST_F(TestToDateHolder, TestSimpleDateTimeMakeError) {
+  // Make() is expected to reject this pattern as Invalid.
+  // NOTE(review): the trailing "tzo" is presumably a time-zone token - confirm.
+  std::shared_ptr<ToDateHolder> holder;
+  auto make_status = ToDateHolder::Make("YYYY-MM-DD HH:MI:SS tzo", 0, &holder);
+  EXPECT_EQ(make_status.IsInvalid(), true) << make_status.message();
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tree_expr_builder.cc b/src/arrow/cpp/src/gandiva/tree_expr_builder.cc
new file mode 100644
index 000000000..de8e3445a
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tree_expr_builder.cc
@@ -0,0 +1,223 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/tree_expr_builder.h"
+
+#include <iostream>
+#include <utility>
+
+#include "gandiva/decimal_type_util.h"
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/node.h"
+
+namespace gandiva {
+
+// Defines the TreeExprBuilder::MakeLiteral overload taking a `ctype` value.
+// The overload wraps the value in a LiteralHolder and returns a LiteralNode of
+// arrow type `atype`, passing `false` as the node's last constructor argument.
+#define MAKE_LITERAL(atype, ctype) \
+ NodePtr TreeExprBuilder::MakeLiteral(ctype value) { \
+ return std::make_shared<LiteralNode>(atype, LiteralHolder(value), false); \
+ }
+
+// One overload per supported primitive C type.
+MAKE_LITERAL(arrow::boolean(), bool)
+MAKE_LITERAL(arrow::int8(), int8_t)
+MAKE_LITERAL(arrow::int16(), int16_t)
+MAKE_LITERAL(arrow::int32(), int32_t)
+MAKE_LITERAL(arrow::int64(), int64_t)
+MAKE_LITERAL(arrow::uint8(), uint8_t)
+MAKE_LITERAL(arrow::uint16(), uint16_t)
+MAKE_LITERAL(arrow::uint32(), uint32_t)
+MAKE_LITERAL(arrow::uint64(), uint64_t)
+MAKE_LITERAL(arrow::float32(), float)
+MAKE_LITERAL(arrow::float64(), double)
+
+// Returns a utf8-typed literal node holding a copy of `value`.
+NodePtr TreeExprBuilder::MakeStringLiteral(const std::string& value) {
+  LiteralHolder text(value);
+  return std::make_shared<LiteralNode>(arrow::utf8(), text, false);
+}
+
+// Returns a binary-typed literal node holding a copy of `value`.
+NodePtr TreeExprBuilder::MakeBinaryLiteral(const std::string& value) {
+  LiteralHolder bytes(value);
+  return std::make_shared<LiteralNode>(arrow::binary(), bytes, false);
+}
+
+// Returns a decimal literal node whose arrow type uses the precision and scale
+// carried by `value`.
+NodePtr TreeExprBuilder::MakeDecimalLiteral(const DecimalScalar128& value) {
+  auto decimal_type = arrow::decimal(value.precision(), value.scale());
+  return std::make_shared<LiteralNode>(decimal_type, LiteralHolder(value), false);
+}
+
+// Creates a null literal node of the given type.
+//
+// Returns nullptr when data_type is null or is not one of the types handled
+// below. The holder carries a zero/empty placeholder of the matching C type;
+// the last LiteralNode argument is `true` here, whereas the non-null literal
+// factories in this file pass `false`.
+NodePtr TreeExprBuilder::MakeNull(DataTypePtr data_type) {
+  static const std::string empty;
+
+  if (data_type == nullptr) {
+    return nullptr;
+  }
+
+  switch (data_type->id()) {
+    case arrow::Type::BOOL:
+      return std::make_shared<LiteralNode>(data_type, LiteralHolder(false), true);
+    case arrow::Type::INT8:
+      return std::make_shared<LiteralNode>(data_type, LiteralHolder((int8_t)0), true);
+    case arrow::Type::INT16:
+      return std::make_shared<LiteralNode>(data_type, LiteralHolder((int16_t)0), true);
+    case arrow::Type::UINT8:
+      return std::make_shared<LiteralNode>(data_type, LiteralHolder((uint8_t)0), true);
+    case arrow::Type::UINT16:
+      return std::make_shared<LiteralNode>(data_type, LiteralHolder((uint16_t)0), true);
+    case arrow::Type::UINT32:
+      return std::make_shared<LiteralNode>(data_type, LiteralHolder((uint32_t)0), true);
+    case arrow::Type::UINT64:
+      return std::make_shared<LiteralNode>(data_type, LiteralHolder((uint64_t)0), true);
+    case arrow::Type::FLOAT:
+      return std::make_shared<LiteralNode>(data_type,
+                                           LiteralHolder(static_cast<float>(0)), true);
+    case arrow::Type::DOUBLE:
+      return std::make_shared<LiteralNode>(data_type,
+                                           LiteralHolder(static_cast<double>(0)), true);
+    case arrow::Type::STRING:
+    case arrow::Type::BINARY:
+      return std::make_shared<LiteralNode>(data_type, LiteralHolder(empty), true);
+    // Types whose placeholder is a 32-bit integer.
+    case arrow::Type::INT32:
+    case arrow::Type::DATE32:
+    case arrow::Type::TIME32:
+    case arrow::Type::INTERVAL_MONTHS:
+      return std::make_shared<LiteralNode>(data_type, LiteralHolder((int32_t)0), true);
+    // Types whose placeholder is a 64-bit integer.
+    case arrow::Type::INT64:
+    case arrow::Type::DATE64:
+    case arrow::Type::TIME64:
+    case arrow::Type::TIMESTAMP:
+    case arrow::Type::INTERVAL_DAY_TIME:
+      return std::make_shared<LiteralNode>(data_type, LiteralHolder((int64_t)0), true);
+    case arrow::Type::DECIMAL: {
+      std::shared_ptr<arrow::DecimalType> decimal_type =
+          arrow::internal::checked_pointer_cast<arrow::DecimalType>(data_type);
+      DecimalScalar128 literal(decimal_type->precision(), decimal_type->scale());
+      return std::make_shared<LiteralNode>(data_type, LiteralHolder(literal), true);
+    }
+    default:
+      return nullptr;
+  }
+}
+
+// Creates a node that reads the given arrow field.
+// Returns nullptr when `field` is null, which is the contract documented on
+// the declaration in tree_expr_builder.h.
+NodePtr TreeExprBuilder::MakeField(FieldPtr field) {
+  if (field == nullptr) {
+    return nullptr;
+  }
+  return NodePtr(new FieldNode(field));
+}
+
+// Creates a function node named `name` over `params`.
+// Returns nullptr when result_type is null.
+NodePtr TreeExprBuilder::MakeFunction(const std::string& name, const NodeVector& params,
+                                      DataTypePtr result_type) {
+  if (result_type != nullptr) {
+    return std::make_shared<FunctionNode>(name, params, result_type);
+  }
+  return nullptr;
+}
+
+// Creates an if/else node. Returns nullptr when any of the four inputs is null.
+NodePtr TreeExprBuilder::MakeIf(NodePtr condition, NodePtr then_node, NodePtr else_node,
+                                DataTypePtr result_type) {
+  const bool all_present = condition != nullptr && then_node != nullptr &&
+                           else_node != nullptr && result_type != nullptr;
+  if (!all_present) {
+    return nullptr;
+  }
+  return std::make_shared<IfNode>(condition, then_node, else_node, result_type);
+}
+
+// Creates a boolean AND node over `children`.
+NodePtr TreeExprBuilder::MakeAnd(const NodeVector& children) {
+  NodePtr and_node = std::make_shared<BooleanNode>(BooleanNode::AND, children);
+  return and_node;
+}
+
+// Creates a boolean OR node over `children`.
+NodePtr TreeExprBuilder::MakeOr(const NodeVector& children) {
+  NodePtr or_node = std::make_shared<BooleanNode>(BooleanNode::OR, children);
+  return or_node;
+}
+
+// Debug switch: when true, expressions and conditions are printed to stdout
+// as they are built.
+static bool print_expr = false;
+
+// Wraps `root_node` in an Expression whose result goes to `result_field`.
+// Returns nullptr when result_field is null.
+ExpressionPtr TreeExprBuilder::MakeExpression(NodePtr root_node, FieldPtr result_field) {
+  if (!result_field) {
+    return nullptr;
+  }
+
+  if (print_expr) {
+    std::cout << "Expression: " << root_node->ToString() << "\n";
+  }
+
+  ExpressionPtr expression(new Expression(root_node, result_field));
+  return expression;
+}
+
+// Convenience overload: builds function(in_fields...) with the output field's
+// type as the return type and wraps it in an expression.
+// Returns nullptr when out_field is null.
+ExpressionPtr TreeExprBuilder::MakeExpression(const std::string& function,
+                                              const FieldVector& in_fields,
+                                              FieldPtr out_field) {
+  if (!out_field) {
+    return nullptr;
+  }
+
+  // One field node per input field, in order.
+  std::vector<NodePtr> args;
+  for (const auto& in_field : in_fields) {
+    args.push_back(MakeField(in_field));
+  }
+
+  return MakeExpression(MakeFunction(function, args, out_field->type()), out_field);
+}
+
+// Wraps `root_node` in a Condition. Returns nullptr when root_node is null.
+ConditionPtr TreeExprBuilder::MakeCondition(NodePtr root_node) {
+  if (!root_node) {
+    return nullptr;
+  }
+
+  if (print_expr) {
+    std::cout << "Condition: " << root_node->ToString() << "\n";
+  }
+
+  ConditionPtr condition(new Condition(root_node));
+  return condition;
+}
+
+// Convenience overload: builds the boolean function function(in_fields...) and
+// wraps it in a Condition.
+ConditionPtr TreeExprBuilder::MakeCondition(const std::string& function,
+                                            const FieldVector& in_fields) {
+  // One field node per input field, in order.
+  std::vector<NodePtr> args;
+  for (const auto& in_field : in_fields) {
+    args.push_back(MakeField(in_field));
+  }
+
+  auto predicate = MakeFunction(function, args, arrow::boolean());
+  return ConditionPtr(new Condition(predicate));
+}
+
+// Creates an IN expression over decimal constants. Precision and scale are
+// taken from the first element of `constants`, or are both 0 when it is empty.
+NodePtr TreeExprBuilder::MakeInExpressionDecimal(
+    NodePtr node, std::unordered_set<gandiva::DecimalScalar128>& constants) {
+  int32_t precision = 0;
+  int32_t scale = 0;
+
+  auto first = constants.begin();
+  if (first != constants.end()) {
+    precision = first->precision();
+    scale = first->scale();
+  }
+
+  return std::make_shared<InExpressionNode<gandiva::DecimalScalar128>>(node, constants,
+                                                                       precision, scale);
+}
+
+// Defines TreeExprBuilder::MakeInExpression<NAME>, which returns an
+// InExpressionNode<ctype> built from `node` and the set of constants `values`.
+#define MAKE_IN(NAME, ctype) \
+ NodePtr TreeExprBuilder::MakeInExpression##NAME( \
+ NodePtr node, const std::unordered_set<ctype>& values) { \
+ return std::make_shared<InExpressionNode<ctype>>(node, values); \
+ }
+
+// The date, time and timestamp variants use the same int32_t/int64_t element
+// types as the plain integer variants.
+MAKE_IN(Int32, int32_t);
+MAKE_IN(Int64, int64_t);
+MAKE_IN(Date32, int32_t);
+MAKE_IN(Date64, int64_t);
+MAKE_IN(TimeStamp, int64_t);
+MAKE_IN(Time32, int32_t);
+MAKE_IN(Time64, int64_t);
+MAKE_IN(Float, float);
+MAKE_IN(Double, double);
+MAKE_IN(String, std::string);
+MAKE_IN(Binary, std::string);
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tree_expr_builder.h b/src/arrow/cpp/src/gandiva/tree_expr_builder.h
new file mode 100644
index 000000000..94a4a1793
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tree_expr_builder.h
@@ -0,0 +1,139 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cmath>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "arrow/type.h"
+#include "gandiva/condition.h"
+#include "gandiva/decimal_scalar.h"
+#include "gandiva/expression.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// \brief Tree Builder for a nested expression.
+///
+/// All members are static factories returning shared node, expression or
+/// condition pointers.
+class GANDIVA_EXPORT TreeExprBuilder {
+ public:
+ /// \brief create a node on a literal.
+ static NodePtr MakeLiteral(bool value);
+ static NodePtr MakeLiteral(uint8_t value);
+ static NodePtr MakeLiteral(uint16_t value);
+ static NodePtr MakeLiteral(uint32_t value);
+ static NodePtr MakeLiteral(uint64_t value);
+ static NodePtr MakeLiteral(int8_t value);
+ static NodePtr MakeLiteral(int16_t value);
+ static NodePtr MakeLiteral(int32_t value);
+ static NodePtr MakeLiteral(int64_t value);
+ static NodePtr MakeLiteral(float value);
+ static NodePtr MakeLiteral(double value);
+ /// \brief create a utf8 / binary / decimal literal node.
+ static NodePtr MakeStringLiteral(const std::string& value);
+ static NodePtr MakeBinaryLiteral(const std::string& value);
+ static NodePtr MakeDecimalLiteral(const DecimalScalar128& value);
+
+ /// \brief create a node on a null literal.
+ /// returns null if data_type is null or if it's not a supported datatype.
+ static NodePtr MakeNull(DataTypePtr data_type);
+
+ /// \brief create a node on arrow field.
+ /// returns null if input is null.
+ static NodePtr MakeField(FieldPtr field);
+
+ /// \brief create a node with a function.
+ /// returns null if return_type is null
+ static NodePtr MakeFunction(const std::string& name, const NodeVector& params,
+ DataTypePtr return_type);
+
+ /// \brief create a node with an if-else expression.
+ /// returns null if any of the inputs is null.
+ static NodePtr MakeIf(NodePtr condition, NodePtr then_node, NodePtr else_node,
+ DataTypePtr result_type);
+
+ /// \brief create a node with a boolean AND expression.
+ static NodePtr MakeAnd(const NodeVector& children);
+
+ /// \brief create a node with a boolean OR expression.
+ static NodePtr MakeOr(const NodeVector& children);
+
+ /// \brief create an expression with the specified root_node, and the
+ /// result written to result_field.
+ /// returns null if the result_field is null.
+ static ExpressionPtr MakeExpression(NodePtr root_node, FieldPtr result_field);
+
+ /// \brief convenience function for simple function expressions.
+ /// returns null if the out_field is null.
+ static ExpressionPtr MakeExpression(const std::string& function,
+ const FieldVector& in_fields, FieldPtr out_field);
+
+ /// \brief create a condition with the specified root_node
+ /// returns null if the root_node is null.
+ static ConditionPtr MakeCondition(NodePtr root_node);
+
+ /// \brief convenience function for simple function conditions.
+ /// The function node is created with a boolean return type.
+ static ConditionPtr MakeCondition(const std::string& function,
+ const FieldVector& in_fields);
+
+ /// \brief creates an in expression for int32
+ static NodePtr MakeInExpressionInt32(NodePtr node,
+ const std::unordered_set<int32_t>& constants);
+
+ /// \brief creates an in expression for int64
+ static NodePtr MakeInExpressionInt64(NodePtr node,
+ const std::unordered_set<int64_t>& constants);
+
+ /// \brief creates an in expression for decimal
+ /// Precision and scale are taken from the first constant (0 and 0 when the
+ /// set is empty). Note that the set is taken by non-const reference.
+ static NodePtr MakeInExpressionDecimal(
+ NodePtr node, std::unordered_set<gandiva::DecimalScalar128>& constants);
+
+ /// \brief creates an in expression for strings
+ static NodePtr MakeInExpressionString(NodePtr node,
+ const std::unordered_set<std::string>& constants);
+
+ /// \brief creates an in expression for binary values
+ static NodePtr MakeInExpressionBinary(NodePtr node,
+ const std::unordered_set<std::string>& constants);
+
+ /// \brief creates an in expression for float
+ static NodePtr MakeInExpressionFloat(NodePtr node,
+ const std::unordered_set<float>& constants);
+
+ /// \brief creates an in expression for double
+ static NodePtr MakeInExpressionDouble(NodePtr node,
+ const std::unordered_set<double>& constants);
+
+ /// \brief Date as s/millis since epoch.
+ /// NOTE(review): Arrow documents date32 as days since the UNIX epoch, so the
+ /// unit stated above looks off - confirm.
+ static NodePtr MakeInExpressionDate32(NodePtr node,
+ const std::unordered_set<int32_t>& constants);
+
+ /// \brief Date as millis/us/ns since epoch.
+ static NodePtr MakeInExpressionDate64(NodePtr node,
+ const std::unordered_set<int64_t>& constants);
+
+ /// \brief Time as s/millis of day
+ static NodePtr MakeInExpressionTime32(NodePtr node,
+ const std::unordered_set<int32_t>& constants);
+
+ /// \brief Time as millis/us/ns of day
+ static NodePtr MakeInExpressionTime64(NodePtr node,
+ const std::unordered_set<int64_t>& constants);
+
+ /// \brief Timestamp as millis since epoch.
+ static NodePtr MakeInExpressionTimeStamp(NodePtr node,
+ const std::unordered_set<int64_t>& constants);
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/tree_expr_test.cc b/src/arrow/cpp/src/gandiva/tree_expr_test.cc
new file mode 100644
index 000000000..e70cf1289
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/tree_expr_test.cc
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/tree_expr_builder.h"
+
+#include <gtest/gtest.h>
+#include "gandiva/annotator.h"
+#include "gandiva/dex.h"
+#include "gandiva/expr_decomposer.h"
+#include "gandiva/function_registry.h"
+#include "gandiva/function_signature.h"
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/node.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::int32;
+
+/// Shared fixture for the expression-tree tests.
+///
+/// Provides two int32 fields, one boolean field, and a function registry that
+/// the individual tests use to build and decompose expression nodes.
+class TestExprTree : public ::testing::Test {
+ public:
+  // Marked `override` so a signature mismatch with ::testing::Test::SetUp is a
+  // compile error instead of a fixture that silently never initialises.
+  void SetUp() override {
+    i0_ = field("i0", int32());
+    i1_ = field("i1", int32());
+
+    b0_ = field("b0", boolean());
+  }
+
+ protected:
+  FieldPtr i0_;  // int32
+  FieldPtr i1_;  // int32
+
+  FieldPtr b0_;  // bool
+  FunctionRegistry registry_;
+};
+
+// Field nodes report the type of the field they wrap, and decomposing a
+// boolean field node yields a fixed-length value read plus exactly one
+// validity read whose index differs from the data index.
+TEST_F(TestExprTree, TestField) {
+  Annotator annotator;
+
+  auto n0 = TreeExprBuilder::MakeField(i0_);
+  EXPECT_EQ(n0->return_type(), int32());
+
+  auto n1 = TreeExprBuilder::MakeField(b0_);
+  EXPECT_EQ(n1->return_type(), boolean());
+
+  ExprDecomposer decomposer(registry_, annotator);
+  ValueValidityPairPtr pair;
+  auto status = decomposer.Decompose(*n1, &pair);
+  // A gtest assertion (rather than a debug-only DCHECK) so that a failed
+  // decomposition fails the test in every build type instead of falling
+  // through to a dereference of an unset `pair`.
+  ASSERT_TRUE(status.ok()) << status.message();
+  ASSERT_TRUE(pair != nullptr);
+
+  auto value = pair->value_expr();
+  auto value_dex = std::dynamic_pointer_cast<VectorReadFixedLenValueDex>(value);
+  // The cast yields null if the decomposer produced a different Dex kind.
+  ASSERT_TRUE(value_dex != nullptr);
+  EXPECT_EQ(value_dex->FieldType(), boolean());
+
+  // Fatal check: the element access below requires exactly this size.
+  ASSERT_EQ(pair->validity_exprs().size(), 1u);
+  auto validity = pair->validity_exprs().at(0);
+  auto validity_dex = std::dynamic_pointer_cast<VectorReadValidityDex>(validity);
+  ASSERT_TRUE(validity_dex != nullptr);
+  EXPECT_NE(validity_dex->ValidityIdx(), value_dex->DataIdx());
+}
+
+// Building add(i0, i1) produces a FunctionNode with the expected signature,
+// and decomposing it yields a NonNullableFuncDex bound to the native function
+// the registry returns for that signature.
+TEST_F(TestExprTree, TestBinary) {
+  Annotator annotator;
+
+  auto left = TreeExprBuilder::MakeField(i0_);
+  auto right = TreeExprBuilder::MakeField(i1_);
+
+  auto n = TreeExprBuilder::MakeFunction("add", {left, right}, int32());
+  auto add = std::dynamic_pointer_cast<FunctionNode>(n);
+  // The cast yields null if MakeFunction returned some other node kind.
+  ASSERT_TRUE(add != nullptr);
+
+  auto func_desc = add->descriptor();
+  FunctionSignature sign(func_desc->name(), func_desc->params(),
+                         func_desc->return_type());
+
+  EXPECT_EQ(add->return_type(), int32());
+  EXPECT_TRUE(sign == FunctionSignature("add", {int32(), int32()}, int32()));
+
+  ExprDecomposer decomposer(registry_, annotator);
+  ValueValidityPairPtr pair;
+  auto status = decomposer.Decompose(*n, &pair);
+  // A gtest assertion (rather than a debug-only DCHECK) so that a failed
+  // decomposition fails the test in every build type.
+  ASSERT_TRUE(status.ok()) << status.message();
+  ASSERT_TRUE(pair != nullptr);
+
+  auto value = pair->value_expr();
+  auto null_if_null = std::dynamic_pointer_cast<NonNullableFuncDex>(value);
+  ASSERT_TRUE(null_if_null != nullptr);
+
+  FunctionSignature signature("add", {int32(), int32()}, int32());
+  const NativeFunction* fn = registry_.LookupSignature(signature);
+  EXPECT_EQ(null_if_null->native_function(), fn);
+}
+
+// Building isnumeric(i0) produces a FunctionNode with the expected signature,
+// and decomposing it yields a NullableNeverFuncDex bound to the native
+// function the registry returns for that signature.
+TEST_F(TestExprTree, TestUnary) {
+  Annotator annotator;
+
+  auto arg = TreeExprBuilder::MakeField(i0_);
+  auto n = TreeExprBuilder::MakeFunction("isnumeric", {arg}, boolean());
+
+  auto unaryFn = std::dynamic_pointer_cast<FunctionNode>(n);
+  // The cast yields null if MakeFunction returned some other node kind.
+  ASSERT_TRUE(unaryFn != nullptr);
+  auto func_desc = unaryFn->descriptor();
+  FunctionSignature sign(func_desc->name(), func_desc->params(),
+                         func_desc->return_type());
+  EXPECT_EQ(unaryFn->return_type(), boolean());
+  EXPECT_TRUE(sign == FunctionSignature("isnumeric", {int32()}, boolean()));
+
+  ExprDecomposer decomposer(registry_, annotator);
+  ValueValidityPairPtr pair;
+  auto status = decomposer.Decompose(*n, &pair);
+  // A gtest assertion (rather than a debug-only DCHECK) so that a failed
+  // decomposition fails the test in every build type.
+  ASSERT_TRUE(status.ok()) << status.message();
+  ASSERT_TRUE(pair != nullptr);
+
+  auto value = pair->value_expr();
+  auto never_null = std::dynamic_pointer_cast<NullableNeverFuncDex>(value);
+  ASSERT_TRUE(never_null != nullptr);
+
+  FunctionSignature signature("isnumeric", {int32()}, boolean());
+  const NativeFunction* fn = registry_.LookupSignature(signature);
+  EXPECT_EQ(never_null->native_function(), fn);
+}
+
+// Wrapping add(i0, i1) in an expression keeps the function node as the root;
+// decomposing that root yields a NonNullableFuncDex bound to the native
+// function the registry returns for the add(int32, int32) signature.
+TEST_F(TestExprTree, TestExpression) {
+  Annotator annotator;
+  auto left = TreeExprBuilder::MakeField(i0_);
+  auto right = TreeExprBuilder::MakeField(i1_);
+
+  auto n = TreeExprBuilder::MakeFunction("add", {left, right}, int32());
+  auto e = TreeExprBuilder::MakeExpression(n, field("r", int32()));
+  auto root_node = e->root();
+  EXPECT_EQ(root_node->return_type(), int32());
+
+  auto add_node = std::dynamic_pointer_cast<FunctionNode>(root_node);
+  // The cast yields null if the expression root is not a function node.
+  ASSERT_TRUE(add_node != nullptr);
+  auto func_desc = add_node->descriptor();
+  FunctionSignature sign(func_desc->name(), func_desc->params(),
+                         func_desc->return_type());
+  EXPECT_TRUE(sign == FunctionSignature("add", {int32(), int32()}, int32()));
+
+  ExprDecomposer decomposer(registry_, annotator);
+  ValueValidityPairPtr pair;
+  auto status = decomposer.Decompose(*root_node, &pair);
+  // A gtest assertion (rather than a debug-only DCHECK) so that a failed
+  // decomposition fails the test in every build type.
+  ASSERT_TRUE(status.ok()) << status.message();
+  ASSERT_TRUE(pair != nullptr);
+
+  auto value = pair->value_expr();
+  auto null_if_null = std::dynamic_pointer_cast<NonNullableFuncDex>(value);
+  ASSERT_TRUE(null_if_null != nullptr);
+
+  FunctionSignature signature("add", {int32(), int32()}, int32());
+  const NativeFunction* fn = registry_.LookupSignature(signature);
+  EXPECT_EQ(null_if_null->native_function(), fn);
+}
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/value_validity_pair.h b/src/arrow/cpp/src/gandiva/value_validity_pair.h
new file mode 100644
index 000000000..e5943b230
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/value_validity_pair.h
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/visibility.h"
+
+namespace gandiva {
+
+/// Result of decomposing an expression tree/subtree: one value expression
+/// together with zero or more validity expressions.
+class GANDIVA_EXPORT ValueValidityPair {
+ public:
+  /// Build from a value expression and a full list of validity expressions.
+  ValueValidityPair(const DexVector& validity_exprs, DexPtr value_expr)
+      : validity_exprs_(validity_exprs), value_expr_(value_expr) {}
+
+  /// Build from a value expression and a single validity expression.
+  ValueValidityPair(DexPtr validity_expr, DexPtr value_expr)
+      : validity_exprs_{validity_expr}, value_expr_(value_expr) {}
+
+  /// Build from a value expression alone; the validity list starts out empty.
+  explicit ValueValidityPair(DexPtr value_expr) : value_expr_(value_expr) {}
+
+  /// The validity expressions held by this pair.
+  const DexVector& validity_exprs() const { return validity_exprs_; }
+
+  /// The value expression held by this pair.
+  const DexPtr& value_expr() const { return value_expr_; }
+
+ private:
+  DexVector validity_exprs_;
+  DexPtr value_expr_;
+};
+
+} // namespace gandiva
diff --git a/src/arrow/cpp/src/gandiva/visibility.h b/src/arrow/cpp/src/gandiva/visibility.h
new file mode 100644
index 000000000..450b3056b
--- /dev/null
+++ b/src/arrow/cpp/src/gandiva/visibility.h
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+// Defines the two symbol-visibility macros for this library:
+//   GANDIVA_EXPORT    - attached to declarations that must be visible outside
+//                       the library.
+//   GANDIVA_NO_EXPORT - attached to declarations that should stay hidden
+//                       (expands to nothing in the Windows/Cygwin branch).
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(_MSC_VER)
+// Save the current MSVC warning state, then disable warning 4251
+// (presumably the "needs to have dll-interface" warning - confirm against the
+// MSVC documentation).
+#pragma warning(push)
+#pragma warning(disable : 4251)
+#else
+// Other compilers targeting Windows/Cygwin: silence -Wattributes diagnostics.
+#pragma GCC diagnostic ignored "-Wattributes"
+#endif
+
+// Static builds need no decoration; otherwise export while building the
+// library itself (GANDIVA_EXPORTING) and import when consuming it.
+#ifdef GANDIVA_STATIC
+#define GANDIVA_EXPORT
+#elif defined(GANDIVA_EXPORTING)
+#define GANDIVA_EXPORT __declspec(dllexport)
+#else
+#define GANDIVA_EXPORT __declspec(dllimport)
+#endif
+
+#define GANDIVA_NO_EXPORT
+#else  // Not Windows
+// Both macros may be pre-defined by the build; only fall back to the
+// visibility attributes when they are not.
+#ifndef GANDIVA_EXPORT
+#define GANDIVA_EXPORT __attribute__((visibility("default")))
+#endif
+#ifndef GANDIVA_NO_EXPORT
+#define GANDIVA_NO_EXPORT __attribute__((visibility("hidden")))
+#endif
+#endif  // Non-Windows
+
+// Restore the warning state saved by the push above.
+// NOTE(review): the push is issued only when _WIN32/__CYGWIN__ is defined as
+// well, while this pop is guarded by _MSC_VER alone; the pair also ends the
+// 4251 suppression at the end of this header - confirm both are intended.
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif