diff options
Diffstat (limited to 'src/arrow/cpp/src/arrow/compute/exec/subtree_test.cc')
-rw-r--r-- | src/arrow/cpp/src/arrow/compute/exec/subtree_test.cc | 377 |
1 files changed, 377 insertions, 0 deletions
diff --git a/src/arrow/cpp/src/arrow/compute/exec/subtree_test.cc b/src/arrow/cpp/src/arrow/compute/exec/subtree_test.cc new file mode 100644 index 000000000..972131044 --- /dev/null +++ b/src/arrow/cpp/src/arrow/compute/exec/subtree_test.cc @@ -0,0 +1,377 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <cstdint> +#include <memory> +#include <string> +#include <vector> + +#include <gmock/gmock.h> +#include <gtest/gtest.h> + +#include "arrow/compute/exec/forest_internal.h" +#include "arrow/compute/exec/subtree_internal.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/string_view.h" + +namespace arrow { +namespace compute { + +using testing::ContainerEq; + +// Tests of subtree pruning + +// Don't depend on FileSystem - port just enough to be useful here +struct FileInfo { + bool is_dir; + std::string path; + + bool operator==(const FileInfo& other) const { + return is_dir == other.is_dir && path == other.path; + } + + static FileInfo Dir(std::string path) { return FileInfo{true, std::move(path)}; } + + static FileInfo File(std::string path) { return FileInfo{false, std::move(path)}; } + + static bool ByPath(const FileInfo& l, const FileInfo& r) { return l.path < r.path; } +}; + +struct TestPathTree { + FileInfo info; + std::vector<TestPathTree> subtrees; + + explicit TestPathTree(std::string file_path) + : info(FileInfo::File(std::move(file_path))) {} + + TestPathTree(std::string dir_path, std::vector<TestPathTree> subtrees) + : info(FileInfo::Dir(std::move(dir_path))), subtrees(std::move(subtrees)) {} + + TestPathTree(Forest::Ref ref, const std::vector<FileInfo>& infos) : info(infos[ref.i]) { + const Forest& forest = *ref.forest; + + int begin = ref.i + 1; + int end = begin + ref.num_descendants(); + + for (int i = begin; i < end; ++i) { + subtrees.emplace_back(forest[i], infos); + i += forest[i].num_descendants(); + } + } + + bool operator==(const TestPathTree& other) const { + return info == other.info && subtrees == other.subtrees; + } + + std::string ToString() const { + auto out = "\n" + info.path; + if (info.is_dir) out += "/"; + + for (const auto& subtree : subtrees) { + out += subtree.ToString(); + } + return out; + } + + friend std::ostream& operator<<(std::ostream& os, const TestPathTree& tree) { + return os << tree.ToString(); + } +}; + +using PT = TestPathTree; + +util::string_view RemoveTrailingSlash(util::string_view key) { + while (!key.empty() && key.back() == '/') { + key.remove_suffix(1); + } + return key; +} +bool IsAncestorOf(util::string_view ancestor, util::string_view descendant) { + // See filesystem/path_util.h + ancestor = RemoveTrailingSlash(ancestor); + if (ancestor == "") return true; + descendant = RemoveTrailingSlash(descendant); + if (!descendant.starts_with(ancestor)) return false; + descendant.remove_prefix(ancestor.size()); + if (descendant.empty()) return true; + return descendant.front() == '/'; +} + +Forest MakeForest(std::vector<FileInfo>* infos) { + std::sort(infos->begin(), infos->end(), FileInfo::ByPath); + + return Forest(static_cast<int>(infos->size()), [&](int i, int j) { + return IsAncestorOf(infos->at(i).path, infos->at(j).path); + }); +} + +void ExpectForestIs(std::vector<FileInfo> infos, std::vector<PT> expected_roots) { + auto forest = MakeForest(&infos); + + std::vector<PT> actual_roots; + ASSERT_OK(forest.Visit( + [&](Forest::Ref ref) -> Result<bool> { + actual_roots.emplace_back(ref, infos); + return false; // only vist roots + }, + [](Forest::Ref) {})); + + // visit expected and assert equality + EXPECT_THAT(actual_roots, ContainerEq(expected_roots)); +} + +TEST(Forest, Basic) { + ExpectForestIs({}, {}); + + ExpectForestIs({FileInfo::File("aa")}, {PT("aa")}); + ExpectForestIs({FileInfo::Dir("AA")}, {PT("AA", {})}); + ExpectForestIs({FileInfo::Dir("AA"), FileInfo::File("AA/aa")}, + {PT("AA", {PT("AA/aa")})}); + ExpectForestIs({FileInfo::Dir("AA"), FileInfo::Dir("AA/BB"), FileInfo::File("AA/BB/0")}, + {PT("AA", {PT("AA/BB", {PT("AA/BB/0")})})}); + + // Missing parent can still find ancestor. + ExpectForestIs({FileInfo::Dir("AA"), FileInfo::File("AA/BB/bb")}, + {PT("AA", {PT("AA/BB/bb")})}); + + // Ancestors should link to parent regardless of ordering. + ExpectForestIs({FileInfo::File("AA/aa"), FileInfo::Dir("AA")}, + {PT("AA", {PT("AA/aa")})}); + + // Multiple roots are supported. + ExpectForestIs({FileInfo::File("aa"), FileInfo::File("bb")}, {PT("aa"), PT("bb")}); + ExpectForestIs({FileInfo::File("00"), FileInfo::Dir("AA"), FileInfo::File("AA/aa"), + FileInfo::File("BB/bb")}, + {PT("00"), PT("AA", {PT("AA/aa")}), PT("BB/bb")}); + ExpectForestIs({FileInfo::Dir("AA"), FileInfo::Dir("AA/BB"), FileInfo::File("AA/BB/0"), + FileInfo::Dir("CC"), FileInfo::Dir("CC/BB"), FileInfo::File("CC/BB/0")}, + {PT("AA", {PT("AA/BB", {PT("AA/BB/0")})}), + PT("CC", {PT("CC/BB", {PT("CC/BB/0")})})}); +} + +TEST(Forest, HourlyETL) { + // This test mimics a scenario where an ETL dumps hourly files in a structure + // `$year/$month/$day/$hour/*.parquet`. + constexpr int64_t kYears = 3; + constexpr int64_t kMonthsPerYear = 12; + constexpr int64_t kDaysPerMonth = 31; + constexpr int64_t kHoursPerDay = 24; + constexpr int64_t kFilesPerHour = 2; + + // Avoid constructing strings + std::vector<std::string> numbers{kDaysPerMonth + 1}; + for (size_t i = 0; i < numbers.size(); i++) { + numbers[i] = std::to_string(i); + if (numbers[i].size() == 1) { + numbers[i] = "0" + numbers[i]; + } + } + + auto join = [](const std::vector<std::string>& path) { + if (path.empty()) return std::string(""); + std::string result = path[0]; + for (const auto& part : path) { + result += '/'; + result += part; + } + return result; + }; + + std::vector<FileInfo> infos; + + std::vector<PT> forest; + for (int64_t year = 0; year < kYears; year++) { + auto year_str = std::to_string(year + 2000); + auto year_dir = FileInfo::Dir(year_str); + infos.push_back(year_dir); + + std::vector<PT> months; + for (int64_t month = 0; month < kMonthsPerYear; month++) { + auto month_str = join({year_str, numbers[month + 1]}); + auto month_dir = FileInfo::Dir(month_str); + infos.push_back(month_dir); + + std::vector<PT> days; + for (int64_t day = 0; day < kDaysPerMonth; day++) { + auto day_str = join({month_str, numbers[day + 1]}); + auto day_dir = FileInfo::Dir(day_str); + infos.push_back(day_dir); + + std::vector<PT> hours; + for (int64_t hour = 0; hour < kHoursPerDay; hour++) { + auto hour_str = join({day_str, numbers[hour]}); + auto hour_dir = FileInfo::Dir(hour_str); + infos.push_back(hour_dir); + + std::vector<PT> files; + for (int64_t file = 0; file < kFilesPerHour; file++) { + auto file_str = join({hour_str, numbers[file] + ".parquet"}); + auto file_fd = FileInfo::File(file_str); + infos.push_back(file_fd); + files.emplace_back(file_str); + } + + auto hour_pt = PT(hour_str, std::move(files)); + hours.push_back(hour_pt); + } + + auto day_pt = PT(day_str, std::move(hours)); + days.push_back(day_pt); + } + + auto month_pt = PT(month_str, std::move(days)); + months.push_back(month_pt); + } + + auto year_pt = PT(year_str, std::move(months)); + forest.push_back(year_pt); + } + + ExpectForestIs(infos, forest); +} + +TEST(Forest, Visit) { + using Infos = std::vector<FileInfo>; + + for (auto infos : + {Infos{}, Infos{FileInfo::Dir("A"), FileInfo::File("A/a")}, + Infos{FileInfo::Dir("AA"), FileInfo::Dir("AA/BB"), FileInfo::File("AA/BB/0"), + FileInfo::Dir("CC"), FileInfo::Dir("CC/BB"), FileInfo::File("CC/BB/0")}}) { + ASSERT_TRUE(std::is_sorted(infos.begin(), infos.end(), FileInfo::ByPath)); + + auto forest = MakeForest(&infos); + + auto ignore_post = [](Forest::Ref) {}; + + // noop is fine + ASSERT_OK( + forest.Visit([](Forest::Ref) -> Result<bool> { return false; }, ignore_post)); + + // Should propagate failure + if (forest.size() != 0) { + ASSERT_RAISES( + Invalid, + forest.Visit([](Forest::Ref) -> Result<bool> { return Status::Invalid(""); }, + ignore_post)); + } + + // Ensure basic visit of all nodes + int i = 0; + ASSERT_OK(forest.Visit( + [&](Forest::Ref ref) -> Result<bool> { + EXPECT_EQ(ref.i, i); + ++i; + return true; + }, + ignore_post)); + + // Visit only directories + Infos actual_dirs; + ASSERT_OK(forest.Visit( + [&](Forest::Ref ref) -> Result<bool> { + if (!infos[ref.i].is_dir) { + return false; + } + actual_dirs.push_back(infos[ref.i]); + return true; + }, + ignore_post)); + + Infos expected_dirs; + for (const auto& info : infos) { + if (info.is_dir) { + expected_dirs.push_back(info); + } + } + EXPECT_THAT(actual_dirs, ContainerEq(expected_dirs)); + } +} + +TEST(Subtree, EncodeExpression) { + SubtreeImpl tree; + ASSERT_EQ(0, tree.GetOrInsert(equal(field_ref("a"), literal("1")))); + // Should be idempotent + ASSERT_EQ(0, tree.GetOrInsert(equal(field_ref("a"), literal("1")))); + ASSERT_EQ(equal(field_ref("a"), literal("1")), tree.code_to_expr_[0]); + + SubtreeImpl::expression_codes codes; + auto conj = + and_(equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2"))); + tree.EncodeConjunctionMembers(conj, &codes); + ASSERT_EQ(SubtreeImpl::expression_codes({0, 1}), codes); + + codes.clear(); + conj = or_(equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2"))); + tree.EncodeConjunctionMembers(conj, &codes); + ASSERT_EQ(SubtreeImpl::expression_codes({2}), codes); +} + +TEST(Subtree, GetSubtreeExpression) { + SubtreeImpl tree; + const auto expr_a = equal(field_ref("a"), literal("1")); + const auto expr_b = equal(field_ref("b"), literal("2")); + const auto code_a = tree.GetOrInsert(expr_a); + const auto code_b = tree.GetOrInsert(expr_b); + ASSERT_EQ(expr_a, + tree.GetSubtreeExpression(SubtreeImpl::Encoded{util::nullopt, {code_a}})); + ASSERT_EQ(expr_b, tree.GetSubtreeExpression( + SubtreeImpl::Encoded{util::nullopt, {code_a, code_b}})); +} + +class FakeFragment { + public: + explicit FakeFragment(Expression partition_expression) + : partition_expression_(partition_expression) {} + const Expression& partition_expression() const { return partition_expression_; } + + private: + Expression partition_expression_; +}; + +TEST(Subtree, EncodeFragments) { + const auto expr_a = + and_(equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2"))); + const auto expr_b = + and_(equal(field_ref("a"), literal("2")), equal(field_ref("b"), literal("3"))); + std::vector<std::shared_ptr<FakeFragment>> fragments; + fragments.push_back(std::make_shared<FakeFragment>(expr_a)); + fragments.push_back(std::make_shared<FakeFragment>(expr_b)); + + SubtreeImpl tree; + auto encoded = tree.EncodeGuarantees( + [&](int index) { return fragments[index]->partition_expression(); }, + static_cast<int>(fragments.size())); + EXPECT_THAT( + tree.code_to_expr_, + ContainerEq(std::vector<compute::Expression>{ + equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2")), + equal(field_ref("a"), literal("2")), equal(field_ref("b"), literal("3"))})); + EXPECT_THAT( + encoded, + testing::UnorderedElementsAreArray({ + SubtreeImpl::Encoded{util::make_optional<int>(0), + SubtreeImpl::expression_codes({0, 1})}, + SubtreeImpl::Encoded{util::make_optional<int>(1), + SubtreeImpl::expression_codes({2, 3})}, + SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({0})}, + SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({2})}, + SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({0, 1})}, + SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({2, 3})}, + })); +} +} // namespace compute +} // namespace arrow |