summaryrefslogtreecommitdiffstats
path: root/src/arrow/cpp/src/arrow/array/builder_base.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/cpp/src/arrow/array/builder_base.h')
-rw-r--r--src/arrow/cpp/src/arrow/array/builder_base.h307
1 files changed, 307 insertions, 0 deletions
diff --git a/src/arrow/cpp/src/arrow/array/builder_base.h b/src/arrow/cpp/src/arrow/array/builder_base.h
new file mode 100644
index 000000000..a513bf0f4
--- /dev/null
+++ b/src/arrow/cpp/src/arrow/array/builder_base.h
@@ -0,0 +1,307 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm> // IWYU pragma: keep
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_primitive.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+constexpr int64_t kMinBuilderCapacity = 1 << 5;
+constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;
+
+/// Base class for all data array builders.
+///
+/// This class provides a facilities for incrementally building the null bitmap
+/// (see Append methods) and as a side effect the current number of slots and
+/// the null count.
+///
+/// \note Users are expected to use builders as one of the concrete types below.
+/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
+class ARROW_EXPORT ArrayBuilder {
+ public:
+ explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {}
+
+ ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);
+
+ virtual ~ArrayBuilder() = default;
+
+ /// For nested types. Since the objects are owned by this class instance, we
+ /// skip shared pointers and just return a raw pointer
+ ArrayBuilder* child(int i) { return children_[i].get(); }
+
+ const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; }
+
+ int num_children() const { return static_cast<int>(children_.size()); }
+
+ virtual int64_t length() const { return length_; }
+ int64_t null_count() const { return null_count_; }
+ int64_t capacity() const { return capacity_; }
+
+ /// \brief Ensure that enough memory has been allocated to fit the indicated
+ /// number of total elements in the builder, including any that have already
+ /// been appended. Does not account for reallocations that may be due to
+ /// variable size data, like binary values. To make space for incremental
+ /// appends, use Reserve instead.
+ ///
+ /// \param[in] capacity the minimum number of total array values to
+ /// accommodate. Must be greater than the current capacity.
+ /// \return Status
+ virtual Status Resize(int64_t capacity);
+
+ /// \brief Ensure that there is enough space allocated to append the indicated
+ /// number of elements without any further reallocation. Overallocation is
+ /// used in order to minimize the impact of incremental Reserve() calls.
+ /// Note that additional_capacity is relative to the current number of elements
+ /// rather than to the current capacity, so calls to Reserve() which are not
+ /// interspersed with addition of new elements may not increase the capacity.
+ ///
+ /// \param[in] additional_capacity the number of additional array values
+ /// \return Status
+ Status Reserve(int64_t additional_capacity) {
+ auto current_capacity = capacity();
+ auto min_capacity = length() + additional_capacity;
+ if (min_capacity <= current_capacity) return Status::OK();
+
+ // leave growth factor up to BufferBuilder
+ auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity);
+ return Resize(new_capacity);
+ }
+
+ /// Reset the builder.
+ virtual void Reset();
+
+ /// \brief Append a null value to builder
+ virtual Status AppendNull() = 0;
+ /// \brief Append a number of null values to builder
+ virtual Status AppendNulls(int64_t length) = 0;
+
+ /// \brief Append a non-null value to builder
+ ///
+ /// The appended value is an implementation detail, but the corresponding
+ /// memory slot is guaranteed to be initialized.
+ /// This method is useful when appending a null value to a parent nested type.
+ virtual Status AppendEmptyValue() = 0;
+
+ /// \brief Append a number of non-null values to builder
+ ///
+ /// The appended values are an implementation detail, but the corresponding
+ /// memory slot is guaranteed to be initialized.
+ /// This method is useful when appending null values to a parent nested type.
+ virtual Status AppendEmptyValues(int64_t length) = 0;
+
+ /// \brief Append a value from a scalar
+ Status AppendScalar(const Scalar& scalar) { return AppendScalar(scalar, 1); }
+ virtual Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
+ virtual Status AppendScalars(const ScalarVector& scalars);
+
+ /// \brief Append a range of values from an array.
+ ///
+ /// The given array must be the same type as the builder.
+ virtual Status AppendArraySlice(const ArrayData& array, int64_t offset,
+ int64_t length) {
+ return Status::NotImplemented("AppendArraySlice for builder for ", *type());
+ }
+
+ /// For cases where raw data was memcpy'd into the internal buffers, allows us
+ /// to advance the length of the builder. It is your responsibility to use
+ /// this function responsibly.
+ ARROW_DEPRECATED(
+ "Deprecated in 6.0.0. ArrayBuilder::Advance is poorly supported and mostly "
+ "untested.\nFor low-level control over buffer construction, use BufferBuilder "
+ "or TypedBufferBuilder directly.")
+ Status Advance(int64_t elements);
+
+ /// \brief Return result of builder as an internal generic ArrayData
+ /// object. Resets builder except for dictionary builder
+ ///
+ /// \param[out] out the finalized ArrayData object
+ /// \return Status
+ virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
+
+ /// \brief Return result of builder as an Array object.
+ ///
+ /// The builder is reset except for DictionaryBuilder.
+ ///
+ /// \param[out] out the finalized Array object
+ /// \return Status
+ Status Finish(std::shared_ptr<Array>* out);
+
+ /// \brief Return result of builder as an Array object.
+ ///
+ /// The builder is reset except for DictionaryBuilder.
+ ///
+ /// \return The finalized Array object
+ Result<std::shared_ptr<Array>> Finish();
+
+ /// \brief Return the type of the built Array
+ virtual std::shared_ptr<DataType> type() const = 0;
+
+ protected:
+ /// Append to null bitmap
+ Status AppendToBitmap(bool is_valid);
+
+ /// Vector append. Treat each zero byte as a null. If valid_bytes is null
+ /// assume all of length bits are valid.
+ Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
+
+ /// Uniform append. Append N times the same validity bit.
+ Status AppendToBitmap(int64_t num_bits, bool value);
+
+ /// Set the next length bits to not null (i.e. valid).
+ Status SetNotNull(int64_t length);
+
+ // Unsafe operations (don't check capacity/don't resize)
+
+ void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }
+
+ // Append to null bitmap, update the length
+ void UnsafeAppendToBitmap(bool is_valid) {
+ null_bitmap_builder_.UnsafeAppend(is_valid);
+ ++length_;
+ if (!is_valid) ++null_count_;
+ }
+
+ // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null
+ // assume all of length bits are valid.
+ void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
+ if (valid_bytes == NULLPTR) {
+ return UnsafeSetNotNull(length);
+ }
+ null_bitmap_builder_.UnsafeAppend(valid_bytes, length);
+ length_ += length;
+ null_count_ = null_bitmap_builder_.false_count();
+ }
+
+ // Vector append. Copy from a given bitmap. If bitmap is null assume
+ // all of length bits are valid.
+ void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) {
+ if (bitmap == NULLPTR) {
+ return UnsafeSetNotNull(length);
+ }
+ null_bitmap_builder_.UnsafeAppend(bitmap, offset, length);
+ length_ += length;
+ null_count_ = null_bitmap_builder_.false_count();
+ }
+
+ // Append the same validity value a given number of times.
+ void UnsafeAppendToBitmap(const int64_t num_bits, bool value) {
+ if (value) {
+ UnsafeSetNotNull(num_bits);
+ } else {
+ UnsafeSetNull(num_bits);
+ }
+ }
+
+ void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
+
+ // Set the next validity bits to not null (i.e. valid).
+ void UnsafeSetNotNull(int64_t length);
+
+ // Set the next validity bits to null (i.e. invalid).
+ void UnsafeSetNull(int64_t length);
+
+ static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);
+
+ /// \brief Finish to an array of the specified ArrayType
+ template <typename ArrayType>
+ Status FinishTyped(std::shared_ptr<ArrayType>* out) {
+ std::shared_ptr<Array> out_untyped;
+ ARROW_RETURN_NOT_OK(Finish(&out_untyped));
+ *out = std::static_pointer_cast<ArrayType>(std::move(out_untyped));
+ return Status::OK();
+ }
+
+ // Check the requested capacity for validity
+ Status CheckCapacity(int64_t new_capacity) {
+ if (ARROW_PREDICT_FALSE(new_capacity < 0)) {
+ return Status::Invalid(
+ "Resize capacity must be positive (requested: ", new_capacity, ")");
+ }
+
+ if (ARROW_PREDICT_FALSE(new_capacity < length_)) {
+ return Status::Invalid("Resize cannot downsize (requested: ", new_capacity,
+ ", current length: ", length_, ")");
+ }
+
+ return Status::OK();
+ }
+
+ // Check for array type
+ Status CheckArrayType(const std::shared_ptr<DataType>& expected_type,
+ const Array& array, const char* message);
+ Status CheckArrayType(Type::type expected_type, const Array& array,
+ const char* message);
+
+ MemoryPool* pool_;
+
+ TypedBufferBuilder<bool> null_bitmap_builder_;
+ int64_t null_count_ = 0;
+
+ // Array length, so far. Also, the index of the next element to be added
+ int64_t length_ = 0;
+ int64_t capacity_ = 0;
+
+ // Child value array builders. These are owned by this class
+ std::vector<std::shared_ptr<ArrayBuilder>> children_;
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
+};
+
+/// \brief Construct an empty ArrayBuilder corresponding to the data
+/// type
+/// \param[in] pool the MemoryPool to use for allocations
+/// \param[in] type the data type to create the builder for
+/// \param[out] out the created ArrayBuilder
+ARROW_EXPORT
+Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ std::unique_ptr<ArrayBuilder>* out);
+
+/// \brief Construct an empty ArrayBuilder corresponding to the data
+/// type, where any top-level or nested dictionary builders return the
+/// exact index type specified by the type.
+ARROW_EXPORT
+Status MakeBuilderExactIndex(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ std::unique_ptr<ArrayBuilder>* out);
+
+/// \brief Construct an empty DictionaryBuilder initialized optionally
+/// with a pre-existing dictionary
+/// \param[in] pool the MemoryPool to use for allocations
+/// \param[in] type the dictionary type to create the builder for
+/// \param[in] dictionary the initial dictionary, if any. May be nullptr
+/// \param[out] out the created ArrayBuilder
+ARROW_EXPORT
+Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Array>& dictionary,
+ std::unique_ptr<ArrayBuilder>* out);
+
+} // namespace arrow