summaryrefslogtreecommitdiffstats
path: root/src/arrow/python/pyarrow/includes/libarrow_dataset.pxd
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/python/pyarrow/includes/libarrow_dataset.pxd')
-rw-r--r--src/arrow/python/pyarrow/includes/libarrow_dataset.pxd478
1 files changed, 478 insertions, 0 deletions
diff --git a/src/arrow/python/pyarrow/includes/libarrow_dataset.pxd b/src/arrow/python/pyarrow/includes/libarrow_dataset.pxd
new file mode 100644
index 000000000..abc79fea8
--- /dev/null
+++ b/src/arrow/python/pyarrow/includes/libarrow_dataset.pxd
@@ -0,0 +1,478 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# distutils: language = c++
+
+from libcpp.unordered_map cimport unordered_map
+
+from pyarrow.includes.common cimport *
+from pyarrow.includes.libarrow cimport *
+from pyarrow.includes.libarrow_fs cimport *
+from pyarrow._parquet cimport *
+
+
+cdef extern from "arrow/api.h" namespace "arrow" nogil:
+
+ cdef cppclass CRecordBatchIterator "arrow::RecordBatchIterator"(
+ CIterator[shared_ptr[CRecordBatch]]):
+ pass
+
+
+cdef extern from * namespace "arrow::compute":
+ # inlined from expression_internal.h to avoid
+ # proliferation of #include <unordered_map>
+ """
+ #include <unordered_map>
+
+ #include "arrow/type.h"
+ #include "arrow/datum.h"
+
+ namespace arrow {
+ namespace compute {
+ struct KnownFieldValues {
+ std::unordered_map<FieldRef, Datum, FieldRef::Hash> map;
+ };
+ } // namespace compute
+ } // namespace arrow
+ """
+ cdef struct CKnownFieldValues "arrow::compute::KnownFieldValues":
+ unordered_map[CFieldRef, CDatum, CFieldRefHash] map
+
+cdef extern from "arrow/compute/exec/expression.h" \
+ namespace "arrow::compute" nogil:
+
+ cdef cppclass CExpression "arrow::compute::Expression":
+ c_bool Equals(const CExpression& other) const
+ c_string ToString() const
+ CResult[CExpression] Bind(const CSchema&)
+
+ cdef CExpression CMakeScalarExpression \
+ "arrow::compute::literal"(shared_ptr[CScalar] value)
+
+ cdef CExpression CMakeFieldExpression \
+ "arrow::compute::field_ref"(c_string name)
+
+ cdef CExpression CMakeCallExpression \
+ "arrow::compute::call"(c_string function,
+ vector[CExpression] arguments,
+ shared_ptr[CFunctionOptions] options)
+
+ cdef CResult[shared_ptr[CBuffer]] CSerializeExpression \
+ "arrow::compute::Serialize"(const CExpression&)
+
+ cdef CResult[CExpression] CDeserializeExpression \
+ "arrow::compute::Deserialize"(shared_ptr[CBuffer])
+
+ cdef CResult[CKnownFieldValues] \
+ CExtractKnownFieldValues "arrow::compute::ExtractKnownFieldValues"(
+ const CExpression& partition_expression)
+
+ctypedef CStatus cb_writer_finish_internal(CFileWriter*)
+ctypedef void cb_writer_finish(dict, CFileWriter*)
+
+cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
+
+ cdef enum ExistingDataBehavior" arrow::dataset::ExistingDataBehavior":
+ ExistingDataBehavior_DELETE_MATCHING" \
+ arrow::dataset::ExistingDataBehavior::kDeleteMatchingPartitions"
+ ExistingDataBehavior_OVERWRITE_OR_IGNORE" \
+ arrow::dataset::ExistingDataBehavior::kOverwriteOrIgnore"
+ ExistingDataBehavior_ERROR" \
+ arrow::dataset::ExistingDataBehavior::kError"
+
+ cdef cppclass CScanOptions "arrow::dataset::ScanOptions":
+ @staticmethod
+ shared_ptr[CScanOptions] Make(shared_ptr[CSchema] schema)
+
+ shared_ptr[CSchema] dataset_schema
+ shared_ptr[CSchema] projected_schema
+
+ cdef cppclass CFragmentScanOptions "arrow::dataset::FragmentScanOptions":
+ c_string type_name() const
+
+ ctypedef CIterator[shared_ptr[CScanTask]] CScanTaskIterator \
+ "arrow::dataset::ScanTaskIterator"
+
+ cdef cppclass CScanTask" arrow::dataset::ScanTask":
+ CResult[CRecordBatchIterator] Execute()
+
+ cdef cppclass CFragment "arrow::dataset::Fragment":
+ CResult[shared_ptr[CSchema]] ReadPhysicalSchema()
+ CResult[CScanTaskIterator] Scan(shared_ptr[CScanOptions] options)
+ c_bool splittable() const
+ c_string type_name() const
+ const CExpression& partition_expression() const
+
+ ctypedef vector[shared_ptr[CFragment]] CFragmentVector \
+ "arrow::dataset::FragmentVector"
+
+ ctypedef CIterator[shared_ptr[CFragment]] CFragmentIterator \
+ "arrow::dataset::FragmentIterator"
+
+ cdef cppclass CInMemoryFragment "arrow::dataset::InMemoryFragment"(
+ CFragment):
+ CInMemoryFragment(vector[shared_ptr[CRecordBatch]] record_batches,
+ CExpression partition_expression)
+
+ cdef cppclass CTaggedRecordBatch "arrow::dataset::TaggedRecordBatch":
+ shared_ptr[CRecordBatch] record_batch
+ shared_ptr[CFragment] fragment
+
+ ctypedef CIterator[CTaggedRecordBatch] CTaggedRecordBatchIterator \
+ "arrow::dataset::TaggedRecordBatchIterator"
+
+ cdef cppclass CScanner "arrow::dataset::Scanner":
+ CScanner(shared_ptr[CDataset], shared_ptr[CScanOptions])
+ CScanner(shared_ptr[CFragment], shared_ptr[CScanOptions])
+ CResult[CScanTaskIterator] Scan()
+ CResult[CTaggedRecordBatchIterator] ScanBatches()
+ CResult[shared_ptr[CTable]] ToTable()
+ CResult[shared_ptr[CTable]] TakeRows(const CArray& indices)
+ CResult[shared_ptr[CTable]] Head(int64_t num_rows)
+ CResult[int64_t] CountRows()
+ CResult[CFragmentIterator] GetFragments()
+ CResult[shared_ptr[CRecordBatchReader]] ToRecordBatchReader()
+ const shared_ptr[CScanOptions]& options()
+
+ cdef cppclass CScannerBuilder "arrow::dataset::ScannerBuilder":
+ CScannerBuilder(shared_ptr[CDataset],
+ shared_ptr[CScanOptions] scan_options)
+ CScannerBuilder(shared_ptr[CSchema], shared_ptr[CFragment],
+ shared_ptr[CScanOptions] scan_options)
+
+ @staticmethod
+ shared_ptr[CScannerBuilder] FromRecordBatchReader(
+ shared_ptr[CRecordBatchReader] reader)
+ CStatus ProjectColumns "Project"(const vector[c_string]& columns)
+ CStatus Project(vector[CExpression]& exprs, vector[c_string]& columns)
+ CStatus Filter(CExpression filter)
+ CStatus UseThreads(c_bool use_threads)
+ CStatus UseAsync(c_bool use_async)
+ CStatus Pool(CMemoryPool* pool)
+ CStatus BatchSize(int64_t batch_size)
+ CStatus FragmentScanOptions(
+ shared_ptr[CFragmentScanOptions] fragment_scan_options)
+ CResult[shared_ptr[CScanner]] Finish()
+ shared_ptr[CSchema] schema() const
+
+ ctypedef vector[shared_ptr[CDataset]] CDatasetVector \
+ "arrow::dataset::DatasetVector"
+
+ cdef cppclass CDataset "arrow::dataset::Dataset":
+ const shared_ptr[CSchema] & schema()
+ CResult[CFragmentIterator] GetFragments()
+ CResult[CFragmentIterator] GetFragments(CExpression predicate)
+ const CExpression & partition_expression()
+ c_string type_name()
+
+ CResult[shared_ptr[CDataset]] ReplaceSchema(shared_ptr[CSchema])
+
+ CResult[shared_ptr[CScannerBuilder]] NewScan()
+
+ cdef cppclass CInMemoryDataset "arrow::dataset::InMemoryDataset"(
+ CDataset):
+ CInMemoryDataset(shared_ptr[CRecordBatchReader])
+ CInMemoryDataset(shared_ptr[CTable])
+
+ cdef cppclass CUnionDataset "arrow::dataset::UnionDataset"(
+ CDataset):
+ @staticmethod
+ CResult[shared_ptr[CUnionDataset]] Make(shared_ptr[CSchema] schema,
+ CDatasetVector children)
+
+ const CDatasetVector& children() const
+
+ cdef cppclass CInspectOptions "arrow::dataset::InspectOptions":
+ int fragments
+
+ cdef cppclass CFinishOptions "arrow::dataset::FinishOptions":
+ shared_ptr[CSchema] schema
+ CInspectOptions inspect_options
+ c_bool validate_fragments
+
+ cdef cppclass CDatasetFactory "arrow::dataset::DatasetFactory":
+ CResult[vector[shared_ptr[CSchema]]] InspectSchemas(CInspectOptions)
+ CResult[shared_ptr[CSchema]] Inspect(CInspectOptions)
+ CResult[shared_ptr[CDataset]] FinishWithSchema "Finish"(
+ const shared_ptr[CSchema]& schema)
+ CResult[shared_ptr[CDataset]] Finish()
+ const CExpression& root_partition()
+ CStatus SetRootPartition(CExpression partition)
+
+ cdef cppclass CUnionDatasetFactory "arrow::dataset::UnionDatasetFactory":
+ @staticmethod
+ CResult[shared_ptr[CDatasetFactory]] Make(
+ vector[shared_ptr[CDatasetFactory]] factories)
+
+ cdef cppclass CFileSource "arrow::dataset::FileSource":
+ const c_string& path() const
+ const shared_ptr[CFileSystem]& filesystem() const
+ const shared_ptr[CBuffer]& buffer() const
+ # HACK: Cython can't handle all the overloads so don't declare them.
+ # This means invalid construction of CFileSource won't be caught in
+ # the C++ generation phase (though it will still be caught when
+ # the generated C++ is compiled).
+ CFileSource(...)
+
+ cdef cppclass CFileWriteOptions \
+ "arrow::dataset::FileWriteOptions":
+ const shared_ptr[CFileFormat]& format() const
+ c_string type_name() const
+
+ cdef cppclass CFileWriter \
+ "arrow::dataset::FileWriter":
+ const shared_ptr[CFileFormat]& format() const
+ const shared_ptr[CSchema]& schema() const
+ const shared_ptr[CFileWriteOptions]& options() const
+ const CFileLocator& destination() const
+
+ cdef cppclass CParquetFileWriter \
+ "arrow::dataset::ParquetFileWriter"(CFileWriter):
+ const shared_ptr[FileWriter]& parquet_writer() const
+
+ cdef cppclass CFileFormat "arrow::dataset::FileFormat":
+ shared_ptr[CFragmentScanOptions] default_fragment_scan_options
+ c_string type_name() const
+ CResult[shared_ptr[CSchema]] Inspect(const CFileSource&) const
+ CResult[shared_ptr[CFileFragment]] MakeFragment(
+ CFileSource source,
+ CExpression partition_expression,
+ shared_ptr[CSchema] physical_schema)
+ shared_ptr[CFileWriteOptions] DefaultWriteOptions()
+
+ cdef cppclass CFileFragment "arrow::dataset::FileFragment"(
+ CFragment):
+ const CFileSource& source() const
+ const shared_ptr[CFileFormat]& format() const
+
+ cdef cppclass CParquetFileWriteOptions \
+ "arrow::dataset::ParquetFileWriteOptions"(CFileWriteOptions):
+ shared_ptr[WriterProperties] writer_properties
+ shared_ptr[ArrowWriterProperties] arrow_writer_properties
+
+ cdef cppclass CParquetFileFragment "arrow::dataset::ParquetFileFragment"(
+ CFileFragment):
+ const vector[int]& row_groups() const
+ shared_ptr[CFileMetaData] metadata() const
+ CResult[vector[shared_ptr[CFragment]]] SplitByRowGroup(
+ CExpression predicate)
+ CResult[shared_ptr[CFragment]] SubsetWithFilter "Subset"(
+ CExpression predicate)
+ CResult[shared_ptr[CFragment]] SubsetWithIds "Subset"(
+ vector[int] row_group_ids)
+ CStatus EnsureCompleteMetadata()
+
+ cdef cppclass CFileSystemDatasetWriteOptions \
+ "arrow::dataset::FileSystemDatasetWriteOptions":
+ shared_ptr[CFileWriteOptions] file_write_options
+ shared_ptr[CFileSystem] filesystem
+ c_string base_dir
+ shared_ptr[CPartitioning] partitioning
+ int max_partitions
+ c_string basename_template
+ function[cb_writer_finish_internal] writer_pre_finish
+ function[cb_writer_finish_internal] writer_post_finish
+ ExistingDataBehavior existing_data_behavior
+
+ cdef cppclass CFileSystemDataset \
+ "arrow::dataset::FileSystemDataset"(CDataset):
+ @staticmethod
+ CResult[shared_ptr[CDataset]] Make(
+ shared_ptr[CSchema] schema,
+ CExpression source_partition,
+ shared_ptr[CFileFormat] format,
+ shared_ptr[CFileSystem] filesystem,
+ vector[shared_ptr[CFileFragment]] fragments)
+
+ @staticmethod
+ CStatus Write(
+ const CFileSystemDatasetWriteOptions& write_options,
+ shared_ptr[CScanner] scanner)
+
+ c_string type()
+ vector[c_string] files()
+ const shared_ptr[CFileFormat]& format() const
+ const shared_ptr[CFileSystem]& filesystem() const
+ const shared_ptr[CPartitioning]& partitioning() const
+
+ cdef cppclass CParquetFileFormatReaderOptions \
+ "arrow::dataset::ParquetFileFormat::ReaderOptions":
+ unordered_set[c_string] dict_columns
+ TimeUnit coerce_int96_timestamp_unit
+
+ cdef cppclass CParquetFileFormat "arrow::dataset::ParquetFileFormat"(
+ CFileFormat):
+ CParquetFileFormatReaderOptions reader_options
+ CResult[shared_ptr[CFileFragment]] MakeFragment(
+ CFileSource source,
+ CExpression partition_expression,
+ shared_ptr[CSchema] physical_schema,
+ vector[int] row_groups)
+
+ cdef cppclass CParquetFragmentScanOptions \
+ "arrow::dataset::ParquetFragmentScanOptions"(CFragmentScanOptions):
+ shared_ptr[CReaderProperties] reader_properties
+ shared_ptr[ArrowReaderProperties] arrow_reader_properties
+ c_bool enable_parallel_column_conversion
+
+ cdef cppclass CIpcFileWriteOptions \
+ "arrow::dataset::IpcFileWriteOptions"(CFileWriteOptions):
+ pass
+
+ cdef cppclass CIpcFileFormat "arrow::dataset::IpcFileFormat"(
+ CFileFormat):
+ pass
+
+ cdef cppclass COrcFileFormat "arrow::dataset::OrcFileFormat"(
+ CFileFormat):
+ pass
+
+ cdef cppclass CCsvFileWriteOptions \
+ "arrow::dataset::CsvFileWriteOptions"(CFileWriteOptions):
+ shared_ptr[CCSVWriteOptions] write_options
+ CMemoryPool* pool
+
+ cdef cppclass CCsvFileFormat "arrow::dataset::CsvFileFormat"(
+ CFileFormat):
+ CCSVParseOptions parse_options
+
+ cdef cppclass CCsvFragmentScanOptions \
+ "arrow::dataset::CsvFragmentScanOptions"(CFragmentScanOptions):
+ CCSVConvertOptions convert_options
+ CCSVReadOptions read_options
+
+ cdef cppclass CPartitioning "arrow::dataset::Partitioning":
+ c_string type_name() const
+ CResult[CExpression] Parse(const c_string & path) const
+ const shared_ptr[CSchema] & schema()
+
+ cdef cppclass CSegmentEncoding" arrow::dataset::SegmentEncoding":
+ pass
+
+ CSegmentEncoding CSegmentEncodingNone\
+ " arrow::dataset::SegmentEncoding::None"
+ CSegmentEncoding CSegmentEncodingUri\
+ " arrow::dataset::SegmentEncoding::Uri"
+
+ cdef cppclass CKeyValuePartitioningOptions \
+ "arrow::dataset::KeyValuePartitioningOptions":
+ CSegmentEncoding segment_encoding
+
+ cdef cppclass CHivePartitioningOptions \
+ "arrow::dataset::HivePartitioningOptions":
+ CSegmentEncoding segment_encoding
+ c_string null_fallback
+
+ cdef cppclass CPartitioningFactoryOptions \
+ "arrow::dataset::PartitioningFactoryOptions":
+ c_bool infer_dictionary
+ shared_ptr[CSchema] schema
+ CSegmentEncoding segment_encoding
+
+ cdef cppclass CHivePartitioningFactoryOptions \
+ "arrow::dataset::HivePartitioningFactoryOptions":
+ c_bool infer_dictionary
+ c_string null_fallback
+ shared_ptr[CSchema] schema
+ CSegmentEncoding segment_encoding
+
+ cdef cppclass CPartitioningFactory "arrow::dataset::PartitioningFactory":
+ c_string type_name() const
+
+ cdef cppclass CDirectoryPartitioning \
+ "arrow::dataset::DirectoryPartitioning"(CPartitioning):
+ CDirectoryPartitioning(shared_ptr[CSchema] schema,
+ vector[shared_ptr[CArray]] dictionaries)
+
+ @staticmethod
+ shared_ptr[CPartitioningFactory] MakeFactory(
+ vector[c_string] field_names, CPartitioningFactoryOptions)
+
+ vector[shared_ptr[CArray]] dictionaries() const
+
+ cdef cppclass CHivePartitioning \
+ "arrow::dataset::HivePartitioning"(CPartitioning):
+ CHivePartitioning(shared_ptr[CSchema] schema,
+ vector[shared_ptr[CArray]] dictionaries,
+ CHivePartitioningOptions options)
+
+ @staticmethod
+ shared_ptr[CPartitioningFactory] MakeFactory(
+ CHivePartitioningFactoryOptions)
+
+ vector[shared_ptr[CArray]] dictionaries() const
+
+ cdef cppclass CPartitioningOrFactory \
+ "arrow::dataset::PartitioningOrFactory":
+ CPartitioningOrFactory(shared_ptr[CPartitioning])
+ CPartitioningOrFactory(shared_ptr[CPartitioningFactory])
+ CPartitioningOrFactory & operator = (shared_ptr[CPartitioning])
+ CPartitioningOrFactory & operator = (
+ shared_ptr[CPartitioningFactory])
+ shared_ptr[CPartitioning] partitioning() const
+ shared_ptr[CPartitioningFactory] factory() const
+
+ cdef cppclass CFileSystemFactoryOptions \
+ "arrow::dataset::FileSystemFactoryOptions":
+ CPartitioningOrFactory partitioning
+ c_string partition_base_dir
+ c_bool exclude_invalid_files
+ vector[c_string] selector_ignore_prefixes
+
+ cdef cppclass CFileSystemDatasetFactory \
+ "arrow::dataset::FileSystemDatasetFactory"(
+ CDatasetFactory):
+ @staticmethod
+ CResult[shared_ptr[CDatasetFactory]] MakeFromPaths "Make"(
+ shared_ptr[CFileSystem] filesystem,
+ vector[c_string] paths,
+ shared_ptr[CFileFormat] format,
+ CFileSystemFactoryOptions options
+ )
+
+ @staticmethod
+ CResult[shared_ptr[CDatasetFactory]] MakeFromSelector "Make"(
+ shared_ptr[CFileSystem] filesystem,
+ CFileSelector,
+ shared_ptr[CFileFormat] format,
+ CFileSystemFactoryOptions options
+ )
+
+ cdef cppclass CParquetFactoryOptions \
+ "arrow::dataset::ParquetFactoryOptions":
+ CPartitioningOrFactory partitioning
+ c_string partition_base_dir
+ c_bool validate_column_chunk_paths
+
+ cdef cppclass CParquetDatasetFactory \
+ "arrow::dataset::ParquetDatasetFactory"(CDatasetFactory):
+ @staticmethod
+ CResult[shared_ptr[CDatasetFactory]] MakeFromMetaDataPath "Make"(
+ const c_string& metadata_path,
+ shared_ptr[CFileSystem] filesystem,
+ shared_ptr[CParquetFileFormat] format,
+ CParquetFactoryOptions options
+ )
+
+ @staticmethod
+ CResult[shared_ptr[CDatasetFactory]] MakeFromMetaDataSource "Make"(
+ const CFileSource& metadata_path,
+ const c_string& base_path,
+ shared_ptr[CFileSystem] filesystem,
+ shared_ptr[CParquetFileFormat] format,
+ CParquetFactoryOptions options
+ )