Diffstat (limited to 'src/arrow/python/pyarrow/includes/libarrow_dataset.pxd')
-rw-r--r-- | src/arrow/python/pyarrow/includes/libarrow_dataset.pxd | 478 |
1 file changed, 478 insertions, 0 deletions
diff --git a/src/arrow/python/pyarrow/includes/libarrow_dataset.pxd b/src/arrow/python/pyarrow/includes/libarrow_dataset.pxd
new file mode 100644
index 000000000..abc79fea8
--- /dev/null
+++ b/src/arrow/python/pyarrow/includes/libarrow_dataset.pxd
@@ -0,0 +1,478 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# distutils: language = c++
+
+from libcpp.unordered_map cimport unordered_map
+
+from pyarrow.includes.common cimport *
+from pyarrow.includes.libarrow cimport *
+from pyarrow.includes.libarrow_fs cimport *
+from pyarrow._parquet cimport *
+
+
+cdef extern from "arrow/api.h" namespace "arrow" nogil:
+
+    cdef cppclass CRecordBatchIterator "arrow::RecordBatchIterator"(
+            CIterator[shared_ptr[CRecordBatch]]):
+        pass
+
+
+cdef extern from * namespace "arrow::compute":
+    # inlined from expression_internal.h to avoid
+    # proliferation of #include <unordered_map>
+    """
+    #include <unordered_map>
+
+    #include "arrow/type.h"
+    #include "arrow/datum.h"
+
+    namespace arrow {
+    namespace compute {
+    struct KnownFieldValues {
+      std::unordered_map<FieldRef, Datum, FieldRef::Hash> map;
+    };
+    } // namespace compute
+    } // namespace arrow
+    """
+    cdef struct CKnownFieldValues "arrow::compute::KnownFieldValues":
+        unordered_map[CFieldRef, CDatum, CFieldRefHash] map
+
+cdef extern from "arrow/compute/exec/expression.h" \
+        namespace "arrow::compute" nogil:
+
+    cdef cppclass CExpression "arrow::compute::Expression":
+        c_bool Equals(const CExpression& other) const
+        c_string ToString() const
+        CResult[CExpression] Bind(const CSchema&)
+
+    cdef CExpression CMakeScalarExpression \
+        "arrow::compute::literal"(shared_ptr[CScalar] value)
+
+    cdef CExpression CMakeFieldExpression \
+        "arrow::compute::field_ref"(c_string name)
+
+    cdef CExpression CMakeCallExpression \
+        "arrow::compute::call"(c_string function,
+                               vector[CExpression] arguments,
+                               shared_ptr[CFunctionOptions] options)
+
+    cdef CResult[shared_ptr[CBuffer]] CSerializeExpression \
+        "arrow::compute::Serialize"(const CExpression&)
+
+    cdef CResult[CExpression] CDeserializeExpression \
+        "arrow::compute::Deserialize"(shared_ptr[CBuffer])
+
+    cdef CResult[CKnownFieldValues] \
+        CExtractKnownFieldValues "arrow::compute::ExtractKnownFieldValues"(
+            const CExpression& partition_expression)
+
+ctypedef CStatus cb_writer_finish_internal(CFileWriter*)
+ctypedef void cb_writer_finish(dict, CFileWriter*)
+
+cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
+
+    cdef enum ExistingDataBehavior" arrow::dataset::ExistingDataBehavior":
+        ExistingDataBehavior_DELETE_MATCHING" \
+            arrow::dataset::ExistingDataBehavior::kDeleteMatchingPartitions"
+        ExistingDataBehavior_OVERWRITE_OR_IGNORE" \
+            arrow::dataset::ExistingDataBehavior::kOverwriteOrIgnore"
+        ExistingDataBehavior_ERROR" \
+            arrow::dataset::ExistingDataBehavior::kError"
+
+    cdef cppclass CScanOptions "arrow::dataset::ScanOptions":
+        @staticmethod
+        shared_ptr[CScanOptions] Make(shared_ptr[CSchema] schema)
+
+        shared_ptr[CSchema] dataset_schema
+        shared_ptr[CSchema] projected_schema
+
+    cdef cppclass CFragmentScanOptions "arrow::dataset::FragmentScanOptions":
+        c_string type_name() const
+
+    ctypedef CIterator[shared_ptr[CScanTask]] CScanTaskIterator \
+        "arrow::dataset::ScanTaskIterator"
+
+    cdef cppclass CScanTask" arrow::dataset::ScanTask":
+        CResult[CRecordBatchIterator] Execute()
+
+    cdef cppclass CFragment "arrow::dataset::Fragment":
+        CResult[shared_ptr[CSchema]] ReadPhysicalSchema()
+        CResult[CScanTaskIterator] Scan(shared_ptr[CScanOptions] options)
+        c_bool splittable() const
+        c_string type_name() const
+        const CExpression& partition_expression() const
+
+    ctypedef vector[shared_ptr[CFragment]] CFragmentVector \
+        "arrow::dataset::FragmentVector"
+
+    ctypedef CIterator[shared_ptr[CFragment]] CFragmentIterator \
+        "arrow::dataset::FragmentIterator"
+
+    cdef cppclass CInMemoryFragment "arrow::dataset::InMemoryFragment"(
+            CFragment):
+        CInMemoryFragment(vector[shared_ptr[CRecordBatch]] record_batches,
+                          CExpression partition_expression)
+
+    cdef cppclass CTaggedRecordBatch "arrow::dataset::TaggedRecordBatch":
+        shared_ptr[CRecordBatch] record_batch
+        shared_ptr[CFragment] fragment
+
+    ctypedef CIterator[CTaggedRecordBatch] CTaggedRecordBatchIterator \
+        "arrow::dataset::TaggedRecordBatchIterator"
+
+    cdef cppclass CScanner "arrow::dataset::Scanner":
+        CScanner(shared_ptr[CDataset], shared_ptr[CScanOptions])
+        CScanner(shared_ptr[CFragment], shared_ptr[CScanOptions])
+        CResult[CScanTaskIterator] Scan()
+        CResult[CTaggedRecordBatchIterator] ScanBatches()
+        CResult[shared_ptr[CTable]] ToTable()
+        CResult[shared_ptr[CTable]] TakeRows(const CArray& indices)
+        CResult[shared_ptr[CTable]] Head(int64_t num_rows)
+        CResult[int64_t] CountRows()
+        CResult[CFragmentIterator] GetFragments()
+        CResult[shared_ptr[CRecordBatchReader]] ToRecordBatchReader()
+        const shared_ptr[CScanOptions]& options()
+
+    cdef cppclass CScannerBuilder "arrow::dataset::ScannerBuilder":
+        CScannerBuilder(shared_ptr[CDataset],
+                        shared_ptr[CScanOptions] scan_options)
+        CScannerBuilder(shared_ptr[CSchema], shared_ptr[CFragment],
+                        shared_ptr[CScanOptions] scan_options)
+
+        @staticmethod
+        shared_ptr[CScannerBuilder] FromRecordBatchReader(
+            shared_ptr[CRecordBatchReader] reader)
+        CStatus ProjectColumns "Project"(const vector[c_string]& columns)
+        CStatus Project(vector[CExpression]& exprs, vector[c_string]& columns)
+        CStatus Filter(CExpression filter)
+        CStatus UseThreads(c_bool use_threads)
+        CStatus UseAsync(c_bool use_async)
+        CStatus Pool(CMemoryPool* pool)
+        CStatus BatchSize(int64_t batch_size)
+        CStatus FragmentScanOptions(
+            shared_ptr[CFragmentScanOptions] fragment_scan_options)
+        CResult[shared_ptr[CScanner]] Finish()
+        shared_ptr[CSchema] schema() const
+
+    ctypedef vector[shared_ptr[CDataset]] CDatasetVector \
+        "arrow::dataset::DatasetVector"
+
+    cdef cppclass CDataset "arrow::dataset::Dataset":
+        const shared_ptr[CSchema] & schema()
+        CResult[CFragmentIterator] GetFragments()
+        CResult[CFragmentIterator] GetFragments(CExpression predicate)
+        const CExpression & partition_expression()
+        c_string type_name()
+
+        CResult[shared_ptr[CDataset]] ReplaceSchema(shared_ptr[CSchema])
+
+        CResult[shared_ptr[CScannerBuilder]] NewScan()
+
+    cdef cppclass CInMemoryDataset "arrow::dataset::InMemoryDataset"(
+            CDataset):
+        CInMemoryDataset(shared_ptr[CRecordBatchReader])
+        CInMemoryDataset(shared_ptr[CTable])
+
+    cdef cppclass CUnionDataset "arrow::dataset::UnionDataset"(
+            CDataset):
+        @staticmethod
+        CResult[shared_ptr[CUnionDataset]] Make(shared_ptr[CSchema] schema,
+                                                CDatasetVector children)
+
+        const CDatasetVector& children() const
+
+    cdef cppclass CInspectOptions "arrow::dataset::InspectOptions":
+        int fragments
+
+    cdef cppclass CFinishOptions "arrow::dataset::FinishOptions":
+        shared_ptr[CSchema] schema
+        CInspectOptions inspect_options
+        c_bool validate_fragments
+
+    cdef cppclass CDatasetFactory "arrow::dataset::DatasetFactory":
+        CResult[vector[shared_ptr[CSchema]]] InspectSchemas(CInspectOptions)
+        CResult[shared_ptr[CSchema]] Inspect(CInspectOptions)
+        CResult[shared_ptr[CDataset]] FinishWithSchema "Finish"(
+            const shared_ptr[CSchema]& schema)
+        CResult[shared_ptr[CDataset]] Finish()
+        const CExpression& root_partition()
+        CStatus SetRootPartition(CExpression partition)
+
+    cdef cppclass CUnionDatasetFactory "arrow::dataset::UnionDatasetFactory":
+        @staticmethod
+        CResult[shared_ptr[CDatasetFactory]] Make(
+            vector[shared_ptr[CDatasetFactory]] factories)
+
+    cdef cppclass CFileSource "arrow::dataset::FileSource":
+        const c_string& path() const
+        const shared_ptr[CFileSystem]& filesystem() const
+        const shared_ptr[CBuffer]& buffer() const
+        # HACK: Cython can't handle all the overloads so don't declare them.
+        # This means invalid construction of CFileSource won't be caught in
+        # the C++ generation phase (though it will still be caught when
+        # the generated C++ is compiled).
+        CFileSource(...)
+
+    cdef cppclass CFileWriteOptions \
+            "arrow::dataset::FileWriteOptions":
+        const shared_ptr[CFileFormat]& format() const
+        c_string type_name() const
+
+    cdef cppclass CFileWriter \
+            "arrow::dataset::FileWriter":
+        const shared_ptr[CFileFormat]& format() const
+        const shared_ptr[CSchema]& schema() const
+        const shared_ptr[CFileWriteOptions]& options() const
+        const CFileLocator& destination() const
+
+    cdef cppclass CParquetFileWriter \
+            "arrow::dataset::ParquetFileWriter"(CFileWriter):
+        const shared_ptr[FileWriter]& parquet_writer() const
+
+    cdef cppclass CFileFormat "arrow::dataset::FileFormat":
+        shared_ptr[CFragmentScanOptions] default_fragment_scan_options
+        c_string type_name() const
+        CResult[shared_ptr[CSchema]] Inspect(const CFileSource&) const
+        CResult[shared_ptr[CFileFragment]] MakeFragment(
+            CFileSource source,
+            CExpression partition_expression,
+            shared_ptr[CSchema] physical_schema)
+        shared_ptr[CFileWriteOptions] DefaultWriteOptions()
+
+    cdef cppclass CFileFragment "arrow::dataset::FileFragment"(
+            CFragment):
+        const CFileSource& source() const
+        const shared_ptr[CFileFormat]& format() const
+
+    cdef cppclass CParquetFileWriteOptions \
+            "arrow::dataset::ParquetFileWriteOptions"(CFileWriteOptions):
+        shared_ptr[WriterProperties] writer_properties
+        shared_ptr[ArrowWriterProperties] arrow_writer_properties
+
+    cdef cppclass CParquetFileFragment "arrow::dataset::ParquetFileFragment"(
+            CFileFragment):
+        const vector[int]& row_groups() const
+        shared_ptr[CFileMetaData] metadata() const
+        CResult[vector[shared_ptr[CFragment]]] SplitByRowGroup(
+            CExpression predicate)
+        CResult[shared_ptr[CFragment]] SubsetWithFilter "Subset"(
+            CExpression predicate)
+        CResult[shared_ptr[CFragment]] SubsetWithIds "Subset"(
+            vector[int] row_group_ids)
+        CStatus EnsureCompleteMetadata()
+
+    cdef cppclass CFileSystemDatasetWriteOptions \
+            "arrow::dataset::FileSystemDatasetWriteOptions":
+        shared_ptr[CFileWriteOptions] file_write_options
+        shared_ptr[CFileSystem] filesystem
+        c_string base_dir
+        shared_ptr[CPartitioning] partitioning
+        int max_partitions
+        c_string basename_template
+        function[cb_writer_finish_internal] writer_pre_finish
+        function[cb_writer_finish_internal] writer_post_finish
+        ExistingDataBehavior existing_data_behavior
+
+    cdef cppclass CFileSystemDataset \
+            "arrow::dataset::FileSystemDataset"(CDataset):
+        @staticmethod
+        CResult[shared_ptr[CDataset]] Make(
+            shared_ptr[CSchema] schema,
+            CExpression source_partition,
+            shared_ptr[CFileFormat] format,
+            shared_ptr[CFileSystem] filesystem,
+            vector[shared_ptr[CFileFragment]] fragments)
+
+        @staticmethod
+        CStatus Write(
+            const CFileSystemDatasetWriteOptions& write_options,
+            shared_ptr[CScanner] scanner)
+
+        c_string type()
+        vector[c_string] files()
+        const shared_ptr[CFileFormat]& format() const
+        const shared_ptr[CFileSystem]& filesystem() const
+        const shared_ptr[CPartitioning]& partitioning() const
+
+    cdef cppclass CParquetFileFormatReaderOptions \
+            "arrow::dataset::ParquetFileFormat::ReaderOptions":
+        unordered_set[c_string] dict_columns
+        TimeUnit coerce_int96_timestamp_unit
+
+    cdef cppclass CParquetFileFormat "arrow::dataset::ParquetFileFormat"(
+            CFileFormat):
+        CParquetFileFormatReaderOptions reader_options
+        CResult[shared_ptr[CFileFragment]] MakeFragment(
+            CFileSource source,
+            CExpression partition_expression,
+            shared_ptr[CSchema] physical_schema,
+            vector[int] row_groups)
+
+    cdef cppclass CParquetFragmentScanOptions \
+            "arrow::dataset::ParquetFragmentScanOptions"(CFragmentScanOptions):
+        shared_ptr[CReaderProperties] reader_properties
+        shared_ptr[ArrowReaderProperties] arrow_reader_properties
+        c_bool enable_parallel_column_conversion
+
+    cdef cppclass CIpcFileWriteOptions \
+            "arrow::dataset::IpcFileWriteOptions"(CFileWriteOptions):
+        pass
+
+    cdef cppclass CIpcFileFormat "arrow::dataset::IpcFileFormat"(
+            CFileFormat):
+        pass
+
+    cdef cppclass COrcFileFormat "arrow::dataset::OrcFileFormat"(
+            CFileFormat):
+        pass
+
+    cdef cppclass CCsvFileWriteOptions \
+            "arrow::dataset::CsvFileWriteOptions"(CFileWriteOptions):
+        shared_ptr[CCSVWriteOptions] write_options
+        CMemoryPool* pool
+
+    cdef cppclass CCsvFileFormat "arrow::dataset::CsvFileFormat"(
+            CFileFormat):
+        CCSVParseOptions parse_options
+
+    cdef cppclass CCsvFragmentScanOptions \
+            "arrow::dataset::CsvFragmentScanOptions"(CFragmentScanOptions):
+        CCSVConvertOptions convert_options
+        CCSVReadOptions read_options
+
+    cdef cppclass CPartitioning "arrow::dataset::Partitioning":
+        c_string type_name() const
+        CResult[CExpression] Parse(const c_string & path) const
+        const shared_ptr[CSchema] & schema()
+
+    cdef cppclass CSegmentEncoding" arrow::dataset::SegmentEncoding":
+        pass
+
+    CSegmentEncoding CSegmentEncodingNone\
+        " arrow::dataset::SegmentEncoding::None"
+    CSegmentEncoding CSegmentEncodingUri\
+        " arrow::dataset::SegmentEncoding::Uri"
+
+    cdef cppclass CKeyValuePartitioningOptions \
+            "arrow::dataset::KeyValuePartitioningOptions":
+        CSegmentEncoding segment_encoding
+
+    cdef cppclass CHivePartitioningOptions \
+            "arrow::dataset::HivePartitioningOptions":
+        CSegmentEncoding segment_encoding
+        c_string null_fallback
+
+    cdef cppclass CPartitioningFactoryOptions \
+            "arrow::dataset::PartitioningFactoryOptions":
+        c_bool infer_dictionary
+        shared_ptr[CSchema] schema
+        CSegmentEncoding segment_encoding
+
+    cdef cppclass CHivePartitioningFactoryOptions \
+            "arrow::dataset::HivePartitioningFactoryOptions":
+        c_bool infer_dictionary
+        c_string null_fallback
+        shared_ptr[CSchema] schema
+        CSegmentEncoding segment_encoding
+
+    cdef cppclass CPartitioningFactory "arrow::dataset::PartitioningFactory":
+        c_string type_name() const
+
+    cdef cppclass CDirectoryPartitioning \
+            "arrow::dataset::DirectoryPartitioning"(CPartitioning):
+        CDirectoryPartitioning(shared_ptr[CSchema] schema,
+                               vector[shared_ptr[CArray]] dictionaries)
+
+        @staticmethod
+        shared_ptr[CPartitioningFactory] MakeFactory(
+            vector[c_string] field_names, CPartitioningFactoryOptions)
+
+        vector[shared_ptr[CArray]] dictionaries() const
+
+    cdef cppclass CHivePartitioning \
+            "arrow::dataset::HivePartitioning"(CPartitioning):
+        CHivePartitioning(shared_ptr[CSchema] schema,
+                          vector[shared_ptr[CArray]] dictionaries,
+                          CHivePartitioningOptions options)
+
+        @staticmethod
+        shared_ptr[CPartitioningFactory] MakeFactory(
+            CHivePartitioningFactoryOptions)
+
+        vector[shared_ptr[CArray]] dictionaries() const
+
+    cdef cppclass CPartitioningOrFactory \
+            "arrow::dataset::PartitioningOrFactory":
+        CPartitioningOrFactory(shared_ptr[CPartitioning])
+        CPartitioningOrFactory(shared_ptr[CPartitioningFactory])
+        CPartitioningOrFactory & operator = (shared_ptr[CPartitioning])
+        CPartitioningOrFactory & operator = (
+            shared_ptr[CPartitioningFactory])
+        shared_ptr[CPartitioning] partitioning() const
+        shared_ptr[CPartitioningFactory] factory() const
+
+    cdef cppclass CFileSystemFactoryOptions \
+            "arrow::dataset::FileSystemFactoryOptions":
+        CPartitioningOrFactory partitioning
+        c_string partition_base_dir
+        c_bool exclude_invalid_files
+        vector[c_string] selector_ignore_prefixes
+
+    cdef cppclass CFileSystemDatasetFactory \
+            "arrow::dataset::FileSystemDatasetFactory"(
+                CDatasetFactory):
+        @staticmethod
+        CResult[shared_ptr[CDatasetFactory]] MakeFromPaths "Make"(
+            shared_ptr[CFileSystem] filesystem,
+            vector[c_string] paths,
+            shared_ptr[CFileFormat] format,
+            CFileSystemFactoryOptions options
+        )
+
+        @staticmethod
+        CResult[shared_ptr[CDatasetFactory]] MakeFromSelector "Make"(
+            shared_ptr[CFileSystem] filesystem,
+            CFileSelector,
+            shared_ptr[CFileFormat] format,
+            CFileSystemFactoryOptions options
+        )
+
+    cdef cppclass CParquetFactoryOptions \
+            "arrow::dataset::ParquetFactoryOptions":
+        CPartitioningOrFactory partitioning
+        c_string partition_base_dir
+        c_bool validate_column_chunk_paths
+
+    cdef cppclass CParquetDatasetFactory \
+            "arrow::dataset::ParquetDatasetFactory"(CDatasetFactory):
+        @staticmethod
+        CResult[shared_ptr[CDatasetFactory]] MakeFromMetaDataPath "Make"(
+            const c_string& metadata_path,
+            shared_ptr[CFileSystem] filesystem,
+            shared_ptr[CParquetFileFormat] format,
+            CParquetFactoryOptions options
+        )
+
+        @staticmethod
+        CResult[shared_ptr[CDatasetFactory]] MakeFromMetaDataSource "Make"(
+            const CFileSource& metadata_path,
+            const c_string& base_path,
+            shared_ptr[CFileSystem] filesystem,
+            shared_ptr[CParquetFileFormat] format,
+            CParquetFactoryOptions options
+        )
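
Note (illustrative only, not part of this commit): a `.pyx` module that cimports these declarations could combine the `CMakeFieldExpression`, `CMakeScalarExpression` and `CMakeCallExpression` wrappers to build the filter expression `field_ref(name) == literal(value)`. The module and helper below are hypothetical; only the cimported names come from the pyarrow includes. pyarrow's own wrappers (e.g. its Expression class) are built on these same declarations.

# distutils: language = c++
# Hypothetical example module; not part of the pyarrow sources.

from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_dataset cimport *


cdef CExpression equal_expression(c_string field_name,
                                  shared_ptr[CScalar] value):
    # Assemble arrow::compute::call("equal", {field_ref(field_name),
    # literal(value)}) via the wrappers declared in libarrow_dataset.pxd.
    cdef vector[CExpression] arguments
    cdef shared_ptr[CFunctionOptions] no_options  # default-constructed (null)
    arguments.push_back(CMakeFieldExpression(field_name))
    arguments.push_back(CMakeScalarExpression(value))
    return CMakeCallExpression(b"equal", arguments, no_options)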