# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# distutils: language = c++

# Cython extern declarations for the C++ types declared in
# "arrow/compute/exec/expression.h" and "arrow/dataset/api.h".
# Every quoted string after a Cython name is the C++ name that the
# generated code will use for it.

from libcpp.unordered_map cimport unordered_map

from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_fs cimport *
from pyarrow._parquet cimport *


cdef extern from "arrow/api.h" namespace "arrow" nogil:

    cdef cppclass CRecordBatchIterator "arrow::RecordBatchIterator"(
            CIterator[shared_ptr[CRecordBatch]]):
        pass


cdef extern from * namespace "arrow::compute":
    # inlined from expression_internal.h to avoid
    # proliferation of #include <unordered_map>
    #
    # The C++ map type spelled out below must stay in sync with the
    # Cython declaration of CKnownFieldValues.map that follows it
    # (key, value and hasher, in that order).
    """
    #include <unordered_map>

    #include "arrow/type.h"
    #include "arrow/datum.h"

    namespace arrow {
    namespace compute {
    struct KnownFieldValues {
      std::unordered_map<FieldRef, Datum, FieldRef::Hash> map;
    };
    } //  namespace compute
    } //  namespace arrow
    """

    cdef struct CKnownFieldValues "arrow::compute::KnownFieldValues":
        unordered_map[CFieldRef, CDatum, CFieldRefHash] map


cdef extern from "arrow/compute/exec/expression.h" \
        namespace "arrow::compute" nogil:

    cdef cppclass CExpression "arrow::compute::Expression":
        c_bool Equals(const CExpression& other) const
        c_string ToString() const
        CResult[CExpression] Bind(const CSchema&)

    # Free functions building expressions; the Cython-side names differ
    # from the C++ names given in the quoted strings.
    cdef CExpression CMakeScalarExpression \
        "arrow::compute::literal"(shared_ptr[CScalar] value)

    cdef CExpression CMakeFieldExpression \
        "arrow::compute::field_ref"(c_string name)

    cdef CExpression CMakeCallExpression \
        "arrow::compute::call"(c_string function,
                               vector[CExpression] arguments,
                               shared_ptr[CFunctionOptions] options)

    cdef CResult[shared_ptr[CBuffer]] CSerializeExpression \
        "arrow::compute::Serialize"(const CExpression&)

    cdef CResult[CExpression] CDeserializeExpression \
        "arrow::compute::Deserialize"(shared_ptr[CBuffer])

    cdef CResult[CKnownFieldValues] \
        CExtractKnownFieldValues "arrow::compute::ExtractKnownFieldValues"(
            const CExpression& partition_expression)


# Callback signatures; cb_writer_finish_internal is the element type of
# the function[...] members of CFileSystemDatasetWriteOptions below.
ctypedef CStatus cb_writer_finish_internal(CFileWriter*)
ctypedef void cb_writer_finish(dict, CFileWriter*)


cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:

    cdef enum ExistingDataBehavior" arrow::dataset::ExistingDataBehavior":
        ExistingDataBehavior_DELETE_MATCHING" \
            arrow::dataset::ExistingDataBehavior::kDeleteMatchingPartitions"
        ExistingDataBehavior_OVERWRITE_OR_IGNORE" \
            arrow::dataset::ExistingDataBehavior::kOverwriteOrIgnore"
        ExistingDataBehavior_ERROR" \
            arrow::dataset::ExistingDataBehavior::kError"

    cdef cppclass CScanOptions "arrow::dataset::ScanOptions":
        @staticmethod
        shared_ptr[CScanOptions] Make(shared_ptr[CSchema] schema)

        shared_ptr[CSchema] dataset_schema
        shared_ptr[CSchema] projected_schema

    cdef cppclass CFragmentScanOptions "arrow::dataset::FragmentScanOptions":
        c_string type_name() const

    ctypedef CIterator[shared_ptr[CScanTask]] CScanTaskIterator \
        "arrow::dataset::ScanTaskIterator"

    cdef cppclass CScanTask" arrow::dataset::ScanTask":
        CResult[CRecordBatchIterator] Execute()

    cdef cppclass CFragment "arrow::dataset::Fragment":
        CResult[shared_ptr[CSchema]] ReadPhysicalSchema()
        CResult[CScanTaskIterator] Scan(shared_ptr[CScanOptions] options)
        c_bool splittable() const
        c_string type_name() const
        const CExpression& partition_expression() const

    ctypedef vector[shared_ptr[CFragment]] CFragmentVector \
        "arrow::dataset::FragmentVector"

    ctypedef CIterator[shared_ptr[CFragment]] CFragmentIterator \
        "arrow::dataset::FragmentIterator"

    cdef cppclass CInMemoryFragment "arrow::dataset::InMemoryFragment"(
            CFragment):
        CInMemoryFragment(vector[shared_ptr[CRecordBatch]] record_batches,
                          CExpression partition_expression)

    cdef cppclass CTaggedRecordBatch "arrow::dataset::TaggedRecordBatch":
        shared_ptr[CRecordBatch] record_batch
        shared_ptr[CFragment] fragment

    ctypedef CIterator[CTaggedRecordBatch] CTaggedRecordBatchIterator \
        "arrow::dataset::TaggedRecordBatchIterator"

    cdef cppclass CScanner "arrow::dataset::Scanner":
        CScanner(shared_ptr[CDataset], shared_ptr[CScanOptions])
        CScanner(shared_ptr[CFragment], shared_ptr[CScanOptions])
        CResult[CScanTaskIterator] Scan()
        CResult[CTaggedRecordBatchIterator] ScanBatches()
        CResult[shared_ptr[CTable]] ToTable()
        CResult[shared_ptr[CTable]] TakeRows(const CArray& indices)
        CResult[shared_ptr[CTable]] Head(int64_t num_rows)
        CResult[int64_t] CountRows()
        CResult[CFragmentIterator] GetFragments()
        CResult[shared_ptr[CRecordBatchReader]] ToRecordBatchReader()
        const shared_ptr[CScanOptions]& options()

    cdef cppclass CScannerBuilder "arrow::dataset::ScannerBuilder":
        CScannerBuilder(shared_ptr[CDataset],
                        shared_ptr[CScanOptions] scan_options)
        CScannerBuilder(shared_ptr[CSchema], shared_ptr[CFragment],
                        shared_ptr[CScanOptions] scan_options)

        @staticmethod
        shared_ptr[CScannerBuilder] FromRecordBatchReader(
            shared_ptr[CRecordBatchReader] reader)
        # ProjectColumns and Project both bind to the C++ name "Project";
        # they are declared under separate Cython names.
        CStatus ProjectColumns "Project"(const vector[c_string]& columns)
        CStatus Project(vector[CExpression]& exprs, vector[c_string]& columns)
        CStatus Filter(CExpression filter)
        CStatus UseThreads(c_bool use_threads)
        CStatus UseAsync(c_bool use_async)
        CStatus Pool(CMemoryPool* pool)
        CStatus BatchSize(int64_t batch_size)
        CStatus FragmentScanOptions(
            shared_ptr[CFragmentScanOptions] fragment_scan_options)
        CResult[shared_ptr[CScanner]] Finish()
        shared_ptr[CSchema] schema() const

    ctypedef vector[shared_ptr[CDataset]] CDatasetVector \
        "arrow::dataset::DatasetVector"

    cdef cppclass CDataset "arrow::dataset::Dataset":
        const shared_ptr[CSchema] & schema()
        CResult[CFragmentIterator] GetFragments()
        CResult[CFragmentIterator] GetFragments(CExpression predicate)
        const CExpression & partition_expression()
        c_string type_name()

        CResult[shared_ptr[CDataset]] ReplaceSchema(shared_ptr[CSchema])

        CResult[shared_ptr[CScannerBuilder]] NewScan()

    cdef cppclass CInMemoryDataset "arrow::dataset::InMemoryDataset"(
            CDataset):
        CInMemoryDataset(shared_ptr[CRecordBatchReader])
        CInMemoryDataset(shared_ptr[CTable])

    cdef cppclass CUnionDataset "arrow::dataset::UnionDataset"(
            CDataset):
        @staticmethod
        CResult[shared_ptr[CUnionDataset]] Make(shared_ptr[CSchema] schema,
                                                CDatasetVector children)

        const CDatasetVector& children() const

    cdef cppclass CInspectOptions "arrow::dataset::InspectOptions":
        int fragments

    cdef cppclass CFinishOptions "arrow::dataset::FinishOptions":
        shared_ptr[CSchema] schema
        CInspectOptions inspect_options
        c_bool validate_fragments

    cdef cppclass CDatasetFactory "arrow::dataset::DatasetFactory":
        CResult[vector[shared_ptr[CSchema]]] InspectSchemas(CInspectOptions)
        CResult[shared_ptr[CSchema]] Inspect(CInspectOptions)
        # FinishWithSchema binds to the C++ name "Finish", like Finish().
        CResult[shared_ptr[CDataset]] FinishWithSchema "Finish"(
            const shared_ptr[CSchema]& schema)
        CResult[shared_ptr[CDataset]] Finish()
        const CExpression& root_partition()
        CStatus SetRootPartition(CExpression partition)

    cdef cppclass CUnionDatasetFactory "arrow::dataset::UnionDatasetFactory":
        @staticmethod
        CResult[shared_ptr[CDatasetFactory]] Make(
            vector[shared_ptr[CDatasetFactory]] factories)

    cdef cppclass CFileSource "arrow::dataset::FileSource":
        const c_string& path() const
        const shared_ptr[CFileSystem]& filesystem() const
        const shared_ptr[CBuffer]& buffer() const
        # HACK: Cython can't handle all the overloads so don't declare them.
        # This means invalid construction of CFileSource won't be caught in
        # the C++ generation phase (though it will still be caught when
        # the generated C++ is compiled).
        CFileSource(...)

    cdef cppclass CFileWriteOptions \
            "arrow::dataset::FileWriteOptions":
        const shared_ptr[CFileFormat]& format() const
        c_string type_name() const

    cdef cppclass CFileWriter \
            "arrow::dataset::FileWriter":
        const shared_ptr[CFileFormat]& format() const
        const shared_ptr[CSchema]& schema() const
        const shared_ptr[CFileWriteOptions]& options() const
        const CFileLocator& destination() const

    cdef cppclass CParquetFileWriter \
            "arrow::dataset::ParquetFileWriter"(CFileWriter):
        const shared_ptr[FileWriter]& parquet_writer() const

    cdef cppclass CFileFormat "arrow::dataset::FileFormat":
        shared_ptr[CFragmentScanOptions] default_fragment_scan_options
        c_string type_name() const
        CResult[shared_ptr[CSchema]] Inspect(const CFileSource&) const
        CResult[shared_ptr[CFileFragment]] MakeFragment(
            CFileSource source,
            CExpression partition_expression,
            shared_ptr[CSchema] physical_schema)
        shared_ptr[CFileWriteOptions] DefaultWriteOptions()

    cdef cppclass CFileFragment "arrow::dataset::FileFragment"(
            CFragment):
        const CFileSource& source() const
        const shared_ptr[CFileFormat]& format() const

    cdef cppclass CParquetFileWriteOptions \
            "arrow::dataset::ParquetFileWriteOptions"(CFileWriteOptions):
        shared_ptr[WriterProperties] writer_properties
        shared_ptr[ArrowWriterProperties] arrow_writer_properties

    cdef cppclass CParquetFileFragment "arrow::dataset::ParquetFileFragment"(
            CFileFragment):
        const vector[int]& row_groups() const
        shared_ptr[CFileMetaData] metadata() const
        CResult[vector[shared_ptr[CFragment]]] SplitByRowGroup(
            CExpression predicate)
        # Both names below bind to the C++ name "Subset"; they differ
        # only in the argument type.
        CResult[shared_ptr[CFragment]] SubsetWithFilter "Subset"(
            CExpression predicate)
        CResult[shared_ptr[CFragment]] SubsetWithIds "Subset"(
            vector[int] row_group_ids)
        CStatus EnsureCompleteMetadata()

    cdef cppclass CFileSystemDatasetWriteOptions \
            "arrow::dataset::FileSystemDatasetWriteOptions":
        shared_ptr[CFileWriteOptions] file_write_options
        shared_ptr[CFileSystem] filesystem
        c_string base_dir
        shared_ptr[CPartitioning] partitioning
        int max_partitions
        c_string basename_template
        function[cb_writer_finish_internal] writer_pre_finish
        function[cb_writer_finish_internal] writer_post_finish
        ExistingDataBehavior existing_data_behavior

    cdef cppclass CFileSystemDataset \
            "arrow::dataset::FileSystemDataset"(CDataset):
        @staticmethod
        CResult[shared_ptr[CDataset]] Make(
            shared_ptr[CSchema] schema,
            CExpression source_partition,
            shared_ptr[CFileFormat] format,
            shared_ptr[CFileSystem] filesystem,
            vector[shared_ptr[CFileFragment]] fragments)

        @staticmethod
        CStatus Write(
            const CFileSystemDatasetWriteOptions& write_options,
            shared_ptr[CScanner] scanner)

        c_string type()
        vector[c_string] files()
        const shared_ptr[CFileFormat]& format() const
        const shared_ptr[CFileSystem]& filesystem() const
        const shared_ptr[CPartitioning]& partitioning() const

    cdef cppclass CParquetFileFormatReaderOptions \
            "arrow::dataset::ParquetFileFormat::ReaderOptions":
        unordered_set[c_string] dict_columns
        TimeUnit coerce_int96_timestamp_unit

    cdef cppclass CParquetFileFormat "arrow::dataset::ParquetFileFormat"(
            CFileFormat):
        CParquetFileFormatReaderOptions reader_options
        CResult[shared_ptr[CFileFragment]] MakeFragment(
            CFileSource source,
            CExpression partition_expression,
            shared_ptr[CSchema] physical_schema,
            vector[int] row_groups)

    cdef cppclass CParquetFragmentScanOptions \
            "arrow::dataset::ParquetFragmentScanOptions"(CFragmentScanOptions):
        shared_ptr[CReaderProperties] reader_properties
        shared_ptr[ArrowReaderProperties] arrow_reader_properties
        c_bool enable_parallel_column_conversion

    cdef cppclass CIpcFileWriteOptions \
            "arrow::dataset::IpcFileWriteOptions"(CFileWriteOptions):
        pass

    cdef cppclass CIpcFileFormat "arrow::dataset::IpcFileFormat"(
            CFileFormat):
        pass

    cdef cppclass COrcFileFormat "arrow::dataset::OrcFileFormat"(
            CFileFormat):
        pass

    cdef cppclass CCsvFileWriteOptions \
            "arrow::dataset::CsvFileWriteOptions"(CFileWriteOptions):
        shared_ptr[CCSVWriteOptions] write_options
        CMemoryPool* pool

    cdef cppclass CCsvFileFormat "arrow::dataset::CsvFileFormat"(
            CFileFormat):
        CCSVParseOptions parse_options

    cdef cppclass CCsvFragmentScanOptions \
            "arrow::dataset::CsvFragmentScanOptions"(CFragmentScanOptions):
        CCSVConvertOptions convert_options
        CCSVReadOptions read_options

    cdef cppclass CPartitioning "arrow::dataset::Partitioning":
        c_string type_name() const
        CResult[CExpression] Parse(const c_string & path) const
        const shared_ptr[CSchema] & schema()

    # SegmentEncoding is declared as an opaque class; its two values are
    # exposed as extern variables of that type.
    cdef cppclass CSegmentEncoding" arrow::dataset::SegmentEncoding":
        pass

    CSegmentEncoding CSegmentEncodingNone\
        " arrow::dataset::SegmentEncoding::None"
    CSegmentEncoding CSegmentEncodingUri\
        " arrow::dataset::SegmentEncoding::Uri"

    cdef cppclass CKeyValuePartitioningOptions \
            "arrow::dataset::KeyValuePartitioningOptions":
        CSegmentEncoding segment_encoding

    cdef cppclass CHivePartitioningOptions \
            "arrow::dataset::HivePartitioningOptions":
        CSegmentEncoding segment_encoding
        c_string null_fallback

    cdef cppclass CPartitioningFactoryOptions \
            "arrow::dataset::PartitioningFactoryOptions":
        c_bool infer_dictionary
        shared_ptr[CSchema] schema
        CSegmentEncoding segment_encoding

    cdef cppclass CHivePartitioningFactoryOptions \
            "arrow::dataset::HivePartitioningFactoryOptions":
        c_bool infer_dictionary
        c_string null_fallback
        shared_ptr[CSchema] schema
        CSegmentEncoding segment_encoding

    cdef cppclass CPartitioningFactory "arrow::dataset::PartitioningFactory":
        c_string type_name() const

    cdef cppclass CDirectoryPartitioning \
            "arrow::dataset::DirectoryPartitioning"(CPartitioning):
        CDirectoryPartitioning(shared_ptr[CSchema] schema,
                               vector[shared_ptr[CArray]] dictionaries)

        @staticmethod
        shared_ptr[CPartitioningFactory] MakeFactory(
            vector[c_string] field_names, CPartitioningFactoryOptions)

        vector[shared_ptr[CArray]] dictionaries() const

    cdef cppclass CHivePartitioning \
            "arrow::dataset::HivePartitioning"(CPartitioning):
        CHivePartitioning(shared_ptr[CSchema] schema,
                          vector[shared_ptr[CArray]] dictionaries,
                          CHivePartitioningOptions options)

        @staticmethod
        shared_ptr[CPartitioningFactory] MakeFactory(
            CHivePartitioningFactoryOptions)

        vector[shared_ptr[CArray]] dictionaries() const

    cdef cppclass CPartitioningOrFactory \
            "arrow::dataset::PartitioningOrFactory":
        CPartitioningOrFactory(shared_ptr[CPartitioning])
        CPartitioningOrFactory(shared_ptr[CPartitioningFactory])
        CPartitioningOrFactory & operator = (shared_ptr[CPartitioning])
        CPartitioningOrFactory & operator = (
            shared_ptr[CPartitioningFactory])
        shared_ptr[CPartitioning] partitioning() const
        shared_ptr[CPartitioningFactory] factory() const

    cdef cppclass CFileSystemFactoryOptions \
            "arrow::dataset::FileSystemFactoryOptions":
        CPartitioningOrFactory partitioning
        c_string partition_base_dir
        c_bool exclude_invalid_files
        vector[c_string] selector_ignore_prefixes

    cdef cppclass CFileSystemDatasetFactory \
            "arrow::dataset::FileSystemDatasetFactory"(
            CDatasetFactory):
        # MakeFromPaths and MakeFromSelector both bind to the C++ name
        # "Make"; they differ in the second argument.
        @staticmethod
        CResult[shared_ptr[CDatasetFactory]] MakeFromPaths "Make"(
            shared_ptr[CFileSystem] filesystem,
            vector[c_string] paths,
            shared_ptr[CFileFormat] format,
            CFileSystemFactoryOptions options
        )

        @staticmethod
        CResult[shared_ptr[CDatasetFactory]] MakeFromSelector "Make"(
            shared_ptr[CFileSystem] filesystem,
            CFileSelector,
            shared_ptr[CFileFormat] format,
            CFileSystemFactoryOptions options
        )

    cdef cppclass CParquetFactoryOptions \
            "arrow::dataset::ParquetFactoryOptions":
        CPartitioningOrFactory partitioning
        c_string partition_base_dir
        c_bool validate_column_chunk_paths

    cdef cppclass CParquetDatasetFactory \
            "arrow::dataset::ParquetDatasetFactory"(CDatasetFactory):
        # Both factories below bind to the C++ name "Make".
        @staticmethod
        CResult[shared_ptr[CDatasetFactory]] MakeFromMetaDataPath "Make"(
            const c_string& metadata_path,
            shared_ptr[CFileSystem] filesystem,
            shared_ptr[CParquetFileFormat] format,
            CParquetFactoryOptions options
        )

        @staticmethod
        CResult[shared_ptr[CDatasetFactory]] MakeFromMetaDataSource "Make"(
            const CFileSource& metadata_path,
            const c_string& base_path,
            shared_ptr[CFileSystem] filesystem,
            shared_ptr[CParquetFileFormat] format,
            CParquetFactoryOptions options
        )