From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/arrow/r/man/Partitioning.Rd | 51 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 src/arrow/r/man/Partitioning.Rd (limited to 'src/arrow/r/man/Partitioning.Rd') diff --git a/src/arrow/r/man/Partitioning.Rd b/src/arrow/r/man/Partitioning.Rd new file mode 100644 index 000000000..cfe374155 --- /dev/null +++ b/src/arrow/r/man/Partitioning.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset-partition.R +\name{Partitioning} +\alias{Partitioning} +\alias{DirectoryPartitioning} +\alias{HivePartitioning} +\alias{DirectoryPartitioningFactory} +\alias{HivePartitioningFactory} +\title{Define Partitioning for a Dataset} +\description{ +Pass a \code{Partitioning} object to a \link{FileSystemDatasetFactory}'s \verb{$create()} +method to indicate how the files' paths should be interpreted to define +partitioning. + +\code{DirectoryPartitioning} describes how to interpret raw path segments, in +order. For example, \code{schema(year = int16(), month = int8())} would define +partitions for file paths like "2019/01/file.parquet", +"2019/02/file.parquet", etc. In this scheme \code{NULL} values will be skipped. In +the previous example, when writing a dataset, if the month was \code{NA} (or +\code{NULL}), the files would be placed in "2019/file.parquet". When reading, the +rows in "2019/file.parquet" would return an \code{NA} for the month column. An +error will be raised if an outer directory is \code{NULL} and an inner directory +is not. + +\code{HivePartitioning} is for Hive-style partitioning, which embeds field +names and values in path segments, such as +"/year=2019/month=2/data.parquet". Because fields are named in the path +segments, order does not matter. 
This partitioning scheme allows \code{NULL} +values. They will be replaced by a configurable \code{null_fallback}, which +defaults to the string \code{"__HIVE_DEFAULT_PARTITION__"} when writing. When +reading, the \code{null_fallback} string will be replaced with \code{NA}s as +appropriate. + +\code{PartitioningFactory} subclasses instruct the \code{DatasetFactory} to detect +partition features from the file paths. +} +\section{Factory}{ + +Both \code{DirectoryPartitioning$create()} and \code{HivePartitioning$create()} +methods take a \link{Schema} as a single input argument. The helper +function \code{\link[=hive_partition]{hive_partition(...)}} is shorthand for +\code{HivePartitioning$create(schema(...))}. + +With \code{DirectoryPartitioningFactory$create()}, you can provide just the +names of the path segments (in our example, \code{c("year", "month")}), and +the \code{DatasetFactory} will infer the data types for those partition variables. +\code{HivePartitioningFactory$create()} takes no arguments: both variable names +and their types can be inferred from the file paths. \code{hive_partition()} with +no arguments returns a \code{HivePartitioningFactory}. +} + -- cgit v1.2.3