author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit    e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree      64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/format/Schema.fbs
parent    Initial commit. (diff)
Adding upstream version 18.2.2. (tag: upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/format/Schema.fbs'):

 -rw-r--r--  src/arrow/format/Schema.fbs | 522 +
 1 file changed, 522 insertions(+), 0 deletions(-)
diff --git a/src/arrow/format/Schema.fbs b/src/arrow/format/Schema.fbs
new file mode 100644
index 000000000..7ee827b5d
--- /dev/null
+++ b/src/arrow/format/Schema.fbs
@@ -0,0 +1,522 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// Logical types, vector layouts, and schemas
+
+/// Format Version History.
+/// Version 1.0 - Forward and backwards compatibility guaranteed.
+/// Version 1.1 - Add Decimal256 (No format release).
+/// Version 1.2 (Pending) - Add Interval MONTH_DAY_NANO.
+
+namespace org.apache.arrow.flatbuf;
+
+enum MetadataVersion:short {
+ /// 0.1.0 (October 2016).
+ V1,
+
+ /// 0.2.0 (February 2017). Non-backwards compatible with V1.
+ V2,
+
+ /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2.
+ V3,
+
+ /// >= 0.8.0 (December 2017). Non-backwards compatible with V3.
+ V4,
+
+ /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
+ /// metadata and IPC messages). Implementations are recommended to provide a
+ /// V4 compatibility mode with V5 format changes disabled.
+ ///
+ /// Incompatible changes between V4 and V5:
+ /// - Union buffer layout has changed. In V5, Unions don't have a validity
+ /// bitmap buffer.
+ V5,
+}
+
+/// Represents Arrow Features that might not have full support
+/// within implementations. This is intended to be used in
+/// two scenarios:
+/// 1. A mechanism for readers of Arrow Streams
+/// and files to understand that the stream or file makes
+/// use of a feature that isn't supported by or is unknown to
+/// the implementation (so that the implementation can still
+/// meet the Arrow forward compatibility guarantees).
+/// 2. A means of negotiating between a client and server
+/// what features a stream is allowed to use. The enum
+/// values here are intended to represent higher-level
+/// features; additional details may be negotiated
+/// with key-value pairs specific to the protocol.
+///
+/// Enums added to this list should be assigned power-of-two values
+/// to facilitate exchanging and comparing bitmaps for supported
+/// features.
+enum Feature : long {
+ /// Needed to make flatbuffers happy.
+ UNUSED = 0,
+ /// The stream makes use of multiple full dictionaries with the
+ /// same ID and assumes clients implement dictionary replacement
+ /// correctly.
+ DICTIONARY_REPLACEMENT = 1,
+ /// The stream makes use of compressed bodies as described
+ /// in Message.fbs.
+ COMPRESSED_BODY = 2
+}
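Because the values are powers of two, feature sets can be exchanged and intersected as plain bitmasks. A minimal sketch in Python of such a negotiation; the helper names are illustrative only and not part of any Arrow API:

```python
# Constants mirror the Feature enum above; the helpers are hypothetical.
DICTIONARY_REPLACEMENT = 1
COMPRESSED_BODY = 2

def encode_features(features):
    """Fold a list of Feature values into a single bitmask."""
    mask = 0
    for f in features:
        mask |= f
    return mask

def negotiate(client_mask, server_mask):
    """Features usable on a stream are those both sides support."""
    return client_mask & server_mask

assert negotiate(
    encode_features([DICTIONARY_REPLACEMENT, COMPRESSED_BODY]),
    encode_features([COMPRESSED_BODY]),
) == COMPRESSED_BODY
```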
+
+/// These are stored in the flatbuffer in the Type union below
+
+table Null {
+}
+
+/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
+/// (according to the physical memory layout). We use Struct_ here as
+/// Struct is a reserved word in FlatBuffers.
+table Struct_ {
+}
+
+table List {
+}
+
+/// Same as List, but with 64-bit offsets, allowing representation of
+/// extremely large data values.
+table LargeList {
+}
+
+table FixedSizeList {
+ /// Number of list items per value
+ listSize: int;
+}
+
+/// A Map is a logical nested type that is represented as
+///
+/// List<entries: Struct<key: K, value: V>>
+///
+/// In this layout, the keys and values are each respectively contiguous. We do
+/// not constrain the key and value types, so the application is responsible
+/// for ensuring that the keys are hashable and unique. Whether the keys are sorted
+/// may be set in the metadata for this field.
+///
+/// In a field with Map type, the field has a child Struct field, which then
+/// has two children: the first the key type and the second the value type.
+/// The names of the child fields may be respectively "entries", "key", and
+/// "value", but this is not enforced.
+///
+/// Map
+/// ```text
+/// - child[0] entries: Struct
+/// - child[0] key: K
+/// - child[1] value: V
+/// ```
+/// Neither the "entries" field nor the "key" field may be nullable.
+///
+/// The metadata is structured so that Arrow systems without special handling
+/// for Map can make Map an alias for List. The "layout" attribute for the Map
+/// field must have the same contents as a List.
+table Map {
+ /// Set to true if the keys within each value are sorted
+ keysSorted: bool;
+}
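As an illustration of this layout, here is how a Map type looks through pyarrow (assuming pyarrow is available; its default child names happen to match the "entries"/"key"/"value" suggestion above):

```python
import pyarrow as pa

# Map<string, int32>; keys_sorted corresponds to the keysSorted flag above.
map_type = pa.map_(pa.string(), pa.int32(), keys_sorted=False)
arr = pa.array([[("a", 1), ("b", 2)], [("c", 3)]], type=map_type)

# Physically a List<entries: Struct<key, value>>: the keys and the values
# are each stored contiguously across all rows.
print(arr.keys)   # ["a", "b", "c"]
print(arr.items)  # [1, 2, 3]
```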
+
+enum UnionMode:short { Sparse, Dense }
+
+/// A union is a complex type with children in Field.
+/// By default, ids in the type vector refer to the offsets in the children;
+/// optionally, typeIds provides an indirection between the child offset and
+/// the type id: for each child, `typeIds[offset]` is the id used in the type
+/// vector.
+table Union {
+ mode: UnionMode;
+ typeIds: [ int ]; // optional, describes typeid of each child.
+}
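The typeIds indirection can be seen when building a union type with pyarrow; a sketch, with type codes chosen arbitrarily:

```python
import pyarrow as pa

# A dense union of int32 and string children. The type_codes list plays
# the role of the optional typeIds vector above: the ids stored in the
# type vector are 5 and 7 rather than the child offsets 0 and 1.
union_type = pa.union(
    [pa.field("ints", pa.int32()), pa.field("strs", pa.string())],
    mode="dense",
    type_codes=[5, 7],
)
print(union_type)
```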
+
+table Int {
+ bitWidth: int; // restricted to 8, 16, 32, and 64 in v1
+ is_signed: bool;
+}
+
+enum Precision:short {HALF, SINGLE, DOUBLE}
+
+table FloatingPoint {
+ precision: Precision;
+}
+
+/// Unicode with UTF-8 encoding
+table Utf8 {
+}
+
+/// Opaque binary data
+table Binary {
+}
+
+/// Same as Utf8, but with 64-bit offsets, allowing representation of
+/// extremely large data values.
+table LargeUtf8 {
+}
+
+/// Same as Binary, but with 64-bit offsets, allowing representation of
+/// extremely large data values.
+table LargeBinary {
+}
+
+table FixedSizeBinary {
+ /// Number of bytes per value
+ byteWidth: int;
+}
+
+table Bool {
+}
+
+/// Exact decimal value represented as an integer value in two's
+/// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers
+/// are used. The representation uses the endianness indicated
+/// in the Schema.
+table Decimal {
+ /// Total number of decimal digits
+ precision: int;
+
+ /// Number of digits after the decimal point "."
+ scale: int;
+
+ /// Number of bits per value. The only accepted widths are 128 and 256.
+ /// We use bitWidth for consistency with Int::bitWidth.
+ bitWidth: int = 128;
+}
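For example, with precision 10 and scale 2, the value 123.45 is stored as the two's-complement integer 12345. A quick pyarrow sketch:

```python
import pyarrow as pa
from decimal import Decimal

# precision=10, scale=2: up to 10 significant digits, 2 after the point.
t = pa.decimal128(10, 2)
arr = pa.array([Decimal("123.45"), Decimal("-0.01")], type=t)
# Each value is a 128-bit two's-complement integer scaled by 10**scale:
# 123.45 -> 12345, -0.01 -> -1.
print(arr)
```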
+
+enum DateUnit: short {
+ DAY,
+ MILLISECOND
+}
+
+/// Date is either a 32-bit or 64-bit signed integer type representing an
+/// elapsed time since UNIX epoch (1970-01-01), stored in either of two units:
+///
+/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
+/// leap seconds), where the values are evenly divisible by 86400000
+/// * Days (32 bits) since the UNIX epoch
+table Date {
+ unit: DateUnit = MILLISECOND;
+}
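A small pyarrow sketch of the DAY unit; the scalar's raw value exposes the underlying 32-bit day count:

```python
import pyarrow as pa
from datetime import date

# DAY unit: a 32-bit signed count of days since 1970-01-01.
arr = pa.array([date(1970, 1, 2)], type=pa.date32())
print(arr[0].value)  # 1 (one day after the UNIX epoch)
```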
+
+enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND }
+
+/// Time is either a 32-bit or 64-bit signed integer type representing an
+/// elapsed time since midnight, stored in either of four units: seconds,
+/// milliseconds, microseconds or nanoseconds.
+///
+/// The integer `bitWidth` depends on the `unit` and must be one of the following:
+/// * SECOND and MILLISECOND: 32 bits
+/// * MICROSECOND and NANOSECOND: 64 bits
+///
+/// The allowed values are between 0 (inclusive) and 86400 (=24*60*60) seconds
+/// (exclusive), adjusted for the time unit (for example, up to 86400000
+/// exclusive for the MILLISECOND unit).
+/// This definition doesn't allow for leap seconds. Time values from
+/// measurements with leap seconds will need to be corrected when ingesting
+/// into Arrow (for example by replacing the value 86400 with 86399).
+table Time {
+ unit: TimeUnit = MILLISECOND;
+ bitWidth: int = 32;
+}
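For instance, 01:02:03 in the MILLISECOND unit is stored as (1*3600 + 2*60 + 3) * 1000 = 3723000, well below the exclusive bound of 86400000. A pyarrow sketch:

```python
import pyarrow as pa
from datetime import time

# MILLISECOND unit with bitWidth 32: milliseconds since midnight.
arr = pa.array([time(1, 2, 3)], type=pa.time32("ms"))
print(arr[0].value)  # 3723000
```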
+
+/// Timestamp is a 64-bit signed integer representing an elapsed time since a
+/// fixed epoch, stored in either of four units: seconds, milliseconds,
+/// microseconds or nanoseconds, and is optionally annotated with a timezone.
+///
+/// Timestamp values do not include any leap seconds (in other words, all
+/// days are considered 86400 seconds long).
+///
+/// Timestamps with a non-empty timezone
+/// ------------------------------------
+///
+/// If a Timestamp column has a non-empty timezone value, its epoch is
+/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
+/// (the Unix epoch), regardless of the Timestamp's own timezone.
+///
+/// Therefore, timestamp values with a non-empty timezone correspond to
+/// physical points in time together with some additional information about
+/// how the data was obtained and/or how to display it (the timezone).
+///
+/// For example, the timestamp value 0 with the timezone string "Europe/Paris"
+/// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
+/// application may prefer to display it as "January 1st 1970, 01h00" in
+/// the Europe/Paris timezone (which is the same physical point in time).
+///
+/// One consequence is that timestamp values with a non-empty timezone
+/// can be compared and ordered directly, since they all share the same
+/// well-known point of reference (the Unix epoch).
+///
+/// Timestamps with an unset / empty timezone
+/// -----------------------------------------
+///
+/// If a Timestamp column has no timezone value, its epoch is
+/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
+///
+/// Therefore, timestamp values without a timezone cannot be meaningfully
+/// interpreted as physical points in time, but only as calendar / clock
+/// indications ("wall clock time") in an unspecified timezone.
+///
+/// For example, the timestamp value 0 with an empty timezone string
+/// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
+/// is not enough information to interpret it as a well-defined physical
+/// point in time.
+///
+/// One consequence is that timestamp values without a timezone cannot
+/// be reliably compared or ordered, since they may have different points of
+/// reference. In particular, it is *not* possible to interpret an unset
+/// or empty timezone as the same as "UTC".
+///
+/// Conversion between timezones
+/// ----------------------------
+///
+/// If a Timestamp column has a non-empty timezone, changing the timezone
+/// to a different non-empty value is a metadata-only operation:
+/// the timestamp values need not change as their point of reference remains
+/// the same (the Unix epoch).
+///
+/// However, if a Timestamp column has no timezone value, changing it to a
+/// non-empty value requires thinking about the desired semantics.
+/// One possibility is to assume that the original timestamp values are
+/// relative to the epoch of the timezone being set; timestamp values should
+/// then be adjusted to the Unix epoch (for example, changing the timezone from
+/// empty to "Europe/Paris" would require converting the timestamp values
+/// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
+/// nevertheless correct).
+///
+/// Guidelines for encoding data from external libraries
+/// ----------------------------------------------------
+///
+/// Date & time libraries often have multiple different data types for temporal
+/// data. In order to ease interoperability between different implementations the
+/// Arrow project has some recommendations for encoding these types into a Timestamp
+/// column.
+///
+/// An "instant" represents a physical point in time that has no relevant timezone
+/// (for example, astronomical data). To encode an instant, use a Timestamp with
+/// the timezone string set to "UTC", and make sure the Timestamp values
+/// are relative to the UTC epoch (January 1st 1970, midnight).
+///
+/// A "zoned date-time" represents a physical point in time annotated with an
+/// informative timezone (for example, the timezone in which the data was
+/// recorded). To encode a zoned date-time, use a Timestamp with the timezone
+/// string set to the name of the timezone, and make sure the Timestamp values
+/// are relative to the UTC epoch (January 1st 1970, midnight).
+///
+/// (There is some ambiguity between an instant and a zoned date-time with the
+/// UTC timezone. Both of these are stored the same in Arrow. Typically,
+/// this distinction does not matter. If it does, then an application should
+/// use custom metadata or an extension type to distinguish between the two cases.)
+///
+/// An "offset date-time" represents a physical point in time combined with an
+/// explicit offset from UTC. To encode an offset date-time, use a Timestamp
+/// with the timezone string set to the numeric timezone offset string
+/// (e.g. "+03:00"), and make sure the Timestamp values are relative to
+/// the UTC epoch (January 1st 1970, midnight).
+///
+/// A "naive date-time" (also called "local date-time" in some libraries)
+/// represents a wall clock time combined with a calendar date, but with
+/// no indication of how to map this information to a physical point in time.
+/// Naive date-times must be handled with care because of this missing
+/// information, and also because daylight saving time (DST) may make
+/// some values ambiguous or non-existent. A naive date-time may be
+/// stored as a struct with Date and Time fields. However, it may also be
+/// encoded into a Timestamp column with an empty timezone. The timestamp
+/// values should be computed "as if" the timezone of the date-time values
+/// was UTC; for example, the naive date-time "January 1st 1970, 00h00" would
+/// be encoded as timestamp value 0.
+table Timestamp {
+ unit: TimeUnit;
+
+ /// The timezone is an optional string indicating the name of a timezone,
+ /// one of:
+ ///
+ /// * As used in the Olson timezone database (the "tz database" or
+ /// "tzdata"), such as "America/New_York".
+ /// * An absolute timezone offset of the form "+XX:XX" or "-XX:XX",
+ /// such as "+07:30".
+ ///
+ /// Whether a timezone string is present indicates different semantics about
+ /// the data (see above).
+ timezone: string;
+}
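A pyarrow sketch of the timezone cases discussed above; note that an "instant" and a zoned date-time in UTC are the same type, as the guidelines explain:

```python
import pyarrow as pa

# Zoned: values are relative to the UTC epoch; the timezone string only
# informs interpretation and display.
zoned = pa.timestamp("ms", tz="Europe/Paris")
instant = pa.timestamp("ms", tz="UTC")

# Naive: no timezone; value 0 means "1970-01-01 00:00" on an unknown clock.
naive = pa.timestamp("ms")

# Value 0 with tz="Europe/Paris" is the physical instant 1970-01-01 00:00 UTC,
# which an application may display as 1970-01-01 01:00 in Paris.
arr = pa.array([0], type=zoned)
print(arr)
```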
+
+enum IntervalUnit: short { YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO }
+// A "calendar" interval which models types that don't necessarily
+// have a precise duration without the context of a base timestamp (e.g.
+// days can differ in length during day light savings time transitions).
+// All integers in the types below are stored in the endianness indicated
+// by the schema.
+//
+// YEAR_MONTH - Indicates the number of elapsed whole months, stored as
+// 4-byte signed integers.
+// DAY_TIME - Indicates the number of elapsed days and milliseconds (no leap seconds),
+// stored as 2 contiguous 32-bit signed integers (8-bytes in total). Support
+// of this IntervalUnit is not required for full arrow compatibility.
+// MONTH_DAY_NANO - A triple of the number of elapsed months, days, and nanoseconds.
+// The values are stored contiguously in 16-byte blocks. Months and days are
+// encoded as 32-bit signed integers and nanoseconds is encoded as a 64-bit
+// signed integer. Nanoseconds does not allow for leap seconds. Each field is
+// independent (e.g. there is no constraint that nanoseconds have the same
+// sign as days or that the quantity of nanoseconds represents less than a
+// day's worth of time).
+table Interval {
+ unit: IntervalUnit;
+}
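A pyarrow sketch of the MONTH_DAY_NANO layout (assuming a pyarrow version that exposes pa.month_day_nano_interval, i.e. 5.0 or later):

```python
import pyarrow as pa

# Each value packs (months: int32, days: int32, nanoseconds: int64) into
# 16 bytes; the three fields are independent of one another.
val = pa.MonthDayNano([1, 15, 30_000_000_000])  # 1 month, 15 days, 30 s
arr = pa.array([val], type=pa.month_day_nano_interval())
print(arr)
```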
+
+// An absolute length of time unrelated to any calendar artifacts.
+//
+// For the purposes of Arrow implementations, adding this value to a Timestamp
+// ("t1") naively (i.e. simply summing the two numbers) is acceptable even
+// though in some cases the resulting Timestamp (t2) would not account for
+// leap-seconds during the elapsed time between "t1" and "t2". Similarly,
+// representing the difference between two Unix timestamps is acceptable, but
+// would yield a value that is possibly a few seconds off from the true elapsed
+// time.
+//
+// The resolution defaults to millisecond, but can be any of the other
+// supported TimeUnit values as with Timestamp and Time types. This type is
+// always represented as an 8-byte integer.
+table Duration {
+ unit: TimeUnit = MILLISECOND;
+}
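The naive-summation semantics can be exercised with pyarrow's compute kernels (assuming a pyarrow version whose add kernel accepts a timestamp plus a duration):

```python
import pyarrow as pa
import pyarrow.compute as pc

ts = pa.array([0], type=pa.timestamp("ms", tz="UTC"))
d = pa.array([86_400_000], type=pa.duration("ms"))  # one 86400-second day
# Naive summation as described above: no leap-second accounting.
print(pc.add(ts, d))  # 1970-01-02 00:00:00 UTC
```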
+
+/// ----------------------------------------------------------------------
+/// Top-level Type value, enabling extensible type-specific metadata. We can
+/// add new logical types to Type without breaking backwards compatibility
+
+union Type {
+ Null,
+ Int,
+ FloatingPoint,
+ Binary,
+ Utf8,
+ Bool,
+ Decimal,
+ Date,
+ Time,
+ Timestamp,
+ Interval,
+ List,
+ Struct_,
+ Union,
+ FixedSizeBinary,
+ FixedSizeList,
+ Map,
+ Duration,
+ LargeBinary,
+ LargeUtf8,
+ LargeList,
+}
+
+/// ----------------------------------------------------------------------
+/// User-defined key-value pairs to add custom metadata to Arrow.
+/// Key namespacing is the responsibility of the user.
+
+table KeyValue {
+ key: string;
+ value: string;
+}
+
+/// ----------------------------------------------------------------------
+/// Dictionary encoding metadata.
+/// Maintained for forwards compatibility: in the future,
+/// dictionaries might be explicit maps between integers and values,
+/// allowing for non-contiguous index values.
+enum DictionaryKind : short { DenseArray }
+table DictionaryEncoding {
+ /// The known dictionary id in the application where this data is used. In
+ /// the file or streaming formats, the dictionary ids are found in the
+ /// DictionaryBatch messages
+ id: long;
+
+ /// The dictionary indices are constrained to be non-negative integers. If
+ /// this field is null, the indices must be signed int32. To maximize
+ /// cross-language compatibility and performance, implementations are
+ /// recommended to prefer signed integer types over unsigned integer types
+ /// and to avoid uint64 indices unless they are required by an application.
+ indexType: Int;
+
+ /// By default, dictionaries are not ordered, or the order does not have
+ /// semantic meaning. In some statistical applications, dictionary encoding
+ /// is used to represent ordered categorical data, and we provide a way to
+ /// preserve that metadata here.
+ isOrdered: bool;
+
+ dictionaryKind: DictionaryKind;
+}
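A pyarrow sketch showing the recommended signed int32 indices and the index/dictionary split:

```python
import pyarrow as pa

# int32 indices (the recommended default) into a string dictionary.
t = pa.dictionary(pa.int32(), pa.string(), ordered=False)
arr = pa.array(["low", "high", "low"], type=t)
print(arr.indices)     # [0, 1, 0]
print(arr.dictionary)  # ["low", "high"]
```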
+
+/// ----------------------------------------------------------------------
+/// A field represents a named column in a record / row batch or child of a
+/// nested type.
+
+table Field {
+ /// Name is not required (e.g. for the child of a List).
+ name: string;
+
+ /// Whether or not this field can contain nulls. Should be true in general.
+ nullable: bool;
+
+ /// This is the type of the decoded value if the field is dictionary encoded.
+ type: Type;
+
+ /// Present only if the field is dictionary encoded.
+ dictionary: DictionaryEncoding;
+
+ /// children apply only to nested data types like Struct, List and Union. For
+ /// primitive types children will have length 0.
+ children: [ Field ];
+
+ /// User-defined metadata
+ custom_metadata: [ KeyValue ];
+}
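For example, a Field with a nested type and custom metadata, sketched with pyarrow (the child of a List is the unnamed-name case mentioned above; pyarrow defaults its name to "item"):

```python
import pyarrow as pa

f = pa.field(
    "scores",
    pa.list_(pa.float64()),
    nullable=True,
    metadata={"source": "sensor-7"},  # becomes custom_metadata KeyValues
)
print(f)
print(f.type.value_field)  # the single child Field of the List
```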
+
+/// ----------------------------------------------------------------------
+/// Endianness of the platform producing the data
+
+enum Endianness:short { Little, Big }
+
+/// ----------------------------------------------------------------------
+/// A Buffer represents a single contiguous memory segment
+struct Buffer {
+ /// The relative offset into the shared memory page where the bytes for this
+ /// buffer start
+ offset: long;
+
+ /// The absolute length (in bytes) of the memory buffer. The memory is found
+ /// from offset (inclusive) to offset + length (non-inclusive). When building
+ /// messages using the encapsulated IPC message, padding bytes may be written
+ /// after a buffer, but such padding bytes do not need to be accounted for in
+ /// the size here.
+ length: long;
+}
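The buffer layout can be inspected from pyarrow: an Int32 array with nulls carries a validity bitmap buffer followed by a values buffer:

```python
import pyarrow as pa

arr = pa.array([1, None, 3], type=pa.int32())
# Two buffers: validity bitmap and the 32-bit values.
for buf in arr.buffers():
    print(buf.size if buf is not None else None)
```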
+
+/// ----------------------------------------------------------------------
+/// A Schema describes the columns in a row batch
+
+table Schema {
+
+ /// Endianness of the buffers. It is Little by default; if the endianness
+ /// doesn't match the underlying system, the vectors need to be converted.
+ endianness: Endianness=Little;
+
+ fields: [Field];
+ /// User-defined metadata
+ custom_metadata: [ KeyValue ];
+
+ /// Features used in the stream/file.
+ features : [ Feature ];
+}
+
+root_type Schema;
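Putting it together: a pyarrow sketch that builds a Schema and round-trips it through the IPC schema message whose flatbuffer payload uses this root_type:

```python
import pyarrow as pa

# A schema exercising several of the types defined above.
schema = pa.schema(
    [
        pa.field("id", pa.int64(), nullable=False),
        pa.field("name", pa.string()),
        pa.field("tags", pa.map_(pa.string(), pa.string())),
        pa.field("created", pa.timestamp("us", tz="UTC")),
    ],
    metadata={"owner": "example"},
)
# serialize() produces the IPC schema message; its flatbuffer payload is
# the Schema table defined in this file.
buf = schema.serialize()
print(pa.ipc.read_schema(buf) == schema)  # True
```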