Diffstat (limited to 'src/arrow/format/Schema.fbs')
-rw-r--r-- | src/arrow/format/Schema.fbs | 522 |
1 file changed, 522 insertions, 0 deletions
diff --git a/src/arrow/format/Schema.fbs b/src/arrow/format/Schema.fbs
new file mode 100644
index 000000000..7ee827b5d
--- /dev/null
+++ b/src/arrow/format/Schema.fbs
@@ -0,0 +1,522 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// Logical types, vector layouts, and schemas
+
+/// Format Version History.
+/// Version 1.0 - Forward and backwards compatibility guaranteed.
+/// Version 1.1 - Add Decimal256 (no format release).
+/// Version 1.2 (Pending) - Add Interval MONTH_DAY_NANO.
+
+namespace org.apache.arrow.flatbuf;
+
+enum MetadataVersion:short {
+  /// 0.1.0 (October 2016).
+  V1,
+
+  /// 0.2.0 (February 2017). Non-backwards compatible with V1.
+  V2,
+
+  /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2.
+  V3,
+
+  /// >= 0.8.0 (December 2017). Non-backwards compatible with V3.
+  V4,
+
+  /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
+  /// metadata and IPC messages). Implementations are recommended to provide a
+  /// V4 compatibility mode with V5 format changes disabled.
+  ///
+  /// Incompatible changes between V4 and V5:
+  /// - Union buffer layout has changed. In V5, Unions don't have a validity
+  ///   bitmap buffer.
+  V5,
+}
+
+/// Represents Arrow Features that might not have full support
+/// within implementations. This is intended to be used in
+/// two scenarios:
+/// 1. A mechanism for readers of Arrow Streams
+///    and files to understand that the stream or file makes
+///    use of a feature that is unsupported by or unknown to
+///    the implementation (and therefore can meet the Arrow
+///    forward compatibility guarantees).
+/// 2. A means of negotiating between a client and server
+///    what features a stream is allowed to use. The enum
+///    values here are intended to represent higher-level
+///    features; additional details may be negotiated
+///    with key-value pairs specific to the protocol.
+///
+/// Enums added to this list should be assigned power-of-two values
+/// to facilitate exchanging and comparing bitmaps for supported
+/// features.
+enum Feature : long {
+  /// Needed to make flatbuffers happy.
+  UNUSED = 0,
+  /// The stream makes use of multiple full dictionaries with the
+  /// same ID and assumes clients implement dictionary replacement
+  /// correctly.
+  DICTIONARY_REPLACEMENT = 1,
+  /// The stream makes use of compressed bodies as described
+  /// in Message.fbs.
+  COMPRESSED_BODY = 2
+}
+
+/// These are stored in the flatbuffer in the Type union below.
+
+table Null {
+}
+
+/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
+/// (according to the physical memory layout). We used Struct_ here as
+/// Struct is a reserved word in Flatbuffers.
+table Struct_ {
+}
+
+table List {
+}
+
+/// Same as List, but with 64-bit offsets, allowing it to represent
+/// extremely large data values.
+table LargeList {
+}
+
+table FixedSizeList {
+  /// Number of list items per value
+  listSize: int;
+}
+
+/// A Map is a logical nested type that is represented as
+///
+/// List<entries: Struct<key: K, value: V>>
+///
+/// In this layout, the keys and values are each respectively contiguous. We do
+/// not constrain the key and value types, so the application is responsible
+/// for ensuring that the keys are hashable and unique. Whether the keys are sorted
+/// may be set in the metadata for this field.
+///
+/// In a field with Map type, the field has a child Struct field, which then
+/// has two children: the key type and the value type. The names of the
+/// child fields may be respectively "entries", "key", and "value", but this is
+/// not enforced.
+///
+/// Map
+/// ```text
+/// - child[0] entries: Struct
+///    - child[0] key: K
+///    - child[1] value: V
+/// ```
+/// Neither the "entries" field nor the "key" field may be nullable.
+///
+/// The metadata is structured so that Arrow systems without special handling
+/// for Map can make Map an alias for List. The "layout" attribute for the Map
+/// field must have the same contents as a List.
+table Map {
+  /// Set to true if the keys within each value are sorted
+  keysSorted: bool;
+}
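To make the `List<entries: Struct<key, value>>` layout concrete, here is a minimal sketch using pyarrow. The library choice is an illustrative assumption, not part of this file; `pa.map_` and the child-type accessors are standard pyarrow APIs.

```python
import pyarrow as pa

# Build a Map type; its physical storage is List<entries: Struct<key, value>>.
map_type = pa.map_(pa.string(), pa.int32())
arr = pa.array([[("a", 1), ("b", 2)], None], type=map_type)

print(map_type)            # map<string, int32>
print(map_type.key_type)   # string: the "key" child (may not be nullable)
print(map_type.item_type)  # int32: the "value" child
```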
+enum UnionMode:short { Sparse, Dense }
+
+/// A union is a complex type with children in Field.
+/// By default, ids in the type vector refer to the offsets in the children;
+/// optionally, typeIds provides an indirection between the child offset and
+/// the type id: for each child, `typeIds[offset]` is the id used in the type
+/// vector.
+table Union {
+  mode: UnionMode;
+  typeIds: [ int ]; // optional, describes the type id of each child.
+}
+
+table Int {
+  bitWidth: int; // restricted to 8, 16, 32, and 64 in v1
+  is_signed: bool;
+}
+
+enum Precision:short {HALF, SINGLE, DOUBLE}
+
+table FloatingPoint {
+  precision: Precision;
+}
+
+/// Unicode with UTF-8 encoding
+table Utf8 {
+}
+
+/// Opaque binary data
+table Binary {
+}
+
+/// Same as Utf8, but with 64-bit offsets, allowing it to represent
+/// extremely large data values.
+table LargeUtf8 {
+}
+
+/// Same as Binary, but with 64-bit offsets, allowing it to represent
+/// extremely large data values.
+table LargeBinary {
+}
+
+table FixedSizeBinary {
+  /// Number of bytes per value
+  byteWidth: int;
+}
+
+table Bool {
+}
+
+/// Exact decimal value represented as an integer value in two's
+/// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers
+/// are used. The representation uses the endianness indicated
+/// in the Schema.
+table Decimal {
+  /// Total number of decimal digits
+  precision: int;
+
+  /// Number of digits after the decimal point "."
+  scale: int;
+
+  /// Number of bits per value. The only accepted widths are 128 and 256.
+  /// We use bitWidth for consistency with Int::bitWidth.
+  bitWidth: int = 128;
+}
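As a sketch of the precision/scale/bitWidth fields above, pyarrow (used here purely for illustration) exposes the two accepted bit widths as separate constructors.

```python
import pyarrow as pa
from decimal import Decimal

# The two accepted bit widths map to distinct type constructors.
d128 = pa.decimal128(38, 9)   # bitWidth = 128 (the default above)
d256 = pa.decimal256(76, 18)  # bitWidth = 256

# Values are exact scaled integers stored in two's complement.
arr = pa.array([Decimal("123.456000000"), None], type=d128)
print(arr.type)  # decimal128(38, 9)
```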
+enum DateUnit: short {
+  DAY,
+  MILLISECOND
+}
+
+/// Date is either a 32-bit or 64-bit signed integer type representing an
+/// elapsed time since UNIX epoch (1970-01-01), stored in either of two units:
+///
+/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
+///   leap seconds), where the values are evenly divisible by 86400000
+/// * Days (32 bits) since the UNIX epoch
+table Date {
+  unit: DateUnit = MILLISECOND;
+}
+
+enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND }
+
+/// Time is either a 32-bit or 64-bit signed integer type representing an
+/// elapsed time since midnight, stored in one of four units: seconds,
+/// milliseconds, microseconds or nanoseconds.
+///
+/// The integer `bitWidth` depends on the `unit` and must be one of the following:
+/// * SECOND and MILLISECOND: 32 bits
+/// * MICROSECOND and NANOSECOND: 64 bits
+///
+/// The allowed values are between 0 (inclusive) and 86400 (=24*60*60) seconds
+/// (exclusive), adjusted for the time unit (for example, up to 86400000
+/// exclusive for the MILLISECOND unit).
+/// This definition doesn't allow for leap seconds. Time values from
+/// measurements with leap seconds will need to be corrected when ingesting
+/// into Arrow (for example by replacing the value 86400 with 86399).
+table Time {
+  unit: TimeUnit = MILLISECOND;
+  bitWidth: int = 32;
+}
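A quick sketch of the unit-to-bit-width coupling, again using pyarrow as an illustrative assumption:

```python
import pyarrow as pa

# DateUnit: DAY is a 32-bit type, MILLISECOND is a 64-bit type.
print(pa.date32())  # date32[day]
print(pa.date64())  # date64[ms]

# TimeUnit constrains the bit width exactly as documented above.
print(pa.time32("s"))   # 32 bits: SECOND
print(pa.time32("ms"))  # 32 bits: MILLISECOND
print(pa.time64("us"))  # 64 bits: MICROSECOND
print(pa.time64("ns"))  # 64 bits: NANOSECOND

# Mismatched combinations are rejected: pa.time32("ns") raises an error.
```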
+/// Timestamp is a 64-bit signed integer representing an elapsed time since a
+/// fixed epoch, stored in one of four units: seconds, milliseconds,
+/// microseconds or nanoseconds, and is optionally annotated with a timezone.
+///
+/// Timestamp values do not include any leap seconds (in other words, all
+/// days are considered 86400 seconds long).
+///
+/// Timestamps with a non-empty timezone
+/// ------------------------------------
+///
+/// If a Timestamp column has a non-empty timezone value, its epoch is
+/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
+/// (the Unix epoch), regardless of the Timestamp's own timezone.
+///
+/// Therefore, timestamp values with a non-empty timezone correspond to
+/// physical points in time together with some additional information about
+/// how the data was obtained and/or how to display it (the timezone).
+///
+/// For example, the timestamp value 0 with the timezone string "Europe/Paris"
+/// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
+/// application may prefer to display it as "January 1st 1970, 01h00" in
+/// the Europe/Paris timezone (which is the same physical point in time).
+///
+/// One consequence is that timestamp values with a non-empty timezone
+/// can be compared and ordered directly, since they all share the same
+/// well-known point of reference (the Unix epoch).
+///
+/// Timestamps with an unset / empty timezone
+/// -----------------------------------------
+///
+/// If a Timestamp column has no timezone value, its epoch is
+/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
+///
+/// Therefore, timestamp values without a timezone cannot be meaningfully
+/// interpreted as physical points in time, but only as calendar / clock
+/// indications ("wall clock time") in an unspecified timezone.
+///
+/// For example, the timestamp value 0 with an empty timezone string
+/// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
+/// is not enough information to interpret it as a well-defined physical
+/// point in time.
+///
+/// One consequence is that timestamp values without a timezone cannot
+/// be reliably compared or ordered, since they may have different points of
+/// reference. In particular, it is *not* possible to interpret an unset
+/// or empty timezone as the same as "UTC".
+///
+/// Conversion between timezones
+/// ----------------------------
+///
+/// If a Timestamp column has a non-empty timezone, changing the timezone
+/// to a different non-empty value is a metadata-only operation:
+/// the timestamp values need not change as their point of reference remains
+/// the same (the Unix epoch).
+///
+/// However, if a Timestamp column has no timezone value, changing it to a
+/// non-empty value requires thinking about the desired semantics.
+/// One possibility is to assume that the original timestamp values are
+/// relative to the epoch of the timezone being set; timestamp values should
+/// then be adjusted to the Unix epoch (for example, changing the timezone from
+/// empty to "Europe/Paris" would require converting the timestamp values
+/// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
+/// nevertheless correct).
+///
+/// Guidelines for encoding data from external libraries
+/// ----------------------------------------------------
+///
+/// Date & time libraries often have multiple different data types for temporal
+/// data. In order to ease interoperability between different implementations,
+/// the Arrow project has some recommendations for encoding these types into a
+/// Timestamp column.
+///
+/// An "instant" represents a physical point in time that has no relevant timezone
+/// (for example, astronomical data). To encode an instant, use a Timestamp with
+/// the timezone string set to "UTC", and make sure the Timestamp values
+/// are relative to the UTC epoch (January 1st 1970, midnight).
+///
+/// A "zoned date-time" represents a physical point in time annotated with an
+/// informative timezone (for example, the timezone in which the data was
+/// recorded). To encode a zoned date-time, use a Timestamp with the timezone
+/// string set to the name of the timezone, and make sure the Timestamp values
+/// are relative to the UTC epoch (January 1st 1970, midnight).
+///
+/// (There is some ambiguity between an instant and a zoned date-time with the
+/// UTC timezone. Both of these are stored the same in Arrow. Typically,
+/// this distinction does not matter. If it does, then an application should
+/// use custom metadata or an extension type to distinguish between the two cases.)
+///
+/// An "offset date-time" represents a physical point in time combined with an
+/// explicit offset from UTC. To encode an offset date-time, use a Timestamp
+/// with the timezone string set to the numeric timezone offset string
+/// (e.g. "+03:00"), and make sure the Timestamp values are relative to
+/// the UTC epoch (January 1st 1970, midnight).
+///
+/// A "naive date-time" (also called "local date-time" in some libraries)
+/// represents a wall clock time combined with a calendar date, but with
+/// no indication of how to map this information to a physical point in time.
+/// Naive date-times must be handled with care because of this missing
+/// information, and also because daylight saving time (DST) may make
+/// some values ambiguous or non-existent. A naive date-time may be
+/// stored as a struct with Date and Time fields. However, it may also be
+/// encoded into a Timestamp column with an empty timezone. The timestamp
+/// values should be computed "as if" the timezone of the date-time values
+/// was UTC; for example, the naive date-time "January 1st 1970, 00h00" would
+/// be encoded as timestamp value 0.
+table Timestamp {
+  unit: TimeUnit;
+
+  /// The timezone is an optional string indicating the name of a timezone,
+  /// one of:
+  ///
+  /// * As used in the Olson timezone database (the "tz database" or
+  ///   "tzdata"), such as "America/New_York".
+  /// * An absolute timezone offset of the form "+XX:XX" or "-XX:XX",
+  ///   such as "+07:30".
+  ///
+  /// Whether a timezone string is present indicates different semantics about
+  /// the data (see above).
+  timezone: string;
+}
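The zoned vs. naive distinction can be seen in a small pyarrow sketch (illustrative, not part of this file): the same stored int64 value is a physical instant when a timezone is present and a wall-clock reading when it is absent.

```python
import pyarrow as pa

# Same stored value, different semantics depending on the timezone field.
zoned = pa.scalar(0, type=pa.timestamp("s", tz="Europe/Paris"))
naive = pa.scalar(0, type=pa.timestamp("s"))

# tz-aware: a well-defined physical instant (the Unix epoch), which an
# application may display as 1970-01-01 01:00 in Europe/Paris.
print(zoned.as_py())

# naive: a wall-clock indication in an unknown timezone; not reliably
# comparable to the zoned value above.
print(naive.as_py())
```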
+enum IntervalUnit: short { YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO }
+// A "calendar" interval which models types that don't necessarily
+// have a precise duration without the context of a base timestamp (e.g.
+// days can differ in length during daylight saving time transitions).
+// All integers in the types below are stored in the endianness indicated
+// by the schema.
+//
+// YEAR_MONTH - Indicates the number of elapsed whole months, stored as
+//   4-byte signed integers.
+// DAY_TIME - Indicates the number of elapsed days and milliseconds (no leap seconds),
+//   stored as 2 contiguous 32-bit signed integers (8 bytes in total). Support
+//   of this IntervalUnit is not required for full arrow compatibility.
+// MONTH_DAY_NANO - A triple of the number of elapsed months, days, and nanoseconds.
+//   The values are stored contiguously in 16-byte blocks. Months and days are
+//   encoded as 32-bit signed integers and nanoseconds is encoded as a 64-bit
+//   signed integer. Nanoseconds does not allow for leap seconds. Each field is
+//   independent (e.g. there is no constraint that nanoseconds have the same
+//   sign as days or that the quantity of nanoseconds represents less than a
+//   day's worth of time).
+table Interval {
+  unit: IntervalUnit;
+}
+
+// An absolute length of time unrelated to any calendar artifacts.
+//
+// For the purposes of Arrow implementations, adding this value to a Timestamp
+// ("t1") naively (i.e. simply summing the two numbers) is acceptable even
+// though in some cases the resulting Timestamp (t2) would not account for
+// leap-seconds during the elapsed time between "t1" and "t2". Similarly,
+// representing the difference between two Unix timestamps is acceptable, but
+// would yield a value that is possibly a few seconds off from the true elapsed
+// time.
+//
+// The resolution defaults to millisecond, but can be any of the other
+// supported TimeUnit values as with Timestamp and Time types. This type is
+// always represented as an 8-byte integer.
+table Duration {
+  unit: TimeUnit = MILLISECOND;
+}
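A sketch of the difference between calendar intervals and absolute durations, again in pyarrow for illustration (the `month_day_nano_interval` constructor assumes a reasonably recent pyarrow release):

```python
import datetime
import pyarrow as pa

# Duration: an absolute 64-bit elapsed-time count in one TimeUnit.
dur = pa.array([datetime.timedelta(hours=1, seconds=30)],
               type=pa.duration("ms"))
print(dur.type)  # duration[ms]

# MONTH_DAY_NANO interval: months, days and nanoseconds are independent,
# since a "month" or "day" has no fixed length without a base timestamp.
print(pa.month_day_nano_interval())
```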
+/// ----------------------------------------------------------------------
+/// Top-level Type value, enabling extensible type-specific metadata. We can
+/// add new logical types to Type without breaking backwards compatibility.
+
+union Type {
+  Null,
+  Int,
+  FloatingPoint,
+  Binary,
+  Utf8,
+  Bool,
+  Decimal,
+  Date,
+  Time,
+  Timestamp,
+  Interval,
+  List,
+  Struct_,
+  Union,
+  FixedSizeBinary,
+  FixedSizeList,
+  Map,
+  Duration,
+  LargeBinary,
+  LargeUtf8,
+  LargeList,
+}
+
+/// ----------------------------------------------------------------------
+/// User-defined key-value pairs to add custom metadata to Arrow.
+/// Key namespacing is the responsibility of the user.
+
+table KeyValue {
+  key: string;
+  value: string;
+}
+
+/// ----------------------------------------------------------------------
+/// Dictionary encoding metadata.
+/// Maintained for forwards compatibility; in the future,
+/// dictionaries might be explicit maps between integers and values,
+/// allowing for non-contiguous index values.
+enum DictionaryKind : short { DenseArray }
+table DictionaryEncoding {
+  /// The known dictionary id in the application where this data is used. In
+  /// the file or streaming formats, the dictionary ids are found in the
+  /// DictionaryBatch messages.
+  id: long;
+
+  /// The dictionary indices are constrained to be non-negative integers. If
+  /// this field is null, the indices must be signed int32. To maximize
+  /// cross-language compatibility and performance, implementations are
+  /// recommended to prefer signed integer types over unsigned integer types
+  /// and to avoid uint64 indices unless they are required by an application.
+  indexType: Int;
+
+  /// By default, dictionaries are not ordered, or the order does not have
+  /// semantic meaning. In some statistical applications, dictionary-encoding
+  /// is used to represent ordered categorical data, and we provide a way to
+  /// preserve that metadata here.
+  isOrdered: bool;
+
+  dictionaryKind: DictionaryKind;
+}
+
+/// ----------------------------------------------------------------------
+/// A field represents a named column in a record / row batch or child of a
+/// nested type.
+
+table Field {
+  /// Name is not required (e.g. in a List)
+  name: string;
+
+  /// Whether or not this field can contain nulls. Should be true in general.
+  nullable: bool;
+
+  /// This is the type of the decoded value if the field is dictionary encoded.
+  type: Type;
+
+  /// Present only if the field is dictionary encoded.
+  dictionary: DictionaryEncoding;
+
+  /// children apply only to nested data types like Struct, List and Union. For
+  /// primitive types children will have length 0.
+  children: [ Field ];
+
+  /// User-defined metadata
+  custom_metadata: [ KeyValue ];
+}
+
+/// ----------------------------------------------------------------------
+/// Endianness of the platform producing the data
+
+enum Endianness:short { Little, Big }
+
+/// ----------------------------------------------------------------------
+/// A Buffer represents a single contiguous memory segment
+struct Buffer {
+  /// The relative offset into the shared memory page where the bytes for this
+  /// buffer start
+  offset: long;
+
+  /// The absolute length (in bytes) of the memory buffer. The memory is found
+  /// from offset (inclusive) to offset + length (non-inclusive). When building
+  /// messages using the encapsulated IPC message, padding bytes may be written
+  /// after a buffer, but such padding bytes do not need to be accounted for in
+  /// the size here.
+  length: long;
+}
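To illustrate DictionaryEncoding and Field, a short pyarrow sketch (illustrative; the field name and metadata keys are made up):

```python
import pyarrow as pa

# Dictionary-encode some strings; the index type defaults to a signed
# integer, matching the recommendation above.
arr = pa.array(["red", "blue", "red"]).dictionary_encode()
print(arr.indices)     # small integer codes into the dictionary
print(arr.dictionary)  # the decoded values ["red", "blue"]

# A Field carries the *decoded* value type plus custom KeyValue metadata.
field = pa.field("color", pa.dictionary(pa.int32(), pa.utf8()),
                 nullable=True, metadata={"origin": "example"})
print(field)
```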
+/// ----------------------------------------------------------------------
+/// A Schema describes the columns in a row batch
+
+table Schema {
+
+  /// Endianness of the buffer.
+  /// It is Little Endian by default; if the endianness doesn't match the
+  /// underlying system, then the vectors need to be converted.
+  endianness: Endianness=Little;
+
+  fields: [Field];
+  // User-defined metadata
+  custom_metadata: [ KeyValue ];
+
+  /// Features used in the stream/file.
+  features : [ Feature ];
+}
+
+root_type Schema;
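Finally, an end-to-end sketch in pyarrow (names are illustrative): building a Schema with custom metadata and round-tripping it through the IPC stream format, which serializes this flatbuffer Schema as the first message of the stream.

```python
import pyarrow as pa

# A Schema is a list of Fields plus optional KeyValue metadata.
schema = pa.schema(
    [pa.field("id", pa.int64(), nullable=False),
     pa.field("name", pa.utf8())],
    metadata={"source": "example"},
)

# Round-trip through the IPC stream format.
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, schema) as writer:
    batch = pa.record_batch([pa.array([1]), pa.array(["a"])], schema=schema)
    writer.write_batch(batch)

reader = pa.ipc.open_stream(sink.getvalue())
print(reader.schema)           # the fields declared above
print(reader.schema.metadata)  # {b'source': b'example'}
```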