summaryrefslogtreecommitdiffstats
path: root/src/arrow/format/Schema.fbs
blob: 7ee827b5de8daa3804eb6f8d62309b5d47027937 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

/// Logical types, vector layouts, and schemas

/// Format Version History.
/// Version 1.0 - Forward and backwards compatibility guaranteed.
/// Version 1.1 - Add Decimal256 (No format release).
/// Version 1.2 (Pending) - Add Interval MONTH_DAY_NANO.

// All declarations in this file are placed in this namespace.
namespace org.apache.arrow.flatbuf;

/// Versions of the metadata format. Each value is annotated with the
/// release range (and release date) it corresponds to.
enum MetadataVersion:short {
  /// 0.1.0 (October 2016).
  V1,

  /// 0.2.0 (February 2017). Non-backwards compatible with V1.
  V2,

  /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2.
  V3,

  /// >= 0.8.0 (December 2017). Non-backwards compatible with V3.
  V4,

  /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
  /// metadata and IPC messages). Implementations are recommended to provide a
  /// V4 compatibility mode with V5 format changes disabled.
  ///
  /// Incompatible changes between V4 and V5:
  /// - Union buffer layout has changed. In V5, Unions don't have a validity
  ///   bitmap buffer.
  V5,
}

/// Represents Arrow Features that might not have full support
/// within implementations. This is intended to be used in
/// two scenarios:
///  1.  A mechanism for readers of Arrow Streams
///      and files to understand that the stream or file makes
///      use of a feature that isn't supported or unknown to
///      the implementation (and therefore can meet the Arrow
///      forward compatibility guarantees).
///  2.  A means of negotiating between a client and server
///      what features a stream is allowed to use. The enum
///      values here are intended to represent higher level
///      features; additional details may be negotiated
///      with key-value pairs specific to the protocol.
///
/// Enums added to this list should be assigned power-of-two values
/// to facilitate exchanging and comparing bitmaps for supported
/// features.
enum Feature : long {
  /// Needed to make flatbuffers happy.
  UNUSED = 0,
  /// The stream makes use of multiple full dictionaries with the
  /// same ID and assumes clients implement dictionary replacement
  /// correctly.
  DICTIONARY_REPLACEMENT = 1,
  /// The stream makes use of compressed bodies as described
  /// in Message.fbs.
  COMPRESSED_BODY = 2
}

/// These are stored in the flatbuffer in the Type union below

/// Null logical type. The table declares no fields, i.e. the type has no
/// type-specific parameters.
table Null {
}

/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
/// (according to the physical memory layout). The name Struct_ is used here
/// because Struct is a reserved word in Flatbuffers.
///
/// The table declares no fields of its own; as documented on Field.children,
/// nested types such as Struct use the children of the owning Field.
table Struct_ {
}

/// List logical type. The table declares no fields of its own; as documented
/// on Field.children, nested types such as List use the children of the
/// owning Field.
table List {
}

/// Same as List, but with 64-bit offsets, allowing extremely large data
/// values to be represented. The table declares no fields of its own.
table LargeList {
}

/// List logical type in which every value holds the same number of items,
/// given by listSize.
table FixedSizeList {
  /// Number of list items per value
  listSize: int;
}

/// A Map is a logical nested type that is represented as
///
/// List<entries: Struct<key: K, value: V>>
///
/// In this layout, the keys and values are each respectively contiguous. We do
/// not constrain the key and value types, so the application is responsible
/// for ensuring that the keys are hashable and unique. Whether the keys are sorted
/// may be set in the metadata for this field.
///
/// In a field with Map type, the field has a child Struct field, which in turn
/// has two children: the first is the key type and the second is the value
/// type. The names of the child fields may be respectively "entries", "key",
/// and "value", but this is not enforced.
///
/// Map
/// ```text
///   - child[0] entries: Struct
///     - child[0] key: K
///     - child[1] value: V
/// ```
/// Neither the "entries" field nor the "key" field may be nullable.
///
/// The metadata is structured so that Arrow systems without special handling
/// for Map can make Map an alias for List. The "layout" attribute for the Map
/// field must have the same contents as a List.
table Map {
  /// Set to true if the keys within each value are sorted
  keysSorted: bool;
}

/// Mode of a Union type; referenced by Union.mode.
/// The numeric values below are the ones FlatBuffers assigns implicitly
/// from declaration order, written out explicitly.
enum UnionMode : short {
  Sparse = 0,
  Dense = 1
}

/// A union is a complex type with children in Field.
///
/// By default, ids in the type vector refer to the offsets in the children.
/// Optionally, typeIds provides an indirection between the child offset and
/// the type id: for each child, `typeIds[offset]` is the id used in the type
/// vector.
table Union {
  // No explicit default is declared, so an unset mode reads back as the first
  // UnionMode value.
  mode: UnionMode;
  typeIds: [ int ]; // optional, describes typeid of each child.
}

/// Integer logical type, parameterized by bit width and signedness.
table Int {
  bitWidth: int; // restricted to 8, 16, 32, and 64 in v1
  is_signed: bool; // true for signed integers, false for unsigned
}

/// Precision of a FloatingPoint type; referenced by FloatingPoint.precision.
/// The numeric values below are the ones FlatBuffers assigns implicitly
/// from declaration order, written out explicitly.
enum Precision : short {
  HALF = 0,
  SINGLE = 1,
  DOUBLE = 2
}

/// Floating-point logical type, parameterized by its precision.
table FloatingPoint {
  // No explicit default is declared, so an unset precision reads back as the
  // first Precision value.
  precision: Precision;
}

/// Unicode with UTF-8 encoding. The table declares no fields, i.e. the type
/// has no type-specific parameters.
table Utf8 {
}

/// Opaque binary data. The table declares no fields, i.e. the type has no
/// type-specific parameters.
table Binary {
}

/// Same as Utf8, but with 64-bit offsets, allowing extremely large data
/// values to be represented. The table declares no fields.
table LargeUtf8 {
}

/// Same as Binary, but with 64-bit offsets, allowing extremely large data
/// values to be represented. The table declares no fields.
table LargeBinary {
}

/// Binary logical type in which every value has the same size in bytes,
/// given by byteWidth.
table FixedSizeBinary {
  /// Number of bytes per value
  byteWidth: int;
}

/// Boolean logical type. The table declares no fields, i.e. the type has no
/// type-specific parameters.
table Bool {
}

/// Exact decimal value represented as an integer value in two's
/// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers
/// are used. The representation uses the endianness indicated
/// in the Schema.
table Decimal {
  /// Total number of decimal digits
  precision: int;

  /// Number of digits after the decimal point "."
  scale: int;

  /// Number of bits per value. The only accepted widths are 128 and 256.
  /// We use bitWidth for consistency with Int::bitWidth.
  /// Defaults to 128 when not set.
  bitWidth: int = 128;
}

/// Unit in which a Date value is stored; referenced by Date.unit.
enum DateUnit: short {
  DAY,
  MILLISECOND
}

/// Date is either a 32-bit or 64-bit signed integer type representing an
/// elapsed time since UNIX epoch (1970-01-01), stored in either of two units:
///
/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
///   leap seconds), where the values are evenly divisible by 86400000
/// * Days (32 bits) since the UNIX epoch
table Date {
  /// Unit of the stored values. Defaults to MILLISECOND when not set.
  unit: DateUnit = MILLISECOND;
}

/// Time resolution shared by the Time, Timestamp and Duration tables.
/// The numeric values below are the ones FlatBuffers assigns implicitly
/// from declaration order, written out explicitly.
enum TimeUnit : short {
  SECOND = 0,
  MILLISECOND = 1,
  MICROSECOND = 2,
  NANOSECOND = 3
}

/// Time is either a 32-bit or 64-bit signed integer type representing an
/// elapsed time since midnight, stored in either of four units: seconds,
/// milliseconds, microseconds or nanoseconds.
///
/// The integer `bitWidth` depends on the `unit` and must be one of the following:
/// * SECOND and MILLISECOND: 32 bits
/// * MICROSECOND and NANOSECOND: 64 bits
///
/// The allowed values are between 0 (inclusive) and 86400 (=24*60*60) seconds
/// (exclusive), adjusted for the time unit (for example, up to 86400000
/// exclusive for the MILLISECOND unit).
/// This definition doesn't allow for leap seconds. Time values from
/// measurements with leap seconds will need to be corrected when ingesting
/// into Arrow (for example by replacing the value 86400 with 86399).
table Time {
  /// Unit of the stored values. Defaults to MILLISECOND when not set.
  unit: TimeUnit = MILLISECOND;
  /// Width in bits of the stored integer. Defaults to 32 when not set, which
  /// matches the width required for the default MILLISECOND unit.
  bitWidth: int = 32;
}

/// Timestamp is a 64-bit signed integer representing an elapsed time since a
/// fixed epoch, stored in either of four units: seconds, milliseconds,
/// microseconds or nanoseconds, and is optionally annotated with a timezone.
///
/// Timestamp values do not include any leap seconds (in other words, all
/// days are considered 86400 seconds long).
///
/// Timestamps with a non-empty timezone
/// ------------------------------------
///
/// If a Timestamp column has a non-empty timezone value, its epoch is
/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
/// (the Unix epoch), regardless of the Timestamp's own timezone.
///
/// Therefore, timestamp values with a non-empty timezone correspond to
/// physical points in time together with some additional information about
/// how the data was obtained and/or how to display it (the timezone).
///
///   For example, the timestamp value 0 with the timezone string "Europe/Paris"
///   corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
///   application may prefer to display it as "January 1st 1970, 01h00" in
///   the Europe/Paris timezone (which is the same physical point in time).
///
/// One consequence is that timestamp values with a non-empty timezone
/// can be compared and ordered directly, since they all share the same
/// well-known point of reference (the Unix epoch).
///
/// Timestamps with an unset / empty timezone
/// -----------------------------------------
///
/// If a Timestamp column has no timezone value, its epoch is
/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
///
/// Therefore, timestamp values without a timezone cannot be meaningfully
/// interpreted as physical points in time, but only as calendar / clock
/// indications ("wall clock time") in an unspecified timezone.
///
///   For example, the timestamp value 0 with an empty timezone string
///   corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
///   is not enough information to interpret it as a well-defined physical
///   point in time.
///
/// One consequence is that timestamp values without a timezone cannot
/// be reliably compared or ordered, since they may have different points of
/// reference.  In particular, it is *not* possible to interpret an unset
/// or empty timezone as the same as "UTC".
///
/// Conversion between timezones
/// ----------------------------
///
/// If a Timestamp column has a non-empty timezone, changing the timezone
/// to a different non-empty value is a metadata-only operation:
/// the timestamp values need not change as their point of reference remains
/// the same (the Unix epoch).
///
/// However, if a Timestamp column has no timezone value, changing it to a
/// non-empty value requires thinking about the desired semantics.
/// One possibility is to assume that the original timestamp values are
/// relative to the epoch of the timezone being set; timestamp values should
/// then be adjusted to the Unix epoch (for example, changing the timezone from
/// empty to "Europe/Paris" would require converting the timestamp values
/// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
/// nevertheless correct).
///
/// Guidelines for encoding data from external libraries
/// ----------------------------------------------------
///
/// Date & time libraries often have multiple different data types for temporal
/// data. In order to ease interoperability between different implementations the
/// Arrow project has some recommendations for encoding these types into a Timestamp
/// column.
///
/// An "instant" represents a physical point in time that has no relevant timezone
/// (for example, astronomical data). To encode an instant, use a Timestamp with
/// the timezone string set to "UTC", and make sure the Timestamp values
/// are relative to the UTC epoch (January 1st 1970, midnight).
///
/// A "zoned date-time" represents a physical point in time annotated with an
/// informative timezone (for example, the timezone in which the data was
/// recorded).  To encode a zoned date-time, use a Timestamp with the timezone
/// string set to the name of the timezone, and make sure the Timestamp values
/// are relative to the UTC epoch (January 1st 1970, midnight).
///
///  (There is some ambiguity between an instant and a zoned date-time with the
///   UTC timezone.  Both of these are stored the same in Arrow.  Typically,
///   this distinction does not matter.  If it does, then an application should
///   use custom metadata or an extension type to distinguish between the two cases.)
///
/// An "offset date-time" represents a physical point in time combined with an
/// explicit offset from UTC.  To encode an offset date-time, use a Timestamp
/// with the timezone string set to the numeric timezone offset string
/// (e.g. "+03:00"), and make sure the Timestamp values are relative to
/// the UTC epoch (January 1st 1970, midnight).
///
/// A "naive date-time" (also called "local date-time" in some libraries)
/// represents a wall clock time combined with a calendar date, but with
/// no indication of how to map this information to a physical point in time.
/// Naive date-times must be handled with care because of this missing
/// information, and also because daylight saving time (DST) may make
/// some values ambiguous or non-existent. A naive date-time may be
/// stored as a struct with Date and Time fields. However, it may also be
/// encoded into a Timestamp column with an empty timezone. The timestamp
/// values should be computed "as if" the timezone of the date-time values
/// was UTC; for example, the naive date-time "January 1st 1970, 00h00" would
/// be encoded as timestamp value 0.
table Timestamp {
  /// Unit of the stored values. No explicit default is declared, so an unset
  /// unit reads back as the first TimeUnit value.
  unit: TimeUnit;

  /// The timezone is an optional string indicating the name of a timezone,
  /// one of:
  ///
  /// * As used in the Olson timezone database (the "tz database" or
  ///   "tzdata"), such as "America/New_York".
  /// * An absolute timezone offset of the form "+XX:XX" or "-XX:XX",
  ///   such as "+07:30".
  ///
  /// Whether a timezone string is present indicates different semantics about
  /// the data (see above).
  timezone: string;
}

// Unit of an Interval value; referenced by Interval.unit.
// The numeric values below are the ones FlatBuffers assigns implicitly
// from declaration order, written out explicitly.
enum IntervalUnit : short {
  YEAR_MONTH = 0,
  DAY_TIME = 1,
  MONTH_DAY_NANO = 2
}
// A "calendar" interval which models types that don't necessarily
// have a precise duration without the context of a base timestamp (e.g.
// days can differ in length during daylight saving time transitions).
// All integers in the types below are stored in the endianness indicated
// by the schema.
//
// YEAR_MONTH - Indicates the number of elapsed whole months, stored as
//   4-byte signed integers.
// DAY_TIME - Indicates the number of elapsed days and milliseconds (no leap seconds),
//   stored as 2 contiguous 32-bit signed integers (8 bytes in total). Support
//   of this IntervalUnit is not required for full Arrow compatibility.
// MONTH_DAY_NANO - A triple of the number of elapsed months, days, and nanoseconds.
//   The values are stored contiguously in 16-byte blocks. Months and days are
//   encoded as 32-bit signed integers and nanoseconds is encoded as a 64-bit
//   signed integer. Nanoseconds does not allow for leap seconds. Each field is
//   independent (e.g. there is no constraint that nanoseconds have the same
//   sign as days or that the quantity of nanoseconds represents less than a
//   day's worth of time).
table Interval {
  // Selects which of the IntervalUnit representations the values use. No
  // explicit default is declared, so an unset unit reads back as the first
  // IntervalUnit value.
  unit: IntervalUnit;
}

// An absolute length of time unrelated to any calendar artifacts.
//
// For the purposes of Arrow Implementations, adding this value to a Timestamp
// ("t1") naively (i.e. simply summing the two numbers) is acceptable even
// though in some cases the resulting Timestamp (t2) would not account for
// leap-seconds during the elapsed time between "t1" and "t2".  Similarly,
// representing the difference between two Unix timestamps is acceptable, but
// would yield a value that is possibly a few seconds off from the true elapsed
// time.
//
//  The resolution defaults to millisecond, but can be any of the other
//  supported TimeUnit values as with Timestamp and Time types.  This type is
//  always represented as an 8-byte integer.
table Duration {
  // Resolution of the stored values; MILLISECOND when not set.
  unit: TimeUnit = MILLISECOND;
}

/// ----------------------------------------------------------------------
/// Top-level Type value, enabling extensible type-specific metadata. We can
/// add new logical types to Type without breaking backwards compatibility.

// NOTE: FlatBuffers numbers union members in declaration order, so new
// members should only be appended at the end of this list.
union Type {
  Null,
  Int,
  FloatingPoint,
  Binary,
  Utf8,
  Bool,
  Decimal,
  Date,
  Time,
  Timestamp,
  Interval,
  List,
  Struct_,
  Union,
  FixedSizeBinary,
  FixedSizeList,
  Map,
  Duration,
  LargeBinary,
  LargeUtf8,
  LargeList,
}

/// ----------------------------------------------------------------------
/// User-defined key-value pairs used to add custom metadata to Arrow.
/// Key namespacing is the responsibility of the user.

table KeyValue {
  key: string;
  value: string;
}

/// ----------------------------------------------------------------------
/// Dictionary encoding metadata.
///
/// DictionaryKind is maintained for forwards compatibility: in the future,
/// dictionaries might be explicit maps between integers and values, allowing
/// for non-contiguous index values.
enum DictionaryKind : short {
  DenseArray = 0
}
table DictionaryEncoding {
  /// The known dictionary id in the application where this data is used. In
  /// the file or streaming formats, the dictionary ids are found in the
  /// DictionaryBatch messages
  id: long;

  /// The dictionary indices are constrained to be non-negative integers. If
  /// this field is null, the indices must be signed int32. To maximize
  /// cross-language compatibility and performance, implementations are
  /// recommended to prefer signed integer types over unsigned integer types
  /// and to avoid uint64 indices unless they are required by an application.
  indexType: Int;

  /// By default, dictionaries are not ordered, or the order does not have
  /// semantic meaning. In some statistical applications, dictionary-encoding
  /// is used to represent ordered categorical data, and we provide a way to
  /// preserve that metadata here
  isOrdered: bool;

  /// Kind of dictionary; DenseArray is the only value DictionaryKind declares.
  dictionaryKind: DictionaryKind;
}

/// ----------------------------------------------------------------------
/// A field represents a named column in a record / row batch or child of a
/// nested type.

table Field {
  /// Name is not required, e.g. in a List
  name: string;

  /// Whether or not this field can contain nulls. Should be true in general.
  nullable: bool;

  /// This is the type of the decoded value if the field is dictionary encoded.
  type: Type;

  /// Present only if the field is dictionary encoded.
  dictionary: DictionaryEncoding;

  /// children apply only to nested data types like Struct, List and Union. For
  /// primitive types children will have length 0.
  children: [ Field ];

  /// User-defined metadata
  custom_metadata: [ KeyValue ];
}

/// ----------------------------------------------------------------------
/// Endianness of the platform producing the data.
/// The numeric values below are the ones FlatBuffers assigns implicitly
/// from declaration order, written out explicitly.

enum Endianness : short {
  Little = 0,
  Big = 1
}

/// ----------------------------------------------------------------------
/// A Buffer represents a single contiguous memory segment
struct Buffer {
  /// The relative offset into the shared memory page where the bytes for this
  /// buffer start
  offset: long;

  /// The absolute length (in bytes) of the memory buffer. The memory is found
  /// from offset (inclusive) to offset + length (non-inclusive). When building
  /// messages using the encapsulated IPC message, padding bytes may be written
  /// after a buffer, but such padding bytes do not need to be accounted for in
  /// the size here.
  length: long;
}

/// ----------------------------------------------------------------------
/// A Schema describes the columns in a row batch

table Schema {

  /// Endianness of the buffer.
  /// It is Little Endian by default.
  /// If the endianness doesn't match the underlying system, then the vectors
  /// need to be converted.
  endianness: Endianness=Little;

  /// The fields (columns) described by this schema.
  fields: [Field];
  /// User-defined metadata
  custom_metadata: [ KeyValue ];

  /// Features used in the stream/file.
  features : [ Feature ];
}

// Schema is declared as the root table for buffers built from this file.
root_type Schema;