Diffstat (limited to 'src/arrow/format')
-rw-r--r-- | src/arrow/format/File.fbs         |  52
-rw-r--r-- | src/arrow/format/Flight.proto     | 335
-rw-r--r-- | src/arrow/format/Message.fbs      | 140
-rw-r--r-- | src/arrow/format/README.rst       |  25
-rw-r--r-- | src/arrow/format/Schema.fbs       | 522
-rw-r--r-- | src/arrow/format/SparseTensor.fbs | 228
-rw-r--r-- | src/arrow/format/Tensor.fbs       |  54
7 files changed, 1356 insertions, 0 deletions
diff --git a/src/arrow/format/File.fbs b/src/arrow/format/File.fbs
new file mode 100644
index 000000000..906d494f2
--- /dev/null
+++ b/src/arrow/format/File.fbs
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+include "Schema.fbs";
+
+namespace org.apache.arrow.flatbuf;
+
+/// ----------------------------------------------------------------------
+/// Arrow File metadata
+///
+
+table Footer {
+  version: org.apache.arrow.flatbuf.MetadataVersion;
+
+  schema: org.apache.arrow.flatbuf.Schema;
+
+  dictionaries: [ Block ];
+
+  recordBatches: [ Block ];
+
+  /// User-defined metadata
+  custom_metadata: [ KeyValue ];
+}
+
+struct Block {
+
+  /// Index to the start of the RecordBlock (note this is past the Message header)
+  offset: long;
+
+  /// Length of the metadata
+  metaDataLength: int;
+
+  /// Length of the data (this is aligned so there can be a gap between this and
+  /// the metadata).
+  bodyLength: long;
+}
+
+root_type Footer;
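In the Arrow IPC file format, this Footer is serialized at the end of the file, followed by a little-endian int32 holding the Footer's size and the 6-byte magic string "ARROW1". A minimal sketch of locating it with pyarrow (the file name is made up):

```python
import struct

import pyarrow as pa
import pyarrow.ipc

# Write a small IPC file so there is a footer to inspect.
table = pa.table({"x": [1, 2, 3]})
with pa.OSFile("example.arrow", "wb") as sink:
    with pa.ipc.new_file(sink, table.schema) as writer:
        writer.write_table(table)

with open("example.arrow", "rb") as f:
    data = f.read()

# File trailer layout: <Footer flatbuffer> <int32 footer length> <"ARROW1">
assert data.endswith(b"ARROW1")
(footer_len,) = struct.unpack("<i", data[-10:-6])
footer = data[-10 - footer_len : -10]  # the serialized Footer table
print(f"Footer flatbuffer is {footer_len} bytes")
```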
diff --git a/src/arrow/format/Flight.proto b/src/arrow/format/Flight.proto
new file mode 100644
index 000000000..b291d9dbd
--- /dev/null
+++ b/src/arrow/format/Flight.proto
@@ -0,0 +1,335 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = "proto3";
+
+option java_package = "org.apache.arrow.flight.impl";
+option go_package = "github.com/apache/arrow/go/flight;flight";
+option csharp_namespace = "Apache.Arrow.Flight.Protocol";
+
+package arrow.flight.protocol;
+
+/*
+ * A flight service is an endpoint for retrieving or storing Arrow data. A
+ * flight service can expose one or more predefined endpoints that can be
+ * accessed using the Arrow Flight Protocol. Additionally, a flight service
+ * can expose a set of actions that are available.
+ */
+service FlightService {
+
+  /*
+   * Handshake between client and server. Depending on the server, the
+   * handshake may be required to determine the token that should be used for
+   * future operations. Both request and response are streams to allow multiple
+   * round-trips depending on the auth mechanism.
+   */
+  rpc Handshake(stream HandshakeRequest) returns (stream HandshakeResponse) {}
+
+  /*
+   * Get a list of available streams given particular criteria. Most flight
+   * services will expose one or more streams that are readily available for
+   * retrieval. This API allows listing the streams available for
+   * consumption. A user can also provide criteria, which can limit
+   * the subset of streams that can be listed via this interface. Each flight
+   * service allows its own definition of how to consume criteria.
+   */
+  rpc ListFlights(Criteria) returns (stream FlightInfo) {}
+
+  /*
+   * For a given FlightDescriptor, get information about how the flight can be
+   * consumed. This is a useful interface if the consumer of the interface
+   * can already identify the specific flight to consume. This interface can
+   * also allow a consumer to generate a flight stream through a specified
+   * descriptor. For example, a flight descriptor might be something that
+   * includes a SQL statement or a Pickled Python operation that will be
+   * executed. In those cases, the descriptor will not be previously available
+   * within the list of available streams provided by ListFlights but will be
+   * available for consumption for the duration defined by the specific flight
+   * service.
+   */
+  rpc GetFlightInfo(FlightDescriptor) returns (FlightInfo) {}
+
+  /*
+   * For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema.
+   * This is used when a consumer needs the Schema of a flight stream. Similar to
+   * GetFlightInfo, this interface may generate a new flight that was not previously
+   * available in ListFlights.
+   */
+  rpc GetSchema(FlightDescriptor) returns (SchemaResult) {}
+
+  /*
+   * Retrieve a single stream associated with a particular descriptor
+   * associated with the referenced ticket. A Flight can be composed of one or
+   * more streams where each stream can be retrieved using a separate opaque
+   * ticket that the flight service uses for managing a collection of streams.
+   */
+  rpc DoGet(Ticket) returns (stream FlightData) {}
+
+  /*
+   * Push a stream to the flight service associated with a particular
+   * flight stream. This allows a client of a flight service to upload a stream
+   * of data. Depending on the particular flight service, a client consumer
+   * could be allowed to upload a single stream per descriptor or an unlimited
+   * number. In the latter case, the service might implement a 'seal' action
+   * that can be applied to a descriptor once all streams are uploaded.
+   */
+  rpc DoPut(stream FlightData) returns (stream PutResult) {}
+
+  /*
+   * Open a bidirectional data channel for a given descriptor. This
+   * allows clients to send and receive arbitrary Arrow data and
+   * application-specific metadata in a single logical stream. In
+   * contrast to DoGet/DoPut, this is more suited for clients
+   * offloading computation (rather than storage) to a Flight service.
+   */
+  rpc DoExchange(stream FlightData) returns (stream FlightData) {}
+
+  /*
+   * Flight services can support an arbitrary number of simple actions in
+   * addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut
+   * operations that are potentially available. DoAction allows a flight client
+   * to do a specific action against a flight service. An action includes
+   * opaque request and response objects that are specific to the type of
+   * action being undertaken.
+   */
+  rpc DoAction(Action) returns (stream Result) {}
+
+  /*
+   * A flight service exposes all of the available action types that it has
+   * along with descriptions. This allows different flight consumers to
+   * understand the capabilities of the flight service.
+   */
+  rpc ListActions(Empty) returns (stream ActionType) {}
+
+}
+
+/*
+ * The request that a client provides to a server on handshake.
+ */
+message HandshakeRequest {
+
+  /*
+   * A defined protocol version.
+   */
+  uint64 protocol_version = 1;
+
+  /*
+   * Arbitrary auth/handshake info.
+   */
+  bytes payload = 2;
+}
+
+message HandshakeResponse {
+
+  /*
+   * A defined protocol version.
+   */
+  uint64 protocol_version = 1;
+
+  /*
+   * Arbitrary auth/handshake info.
+   */
+  bytes payload = 2;
+}
+
+/*
+ * A message for doing simple auth.
+ */
+message BasicAuth {
+  string username = 2;
+  string password = 3;
+}
+
+message Empty {}
+
+/*
+ * Describes an available action, including both the name used for execution
+ * along with a short description of the purpose of the action.
+ */
+message ActionType {
+  string type = 1;
+  string description = 2;
+}
+
+/*
+ * A service-specific expression that can be used to return a limited set
+ * of available Arrow Flight streams.
+ */
+message Criteria {
+  bytes expression = 1;
+}
+
+/*
+ * An opaque action specific to the service.
+ */
+message Action {
+  string type = 1;
+  bytes body = 2;
+}
+
+/*
+ * An opaque result returned after executing an action.
+ */
+message Result {
+  bytes body = 1;
+}
+
+/*
+ * Wraps the result of a GetSchema call.
+ */
+message SchemaResult {
+  // Schema of the dataset as described in Schema.fbs::Schema.
+  bytes schema = 1;
+}
+
+/*
+ * The name or tag for a Flight. May be used as a way to retrieve or generate
+ * a flight or be used to expose a set of previously defined flights.
+ */
+message FlightDescriptor {
+
+  /*
+   * Describes what type of descriptor is defined.
+   */
+  enum DescriptorType {
+
+    // Protobuf pattern, not used.
+    UNKNOWN = 0;
+
+    /*
+     * A named path that identifies a dataset. A path is composed of a string
+     * or list of strings describing a particular dataset. This is conceptually
+     * similar to a path inside a filesystem.
+     */
+    PATH = 1;
+
+    /*
+     * An opaque command to generate a dataset.
+     */
+    CMD = 2;
+  }
+
+  DescriptorType type = 1;
+
+  /*
+   * Opaque value used to express a command. Should only be defined when
+   * type = CMD.
+   */
+  bytes cmd = 2;
+
+  /*
+   * List of strings identifying a particular dataset. Should only be defined
+   * when type = PATH.
+   */
+  repeated string path = 3;
+}
+
+/*
+ * The access coordinates for retrieval of a dataset. With a FlightInfo, a
+ * consumer is able to determine how to retrieve a dataset.
+ */
+message FlightInfo {
+  // Schema of the dataset as described in Schema.fbs::Schema.
+  bytes schema = 1;
+
+  /*
+   * The descriptor associated with this info.
+   */
+  FlightDescriptor flight_descriptor = 2;
+
+  /*
+   * A list of endpoints associated with the flight. To consume the whole
+   * flight, all endpoints must be consumed.
+   */
+  repeated FlightEndpoint endpoint = 3;
+
+  // Set these to -1 if unknown.
+  int64 total_records = 4;
+  int64 total_bytes = 5;
+}
+
+/*
+ * A particular stream or split associated with a flight.
+ */
+message FlightEndpoint {
+
+  /*
+   * Token used to retrieve this stream.
+   */
+  Ticket ticket = 1;
+
+  /*
+   * A list of URIs where this ticket can be redeemed. If the list is
+   * empty, the expectation is that the ticket can only be redeemed on the
+   * current service where the ticket was generated.
+   */
+  repeated Location location = 2;
+}
+
+/*
+ * A location where a Flight service will accept retrieval of a particular
+ * stream given a ticket.
+ */
+message Location {
+  string uri = 1;
+}
+
+/*
+ * An opaque identifier that the service can use to retrieve a particular
+ * portion of a stream.
+ */
+message Ticket {
+  bytes ticket = 1;
+}
+
+/*
+ * A batch of Arrow data as part of a stream of batches.
+ */
+message FlightData {
+
+  /*
+   * The descriptor of the data. This is only relevant when a client is
+   * starting a new DoPut stream.
+   */
+  FlightDescriptor flight_descriptor = 1;
+
+  /*
+   * Header for message data as described in Message.fbs::Message.
+   */
+  bytes data_header = 2;
+
+  /*
+   * Application-defined metadata.
+   */
+  bytes app_metadata = 3;
+
+  /*
+   * The actual batch of Arrow data. Preferably handled with minimal copies,
+   * coming last in the definition to help with sidecar patterns (it is
+   * expected that some implementations will fetch this field off the wire
+   * with specialized code to avoid extra memory copies).
+   */
+  bytes data_body = 1000;
+}
+
+/**
+ * The response message associated with the submission of a DoPut.
+ */
+message PutResult {
+  bytes app_metadata = 1;
+}
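A hedged sketch of a consumer of this service, using pyarrow's Flight bindings; the server URI and dataset path here are assumptions:

```python
import pyarrow.flight as flight

# Hypothetical server location and dataset path.
client = flight.FlightClient("grpc://localhost:8815")
descriptor = flight.FlightDescriptor.for_path("example", "dataset")

# GetFlightInfo, then redeem each endpoint's ticket with DoGet;
# consuming every endpoint reassembles the whole flight.
info = client.get_flight_info(descriptor)
tables = []
for endpoint in info.endpoints:
    reader = client.do_get(endpoint.ticket)
    tables.append(reader.read_all())
```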
diff --git a/src/arrow/format/Message.fbs b/src/arrow/format/Message.fbs
new file mode 100644
index 000000000..f1c18d765
--- /dev/null
+++ b/src/arrow/format/Message.fbs
@@ -0,0 +1,140 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+include "Schema.fbs";
+include "SparseTensor.fbs";
+include "Tensor.fbs";
+
+namespace org.apache.arrow.flatbuf;
+
+/// ----------------------------------------------------------------------
+/// Data structures for describing a table row batch (a collection of
+/// equal-length Arrow arrays)
+
+/// Metadata about a field at some level of a nested type tree (but not
+/// its children).
+///
+/// For example, a List<Int16> with values `[[1, 2, 3], null, [4], [5, 6], null]`
+/// would have {length: 5, null_count: 2} for its List node, and {length: 6,
+/// null_count: 0} for its Int16 node, as separate FieldNode structs
+struct FieldNode {
+  /// The number of value slots in the Arrow array at this level of a nested
+  /// tree
+  length: long;
+
+  /// The number of observed nulls. Fields with null_count == 0 may choose not
+  /// to write their physical validity bitmap out as a materialized buffer,
+  /// instead setting the length of the bitmap buffer to 0.
+  null_count: long;
+}
+
+enum CompressionType:byte {
+  // LZ4 frame format, for portability, as provided by lz4frame.h or wrappers
+  // thereof. Not to be confused with the "raw" (also called "block") format
+  // provided by lz4.h.
+  LZ4_FRAME,
+
+  // Zstandard
+  ZSTD
+}
+
+/// Provided for forward compatibility in case we need to support different
+/// strategies for compressing the IPC message body (like whole-body
+/// compression rather than buffer-level) in the future
+enum BodyCompressionMethod:byte {
+  /// Each constituent buffer is first compressed with the indicated
+  /// compressor, and then written with the uncompressed length in the first 8
+  /// bytes as a 64-bit little-endian signed integer followed by the compressed
+  /// buffer bytes (and then padding as required by the protocol). The
+  /// uncompressed length may be set to -1 to indicate that the data that
+  /// follows is not compressed, which can be useful for cases where
+  /// compression does not yield appreciable savings.
+  BUFFER
+}
+
+/// Optional compression for the memory buffers constituting IPC message
+/// bodies. Intended for use with RecordBatch but could be used for other
+/// message types
+table BodyCompression {
+  /// Compressor library
+  codec: CompressionType = LZ4_FRAME;
+
+  /// Indicates the way the record batch body was compressed
+  method: BodyCompressionMethod = BUFFER;
+}
+
+/// A data header describing the shared memory layout of a "record" or "row"
+/// batch. Some systems call this a "row batch" internally and others a "record
+/// batch".
+table RecordBatch {
+  /// number of records / rows. The arrays in the batch should all have this
+  /// length
+  length: long;
+
+  /// Nodes correspond to the pre-ordered flattened logical schema
+  nodes: [FieldNode];
+
+  /// Buffers correspond to the pre-ordered flattened buffer tree
+  ///
+  /// The number of buffers appended to this list depends on the schema. For
+  /// example, most primitive arrays will have 2 buffers, 1 for the validity
+  /// bitmap and 1 for the values. For struct arrays, there will only be a
+  /// single buffer for the validity (nulls) bitmap
+  buffers: [Buffer];
+
+  /// Optional compression of the message body
+  compression: BodyCompression;
+}
+
+/// For sending dictionary encoding information. Any Field can be
+/// dictionary-encoded, but in this case none of its children may be
+/// dictionary-encoded.
+/// There is one vector / column per dictionary, but that vector / column
+/// may be spread across multiple dictionary batches by using the isDelta
+/// flag
+
+table DictionaryBatch {
+  id: long;
+  data: RecordBatch;
+
+  /// If isDelta is true the values in the dictionary are to be appended to a
+  /// dictionary with the indicated id. If isDelta is false this dictionary
+  /// should replace the existing dictionary.
+  isDelta: bool = false;
+}
+
+/// ----------------------------------------------------------------------
+/// The root Message type
+
+/// This union enables us to easily send different message types without
+/// redundant storage, and in the future we can easily add new message types.
+///
+/// Arrow implementations do not need to implement all of the message types,
+/// which may include experimental metadata types. For maximum compatibility,
+/// it is best to send data using RecordBatch
+union MessageHeader {
+  Schema, DictionaryBatch, RecordBatch, Tensor, SparseTensor
+}
+
+table Message {
+  version: org.apache.arrow.flatbuf.MetadataVersion;
+  header: MessageHeader;
+  bodyLength: long;
+  custom_metadata: [ KeyValue ];
+}
+
+root_type Message;
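As one concrete illustration, pyarrow surfaces BodyCompression through its IPC write options. A minimal sketch, assuming a pyarrow build with ZSTD support (the file name is made up):

```python
import pyarrow as pa
import pyarrow.ipc

table = pa.table({"x": list(range(1000))})

# compression maps onto CompressionType: "zstd" -> ZSTD, "lz4" -> LZ4_FRAME.
options = pa.ipc.IpcWriteOptions(compression="zstd")
with pa.OSFile("batches.arrows", "wb") as sink:
    with pa.ipc.new_stream(sink, table.schema, options=options) as writer:
        writer.write_table(table)
```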
diff --git a/src/arrow/format/README.rst b/src/arrow/format/README.rst
new file mode 100644
index 000000000..0eaad49b7
--- /dev/null
+++ b/src/arrow/format/README.rst
@@ -0,0 +1,25 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Arrow Protocol Files
+====================
+
+This folder contains binary protocol definitions for the Arrow columnar format
+and other parts of the project, like the Flight RPC framework.
+
+For documentation about the Arrow format, see the `docs/source/format`
+directory.
diff --git a/src/arrow/format/Schema.fbs b/src/arrow/format/Schema.fbs
new file mode 100644
index 000000000..7ee827b5d
--- /dev/null
+++ b/src/arrow/format/Schema.fbs
@@ -0,0 +1,522 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// Logical types, vector layouts, and schemas
+
+/// Format Version History.
+/// Version 1.0 - Forward and backwards compatibility guaranteed.
+/// Version 1.1 - Add Decimal256 (No format release).
+/// Version 1.2 (Pending) - Add Interval MONTH_DAY_NANO.
+
+namespace org.apache.arrow.flatbuf;
+
+enum MetadataVersion:short {
+  /// 0.1.0 (October 2016).
+  V1,
+
+  /// 0.2.0 (February 2017). Non-backwards compatible with V1.
+  V2,
+
+  /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2.
+  V3,
+
+  /// >= 0.8.0 (December 2017). Non-backwards compatible with V3.
+  V4,
+
+  /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
+  /// metadata and IPC messages). Implementations are recommended to provide a
+  /// V4 compatibility mode with V5 format changes disabled.
+  ///
+  /// Incompatible changes between V4 and V5:
+  /// - Union buffer layout has changed. In V5, Unions don't have a validity
+  ///   bitmap buffer.
+  V5,
+}
+
+/// Represents Arrow Features that might not have full support
+/// within implementations. This is intended to be used in
+/// two scenarios:
+///  1. A mechanism for readers of Arrow Streams
+///     and files to understand that the stream or file makes
+///     use of a feature that isn't supported or unknown to
+///     the implementation (and therefore can meet the Arrow
+///     forward compatibility guarantees).
+///  2. A means of negotiating between a client and server
+///     what features a stream is allowed to use. The enum
+///     values here are intended to represent higher-level
+///     features; additional details may be negotiated
+///     with key-value pairs specific to the protocol.
+///
+/// Enums added to this list should be assigned power-of-two values
+/// to facilitate exchanging and comparing bitmaps for supported
+/// features.
+enum Feature : long {
+  /// Needed to make flatbuffers happy.
+  UNUSED = 0,
+  /// The stream makes use of multiple full dictionaries with the
+  /// same ID and assumes clients implement dictionary replacement
+  /// correctly.
+  DICTIONARY_REPLACEMENT = 1,
+  /// The stream makes use of compressed bodies as described
+  /// in Message.fbs.
+  COMPRESSED_BODY = 2
+}
+
+/// These are stored in the flatbuffer in the Type union below
+
+table Null {
+}
+
+/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
+/// (according to the physical memory layout). We used Struct_ here as
+/// Struct is a reserved word in Flatbuffers
+table Struct_ {
+}
+
+table List {
+}
+
+/// Same as List, but with 64-bit offsets, allowing representation of
+/// extremely large data values.
+table LargeList {
+}
+
+table FixedSizeList {
+  /// Number of list items per value
+  listSize: int;
+}
+
+/// A Map is a logical nested type that is represented as
+///
+/// List<entries: Struct<key: K, value: V>>
+///
+/// In this layout, the keys and values are each respectively contiguous. We do
+/// not constrain the key and value types, so the application is responsible
+/// for ensuring that the keys are hashable and unique. Whether the keys are sorted
+/// may be set in the metadata for this field.
+///
+/// In a field with Map type, the field has a child Struct field, which then
+/// has two children: the key type and the value type. The names of the
+/// child fields may be respectively "entries", "key", and "value", but this is
+/// not enforced.
+///
+/// Map
+/// ```text
+///   - child[0] entries: Struct
+///     - child[0] key: K
+///     - child[1] value: V
+/// ```
+/// Neither the "entries" field nor the "key" field may be nullable.
+///
+/// The metadata is structured so that Arrow systems without special handling
+/// for Map can make Map an alias for List. The "layout" attribute for the Map
+/// field must have the same contents as a List.
+table Map {
+  /// Set to true if the keys within each value are sorted
+  keysSorted: bool;
+}
+
+enum UnionMode:short { Sparse, Dense }
+
+/// A union is a complex type with children in Field.
+/// By default, ids in the type vector refer to the offsets in the children;
+/// optionally, typeIds provides an indirection between the child offset and
+/// the type id: for each child, `typeIds[offset]` is the id used in the type vector.
+table Union {
+  mode: UnionMode;
+  typeIds: [ int ]; // optional, describes the type id of each child
+}
+
+table Int {
+  bitWidth: int; // restricted to 8, 16, 32, and 64 in v1
+  is_signed: bool;
+}
+
+enum Precision:short {HALF, SINGLE, DOUBLE}
+
+table FloatingPoint {
+  precision: Precision;
+}
+
+/// Unicode with UTF-8 encoding
+table Utf8 {
+}
+
+/// Opaque binary data
+table Binary {
+}
+
+/// Same as Utf8, but with 64-bit offsets, allowing representation of
+/// extremely large data values.
+table LargeUtf8 {
+}
+
+/// Same as Binary, but with 64-bit offsets, allowing representation of
+/// extremely large data values.
+table LargeBinary {
+}
+
+table FixedSizeBinary {
+  /// Number of bytes per value
+  byteWidth: int;
+}
+
+table Bool {
+}
+
+/// Exact decimal value represented as an integer value in two's
+/// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers
+/// are used. The representation uses the endianness indicated
+/// in the Schema.
+table Decimal {
+  /// Total number of decimal digits
+  precision: int;
+
+  /// Number of digits after the decimal point "."
+  scale: int;
+
+  /// Number of bits per value. The only accepted widths are 128 and 256.
+  /// We use bitWidth for consistency with Int::bitWidth.
+  bitWidth: int = 128;
+}
+
+enum DateUnit: short {
+  DAY,
+  MILLISECOND
+}
+
+/// Date is either a 32-bit or 64-bit signed integer type representing an
+/// elapsed time since UNIX epoch (1970-01-01), stored in either of two units:
+///
+/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
+///   leap seconds), where the values are evenly divisible by 86400000
+/// * Days (32 bits) since the UNIX epoch
+table Date {
+  unit: DateUnit = MILLISECOND;
+}
+
+enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND }
+
+/// Time is either a 32-bit or 64-bit signed integer type representing an
+/// elapsed time since midnight, stored in either of four units: seconds,
+/// milliseconds, microseconds or nanoseconds.
+///
+/// The integer `bitWidth` depends on the `unit` and must be one of the following:
+/// * SECOND and MILLISECOND: 32 bits
+/// * MICROSECOND and NANOSECOND: 64 bits
+///
+/// The allowed values are between 0 (inclusive) and 86400 (=24*60*60) seconds
+/// (exclusive), adjusted for the time unit (for example, up to 86400000
+/// exclusive for the MILLISECOND unit).
+/// This definition doesn't allow for leap seconds. Time values from
+/// measurements with leap seconds will need to be corrected when ingesting
+/// into Arrow (for example by replacing the value 86400 with 86399).
+table Time {
+  unit: TimeUnit = MILLISECOND;
+  bitWidth: int = 32;
+}
+
+/// Timestamp is a 64-bit signed integer representing an elapsed time since a
+/// fixed epoch, stored in either of four units: seconds, milliseconds,
+/// microseconds or nanoseconds, and is optionally annotated with a timezone.
+///
+/// Timestamp values do not include any leap seconds (in other words, all
+/// days are considered 86400 seconds long).
+///
+/// Timestamps with a non-empty timezone
+/// ------------------------------------
+///
+/// If a Timestamp column has a non-empty timezone value, its epoch is
+/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
+/// (the Unix epoch), regardless of the Timestamp's own timezone.
+///
+/// Therefore, timestamp values with a non-empty timezone correspond to
+/// physical points in time together with some additional information about
+/// how the data was obtained and/or how to display it (the timezone).
+///
+/// For example, the timestamp value 0 with the timezone string "Europe/Paris"
+/// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
+/// application may prefer to display it as "January 1st 1970, 01h00" in
+/// the Europe/Paris timezone (which is the same physical point in time).
+///
+/// One consequence is that timestamp values with a non-empty timezone
+/// can be compared and ordered directly, since they all share the same
+/// well-known point of reference (the Unix epoch).
+///
+/// Timestamps with an unset / empty timezone
+/// -----------------------------------------
+///
+/// If a Timestamp column has no timezone value, its epoch is
+/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
+///
+/// Therefore, timestamp values without a timezone cannot be meaningfully
+/// interpreted as physical points in time, but only as calendar / clock
+/// indications ("wall clock time") in an unspecified timezone.
+///
+/// For example, the timestamp value 0 with an empty timezone string
+/// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
+/// is not enough information to interpret it as a well-defined physical
+/// point in time.
+///
+/// One consequence is that timestamp values without a timezone cannot
+/// be reliably compared or ordered, since they may have different points of
+/// reference. In particular, it is *not* possible to interpret an unset
+/// or empty timezone as the same as "UTC".
+///
+/// Conversion between timezones
+/// ----------------------------
+///
+/// If a Timestamp column has a non-empty timezone, changing the timezone
+/// to a different non-empty value is a metadata-only operation:
+/// the timestamp values need not change as their point of reference remains
+/// the same (the Unix epoch).
+///
+/// However, if a Timestamp column has no timezone value, changing it to a
+/// non-empty value requires thinking about the desired semantics.
+/// One possibility is to assume that the original timestamp values are
+/// relative to the epoch of the timezone being set; timestamp values should
+/// then be adjusted to the Unix epoch (for example, changing the timezone from
+/// empty to "Europe/Paris" would require converting the timestamp values
+/// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
+/// nevertheless correct).
+///
+/// Guidelines for encoding data from external libraries
+/// ----------------------------------------------------
+///
+/// Date & time libraries often have multiple different data types for temporal
+/// data. In order to ease interoperability between different implementations the
+/// Arrow project has some recommendations for encoding these types into a Timestamp
+/// column.
+///
+/// An "instant" represents a physical point in time that has no relevant timezone
+/// (for example, astronomical data). To encode an instant, use a Timestamp with
+/// the timezone string set to "UTC", and make sure the Timestamp values
+/// are relative to the UTC epoch (January 1st 1970, midnight).
+///
+/// A "zoned date-time" represents a physical point in time annotated with an
+/// informative timezone (for example, the timezone in which the data was
+/// recorded). To encode a zoned date-time, use a Timestamp with the timezone
+/// string set to the name of the timezone, and make sure the Timestamp values
+/// are relative to the UTC epoch (January 1st 1970, midnight).
+///
+/// (There is some ambiguity between an instant and a zoned date-time with the
+/// UTC timezone. Both of these are stored the same in Arrow. Typically,
+/// this distinction does not matter. If it does, then an application should
+/// use custom metadata or an extension type to distinguish between the two cases.)
+///
+/// An "offset date-time" represents a physical point in time combined with an
+/// explicit offset from UTC. To encode an offset date-time, use a Timestamp
+/// with the timezone string set to the numeric timezone offset string
+/// (e.g. "+03:00"), and make sure the Timestamp values are relative to
+/// the UTC epoch (January 1st 1970, midnight).
+///
+/// A "naive date-time" (also called "local date-time" in some libraries)
+/// represents a wall clock time combined with a calendar date, but with
+/// no indication of how to map this information to a physical point in time.
+/// Naive date-times must be handled with care because of this missing
+/// information, and also because daylight saving time (DST) may make
+/// some values ambiguous or non-existent. A naive date-time may be
+/// stored as a struct with Date and Time fields. However, it may also be
+/// encoded into a Timestamp column with an empty timezone. The timestamp
+/// values should be computed "as if" the timezone of the date-time values
+/// was UTC; for example, the naive date-time "January 1st 1970, 00h00" would
+/// be encoded as timestamp value 0.
+table Timestamp {
+  unit: TimeUnit;
+
+  /// The timezone is an optional string indicating the name of a timezone,
+  /// one of:
+  ///
+  /// * As used in the Olson timezone database (the "tz database" or
+  ///   "tzdata"), such as "America/New_York".
+  /// * An absolute timezone offset of the form "+XX:XX" or "-XX:XX",
+  ///   such as "+07:30".
+  ///
+  /// Whether a timezone string is present indicates different semantics about
+  /// the data (see above).
+  timezone: string;
+}
+
+enum IntervalUnit: short { YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO }
+// A "calendar" interval which models types that don't necessarily
+// have a precise duration without the context of a base timestamp (e.g.
+// days can differ in length during daylight saving time transitions).
+// All integers in the types below are stored in the endianness indicated
+// by the schema.
+//
+// YEAR_MONTH - Indicates the number of elapsed whole months, stored as
+//   4-byte signed integers.
+// DAY_TIME - Indicates the number of elapsed days and milliseconds (no leap seconds),
+//   stored as 2 contiguous 32-bit signed integers (8 bytes in total). Support
+//   of this IntervalUnit is not required for full arrow compatibility.
+// MONTH_DAY_NANO - A triple of the number of elapsed months, days, and nanoseconds.
+//   The values are stored contiguously in 16-byte blocks. Months and days are
+//   encoded as 32-bit signed integers and nanoseconds is encoded as a 64-bit
+//   signed integer. Nanoseconds does not allow for leap seconds. Each field is
+//   independent (e.g. there is no constraint that nanoseconds have the same
+//   sign as days or that the quantity of nanoseconds represents less than a
+//   day's worth of time).
+table Interval {
+  unit: IntervalUnit;
+}
+
+// An absolute length of time unrelated to any calendar artifacts.
+//
+// For the purposes of Arrow implementations, adding this value to a Timestamp
+// ("t1") naively (i.e. simply summing the two numbers) is acceptable even
+// though in some cases the resulting Timestamp (t2) would not account for
+// leap seconds during the elapsed time between "t1" and "t2". Similarly,
+// representing the difference between two Unix timestamps is acceptable, but
+// would yield a value that is possibly a few seconds off from the true elapsed
+// time.
+//
+// The resolution defaults to millisecond, but can be any of the other
+// supported TimeUnit values as with Timestamp and Time types. This type is
+// always represented as an 8-byte integer.
+table Duration {
+  unit: TimeUnit = MILLISECOND;
+}
+
+/// ----------------------------------------------------------------------
+/// Top-level Type value, enabling extensible type-specific metadata. We can
+/// add new logical types to Type without breaking backwards compatibility
+
+union Type {
+  Null,
+  Int,
+  FloatingPoint,
+  Binary,
+  Utf8,
+  Bool,
+  Decimal,
+  Date,
+  Time,
+  Timestamp,
+  Interval,
+  List,
+  Struct_,
+  Union,
+  FixedSizeBinary,
+  FixedSizeList,
+  Map,
+  Duration,
+  LargeBinary,
+  LargeUtf8,
+  LargeList,
+}
+
+/// ----------------------------------------------------------------------
+/// user defined key value pairs to add custom metadata to arrow
+/// key namespacing is the responsibility of the user
+
+table KeyValue {
+  key: string;
+  value: string;
+}
+
+/// ----------------------------------------------------------------------
+/// Dictionary encoding metadata
+/// Maintained for forwards compatibility: in the future,
+/// dictionaries might be explicit maps between integers and values,
+/// allowing for non-contiguous index values
+enum DictionaryKind : short { DenseArray }
+table DictionaryEncoding {
+  /// The known dictionary id in the application where this data is used. In
+  /// the file or streaming formats, the dictionary ids are found in the
+  /// DictionaryBatch messages
+  id: long;
+
+  /// The dictionary indices are constrained to be non-negative integers. If
+  /// this field is null, the indices must be signed int32. To maximize
+  /// cross-language compatibility and performance, implementations are
+  /// recommended to prefer signed integer types over unsigned integer types
+  /// and to avoid uint64 indices unless they are required by an application.
+  indexType: Int;
+
+  /// By default, dictionaries are not ordered, or the order does not have
+  /// semantic meaning. In some statistical applications, dictionary encoding
+  /// is used to represent ordered categorical data, and we provide a way to
+  /// preserve that metadata here
+  isOrdered: bool;
+
+  dictionaryKind: DictionaryKind;
+}
+
+/// ----------------------------------------------------------------------
+/// A field represents a named column in a record / row batch or child of a
+/// nested type.
+
+table Field {
+  /// Name is not required (e.g. in a List)
+  name: string;
+
+  /// Whether or not this field can contain nulls. Should be true in general.
+  nullable: bool;
+
+  /// This is the type of the decoded value if the field is dictionary encoded.
+  type: Type;
+
+  /// Present only if the field is dictionary encoded.
+  dictionary: DictionaryEncoding;
+
+  /// children apply only to nested data types like Struct, List and Union. For
+  /// primitive types children will have length 0.
+  children: [ Field ];
+
+  /// User-defined metadata
+  custom_metadata: [ KeyValue ];
+}
+
+/// ----------------------------------------------------------------------
+/// Endianness of the platform producing the data
+
+enum Endianness:short { Little, Big }
+
+/// ----------------------------------------------------------------------
+/// A Buffer represents a single contiguous memory segment
+struct Buffer {
+  /// The relative offset into the shared memory page where the bytes for this
+  /// buffer starts
+  offset: long;
+
+  /// The absolute length (in bytes) of the memory buffer. The memory is found
+  /// from offset (inclusive) to offset + length (non-inclusive). When building
+  /// messages using the encapsulated IPC message, padding bytes may be written
+  /// after a buffer, but such padding bytes do not need to be accounted for in
+  /// the size here.
+  length: long;
+}
+
+/// ----------------------------------------------------------------------
+/// A Schema describes the columns in a row batch
+
+table Schema {
+
+  /// endianness of the buffer;
+  /// it is Little Endian by default.
+  /// If endianness doesn't match the underlying system, then the vectors
+  /// need to be converted.
+  endianness: Endianness=Little;
+
+  fields: [Field];
+  // User-defined metadata
+  custom_metadata: [ KeyValue ];
+
+  /// Features used in the stream/file.
+  features : [ Feature ];
+}
+
+root_type Schema;
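To make the metadata above concrete, here is a minimal sketch mapping a few Type variants onto a schema with pyarrow (the field names and metadata values are invented):

```python
import pyarrow as pa

schema = pa.schema(
    [
        # Int{bitWidth: 64, is_signed: true} with nullable = false
        pa.field("id", pa.int64(), nullable=False),
        # Timestamp{unit: MICROSECOND, timezone: "UTC"} -- an "instant"
        pa.field("created", pa.timestamp("us", tz="UTC")),
        # Dictionary-encoded Utf8 with Int{32, signed} indices
        pa.field("category", pa.dictionary(pa.int32(), pa.string())),
    ],
    # Serialized as custom_metadata KeyValue pairs.
    metadata={"origin": "example"},
)
print(schema)
```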
+/// +/// For example, let X be a 2x3x4x5 tensor, and it has the following +/// 6 non-zero values: +/// ```text +/// X[0, 1, 2, 0] := 1 +/// X[1, 1, 2, 3] := 2 +/// X[0, 2, 1, 0] := 3 +/// X[0, 1, 3, 0] := 4 +/// X[0, 1, 2, 1] := 5 +/// X[1, 2, 0, 4] := 6 +/// ``` +/// In COO format, the index matrix of X is the following 4x6 matrix: +/// ```text +/// [[0, 0, 0, 0, 1, 1], +/// [1, 1, 1, 2, 1, 2], +/// [2, 2, 3, 1, 2, 0], +/// [0, 1, 0, 0, 3, 4]] +/// ``` +/// When isCanonical is true, the indices is sorted in lexicographical order +/// (row-major order), and it does not have duplicated entries. Otherwise, +/// the indices may not be sorted, or may have duplicated entries. +table SparseTensorIndexCOO { + /// The type of values in indicesBuffer + indicesType: Int (required); + + /// Non-negative byte offsets to advance one value cell along each dimension + /// If omitted, default to row-major order (C-like). + indicesStrides: [long]; + + /// The location and size of the indices matrix's data + indicesBuffer: Buffer (required); + + /// This flag is true if and only if the indices matrix is sorted in + /// row-major order, and does not have duplicated entries. + /// This sort order is the same as of Tensorflow's SparseTensor, + /// but it is inverse order of SciPy's canonical coo_matrix + /// (SciPy employs column-major order for its coo_matrix). + isCanonical: bool; +} + +enum SparseMatrixCompressedAxis: short { Row, Column } + +/// Compressed Sparse format, that is matrix-specific. +table SparseMatrixIndexCSX { + /// Which axis, row or column, is compressed + compressedAxis: SparseMatrixCompressedAxis; + + /// The type of values in indptrBuffer + indptrType: Int (required); + + /// indptrBuffer stores the location and size of indptr array that + /// represents the range of the rows. + /// The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + /// The length of this array is 1 + (the number of rows), and the type + /// of index value is long. + /// + /// For example, let X be the following 6x4 matrix: + /// ```text + /// X := [[0, 1, 2, 0], + /// [0, 0, 3, 0], + /// [0, 4, 0, 5], + /// [0, 0, 0, 0], + /// [6, 0, 7, 8], + /// [0, 9, 0, 0]]. + /// ``` + /// The array of non-zero values in X is: + /// ```text + /// values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9]. + /// ``` + /// And the indptr of X is: + /// ```text + /// indptr(X) = [0, 2, 3, 5, 5, 8, 10]. + /// ``` + indptrBuffer: Buffer (required); + + /// The type of values in indicesBuffer + indicesType: Int (required); + + /// indicesBuffer stores the location and size of the array that + /// contains the column indices of the corresponding non-zero values. + /// The type of index value is long. + /// + /// For example, the indices of the above X is: + /// ```text + /// indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1]. + /// ``` + /// Note that the indices are sorted in lexicographical order for each row. + indicesBuffer: Buffer (required); +} + +/// Compressed Sparse Fiber (CSF) sparse tensor index. +table SparseTensorIndexCSF { + /// CSF is a generalization of compressed sparse row (CSR) index. + /// See [smith2017knl](http://shaden.io/pub-files/smith2017knl.pdf) + /// + /// CSF index recursively compresses each dimension of a tensor into a set + /// of prefix trees. Each path from a root to leaf forms one tensor + /// non-zero index. CSF is implemented with two arrays of buffers and one + /// arrays of integers. 
+ /// + /// For example, let X be a 2x3x4x5 tensor and let it have the following + /// 8 non-zero values: + /// ```text + /// X[0, 0, 0, 1] := 1 + /// X[0, 0, 0, 2] := 2 + /// X[0, 1, 0, 0] := 3 + /// X[0, 1, 0, 2] := 4 + /// X[0, 1, 1, 0] := 5 + /// X[1, 1, 1, 0] := 6 + /// X[1, 1, 1, 1] := 7 + /// X[1, 1, 1, 2] := 8 + /// ``` + /// As a prefix tree this would be represented as: + /// ```text + /// 0 1 + /// / \ | + /// 0 1 1 + /// / / \ | + /// 0 0 1 1 + /// /| /| | /| | + /// 1 2 0 2 0 0 1 2 + /// ``` + /// The type of values in indptrBuffers + indptrType: Int (required); + + /// indptrBuffers stores the sparsity structure. + /// Each two consecutive dimensions in a tensor correspond to a buffer in + /// indptrBuffers. A pair of consecutive values at `indptrBuffers[dim][i]` + /// and `indptrBuffers[dim][i + 1]` signify a range of nodes in + /// `indicesBuffers[dim + 1]` who are children of `indicesBuffers[dim][i]` node. + /// + /// For example, the indptrBuffers for the above X is: + /// ```text + /// indptrBuffer(X) = [ + /// [0, 2, 3], + /// [0, 1, 3, 4], + /// [0, 2, 4, 5, 8] + /// ]. + /// ``` + indptrBuffers: [Buffer] (required); + + /// The type of values in indicesBuffers + indicesType: Int (required); + + /// indicesBuffers stores values of nodes. + /// Each tensor dimension corresponds to a buffer in indicesBuffers. + /// For example, the indicesBuffers for the above X is: + /// ```text + /// indicesBuffer(X) = [ + /// [0, 1], + /// [0, 1, 1], + /// [0, 0, 1, 1], + /// [1, 2, 0, 2, 0, 0, 1, 2] + /// ]. + /// ``` + indicesBuffers: [Buffer] (required); + + /// axisOrder stores the sequence in which dimensions were traversed to + /// produce the prefix tree. + /// For example, the axisOrder for the above X is: + /// ```text + /// axisOrder(X) = [0, 1, 2, 3]. + /// ``` + axisOrder: [int] (required); +} + +union SparseTensorIndex { + SparseTensorIndexCOO, + SparseMatrixIndexCSX, + SparseTensorIndexCSF +} + +table SparseTensor { + /// The type of data contained in a value cell. + /// Currently only fixed-width value types are supported, + /// no strings or nested types. + type: Type (required); + + /// The dimensions of the tensor, optionally named. + shape: [TensorDim] (required); + + /// The number of non-zero values in a sparse tensor. + non_zero_length: long; + + /// Sparse tensor index + sparseIndex: SparseTensorIndex (required); + + /// The location and size of the tensor's data + data: Buffer (required); +} + +root_type SparseTensor; diff --git a/src/arrow/format/Tensor.fbs b/src/arrow/format/Tensor.fbs new file mode 100644 index 000000000..409297ccf --- /dev/null +++ b/src/arrow/format/Tensor.fbs @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
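The COO example above can be reproduced with plain numpy (no Arrow API assumed); np.argwhere yields exactly the NxM index matrix, already in the canonical row-major order:

```python
import numpy as np

x = np.zeros((2, 3, 4, 5), dtype=np.int64)
x[0, 1, 2, 0] = 1
x[1, 1, 2, 3] = 2
x[0, 2, 1, 0] = 3
x[0, 1, 3, 0] = 4
x[0, 1, 2, 1] = 5
x[1, 2, 0, 4] = 6

indices = np.argwhere(x)      # 6x4 matrix, rows sorted lexicographically
values = x[tuple(indices.T)]  # [1 5 4 3 2 6]
print(indices)
```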
diff --git a/src/arrow/format/Tensor.fbs b/src/arrow/format/Tensor.fbs
new file mode 100644
index 000000000..409297ccf
--- /dev/null
+++ b/src/arrow/format/Tensor.fbs
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// EXPERIMENTAL: Metadata for n-dimensional arrays, aka "tensors" or
+/// "ndarrays". Arrow implementations in general are not required to implement
+/// this type
+
+include "Schema.fbs";
+
+namespace org.apache.arrow.flatbuf;
+
+/// ----------------------------------------------------------------------
+/// Data structures for dense tensors
+
+/// Shape data for a single axis in a tensor
+table TensorDim {
+  /// Length of dimension
+  size: long;
+
+  /// Name of the dimension, optional
+  name: string;
+}
+
+table Tensor {
+  /// The type of data contained in a value cell. Currently only fixed-width
+  /// value types are supported, no strings or nested types
+  type: Type (required);
+
+  /// The dimensions of the tensor, optionally named
+  shape: [TensorDim] (required);
+
+  /// Non-negative byte offsets to advance one value cell along each dimension.
+  /// If omitted, defaults to row-major order (C-like).
+  strides: [long];
+
+  /// The location and size of the tensor's data
+  data: Buffer (required);
+}
+
+root_type Tensor;
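The optional strides field carries the same meaning as numpy's byte strides; for a row-major layout they can be derived from the shape and the value width, as a quick numpy check shows:

```python
import numpy as np

# A 2x3x4 tensor of int64 (8-byte) values in row-major (C) order.
a = np.zeros((2, 3, 4), dtype=np.int64)
print(a.strides)  # (96, 32, 8): bytes to advance one cell along each axis
```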