diff options
Diffstat (limited to 'src/arrow/format/SparseTensor.fbs')
-rw-r--r-- | src/arrow/format/SparseTensor.fbs | 228 |
1 files changed, 228 insertions, 0 deletions
diff --git a/src/arrow/format/SparseTensor.fbs b/src/arrow/format/SparseTensor.fbs new file mode 100644 index 000000000..a6fd2f9e7 --- /dev/null +++ b/src/arrow/format/SparseTensor.fbs @@ -0,0 +1,228 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// EXPERIMENTAL: Metadata for n-dimensional sparse arrays, aka "sparse tensors". +/// Arrow implementations in general are not required to implement this type + +include "Tensor.fbs"; + +namespace org.apache.arrow.flatbuf; + +/// ---------------------------------------------------------------------- +/// EXPERIMENTAL: Data structures for sparse tensors + +/// Coordinate (COO) format of sparse tensor index. +/// +/// COO's index list are represented as a NxM matrix, +/// where N is the number of non-zero values, +/// and M is the number of dimensions of a sparse tensor. +/// +/// indicesBuffer stores the location and size of the data of this indices +/// matrix. The value type and the stride of the indices matrix is +/// specified in indicesType and indicesStrides fields. +/// +/// For example, let X be a 2x3x4x5 tensor, and it has the following +/// 6 non-zero values: +/// ```text +/// X[0, 1, 2, 0] := 1 +/// X[1, 1, 2, 3] := 2 +/// X[0, 2, 1, 0] := 3 +/// X[0, 1, 3, 0] := 4 +/// X[0, 1, 2, 1] := 5 +/// X[1, 2, 0, 4] := 6 +/// ``` +/// In COO format, the index matrix of X is the following 4x6 matrix: +/// ```text +/// [[0, 0, 0, 0, 1, 1], +/// [1, 1, 1, 2, 1, 2], +/// [2, 2, 3, 1, 2, 0], +/// [0, 1, 0, 0, 3, 4]] +/// ``` +/// When isCanonical is true, the indices is sorted in lexicographical order +/// (row-major order), and it does not have duplicated entries. Otherwise, +/// the indices may not be sorted, or may have duplicated entries. +table SparseTensorIndexCOO { + /// The type of values in indicesBuffer + indicesType: Int (required); + + /// Non-negative byte offsets to advance one value cell along each dimension + /// If omitted, default to row-major order (C-like). + indicesStrides: [long]; + + /// The location and size of the indices matrix's data + indicesBuffer: Buffer (required); + + /// This flag is true if and only if the indices matrix is sorted in + /// row-major order, and does not have duplicated entries. + /// This sort order is the same as of Tensorflow's SparseTensor, + /// but it is inverse order of SciPy's canonical coo_matrix + /// (SciPy employs column-major order for its coo_matrix). + isCanonical: bool; +} + +enum SparseMatrixCompressedAxis: short { Row, Column } + +/// Compressed Sparse format, that is matrix-specific. +table SparseMatrixIndexCSX { + /// Which axis, row or column, is compressed + compressedAxis: SparseMatrixCompressedAxis; + + /// The type of values in indptrBuffer + indptrType: Int (required); + + /// indptrBuffer stores the location and size of indptr array that + /// represents the range of the rows. + /// The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + /// The length of this array is 1 + (the number of rows), and the type + /// of index value is long. + /// + /// For example, let X be the following 6x4 matrix: + /// ```text + /// X := [[0, 1, 2, 0], + /// [0, 0, 3, 0], + /// [0, 4, 0, 5], + /// [0, 0, 0, 0], + /// [6, 0, 7, 8], + /// [0, 9, 0, 0]]. + /// ``` + /// The array of non-zero values in X is: + /// ```text + /// values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9]. + /// ``` + /// And the indptr of X is: + /// ```text + /// indptr(X) = [0, 2, 3, 5, 5, 8, 10]. + /// ``` + indptrBuffer: Buffer (required); + + /// The type of values in indicesBuffer + indicesType: Int (required); + + /// indicesBuffer stores the location and size of the array that + /// contains the column indices of the corresponding non-zero values. + /// The type of index value is long. + /// + /// For example, the indices of the above X is: + /// ```text + /// indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1]. + /// ``` + /// Note that the indices are sorted in lexicographical order for each row. + indicesBuffer: Buffer (required); +} + +/// Compressed Sparse Fiber (CSF) sparse tensor index. +table SparseTensorIndexCSF { + /// CSF is a generalization of compressed sparse row (CSR) index. + /// See [smith2017knl](http://shaden.io/pub-files/smith2017knl.pdf) + /// + /// CSF index recursively compresses each dimension of a tensor into a set + /// of prefix trees. Each path from a root to leaf forms one tensor + /// non-zero index. CSF is implemented with two arrays of buffers and one + /// arrays of integers. + /// + /// For example, let X be a 2x3x4x5 tensor and let it have the following + /// 8 non-zero values: + /// ```text + /// X[0, 0, 0, 1] := 1 + /// X[0, 0, 0, 2] := 2 + /// X[0, 1, 0, 0] := 3 + /// X[0, 1, 0, 2] := 4 + /// X[0, 1, 1, 0] := 5 + /// X[1, 1, 1, 0] := 6 + /// X[1, 1, 1, 1] := 7 + /// X[1, 1, 1, 2] := 8 + /// ``` + /// As a prefix tree this would be represented as: + /// ```text + /// 0 1 + /// / \ | + /// 0 1 1 + /// / / \ | + /// 0 0 1 1 + /// /| /| | /| | + /// 1 2 0 2 0 0 1 2 + /// ``` + /// The type of values in indptrBuffers + indptrType: Int (required); + + /// indptrBuffers stores the sparsity structure. + /// Each two consecutive dimensions in a tensor correspond to a buffer in + /// indptrBuffers. A pair of consecutive values at `indptrBuffers[dim][i]` + /// and `indptrBuffers[dim][i + 1]` signify a range of nodes in + /// `indicesBuffers[dim + 1]` who are children of `indicesBuffers[dim][i]` node. + /// + /// For example, the indptrBuffers for the above X is: + /// ```text + /// indptrBuffer(X) = [ + /// [0, 2, 3], + /// [0, 1, 3, 4], + /// [0, 2, 4, 5, 8] + /// ]. + /// ``` + indptrBuffers: [Buffer] (required); + + /// The type of values in indicesBuffers + indicesType: Int (required); + + /// indicesBuffers stores values of nodes. + /// Each tensor dimension corresponds to a buffer in indicesBuffers. + /// For example, the indicesBuffers for the above X is: + /// ```text + /// indicesBuffer(X) = [ + /// [0, 1], + /// [0, 1, 1], + /// [0, 0, 1, 1], + /// [1, 2, 0, 2, 0, 0, 1, 2] + /// ]. + /// ``` + indicesBuffers: [Buffer] (required); + + /// axisOrder stores the sequence in which dimensions were traversed to + /// produce the prefix tree. + /// For example, the axisOrder for the above X is: + /// ```text + /// axisOrder(X) = [0, 1, 2, 3]. + /// ``` + axisOrder: [int] (required); +} + +union SparseTensorIndex { + SparseTensorIndexCOO, + SparseMatrixIndexCSX, + SparseTensorIndexCSF +} + +table SparseTensor { + /// The type of data contained in a value cell. + /// Currently only fixed-width value types are supported, + /// no strings or nested types. + type: Type (required); + + /// The dimensions of the tensor, optionally named. + shape: [TensorDim] (required); + + /// The number of non-zero values in a sparse tensor. + non_zero_length: long; + + /// Sparse tensor index + sparseIndex: SparseTensorIndex (required); + + /// The location and size of the tensor's data + data: Buffer (required); +} + +root_type SparseTensor; |