summaryrefslogtreecommitdiffstats
path: root/src/arrow/format/SparseTensor.fbs
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/format/SparseTensor.fbs')
-rw-r--r--src/arrow/format/SparseTensor.fbs228
1 files changed, 228 insertions, 0 deletions
diff --git a/src/arrow/format/SparseTensor.fbs b/src/arrow/format/SparseTensor.fbs
new file mode 100644
index 000000000..a6fd2f9e7
--- /dev/null
+++ b/src/arrow/format/SparseTensor.fbs
@@ -0,0 +1,228 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// EXPERIMENTAL: Metadata for n-dimensional sparse arrays, aka "sparse tensors".
+/// Arrow implementations in general are not required to implement this type
+
+include "Tensor.fbs";
+
+namespace org.apache.arrow.flatbuf;
+
+/// ----------------------------------------------------------------------
+/// EXPERIMENTAL: Data structures for sparse tensors
+
+/// Coordinate (COO) format of sparse tensor index.
+///
+/// COO's index list are represented as a NxM matrix,
+/// where N is the number of non-zero values,
+/// and M is the number of dimensions of a sparse tensor.
+///
+/// indicesBuffer stores the location and size of the data of this indices
+/// matrix. The value type and the stride of the indices matrix is
+/// specified in indicesType and indicesStrides fields.
+///
+/// For example, let X be a 2x3x4x5 tensor, and it has the following
+/// 6 non-zero values:
+/// ```text
+/// X[0, 1, 2, 0] := 1
+/// X[1, 1, 2, 3] := 2
+/// X[0, 2, 1, 0] := 3
+/// X[0, 1, 3, 0] := 4
+/// X[0, 1, 2, 1] := 5
+/// X[1, 2, 0, 4] := 6
+/// ```
+/// In COO format, the index matrix of X is the following 4x6 matrix:
+/// ```text
+/// [[0, 0, 0, 0, 1, 1],
+/// [1, 1, 1, 2, 1, 2],
+/// [2, 2, 3, 1, 2, 0],
+/// [0, 1, 0, 0, 3, 4]]
+/// ```
+/// When isCanonical is true, the indices is sorted in lexicographical order
+/// (row-major order), and it does not have duplicated entries. Otherwise,
+/// the indices may not be sorted, or may have duplicated entries.
+table SparseTensorIndexCOO {
+ /// The type of values in indicesBuffer
+ indicesType: Int (required);
+
+ /// Non-negative byte offsets to advance one value cell along each dimension
+ /// If omitted, default to row-major order (C-like).
+ indicesStrides: [long];
+
+ /// The location and size of the indices matrix's data
+ indicesBuffer: Buffer (required);
+
+ /// This flag is true if and only if the indices matrix is sorted in
+ /// row-major order, and does not have duplicated entries.
+ /// This sort order is the same as of Tensorflow's SparseTensor,
+ /// but it is inverse order of SciPy's canonical coo_matrix
+ /// (SciPy employs column-major order for its coo_matrix).
+ isCanonical: bool;
+}
+
+enum SparseMatrixCompressedAxis: short { Row, Column }
+
+/// Compressed Sparse format, that is matrix-specific.
+table SparseMatrixIndexCSX {
+ /// Which axis, row or column, is compressed
+ compressedAxis: SparseMatrixCompressedAxis;
+
+ /// The type of values in indptrBuffer
+ indptrType: Int (required);
+
+ /// indptrBuffer stores the location and size of indptr array that
+ /// represents the range of the rows.
+ /// The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data.
+ /// The length of this array is 1 + (the number of rows), and the type
+ /// of index value is long.
+ ///
+ /// For example, let X be the following 6x4 matrix:
+ /// ```text
+ /// X := [[0, 1, 2, 0],
+ /// [0, 0, 3, 0],
+ /// [0, 4, 0, 5],
+ /// [0, 0, 0, 0],
+ /// [6, 0, 7, 8],
+ /// [0, 9, 0, 0]].
+ /// ```
+ /// The array of non-zero values in X is:
+ /// ```text
+ /// values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9].
+ /// ```
+ /// And the indptr of X is:
+ /// ```text
+ /// indptr(X) = [0, 2, 3, 5, 5, 8, 10].
+ /// ```
+ indptrBuffer: Buffer (required);
+
+ /// The type of values in indicesBuffer
+ indicesType: Int (required);
+
+ /// indicesBuffer stores the location and size of the array that
+ /// contains the column indices of the corresponding non-zero values.
+ /// The type of index value is long.
+ ///
+ /// For example, the indices of the above X is:
+ /// ```text
+ /// indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1].
+ /// ```
+ /// Note that the indices are sorted in lexicographical order for each row.
+ indicesBuffer: Buffer (required);
+}
+
+/// Compressed Sparse Fiber (CSF) sparse tensor index.
+table SparseTensorIndexCSF {
+ /// CSF is a generalization of compressed sparse row (CSR) index.
+ /// See [smith2017knl](http://shaden.io/pub-files/smith2017knl.pdf)
+ ///
+ /// CSF index recursively compresses each dimension of a tensor into a set
+ /// of prefix trees. Each path from a root to leaf forms one tensor
+ /// non-zero index. CSF is implemented with two arrays of buffers and one
+ /// arrays of integers.
+ ///
+ /// For example, let X be a 2x3x4x5 tensor and let it have the following
+ /// 8 non-zero values:
+ /// ```text
+ /// X[0, 0, 0, 1] := 1
+ /// X[0, 0, 0, 2] := 2
+ /// X[0, 1, 0, 0] := 3
+ /// X[0, 1, 0, 2] := 4
+ /// X[0, 1, 1, 0] := 5
+ /// X[1, 1, 1, 0] := 6
+ /// X[1, 1, 1, 1] := 7
+ /// X[1, 1, 1, 2] := 8
+ /// ```
+ /// As a prefix tree this would be represented as:
+ /// ```text
+ /// 0 1
+ /// / \ |
+ /// 0 1 1
+ /// / / \ |
+ /// 0 0 1 1
+ /// /| /| | /| |
+ /// 1 2 0 2 0 0 1 2
+ /// ```
+ /// The type of values in indptrBuffers
+ indptrType: Int (required);
+
+ /// indptrBuffers stores the sparsity structure.
+ /// Each two consecutive dimensions in a tensor correspond to a buffer in
+ /// indptrBuffers. A pair of consecutive values at `indptrBuffers[dim][i]`
+ /// and `indptrBuffers[dim][i + 1]` signify a range of nodes in
+ /// `indicesBuffers[dim + 1]` who are children of `indicesBuffers[dim][i]` node.
+ ///
+ /// For example, the indptrBuffers for the above X is:
+ /// ```text
+ /// indptrBuffer(X) = [
+ /// [0, 2, 3],
+ /// [0, 1, 3, 4],
+ /// [0, 2, 4, 5, 8]
+ /// ].
+ /// ```
+ indptrBuffers: [Buffer] (required);
+
+ /// The type of values in indicesBuffers
+ indicesType: Int (required);
+
+ /// indicesBuffers stores values of nodes.
+ /// Each tensor dimension corresponds to a buffer in indicesBuffers.
+ /// For example, the indicesBuffers for the above X is:
+ /// ```text
+ /// indicesBuffer(X) = [
+ /// [0, 1],
+ /// [0, 1, 1],
+ /// [0, 0, 1, 1],
+ /// [1, 2, 0, 2, 0, 0, 1, 2]
+ /// ].
+ /// ```
+ indicesBuffers: [Buffer] (required);
+
+ /// axisOrder stores the sequence in which dimensions were traversed to
+ /// produce the prefix tree.
+ /// For example, the axisOrder for the above X is:
+ /// ```text
+ /// axisOrder(X) = [0, 1, 2, 3].
+ /// ```
+ axisOrder: [int] (required);
+}
+
+union SparseTensorIndex {
+ SparseTensorIndexCOO,
+ SparseMatrixIndexCSX,
+ SparseTensorIndexCSF
+}
+
+table SparseTensor {
+ /// The type of data contained in a value cell.
+ /// Currently only fixed-width value types are supported,
+ /// no strings or nested types.
+ type: Type (required);
+
+ /// The dimensions of the tensor, optionally named.
+ shape: [TensorDim] (required);
+
+ /// The number of non-zero values in a sparse tensor.
+ non_zero_length: long;
+
+ /// Sparse tensor index
+ sparseIndex: SparseTensorIndex (required);
+
+ /// The location and size of the tensor's data
+ data: Buffer (required);
+}
+
+root_type SparseTensor;