From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/arrow/java/vector/pom.xml | 274 ++ src/arrow/java/vector/src/main/codegen/config.fmpp | 24 + .../vector/src/main/codegen/data/ArrowTypes.tdd | 124 + .../src/main/codegen/data/ValueVectorTypes.tdd | 206 ++ .../vector/src/main/codegen/includes/license.ftl | 16 + .../src/main/codegen/includes/vv_imports.ftl | 61 + .../codegen/templates/AbstractFieldReader.java | 132 + .../codegen/templates/AbstractFieldWriter.java | 230 ++ .../templates/AbstractPromotableFieldWriter.java | 238 ++ .../src/main/codegen/templates/ArrowType.java | 375 +++ .../src/main/codegen/templates/BaseReader.java | 85 + .../src/main/codegen/templates/BaseWriter.java | 131 + .../templates/CaseSensitiveStructWriters.java | 53 + .../src/main/codegen/templates/ComplexCopier.java | 191 ++ .../src/main/codegen/templates/ComplexReaders.java | 147 + .../src/main/codegen/templates/ComplexWriters.java | 211 ++ .../main/codegen/templates/DenseUnionReader.java | 229 ++ .../main/codegen/templates/DenseUnionVector.java | 943 ++++++ .../main/codegen/templates/DenseUnionWriter.java | 302 ++ .../main/codegen/templates/HolderReaderImpl.java | 173 ++ .../src/main/codegen/templates/NullReader.java | 147 + .../src/main/codegen/templates/StructWriters.java | 326 +++ .../templates/UnionFixedSizeListWriter.java | 319 ++ .../main/codegen/templates/UnionListWriter.java | 326 +++ .../src/main/codegen/templates/UnionMapWriter.java | 222 ++ .../src/main/codegen/templates/UnionReader.java | 223 ++ .../src/main/codegen/templates/UnionVector.java | 854 ++++++ .../src/main/codegen/templates/UnionWriter.java | 364 +++ .../src/main/codegen/templates/ValueHolders.java | 81 + .../org/apache/arrow/vector/AddOrGetResult.java | 46 + .../org/apache/arrow/vector/AllocationHelper.java | 95 + 
.../apache/arrow/vector/BaseFixedWidthVector.java | 930 ++++++ .../org/apache/arrow/vector/BaseIntVector.java | 43 + .../arrow/vector/BaseLargeVariableWidthVector.java | 1370 +++++++++ .../org/apache/arrow/vector/BaseValueVector.java | 231 ++ .../arrow/vector/BaseVariableWidthVector.java | 1410 +++++++++ .../java/org/apache/arrow/vector/BigIntVector.java | 358 +++ .../java/org/apache/arrow/vector/BitVector.java | 599 ++++ .../org/apache/arrow/vector/BitVectorHelper.java | 449 +++ .../java/org/apache/arrow/vector/BufferBacked.java | 31 + .../java/org/apache/arrow/vector/BufferLayout.java | 153 + .../org/apache/arrow/vector/DateDayVector.java | 347 +++ .../org/apache/arrow/vector/DateMilliVector.java | 350 +++ .../org/apache/arrow/vector/Decimal256Vector.java | 584 ++++ .../org/apache/arrow/vector/DecimalVector.java | 584 ++++ .../apache/arrow/vector/DensityAwareVector.java | 57 + .../org/apache/arrow/vector/DurationVector.java | 406 +++ .../arrow/vector/ElementAddressableVector.java | 42 + .../apache/arrow/vector/ExtensionTypeVector.java | 274 ++ .../java/org/apache/arrow/vector/FieldVector.java | 93 + .../apache/arrow/vector/FixedSizeBinaryVector.java | 386 +++ .../org/apache/arrow/vector/FixedWidthVector.java | 36 + .../java/org/apache/arrow/vector/Float4Vector.java | 361 +++ .../java/org/apache/arrow/vector/Float8Vector.java | 362 +++ .../apache/arrow/vector/FloatingPointVector.java | 46 + .../apache/arrow/vector/GenerateSampleData.java | 337 +++ .../java/org/apache/arrow/vector/IntVector.java | 362 +++ .../org/apache/arrow/vector/IntervalDayVector.java | 433 +++ .../arrow/vector/IntervalMonthDayNanoVector.java | 442 +++ .../apache/arrow/vector/IntervalYearVector.java | 382 +++ .../apache/arrow/vector/LargeVarBinaryVector.java | 305 ++ .../apache/arrow/vector/LargeVarCharVector.java | 331 +++ .../apache/arrow/vector/NullCheckingForGet.java | 84 + .../java/org/apache/arrow/vector/NullVector.java | 338 +++ .../org/apache/arrow/vector/PeriodDuration.java | 64 + 
.../apache/arrow/vector/SchemaChangeCallBack.java | 57 + .../org/apache/arrow/vector/SmallIntVector.java | 389 +++ .../org/apache/arrow/vector/TimeMicroVector.java | 347 +++ .../org/apache/arrow/vector/TimeMilliVector.java | 351 +++ .../org/apache/arrow/vector/TimeNanoVector.java | 347 +++ .../org/apache/arrow/vector/TimeSecVector.java | 348 +++ .../arrow/vector/TimeStampMicroTZVector.java | 239 ++ .../apache/arrow/vector/TimeStampMicroVector.java | 236 ++ .../arrow/vector/TimeStampMilliTZVector.java | 238 ++ .../apache/arrow/vector/TimeStampMilliVector.java | 236 ++ .../apache/arrow/vector/TimeStampNanoTZVector.java | 241 ++ .../apache/arrow/vector/TimeStampNanoVector.java | 236 ++ .../apache/arrow/vector/TimeStampSecTZVector.java | 238 ++ .../apache/arrow/vector/TimeStampSecVector.java | 237 ++ .../org/apache/arrow/vector/TimeStampVector.java | 197 ++ .../org/apache/arrow/vector/TinyIntVector.java | 390 +++ .../java/org/apache/arrow/vector/TypeLayout.java | 448 +++ .../java/org/apache/arrow/vector/UInt1Vector.java | 368 +++ .../java/org/apache/arrow/vector/UInt2Vector.java | 346 +++ .../java/org/apache/arrow/vector/UInt4Vector.java | 340 +++ .../java/org/apache/arrow/vector/UInt8Vector.java | 336 +++ .../java/org/apache/arrow/vector/ValueVector.java | 285 ++ .../org/apache/arrow/vector/VarBinaryVector.java | 306 ++ .../org/apache/arrow/vector/VarCharVector.java | 331 +++ .../apache/arrow/vector/VariableWidthVector.java | 53 + .../arrow/vector/VectorDefinitionSetter.java | 26 + .../java/org/apache/arrow/vector/VectorLoader.java | 137 + .../org/apache/arrow/vector/VectorSchemaRoot.java | 429 +++ .../org/apache/arrow/vector/VectorUnloader.java | 107 + .../java/org/apache/arrow/vector/ZeroVector.java | 138 + .../arrow/vector/compare/ApproxEqualsVisitor.java | 147 + .../org/apache/arrow/vector/compare/Range.java | 85 + .../arrow/vector/compare/RangeEqualsVisitor.java | 563 ++++ .../arrow/vector/compare/TypeEqualsVisitor.java | 154 + 
.../arrow/vector/compare/VectorEqualsVisitor.java | 60 + .../arrow/vector/compare/VectorValueEqualizer.java | 44 + .../apache/arrow/vector/compare/VectorVisitor.java | 61 + .../compare/util/ValueEpsilonEqualizers.java | 149 + .../vector/complex/AbstractContainerVector.java | 140 + .../arrow/vector/complex/AbstractStructVector.java | 425 +++ .../arrow/vector/complex/BaseListVector.java | 36 + .../vector/complex/BaseRepeatedValueVector.java | 367 +++ .../arrow/vector/complex/EmptyValuePopulator.java | 51 + .../arrow/vector/complex/FixedSizeListVector.java | 675 +++++ .../arrow/vector/complex/LargeListVector.java | 1036 +++++++ .../apache/arrow/vector/complex/ListVector.java | 879 ++++++ .../org/apache/arrow/vector/complex/MapVector.java | 122 + .../vector/complex/NonNullableStructVector.java | 440 +++ .../apache/arrow/vector/complex/Positionable.java | 29 + .../arrow/vector/complex/PromotableVector.java | 32 + .../complex/RepeatedFixedWidthVectorLike.java | 32 + .../arrow/vector/complex/RepeatedValueVector.java | 49 + .../complex/RepeatedVariableWidthVectorLike.java | 40 + .../org/apache/arrow/vector/complex/StateTool.java | 44 + .../apache/arrow/vector/complex/StructVector.java | 608 ++++ .../arrow/vector/complex/VectorWithOrdinal.java | 34 + .../vector/complex/impl/AbstractBaseReader.java | 118 + .../vector/complex/impl/AbstractBaseWriter.java | 55 + .../vector/complex/impl/ComplexWriterImpl.java | 227 ++ .../complex/impl/NullableStructReaderImpl.java | 59 + .../complex/impl/NullableStructWriterFactory.java | 48 + .../vector/complex/impl/PromotableWriter.java | 398 +++ .../vector/complex/impl/SingleListReaderImpl.java | 91 + .../complex/impl/SingleStructReaderImpl.java | 113 + .../complex/impl/StructOrListWriterImpl.java | 137 + .../complex/impl/UnionFixedSizeListReader.java | 105 + .../vector/complex/impl/UnionLargeListReader.java | 109 + .../arrow/vector/complex/impl/UnionListReader.java | 107 + .../arrow/vector/complex/impl/UnionMapReader.java | 77 + 
.../arrow/vector/complex/reader/FieldReader.java | 35 + .../arrow/vector/complex/writer/FieldWriter.java | 33 + .../compression/AbstractCompressionCodec.java | 116 + .../arrow/vector/compression/CompressionCodec.java | 62 + .../arrow/vector/compression/CompressionUtil.java | 103 + .../vector/compression/NoCompressionCodec.java | 67 + .../apache/arrow/vector/dictionary/Dictionary.java | 75 + .../arrow/vector/dictionary/DictionaryEncoder.java | 196 ++ .../vector/dictionary/DictionaryHashTable.java | 295 ++ .../vector/dictionary/DictionaryProvider.java | 62 + .../vector/dictionary/ListSubfieldEncoder.java | 137 + .../vector/dictionary/StructSubfieldEncoder.java | 196 ++ .../apache/arrow/vector/holders/ComplexHolder.java | 28 + .../arrow/vector/holders/DenseUnionHolder.java | 38 + .../arrow/vector/holders/RepeatedListHolder.java | 26 + .../arrow/vector/holders/RepeatedStructHolder.java | 26 + .../apache/arrow/vector/holders/UnionHolder.java | 37 + .../apache/arrow/vector/holders/ValueHolder.java | 31 + .../apache/arrow/vector/ipc/ArrowFileReader.java | 230 ++ .../apache/arrow/vector/ipc/ArrowFileWriter.java | 119 + .../org/apache/arrow/vector/ipc/ArrowMagic.java | 44 + .../org/apache/arrow/vector/ipc/ArrowReader.java | 255 ++ .../apache/arrow/vector/ipc/ArrowStreamReader.java | 229 ++ .../apache/arrow/vector/ipc/ArrowStreamWriter.java | 86 + .../org/apache/arrow/vector/ipc/ArrowWriter.java | 210 ++ .../vector/ipc/InvalidArrowFileException.java | 30 + .../apache/arrow/vector/ipc/JsonFileReader.java | 806 ++++++ .../apache/arrow/vector/ipc/JsonFileWriter.java | 417 +++ .../org/apache/arrow/vector/ipc/ReadChannel.java | 102 + .../arrow/vector/ipc/SeekableReadChannel.java | 43 + .../org/apache/arrow/vector/ipc/WriteChannel.java | 162 ++ .../arrow/vector/ipc/message/ArrowBlock.java | 95 + .../vector/ipc/message/ArrowBodyCompression.java | 55 + .../arrow/vector/ipc/message/ArrowBuffer.java | 90 + .../vector/ipc/message/ArrowDictionaryBatch.java | 94 + 
.../arrow/vector/ipc/message/ArrowFieldNode.java | 64 + .../arrow/vector/ipc/message/ArrowFooter.java | 226 ++ .../arrow/vector/ipc/message/ArrowMessage.java | 42 + .../arrow/vector/ipc/message/ArrowRecordBatch.java | 259 ++ .../arrow/vector/ipc/message/FBSerializable.java | 30 + .../arrow/vector/ipc/message/FBSerializables.java | 67 + .../apache/arrow/vector/ipc/message/IpcOption.java | 44 + .../vector/ipc/message/MessageChannelReader.java | 91 + .../vector/ipc/message/MessageMetadataResult.java | 115 + .../arrow/vector/ipc/message/MessageResult.java | 61 + .../vector/ipc/message/MessageSerializer.java | 736 +++++ .../org/apache/arrow/vector/types/DateUnit.java | 50 + .../arrow/vector/types/FloatingPointPrecision.java | 55 + .../apache/arrow/vector/types/IntervalUnit.java | 52 + .../apache/arrow/vector/types/MetadataVersion.java | 65 + .../org/apache/arrow/vector/types/TimeUnit.java | 50 + .../java/org/apache/arrow/vector/types/Types.java | 1016 +++++++ .../org/apache/arrow/vector/types/UnionMode.java | 57 + .../vector/types/pojo/DictionaryEncoding.java | 88 + .../vector/types/pojo/ExtensionTypeRegistry.java | 42 + .../org/apache/arrow/vector/types/pojo/Field.java | 306 ++ .../apache/arrow/vector/types/pojo/FieldType.java | 123 + .../org/apache/arrow/vector/types/pojo/Schema.java | 247 ++ .../util/ByteArrayReadableSeekableByteChannel.java | 86 + .../org/apache/arrow/vector/util/CallBack.java | 25 + .../arrow/vector/util/DataSizeRoundingUtil.java | 99 + .../org/apache/arrow/vector/util/DateUtility.java | 134 + .../apache/arrow/vector/util/DecimalUtility.java | 188 ++ .../arrow/vector/util/DictionaryUtility.java | 145 + .../util/ElementAddressableVectorIterator.java | 86 + .../arrow/vector/util/JsonStringArrayList.java | 55 + .../arrow/vector/util/JsonStringHashMap.java | 48 + .../apache/arrow/vector/util/MapWithOrdinal.java | 67 + .../arrow/vector/util/MapWithOrdinalImpl.java | 248 ++ .../arrow/vector/util/MultiMapWithOrdinal.java | 230 ++ 
.../vector/util/OversizedAllocationException.java | 52 + .../vector/util/PromotableMultiMapWithOrdinal.java | 133 + .../vector/util/SchemaChangeRuntimeException.java | 48 + .../apache/arrow/vector/util/SchemaUtility.java | 63 + .../java/org/apache/arrow/vector/util/Text.java | 688 +++++ .../org/apache/arrow/vector/util/TransferPair.java | 33 + .../org/apache/arrow/vector/util/Validator.java | 190 ++ .../arrow/vector/util/ValueVectorUtility.java | 187 ++ .../apache/arrow/vector/util/VectorAppender.java | 542 ++++ .../arrow/vector/util/VectorBatchAppender.java | 39 + .../vector/util/VectorSchemaRootAppender.java | 83 + .../vector/validate/MetadataV4UnionChecker.java | 82 + .../apache/arrow/vector/validate/ValidateUtil.java | 61 + .../validate/ValidateVectorBufferVisitor.java | 246 ++ .../vector/validate/ValidateVectorDataVisitor.java | 180 ++ .../vector/validate/ValidateVectorTypeVisitor.java | 378 +++ .../vector/validate/ValidateVectorVisitor.java | 273 ++ .../java/org/apache/arrow/util/TestSchemaUtil.java | 51 + .../apache/arrow/vector/DirtyRootAllocator.java | 52 + .../org/apache/arrow/vector/ITTestLargeVector.java | 280 ++ .../org/apache/arrow/vector/TestBitVector.java | 543 ++++ .../apache/arrow/vector/TestBitVectorHelper.java | 235 ++ .../arrow/vector/TestBufferOwnershipTransfer.java | 131 + .../java/org/apache/arrow/vector/TestCopyFrom.java | 1104 +++++++ .../apache/arrow/vector/TestDecimal256Vector.java | 357 +++ .../org/apache/arrow/vector/TestDecimalVector.java | 365 +++ .../apache/arrow/vector/TestDenseUnionVector.java | 639 ++++ .../apache/arrow/vector/TestDictionaryVector.java | 1032 +++++++ .../apache/arrow/vector/TestDurationVector.java | 137 + .../arrow/vector/TestFixedSizeBinaryVector.java | 279 ++ .../arrow/vector/TestFixedSizeListVector.java | 507 ++++ .../vector/TestIntervalMonthDayNanoVector.java | 99 + .../arrow/vector/TestIntervalYearVector.java | 58 + .../apache/arrow/vector/TestLargeListVector.java | 982 +++++++ 
.../arrow/vector/TestLargeVarBinaryVector.java | 104 + .../arrow/vector/TestLargeVarCharVector.java | 816 ++++++ .../org/apache/arrow/vector/TestListVector.java | 981 +++++++ .../org/apache/arrow/vector/TestMapVector.java | 1113 +++++++ .../arrow/vector/TestNullCheckingForGet.java | 92 + .../vector/TestOutOfMemoryForValueVector.java | 73 + .../TestOversizedAllocationForValueVector.java | 132 + .../apache/arrow/vector/TestPeriodDuration.java | 46 + .../apache/arrow/vector/TestSplitAndTransfer.java | 410 +++ .../org/apache/arrow/vector/TestStructVector.java | 183 ++ .../org/apache/arrow/vector/TestTypeLayout.java | 98 + .../org/apache/arrow/vector/TestUnionVector.java | 520 ++++ .../java/org/apache/arrow/vector/TestUtils.java | 45 + .../org/apache/arrow/vector/TestValueVector.java | 3061 ++++++++++++++++++++ .../apache/arrow/vector/TestVarCharListVector.java | 77 + .../org/apache/arrow/vector/TestVectorAlloc.java | 169 ++ .../org/apache/arrow/vector/TestVectorReAlloc.java | 474 +++ .../org/apache/arrow/vector/TestVectorReset.java | 168 ++ .../apache/arrow/vector/TestVectorSchemaRoot.java | 318 ++ .../apache/arrow/vector/TestVectorUnloadLoad.java | 332 +++ .../vector/compare/TestRangeEqualsVisitor.java | 740 +++++ .../vector/compare/TestTypeEqualsVisitor.java | 185 ++ .../vector/complex/impl/TestComplexCopier.java | 763 +++++ .../vector/complex/impl/TestPromotableWriter.java | 167 ++ .../vector/complex/writer/TestComplexWriter.java | 1335 +++++++++ .../org/apache/arrow/vector/ipc/BaseFileTest.java | 849 ++++++ .../vector/ipc/ITTestIPCWithLargeArrowBuffers.java | 187 ++ .../arrow/vector/ipc/MessageSerializerTest.java | 247 ++ .../org/apache/arrow/vector/ipc/TestArrowFile.java | 134 + .../apache/arrow/vector/ipc/TestArrowFooter.java | 68 + .../arrow/vector/ipc/TestArrowReaderWriter.java | 882 ++++++ .../apache/arrow/vector/ipc/TestArrowStream.java | 147 + .../arrow/vector/ipc/TestArrowStreamPipe.java | 161 + .../org/apache/arrow/vector/ipc/TestJSONFile.java | 458 +++ 
.../org/apache/arrow/vector/ipc/TestRoundTrip.java | 628 ++++ .../vector/ipc/TestUIntDictionaryRoundTrip.java | 246 ++ .../ipc/message/TestMessageMetadataResult.java | 36 + .../org/apache/arrow/vector/pojo/TestConvert.java | 169 ++ .../arrow/vector/testing/RandomDataGenerator.java | 44 + .../vector/testing/TestValueVectorPopulator.java | 604 ++++ .../vector/testing/ValueVectorDataPopulator.java | 708 +++++ .../arrow/vector/types/pojo/TestExtensionType.java | 420 +++ .../apache/arrow/vector/types/pojo/TestField.java | 63 + .../apache/arrow/vector/types/pojo/TestSchema.java | 254 ++ .../arrow/vector/util/DecimalUtilityTest.java | 127 + .../vector/util/TestDataSizeRoundingUtil.java | 76 + .../util/TestElementAddressableVectorIterator.java | 134 + .../arrow/vector/util/TestMultiMapWithOrdinal.java | 60 + .../apache/arrow/vector/util/TestValidator.java | 56 + .../arrow/vector/util/TestVectorAppender.java | 794 +++++ .../arrow/vector/util/TestVectorBatchAppender.java | 72 + .../vector/util/TestVectorSchemaRootAppender.java | 161 + .../arrow/vector/validate/TestValidateVector.java | 260 ++ .../vector/validate/TestValidateVectorFull.java | 234 ++ .../validate/TestValidateVectorSchemaRoot.java | 101 + .../validate/TestValidateVectorTypeVisitor.java | 301 ++ .../java/vector/src/test/resources/logback.xml | 28 + 295 files changed, 76818 insertions(+) create mode 100644 src/arrow/java/vector/pom.xml create mode 100644 src/arrow/java/vector/src/main/codegen/config.fmpp create mode 100644 src/arrow/java/vector/src/main/codegen/data/ArrowTypes.tdd create mode 100644 src/arrow/java/vector/src/main/codegen/data/ValueVectorTypes.tdd create mode 100644 src/arrow/java/vector/src/main/codegen/includes/license.ftl create mode 100644 src/arrow/java/vector/src/main/codegen/includes/vv_imports.ftl create mode 100644 src/arrow/java/vector/src/main/codegen/templates/AbstractFieldReader.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/AbstractFieldWriter.java create 
mode 100644 src/arrow/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/ArrowType.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/BaseReader.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/BaseWriter.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/CaseSensitiveStructWriters.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/ComplexCopier.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/ComplexReaders.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/ComplexWriters.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/DenseUnionReader.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/DenseUnionVector.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/DenseUnionWriter.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/HolderReaderImpl.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/NullReader.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/StructWriters.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/UnionFixedSizeListWriter.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/UnionListWriter.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/UnionMapWriter.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/UnionReader.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/UnionVector.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/UnionWriter.java create mode 100644 src/arrow/java/vector/src/main/codegen/templates/ValueHolders.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java create mode 100644 
src/arrow/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseIntVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BigIntVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DateMilliVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/Decimal256Vector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DurationVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ElementAddressableVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java 
create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/Float4Vector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/Float8Vector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FloatingPointVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntervalDayVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntervalMonthDayNanoVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntervalYearVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/NullCheckingForGet.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/PeriodDuration.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/SmallIntVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeMicroVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeMilliVector.java create mode 100644 
src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeNanoVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeSecVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMicroTZVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMicroVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMilliTZVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMilliVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampNanoTZVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampNanoVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampSecTZVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampSecVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TinyIntVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt1Vector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt2Vector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt4Vector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt8Vector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java create 
mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorDefinitionSetter.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorEqualsVisitor.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorValueEqualizer.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/util/ValueEpsilonEqualizers.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractStructVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseListVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java create mode 100644 
src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/PromotableVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableStructReaderImpl.java create 
mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableStructWriterFactory.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleStructReaderImpl.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/StructOrListWriterImpl.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionLargeListReader.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionMapReader.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/AbstractCompressionCodec.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/CompressionCodec.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/CompressionUtil.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/NoCompressionCodec.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/Dictionary.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java create mode 100644 
src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryProvider.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ListSubfieldEncoder.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/StructSubfieldEncoder.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/DenseUnionHolder.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedStructHolder.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileReader.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileWriter.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamReader.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamWriter.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/InvalidArrowFileException.java create mode 100644 
src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ReadChannel.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/SeekableReadChannel.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBlock.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBodyCompression.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBuffer.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowDictionaryBatch.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowFieldNode.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowFooter.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowMessage.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatch.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializable.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/IpcOption.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageChannelReader.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageMetadataResult.java create mode 100644 
src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageResult.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageSerializer.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/DateUnit.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/FloatingPointPrecision.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/IntervalUnit.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/MetadataVersion.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/TimeUnit.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/UnionMode.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/DictionaryEncoding.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/ExtensionTypeRegistry.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/FieldType.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/ByteArrayReadableSeekableByteChannel.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/CallBack.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/DataSizeRoundingUtil.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java 
create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/ElementAddressableVectorIterator.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinalImpl.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/MultiMapWithOrdinal.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/PromotableMultiMapWithOrdinal.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaUtility.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/ValueVectorUtility.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/VectorSchemaRootAppender.java create mode 100644 
src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/MetadataV4UnionChecker.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateUtil.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java create mode 100644 src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/util/TestSchemaUtil.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ITTestLargeVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestBitVectorHelper.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDecimal256Vector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDurationVector.java create mode 100644 
src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestIntervalMonthDayNanoVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestIntervalYearVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestNullCheckingForGet.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestOutOfMemoryForValueVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestPeriodDuration.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestStructVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestTypeLayout.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java create mode 
100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVarCharListVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorAlloc.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorSchemaRoot.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestComplexCopier.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/BaseFileTest.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/ITTestIPCWithLargeArrowBuffers.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/MessageSerializerTest.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFooter.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowReaderWriter.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowStream.java create mode 100644 
src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowStreamPipe.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestRoundTrip.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestUIntDictionaryRoundTrip.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/message/TestMessageMetadataResult.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/testing/RandomDataGenerator.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/testing/TestValueVectorPopulator.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestField.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/DecimalUtilityTest.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestDataSizeRoundingUtil.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestElementAddressableVectorIterator.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestMultiMapWithOrdinal.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestValidator.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java create mode 100644 
src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorSchemaRootAppender.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVector.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorFull.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorSchemaRoot.java create mode 100644 src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorTypeVisitor.java create mode 100644 src/arrow/java/vector/src/test/resources/logback.xml (limited to 'src/arrow/java/vector') diff --git a/src/arrow/java/vector/pom.xml b/src/arrow/java/vector/pom.xml new file mode 100644 index 000000000..e37e931ef --- /dev/null +++ b/src/arrow/java/vector/pom.xml @@ -0,0 +1,274 @@ + + + + 4.0.0 + + org.apache.arrow + arrow-java-root + 6.0.1 + + arrow-vector + Arrow Vectors + An off-heap reference implementation for Arrow columnar data format. 
+ + + + + org.apache.arrow + arrow-format + ${project.version} + + + org.apache.arrow + arrow-memory-core + ${project.version} + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-databind + + + commons-codec + commons-codec + 1.10 + + + org.apache.arrow + arrow-memory-netty + ${project.version} + test + + + org.apache.arrow + arrow-memory-unsafe + ${project.version} + test + + + io.netty + netty-common + + + com.google.flatbuffers + flatbuffers-java + ${dep.fbs.version} + + + org.slf4j + slf4j-api + + + + + + apache + apache + https://repo.maven.apache.org/maven2/ + + true + + + false + + + + + + + + + + ${basedir}/src/main/codegen + codegen + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + + default-test + test + + + org.apache.arrow:arrow-memory-unsafe + + + + + run-unsafe + test + + test + + + + org.apache.arrow:arrow-memory-netty + + netty + + + + + + maven-resources-plugin + + + copy-fmpp-resources + initialize + + copy-resources + + + ${project.build.directory}/codegen + + + src/main/codegen + false + + + + + + + + org.apache.drill.tools + drill-fmpp-maven-plugin + 1.5.0 + + + generate-fmpp + generate-sources + + generate + + + src/main/codegen/config.fmpp + ${project.build.directory}/generated-sources + ${project.build.directory}/codegen/templates + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.1.1 + + + package + + shade + + + + + org.apache.arrow:arrow-format + com.google.flatbuffers:* + + + true + shade-format-flatbuffers + true + true + + + com.google.flatbuffers + arrow.vector.com.google.flatbuffers + + + + + + + + + + + + org.eclipse.m2e + lifecycle-mapping + 1.0.0 + + + + + + org.apache.drill.tools + drill-fmpp-maven-plugin + [1.0,) + + generate + + + + + false + true + + + + + + + + + + + + + + + + + + integration-tests + + + + org.apache.maven.plugins + maven-failsafe-plugin + + 3600 + + false + + + + + + integration-test + 
verify + + + + + + + + + + diff --git a/src/arrow/java/vector/src/main/codegen/config.fmpp b/src/arrow/java/vector/src/main/codegen/config.fmpp new file mode 100644 index 000000000..ef5a5072a --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/config.fmpp @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +data: { + # TODO: Rename to ~valueVectorModesAndTypes for clarity. + vv: tdd(../data/ValueVectorTypes.tdd), + arrowTypes: tdd(../data/ArrowTypes.tdd) + +} +freemarkerLinks: { + includes: includes/ +} diff --git a/src/arrow/java/vector/src/main/codegen/data/ArrowTypes.tdd b/src/arrow/java/vector/src/main/codegen/data/ArrowTypes.tdd new file mode 100644 index 000000000..3cf9a9687 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -0,0 +1,124 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{ + types: [ + { + name: "Null", + fields: [], + complex: false + }, + { + name: "Struct_", + fields: [], + complex: true + }, + { + name: "List", + fields: [], + complex: true + }, + { + name: "LargeList", + fields: [], + complex: true + }, + { + name: "FixedSizeList", + fields: [{name: "listSize", type: int}], + complex: true + }, + { + name: "Union", + fields: [{name: "mode", type: short, valueType: UnionMode}, {name: "typeIds", type: "int[]"}], + complex: true + }, + { + name: "Map", + fields: [{name: "keysSorted", type: boolean}], + complex: true + }, + { + name: "Int", + fields: [{name: "bitWidth", type: int}, {name: "isSigned", type: boolean}], + complex: false + }, + { + name: "FloatingPoint", + fields: [{name: precision, type: short, valueType: FloatingPointPrecision}], + complex: false + }, + { + name: "Utf8", + fields: [], + complex: false + }, + { + name: "LargeUtf8", + fields: [], + complex: false + }, + { + name: "Binary", + fields: [], + complex: false + }, + { + name: "LargeBinary", + fields: [], + complex: false + }, + { + name: "FixedSizeBinary", + fields: [{name: "byteWidth", type: int}], + complex: false + } + { + name: "Bool", + fields: [], + complex: false + }, + { + name: "Decimal", + fields: [{name: "precision", type: int}, {name: "scale", type: int}, {name: "bitWidth", type: int}], + complex: false + }, + { + name: "Date", + fields: [{name: "unit", type: short, valueType: DateUnit}] + complex: false + }, + { + name: "Time", + fields: [{name: "unit", type: short, valueType: TimeUnit}, {name: "bitWidth", type: int}], + complex: false 
+ }, + { + name: "Timestamp", + fields: [{name: "unit", type: short, valueType: TimeUnit}, {name: "timezone", type: String}] + complex: false + }, + { + name: "Interval", + fields: [{name: "unit", type: short, valueType: IntervalUnit}], + complex: false + }, + { + name: "Duration", + fields: [{name: "unit", type: short, valueType: TimeUnit}], + complex: false + } + ] +} diff --git a/src/arrow/java/vector/src/main/codegen/data/ValueVectorTypes.tdd b/src/arrow/java/vector/src/main/codegen/data/ValueVectorTypes.tdd new file mode 100644 index 000000000..2a9218042 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/data/ValueVectorTypes.tdd @@ -0,0 +1,206 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +{ + modes: [ + {name: "Optional", prefix: "Nullable"}, + {name: "Required", prefix: ""} + ], + types: [ + { + major: "Fixed", + width: 1, + javaType: "byte", + boxedType: "Byte", + fields: [{name: "value", type: "byte"}], + minor: [ + { class: "TinyInt", valueHolder: "IntHolder" }, + { class: "UInt1", valueHolder: "UInt1Holder" } + ] + }, + { + major: "Fixed", + width: 2, + javaType: "char", + boxedType: "Character", + fields: [{name: "value", type: "char"}], + minor: [ + { class: "UInt2", valueHolder: "UInt2Holder"} + ] + }, { + major: "Fixed", + width: 2, + javaType: "short", + boxedType: "Short", + fields: [{name: "value", type: "short"}], + minor: [ + { class: "SmallInt", valueHolder: "Int2Holder"}, + ] + }, + { + major: "Fixed", + width: 4, + javaType: "int", + boxedType: "Integer", + fields: [{name: "value", type: "int"}], + minor: [ + { class: "Int", valueHolder: "IntHolder"}, + { class: "UInt4", valueHolder: "UInt4Holder" }, + { class: "Float4", javaType: "float" , boxedType: "Float", fields: [{name: "value", type: "float"}]}, + { class: "DateDay" }, + { class: "IntervalYear", javaType: "int", friendlyType: "Period" }, + { class: "TimeSec" }, + { class: "TimeMilli", javaType: "int", friendlyType: "LocalDateTime" } + ] + }, + { + major: "Fixed", + width: 8, + javaType: "long", + boxedType: "Long", + fields: [{name: "value", type: "long"}], + minor: [ + { class: "BigInt"}, + { class: "UInt8" }, + { class: "Float8", javaType: "double", boxedType: "Double", fields: [{name: "value", type: "double"}] }, + { class: "DateMilli", javaType: "long", friendlyType: "LocalDateTime" }, + { class: "Duration", javaType: "long", friendlyType: "Duration", + arrowType: "org.apache.arrow.vector.types.pojo.ArrowType.Duration", + typeParams: [ {name: "unit", type: "org.apache.arrow.vector.types.TimeUnit"} ], + arrowTypeConstructorParams: ["unit"]} + { class: "TimeStampSec", javaType: "long", boxedType: "Long", friendlyType: "LocalDateTime" }, + { class: "TimeStampMilli", 
javaType: "long", boxedType: "Long", friendlyType: "LocalDateTime" }, + { class: "TimeStampMicro", javaType: "long", boxedType: "Long", friendlyType: "LocalDateTime" }, + { class: "TimeStampNano", javaType: "long", boxedType: "Long", friendlyType: "LocalDateTime" }, + { class: "TimeStampSecTZ", javaType: "long", boxedType: "Long", + typeParams: [ {name: "timezone", type: "String"} ], + arrowType: "org.apache.arrow.vector.types.pojo.ArrowType.Timestamp", + arrowTypeConstructorParams: ["org.apache.arrow.vector.types.TimeUnit.SECOND", "timezone"] }, + { class: "TimeStampMilliTZ", javaType: "long", boxedType: "Long", + typeParams: [ {name: "timezone", type: "String"} ], + arrowType: "org.apache.arrow.vector.types.pojo.ArrowType.Timestamp", + arrowTypeConstructorParams: ["org.apache.arrow.vector.types.TimeUnit.MILLISECOND", "timezone"] }, + { class: "TimeStampMicroTZ", javaType: "long", boxedType: "Long", + typeParams: [ {name: "timezone", type: "String"} ], + arrowType: "org.apache.arrow.vector.types.pojo.ArrowType.Timestamp", + arrowTypeConstructorParams: ["org.apache.arrow.vector.types.TimeUnit.MICROSECOND", "timezone"] }, + { class: "TimeStampNanoTZ", javaType: "long", boxedType: "Long", + typeParams: [ {name: "timezone", type: "String"} ], + arrowType: "org.apache.arrow.vector.types.pojo.ArrowType.Timestamp", + arrowTypeConstructorParams: ["org.apache.arrow.vector.types.TimeUnit.NANOSECOND", "timezone"] }, + { class: "TimeMicro" }, + { class: "TimeNano" } + ] + }, + { + major: "Fixed", + width: 8, + javaType: "ArrowBuf", + boxedType: "ArrowBuf", + minor: [ + { class: "IntervalDay", millisecondsOffset: 4, friendlyType: "Duration", fields: [ {name: "days", type:"int"}, {name: "milliseconds", type:"int"}] } + ] + }, + { + major: "Fixed", + width: 16, + javaType: "ArrowBuf", + boxedType: "ArrowBuf", + minor: [ + { class: "IntervalMonthDayNano", daysOffset: 4, nanosecondsOffset: 8, friendlyType: "PeriodDuration", fields: [ {name: "months", type:"int"}, {name: "days", 
type:"int"}, {name: "nanoseconds", type:"long"}] } + ] + }, + + { + major: "Fixed", + width: 32, + javaType: "ArrowBuf", + boxedType: "ArrowBuf", + + minor: [ + { + class: "Decimal256", + maxPrecisionDigits: 76, nDecimalDigits: 4, friendlyType: "BigDecimal", + typeParams: [ {name: "scale", type: "int"}, { name: "precision", type: "int"}], + arrowType: "org.apache.arrow.vector.types.pojo.ArrowType.Decimal", + fields: [{name: "start", type: "long"}, {name: "buffer", type: "ArrowBuf"}] + } + ] + }, + { + major: "Fixed", + width: 16, + javaType: "ArrowBuf", + boxedType: "ArrowBuf", + + minor: [ + { + class: "Decimal", + maxPrecisionDigits: 38, nDecimalDigits: 4, friendlyType: "BigDecimal", + typeParams: [ {name: "scale", type: "int"}, { name: "precision", type: "int"}], + arrowType: "org.apache.arrow.vector.types.pojo.ArrowType.Decimal", + fields: [{name: "start", type: "long"}, {name: "buffer", type: "ArrowBuf"}] + } + ] + }, + + { + major: "Fixed", + width: -1, + javaType: "byte[]", + boxedType: "ArrowBuf", + minor: [ + { + class: "FixedSizeBinary", + typeParams: [ {name: "byteWidth", type: "int"} ], + arrowType: "org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary", + friendlyType: "byte[]", + fields: [{name: "buffer", type: "ArrowBuf"}], + } + ] + }, + { + major: "VarLen", + width: 4, + javaType: "int", + boxedType: "ArrowBuf", + fields: [{name: "start", type: "int"}, {name: "end", type: "int"}, {name: "buffer", type: "ArrowBuf"}], + minor: [ + { class: "VarBinary" , friendlyType: "byte[]" }, + { class: "VarChar" , friendlyType: "Text" } + ] + }, + { + major: "VarLen", + width: 8, + javaType: "long", + boxedType: "ArrowBuf", + fields: [{name: "start", type: "long"}, {name: "end", type: "long"}, {name: "buffer", type: "ArrowBuf"}], + minor: [ + { class: "LargeVarChar" , friendlyType: "Text" } + { class: "LargeVarBinary" , friendlyType: "byte[]" } + ] + }, + { + major: "Bit", + width: 1, + javaType: "int", + boxedType: "Integer", + minor: [ + { class: "Bit" 
, friendlyType: "Boolean", fields: [{name: "value", type: "int"}] } + ] + } + ] +} diff --git a/src/arrow/java/vector/src/main/codegen/includes/license.ftl b/src/arrow/java/vector/src/main/codegen/includes/license.ftl new file mode 100644 index 000000000..c6a5afeef --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/includes/license.ftl @@ -0,0 +1,16 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ \ No newline at end of file diff --git a/src/arrow/java/vector/src/main/codegen/includes/vv_imports.ftl b/src/arrow/java/vector/src/main/codegen/includes/vv_imports.ftl new file mode 100644 index 000000000..c9a8820b2 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/includes/vv_imports.ftl @@ -0,0 +1,61 @@ +<#-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +import static org.apache.arrow.util.Preconditions.checkArgument; +import static org.apache.arrow.util.Preconditions.checkState; + +import com.google.flatbuffers.FlatBufferBuilder; + +import org.apache.arrow.memory.*; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.*; +import org.apache.arrow.vector.types.pojo.*; +import org.apache.arrow.vector.types.pojo.ArrowType.*; +import org.apache.arrow.vector.types.*; +import org.apache.arrow.vector.*; +import org.apache.arrow.vector.holders.*; +import org.apache.arrow.vector.util.*; +import org.apache.arrow.vector.complex.*; +import org.apache.arrow.vector.complex.reader.*; +import org.apache.arrow.vector.complex.impl.*; +import org.apache.arrow.vector.complex.writer.*; +import org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.util.JsonStringArrayList; + +import java.util.Arrays; +import java.util.Random; +import java.util.List; + +import java.io.Closeable; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.ByteBuffer; + +import java.sql.Date; +import java.sql.Time; +import java.sql.Timestamp; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.time.Duration; +import java.time.LocalDateTime; +import java.time.Period; +import java.time.ZonedDateTime; + + diff --git 
a/src/arrow/java/vector/src/main/codegen/templates/AbstractFieldReader.java b/src/arrow/java/vector/src/main/codegen/templates/AbstractFieldReader.java new file mode 100644 index 000000000..e3c872946 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/AbstractFieldReader.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/AbstractFieldReader.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +/** + * Source code generated using FreeMarker template ${.template_name} + */ +@SuppressWarnings("unused") +abstract class AbstractFieldReader extends AbstractBaseReader implements FieldReader{ + + AbstractFieldReader(){ + super(); + } + + /** + * Returns true if the current value of the reader is not null + * @return whether the current value is set + */ + public boolean isSet() { + return true; + } + + @Override + public Field getField() { + fail("getField"); + return null; + } + + <#list ["Object", "BigDecimal", "Short", "Integer", "Long", "Boolean", + "LocalDateTime", "Duration", "Period", "Double", "Float", + "Character", "Text", "String", "Byte", "byte[]", "PeriodDuration"] as friendlyType> + <#assign safeType=friendlyType /> + <#if safeType=="byte[]"><#assign safeType="ByteArray" /> + public ${friendlyType} read${safeType}(int arrayIndex) { + fail("read${safeType}(int arrayIndex)"); + return null; + } + + public ${friendlyType} read${safeType}() { + fail("read${safeType}()"); + return null; + } + + + public void copyAsValue(StructWriter writer) { + fail("CopyAsValue StructWriter"); + } + + public void copyAsField(String name, StructWriter writer) { + fail("CopyAsField StructWriter"); + } + + public void copyAsField(String name, ListWriter writer) { + fail("CopyAsFieldList"); + } + + public void copyAsField(String name, MapWriter writer) { + fail("CopyAsFieldMap"); + } + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign boxedType = (minor.boxedType!type.boxedType) /> + public void read(${name}Holder holder) { + fail("${name}"); + } + + public void read(Nullable${name}Holder holder) { + fail("${name}"); + } + + public void read(int arrayIndex, 
${name}Holder holder) { + fail("Repeated${name}"); + } + + public void read(int arrayIndex, Nullable${name}Holder holder) { + fail("Repeated${name}"); + } + + public void copyAsValue(${name}Writer writer) { + fail("CopyAsValue${name}"); + } + + public void copyAsField(String name, ${name}Writer writer) { + fail("CopyAsField${name}"); + } + + + public FieldReader reader(String name) { + fail("reader(String name)"); + return null; + } + + public FieldReader reader() { + fail("reader()"); + return null; + } + + public int size() { + fail("size()"); + return -1; + } + + private void fail(String name) { + throw new IllegalArgumentException(String.format("You tried to read a [%s] type when you are using a field reader of type [%s].", name, this.getClass().getSimpleName())); + } +} + + + diff --git a/src/arrow/java/vector/src/main/codegen/templates/AbstractFieldWriter.java b/src/arrow/java/vector/src/main/codegen/templates/AbstractFieldWriter.java new file mode 100644 index 000000000..1f80f2526 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/AbstractFieldWriter.java @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/AbstractFieldWriter.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +/* + * This class is generated using freemarker and the ${.template_name} template. + */ +@SuppressWarnings("unused") +abstract class AbstractFieldWriter extends AbstractBaseWriter implements FieldWriter { + + protected boolean addVectorAsNullable = true; + + /** + * Set flag to control the FieldType.nullable property when a writer creates a new vector. + * If true then vectors created will be nullable, this is the default behavior. If false then + * vectors created will be non-nullable. + * + * @param nullable Whether or not to create nullable vectors (default behavior is true) + */ + public void setAddVectorAsNullable(boolean nullable) { + addVectorAsNullable = nullable; + } + + @Override + public void start() { + throw new IllegalStateException(String.format("You tried to start when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + @Override + public void end() { + throw new IllegalStateException(String.format("You tried to end when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + @Override + public void startList() { + throw new IllegalStateException(String.format("You tried to start a list when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + @Override + public void endList() { + throw new IllegalStateException(String.format("You tried to end a list when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + @Override + public void startMap() { + throw new IllegalStateException(String.format("You tried to start a map when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + @Override + public void endMap() { + throw new 
IllegalStateException(String.format("You tried to end a map when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + @Override + public void startEntry() { + throw new IllegalStateException(String.format("You tried to start a map entry when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + @Override + public MapWriter key() { + throw new IllegalStateException(String.format("You tried to start a map key when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + @Override + public MapWriter value() { + throw new IllegalStateException(String.format("You tried to start a map value when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + @Override + public void endEntry() { + throw new IllegalStateException(String.format("You tried to end a map entry when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> + @Override + public void write(${name}Holder holder) { + fail("${name}"); + } + + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { + fail("${name}"); + } + + <#if minor.class?starts_with("Decimal")> + public void write${minor.class}(${friendlyType} value) { + fail("${name}"); + } + + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, , ArrowType arrowType) { + fail("${name}"); + } + + public void writeBigEndianBytesTo${minor.class}(byte[] value) { + fail("${name}"); + } + + public void writeBigEndianBytesTo${minor.class}(byte[] value, ArrowType arrowType) { + fail("${name}"); + } + + + + + public void writeNull() { + fail("${name}"); + } + + /** + * This implementation returns 
{@code false}. + *
<p>
+ * Must be overridden by struct writers. + *
</p>
+ */ + @Override + public boolean isEmptyStruct() { + return false; + } + + @Override + public StructWriter struct() { + fail("Struct"); + return null; + } + + @Override + public ListWriter list() { + fail("List"); + return null; + } + + @Override + public MapWriter map() { + fail("Map"); + return null; + } + + @Override + public StructWriter struct(String name) { + fail("Struct"); + return null; + } + + @Override + public ListWriter list(String name) { + fail("List"); + return null; + } + + @Override + public MapWriter map(String name) { + fail("Map"); + return null; + } + + @Override + public MapWriter map(boolean keysSorted) { + fail("Map"); + return null; + } + + @Override + public MapWriter map(String name, boolean keysSorted) { + fail("Map"); + return null; + } + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + <#if minor.typeParams?? 
> + + @Override + public ${capName}Writer ${lowerName}(String name<#list minor.typeParams as typeParam>, ${typeParam.type} ${typeParam.name}) { + fail("${capName}(" + <#list minor.typeParams as typeParam>"${typeParam.name}: " + ${typeParam.name} + ", " + ")"); + return null; + } + + + @Override + public ${capName}Writer ${lowerName}(String name) { + fail("${capName}"); + return null; + } + + @Override + public ${capName}Writer ${lowerName}() { + fail("${capName}"); + return null; + } + + + + public void copyReader(FieldReader reader) { + fail("Copy FieldReader"); + } + + public void copyReaderToField(String name, FieldReader reader) { + fail("Copy FieldReader to STring"); + } + + private void fail(String name) { + throw new IllegalArgumentException(String.format("You tried to write a %s type when you are using a ValueWriter of type %s.", name, this.getClass().getSimpleName())); + } +} diff --git a/src/arrow/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java b/src/arrow/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java new file mode 100644 index 000000000..264e85021 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/AbstractPromotableFieldWriter.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +/* + * A FieldWriter which delegates calls to another FieldWriter. The delegate FieldWriter can be promoted to a new type + * when necessary. Classes that extend this class are responsible for handling promotion. + * + * This class is generated using freemarker and the ${.template_name} template. + * + */ +@SuppressWarnings("unused") +abstract class AbstractPromotableFieldWriter extends AbstractFieldWriter { + /** + * Retrieve the FieldWriter, promoting if it is not a FieldWriter of the specified type + * @param type the type of the values we want to write + * @return the corresponding field writer + */ + protected FieldWriter getWriter(MinorType type) { + return getWriter(type, null); + } + + abstract protected FieldWriter getWriter(MinorType type, ArrowType arrowType); + + /** + * @return the current FieldWriter + */ + abstract protected FieldWriter getWriter(); + + @Override + public void start() { + getWriter(MinorType.STRUCT).start(); + } + + @Override + public void end() { + getWriter(MinorType.STRUCT).end(); + setPosition(idx() + 1); + } + + @Override + public void startList() { + getWriter(MinorType.LIST).startList(); + } + + @Override + public void endList() { + getWriter(MinorType.LIST).endList(); + setPosition(idx() + 1); + } + + @Override + public void startMap() { + getWriter(MinorType.MAP).startMap(); + } + + @Override + public void endMap() { + getWriter(MinorType.MAP).endMap(); + setPosition(idx() + 1); + } + + @Override + public void startEntry() { + getWriter(MinorType.MAP).startEntry(); + } + + @Override + public MapWriter key() { + return getWriter(MinorType.MAP).key(); 
+ } + + @Override + public MapWriter value() { + return getWriter(MinorType.MAP).value(); + } + + @Override + public void endEntry() { + getWriter(MinorType.MAP).endEntry(); + } + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#if minor.class != "Decimal" && minor.class != "Decimal256"> + @Override + public void write(${name}Holder holder) { + getWriter(MinorType.${name?upper_case}).write(holder); + } + + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { + getWriter(MinorType.${name?upper_case}).write${minor.class}(<#list fields as field>${field.name}<#if field_has_next>, ); + } + + <#elseif minor.class == "Decimal"> + @Override + public void write(DecimalHolder holder) { + getWriter(MinorType.DECIMAL).write(holder); + } + + public void writeDecimal(int start, ArrowBuf buffer, ArrowType arrowType) { + getWriter(MinorType.DECIMAL).writeDecimal(start, buffer, arrowType); + } + + public void writeDecimal(int start, ArrowBuf buffer) { + getWriter(MinorType.DECIMAL).writeDecimal(start, buffer); + } + + public void writeBigEndianBytesToDecimal(byte[] value, ArrowType arrowType) { + getWriter(MinorType.DECIMAL).writeBigEndianBytesToDecimal(value, arrowType); + } + + public void writeBigEndianBytesToDecimal(byte[] value) { + getWriter(MinorType.DECIMAL).writeBigEndianBytesToDecimal(value); + } + <#elseif minor.class == "Decimal256"> + @Override + public void write(Decimal256Holder holder) { + getWriter(MinorType.DECIMAL256).write(holder); + } + + public void writeDecimal256(long start, ArrowBuf buffer, ArrowType arrowType) { + getWriter(MinorType.DECIMAL256).writeDecimal256(start, buffer, arrowType); + } + + public void writeDecimal256(long start, ArrowBuf buffer) { + getWriter(MinorType.DECIMAL256).writeDecimal256(start, buffer); + } + public void writeBigEndianBytesToDecimal256(byte[] value, ArrowType arrowType) { + 
getWriter(MinorType.DECIMAL256).writeBigEndianBytesToDecimal256(value, arrowType); + } + + public void writeBigEndianBytesToDecimal256(byte[] value) { + getWriter(MinorType.DECIMAL256).writeBigEndianBytesToDecimal256(value); + } + + + + + + public void writeNull() { + } + + @Override + public StructWriter struct() { + return getWriter(MinorType.LIST).struct(); + } + + @Override + public ListWriter list() { + return getWriter(MinorType.LIST).list(); + } + + @Override + public MapWriter map() { + return getWriter(MinorType.LIST).map(); + } + + @Override + public MapWriter map(boolean keysSorted) { + return getWriter(MinorType.MAP, new ArrowType.Map(keysSorted)); + } + + @Override + public StructWriter struct(String name) { + return getWriter(MinorType.STRUCT).struct(name); + } + + @Override + public ListWriter list(String name) { + return getWriter(MinorType.STRUCT).list(name); + } + + @Override + public MapWriter map(String name) { + return getWriter(MinorType.STRUCT).map(name); + } + + @Override + public MapWriter map(String name, boolean keysSorted) { + return getWriter(MinorType.STRUCT).map(name, keysSorted); + } + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + + <#if minor.typeParams?? 
> + @Override + public ${capName}Writer ${lowerName}(String name<#list minor.typeParams as typeParam>, ${typeParam.type} ${typeParam.name}) { + return getWriter(MinorType.STRUCT).${lowerName}(name<#list minor.typeParams as typeParam>, ${typeParam.name}); + } + + + @Override + public ${capName}Writer ${lowerName}(String name) { + return getWriter(MinorType.STRUCT).${lowerName}(name); + } + + @Override + public ${capName}Writer ${lowerName}() { + return getWriter(MinorType.LIST).${lowerName}(); + } + + + + public void copyReader(FieldReader reader) { + getWriter().copyReader(reader); + } + + public void copyReaderToField(String name, FieldReader reader) { + getWriter().copyReaderToField(name, reader); + } +} diff --git a/src/arrow/java/vector/src/main/codegen/templates/ArrowType.java b/src/arrow/java/vector/src/main/codegen/templates/ArrowType.java new file mode 100644 index 000000000..b08d4ad0a --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/ArrowType.java @@ -0,0 +1,375 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/types/pojo/ArrowType.java" /> +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.types.pojo; + +import com.google.flatbuffers.FlatBufferBuilder; + +import java.util.Objects; + +import org.apache.arrow.flatbuf.Type; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.types.*; +import org.apache.arrow.vector.FieldVector; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; + +/** + * Arrow types + * Source code generated using FreeMarker template ${.template_name} + **/ +@JsonTypeInfo( + use = JsonTypeInfo.Id.NAME, + include = JsonTypeInfo.As.PROPERTY, + property = "name") +@JsonSubTypes({ +<#list arrowTypes.types as type> + @JsonSubTypes.Type(value = ArrowType.${type.name?remove_ending("_")}.class, name = "${type.name?remove_ending("_")?lower_case}"), + +}) +public abstract class ArrowType { + + public static abstract class PrimitiveType extends ArrowType { + + private PrimitiveType() { + } + + @Override + public boolean isComplex() { + return false; + } + } + + public static abstract class ComplexType extends ArrowType { + + private ComplexType() { + } + + @Override + public boolean isComplex() { + return true; + } + } + + public static enum ArrowTypeID { + <#list arrowTypes.types as type> + <#assign name = type.name> + ${name?remove_ending("_")}(Type.${name}), + + NONE(Type.NONE); + + private final byte flatbufType; + + public byte getFlatbufID() { + return this.flatbufType; + } + + private ArrowTypeID(byte flatbufType) { + this.flatbufType = flatbufType; + } + } + + @JsonIgnore + public abstract ArrowTypeID getTypeID(); + @JsonIgnore + public abstract boolean isComplex(); + public abstract int 
getType(FlatBufferBuilder builder); + public abstract T accept(ArrowTypeVisitor visitor); + + /** + * to visit the ArrowTypes + * + * type.accept(new ArrowTypeVisitor<Type>() { + * ... + * }); + * + */ + public static interface ArrowTypeVisitor { + <#list arrowTypes.types as type> + T visit(${type.name?remove_ending("_")} type); + + default T visit(ExtensionType type) { + return type.storageType().accept(this); + } + } + + /** + * to visit the Complex ArrowTypes and bundle Primitive ones in one case + */ + public static abstract class ComplexTypeVisitor implements ArrowTypeVisitor { + + public T visit(PrimitiveType type) { + throw new UnsupportedOperationException("Unexpected Primitive type: " + type); + } + + <#list arrowTypes.types as type> + <#if !type.complex> + public final T visit(${type.name?remove_ending("_")} type) { + return visit((PrimitiveType) type); + } + + + } + + /** + * to visit the Primitive ArrowTypes and bundle Complex ones under one case + */ + public static abstract class PrimitiveTypeVisitor implements ArrowTypeVisitor { + + public T visit(ComplexType type) { + throw new UnsupportedOperationException("Unexpected Complex type: " + type); + } + + <#list arrowTypes.types as type> + <#if type.complex> + public final T visit(${type.name?remove_ending("_")} type) { + return visit((ComplexType) type); + } + + + } + + <#list arrowTypes.types as type> + <#assign name = type.name?remove_ending("_")> + <#assign fields = type.fields> + public static class ${name} extends <#if type.complex>ComplexType<#else>PrimitiveType { + public static final ArrowTypeID TYPE_TYPE = ArrowTypeID.${name}; + <#if type.fields?size == 0> + public static final ${name} INSTANCE = new ${name}(); + <#else> + + <#list fields as field> + <#assign fieldType = field.valueType!field.type> + ${fieldType} ${field.name}; + + + + <#if type.name == "Decimal"> + // Needed to support golden file integration tests. 
+ @JsonCreator + public static Decimal createDecimal( + @JsonProperty("precision") int precision, + @JsonProperty("scale") int scale, + @JsonProperty("bitWidth") Integer bitWidth) { + + return new Decimal(precision, scale, bitWidth == null ? 128 : bitWidth); + } + + /** + * Construct Decimal with 128 bits. + * + * This is kept mainly for the sake of backward compatibility. + * Please use {@link org.apache.arrow.vector.types.pojo.ArrowType.Decimal#Decimal(int, int, int)} instead. + * + * @deprecated This API will be removed in a future release. + */ + @Deprecated + public Decimal(int precision, int scale) { + this(precision, scale, 128); + } + + <#else> + @JsonCreator + + public ${type.name}( + <#list type.fields as field> + <#assign fieldType = field.valueType!field.type> + @JsonProperty("${field.name}") ${fieldType} ${field.name}<#if field_has_next>, + + ) { + <#list type.fields as field> + this.${field.name} = ${field.name}; + + } + + <#list fields as field> + <#assign fieldType = field.valueType!field.type> + public ${fieldType} get${field.name?cap_first}() { + return ${field.name}; + } + + + + @Override + public ArrowTypeID getTypeID() { + return TYPE_TYPE; + } + + @Override + public int getType(FlatBufferBuilder builder) { + <#list type.fields as field> + <#if field.type == "String"> + int ${field.name} = this.${field.name} == null ? -1 : builder.createString(this.${field.name}); + + <#if field.type == "int[]"> + int ${field.name} = this.${field.name} == null ? 
-1 : org.apache.arrow.flatbuf.${type.name}.create${field.name?cap_first}Vector(builder, this.${field.name}); + + + org.apache.arrow.flatbuf.${type.name}.start${type.name}(builder); + <#list type.fields as field> + <#if field.type == "String" || field.type == "int[]"> + if (this.${field.name} != null) { + org.apache.arrow.flatbuf.${type.name}.add${field.name?cap_first}(builder, ${field.name}); + } + <#else> + org.apache.arrow.flatbuf.${type.name}.add${field.name?cap_first}(builder, this.${field.name}<#if field.valueType??>.getFlatbufID()); + + + return org.apache.arrow.flatbuf.${type.name}.end${type.name}(builder); + } + + public String toString() { + return "${name}" + <#if fields?size != 0> + + "(" + <#list fields as field> + + <#if field.type == "int[]">java.util.Arrays.toString(${field.name})<#else>${field.name}<#if field_has_next> + ", " + + + ")" + + ; + } + + @Override + public int hashCode() { + return java.util.Arrays.deepHashCode(new Object[] {<#list type.fields as field>${field.name}<#if field_has_next>, }); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof ${name})) { + return false; + } + <#if type.fields?size == 0> + return true; + <#else> + ${type.name} that = (${type.name}) obj; + return <#list type.fields as field>Objects.deepEquals(this.${field.name}, that.${field.name}) <#if field_has_next>&&<#else>; + + + } + + @Override + public T accept(ArrowTypeVisitor visitor) { + return visitor.visit(this); + } + } + + + /** + * A user-defined data type that wraps an underlying storage type. + */ + public abstract static class ExtensionType extends ComplexType { + /** The on-wire type for this user-defined type. */ + public abstract ArrowType storageType(); + /** The name of this user-defined type. Used to identify the type during serialization. */ + public abstract String extensionName(); + /** Check equality of this type to another user-defined type. 
*/ + public abstract boolean extensionEquals(ExtensionType other); + /** Save any metadata for this type. */ + public abstract String serialize(); + /** Given saved metadata and the underlying storage type, construct a new instance of the user type. */ + public abstract ArrowType deserialize(ArrowType storageType, String serializedData); + /** Construct a vector for the user type. */ + public abstract FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator); + + /** The field metadata key storing the name of the extension type. */ + public static final String EXTENSION_METADATA_KEY_NAME = "ARROW:extension:name"; + /** The field metadata key storing metadata for the extension type. */ + public static final String EXTENSION_METADATA_KEY_METADATA = "ARROW:extension:metadata"; + + @Override + public ArrowTypeID getTypeID() { + return storageType().getTypeID(); + } + + @Override + public int getType(FlatBufferBuilder builder) { + return storageType().getType(builder); + } + + public String toString() { + return "ExtensionType(" + extensionName() + ", " + storageType().toString() + ")"; + } + + @Override + public int hashCode() { + return java.util.Arrays.deepHashCode(new Object[] {storageType(), extensionName()}); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof ExtensionType)) { + return false; + } + return this.extensionEquals((ExtensionType) obj); + } + + @Override + public T accept(ArrowTypeVisitor visitor) { + return visitor.visit(this); + } + } + + private static final int defaultDecimalBitWidth = 128; + + public static org.apache.arrow.vector.types.pojo.ArrowType getTypeForField(org.apache.arrow.flatbuf.Field field) { + switch(field.typeType()) { + <#list arrowTypes.types as type> + <#assign name = type.name?remove_ending("_")> + <#assign nameLower = type.name?lower_case> + <#assign fields = type.fields> + case Type.${type.name}: { + org.apache.arrow.flatbuf.${type.name} ${nameLower}Type = 
(org.apache.arrow.flatbuf.${type.name}) field.type(new org.apache.arrow.flatbuf.${type.name}()); + <#list type.fields as field> + <#if field.type == "int[]"> + ${field.type} ${field.name} = new int[${nameLower}Type.${field.name}Length()]; + for (int i = 0; i< ${field.name}.length; ++i) { + ${field.name}[i] = ${nameLower}Type.${field.name}(i); + } + <#else> + ${field.type} ${field.name} = ${nameLower}Type.${field.name}(); + + + <#if type.name == "Decimal"> + if (bitWidth != defaultDecimalBitWidth && bitWidth != 256) { + throw new IllegalArgumentException("Library only supports 128-bit and 256-bit decimal values"); + } + + return new ${name}(<#list type.fields as field><#if field.valueType??>${field.valueType}.fromFlatbufID(${field.name})<#else>${field.name}<#if field_has_next>, ); + } + + default: + throw new UnsupportedOperationException("Unsupported type: " + field.typeType()); + } + } + + public static Int getInt(org.apache.arrow.flatbuf.Field field) { + org.apache.arrow.flatbuf.Int intType = (org.apache.arrow.flatbuf.Int) field.type(new org.apache.arrow.flatbuf.Int()); + return new Int(intType.bitWidth(), intType.isSigned()); + } +} + + diff --git a/src/arrow/java/vector/src/main/codegen/templates/BaseReader.java b/src/arrow/java/vector/src/main/codegen/templates/BaseReader.java new file mode 100644 index 000000000..85d582a53 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/BaseReader.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/reader/BaseReader.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.reader; + +<#include "/@includes/vv_imports.ftl" /> + +/** + * Source code generated using FreeMarker template ${.template_name} + */ +@SuppressWarnings("unused") +public interface BaseReader extends Positionable{ + Field getField(); + MinorType getMinorType(); + void reset(); + void read(UnionHolder holder); + void read(int index, UnionHolder holder); + void copyAsValue(UnionWriter writer); + void read(DenseUnionHolder holder); + void read(int index, DenseUnionHolder holder); + void copyAsValue(DenseUnionWriter writer); + boolean isSet(); + + public interface StructReader extends BaseReader, Iterable{ + FieldReader reader(String name); + } + + public interface RepeatedStructReader extends StructReader{ + boolean next(); + int size(); + void copyAsValue(StructWriter writer); + } + + public interface ListReader extends BaseReader{ + FieldReader reader(); + } + + public interface RepeatedListReader extends ListReader{ + boolean next(); + int size(); + void copyAsValue(ListWriter writer); + } + + public interface MapReader extends BaseReader{ + FieldReader reader(); + } + + public interface RepeatedMapReader extends MapReader{ + boolean next(); + int size(); + void copyAsValue(MapWriter writer); + } + + public interface ScalarReader extends + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> ${name}Reader, + 
BaseReader {} + + interface ComplexReader{ + StructReader rootAsStruct(); + ListReader rootAsList(); + boolean rootIsStruct(); + boolean ok(); + } +} + diff --git a/src/arrow/java/vector/src/main/codegen/templates/BaseWriter.java b/src/arrow/java/vector/src/main/codegen/templates/BaseWriter.java new file mode 100644 index 000000000..4d63fb73e --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/BaseWriter.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/writer/BaseWriter.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.writer; + +<#include "/@includes/vv_imports.ftl" /> + +/* + * File generated from ${.template_name} using FreeMarker. + */ +@SuppressWarnings("unused") +public interface BaseWriter extends AutoCloseable, Positionable { + int getValueCapacity(); + void writeNull(); + + public interface StructWriter extends BaseWriter { + + Field getField(); + + /** + * Whether this writer is a struct writer and is empty (has no children). + * + *

+ * Intended only for use in determining whether to add a dummy vector to + * avoid an empty (zero-column) schema, as in JsonReader. + *

+ * @return whether the struct is empty + */ + boolean isEmptyStruct(); + + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + <#if minor.typeParams?? > + ${capName}Writer ${lowerName}(String name<#list minor.typeParams as typeParam>, ${typeParam.type} ${typeParam.name}); + + ${capName}Writer ${lowerName}(String name); + + + void copyReaderToField(String name, FieldReader reader); + StructWriter struct(String name); + ListWriter list(String name); + MapWriter map(String name); + MapWriter map(String name, boolean keysSorted); + void start(); + void end(); + } + + public interface ListWriter extends BaseWriter { + void startList(); + void endList(); + StructWriter struct(); + ListWriter list(); + MapWriter map(); + MapWriter map(boolean keysSorted); + void copyReader(FieldReader reader); + + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + ${capName}Writer ${lowerName}(); + + } + + public interface MapWriter extends ListWriter { + void startMap(); + void endMap(); + + void startEntry(); + void endEntry(); + + MapWriter key(); + MapWriter value(); + } + + public interface ScalarWriter extends + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> ${name}Writer, BaseWriter {} + + public interface ComplexWriter { + void allocate(); + void clear(); + void copyReader(FieldReader reader); + StructWriter rootAsStruct(); + ListWriter rootAsList(); + + void setPosition(int index); + void setValueCount(int count); + void reset(); + } + + public interface StructOrListWriter { + void start(); + void end(); + StructOrListWriter 
struct(String name); + StructOrListWriter listoftstruct(String name); + StructOrListWriter list(String name); + boolean isStructWriter(); + boolean isListWriter(); + VarCharWriter varChar(String name); + IntWriter integer(String name); + BigIntWriter bigInt(String name); + Float4Writer float4(String name); + Float8Writer float8(String name); + BitWriter bit(String name); + VarBinaryWriter binary(String name); + } +} diff --git a/src/arrow/java/vector/src/main/codegen/templates/CaseSensitiveStructWriters.java b/src/arrow/java/vector/src/main/codegen/templates/CaseSensitiveStructWriters.java new file mode 100644 index 000000000..cc0dd7b33 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/CaseSensitiveStructWriters.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +<@pp.dropOutputFile /> +<#list ["Nullable", "Single"] as mode> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${mode}CaseSensitiveStructWriter.java" /> +<#assign index = "idx()"> +<#if mode == "Single"> +<#assign containerClass = "NonNullableStructVector" /> +<#else> +<#assign containerClass = "StructVector" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> +/* + * This class is generated using FreeMarker and the ${.template_name} template. + * + * One output class is emitted per mode in the list above (Nullable, Single). Each one + * extends the struct writer of the same mode and differs from it only in the two + * overrides below: field names are passed through unchanged, and the case-sensitive + * writer factory is used. + */ +@SuppressWarnings("unused") +public class ${mode}CaseSensitiveStructWriter extends ${mode}StructWriter { + /** Creates a writer over the given container; the container is handed straight to the superclass. */ public ${mode}CaseSensitiveStructWriter(${containerClass} container) { + super(container); + } + + /** Returns the input name exactly as given, without any case conversion. */ @Override + protected String handleCase(final String input){ + return input; + } + + /** Returns the case-sensitive writer factory instance provided by NullableStructWriterFactory. */ @Override + protected NullableStructWriterFactory getNullableStructWriterFactory() { + return NullableStructWriterFactory.getNullableCaseSensitiveStructWriterFactoryInstance(); + } + +} + diff --git a/src/arrow/java/vector/src/main/codegen/templates/ComplexCopier.java b/src/arrow/java/vector/src/main/codegen/templates/ComplexCopier.java new file mode 100644 index 000000000..39a84041e --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/ComplexCopier.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.impl.UnionMapReader; +import org.apache.arrow.vector.complex.impl.UnionMapWriter; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.types.Types; + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/ComplexCopier.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +/* + * This class is generated using freemarker and the ${.template_name} template. 
+ */ +@SuppressWarnings("unused") +public class ComplexCopier { + + /** + * Do a deep copy of the value in input into output + * @param input field to read from + * @param output field to write to + */ + public static void copy(FieldReader input, FieldWriter output) { + writeValue(input, output); + } + + private static void writeValue(FieldReader reader, FieldWriter writer) { + final MinorType mt = reader.getMinorType(); + + switch (mt) { + + case LIST: + case LARGELIST: + case FIXED_SIZE_LIST: + if (reader.isSet()) { + writer.startList(); + while (reader.next()) { + FieldReader childReader = reader.reader(); + FieldWriter childWriter = getListWriterForReader(childReader, writer); + if (childReader.isSet()) { + writeValue(childReader, childWriter); + } else { + childWriter.writeNull(); + } + } + writer.endList(); + } else { + writer.writeNull(); + } + break; + case MAP: + if (reader.isSet()) { + UnionMapWriter mapWriter = (UnionMapWriter) writer; + UnionMapReader mapReader = (UnionMapReader) reader; + + mapWriter.startMap(); + while (mapReader.next()) { + FieldReader structReader = reader.reader(); + UnionMapWriter structWriter = (UnionMapWriter) writer.struct(); + if (structReader.isSet()) { + mapWriter.startEntry(); + writeValue(mapReader.key(), getStructWriterForReader(mapReader.key(), structWriter.key(), MapVector.KEY_NAME)); + writeValue(mapReader.value(), getStructWriterForReader(mapReader.value(), structWriter.value(), MapVector.VALUE_NAME)); + mapWriter.endEntry(); + } else { + structWriter.writeNull(); + } + } + mapWriter.endMap(); + } else { + writer.writeNull(); + } + break; + case STRUCT: + if (reader.isSet()) { + writer.start(); + for(String name : reader){ + FieldReader childReader = reader.reader(name); + if (childReader.getMinorType() != Types.MinorType.NULL) { + FieldWriter childWriter = getStructWriterForReader(childReader, writer, name); + if (childReader.isSet()) { + writeValue(childReader, childWriter); + } else { + childWriter.writeNull(); + } 
+ } + } + writer.end(); + } else { + writer.writeNull(); + } + break; + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + + <#if !minor.typeParams?? || minor.class?starts_with("Decimal") > + + case ${name?upper_case}: + if (reader.isSet()) { + Nullable${name}Holder ${uncappedName}Holder = new Nullable${name}Holder(); + reader.read(${uncappedName}Holder); + if (${uncappedName}Holder.isSet == 1) { + writer.write${name}(<#list fields as field>${uncappedName}Holder.${field.name}<#if field_has_next>, <#if minor.class?starts_with("Decimal")>, new ArrowType.Decimal(${uncappedName}Holder.precision, ${uncappedName}Holder.scale, ${name}Holder.WIDTH * 8)); + } + } else { + writer.writeNull(); + } + break; + + + + } + } + + private static FieldWriter getStructWriterForReader(FieldReader reader, StructWriter writer, String name) { + switch (reader.getMinorType()) { + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.typeParams??> + case ${name?upper_case}: + return (FieldWriter) writer.<#if name == "Int">integer<#else>${uncappedName}(name); + + <#if minor.class?starts_with("Decimal")> + case ${name?upper_case}: + if (reader.getField().getType() instanceof ArrowType.Decimal) { + ArrowType.Decimal type = (ArrowType.Decimal) reader.getField().getType(); + return (FieldWriter) writer.${uncappedName}(name, type.getScale(), type.getPrecision()); + } else { + return (FieldWriter) writer.${uncappedName}(name); + } + + + + case STRUCT: + return (FieldWriter) writer.struct(name); + case FIXED_SIZE_LIST: + case LIST: + case MAP: + return (FieldWriter) writer.list(name); + default: + throw new UnsupportedOperationException(reader.getMinorType().toString()); + } + } + + private static FieldWriter 
getListWriterForReader(FieldReader reader, ListWriter writer) { + switch (reader.getMinorType()) { + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.typeParams?? || minor.class?starts_with("Decimal") > + case ${name?upper_case}: + return (FieldWriter) writer.<#if name == "Int">integer<#else>${uncappedName}(); + + + case STRUCT: + return (FieldWriter) writer.struct(); + case FIXED_SIZE_LIST: + case LIST: + case MAP: + case NULL: + return (FieldWriter) writer.list(); + default: + throw new UnsupportedOperationException(reader.getMinorType().toString()); + } + } +} diff --git a/src/arrow/java/vector/src/main/codegen/templates/ComplexReaders.java b/src/arrow/java/vector/src/main/codegen/templates/ComplexReaders.java new file mode 100644 index 000000000..48fb6603a --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/ComplexReaders.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.lang.Override; +import java.util.List; + +import org.apache.arrow.record.TransferPair; +import org.apache.arrow.vector.complex.IndexHolder; +import org.apache.arrow.vector.complex.writer.IntervalWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter; + +<@pp.dropOutputFile /> +<#list vv.types as type> +<#list type.minor as minor> +<#list [""] as mode> +<#assign lowerName = minor.class?uncap_first /> +<#if lowerName == "int" ><#assign lowerName = "integer" /> +<#assign name = minor.class?cap_first /> +<#assign javaType = (minor.javaType!type.javaType) /> +<#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> +<#assign safeType=friendlyType /> +<#if safeType=="byte[]"><#assign safeType="ByteArray" /> + +<#assign hasFriendly = minor.friendlyType!"no" == "no" /> + +<#list ["Nullable"] as nullMode> +<#if mode == "" > +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${name}ReaderImpl.java" /> +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +/** + * Source code generated using FreeMarker template ${.template_name} + */ +@SuppressWarnings("unused") +public class ${name}ReaderImpl extends AbstractFieldReader { + + private final ${name}Vector vector; + + public ${name}ReaderImpl(${name}Vector vector){ + super(); + this.vector = vector; + } + + public MinorType getMinorType(){ + return vector.getMinorType(); + } + + public Field getField(){ + return vector.getField(); + } + + public boolean isSet(){ + return !vector.isNull(idx()); + } + + public void copyAsValue(${minor.class?cap_first}Writer writer){ + ${minor.class?cap_first}WriterImpl impl = (${minor.class?cap_first}WriterImpl) writer; + impl.vector.copyFromSafe(idx(), impl.idx(), vector); + } + + public void copyAsField(String name, StructWriter writer){ + ${minor.class?cap_first}WriterImpl impl = (${minor.class?cap_first}WriterImpl) 
writer.${lowerName}(name); + impl.vector.copyFromSafe(idx(), impl.idx(), vector); + } + + <#if nullMode != "Nullable"> + public void read(${minor.class?cap_first}Holder h){ + vector.get(idx(), h); + } + + + public void read(Nullable${minor.class?cap_first}Holder h){ + vector.get(idx(), h); + } + + public ${friendlyType} read${safeType}(){ + return vector.getObject(idx()); + } + + <#if minor.class == "TimeStampSec" || + minor.class == "TimeStampMilli" || + minor.class == "TimeStampMicro" || + minor.class == "TimeStampNano"> + @Override + public ${minor.boxedType} read${minor.boxedType}(){ + return vector.get(idx()); + } + + + public void copyValue(FieldWriter w){ + + } + + public Object readObject(){ + return (Object)vector.getObject(idx()); + } +} + + +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/reader/${name}Reader.java" /> +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.reader; + +<#include "/@includes/vv_imports.ftl" /> +/** + * Source code generated using FreeMarker template ${.template_name} + */ +@SuppressWarnings("unused") +public interface ${name}Reader extends BaseReader{ + + public void read(${minor.class?cap_first}Holder h); + public void read(Nullable${minor.class?cap_first}Holder h); + public Object readObject(); + // read friendly type + public ${friendlyType} read${safeType}(); + public boolean isSet(); + public void copyAsValue(${minor.class}Writer writer); + public void copyAsField(String name, ${minor.class}Writer writer); + +} + + + + + + + + diff --git a/src/arrow/java/vector/src/main/codegen/templates/ComplexWriters.java b/src/arrow/java/vector/src/main/codegen/templates/ComplexWriters.java new file mode 100644 index 000000000..0381e5559 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/ComplexWriters.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<@pp.dropOutputFile /> +<#list vv.types as type> +<#list type.minor as minor> +<#list ["Nullable"] as mode> +<#assign name = minor.class?cap_first /> +<#assign eName = name /> +<#assign javaType = (minor.javaType!type.javaType) /> +<#assign fields = minor.fields!type.fields /> +<#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> + +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${eName}WriterImpl.java" /> +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +/* + * This class is generated using FreeMarker on the ${.template_name} template. 
+ */ +@SuppressWarnings("unused") +public class ${eName}WriterImpl extends AbstractFieldWriter { + + final ${name}Vector vector; + + public ${eName}WriterImpl(${name}Vector vector) { + this.vector = vector; + } + + @Override + public Field getField() { + return vector.getField(); + } + + @Override + public int getValueCapacity() { + return vector.getValueCapacity(); + } + + @Override + public void allocate() { + vector.allocateNew(); + } + + @Override + public void close() { + vector.close(); + } + + @Override + public void clear() { + vector.clear(); + } + + @Override + protected int idx() { + return super.idx(); + } + + <#if mode == "Repeated"> + + public void write(${minor.class?cap_first}Holder h) { + mutator.addSafe(idx(), h); + vector.setValueCount(idx()+1); + } + + public void write(${minor.class?cap_first}Holder h) { + mutator.addSafe(idx(), h); + vector.setValueCount(idx()+1); + } + + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { + mutator.addSafe(idx(), <#list fields as field>${field.name}<#if field_has_next>, ); + vector.setValueCount(idx()+1); + } + + public void setPosition(int idx) { + super.setPosition(idx); + mutator.startNewValue(idx); + } + + + <#else> + + <#if !minor.class?starts_with("Decimal")> + public void write(${minor.class}Holder h) { + vector.setSafe(idx(), h); + vector.setValueCount(idx()+1); + } + + public void write(Nullable${minor.class}Holder h) { + vector.setSafe(idx(), h); + vector.setValueCount(idx()+1); + } + + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { + vector.setSafe(idx(), 1<#list fields as field><#if field.include!true >, ${field.name}); + vector.setValueCount(idx()+1); + } + + + <#if minor.class == "VarChar"> + public void write${minor.class}(${friendlyType} value) { + vector.setSafe(idx(), value); + vector.setValueCount(idx()+1); + } + + + <#if minor.class?starts_with("Decimal")> + + public void 
write(${minor.class}Holder h){ + DecimalUtility.checkPrecisionAndScale(h.precision, h.scale, vector.getPrecision(), vector.getScale()); + vector.setSafe(idx(), h); + vector.setValueCount(idx() + 1); + } + + public void write(Nullable${minor.class}Holder h){ + if (h.isSet == 1) { + DecimalUtility.checkPrecisionAndScale(h.precision, h.scale, vector.getPrecision(), vector.getScale()); + } + vector.setSafe(idx(), h); + vector.setValueCount(idx() + 1); + } + + public void write${minor.class}(long start, ArrowBuf buffer){ + vector.setSafe(idx(), 1, start, buffer); + vector.setValueCount(idx() + 1); + } + + public void write${minor.class}(long start, ArrowBuf buffer, ArrowType arrowType){ + DecimalUtility.checkPrecisionAndScale(((ArrowType.Decimal) arrowType).getPrecision(), + ((ArrowType.Decimal) arrowType).getScale(), vector.getPrecision(), vector.getScale()); + vector.setSafe(idx(), 1, start, buffer); + vector.setValueCount(idx() + 1); + } + + public void write${minor.class}(BigDecimal value){ + // vector.setSafe already does precision and scale checking + vector.setSafe(idx(), value); + vector.setValueCount(idx() + 1); + } + + public void writeBigEndianBytesTo${minor.class}(byte[] value, ArrowType arrowType){ + DecimalUtility.checkPrecisionAndScale(((ArrowType.Decimal) arrowType).getPrecision(), + ((ArrowType.Decimal) arrowType).getScale(), vector.getPrecision(), vector.getScale()); + vector.setBigEndianSafe(idx(), value); + vector.setValueCount(idx() + 1); + } + + public void writeBigEndianBytesTo${minor.class}(byte[] value){ + vector.setBigEndianSafe(idx(), value); + vector.setValueCount(idx() + 1); + } + + + + public void writeNull() { + vector.setNull(idx()); + vector.setValueCount(idx()+1); + } + +} + +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/writer/${eName}Writer.java" /> +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.writer; + +<#include "/@includes/vv_imports.ftl" /> +/* + * This class is generated using 
FreeMarker on the ${.template_name} template. + */ +@SuppressWarnings("unused") +public interface ${eName}Writer extends BaseWriter { + public void write(${minor.class}Holder h); + + <#if minor.class?starts_with("Decimal")>@Deprecated + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ); +<#if minor.class?starts_with("Decimal")> + + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, , ArrowType arrowType); + + public void write${minor.class}(${friendlyType} value); + + public void writeBigEndianBytesTo${minor.class}(byte[] value, ArrowType arrowType); + + @Deprecated + public void writeBigEndianBytesTo${minor.class}(byte[] value); + +} + + + + diff --git a/src/arrow/java/vector/src/main/codegen/templates/DenseUnionReader.java b/src/arrow/java/vector/src/main/codegen/templates/DenseUnionReader.java new file mode 100644 index 000000000..a085e03ea --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/DenseUnionReader.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +import org.apache.arrow.vector.complex.impl.UnionListReader; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/DenseUnionReader.java" /> + + +<#include "/@includes/license.ftl" /> + + package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> +/** + * Source code generated using FreeMarker template ${.template_name} + */ +@SuppressWarnings("unused") +public class DenseUnionReader extends AbstractFieldReader { + + private BaseReader[] readers = new BaseReader[Byte.MAX_VALUE + 1]; + public DenseUnionVector data; + + public DenseUnionReader(DenseUnionVector data) { + this.data = data; + } + + public MinorType getMinorType() { + byte typeId = data.getTypeId(idx()); + return data.getVectorByType(typeId).getMinorType(); + } + + public byte getTypeId() { + return data.getTypeId(idx()); + } + + @Override + public Field getField() { + return data.getField(); + } + + public boolean isSet(){ + return !data.isNull(idx()); + } + + public void read(DenseUnionHolder holder) { + holder.reader = this; + holder.isSet = this.isSet() ? 
1 : 0; + holder.typeId = getTypeId(); + } + + public void read(int index, UnionHolder holder) { + byte typeId = data.getTypeId(index); + getList(typeId).read(index, holder); + } + + /** + * Returns the reader for the child vector holding the value at {@code index}. + * + * A reader already cached in {@code readers} for that value's type id is returned as is; + * otherwise the reader is chosen from the child vector's minor type. + * + * @param index position in this union vector + * @return the reader selected by the type id stored at {@code index} + * @throws UnsupportedOperationException if the child's minor type has no case below + */ + private FieldReader getReaderForIndex(int index) { + byte typeId = data.getTypeId(index); + MinorType minorType = data.getVectorByType(typeId).getMinorType(); + FieldReader reader = (FieldReader) readers[typeId]; + if (reader != null) { + return reader; + } + switch (minorType) { + case NULL: + reader = NullReader.INSTANCE; + break; + case STRUCT: + reader = (FieldReader) getStruct(typeId); + break; + case LIST: + reader = (FieldReader) getList(typeId); + break; + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.typeParams?? || minor.class?starts_with("Decimal")> + case ${name?upper_case}: + reader = (FieldReader) get${name}(typeId); + break; + </#if> + </#list> + </#list> + default: + /* Report the minor type itself: typeId is a union type id, not an index into MinorType.values(). */ + throw new UnsupportedOperationException("Unsupported type: " + minorType); + } + return reader; + } + + private SingleStructReaderImpl structReader; + + /* NOTE(review): a newly created child reader is positioned at idx(), while setPosition() positions cached readers at data.getOffset(index) - confirm this is intended. */ + private StructReader getStruct(byte typeId) { + StructReader structReader = (StructReader) readers[typeId]; + if (structReader == null) { + structReader = (SingleStructReaderImpl) data.getVectorByType(typeId).getReader(); + structReader.setPosition(idx()); + readers[typeId] = structReader; + } + return structReader; + } + + private UnionListReader listReader; + + private FieldReader getList(byte typeId) { + UnionListReader listReader = (UnionListReader) readers[typeId]; + if (listReader == null) { + listReader = new UnionListReader((ListVector) data.getVectorByType(typeId)); + listReader.setPosition(idx()); + readers[typeId] = listReader; + } + return listReader; + } + + private UnionMapReader mapReader; + + private FieldReader getMap(byte typeId) { + UnionMapReader mapReader = (UnionMapReader) readers[typeId]; + if (mapReader == null) { + mapReader = new
UnionMapReader((MapVector) data.getVectorByType(typeId)); + mapReader.setPosition(idx()); + readers[typeId] = mapReader; + } + return mapReader; + } + + @Override + public java.util.Iterator iterator() { + throw new UnsupportedOperationException(); + } + + @Override + public void copyAsValue(UnionWriter writer) { + writer.data.copyFrom(idx(), writer.idx(), data); + } + + <#list ["Object", "BigDecimal", "Short", "Integer", "Long", "Boolean", + "LocalDateTime", "Duration", "Period", "Double", "Float", + "Character", "Text", "Byte", "byte[]", "PeriodDuration"] as friendlyType> + <#assign safeType=friendlyType /> + <#if safeType=="byte[]"><#assign safeType="ByteArray" /> + + @Override + public ${friendlyType} read${safeType}() { + return getReaderForIndex(idx()).read${safeType}(); + } + + + + public int size() { + return getReaderForIndex(idx()).size(); + } + + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign uncappedName = name?uncap_first/> + <#assign boxedType = (minor.boxedType!type.boxedType) /> + <#assign javaType = (minor.javaType!type.javaType) /> + <#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> + <#assign safeType=friendlyType /> + <#if safeType=="byte[]"><#assign safeType="ByteArray" /> + <#if !minor.typeParams?? 
|| minor.class?starts_with("Decimal")> + + /** Returns the typed reader cached in readers for typeId, creating and caching it on first use. */ private ${name}ReaderImpl get${name}(byte typeId) { + ${name}ReaderImpl reader = (${name}ReaderImpl) readers[typeId]; + if (reader == null) { + reader = new ${name}ReaderImpl((${name}Vector) data.getVectorByType(typeId)); + reader.setPosition(idx()); + readers[typeId] = reader; + } + return reader; + } + + /** Reads the value at the current position into holder, delegating to the reader for that position. */ public void read(Nullable${name}Holder holder){ + getReaderForIndex(idx()).read(holder); + } + + /** Copies the value at the current position to writer, delegating to the reader for that position. */ public void copyAsValue(${name}Writer writer){ + getReaderForIndex(idx()).copyAsValue(writer); + } + + + + + /** Copies the current value into writer through ComplexCopier.copy. */ @Override + public void copyAsValue(ListWriter writer) { + ComplexCopier.copy(this, (FieldWriter) writer); + } + + /** + * Moves this reader to index. When a child reader is already cached for the type id + * stored at index, that child is positioned at data.getOffset(index); child readers + * that have not been created yet are left untouched. + */ @Override + public void setPosition(int index) { + super.setPosition(index); + byte typeId = data.getTypeId(index); + if (readers[typeId] != null) { /* only an already-created child reader is repositioned */ + int offset = data.getOffset(index); + readers[typeId].setPosition(offset); + } + } + + /** Returns the reader for the named child of the struct reader selected by typeId. */ public FieldReader reader(byte typeId, String name){ + return getStruct(typeId).reader(name); + } + + /** Returns the element reader of the list reader selected by typeId. */ public FieldReader reader(byte typeId) { + return getList(typeId).reader(); + } + + /** Delegates next() to the reader for the current position. */ public boolean next() { + return getReaderForIndex(idx()).next(); + } +} diff --git a/src/arrow/java/vector/src/main/codegen/templates/DenseUnionVector.java b/src/arrow/java/vector/src/main/codegen/templates/DenseUnionVector.java new file mode 100644 index 000000000..63f4f5876 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/DenseUnionVector.java @@ -0,0 +1,943 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.ReferenceManager; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.AbstractStructVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.UnionMode; +import org.apache.arrow.vector.compare.RangeEqualsVisitor; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.DataSizeRoundingUtil; +import org.apache.arrow.vector.util.TransferPair; + +import java.util.Arrays; +import java.util.stream.Collectors; + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/DenseUnionVector.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex; + +<#include "/@includes/vv_imports.ftl" /> +import java.util.ArrayList; +import java.util.Collections; +import 
java.util.Iterator; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.memory.util.hash.SimpleHasher; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.impl.ComplexCopier; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.util.Preconditions; + +import static org.apache.arrow.vector.types.UnionMode.Dense; + + + +/* + * This class is generated using freemarker and the ${.template_name} template. + */ +@SuppressWarnings("unused") + + +/** + * A vector which can hold values of different types. It does so by using a StructVector which contains a vector for each + * primitive type that is stored. StructVector is used in order to take advantage of its serialization/deserialization methods, + * as well as the addOrGet method. + * + * For performance reasons, DenseUnionVector stores a cached reference to each subtype vector, to avoid having to do the struct lookup + * each time the vector is accessed. + * Source code generated using FreeMarker template ${.template_name} + */ +public class DenseUnionVector extends AbstractContainerVector implements FieldVector { + int valueCount; + + NonNullableStructVector internalStruct; + private ArrowBuf typeBuffer; + private ArrowBuf offsetBuffer; + + /** + * The key is type Id, and the value is vector. + */ + private ValueVector[] childVectors = new ValueVector[Byte.MAX_VALUE + 1]; + + /** + * The index is the type id, and the value is the type field. + */ + private Field[] typeFields = new Field[Byte.MAX_VALUE + 1]; + /** + * The index is the index into the typeFields array, and the value is the logical field id. 
+ */ + private byte[] typeMapFields = new byte[Byte.MAX_VALUE + 1]; + + /** + * The next type id to allocate. + */ + private byte nextTypeId = 0; + + private FieldReader reader; + + private long typeBufferAllocationSizeInBytes; + private long offsetBufferAllocationSizeInBytes; + + private final FieldType fieldType; + + public static final byte TYPE_WIDTH = 1; + public static final byte OFFSET_WIDTH = 4; + + private static final FieldType INTERNAL_STRUCT_TYPE = new FieldType(/*nullable*/ false, + ArrowType.Struct.INSTANCE, /*dictionary*/ null, /*metadata*/ null); + + public static DenseUnionVector empty(String name, BufferAllocator allocator) { + FieldType fieldType = FieldType.nullable(new ArrowType.Union( + UnionMode.Dense, null)); + return new DenseUnionVector(name, allocator, fieldType, null); + } + + public DenseUnionVector(String name, BufferAllocator allocator, FieldType fieldType, CallBack callBack) { + super(name, allocator, callBack); + this.fieldType = fieldType; + this.internalStruct = new NonNullableStructVector( + "internal", + allocator, + INTERNAL_STRUCT_TYPE, + callBack, + AbstractStructVector.ConflictPolicy.CONFLICT_REPLACE, + false); + this.typeBuffer = allocator.getEmpty(); + this.typeBufferAllocationSizeInBytes = BaseValueVector.INITIAL_VALUE_ALLOCATION * TYPE_WIDTH; + this.offsetBuffer = allocator.getEmpty(); + this.offsetBufferAllocationSizeInBytes = BaseValueVector.INITIAL_VALUE_ALLOCATION * OFFSET_WIDTH; + } + + public BufferAllocator getAllocator() { + return allocator; + } + + @Override + public MinorType getMinorType() { + return MinorType.DENSEUNION; + } + + @Override + public void initializeChildrenFromFields(List children) { + for (Field field : children) { + byte typeId = registerNewTypeId(field); + FieldVector vector = (FieldVector) internalStruct.add(field.getName(), field.getFieldType()); + vector.initializeChildrenFromFields(field.getChildren()); + childVectors[typeId] = vector; + } + } + + @Override + public List 
getChildrenFromFields() { + return internalStruct.getChildrenFromFields(); + } + + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + if (ownBuffers.size() != 2) { + throw new IllegalArgumentException("Illegal buffer count for dense union with type " + getField().getFieldType() + + ", expected " + 2 + ", got: " + ownBuffers.size()); + } + + ArrowBuf buffer = ownBuffers.get(0); + typeBuffer.getReferenceManager().release(); + typeBuffer = buffer.getReferenceManager().retain(buffer, allocator); + typeBufferAllocationSizeInBytes = typeBuffer.capacity(); + + buffer = ownBuffers.get(1); + offsetBuffer.getReferenceManager().release(); + offsetBuffer = buffer.getReferenceManager().retain(buffer, allocator); + offsetBufferAllocationSizeInBytes = offsetBuffer.capacity(); + + this.valueCount = fieldNode.getLength(); + } + + @Override + public List getFieldBuffers() { + List result = new ArrayList<>(2); + setReaderAndWriterIndex(); + result.add(typeBuffer); + result.add(offsetBuffer); + + return result; + } + + private void setReaderAndWriterIndex() { + typeBuffer.readerIndex(0); + typeBuffer.writerIndex(valueCount * TYPE_WIDTH); + + offsetBuffer.readerIndex(0); + offsetBuffer.writerIndex((long) valueCount * OFFSET_WIDTH); + } + + /** + * Get the inner vectors. + * + * @deprecated This API will be removed as the current implementations no longer support inner vectors. + * + * @return the inner vectors for this field as defined by the TypeLayout + */ + @Override + @Deprecated + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. 
Use geFieldBuffers"); + } + + private String fieldName(byte typeId, MinorType type) { + return type.name().toLowerCase() + typeId; + } + + private FieldType fieldType(MinorType type) { + return FieldType.nullable(type.getType()); + } + + public synchronized byte registerNewTypeId(Field field) { + if (nextTypeId == typeFields.length) { + throw new IllegalStateException("Dense union vector support at most " + + typeFields.length + " relative types. Please use union of union instead"); + } + byte typeId = nextTypeId; + if (fieldType != null) { + int[] typeIds = ((ArrowType.Union) fieldType.getType()).getTypeIds(); + if (typeIds != null) { + int thisTypeId = typeIds[nextTypeId]; + if (thisTypeId > Byte.MAX_VALUE) { + throw new IllegalStateException("Dense union vector types must be bytes. " + thisTypeId + " is too large"); + } + typeId = (byte) thisTypeId; + } + } + typeFields[typeId] = field; + typeMapFields[nextTypeId] = typeId; + this.nextTypeId += 1; + return typeId; + } + + private T addOrGet(byte typeId, MinorType minorType, Class c) { + return internalStruct.addOrGet(fieldName(typeId, minorType), fieldType(minorType), c); + } + + private T addOrGet(byte typeId, MinorType minorType, ArrowType arrowType, Class c) { + return internalStruct.addOrGet(fieldName(typeId, minorType), FieldType.nullable(arrowType), c); + } + + @Override + public long getOffsetBufferAddress() { + return offsetBuffer.memoryAddress(); + } + + @Override + public long getDataBufferAddress() { + throw new UnsupportedOperationException(); + } + + @Override + public long getValidityBufferAddress() { + throw new UnsupportedOperationException(); + } + + @Override + public ArrowBuf getValidityBuffer() { throw new UnsupportedOperationException(); } + + @Override + public ArrowBuf getOffsetBuffer() { return offsetBuffer; } + + public ArrowBuf getTypeBuffer() { return typeBuffer; } + + @Override + public ArrowBuf getDataBuffer() { throw new UnsupportedOperationException(); } + + public StructVector 
getStruct(byte typeId) { + StructVector structVector = typeId < 0 ? null : (StructVector) childVectors[typeId]; + if (structVector == null) { + int vectorCount = internalStruct.size(); + structVector = addOrGet(typeId, MinorType.STRUCT, StructVector.class); + if (internalStruct.size() > vectorCount) { + structVector.allocateNew(); + childVectors[typeId] = structVector; + if (callBack != null) { + callBack.doWork(); + } + } + } + return structVector; + } + + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#assign lowerCaseName = name?lower_case/> + <#if !minor.typeParams?? || minor.class?starts_with("Decimal")> + + public ${name}Vector get${name}Vector(byte typeId<#if minor.class?starts_with("Decimal")>, ArrowType arrowType) { + ValueVector vector = typeId < 0 ? null : childVectors[typeId]; + if (vector == null) { + int vectorCount = internalStruct.size(); + vector = addOrGet(typeId, MinorType.${name?upper_case}<#if minor.class?starts_with("Decimal")>, arrowType, ${name}Vector.class); + childVectors[typeId] = vector; + if (internalStruct.size() > vectorCount) { + vector.allocateNew(); + if (callBack != null) { + callBack.doWork(); + } + } + } + return (${name}Vector) vector; + } + + + + + public ListVector getList(byte typeId) { + ListVector listVector = typeId < 0 ? null : (ListVector) childVectors[typeId]; + if (listVector == null) { + int vectorCount = internalStruct.size(); + listVector = addOrGet(typeId, MinorType.LIST, ListVector.class); + if (internalStruct.size() > vectorCount) { + listVector.allocateNew(); + childVectors[typeId] = listVector; + if (callBack != null) { + callBack.doWork(); + } + } + } + return listVector; + } + + public MapVector getMap(byte typeId) { + MapVector mapVector = typeId < 0 ? 
null : (MapVector) childVectors[typeId]; + if (mapVector == null) { + int vectorCount = internalStruct.size(); + mapVector = addOrGet(typeId, MinorType.MAP, MapVector.class); + if (internalStruct.size() > vectorCount) { + mapVector.allocateNew(); + childVectors[typeId] = mapVector; + if (callBack != null) { + callBack.doWork(); + } + } + } + return mapVector; + } + + public byte getTypeId(int index) { + return typeBuffer.getByte(index * TYPE_WIDTH); + } + + public ValueVector getVectorByType(byte typeId) { + return typeId < 0 ? null : childVectors[typeId]; + } + + @Override + public void allocateNew() throws OutOfMemoryException { + /* new allocation -- clear the current buffers */ + clear(); + internalStruct.allocateNew(); + try { + allocateTypeBuffer(); + allocateOffsetBuffer(); + } catch (Exception e) { + clear(); + throw e; + } + } + + @Override + public boolean allocateNewSafe() { + /* new allocation -- clear the current buffers */ + clear(); + boolean safe = internalStruct.allocateNewSafe(); + if (!safe) { return false; } + try { + allocateTypeBuffer(); + allocateOffsetBuffer(); + } catch (Exception e) { + clear(); + return false; + } + + return true; + } + + private void allocateTypeBuffer() { + typeBuffer = allocator.buffer(typeBufferAllocationSizeInBytes); + typeBuffer.readerIndex(0); + setNegative(0, typeBuffer.capacity()); + } + + private void allocateOffsetBuffer() { + offsetBuffer = allocator.buffer(offsetBufferAllocationSizeInBytes); + offsetBuffer.readerIndex(0); + offsetBuffer.setZero(0, offsetBuffer.capacity()); + } + + + @Override + public void reAlloc() { + internalStruct.reAlloc(); + reallocTypeBuffer(); + reallocOffsetBuffer(); + } + + public int getOffset(int index) { + return offsetBuffer.getInt((long) index * OFFSET_WIDTH); + } + + private void reallocTypeBuffer() { + final long currentBufferCapacity = typeBuffer.capacity(); + long newAllocationSize = currentBufferCapacity * 2; + if (newAllocationSize == 0) { + if 
(typeBufferAllocationSizeInBytes > 0) { + newAllocationSize = typeBufferAllocationSizeInBytes; + } else { + newAllocationSize = BaseValueVector.INITIAL_VALUE_ALLOCATION * TYPE_WIDTH * 2; + } + } + + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + assert newAllocationSize >= 1; + + if (newAllocationSize > BaseValueVector.MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer((int)newAllocationSize); + newBuf.setBytes(0, typeBuffer, 0, currentBufferCapacity); + typeBuffer.getReferenceManager().release(1); + typeBuffer = newBuf; + typeBufferAllocationSizeInBytes = (int)newAllocationSize; + setNegative(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + } + + private void reallocOffsetBuffer() { + final long currentBufferCapacity = offsetBuffer.capacity(); + long newAllocationSize = currentBufferCapacity * 2; + if (newAllocationSize == 0) { + if (offsetBufferAllocationSizeInBytes > 0) { + newAllocationSize = offsetBufferAllocationSizeInBytes; + } else { + newAllocationSize = BaseValueVector.INITIAL_VALUE_ALLOCATION * OFFSET_WIDTH * 2; + } + } + + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + assert newAllocationSize >= 1; + + if (newAllocationSize > BaseValueVector.MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); + newBuf.setBytes(0, offsetBuffer, 0, currentBufferCapacity); + newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + offsetBuffer.getReferenceManager().release(1); + offsetBuffer = newBuf; + offsetBufferAllocationSizeInBytes = (int) newAllocationSize; + } + + @Override + public void setInitialCapacity(int numRecords) { } + + @Override + public int getValueCapacity() { + long capacity = getTypeBufferValueCapacity(); + long offsetCapacity = 
getOffsetBufferValueCapacity(); + if (offsetCapacity < capacity) { + capacity = offsetCapacity; + } + long structCapacity = internalStruct.getValueCapacity(); + if (structCapacity < capacity) { + structCapacity = capacity; + } + return (int) capacity; + } + + @Override + public void close() { + clear(); + } + + @Override + public void clear() { + valueCount = 0; + typeBuffer.getReferenceManager().release(); + typeBuffer = allocator.getEmpty(); + offsetBuffer.getReferenceManager().release(); + offsetBuffer = allocator.getEmpty(); + internalStruct.clear(); + } + + @Override + public void reset() { + valueCount = 0; + setNegative(0, typeBuffer.capacity()); + offsetBuffer.setZero(0, offsetBuffer.capacity()); + internalStruct.reset(); + } + + @Override + public Field getField() { + int childCount = (int) Arrays.stream(typeFields).filter(field -> field != null).count(); + List childFields = new ArrayList<>(childCount); + int[] typeIds = new int[childCount]; + for (int i = 0; i < typeFields.length; i++) { + if (typeFields[i] != null) { + int curIdx = childFields.size(); + typeIds[curIdx] = i; + childFields.add(typeFields[i]); + } + } + + FieldType fieldType; + if (this.fieldType == null) { + fieldType = FieldType.nullable(new ArrowType.Union(Dense, typeIds)); + } else { + final UnionMode mode = UnionMode.Dense; + fieldType = new FieldType(this.fieldType.isNullable(), new ArrowType.Union(mode, typeIds), + this.fieldType.getDictionary(), this.fieldType.getMetadata()); + } + + return new Field(name, fieldType, childFields); + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return getTransferPair(name, allocator); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return getTransferPair(ref, allocator, null); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return new 
org.apache.arrow.vector.complex.DenseUnionVector.TransferImpl(ref, allocator, callBack); + } + + @Override + public TransferPair makeTransferPair(ValueVector target) { + return new TransferImpl((DenseUnionVector) target); + } + + @Override + public void copyFrom(int inIndex, int outIndex, ValueVector from) { + Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); + DenseUnionVector fromCast = (DenseUnionVector) from; + int inOffset = fromCast.offsetBuffer.getInt((long) inIndex * OFFSET_WIDTH); + fromCast.getReader().setPosition(inOffset); + int outOffset = offsetBuffer.getInt((long) outIndex * OFFSET_WIDTH); + getWriter().setPosition(outOffset); + ComplexCopier.copy(fromCast.reader, writer); + } + + @Override + public void copyFromSafe(int inIndex, int outIndex, ValueVector from) { + copyFrom(inIndex, outIndex, from); + } + + public FieldVector addVector(byte typeId, FieldVector v) { + final String name = v.getName().isEmpty() ? fieldName(typeId, v.getMinorType()) : v.getName(); + Preconditions.checkState(internalStruct.getChild(name) == null, String.format("%s vector already exists", name)); + final FieldVector newVector = internalStruct.addOrGet(name, v.getField().getFieldType(), v.getClass()); + v.makeTransferPair(newVector).transfer(); + internalStruct.putChild(name, newVector); + childVectors[typeId] = newVector; + if (callBack != null) { + callBack.doWork(); + } + return newVector; + } + + private class TransferImpl implements TransferPair { + private final TransferPair[] internalTransferPairs = new TransferPair[nextTypeId]; + private final DenseUnionVector to; + + public TransferImpl(String name, BufferAllocator allocator, CallBack callBack) { + to = new DenseUnionVector(name, allocator, null, callBack); + internalStruct.makeTransferPair(to.internalStruct); + createTransferPairs(); + } + + public TransferImpl(DenseUnionVector to) { + this.to = to; + internalStruct.makeTransferPair(to.internalStruct); + createTransferPairs(); + } + + 
private void createTransferPairs() { + for (int i = 0; i < nextTypeId; i++) { + ValueVector srcVec = internalStruct.getVectorById(i); + ValueVector dstVec = to.internalStruct.getVectorById(i); + to.typeFields[i] = typeFields[i]; + to.typeMapFields[i] = typeMapFields[i]; + to.childVectors[i] = dstVec; + internalTransferPairs[i] = srcVec.makeTransferPair(dstVec); + } + } + + @Override + public void transfer() { + to.clear(); + + ReferenceManager refManager = typeBuffer.getReferenceManager(); + to.typeBuffer = refManager.transferOwnership(typeBuffer, to.allocator).getTransferredBuffer(); + + refManager = offsetBuffer.getReferenceManager(); + to.offsetBuffer = refManager.transferOwnership(offsetBuffer, to.allocator).getTransferredBuffer(); + + for (int i = 0; i < nextTypeId; i++) { + if (internalTransferPairs[i] != null) { + internalTransferPairs[i].transfer(); + to.childVectors[i] = internalTransferPairs[i].getTo(); + } + } + to.valueCount = valueCount; + clear(); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + to.clear(); + + // transfer type buffer + int startPoint = startIndex * TYPE_WIDTH; + int sliceLength = length * TYPE_WIDTH; + ArrowBuf slicedBuffer = typeBuffer.slice(startPoint, sliceLength); + ReferenceManager refManager = slicedBuffer.getReferenceManager(); + to.typeBuffer = refManager.transferOwnership(slicedBuffer, to.allocator).getTransferredBuffer(); + + // transfer offset byffer + while (to.offsetBuffer.capacity() < (long) length * OFFSET_WIDTH) { + to.reallocOffsetBuffer(); + } + + int [] typeCounts = new int[nextTypeId]; + int [] typeStarts = new int[nextTypeId]; + for (int i = 0; i < typeCounts.length; i++) { + typeCounts[i] = 0; + typeStarts[i] = -1; + } + + for (int i = startIndex; i < startIndex + length; i++) { + byte typeId = typeBuffer.getByte(i); + to.offsetBuffer.setInt((long) (i - startIndex) * OFFSET_WIDTH, typeCounts[typeId]); + typeCounts[typeId] += 1; + if (typeStarts[typeId] == -1) { + 
typeStarts[typeId] = offsetBuffer.getInt((long) i * OFFSET_WIDTH); + } + } + + // transfer vector values + for (int i = 0; i < nextTypeId; i++) { + if (typeCounts[i] > 0 && typeStarts[i] != -1) { + internalTransferPairs[i].splitAndTransfer(typeStarts[i], typeCounts[i]); + to.childVectors[i] = internalTransferPairs[i].getTo(); + } + } + + to.setValueCount(length); + } + + @Override + public ValueVector getTo() { + return to; + } + + @Override + public void copyValueSafe(int from, int to) { + this.to.copyFrom(from, to, DenseUnionVector.this); + } + } + + @Override + public FieldReader getReader() { + if (reader == null) { + reader = new DenseUnionReader(this); + } + return reader; + } + + public FieldWriter getWriter() { + if (writer == null) { + writer = new DenseUnionWriter(this); + } + return writer; + } + + @Override + public int getBufferSize() { + return this.getBufferSizeFor(this.valueCount); + } + + @Override + public int getBufferSizeFor(final int count) { + if (count == 0) { + return 0; + } + return (int) (count * TYPE_WIDTH + (long) count * OFFSET_WIDTH + + DataSizeRoundingUtil.divideBy8Ceil(count) + internalStruct.getBufferSizeFor(count)); + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + List list = new java.util.ArrayList<>(); + setReaderAndWriterIndex(); + if (getBufferSize() != 0) { + list.add(typeBuffer); + list.add(offsetBuffer); + list.addAll(java.util.Arrays.asList(internalStruct.getBuffers(clear))); + } + if (clear) { + valueCount = 0; + typeBuffer.getReferenceManager().retain(); + typeBuffer.close(); + typeBuffer = allocator.getEmpty(); + offsetBuffer.getReferenceManager().retain(); + offsetBuffer.close(); + offsetBuffer = allocator.getEmpty(); + } + return list.toArray(new ArrowBuf[list.size()]); + } + + @Override + public Iterator iterator() { + return internalStruct.iterator(); + } + + private ValueVector getVector(int index) { + byte typeId = typeBuffer.getByte(index * TYPE_WIDTH); + return getVectorByType(typeId); + } + + 
public Object getObject(int index) { + ValueVector vector = getVector(index); + if (vector != null) { + int offset = offsetBuffer.getInt((long) index * OFFSET_WIDTH); + return vector.isNull(offset) ? null : vector.getObject(offset); + } + return null; + } + + public void get(int index, DenseUnionHolder holder) { + FieldReader reader = new DenseUnionReader(DenseUnionVector.this); + reader.setPosition(index); + holder.reader = reader; + } + + public int getValueCount() { + return valueCount; + } + + /** + * IMPORTANT: Union types always return non null as there is no validity buffer. + * + * To check validity correctly you must check the underlying vector. + */ + public boolean isNull(int index) { + return false; + } + + @Override + public int getNullCount() { + return 0; + } + + public int isSet(int index) { + return isNull(index) ? 0 : 1; + } + + DenseUnionWriter writer; + + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + while (valueCount > getTypeBufferValueCapacity()) { + reallocTypeBuffer(); + reallocOffsetBuffer(); + } + setChildVectorValueCounts(); + } + + private void setChildVectorValueCounts() { + int [] counts = new int[Byte.MAX_VALUE + 1]; + for (int i = 0; i < this.valueCount; i++) { + byte typeId = getTypeId(i); + if (typeId != -1) { + counts[typeId] += 1; + } + } + for (int i = 0; i < nextTypeId; i++) { + childVectors[typeMapFields[i]].setValueCount(counts[typeMapFields[i]]); + } + } + + public void setSafe(int index, DenseUnionHolder holder) { + FieldReader reader = holder.reader; + if (writer == null) { + writer = new DenseUnionWriter(DenseUnionVector.this); + } + int offset = offsetBuffer.getInt((long) index * OFFSET_WIDTH); + MinorType type = reader.getMinorType(); + writer.setPosition(offset); + byte typeId = holder.typeId; + switch (type) { + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = 
name?uncap_first/> + <#if !minor.typeParams?? || minor.class?starts_with("Decimal")> + case ${name?upper_case}: + Nullable${name}Holder ${uncappedName}Holder = new Nullable${name}Holder(); + reader.read(${uncappedName}Holder); + setSafe(index, ${uncappedName}Holder); + break; + + + + case STRUCT: + case LIST: { + setTypeId(index, typeId); + ComplexCopier.copy(reader, writer); + break; + } + default: + throw new UnsupportedOperationException(); + } + } + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.typeParams?? || minor.class?starts_with("Decimal")> + public void setSafe(int index, Nullable${name}Holder holder) { + while (index >= getOffsetBufferValueCapacity()) { + reallocOffsetBuffer(); + } + byte typeId = getTypeId(index); + ${name}Vector vector = get${name}Vector(typeId<#if minor.class?starts_with("Decimal")>, new ArrowType.Decimal(holder.precision, holder.scale, holder.WIDTH * 8)); + int offset = vector.getValueCount(); + vector.setValueCount(offset + 1); + vector.setSafe(offset, holder); + offsetBuffer.setInt((long) index * OFFSET_WIDTH, offset); + } + + + + + public void setTypeId(int index, byte typeId) { + while (index >= getTypeBufferValueCapacity()) { + reallocTypeBuffer(); + } + typeBuffer.setByte(index * TYPE_WIDTH , typeId); + } + + private int getTypeBufferValueCapacity() { + return (int) typeBuffer.capacity() / TYPE_WIDTH; + } + + private long getOffsetBufferValueCapacity() { + return offsetBuffer.capacity() / OFFSET_WIDTH; + } + + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + if (isNull(index)) { + return 0; + } + int offset = offsetBuffer.getInt((long) index * OFFSET_WIDTH); + return getVector(index).hashCode(offset, hasher); + } + + @Override + public int hashCode(int index) { + return hashCode(index, SimpleHasher.INSTANCE); + } + + @Override + public OUT 
accept(VectorVisitor visitor, IN value) { + return visitor.visit(this, value); + } + + @Override + public String getName() { + return name; + } + + private void setNegative(long start, long end) { + for (long i = start;i < end; i++) { + typeBuffer.setByte(i, -1); + } + } + + @Override + public T addOrGet(String name, FieldType fieldType, Class clazz) { + return internalStruct.addOrGet(name, fieldType, clazz); + } + + @Override + public T getChild(String name, Class clazz) { + return internalStruct.getChild(name, clazz); + } + + @Override + public VectorWithOrdinal getChildVectorWithOrdinal(String name) { + return internalStruct.getChildVectorWithOrdinal(name); + } + + @Override + public int size() { + return internalStruct.size(); + } + + @Override + public void setInitialCapacity(int valueCount, double density) { + for (final ValueVector vector : internalStruct) { + if (vector instanceof DensityAwareVector) { + ((DensityAwareVector) vector).setInitialCapacity(valueCount, density); + } else { + vector.setInitialCapacity(valueCount); + } + } + } +} diff --git a/src/arrow/java/vector/src/main/codegen/templates/DenseUnionWriter.java b/src/arrow/java/vector/src/main/codegen/templates/DenseUnionWriter.java new file mode 100644 index 000000000..e69a62a9e --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/DenseUnionWriter.java @@ -0,0 +1,302 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.impl.NullableStructWriterFactory; +import org.apache.arrow.vector.types.Types; + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/DenseUnionWriter.java" /> + + +<#include "/@includes/license.ftl" /> + + package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + import org.apache.arrow.vector.complex.writer.BaseWriter; + import org.apache.arrow.vector.types.Types.MinorType; + +/* + * This class is generated using freemarker and the ${.template_name} template. 
+ */ +@SuppressWarnings("unused") +public class DenseUnionWriter extends AbstractFieldWriter implements FieldWriter { + + DenseUnionVector data; + + private BaseWriter[] writers = new BaseWriter[Byte.MAX_VALUE + 1]; + private final NullableStructWriterFactory nullableStructWriterFactory; + + public DenseUnionWriter(DenseUnionVector vector) { + this(vector, NullableStructWriterFactory.getNullableStructWriterFactoryInstance()); + } + + public DenseUnionWriter(DenseUnionVector vector, NullableStructWriterFactory nullableStructWriterFactory) { + data = vector; + this.nullableStructWriterFactory = nullableStructWriterFactory; + } + + @Override + public void setPosition(int index) { + super.setPosition(index); + for (BaseWriter writer : writers) { + writer.setPosition(index); + } + } + + @Override + public void start() { + byte typeId = data.getTypeId(idx()); + getStructWriter((byte) idx()).start(); + } + + @Override + public void end() { + byte typeId = data.getTypeId(idx()); + getStructWriter(typeId).end(); + } + + @Override + public void startList() { + byte typeId = data.getTypeId(idx()); + getListWriter(typeId).startList(); + } + + @Override + public void endList() { + byte typeId = data.getTypeId(idx()); + getListWriter(typeId).endList(); + } + + private StructWriter getStructWriter(byte typeId) { + StructWriter structWriter = (StructWriter) writers[typeId]; + if (structWriter == null) { + structWriter = nullableStructWriterFactory.build((StructVector) data.getVectorByType(typeId)); + writers[typeId] = structWriter; + } + return structWriter; + } + + public StructWriter asStruct(byte typeId) { + data.setTypeId(idx(), typeId); + return getStructWriter(typeId); + } + + private ListWriter getListWriter(byte typeId) { + ListWriter listWriter = (ListWriter) writers[typeId]; + if (listWriter == null) { + listWriter = new UnionListWriter((ListVector) data.getVectorByType(typeId), nullableStructWriterFactory); + writers[typeId] = listWriter; + } + return listWriter; + } + 
+ public ListWriter asList(byte typeId) { + data.setTypeId(idx(), typeId); + return getListWriter(typeId); + } + + private MapWriter getMapWriter(byte typeId) { + MapWriter mapWriter = (MapWriter) writers[typeId]; + if (mapWriter == null) { + mapWriter = new UnionMapWriter((MapVector) data.getVectorByType(typeId)); + writers[typeId] = mapWriter; + } + return mapWriter; + } + + public MapWriter asMap(byte typeId) { + data.setTypeId(idx(), typeId); + return getMapWriter(typeId); + } + + BaseWriter getWriter(byte typeId) { + MinorType minorType = data.getVectorByType(typeId).getMinorType(); + switch (minorType) { + case STRUCT: + return getStructWriter(typeId); + case LIST: + return getListWriter(typeId); + case MAP: + return getMapWriter(typeId); + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.typeParams?? || minor.class?starts_with("Decimal")> + case ${name?upper_case}: + return get${name}Writer(typeId); + + + + default: + throw new UnsupportedOperationException("Unknown type: " + minorType); + } + } + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.typeParams?? 
|| minor.class?starts_with("Decimal")> + + private ${name}Writer get${name}Writer(byte typeId) { + ${name}Writer writer = (${name}Writer) writers[typeId]; + if (writer == null) { + writer = new ${name}WriterImpl((${name}Vector) data.getVectorByType(typeId)); + writers[typeId] = writer; + } + return writer; + } + + public ${name}Writer as${name}(byte typeId) { + data.setTypeId(idx(), typeId); + return get${name}Writer(typeId); + } + + @Override + public void write(${name}Holder holder) { + throw new UnsupportedOperationException(); + } + + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, , byte typeId<#if minor.class?starts_with("Decimal")>, ArrowType arrowType) { + data.setTypeId(idx(), typeId); + get${name}Writer(typeId).setPosition(data.getOffset(idx())); + get${name}Writer(typeId).write${name}(<#list fields as field>${field.name}<#if field_has_next>, <#if minor.class?starts_with("Decimal")>, arrowType); + } + + + + + public void writeNull() { + } + + @Override + public StructWriter struct() { + byte typeId = data.getTypeId(idx()); + data.setTypeId(idx(), typeId); + getListWriter(typeId).setPosition(data.getOffset(idx())); + return getListWriter(typeId).struct(); + } + + @Override + public ListWriter list() { + byte typeId = data.getTypeId(idx()); + data.setTypeId(idx(), typeId); + getListWriter(typeId).setPosition(data.getOffset(idx())); + return getListWriter(typeId).list(); + } + + @Override + public ListWriter list(String name) { + byte typeId = data.getTypeId(idx()); + data.setTypeId(idx(), typeId); + getStructWriter(typeId).setPosition(data.getOffset(idx())); + return getStructWriter(typeId).list(name); + } + + @Override + public MapWriter map() { + byte typeId = data.getTypeId(idx()); + data.setTypeId(idx(), typeId); + getListWriter(typeId).setPosition(data.getOffset(idx())); + return getMapWriter(typeId).map(); + } + + @Override + public MapWriter map(String name) { + byte typeId = 
data.getTypeId(idx()); + data.setTypeId(idx(), typeId); + getStructWriter(typeId).setPosition(data.getOffset(idx())); + return getStructWriter(typeId).map(name); + } + + @Override + public MapWriter map(String name, boolean keysSorted) { + byte typeId = data.getTypeId(idx()); + data.setTypeId(idx(), typeId); + getStructWriter(typeId).setPosition(data.getOffset(idx())); + return getStructWriter(typeId).map(name, keysSorted); + } + + @Override + public StructWriter struct(String name) { + byte typeId = data.getTypeId(idx()); + data.setTypeId(idx(), typeId); + getStructWriter(typeId).setPosition(data.getOffset(idx())); + return getStructWriter(typeId).struct(name); + } + + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + <#if !minor.typeParams?? || minor.class?starts_with("Decimal") > + @Override + public ${capName}Writer ${lowerName}(String name) { + byte typeId = data.getTypeId(idx()); + data.setTypeId(idx(), typeId); + getStructWriter(typeId).setPosition(data.getOffset(idx())); + return getStructWriter(typeId).${lowerName}(name); + } + + @Override + public ${capName}Writer ${lowerName}() { + byte typeId = data.getTypeId(idx()); + data.setTypeId(idx(), typeId); + getListWriter(typeId).setPosition(data.getOffset(idx())); + return getListWriter(typeId).${lowerName}(); + } + + <#if minor.class?starts_with("Decimal")> + public ${capName}Writer ${lowerName}(String name<#list minor.typeParams as typeParam>, ${typeParam.type} ${typeParam.name}) { + byte typeId = data.getTypeId(idx()); + data.setTypeId(idx(), typeId); + getStructWriter(typeId).setPosition(data.getOffset(idx())); + return getStructWriter(typeId).${lowerName}(name<#list minor.typeParams as typeParam>, ${typeParam.name}); + } + + + + @Override + public void allocate() { + data.allocateNew(); + } + + 
@Override + public void clear() { + data.clear(); + } + + @Override + public void close() throws Exception { + data.close(); + } + + @Override + public Field getField() { + return data.getField(); + } + + @Override + public int getValueCapacity() { + return data.getValueCapacity(); + } +} diff --git a/src/arrow/java/vector/src/main/codegen/templates/HolderReaderImpl.java b/src/arrow/java/vector/src/main/codegen/templates/HolderReaderImpl.java new file mode 100644 index 000000000..8394aaad4 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/HolderReaderImpl.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +<@pp.dropOutputFile /> +<#list vv.types as type> +<#list type.minor as minor> +<#list ["", "Nullable"] as holderMode> +<#assign nullMode = holderMode /> + +<#assign lowerName = minor.class?uncap_first /> +<#if lowerName == "int" ><#assign lowerName = "integer" /> +<#assign name = minor.class?cap_first /> +<#assign javaType = (minor.javaType!type.javaType) /> +<#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> +<#assign safeType=friendlyType /> +<#if safeType=="byte[]"><#assign safeType="ByteArray" /> +<#assign fields = (minor.fields!type.fields) + minor.typeParams![]/> + +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${holderMode}${name}HolderReaderImpl.java" /> +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +// Source code generated using FreeMarker template ${.template_name} + +@SuppressWarnings("unused") +public class ${holderMode}${name}HolderReaderImpl extends AbstractFieldReader { + + private ${nullMode}${name}Holder holder; + public ${holderMode}${name}HolderReaderImpl(${holderMode}${name}Holder holder) { + this.holder = holder; + } + + @Override + public int size() { + throw new UnsupportedOperationException("You can't call size on a Holder value reader."); + } + + @Override + public boolean next() { + throw new UnsupportedOperationException("You can't call next on a single value reader."); + + } + + @Override + public void setPosition(int index) { + throw new UnsupportedOperationException("You can't call next on a single value reader."); + } + + @Override + public MinorType getMinorType() { + return MinorType.${name?upper_case}; + } + + @Override + public boolean isSet() { + <#if holderMode == "Nullable"> + return this.holder.isSet == 1; + <#else> + return true; + + } + + @Override + public void read(${name}Holder h) { + <#list fields as field> + h.${field.name} = holder.${field.name}; + + } + + @Override + public 
void read(Nullable${name}Holder h) { + <#list fields as field> + h.${field.name} = holder.${field.name}; + + h.isSet = isSet() ? 1 : 0; + } + + // read friendly type + @Override + public ${friendlyType} read${safeType}() { + <#if nullMode == "Nullable"> + if (!isSet()) { + return null; + } + + + <#if type.major == "VarLen"> + <#if type.width == 4> + int length = holder.end - holder.start; + <#elseif type.width == 8> + int length = (int) (holder.end - holder.start); + + byte[] value = new byte [length]; + holder.buffer.getBytes(holder.start, value, 0, length); + <#if minor.class == "VarBinary" || minor.class == "LargeVarBinary"> + return value; + <#elseif minor.class == "VarChar" || minor.class == "LargeVarChar"> + Text text = new Text(); + text.set(value); + return text; + + <#elseif minor.class == "IntervalDay"> + return Duration.ofDays(holder.days).plusMillis(holder.milliseconds); + <#elseif minor.class == "IntervalYear"> + return Period.ofMonths(holder.value); + <#elseif minor.class == "IntervalMonthDayNano"> + return new PeriodDuration(Period.ofMonths(holder.months).plusDays(holder.days), + Duration.ofNanos(holder.nanoseconds)); + <#elseif minor.class == "Duration"> + return DurationVector.toDuration(holder.value, holder.unit); + <#elseif minor.class == "Bit" > + return new Boolean(holder.value != 0); + <#elseif minor.class == "Decimal"> + byte[] bytes = new byte[${type.width}]; + holder.buffer.getBytes(holder.start, bytes, 0, ${type.width}); + ${friendlyType} value = new BigDecimal(new BigInteger(bytes), holder.scale); + return value; + <#elseif minor.class == "Decimal256"> + byte[] bytes = new byte[${type.width}]; + holder.buffer.getBytes(holder.start, bytes, 0, ${type.width}); + ${friendlyType} value = new BigDecimal(new BigInteger(bytes), holder.scale); + return value; + <#elseif minor.class == "FixedSizeBinary"> + byte[] value = new byte [holder.byteWidth]; + holder.buffer.getBytes(0, value, 0, holder.byteWidth); + return value; + <#elseif minor.class == 
"TimeStampSec"> + final long millis = java.util.concurrent.TimeUnit.SECONDS.toMillis(holder.value); + return DateUtility.getLocalDateTimeFromEpochMilli(millis); + <#elseif minor.class == "TimeStampMilli" || minor.class == "DateMilli" || minor.class == "TimeMilli"> + return DateUtility.getLocalDateTimeFromEpochMilli(holder.value); + <#elseif minor.class == "TimeStampMicro"> + return DateUtility.getLocalDateTimeFromEpochMicro(holder.value); + <#elseif minor.class == "TimeStampNano"> + return DateUtility.getLocalDateTimeFromEpochNano(holder.value); + <#else> + ${friendlyType} value = new ${friendlyType}(this.holder.value); + return value; + + } + + @Override + public Object readObject() { + return read${safeType}(); + } + + <#if nullMode != "Nullable"> + public void copyAsValue(${minor.class?cap_first}Writer writer){ + writer.write(holder); + } + +} + + + + diff --git a/src/arrow/java/vector/src/main/codegen/templates/NullReader.java b/src/arrow/java/vector/src/main/codegen/templates/NullReader.java new file mode 100644 index 000000000..0c65f9a56 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/NullReader.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+import org.apache.arrow.vector.types.pojo.ArrowType.Null;
+import org.apache.arrow.vector.types.pojo.Field;
+
+<@pp.dropOutputFile />
+<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/NullReader.java" />
+
+
+<#include "/@includes/license.ftl" />
+
+package org.apache.arrow.vector.complex.impl;
+
+<#include "/@includes/vv_imports.ftl" />
+
+/**
+ * Source code generated using FreeMarker template ${.template_name}
+ *
+ * <p>Reader that never holds a value: isSet() and next() return false, size() returns 0,
+ * the friendly-type read methods return null and nested accessors return this reader.
+ * Instances are only created through the static fields below (both constructors are private).
+ */
+@SuppressWarnings("unused")
+public class NullReader extends AbstractBaseReader implements FieldReader{
+
+  /** Shared instance whose minor type is NULL. */
+  public static final NullReader INSTANCE = new NullReader();
+  // NOTE(review): constructed with MinorType.NULL, not a list type - confirm this is intended.
+  public static final NullReader EMPTY_LIST_INSTANCE = new NullReader(MinorType.NULL);
+  /** Shared instance whose minor type is STRUCT. */
+  public static final NullReader EMPTY_STRUCT_INSTANCE = new NullReader(MinorType.STRUCT);
+  /** Minor type returned by getMinorType(); assigned once in the constructors. */
+  private MinorType type;
+
+  /** Creates a reader whose minor type is NULL. */
+  private NullReader(){
+    super();
+    type = MinorType.NULL;
+  }
+
+  /** Creates a reader that reports the given minor type. */
+  private NullReader(MinorType type){
+    super();
+    this.type = type;
+  }
+
+  /** Returns the minor type supplied at construction. */
+  @Override
+  public MinorType getMinorType() {
+    return type;
+  }
+
+  /** Returns a newly built, unnamed nullable field of Arrow type Null on every call. */
+  @Override
+  public Field getField() {
+    return new Field("", FieldType.nullable(new Null()), null);
+  }
+
+  // The copyAsValue overloads below are intentionally empty: there is nothing to copy.
+  public void copyAsValue(StructWriter writer) {}
+
+  public void copyAsValue(ListWriter writer) {}
+
+  public void copyAsValue(UnionWriter writer) {}
+
+  <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first />
+  /** Always throws: a non-nullable holder has no way to represent a missing value. */
+  public void read(${name}Holder holder){
+    throw new UnsupportedOperationException("NullReader cannot write into non-nullable holder");
+  }
+
+  /** Marks the holder as unset; no other holder field is touched. */
+  public void read(Nullable${name}Holder holder){
+    holder.isSet = 0;
+  }
+
+  /** Always throws ArrayIndexOutOfBoundsException, whatever the index. */
+  public void read(int arrayIndex, ${name}Holder holder){
+    throw new ArrayIndexOutOfBoundsException();
+  }
+
+  // No-ops: nothing is written for a null value.
+  public void copyAsValue(${minor.class}Writer writer){}
+  public void copyAsField(String name, ${minor.class}Writer writer){}
+
+  /** Always throws ArrayIndexOutOfBoundsException, whatever the index. */
+  public void read(int arrayIndex, Nullable${name}Holder holder){
+    throw new ArrayIndexOutOfBoundsException();
+  }
+
+
+  /** Always 0. */
+  public int size(){
+    return 0;
+  }
+
+  /** Always false: this reader never has a value. */
+  public boolean isSet(){
+    return false;
+  }
+
+  /** Always false: there is nothing to advance to. */
+  public boolean next(){
+    return false;
+  }
+
+  // Every nested accessor below returns this same reader.
+  public RepeatedStructReader struct(){
+    return this;
+  }
+
+  public RepeatedListReader list(){
+    return this;
+  }
+
+  public StructReader struct(String name){
+    return this;
+  }
+
+  public ListReader list(String name){
+    return this;
+  }
+
+  public FieldReader reader(String name){
+    return this;
+  }
+
+  public FieldReader reader(){
+    return this;
+  }
+
+  // NOTE(review): not called anywhere in the visible part of this template.
+  private void fail(String name){
+    throw new IllegalArgumentException(String.format("You tried to read a %s type when you are using a ValueReader of type %s.", name, this.getClass().getSimpleName()));
+  }
+
+  <#list ["Object", "BigDecimal", "Short", "Integer", "Long", "Boolean",
+          "LocalDateTime", "Duration", "Period", "Double", "Float",
+          "Character", "Text", "String", "Byte", "byte[]", "PeriodDuration"] as friendlyType>
+  <#assign safeType=friendlyType />
+  <#if safeType=="byte[]"><#assign safeType="ByteArray" />
+
+  /** Always returns null, whatever the index. */
+  public ${friendlyType} read${safeType}(int arrayIndex){
+    return null;
+  }
+
+  /** Always returns null. */
+  public ${friendlyType} read${safeType}(){
+    return null;
+  }
+
+
+}
+
+
+
diff --git a/src/arrow/java/vector/src/main/codegen/templates/StructWriters.java b/src/arrow/java/vector/src/main/codegen/templates/StructWriters.java
new file mode 100644
index 000000000..69693c630
--- /dev/null
+++ b/src/arrow/java/vector/src/main/codegen/templates/StructWriters.java
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+<@pp.dropOutputFile />
+<#list ["Nullable", "Single"] as mode>
+<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${mode}StructWriter.java" />
+<#assign index = "idx()">
+<#if mode == "Single">
+<#assign containerClass = "NonNullableStructVector" />
+<#else>
+<#assign containerClass = "StructVector" />
+
+
+<#include "/@includes/license.ftl" />
+
+package org.apache.arrow.vector.complex.impl;
+
+<#include "/@includes/vv_imports.ftl" />
+import java.util.Map;
+import java.util.HashMap;
+
+import org.apache.arrow.vector.holders.RepeatedStructHolder;
+import org.apache.arrow.vector.AllocationHelper;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.complex.writer.FieldWriter;
+
+/*
+ * This class is generated using FreeMarker and the ${.template_name} template.
+ *
+ * Generated once per mode: the "Single" variant wraps a NonNullableStructVector, the
+ * "Nullable" variant wraps a StructVector. Child writers are cached in a map keyed by
+ * handleCase(name).
+ */
+@SuppressWarnings("unused")
+public class ${mode}StructWriter extends AbstractFieldWriter {
+
+  /** The struct vector this writer populates. */
+  protected final ${containerClass} container;
+  /** Capacity applied to newly created map/scalar child vectors when greater than zero. */
+  private int initialCapacity;
+  /** Child writers keyed by handleCase(fieldName). */
+  private final Map fields = new HashMap<>();
+
+  /**
+   * Wraps the container and creates a child writer for every child field it already declares.
+   * In Single mode a StructVector container is rejected with IllegalArgumentException.
+   *
+   * @throws UnsupportedOperationException if a child has a minor type with no case below
+   */
+  public ${mode}StructWriter(${containerClass} container) {
+    <#if mode == "Single">
+    if (container instanceof StructVector) {
+      throw new IllegalArgumentException("Invalid container: " + container);
+    }
+
+    this.container = container;
+    this.initialCapacity = 0;
+    for (Field child : container.getField().getChildren()) {
+      MinorType minorType = Types.getMinorTypeForArrowType(child.getType());
+      switch (minorType) {
+      case STRUCT:
+        struct(child.getName());
+        break;
+      case LIST:
+        list(child.getName());
+        break;
+      case MAP: {
+        ArrowType.Map arrowType = (ArrowType.Map) child.getType();
+        map(child.getName(), arrowType.getKeysSorted());
+        break;
+      }
+      case UNION:
+        // Union children are registered in the cache directly rather than through an accessor.
+        FieldType fieldType = new FieldType(addVectorAsNullable, MinorType.UNION.getType(), null, null);
+        UnionWriter writer = new UnionWriter(container.addOrGet(child.getName(), fieldType, UnionVector.class), getNullableStructWriterFactory());
+        fields.put(handleCase(child.getName()), writer);
+        break;
+<#list vv.types as type><#list type.minor as minor>
+<#assign lowerName = minor.class?uncap_first />
+<#if lowerName == "int" ><#assign lowerName = "integer" />
+<#assign upperName = minor.class?upper_case />
+      case ${upperName}: {
+        <#if minor.typeParams?? >
+        ${minor.arrowType} arrowType = (${minor.arrowType})child.getType();
+        ${lowerName}(child.getName()<#list minor.typeParams as typeParam>, arrowType.get${typeParam.name?cap_first}());
+        <#else>
+        ${lowerName}(child.getName());
+
+        break;
+      }
+
+        default:
+          throw new UnsupportedOperationException("Unknown type: " + minorType);
+      }
+    }
+  }
+
+  /**
+   * Normalises a field name into the key used by the writer cache (lower-cased here).
+   * NOTE(review): toLowerCase() without a Locale depends on the JVM default locale.
+   */
+  protected String handleCase(final String input) {
+    return input.toLowerCase();
+  }
+
+  /** Returns the factory handed to the PromotableWriter and UnionWriter instances created here. */
+  protected NullableStructWriterFactory getNullableStructWriterFactory() {
+    return NullableStructWriterFactory.getNullableStructWriterFactoryInstance();
+  }
+
+  /** Returns the container's value capacity. */
+  @Override
+  public int getValueCapacity() {
+    return container.getValueCapacity();
+  }
+
+  /** Remembers the capacity for later child vectors and forwards it to the container. */
+  public void setInitialCapacity(int initialCapacity) {
+    this.initialCapacity = initialCapacity;
+    container.setInitialCapacity(initialCapacity);
+  }
+
+  /** Returns true when container.size() is 0. */
+  @Override
+  public boolean isEmptyStruct() {
+    return 0 == container.size();
+  }
+
+  /** Returns the container's field. */
+  @Override
+  public Field getField() {
+    return container.getField();
+  }
+
+  /**
+   * Returns the writer for the named struct child, creating the child vector and a
+   * PromotableWriter on first use. The new writer is allocated only when the container
+   * actually grew, then positioned at idx() and cached.
+   */
+  @Override
+  public StructWriter struct(String name) {
+    String finalName = handleCase(name);
+    FieldWriter writer = fields.get(finalName);
+    if(writer == null){
+      int vectorCount=container.size();
+      FieldType fieldType = new FieldType(addVectorAsNullable, MinorType.STRUCT.getType(), null, null);
+      StructVector vector = container.addOrGet(name, fieldType, StructVector.class);
+      writer = new PromotableWriter(vector, container, getNullableStructWriterFactory());
+      if(vectorCount != container.size()) {
+        writer.allocate();
+      }
+      writer.setPosition(idx());
+      fields.put(finalName, writer);
+    } else {
+      if (writer instanceof PromotableWriter) {
+        // ensure writers are initialized
+        ((PromotableWriter)writer).getWriter(MinorType.STRUCT);
+      }
+    }
+    return writer;
+  }
+
+  /** Clears this writer (container and cached writers), then closes the container. */
+  @Override
+  public void close() throws Exception {
+    clear();
+    container.close();
+  }
+
+  /** Allocates the container, then every cached child writer. */
+  @Override
+  public void allocate() {
+    container.allocateNew();
+    for(final FieldWriter w : fields.values()) {
+      w.allocate();
+    }
+  }
+
+  /** Clears the container, then every cached child writer. */
+  @Override
+  public void clear() {
+    container.clear();
+    for(final FieldWriter w : fields.values()) {
+      w.clear();
+    }
+  }
+
+  /**
+   * Returns the writer for the named list child, creating it on first use; same
+   * allocate-if-grown, position and cache steps as struct(String).
+   */
+  @Override
+  public ListWriter list(String name) {
+    String finalName = handleCase(name);
+    FieldWriter writer = fields.get(finalName);
+    int vectorCount = container.size();
+    if(writer == null) {
+      FieldType fieldType = new FieldType(addVectorAsNullable, MinorType.LIST.getType(), null, null);
+      writer = new PromotableWriter(container.addOrGet(name, fieldType, ListVector.class), container, getNullableStructWriterFactory());
+      if (container.size() > vectorCount) {
+        writer.allocate();
+      }
+      writer.setPosition(idx());
+      fields.put(finalName, writer);
+    } else {
+      if (writer instanceof PromotableWriter) {
+        // ensure writers are initialized
+        ((PromotableWriter)writer).getWriter(MinorType.LIST);
+      }
+    }
+    return writer;
+  }
+
+  /** Same as map(name, false). */
+  @Override
+  public MapWriter map(String name) {
+    return map(name, false);
+  }
+
+  /**
+   * Returns the writer for the named map child, creating it on first use. When addOrGet
+   * returned a vector other than the one previously registered under the name, the vector
+   * gets the remembered initial capacity (if positive) and is allocated.
+   */
+  @Override
+  public MapWriter map(String name, boolean keysSorted) {
+    FieldWriter writer = fields.get(handleCase(name));
+    if(writer == null) {
+      ValueVector vector;
+      ValueVector currentVector = container.getChild(name);
+      MapVector v = container.addOrGet(name,
+          new FieldType(addVectorAsNullable,
+            new ArrowType.Map(keysSorted)
+          ,null, null),
+          MapVector.class);
+      writer = new PromotableWriter(v, container, getNullableStructWriterFactory());
+      vector = v;
+      if (currentVector == null || currentVector != vector) {
+        if(this.initialCapacity > 0) {
+          vector.setInitialCapacity(this.initialCapacity);
+        }
+        vector.allocateNewSafe();
+      }
+      writer.setPosition(idx());
+      fields.put(handleCase(name), writer);
+    } else {
+      if (writer instanceof PromotableWriter) {
+        // ensure writers are initialized
+        ((PromotableWriter)writer).getWriter(MinorType.MAP, new ArrowType.Map(keysSorted));
+      }
+    }
+    return writer;
+  }
+
+  /** Forwards the value count to the container. */
+  public void setValueCount(int count) {
+    container.setValueCount(count);
+  }
+
+  /** Moves this writer and every cached child writer to the given index. */
+  @Override
+  public void setPosition(int index) {
+    super.setPosition(index);
+    for(final FieldWriter w: fields.values()) {
+      w.setPosition(index);
+    }
+  }
+
+  <#if mode="Nullable">
+  /**
+   * Marks the current slot null, sets the value count to idx()+1 and advances this writer.
+   * Calls super.setPosition, so cached child writers are not repositioned here.
+   */
+  @Override
+  public void writeNull() {
+    container.setNull(idx());
+    setValueCount(idx()+1);
+    super.setPosition(idx()+1);
+  }
+
+
+  /** In the non-Single mode marks the current index as defined; otherwise does nothing. */
+  @Override
+  public void start() {
+    <#if mode == "Single">
+    <#else>
+    container.setIndexDefined(idx());
+
+  }
+
+  /** Advances this writer and its cached child writers to idx()+1. */
+  @Override
+  public void end() {
+    setPosition(idx()+1);
+  }
+
+  <#list vv.types as type><#list type.minor as minor>
+  <#assign lowerName = minor.class?uncap_first />
+  <#if lowerName == "int" ><#assign lowerName = "integer" />
+  <#assign upperName = minor.class?upper_case />
+  <#assign capName = minor.class?cap_first />
+  <#assign vectName = capName />
+
+  <#if minor.typeParams?? >
+  @Override
+  public ${minor.class}Writer ${lowerName}(String name) {
+    // returns existing writer
+    final FieldWriter writer = fields.get(handleCase(name));
+    Preconditions.checkNotNull(writer);
+    return writer;
+  }
+
+  @Override
+  public ${minor.class}Writer ${lowerName}(String name<#list minor.typeParams as typeParam>, ${typeParam.type} ${typeParam.name}) {
+  <#else>
+  @Override
+  public ${minor.class}Writer ${lowerName}(String name) {
+
+    // Same create-on-first-use flow as map(String, boolean), for this scalar type.
+    FieldWriter writer = fields.get(handleCase(name));
+    if(writer == null) {
+      ValueVector vector;
+      ValueVector currentVector = container.getChild(name);
+      ${vectName}Vector v = container.addOrGet(name,
+          new FieldType(addVectorAsNullable,
+          <#if minor.typeParams??>
+            <#if minor.arrowTypeConstructorParams??>
+              <#assign constructorParams = minor.arrowTypeConstructorParams />
+            <#else>
+              <#assign constructorParams = [] />
+              <#list minor.typeParams?reverse as typeParam>
+                <#assign constructorParams = constructorParams + [ typeParam.name ] />
+
+
+            new ${minor.arrowType}(${constructorParams?join(", ")}<#if minor.class?starts_with("Decimal")>, ${vectName}Vector.TYPE_WIDTH * 8)
+          <#else>
+            MinorType.${upperName}.getType()
+
+          ,null, null),
+          ${vectName}Vector.class);
+      writer = new PromotableWriter(v, container, getNullableStructWriterFactory());
+      vector = v;
+      if (currentVector == null || currentVector != vector) {
+        if(this.initialCapacity > 0) {
+          vector.setInitialCapacity(this.initialCapacity);
+        }
+        vector.allocateNewSafe();
+      }
+      writer.setPosition(idx());
+      fields.put(handleCase(name), writer);
+    } else {
+      if (writer instanceof PromotableWriter) {
+        // ensure writers are initialized
+        ((PromotableWriter)writer).getWriter(MinorType.${upperName}<#if minor.class?starts_with("Decimal")>, new ${minor.arrowType}(precision, scale, ${vectName}Vector.TYPE_WIDTH * 8));
+      }
+    }
+    return writer;
+  }
+
+
+
+}
+
diff --git a/src/arrow/java/vector/src/main/codegen/templates/UnionFixedSizeListWriter.java b/src/arrow/java/vector/src/main/codegen/templates/UnionFixedSizeListWriter.java
new file mode 100644
index 000000000..55c661bfc
--- /dev/null
+++ b/src/arrow/java/vector/src/main/codegen/templates/UnionFixedSizeListWriter.java
@@ -0,0 +1,319 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.vector.complex.writer.Decimal256Writer;
+import org.apache.arrow.vector.complex.writer.DecimalWriter;
+import org.apache.arrow.vector.holders.Decimal256Holder;
+import org.apache.arrow.vector.holders.DecimalHolder;
+
+
+import java.lang.UnsupportedOperationException;
+import java.math.BigDecimal;
+
+<@pp.dropOutputFile />
+<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/UnionFixedSizeListWriter.java" />
+
+
+<#include "/@includes/license.ftl" />
+
+ package org.apache.arrow.vector.complex.impl;
+
+<#include "/@includes/vv_imports.ftl" />
+
+/*
+ * This class is generated using freemarker and the ${.template_name} template.
+ *
+ * Writer for a FixedSizeListVector. Element writes go through a PromotableWriter over the
+ * list's data vector; each element write first checks that the child position is still
+ * below (idx() + 1) * listSize and throws IllegalStateException otherwise.
+ */
+
+@SuppressWarnings("unused")
+public class UnionFixedSizeListWriter extends AbstractFieldWriter {
+
+  /** The fixed-size list vector being written. */
+  protected FixedSizeListVector vector;
+  /** Writer over the list's data vector; receives every element write. */
+  protected PromotableWriter writer;
+  /** Set by struct() and reset by end(); not read anywhere in this template. */
+  private boolean inStruct = false;
+  /** Last name passed to a named scalar accessor; not read anywhere in this template. */
+  private String structName;
+  /** List size taken from the vector at construction time. */
+  private final int listSize;
+
+  /** Creates a writer using the default NullableStructWriterFactory instance. */
+  public UnionFixedSizeListWriter(FixedSizeListVector vector) {
+    this(vector, NullableStructWriterFactory.getNullableStructWriterFactoryInstance());
+  }
+
+  /** Creates a writer whose child PromotableWriter uses the given factory. */
+  public UnionFixedSizeListWriter(FixedSizeListVector vector, NullableStructWriterFactory nullableStructWriterFactory) {
+    this.vector = vector;
+    this.writer = new PromotableWriter(vector.getDataVector(), vector, nullableStructWriterFactory);
+    this.listSize = vector.getListSize();
+  }
+
+  // NOTE(review): the parent argument is ignored; this is equivalent to the single-argument constructor.
+  public UnionFixedSizeListWriter(FixedSizeListVector vector, AbstractFieldWriter parent) {
+    this(vector);
+  }
+
+  /** Allocates the list vector. */
+  @Override
+  public void allocate() {
+    vector.allocateNew();
+  }
+
+  /** Clears the list vector. */
+  @Override
+  public void clear() {
+    vector.clear();
+  }
+
+  /** Returns the list vector's field. */
+  @Override
+  public Field getField() {
+    return vector.getField();
+  }
+
+  /** Forwards the value count to the list vector. */
+  public void setValueCount(int count) {
+    vector.setValueCount(count);
+  }
+
+  /** Returns the list vector's value capacity. */
+  @Override
+  public int getValueCapacity() {
+    return vector.getValueCapacity();
+  }
+
+  /**
+   * Closes the vector, then the child writer.
+   * NOTE(review): if vector.close() throws, writer.close() is not reached.
+   */
+  @Override
+  public void close() throws Exception {
+    vector.close();
+    writer.close();
+  }
+
+  /** Delegates to the superclass; the child writer is repositioned only by startList(). */
+  @Override
+  public void setPosition(int index) {
+    super.setPosition(index);
+  }
+  <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first />
+  <#assign fields = minor.fields!type.fields />
+  <#assign uncappedName = name?uncap_first/>
+  <#if uncappedName == "int" ><#assign uncappedName = "integer" />
+  <#if !minor.typeParams?? >
+
+  /** Returns this writer, which implements the scalar write methods itself. */
+  @Override
+  public ${name}Writer ${uncappedName}() {
+    return this;
+  }
+
+  /** Records the name in structName and delegates to the child writer. */
+  @Override
+  public ${name}Writer ${uncappedName}(String name) {
+    structName = name;
+    return writer.${uncappedName}(name);
+  }
+
+
+
+  // Decimal accessors: the unnamed form returns this writer, the named forms delegate to the child.
+  @Override
+  public DecimalWriter decimal() {
+    return this;
+  }
+
+  @Override
+  public DecimalWriter decimal(String name, int scale, int precision) {
+    return writer.decimal(name, scale, precision);
+  }
+
+  @Override
+  public DecimalWriter decimal(String name) {
+    return writer.decimal(name);
+  }
+
+
+  // Decimal256 accessors follow the same pattern as the decimal ones.
+  @Override
+  public Decimal256Writer decimal256() {
+    return this;
+  }
+
+  @Override
+  public Decimal256Writer decimal256(String name, int scale, int precision) {
+    return writer.decimal256(name, scale, precision);
+  }
+
+  @Override
+  public Decimal256Writer decimal256(String name) {
+    return writer.decimal256(name);
+  }
+
+  /** Sets the inStruct flag and returns this writer. */
+  @Override
+  public StructWriter struct() {
+    inStruct = true;
+    return this;
+  }
+
+  /** Returns the child writer. */
+  @Override
+  public ListWriter list() {
+    return writer;
+  }
+
+  /** Delegates to the child writer. */
+  @Override
+  public ListWriter list(String name) {
+    ListWriter listWriter = writer.list(name);
+    return listWriter;
+  }
+
+  /** Delegates to the child writer. */
+  @Override
+  public StructWriter struct(String name) {
+    StructWriter structWriter = writer.struct(name);
+    return structWriter;
+  }
+
+  /** Returns the child writer. */
+  @Override
+  public MapWriter map() {
+    return writer;
+  }
+
+  /** Delegates to the child writer. */
+  @Override
+  public MapWriter map(String name) {
+    MapWriter mapWriter = writer.map(name);
+    return mapWriter;
+  }
+
+  /** Calls map(keysSorted) on the child writer and returns the child writer itself. */
+  @Override
+  public MapWriter map(boolean keysSorted) {
+    writer.map(keysSorted);
+    return writer;
+  }
+
+  /** Delegates to the child writer. */
+  @Override
+  public MapWriter map(String name, boolean keysSorted) {
+    MapWriter mapWriter = writer.map(name, keysSorted);
+    return mapWriter;
+  }
+
+  /** Starts a new list value at idx() and moves the child writer to the returned start index. */
+  @Override
+  public void startList() {
+    int start = vector.startNewValue(idx());
+    writer.setPosition(start);
+  }
+
+  /** Advances this writer to the next list slot. */
+  @Override
+  public void endList() {
+    setPosition(idx() + 1);
+  }
+
+  /** Delegates to the child writer. */
+  @Override
+  public void start() {
+    writer.start();
+  }
+
+  /** Delegates to the child writer and resets the inStruct flag. */
+  @Override
+  public void end() {
+    writer.end();
+    inStruct = false;
+  }
+
+  // Each write method below repeats the same guard: fail once the child position has
+  // reached the end of the current list slot, otherwise write and advance the child by one.
+  @Override
+  public void write(DecimalHolder holder) {
+    if (writer.idx() >= (idx() + 1) * listSize) {
+      throw new IllegalStateException(String.format("values at index %s is greater than listSize %s", idx(), listSize));
+    }
+    writer.write(holder);
+    writer.setPosition(writer.idx() + 1);
+  }
+
+  @Override
+  public void write(Decimal256Holder holder) {
+    if (writer.idx() >= (idx() + 1) * listSize) {
+      throw new IllegalStateException(String.format("values at index %s is greater than listSize %s", idx(), listSize));
+    }
+    writer.write(holder);
+    writer.setPosition(writer.idx() + 1);
+  }
+
+
+  /**
+   * Writes a null element after the same bounds check.
+   * NOTE(review): unlike the other write methods, the child position is not advanced here.
+   */
+  @Override
+  public void writeNull() {
+    if (writer.idx() >= (idx() + 1) * listSize) {
+      throw new IllegalStateException(String.format("values at index %s is greater than listSize %s", idx(), listSize));
+    }
+    writer.writeNull();
+  }
+
+  public void writeDecimal(long start, ArrowBuf buffer, ArrowType arrowType) {
+    if (writer.idx() >= (idx() + 1) * listSize) {
+      throw new IllegalStateException(String.format("values at index %s is greater than listSize %s", idx(), listSize));
+    }
+    writer.writeDecimal(start, buffer, arrowType);
+    writer.setPosition(writer.idx() + 1);
+  }
+
+  public void writeDecimal(BigDecimal value) {
+    if (writer.idx() >= (idx() + 1) * listSize) {
+      throw new IllegalStateException(String.format("values at index %s is greater than listSize %s", idx(), listSize));
+    }
+    writer.writeDecimal(value);
+    writer.setPosition(writer.idx() + 1);
+  }
+
+  public void writeBigEndianBytesToDecimal(byte[] value, ArrowType arrowType) {
+    if (writer.idx() >= (idx() + 1) * listSize) {
+      throw new IllegalStateException(String.format("values at index %s is greater than listSize %s", idx(), listSize));
+    }
+    writer.writeBigEndianBytesToDecimal(value, arrowType);
+    writer.setPosition(writer.idx() + 1);
+  }
+
+  public void writeDecimal256(long start, ArrowBuf buffer, ArrowType arrowType) {
+    if (writer.idx() >= (idx() + 1) * listSize) {
+      throw new IllegalStateException(String.format("values at index %s is greater than listSize %s", idx(), listSize));
+    }
+    writer.writeDecimal256(start, buffer, arrowType);
+    writer.setPosition(writer.idx() + 1);
+  }
+
+  public void writeDecimal256(BigDecimal value) {
+    if (writer.idx() >= (idx() + 1) * listSize) {
+      throw new IllegalStateException(String.format("values at index %s is greater than listSize %s", idx(), listSize));
+    }
+    writer.writeDecimal256(value);
+    writer.setPosition(writer.idx() + 1);
+  }
+
+  public void writeBigEndianBytesToDecimal256(byte[] value, ArrowType arrowType) {
+    if (writer.idx() >= (idx() + 1) * listSize) {
+      throw new IllegalStateException(String.format("values at index %s is greater than listSize %s", idx(), listSize));
+    }
+    writer.writeBigEndianBytesToDecimal256(value, arrowType);
+    writer.setPosition(writer.idx() + 1);
+  }
+
+
+  <#list vv.types as type>
+    <#list type.minor as minor>
+      <#assign name = minor.class?cap_first />
+      <#assign fields = minor.fields!type.fields />
+      <#assign uncappedName = name?uncap_first/>
+      <#if !minor.typeParams?? >
+  // Generated per minor type without type parameters; same guard-write-advance sequence as above.
+  @Override
+  public void write${name}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) {
+    if (writer.idx() >= (idx() + 1) * listSize) {
+      throw new IllegalStateException(String.format("values at index %s is greater than listSize %s", idx(), listSize));
+    }
+    writer.write${name}(<#list fields as field>${field.name}<#if field_has_next>, );
+    writer.setPosition(writer.idx() + 1);
+  }
+
+  // Holder overload: unpacks the holder's fields and writes them through the child writer.
+  public void write(${name}Holder holder) {
+    if (writer.idx() >= (idx() + 1) * listSize) {
+      throw new IllegalStateException(String.format("values at index %s is greater than listSize %s", idx(), listSize));
+    }
+    writer.write${name}(<#list fields as field>holder.${field.name}<#if field_has_next>, );
+    writer.setPosition(writer.idx() + 1);
+  }
+
+
+
+
+}
diff --git a/src/arrow/java/vector/src/main/codegen/templates/UnionListWriter.java b/src/arrow/java/vector/src/main/codegen/templates/UnionListWriter.java
new file mode 100644
index 000000000..926276b5e
--- /dev/null
+++ b/src/arrow/java/vector/src/main/codegen/templates/UnionListWriter.java
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.complex.writer.Decimal256Writer; +import org.apache.arrow.vector.complex.writer.DecimalWriter; +import org.apache.arrow.vector.holders.Decimal256Holder; +import org.apache.arrow.vector.holders.DecimalHolder; + + +import java.lang.UnsupportedOperationException; +import java.math.BigDecimal; + +<@pp.dropOutputFile /> +<#list ["List", "LargeList"] as listName> + +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/Union${listName}Writer.java" /> + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; +<#include "/@includes/vv_imports.ftl" /> + +/* + * This class is generated using freemarker and the ${.template_name} template. + */ + +/** + * Field writer for a ${listName}Vector. Element values are written through a PromotableWriter + * created over the vector's data vector; every element write advances that inner writer by one. + */ +@SuppressWarnings("unused") +public class Union${listName}Writer extends AbstractFieldWriter { + + protected ${listName}Vector vector; + protected PromotableWriter writer; + private boolean inStruct = false; + private boolean listStarted = false; + private String structName; + <#if listName == "LargeList"> + private static final long OFFSET_WIDTH = 8; + <#else> + private static final int OFFSET_WIDTH = 4; + </#if> + + public Union${listName}Writer(${listName}Vector vector) { + this(vector, NullableStructWriterFactory.getNullableStructWriterFactoryInstance()); + } + + public Union${listName}Writer(${listName}Vector vector, NullableStructWriterFactory nullableStructWriterFactory) { + this.vector = vector; + this.writer = new PromotableWriter(vector.getDataVector(), vector, nullableStructWriterFactory); + } + + /** The parent argument is not used; this delegates to the single-argument constructor. */ + public Union${listName}Writer(${listName}Vector vector, AbstractFieldWriter parent) { + this(vector); + } + + @Override + public void allocate() { + vector.allocateNew(); + } + + @Override + public void clear() { + vector.clear(); + } + + @Override + public Field getField() { + return vector.getField(); + } + + public void 
setValueCount(int count) { + vector.setValueCount(count); + } + + @Override + public int getValueCapacity() { + return vector.getValueCapacity(); + } + + /** + * Closes the vector and then the inner writer. The inner writer is closed even when closing the + * vector throws. + */ + @Override + public void close() throws Exception { + try { + vector.close(); + } finally { + writer.close(); + } + } + + @Override + public void setPosition(int index) { + super.setPosition(index); + } + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if uncappedName == "int" ><#assign uncappedName = "integer" /></#if> + <#if !minor.typeParams?? > + + @Override + public ${name}Writer ${uncappedName}() { + return this; + } + + @Override + public ${name}Writer ${uncappedName}(String name) { + structName = name; + return writer.${uncappedName}(name); + } + </#if> + </#list></#list> + + @Override + public DecimalWriter decimal() { + return this; + } + + @Override + public DecimalWriter decimal(String name, int scale, int precision) { + return writer.decimal(name, scale, precision); + } + + @Override + public DecimalWriter decimal(String name) { + return writer.decimal(name); + } + + @Override + public Decimal256Writer decimal256() { + return this; + } + + @Override + public Decimal256Writer decimal256(String name, int scale, int precision) { + return writer.decimal256(name, scale, precision); + } + + @Override + public Decimal256Writer decimal256(String name) { + return writer.decimal256(name); + } + + + @Override + public StructWriter struct() { + inStruct = true; + return this; + } + + @Override + public ListWriter list() { + return writer; + } + + @Override + public ListWriter list(String name) { + ListWriter listWriter = writer.list(name); + return listWriter; + } + + @Override + public StructWriter struct(String name) { + StructWriter structWriter = writer.struct(name); + return structWriter; + } + + @Override + public MapWriter map() { + return writer; + } + + @Override + public MapWriter map(String name) { + MapWriter 
mapWriter = writer.map(name); + return mapWriter; + } + + @Override + public MapWriter map(boolean keysSorted) { + writer.map(keysSorted); + return writer; + } + + @Override + public MapWriter map(String name, boolean keysSorted) { + MapWriter mapWriter = writer.map(name, keysSorted); + return mapWriter; + } + + <#if listName == "LargeList"> + @Override + public void startList() { + vector.startNewValue(idx()); + writer.setPosition(checkedCastToInt(vector.getOffsetBuffer().getLong((idx() + 1L) * OFFSET_WIDTH))); + listStarted = true; + } + + @Override + public void endList() { + vector.getOffsetBuffer().setLong((idx() + 1L) * OFFSET_WIDTH, writer.idx()); + setPosition(idx() + 1); + listStarted = false; + } + <#else> + @Override + public void startList() { + vector.startNewValue(idx()); + writer.setPosition(vector.getOffsetBuffer().getInt((idx() + 1L) * OFFSET_WIDTH)); + listStarted = true; + } + + @Override + public void endList() { + vector.getOffsetBuffer().setInt((idx() + 1L) * OFFSET_WIDTH, writer.idx()); + setPosition(idx() + 1); + listStarted = false; + } + </#if> + + @Override + public void start() { + writer.start(); + } + + @Override + public void end() { + writer.end(); + inStruct = false; + } + + @Override + public void write(DecimalHolder holder) { + writer.write(holder); + writer.setPosition(writer.idx()+1); + } + + @Override + public void write(Decimal256Holder holder) { + writer.write(holder); + writer.setPosition(writer.idx()+1); + } + + /** + * Marks the current list slot null when no list has been started; otherwise writes a null + * through the inner element writer. + */ + @Override + public void writeNull() { + if (!listStarted){ + vector.setNull(idx()); + } else { + writer.writeNull(); + } + } + + public void writeDecimal(long start, ArrowBuf buffer, ArrowType arrowType) { + writer.writeDecimal(start, buffer, arrowType); + writer.setPosition(writer.idx()+1); + } + + public void writeDecimal(long start, ArrowBuf buffer) { + writer.writeDecimal(start, buffer); + writer.setPosition(writer.idx()+1); + } + + public void writeDecimal(BigDecimal value) { + writer.writeDecimal(value); + 
writer.setPosition(writer.idx()+1); + } + + public void writeBigEndianBytesToDecimal(byte[] value, ArrowType arrowType){ + writer.writeBigEndianBytesToDecimal(value, arrowType); + writer.setPosition(writer.idx() + 1); + } + + public void writeDecimal256(long start, ArrowBuf buffer, ArrowType arrowType) { + writer.writeDecimal256(start, buffer, arrowType); + writer.setPosition(writer.idx()+1); + } + + public void writeDecimal256(long start, ArrowBuf buffer) { + writer.writeDecimal256(start, buffer); + writer.setPosition(writer.idx()+1); + } + + public void writeDecimal256(BigDecimal value) { + writer.writeDecimal256(value); + writer.setPosition(writer.idx()+1); + } + + public void writeBigEndianBytesToDecimal256(byte[] value, ArrowType arrowType){ + writer.writeBigEndianBytesToDecimal256(value, arrowType); + writer.setPosition(writer.idx() + 1); + } + + + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.typeParams?? > + @Override + public void write${name}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, </#if></#list>) { + writer.write${name}(<#list fields as field>${field.name}<#if field_has_next>, </#if></#list>); + writer.setPosition(writer.idx()+1); + } + + public void write(${name}Holder holder) { + writer.write${name}(<#list fields as field>holder.${field.name}<#if field_has_next>, </#if></#list>); + writer.setPosition(writer.idx()+1); + } + + </#if> + </#list> + </#list> +} +</#list> + + diff --git a/src/arrow/java/vector/src/main/codegen/templates/UnionMapWriter.java b/src/arrow/java/vector/src/main/codegen/templates/UnionMapWriter.java new file mode 100644 index 000000000..606f88037 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/UnionMapWriter.java @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.complex.writer.Decimal256Writer; +import org.apache.arrow.vector.complex.writer.DecimalWriter; +import org.apache.arrow.vector.holders.Decimal256Holder; +import org.apache.arrow.vector.holders.DecimalHolder; + +import java.lang.UnsupportedOperationException; +import java.math.BigDecimal; + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/UnionMapWriter.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +/* + * This class is generated using freemarker and the ${.template_name} template. + */ + +/** + *

Writer for MapVectors. This extends UnionListWriter to simplify writing map entries to a list + * of struct elements, with "key" and "value" fields. The procedure for writing a map begins with + * {@link #startMap()} followed by {@link #startEntry()}. An entry is written by using the + * {@link #key()} writer to write the key, then the {@link #value()} writer to write a value. After + * writing the value, call {@link #endEntry()} to complete the entry. Each map can have 1 or more + * entries. When done writing entries, call {@link #endMap()} to complete the map. + * + *

<p>NOTE: the MapVector can have NULL values by not writing to a position. If a map is started with + * {@link #startMap()}, then it must have a key written. The value of a map entry can be NULL by + * not using the {@link #value()} writer. + * + *

<p>Example of writing the following map to position 5 of a vector: + *

<pre>{@code
+ *   // {
+ *   //   1 -> 3,
+ *   //   2 -> 4,
+ *   //   3 -> NULL
+ *   // }
+ *
+ *   UnionMapWriter writer = ...
+ *
+ *   writer.setPosition(5);
+ *   writer.startMap();
+ *   writer.startEntry();
+ *   writer.key().integer().writeInt(1);
+ *   writer.value().integer().writeInt(3);
+ *   writer.endEntry();
+ *   writer.startEntry();
+ *   writer.key().integer().writeInt(2);
+ *   writer.value().integer().writeInt(4);
+ *   writer.endEntry();
+ *   writer.startEntry();
+ *   writer.key().integer().writeInt(3);
+ *   writer.endEntry();
+ *   writer.endMap();
+ * }</pre>
+ *

+ */ +@SuppressWarnings("unused") +public class UnionMapWriter extends UnionListWriter { + + /** + * Current mode for writing map entries, set by calling {@link #key()} or {@link #value()} + * and reset with a call to {@link #endEntry()}. With KEY mode, a struct writer with field + * named "key" is returned. With VALUE mode, a struct writer with field named "value" is + * returned. In OFF mode, the writer will behave like a standard UnionListWriter + */ + private enum MapWriteMode { + OFF, + KEY, + VALUE, + } + + private MapWriteMode mode = MapWriteMode.OFF; + private StructWriter entryWriter; + + public UnionMapWriter(MapVector vector) { + super(vector); + entryWriter = struct(); + } + + /** Start writing a map that consists of 1 or more entries. */ + public void startMap() { + startList(); + } + + /** Complete the map. */ + public void endMap() { + endList(); + } + + /** + * Start a map entry that should be followed by calls to {@link #key()} and {@link #value()} + * writers. Call {@link #endEntry()} to complete the entry. + */ + public void startEntry() { + writer.setAddVectorAsNullable(false); + entryWriter.start(); + } + + /** Complete the map entry. */ + public void endEntry() { + entryWriter.end(); + mode = MapWriteMode.OFF; + writer.setAddVectorAsNullable(true); + } + + /** Return the key writer that is used to write to the "key" field. */ + public UnionMapWriter key() { + writer.setAddVectorAsNullable(false); + mode = MapWriteMode.KEY; + return this; + } + + /** Return the value writer that is used to write to the "value" field. */ + public UnionMapWriter value() { + writer.setAddVectorAsNullable(true); + mode = MapWriteMode.VALUE; + return this; + } + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if uncappedName == "int" ><#assign uncappedName = "integer" /></#if> + <#if !minor.typeParams?? 
> + /** + * Returns the writer for the entry's key or value field according to the current mode; in OFF + * mode this writer itself is returned. + */ + @Override + public ${name}Writer ${uncappedName}() { + switch (mode) { + case KEY: + return entryWriter.${uncappedName}(MapVector.KEY_NAME); + case VALUE: + return entryWriter.${uncappedName}(MapVector.VALUE_NAME); + default: + return this; + } + } + </#if> + </#list></#list> + + @Override + public DecimalWriter decimal() { + switch (mode) { + case KEY: + return entryWriter.decimal(MapVector.KEY_NAME); + case VALUE: + return entryWriter.decimal(MapVector.VALUE_NAME); + default: + return this; + } + } + + @Override + public Decimal256Writer decimal256() { + switch (mode) { + case KEY: + return entryWriter.decimal256(MapVector.KEY_NAME); + case VALUE: + return entryWriter.decimal256(MapVector.VALUE_NAME); + default: + return this; + } + } + + + @Override + public StructWriter struct() { + switch (mode) { + case KEY: + return entryWriter.struct(MapVector.KEY_NAME); + case VALUE: + return entryWriter.struct(MapVector.VALUE_NAME); + default: + return super.struct(); + } + } + + @Override + public ListWriter list() { + switch (mode) { + case KEY: + return entryWriter.list(MapVector.KEY_NAME); + case VALUE: + return entryWriter.list(MapVector.VALUE_NAME); + default: + return super.list(); + } + } + + /** + * Returns a map writer for the entry's key or value field according to the current mode. In OFF + * mode the call is forwarded to the list writer with the same keysSorted flag. + */ + @Override + public MapWriter map(boolean keysSorted) { + switch (mode) { + case KEY: + return entryWriter.map(MapVector.KEY_NAME, keysSorted); + case VALUE: + return entryWriter.map(MapVector.VALUE_NAME, keysSorted); + default: + return super.map(keysSorted); + } + } +} diff --git a/src/arrow/java/vector/src/main/codegen/templates/UnionReader.java b/src/arrow/java/vector/src/main/codegen/templates/UnionReader.java new file mode 100644 index 000000000..444ca9ca7 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/UnionReader.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/UnionReader.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> +/** + * Reader over a UnionVector that forwards each read to a per-type reader chosen from the type id + * stored at the current position. + * Source code generated using FreeMarker template ${.template_name} + */ +@SuppressWarnings("unused") +public class UnionReader extends AbstractFieldReader { + + /** + * Size of the lookup tables below, which are indexed by type id / MinorType ordinal. + * NOTE(review): must be at least MinorType.values().length, otherwise the static initializer + * below writes out of bounds. + */ + private static final int NUM_SUPPORTED_TYPES = 45; + + private final BaseReader[] readers = new BaseReader[NUM_SUPPORTED_TYPES]; + public UnionVector data; + + public UnionReader(UnionVector data) { + this.data = data; + } + + public MinorType getMinorType() { + return TYPES[data.getTypeValue(idx())]; + } + + private static final MinorType[] TYPES = new MinorType[NUM_SUPPORTED_TYPES]; + + static { + for (MinorType minorType : MinorType.values()) { + TYPES[minorType.ordinal()] = minorType; + } + } + + @Override + public Field getField() { + return data.getField(); + } + + public boolean isSet(){ + return !data.isNull(idx()); + } + + public void read(UnionHolder holder) { + holder.reader = this; + holder.isSet = this.isSet() ? 
1 : 0; + } + + public void read(int index, UnionHolder holder) { + getList().read(index, holder); + } + + /** + * Returns the reader for the type id stored at the given index, reusing a cached reader when one + * exists and creating it otherwise. + */ + private FieldReader getReaderForIndex(int index) { + int typeValue = data.getTypeValue(index); + FieldReader reader = (FieldReader) readers[typeValue]; + if (reader != null) { + return reader; + } + final MinorType minorType = MinorType.values()[typeValue]; + switch (minorType) { + case NULL: + return NullReader.INSTANCE; + case STRUCT: + return (FieldReader) getStruct(); + case LIST: + return (FieldReader) getList(); + case MAP: + return (FieldReader) getMap(); + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.typeParams?? || minor.class?starts_with("Decimal")> + case ${name?upper_case}: + return (FieldReader) get${name}(); + </#if> + </#list> + </#list> + default: + throw new UnsupportedOperationException("Unsupported type: " + minorType); + } + } + + private SingleStructReaderImpl structReader; + + private StructReader getStruct() { + if (structReader == null) { + structReader = (SingleStructReaderImpl) data.getStruct().getReader(); + structReader.setPosition(idx()); + readers[MinorType.STRUCT.ordinal()] = structReader; + } + return structReader; + } + + private UnionListReader listReader; + + private FieldReader getList() { + if (listReader == null) { + listReader = new UnionListReader(data.getList()); + listReader.setPosition(idx()); + readers[MinorType.LIST.ordinal()] = listReader; + } + return listReader; + } + + private UnionMapReader mapReader; + + private FieldReader getMap() { + if (mapReader == null) { + mapReader = new UnionMapReader(data.getMap()); + mapReader.setPosition(idx()); + readers[MinorType.MAP.ordinal()] = mapReader; + } + return mapReader; + } + + @Override + public java.util.Iterator<String> iterator() { + return getStruct().iterator(); + } + + @Override + public void copyAsValue(UnionWriter writer) { + writer.data.copyFrom(idx(), writer.idx(), data); + } + + <#list ["Object", 
"BigDecimal", "Short", "Integer", "Long", "Boolean", + "LocalDateTime", "Duration", "Period", "Double", "Float", + "Character", "Text", "Byte", "byte[]", "PeriodDuration"] as friendlyType> + <#assign safeType=friendlyType /> + <#if safeType=="byte[]"><#assign safeType="ByteArray" /></#if> + + @Override + public ${friendlyType} read${safeType}() { + return getReaderForIndex(idx()).read${safeType}(); + } + + </#list> + + public int size() { + return getReaderForIndex(idx()).size(); + } + + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign uncappedName = name?uncap_first/> + <#assign boxedType = (minor.boxedType!type.boxedType) /> + <#assign javaType = (minor.javaType!type.javaType) /> + <#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> + <#assign safeType=friendlyType /> + <#if safeType=="byte[]"><#assign safeType="ByteArray" /></#if> + <#if !minor.typeParams?? || minor.class?starts_with("Decimal") > + + private ${name}ReaderImpl ${uncappedName}Reader; + + private ${name}ReaderImpl get${name}() { + if (${uncappedName}Reader == null) { + ${uncappedName}Reader = new ${name}ReaderImpl(data.get${name}Vector()); + ${uncappedName}Reader.setPosition(idx()); + readers[MinorType.${name?upper_case}.ordinal()] = ${uncappedName}Reader; + } + return ${uncappedName}Reader; + } + + public void read(Nullable${name}Holder holder){ + getReaderForIndex(idx()).read(holder); + } + + public void copyAsValue(${name}Writer writer){ + getReaderForIndex(idx()).copyAsValue(writer); + } + </#if> + </#list> + </#list> + + @Override + public void copyAsValue(ListWriter writer) { + ComplexCopier.copy(this, (FieldWriter) writer); + } + + /** Moves this reader and every per-type reader created so far to the given index. */ + @Override + public void setPosition(int index) { + super.setPosition(index); + for (BaseReader reader : readers) { + if (reader != null) { + reader.setPosition(index); + } + } + } + + public FieldReader reader(String name){ + return getStruct().reader(name); + } + + public FieldReader reader() { + return getList().reader(); 
+ } + + public boolean next() { + return getReaderForIndex(idx()).next(); + } +} diff --git a/src/arrow/java/vector/src/main/codegen/templates/UnionVector.java b/src/arrow/java/vector/src/main/codegen/templates/UnionVector.java new file mode 100644 index 000000000..1468116c7 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/UnionVector.java @@ -0,0 +1,854 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.ReferenceManager; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.AbstractStructVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.UnionMode; +import org.apache.arrow.vector.compare.RangeEqualsVisitor; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.DataSizeRoundingUtil; + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/UnionVector.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex; + +<#include "/@includes/vv_imports.ftl" /> +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.impl.ComplexCopier; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.ValueVectorUtility; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import 
org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.util.Preconditions; + +import static org.apache.arrow.vector.types.UnionMode.Sparse; +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; + + + +/* + * This class is generated using freemarker and the ${.template_name} template. + */ +@SuppressWarnings("unused") + + +/** + * A vector which can hold values of different types. It does so by using a StructVector which contains a vector for each + * primitive type that is stored. StructVector is used in order to take advantage of its serialization/deserialization methods, + * as well as the addOrGet method. + * + * For performance reasons, UnionVector stores a cached reference to each subtype vector, to avoid having to do the struct lookup + * each time the vector is accessed. + * Source code generated using FreeMarker template ${.template_name} + */ +public class UnionVector extends AbstractContainerVector implements FieldVector { + int valueCount; + + NonNullableStructVector internalStruct; + protected ArrowBuf typeBuffer; + + private StructVector structVector; + private ListVector listVector; + private MapVector mapVector; + + private FieldReader reader; + + private int singleType = 0; + private ValueVector singleVector; + + private int typeBufferAllocationSizeInBytes; + + private final FieldType fieldType; + private final Field[] typeIds = new Field[Byte.MAX_VALUE + 1]; + + public static final byte TYPE_WIDTH = 1; + private static final FieldType INTERNAL_STRUCT_TYPE = new FieldType(false /*nullable*/, + ArrowType.Struct.INSTANCE, null /*dictionary*/, null /*metadata*/); + + public static UnionVector empty(String name, BufferAllocator allocator) { + FieldType fieldType = FieldType.nullable(new ArrowType.Union( + UnionMode.Sparse, null)); + return new UnionVector(name, allocator, 
fieldType, null); + } + + public UnionVector(String name, BufferAllocator allocator, FieldType fieldType, CallBack callBack) { + super(name, allocator, callBack); + this.fieldType = fieldType; + this.internalStruct = new NonNullableStructVector( + "internal", + allocator, + INTERNAL_STRUCT_TYPE, + callBack, + AbstractStructVector.ConflictPolicy.CONFLICT_REPLACE, + false); + this.typeBuffer = allocator.getEmpty(); + this.typeBufferAllocationSizeInBytes = BaseValueVector.INITIAL_VALUE_ALLOCATION * TYPE_WIDTH; + } + + public BufferAllocator getAllocator() { + return allocator; + } + + @Override + public MinorType getMinorType() { + return MinorType.UNION; + } + + @Override + public void initializeChildrenFromFields(List children) { + int count = 0; + for (Field child: children) { + int typeId = Types.getMinorTypeForArrowType(child.getType()).ordinal(); + if (fieldType != null) { + int[] typeIds = ((ArrowType.Union)fieldType.getType()).getTypeIds(); + if (typeIds != null) { + typeId = typeIds[count++]; + } + } + typeIds[typeId] = child; + } + internalStruct.initializeChildrenFromFields(children); + } + + @Override + public List getChildrenFromFields() { + return internalStruct.getChildrenFromFields(); + } + + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + if (ownBuffers.size() != 1) { + throw new IllegalArgumentException("Illegal buffer count, expected 1, got: " + ownBuffers.size()); + } + ArrowBuf buffer = ownBuffers.get(0); + typeBuffer.getReferenceManager().release(); + typeBuffer = buffer.getReferenceManager().retain(buffer, allocator); + typeBufferAllocationSizeInBytes = checkedCastToInt(typeBuffer.capacity()); + this.valueCount = fieldNode.getLength(); + } + + @Override + public List getFieldBuffers() { + List result = new ArrayList<>(1); + setReaderAndWriterIndex(); + result.add(typeBuffer); + + return result; + } + + private void setReaderAndWriterIndex() { + typeBuffer.readerIndex(0); + typeBuffer.writerIndex(valueCount 
* TYPE_WIDTH); + } + + /** + * Get the inner vectors. + * + * @deprecated This API will be removed as the current implementations no longer support inner vectors. + * + * @return the inner vectors for this field as defined by the TypeLayout + */ + @Deprecated + @Override + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. Use geFieldBuffers"); + } + + private String fieldName(MinorType type) { + return type.name().toLowerCase(); + } + + private FieldType fieldType(MinorType type) { + return FieldType.nullable(type.getType()); + } + + private T addOrGet(Types.MinorType minorType, Class c) { + return addOrGet(null, minorType, c); + } + + private T addOrGet(String name, Types.MinorType minorType, ArrowType arrowType, Class c) { + return internalStruct.addOrGet(name == null ? fieldName(minorType) : name, FieldType.nullable(arrowType), c); + } + + private T addOrGet(String name, Types.MinorType minorType, Class c) { + return internalStruct.addOrGet(name == null ? 
fieldName(minorType) : name, fieldType(minorType), c); + } + + + @Override + public long getValidityBufferAddress() { + throw new UnsupportedOperationException(); + } + + public long getTypeBufferAddress() { + return typeBuffer.memoryAddress(); + } + + @Override + public long getDataBufferAddress() { + throw new UnsupportedOperationException(); + } + + @Override + public long getOffsetBufferAddress() { + throw new UnsupportedOperationException(); + } + + public ArrowBuf getTypeBuffer() { + return typeBuffer; + } + + @Override + public ArrowBuf getValidityBuffer() { throw new UnsupportedOperationException(); } + + @Override + public ArrowBuf getDataBuffer() { throw new UnsupportedOperationException(); } + + @Override + public ArrowBuf getOffsetBuffer() { throw new UnsupportedOperationException(); } + + public StructVector getStruct() { + if (structVector == null) { + int vectorCount = internalStruct.size(); + structVector = addOrGet(MinorType.STRUCT, StructVector.class); + if (internalStruct.size() > vectorCount) { + structVector.allocateNew(); + if (callBack != null) { + callBack.doWork(); + } + } + } + return structVector; + } + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#assign lowerCaseName = name?lower_case/> + <#if !minor.typeParams?? 
|| minor.class?starts_with("Decimal") > + + private ${name}Vector ${uncappedName}Vector; + + public ${name}Vector get${name}Vector(<#if minor.class?starts_with("Decimal")> ArrowType arrowType) { + return get${name}Vector(null<#if minor.class?starts_with("Decimal")>, arrowType); + } + + public ${name}Vector get${name}Vector(String name<#if minor.class?starts_with("Decimal")>, ArrowType arrowType) { + if (${uncappedName}Vector == null) { + int vectorCount = internalStruct.size(); + ${uncappedName}Vector = addOrGet(name, MinorType.${name?upper_case},<#if minor.class?starts_with("Decimal")> arrowType, ${name}Vector.class); + if (internalStruct.size() > vectorCount) { + ${uncappedName}Vector.allocateNew(); + if (callBack != null) { + callBack.doWork(); + } + } + } + return ${uncappedName}Vector; + } + <#if minor.class?starts_with("Decimal")> + public ${name}Vector get${name}Vector() { + if (${uncappedName}Vector == null) { + throw new IllegalArgumentException("No ${uncappedName} present. Provide ArrowType argument to create a new vector"); + } + return ${uncappedName}Vector; + } + + + + + + public ListVector getList() { + if (listVector == null) { + int vectorCount = internalStruct.size(); + listVector = addOrGet(MinorType.LIST, ListVector.class); + if (internalStruct.size() > vectorCount) { + listVector.allocateNew(); + if (callBack != null) { + callBack.doWork(); + } + } + } + return listVector; + } + + public MapVector getMap() { + if (mapVector == null) { + throw new IllegalArgumentException("No map present. 
Provide ArrowType argument to create a new vector"); + } + return mapVector; + } + + public MapVector getMap(ArrowType arrowType) { + return getMap(null, arrowType); + } + + public MapVector getMap(String name, ArrowType arrowType) { + if (mapVector == null) { + int vectorCount = internalStruct.size(); + mapVector = addOrGet(name, MinorType.MAP, arrowType, MapVector.class); + if (internalStruct.size() > vectorCount) { + mapVector.allocateNew(); + if (callBack != null) { + callBack.doWork(); + } + } + } + return mapVector; + } + + public int getTypeValue(int index) { + return typeBuffer.getByte(index * TYPE_WIDTH); + } + + @Override + public void allocateNew() throws OutOfMemoryException { + /* new allocation -- clear the current buffers */ + clear(); + internalStruct.allocateNew(); + try { + allocateTypeBuffer(); + } catch (Exception e) { + clear(); + throw e; + } + } + + @Override + public boolean allocateNewSafe() { + /* new allocation -- clear the current buffers */ + clear(); + boolean safe = internalStruct.allocateNewSafe(); + if (!safe) { return false; } + try { + allocateTypeBuffer(); + } catch (Exception e) { + clear(); + return false; + } + + return true; + } + + private void allocateTypeBuffer() { + typeBuffer = allocator.buffer(typeBufferAllocationSizeInBytes); + typeBuffer.readerIndex(0); + typeBuffer.setZero(0, typeBuffer.capacity()); + } + + @Override + public void reAlloc() { + internalStruct.reAlloc(); + reallocTypeBuffer(); + } + + private void reallocTypeBuffer() { + final long currentBufferCapacity = typeBuffer.capacity(); + long newAllocationSize = currentBufferCapacity * 2; + if (newAllocationSize == 0) { + if (typeBufferAllocationSizeInBytes > 0) { + newAllocationSize = typeBufferAllocationSizeInBytes; + } else { + newAllocationSize = BaseValueVector.INITIAL_VALUE_ALLOCATION * TYPE_WIDTH * 2; + } + } + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + assert newAllocationSize >= 1; + + if (newAllocationSize > 
BaseValueVector.MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer(checkedCastToInt(newAllocationSize)); + newBuf.setBytes(0, typeBuffer, 0, currentBufferCapacity); + newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + typeBuffer.getReferenceManager().release(1); + typeBuffer = newBuf; + typeBufferAllocationSizeInBytes = (int)newAllocationSize; + } + + @Override + public void setInitialCapacity(int numRecords) { } + + @Override + public int getValueCapacity() { + return Math.min(getTypeBufferValueCapacity(), internalStruct.getValueCapacity()); + } + + @Override + public void close() { + clear(); + } + + @Override + public void clear() { + valueCount = 0; + typeBuffer.getReferenceManager().release(); + typeBuffer = allocator.getEmpty(); + internalStruct.clear(); + } + + @Override + public void reset() { + valueCount = 0; + typeBuffer.setZero(0, typeBuffer.capacity()); + internalStruct.reset(); + } + + @Override + public Field getField() { + List childFields = new ArrayList<>(); + List children = internalStruct.getChildren(); + int[] typeIds = new int[children.size()]; + for (ValueVector v : children) { + typeIds[childFields.size()] = v.getMinorType().ordinal(); + childFields.add(v.getField()); + } + + FieldType fieldType; + if (this.fieldType == null) { + fieldType = FieldType.nullable(new ArrowType.Union(Sparse, typeIds)); + } else { + final UnionMode mode = ((ArrowType.Union)this.fieldType.getType()).getMode(); + fieldType = new FieldType(this.fieldType.isNullable(), new ArrowType.Union(mode, typeIds), + this.fieldType.getDictionary(), this.fieldType.getMetadata()); + } + + return new Field(name, fieldType, childFields); + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return getTransferPair(name, allocator); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) 
{ + return getTransferPair(ref, allocator, null); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return new org.apache.arrow.vector.complex.UnionVector.TransferImpl(ref, allocator, callBack); + } + + @Override + public TransferPair makeTransferPair(ValueVector target) { + return new TransferImpl((UnionVector) target); + } + + @Override + public void copyFrom(int inIndex, int outIndex, ValueVector from) { + Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); + UnionVector fromCast = (UnionVector) from; + fromCast.getReader().setPosition(inIndex); + getWriter().setPosition(outIndex); + ComplexCopier.copy(fromCast.reader, writer); + } + + @Override + public void copyFromSafe(int inIndex, int outIndex, ValueVector from) { + copyFrom(inIndex, outIndex, from); + } + + public FieldVector addVector(FieldVector v) { + final String name = v.getName().isEmpty() ? fieldName(v.getMinorType()) : v.getName(); + Preconditions.checkState(internalStruct.getChild(name) == null, String.format("%s vector already exists", name)); + final FieldVector newVector = internalStruct.addOrGet(name, v.getField().getFieldType(), v.getClass()); + v.makeTransferPair(newVector).transfer(); + internalStruct.putChild(name, newVector); + if (callBack != null) { + callBack.doWork(); + } + return newVector; + } + + /** + * Directly put a vector to internalStruct without creating a new one with same type. 
+ */ + public void directAddVector(FieldVector v) { + String name = fieldName(v.getMinorType()); + Preconditions.checkState(internalStruct.getChild(name) == null, String.format("%s vector already exists", name)); + internalStruct.putChild(name, v); + if (callBack != null) { + callBack.doWork(); + } + } + + private class TransferImpl implements TransferPair { + private final TransferPair internalStructVectorTransferPair; + private final UnionVector to; + + public TransferImpl(String name, BufferAllocator allocator, CallBack callBack) { + to = new UnionVector(name, allocator, /* field type */ null, callBack); + internalStructVectorTransferPair = internalStruct.makeTransferPair(to.internalStruct); + } + + public TransferImpl(UnionVector to) { + this.to = to; + internalStructVectorTransferPair = internalStruct.makeTransferPair(to.internalStruct); + } + + @Override + public void transfer() { + to.clear(); + ReferenceManager refManager = typeBuffer.getReferenceManager(); + to.typeBuffer = refManager.transferOwnership(typeBuffer, to.allocator).getTransferredBuffer(); + internalStructVectorTransferPair.transfer(); + to.valueCount = valueCount; + clear(); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount, + "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); + to.clear(); + + internalStructVectorTransferPair.splitAndTransfer(startIndex, length); + final int startPoint = startIndex * TYPE_WIDTH; + final int sliceLength = length * TYPE_WIDTH; + final ArrowBuf slicedBuffer = typeBuffer.slice(startPoint, sliceLength); + final ReferenceManager refManager = slicedBuffer.getReferenceManager(); + to.typeBuffer = refManager.transferOwnership(slicedBuffer, to.allocator).getTransferredBuffer(); + to.setValueCount(length); + } + + @Override + public ValueVector getTo() { + return to; + } + + @Override + public 
void copyValueSafe(int from, int to) { + this.to.copyFrom(from, to, UnionVector.this); + } + } + + @Override + public FieldReader getReader() { + if (reader == null) { + reader = new UnionReader(this); + } + return reader; + } + + public FieldWriter getWriter() { + if (writer == null) { + writer = new UnionWriter(this); + } + return writer; + } + + @Override + public int getBufferSize() { + if (valueCount == 0) { return 0; } + + return (valueCount * TYPE_WIDTH) + internalStruct.getBufferSize(); + } + + @Override + public int getBufferSizeFor(final int valueCount) { + if (valueCount == 0) { + return 0; + } + + long bufferSize = 0; + for (final ValueVector v : (Iterable) this) { + bufferSize += v.getBufferSizeFor(valueCount); + } + + return (int) bufferSize + (valueCount * TYPE_WIDTH); + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + List list = new java.util.ArrayList<>(); + setReaderAndWriterIndex(); + if (getBufferSize() != 0) { + list.add(typeBuffer); + list.addAll(java.util.Arrays.asList(internalStruct.getBuffers(clear))); + } + if (clear) { + valueCount = 0; + typeBuffer.getReferenceManager().retain(); + typeBuffer.getReferenceManager().release(); + typeBuffer = allocator.getEmpty(); + } + return list.toArray(new ArrowBuf[list.size()]); + } + + @Override + public Iterator iterator() { + return internalStruct.iterator(); + } + + public ValueVector getVector(int index) { + return getVector(index, null); + } + + public ValueVector getVector(int index, ArrowType arrowType) { + int type = typeBuffer.getByte(index * TYPE_WIDTH); + return getVectorByType(type, arrowType); + } + + public ValueVector getVectorByType(int typeId) { + return getVectorByType(typeId, null); + } + + public ValueVector getVectorByType(int typeId, ArrowType arrowType) { + Field type = typeIds[typeId]; + Types.MinorType minorType; + String name = null; + if (type == null) { + minorType = Types.MinorType.values()[typeId]; + } else { + minorType = 
Types.getMinorTypeForArrowType(type.getType()); + name = type.getName(); + } + switch (minorType) { + case NULL: + return null; + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.typeParams?? || minor.class?starts_with("Decimal") > + case ${name?upper_case}: + return get${name}Vector(name<#if minor.class?starts_with("Decimal")>, arrowType); + + + + case STRUCT: + return getStruct(); + case LIST: + return getList(); + case MAP: + return getMap(name, arrowType); + default: + throw new UnsupportedOperationException("Cannot support type: " + MinorType.values()[typeId]); + } + } + + public Object getObject(int index) { + ValueVector vector = getVector(index); + if (vector != null) { + return vector.isNull(index) ? null : vector.getObject(index); + } + return null; + } + + public byte[] get(int index) { + return null; + } + + public void get(int index, ComplexHolder holder) { + } + + public void get(int index, UnionHolder holder) { + FieldReader reader = new UnionReader(UnionVector.this); + reader.setPosition(index); + holder.reader = reader; + } + + public int getValueCount() { + return valueCount; + } + + /** + * IMPORTANT: Union types always return non null as there is no validity buffer. + * + * To check validity correctly you must check the underlying vector. + */ + public boolean isNull(int index) { + return false; + } + + @Override + public int getNullCount() { + return 0; + } + + public int isSet(int index) { + return isNull(index) ? 
0 : 1; + } + + UnionWriter writer; + + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + while (valueCount > getTypeBufferValueCapacity()) { + reallocTypeBuffer(); + } + internalStruct.setValueCount(valueCount); + } + + public void setSafe(int index, UnionHolder holder) { + setSafe(index, holder, null); + } + + public void setSafe(int index, UnionHolder holder, ArrowType arrowType) { + FieldReader reader = holder.reader; + if (writer == null) { + writer = new UnionWriter(UnionVector.this); + } + writer.setPosition(index); + MinorType type = reader.getMinorType(); + switch (type) { + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.typeParams?? || minor.class?starts_with("Decimal") > + case ${name?upper_case}: + Nullable${name}Holder ${uncappedName}Holder = new Nullable${name}Holder(); + reader.read(${uncappedName}Holder); + setSafe(index, ${uncappedName}Holder<#if minor.class?starts_with("Decimal")>, arrowType); + break; + + + + case STRUCT: { + ComplexCopier.copy(reader, writer); + break; + } + case LIST: { + ComplexCopier.copy(reader, writer); + break; + } + default: + throw new UnsupportedOperationException(); + } + } + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.typeParams?? 
|| minor.class?starts_with("Decimal") > + public void setSafe(int index, Nullable${name}Holder holder<#if minor.class?starts_with("Decimal")>, ArrowType arrowType) { + setType(index, MinorType.${name?upper_case}); + get${name}Vector(null<#if minor.class?starts_with("Decimal")>, arrowType).setSafe(index, holder); + } + + + + + + public void setType(int index, MinorType type) { + while (index >= getTypeBufferValueCapacity()) { + reallocTypeBuffer(); + } + typeBuffer.setByte(index * TYPE_WIDTH , (byte) type.ordinal()); + } + + private int getTypeBufferValueCapacity() { + return capAtMaxInt(typeBuffer.capacity() / TYPE_WIDTH); + } + + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + ValueVector vec = getVector(index); + if (vec == null) { + return ArrowBufPointer.NULL_HASH_CODE; + } + return vec.hashCode(index, hasher); + } + + @Override + public OUT accept(VectorVisitor visitor, IN value) { + return visitor.visit(this, value); + } + + @Override + public String getName() { + return name; + } + + @Override + public String toString() { + return ValueVectorUtility.getToString(this, 0, getValueCount()); + } + + @Override + public T addOrGet(String name, FieldType fieldType, Class clazz) { + return internalStruct.addOrGet(name, fieldType, clazz); + } + + @Override + public T getChild(String name, Class clazz) { + return internalStruct.getChild(name, clazz); + } + + @Override + public VectorWithOrdinal getChildVectorWithOrdinal(String name) { + return internalStruct.getChildVectorWithOrdinal(name); + } + + @Override + public int size() { + return internalStruct.size(); + } + + @Override + public void setInitialCapacity(int valueCount, double density) { + for (final ValueVector vector : internalStruct) { + if (vector instanceof DensityAwareVector) { + ((DensityAwareVector) vector).setInitialCapacity(valueCount, density); + } else { + vector.setInitialCapacity(valueCount); 
+ } + } + } +} diff --git a/src/arrow/java/vector/src/main/codegen/templates/UnionWriter.java b/src/arrow/java/vector/src/main/codegen/templates/UnionWriter.java new file mode 100644 index 000000000..fc4fd7dd7 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/UnionWriter.java @@ -0,0 +1,364 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.complex.impl.NullableStructWriterFactory; +import org.apache.arrow.vector.types.Types; + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/UnionWriter.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> +import org.apache.arrow.vector.complex.writer.BaseWriter; +import org.apache.arrow.vector.types.Types.MinorType; + +/* + * This class is generated using freemarker and the ${.template_name} template. 
+ */ +@SuppressWarnings("unused") +public class UnionWriter extends AbstractFieldWriter implements FieldWriter { + + UnionVector data; + private StructWriter structWriter; + private UnionListWriter listWriter; + private UnionMapWriter mapWriter; + private List writers = new java.util.ArrayList<>(); + private final NullableStructWriterFactory nullableStructWriterFactory; + + public UnionWriter(UnionVector vector) { + this(vector, NullableStructWriterFactory.getNullableStructWriterFactoryInstance()); + } + + public UnionWriter(UnionVector vector, NullableStructWriterFactory nullableStructWriterFactory) { + data = vector; + this.nullableStructWriterFactory = nullableStructWriterFactory; + } + + @Override + public void setPosition(int index) { + super.setPosition(index); + for (BaseWriter writer : writers) { + writer.setPosition(index); + } + } + + + @Override + public void start() { + data.setType(idx(), MinorType.STRUCT); + getStructWriter().start(); + } + + @Override + public void end() { + getStructWriter().end(); + } + + @Override + public void startList() { + getListWriter().startList(); + data.setType(idx(), MinorType.LIST); + } + + @Override + public void endList() { + getListWriter().endList(); + } + + @Override + public void startMap() { + getMapWriter().startMap(); + data.setType(idx(), MinorType.MAP); + } + + @Override + public void endMap() { + getMapWriter().endMap(); + } + + @Override + public void startEntry() { + getMapWriter().startEntry(); + } + + @Override + public MapWriter key() { + return getMapWriter().key(); + } + + @Override + public MapWriter value() { + return getMapWriter().value(); + } + + @Override + public void endEntry() { + getMapWriter().endEntry(); + } + + private StructWriter getStructWriter() { + if (structWriter == null) { + structWriter = nullableStructWriterFactory.build(data.getStruct()); + structWriter.setPosition(idx()); + writers.add(structWriter); + } + return structWriter; + } + + public StructWriter asStruct() { + 
data.setType(idx(), MinorType.STRUCT); + return getStructWriter(); + } + + private ListWriter getListWriter() { + if (listWriter == null) { + listWriter = new UnionListWriter(data.getList(), nullableStructWriterFactory); + listWriter.setPosition(idx()); + writers.add(listWriter); + } + return listWriter; + } + + public ListWriter asList() { + data.setType(idx(), MinorType.LIST); + return getListWriter(); + } + + private MapWriter getMapWriter() { + if (mapWriter == null) { + mapWriter = new UnionMapWriter(data.getMap(new ArrowType.Map(false))); + mapWriter.setPosition(idx()); + writers.add(mapWriter); + } + return mapWriter; + } + + private MapWriter getMapWriter(ArrowType arrowType) { + if (mapWriter == null) { + mapWriter = new UnionMapWriter(data.getMap(arrowType)); + mapWriter.setPosition(idx()); + writers.add(mapWriter); + } + return mapWriter; + } + + public MapWriter asMap(ArrowType arrowType) { + data.setType(idx(), MinorType.MAP); + return getMapWriter(arrowType); + } + + BaseWriter getWriter(MinorType minorType) { + return getWriter(minorType, null); + } + + BaseWriter getWriter(MinorType minorType, ArrowType arrowType) { + switch (minorType) { + case STRUCT: + return getStructWriter(); + case LIST: + return getListWriter(); + case MAP: + return getMapWriter(arrowType); + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.typeParams?? 
|| minor.class?starts_with("Decimal")> + case ${name?upper_case}: + return get${name}Writer(<#if minor.class?starts_with("Decimal") >arrowType); + + + + default: + throw new UnsupportedOperationException("Unknown type: " + minorType); + } + } + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> + <#if !minor.typeParams?? || minor.class?starts_with("Decimal") > + + private ${name}Writer ${name?uncap_first}Writer; + + private ${name}Writer get${name}Writer(<#if minor.class?starts_with("Decimal")>ArrowType arrowType) { + if (${uncappedName}Writer == null) { + ${uncappedName}Writer = new ${name}WriterImpl(data.get${name}Vector(<#if minor.class?starts_with("Decimal")>arrowType)); + ${uncappedName}Writer.setPosition(idx()); + writers.add(${uncappedName}Writer); + } + return ${uncappedName}Writer; + } + + public ${name}Writer as${name}(<#if minor.class?starts_with("Decimal")>ArrowType arrowType) { + data.setType(idx(), MinorType.${name?upper_case}); + return get${name}Writer(<#if minor.class?starts_with("Decimal")>arrowType); + } + + @Override + public void write(${name}Holder holder) { + data.setType(idx(), MinorType.${name?upper_case}); + <#if minor.class?starts_with("Decimal")>ArrowType arrowType = new ArrowType.Decimal(holder.precision, holder.scale, ${name}Holder.WIDTH * 8); + get${name}Writer(<#if minor.class?starts_with("Decimal")>arrowType).setPosition(idx()); + get${name}Writer(<#if minor.class?starts_with("Decimal")>arrowType).write${name}(<#list fields as field>holder.${field.name}<#if field_has_next>, <#if minor.class?starts_with("Decimal")>, arrowType); + } + + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, <#if minor.class?starts_with("Decimal")>, ArrowType arrowType) { + 
data.setType(idx(), MinorType.${name?upper_case}); + get${name}Writer(<#if minor.class?starts_with("Decimal")>arrowType).setPosition(idx()); + get${name}Writer(<#if minor.class?starts_with("Decimal")>arrowType).write${name}(<#list fields as field>${field.name}<#if field_has_next>, <#if minor.class?starts_with("Decimal")>, arrowType); + } + <#if minor.class?starts_with("Decimal")> + public void write${name}(${friendlyType} value) { + data.setType(idx(), MinorType.${name?upper_case}); + ArrowType arrowType = new ArrowType.Decimal(value.precision(), value.scale(), ${name}Vector.TYPE_WIDTH * 8); + get${name}Writer(arrowType).setPosition(idx()); + get${name}Writer(arrowType).write${name}(value); + } + + public void writeBigEndianBytesTo${name}(byte[] value, ArrowType arrowType) { + data.setType(idx(), MinorType.${name?upper_case}); + get${name}Writer(arrowType).setPosition(idx()); + get${name}Writer(arrowType).writeBigEndianBytesTo${name}(value, arrowType); + } + + + + + + public void writeNull() { + } + + @Override + public StructWriter struct() { + data.setType(idx(), MinorType.LIST); + getListWriter().setPosition(idx()); + return getListWriter().struct(); + } + + @Override + public ListWriter list() { + data.setType(idx(), MinorType.LIST); + getListWriter().setPosition(idx()); + return getListWriter().list(); + } + + @Override + public ListWriter list(String name) { + data.setType(idx(), MinorType.STRUCT); + getStructWriter().setPosition(idx()); + return getStructWriter().list(name); + } + + @Override + public StructWriter struct(String name) { + data.setType(idx(), MinorType.STRUCT); + getStructWriter().setPosition(idx()); + return getStructWriter().struct(name); + } + + @Override + public MapWriter map() { + data.setType(idx(), MinorType.MAP); + getListWriter().setPosition(idx()); + return getListWriter().map(); + } + + @Override + public MapWriter map(boolean keysSorted) { + data.setType(idx(), MinorType.MAP); + getListWriter().setPosition(idx()); + return 
getListWriter().map(keysSorted); + } + + @Override + public MapWriter map(String name) { + data.setType(idx(), MinorType.MAP); + getStructWriter().setPosition(idx()); + return getStructWriter().map(name); + } + + @Override + public MapWriter map(String name, boolean keysSorted) { + data.setType(idx(), MinorType.MAP); + getStructWriter().setPosition(idx()); + return getStructWriter().map(name, keysSorted); + } + + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + <#if !minor.typeParams?? || minor.class?starts_with("Decimal") > + @Override + public ${capName}Writer ${lowerName}(String name) { + data.setType(idx(), MinorType.STRUCT); + getStructWriter().setPosition(idx()); + return getStructWriter().${lowerName}(name); + } + + @Override + public ${capName}Writer ${lowerName}() { + data.setType(idx(), MinorType.LIST); + getListWriter().setPosition(idx()); + return getListWriter().${lowerName}(); + } + + <#if minor.class?starts_with("Decimal")> + @Override + public ${capName}Writer ${lowerName}(String name<#list minor.typeParams as typeParam>, ${typeParam.type} ${typeParam.name}) { + data.setType(idx(), MinorType.STRUCT); + getStructWriter().setPosition(idx()); + return getStructWriter().${lowerName}(name<#list minor.typeParams as typeParam>, ${typeParam.name}); + } + + + + @Override + public void allocate() { + data.allocateNew(); + } + + @Override + public void clear() { + data.clear(); + } + + @Override + public void close() throws Exception { + data.close(); + } + + @Override + public Field getField() { + return data.getField(); + } + + @Override + public int getValueCapacity() { + return data.getValueCapacity(); + } +} diff --git a/src/arrow/java/vector/src/main/codegen/templates/ValueHolders.java 
b/src/arrow/java/vector/src/main/codegen/templates/ValueHolders.java new file mode 100644 index 000000000..973efd870 --- /dev/null +++ b/src/arrow/java/vector/src/main/codegen/templates/ValueHolders.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +<@pp.dropOutputFile /> +<#list vv.modes as mode> +<#list vv.types as type> +<#list type.minor as minor> + +<#assign className="${mode.prefix}${minor.class}Holder" /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/holders/${className}.java" /> + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.holders; + +<#include "/@includes/vv_imports.ftl" /> + +/** + * Source code generated using FreeMarker template ${.template_name} + */ +public final class ${className} implements ValueHolder{ + + <#if mode.name == "Repeated"> + + /** The first index (inclusive) into the Vector. **/ + public int start; + + /** The last index (exclusive) into the Vector. **/ + public int end; + + /** The Vector holding the actual values. 
**/ + public ${minor.class}Vector vector; + + <#else> + public static final int WIDTH = ${type.width}; + + <#if mode.name == "Optional">public int isSet; + <#else>public final int isSet = 1; + <#assign fields = (minor.fields!type.fields) + (minor.typeParams![]) /> + <#list fields as field> + public ${field.type} ${field.name}; + + + /** + * Reason for not supporting the operation is that ValueHolders are potential scalar + * replacements and hence we don't want any methods to be invoked on them. + */ + public int hashCode(){ + throw new UnsupportedOperationException(); + } + + /** + * Reason for not supporting the operation is that ValueHolders are potential scalar + * replacements and hence we don't want any methods to be invoked on them. + */ + public String toString(){ + throw new UnsupportedOperationException(); + } + + + + + +} + + + + \ No newline at end of file diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java new file mode 100644 index 000000000..b41dbb245 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.util.Preconditions; + +/** + * Tuple class containing a vector and whether it was created. + * + * @param The type of vector the result is for. + */ +public class AddOrGetResult { + private final V vector; + private final boolean created; + + /** Constructs a new object. */ + public AddOrGetResult(V vector, boolean created) { + this.vector = Preconditions.checkNotNull(vector); + this.created = created; + } + + /** Returns the vector. */ + public V getVector() { + return vector; + } + + /** Returns whether the vector is created. */ + public boolean isCreated() { + return created; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java new file mode 100644 index 000000000..6824756d8 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */ + +package org.apache.arrow.vector; + +import org.apache.arrow.vector.complex.RepeatedFixedWidthVectorLike; +import org.apache.arrow.vector.complex.RepeatedVariableWidthVectorLike; + +/** Helper utility methods for allocating storage for Vectors. */ +public class AllocationHelper { + private AllocationHelper() {} + + /** + * Allocates the vector. + * + * @param v The vector to allocate. + * @param valueCount Number of values to allocate. + * @param bytesPerValue bytes per value. + * @throws org.apache.arrow.memory.OutOfMemoryException if it can't allocate the memory. + */ + public static void allocate(ValueVector v, int valueCount, int bytesPerValue) { + allocate(v, valueCount, bytesPerValue, 5); + } + + /** + * Allocates memory for a vector assuming given number of values and their width. + * + * @param v The vector the allocate. + * @param valueCount The number of elements to allocate. + * @param bytesPerValue The bytes per value to use for allocating underlying storage + * @param childValCount If v is a repeated vector, this is number of child elements to allocate. + * @throws org.apache.arrow.memory.OutOfMemoryException if it can't allocate the memory. + */ + public static void allocatePrecomputedChildCount( + ValueVector v, + int valueCount, + int bytesPerValue, + int childValCount) { + if (v instanceof FixedWidthVector) { + ((FixedWidthVector) v).allocateNew(valueCount); + } else if (v instanceof VariableWidthVector) { + ((VariableWidthVector) v).allocateNew(valueCount * bytesPerValue, valueCount); + } else if (v instanceof RepeatedFixedWidthVectorLike) { + ((RepeatedFixedWidthVectorLike) v).allocateNew(valueCount, childValCount); + } else if (v instanceof RepeatedVariableWidthVectorLike) { + ((RepeatedVariableWidthVectorLike) v).allocateNew(childValCount * bytesPerValue, valueCount, childValCount); + } else { + v.allocateNew(); + } + } + + /** + * Allocates memory for a vector assuming given number of values and their width. 
+ * + * @param v The vector to allocate. + * @param valueCount The number of elements to allocate. + * @param bytesPerValue The bytes per value to use for allocating underlying storage + * @param repeatedPerTop If v is a repeated vector, this is assumed number of elements per child. + * @throws org.apache.arrow.memory.OutOfMemoryException if it can't allocate the memory + */ + public static void allocate(ValueVector v, int valueCount, int bytesPerValue, int repeatedPerTop) { + allocatePrecomputedChildCount(v, valueCount, bytesPerValue, repeatedPerTop * valueCount); + } + + /** + * Allocates the exact amount if v is fixed width, otherwise falls back to dynamic allocation. + * + * @param v value vector we are trying to allocate + * @param valueCount size we are trying to allocate + * @throws org.apache.arrow.memory.OutOfMemoryException if it can't allocate the memory + */ + public static void allocateNew(ValueVector v, int valueCount) { + if (v instanceof FixedWidthVector) { + ((FixedWidthVector) v).allocateNew(valueCount); + } else if (v instanceof VariableWidthVector) { + ((VariableWidthVector) v).allocateNew(valueCount); + } else { + v.allocateNew(); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java new file mode 100644 index 000000000..ded58b22b --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -0,0 +1,930 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; + +import io.netty.util.internal.PlatformDependent; + +/** + * BaseFixedWidthVector provides an abstract interface for + * implementing vectors of fixed width values. The vectors are nullable + * implying that zero or more elements in the vector could be NULL. 
+ */ +public abstract class BaseFixedWidthVector extends BaseValueVector + implements FixedWidthVector, FieldVector, VectorDefinitionSetter { + private final int typeWidth; + + protected int lastValueCapacity; + protected int actualValueCapacity; + + protected final Field field; + private int allocationMonitor; + protected ArrowBuf validityBuffer; + protected ArrowBuf valueBuffer; + protected int valueCount; + + /** + * Constructs a new instance whose validity and data buffers are initially the allocator's empty buffer. + * + * @param field field materialized by this vector + * @param allocator The allocator to use for allocating memory for the vector. + * @param typeWidth The width in bytes of the type (a width of 0 gets specialized BitVector handling when writer indexes are set). + */ + public BaseFixedWidthVector(Field field, final BufferAllocator allocator, final int typeWidth) { + super(allocator); + this.typeWidth = typeWidth; + this.field = field; + valueCount = 0; + allocationMonitor = 0; + validityBuffer = allocator.getEmpty(); + valueBuffer = allocator.getEmpty(); + lastValueCapacity = INITIAL_VALUE_ALLOCATION; + refreshValueCapacity(); + } + + + public int getTypeWidth() { + return typeWidth; + } + + @Override + public String getName() { + return field.getName(); + } + + /* TODO: + * see if getNullCount() can be made faster -- O(1) + */ + + /* TODO: + * Once the entire hierarchy has been refactored, move common functions + * like getNullCount(), splitAndTransferValidityBuffer to top level + * base class BaseValueVector. + * + * Along with this, some class members (validityBuffer) can also be + * abstracted out to top level base class. + * + * Right now BaseValueVector is the top level base class for other + * vector types in ValueVector hierarchy (non-nullable) and those + * vectors have not yet been refactored/removed so moving things to + * the top class as of now is not a good idea. + */ + + /** + * Get the memory address of the buffer that manages the validity + * (NULL or NON-NULL nature) of elements in the vector.
+ * @return starting address of the buffer + */ + @Override + public long getValidityBufferAddress() { + return (validityBuffer.memoryAddress()); + } + + /** + * Get the memory address of the buffer that stores the data for elements + * in the vector. + * @return starting address of the buffer + */ + @Override + public long getDataBufferAddress() { + return (valueBuffer.memoryAddress()); + } + + /** + * Get the memory address of the buffer that stores the offsets for elements + * in the vector. This operation is not supported for fixed-width vectors. + * @return never returns normally + * @throws UnsupportedOperationException always, for fixed width vectors + */ + @Override + public long getOffsetBufferAddress() { + throw new UnsupportedOperationException("not supported for fixed-width vectors"); + } + + /** + * Get the buffer that manages the validity (NULL or NON-NULL nature) of + * elements in the vector. Consider it as a buffer for internal bit vector + * data structure. + * @return the validity buffer + */ + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + /** + * Get the buffer that stores the data for elements in the vector. + * @return the data buffer + */ + @Override + public ArrowBuf getDataBuffer() { + return valueBuffer; + } + + /** + * Get the buffer that stores the offsets for elements + * in the vector. This operation is not supported for fixed-width vectors. + * @return never returns normally + * @throws UnsupportedOperationException always, for fixed width vectors + */ + @Override + public ArrowBuf getOffsetBuffer() { + throw new UnsupportedOperationException("not supported for fixed-width vectors"); + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector.
+ * @param valueCount desired number of elements in the vector; rejected with an OversizedAllocationException if the buffers it requires would exceed the maximum allocation size + */ + @Override + public void setInitialCapacity(int valueCount) { + computeAndCheckBufferSize(valueCount); + lastValueCapacity = valueCount; + } + + /** + * Get the current value capacity for the vector, as last computed by {@link #refreshValueCapacity()}. + * + * @return number of elements that the currently held buffers can hold. + */ + @Override + public int getValueCapacity() { + return actualValueCapacity; + } + + /** + * Recomputes the cached value capacity as the smaller of the data-buffer and validity-buffer capacities. Call this if you change the capacity of valueBuffer or validityBuffer. + */ + protected void refreshValueCapacity() { + actualValueCapacity = Math.min(getValueBufferValueCapacity(), getValidityBufferValueCapacity()); + } + + protected int getValueBufferValueCapacity() { + return capAtMaxInt(valueBuffer.capacity() / typeWidth); + } + + protected int getValidityBufferValueCapacity() { + return capAtMaxInt(validityBuffer.capacity() * 8); + } + + /** + * Zero out the validity buffer and the data buffer over their full capacity. + */ + @Override + public void zeroVector() { + initValidityBuffer(); + initValueBuffer(); + } + + /* zero out the validity buffer */ + private void initValidityBuffer() { + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + /* zero out the data buffer */ + private void initValueBuffer() { + valueBuffer.setZero(0, valueBuffer.capacity()); + } + + /** + * Reset the vector to initial state: sets the value count to 0 and zeroes the buffers via {@link #zeroVector()}. + * Note that this method doesn't release any memory. + */ + @Override + public void reset() { + valueCount = 0; + zeroVector(); + } + + /** + * Close the vector and release the associated buffers. + */ + @Override + public void close() { + clear(); + } + + /** + * Same as {@link #close()}.
+ */ + @Override + public void clear() { + valueCount = 0; + validityBuffer = releaseBuffer(validityBuffer); + valueBuffer = releaseBuffer(valueBuffer); + refreshValueCapacity(); + } + + /* records over-provisioning (counts upwards from 0); used to step down the memory allocation */ + protected void incrementAllocationMonitor() { + if (allocationMonitor < 0) { + allocationMonitor = 0; + } + allocationMonitor++; + } + + /* records under-provisioning (counts downwards from 0); used to step up the memory allocation */ + protected void decrementAllocationMonitor() { + if (allocationMonitor > 0) { + allocationMonitor = 0; + } + allocationMonitor--; + } + + /** + * Same as {@link #allocateNewSafe()}, except that a failed allocation propagates as an exception instead of a false return value. + */ + @Override + public void allocateNew() { + allocateNew(lastValueCapacity); + } + + /** + * Allocate memory for the vector. We internally use the last recorded value + * capacity (the default initial allocation unless changed by {@link #setInitialCapacity(int)} + * or by a previous allocation) as the element count. See {@link #allocateNew(int)} for allocating memory for specific + * number of elements in the vector. + * + * @return false if the allocation throws any exception (the exception is swallowed), true otherwise. + */ + @Override + public boolean allocateNewSafe() { + try { + allocateNew(lastValueCapacity); + return true; + } catch (Exception e) { + return false; + } + } + + /** + * Allocate memory for the vector to support storing at least the provided number of + * elements in the vector. This method must be called prior to using the ValueVector. + * + * @param valueCount the desired number of elements in the vector + * @throws org.apache.arrow.memory.OutOfMemoryException on error + */ + public void allocateNew(int valueCount) { + computeAndCheckBufferSize(valueCount); + + /* we are doing a new allocation -- release the current buffers */ + clear(); + + try { + allocateBytes(valueCount); + } catch (Exception e) { + clear(); + throw e; + } + } + + /* + * Compute the buffer size required for 'valueCount', and check if it's within bounds.
+ */ + private long computeAndCheckBufferSize(int valueCount) { + final long size = computeCombinedBufferSize(valueCount, typeWidth); + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Memory required for vector capacity " + + valueCount + + " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); + } + return size; + } + + /** + * Actual memory allocation is done by this function. All the calculations + * and knowledge about what size to allocate are up to the callers of this + * method. + * Callers appropriately handle errors if memory allocation fails here. + * Callers should also take care of determining that the desired size is + * within the bounds of max allocation allowed and any other error + * conditions. + */ + private void allocateBytes(int valueCount) { + DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(valueCount, typeWidth); + valueBuffer = buffers.getDataBuf(); + validityBuffer = buffers.getValidityBuf(); + zeroVector(); + + refreshValueCapacity(); + lastValueCapacity = getValueCapacity(); + } + + /** + * During splitAndTransfer, if we are splitting from a random position within a byte, + * we can't just slice the source buffer so we have to explicitly allocate the + * validityBuffer of the target vector. This is unlike the data buffer which we can + * always slice for the target vector. + */ + private void allocateValidityBuffer(final int validityBufferSize) { + validityBuffer = allocator.buffer(validityBufferSize); + validityBuffer.readerIndex(0); + refreshValueCapacity(); + } + + /** + * Get the potential buffer size for a particular number of records.
+ * @param count desired number of elements in the vector + * @return estimated size of underlying buffers if the vector holds + * the given number of elements (data bytes plus validity bytes, computed in int arithmetic); 0 when count is 0 + */ + @Override + public int getBufferSizeFor(final int count) { + if (count == 0) { + return 0; + } + return (count * typeWidth) + getValidityBufferSizeFromCount(count); + } + + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector, derived from the current value count rather than from the allocated capacity. + * @return size of underlying buffers; 0 when the value count is 0. + */ + @Override + public int getBufferSize() { + if (valueCount == 0) { + return 0; + } + return (valueCount * typeWidth) + getValidityBufferSizeFromCount(valueCount); + } + + /** + * Get information about how this field is materialized. + * @return the field corresponding to this vector + */ + @Override + public Field getField() { + return field; + } + + /** + * Return the underlying buffers associated with this vector. Note that this doesn't + * impact the reference counts for this buffer so it only should be used for in-context + * access. Also note that this buffer changes regularly thus + * external classes shouldn't hold a reference to it (unless they change it). + * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted + * but the returned array will be the only reference to them + * @return The underlying {@link ArrowBuf buffers} that are used by this + * vector instance: an empty array when the buffer size is 0, otherwise the validity buffer followed by the data buffer. + */ + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final ArrowBuf[] buffers; + setReaderAndWriterIndex(); + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + buffers = new ArrowBuf[2]; + buffers[0] = validityBuffer; + buffers[1] = valueBuffer; + } + if (clear) { + for (final ArrowBuf buffer : buffers) { + buffer.getReferenceManager().retain(1); + } + clear(); + } + return buffers; + } + + /** + * Resize the vector to increase the capacity. The internal behavior is to + * double the current value capacity.
+ */ + @Override + public void reAlloc() { + int targetValueCount = getValueCapacity() * 2; + if (targetValueCount == 0) { + if (lastValueCapacity > 0) { + targetValueCount = lastValueCapacity; + } else { + targetValueCount = INITIAL_VALUE_ALLOCATION * 2; + } + } + computeAndCheckBufferSize(targetValueCount); + + DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(targetValueCount, typeWidth); + final ArrowBuf newValueBuffer = buffers.getDataBuf(); + newValueBuffer.setBytes(0, valueBuffer, 0, valueBuffer.capacity()); + newValueBuffer.setZero(valueBuffer.capacity(), newValueBuffer.capacity() - valueBuffer.capacity()); + valueBuffer.getReferenceManager().release(); + valueBuffer = newValueBuffer; + + final ArrowBuf newValidityBuffer = buffers.getValidityBuf(); + newValidityBuffer.setBytes(0, validityBuffer, 0, validityBuffer.capacity()); + newValidityBuffer.setZero(validityBuffer.capacity(), newValidityBuffer.capacity() - validityBuffer.capacity()); + validityBuffer.getReferenceManager().release(); + validityBuffer = newValidityBuffer; + + refreshValueCapacity(); + lastValueCapacity = getValueCapacity(); + } + + /** + * Get the inner vectors. This implementation has none and always throws. + * + * @deprecated This API will be removed as the current implementations no longer support inner vectors. + * + * @return never returns normally; an UnsupportedOperationException is always thrown + */ + @Deprecated + @Override + public List<BufferBacked> getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. Use getFieldBuffers"); + } + + /** + * Initialize the children in schema for this Field. This operation is a + * NO-OP for scalar types since they don't have any children. + * @param children the schema + * @throws IllegalArgumentException if children is a non-empty list for scalar types.
+ */ + @Override + public void initializeChildrenFromFields(List<Field> children) { + if (!children.isEmpty()) { + throw new IllegalArgumentException("primitive type vector can not have children"); + } + } + + /** + * Get the inner child vectors. + * @return always an empty list, since this scalar vector type has no children + */ + @Override + public List<FieldVector> getChildrenFromFields() { + return Collections.emptyList(); + } + + /** + * Load the buffers of this vector with provided source buffers. + * The caller manages the source buffers and populates them before invoking + * this method. The buffers currently held by this vector are released. + * @param fieldNode the fieldNode indicating the value count + * @param ownBuffers the buffers for this Field (own buffers only, children not included); exactly two are required, the validity buffer followed by the data buffer + */ + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) { + if (ownBuffers.size() != 2) { + throw new IllegalArgumentException("Illegal buffer count, expected " + 2 + ", got: " + ownBuffers.size()); + } + + ArrowBuf bitBuffer = ownBuffers.get(0); + ArrowBuf dataBuffer = ownBuffers.get(1); + + validityBuffer.getReferenceManager().release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + valueBuffer.getReferenceManager().release(); + valueBuffer = dataBuffer.getReferenceManager().retain(dataBuffer, allocator); + refreshValueCapacity(); + + valueCount = fieldNode.getLength(); + } + + /** + * Get the buffers belonging to this vector, after updating their reader and writer indexes. + * + * @return the inner buffers: the validity buffer followed by the data buffer. + */ + public List<ArrowBuf> getFieldBuffers() { + List<ArrowBuf> result = new ArrayList<>(2); + setReaderAndWriterIndex(); + result.add(validityBuffer); + result.add(valueBuffer); + + return result; + } + + /** + * Set the reader and writer indexes for the inner buffers.
+ */ + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + valueBuffer.readerIndex(0); + if (valueCount == 0) { + validityBuffer.writerIndex(0); + valueBuffer.writerIndex(0); + } else { + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + if (typeWidth == 0) { + /* specialized handling for BitVector: the data buffer gets the same writer index as the validity buffer */ + valueBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + } else { + valueBuffer.writerIndex((long) valueCount * typeWidth); + } + } + } + + /** + * Construct a transfer pair of this vector and another vector of the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @param callBack not used + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return getTransferPair(ref, allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of the same type; the target is given this vector's name. + * @param allocator allocator for the target vector + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return getTransferPair(getName(), allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return TransferPair + */ + public abstract TransferPair getTransferPair(String ref, BufferAllocator allocator); + + /** + * Transfer this vector's data to another vector. The memory associated + * with this vector is transferred to the allocator of target vector + * for accounting and management purposes.
+ * @param target destination vector for transfer + */ + public void transferTo(BaseFixedWidthVector target) { + compareTypes(target, "transferTo"); + target.clear(); + target.validityBuffer = transferBuffer(validityBuffer, target.allocator); + target.valueBuffer = transferBuffer(valueBuffer, target.allocator); + target.valueCount = valueCount; + target.refreshValueCapacity(); + clear(); + } + + /** + * Slice this vector at desired index and length and transfer the + * corresponding data to the target vector. + * @param startIndex start position of the split in source vector. + * @param length length of the split; startIndex + length must not exceed the value count (checked without int overflow). + * @param target destination vector + */ + public void splitAndTransferTo(int startIndex, int length, + BaseFixedWidthVector target) { + Preconditions.checkArgument(startIndex >= 0 && length >= 0 && length <= valueCount - startIndex, + "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); + compareTypes(target, "splitAndTransferTo"); + target.clear(); + splitAndTransferValidityBuffer(startIndex, length, target); + splitAndTransferValueBuffer(startIndex, length, target); + target.setValueCount(length); + } + + /** + * Data buffer can always be split and transferred using slicing. Byte positions are computed in long arithmetic so that large vectors do not overflow int. + */ + private void splitAndTransferValueBuffer(int startIndex, int length, + BaseFixedWidthVector target) { + final long startPoint = (long) startIndex * typeWidth; + final long sliceLength = (long) length * typeWidth; + final ArrowBuf slicedBuffer = valueBuffer.slice(startPoint, sliceLength); + target.valueBuffer = transferBuffer(slicedBuffer, target.allocator); + target.refreshValueCapacity(); + } + + /** + * Validity buffer has multiple cases of split and transfer depending on + * the starting position of the source index.
+ */ + private void splitAndTransferValidityBuffer(int startIndex, int length, + BaseFixedWidthVector target) { + int firstByteSource = BitVectorHelper.byteIndex(startIndex); + int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); + int byteSizeTarget = getValidityBufferSizeFromCount(length); + int offset = startIndex % 8; + + if (length > 0) { + if (offset == 0) { + /* byte-aligned start: slice the source validity buffer and retain it for the target */ + if (target.validityBuffer != null) { + target.validityBuffer.getReferenceManager().release(); + } + target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget); + target.validityBuffer.getReferenceManager().retain(1); + target.refreshValueCapacity(); + } else { + /* Copy data + * When the first bit starts from the middle of a byte (offset != 0), + * copy data from src BitVector. + * Each byte in the target is composed of a part of the i-th source byte and + * another part of the (i+1)-th source byte. + */ + target.allocateValidityBuffer(byteSizeTarget); + + for (int i = 0; i < byteSizeTarget - 1; i++) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, + firstByteSource + i, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(this.validityBuffer, + firstByteSource + i + 1, offset); + + target.validityBuffer.setByte(i, (b1 + b2)); + } + + /* Copying the last piece is done in the following manner: + * if the source vector has 1 or more bytes remaining, we copy + * the last piece as a byte formed by shifting data + * from the current byte and the next byte. + * + * if the source vector has no more bytes remaining + * (we are at the last byte), we copy the last piece as a byte + * by shifting data from the current byte.
+ */ + if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(this.validityBuffer, + firstByteSource + byteSizeTarget, offset); + + target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2); + } else { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + target.validityBuffer.setByte(byteSizeTarget - 1, b1); + } + } + } + } + + + /*----------------------------------------------------------------* + | | + | common getters and setters | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the number of elements that are null in the vector. + * + * @return the number of null elements. + */ + @Override + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } + + /** + * Get the value count of the vector. This is zero until + * {@link #setValueCount(int)} is called or the count is set by loading or transferring buffers into this vector. + * + * @return valueCount for the vector + */ + @Override + public int getValueCount() { + return valueCount; + } + + /** + * Set value count for the vector, reallocating until the capacity can hold that many elements. + * + * @param valueCount value count to set + */ + @Override + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + final int currentValueCapacity = getValueCapacity(); + while (valueCount > getValueCapacity()) { + reAlloc(); + } + /* + * We are trying to understand the pattern of memory allocation. + * If initially, the user did vector.allocateNew(), we would have + * allocated memory of default size (4096 * type width). + * Later on user invokes setValueCount(count).
+ * + * If the existing value capacity is at least twice as large as the + * valueCount, we know that we over-provisioned memory in the + * first place when default memory allocation was done because user + * really needs a much smaller value count in the vector. + * + * We record this by bumping up the allocationMonitor. If this pattern + * happens a certain number of times and allocationMonitor + * reaches the threshold (internal hardcoded) value, subsequent + * call to allocateNew() will take care of stepping down the + * default memory allocation size. + * + * Another case would be under-provisioning the initial memory and + * thus going through a lot of realloc(). Here the goal is to + * see if we can minimize the number of reallocations. Again the + * state is recorded in allocationMonitor by decrementing it + * (negative value). If a threshold is hit, realloc will try to + * allocate more memory in order to possibly avoid a future realloc. + * This case is also applicable to setSafe() methods which can trigger + * a realloc() and thus we record the state there as well. NOTE(review): within this class allocationMonitor is only written, never read, so the threshold behavior described above does not appear to be implemented here. + */ + if (valueCount > 0) { + if (currentValueCapacity >= (valueCount * 2)) { + incrementAllocationMonitor(); + } else if (currentValueCapacity <= (valueCount / 2)) { + decrementAllocationMonitor(); + } + } + setReaderAndWriterIndex(); + } + + /** + * Check if the given index is within the current value capacity + * of the vector. + * + * @param index position to check + * @return true if index is below the current value capacity (only the upper bound is checked) + */ + public boolean isSafe(int index) { + return index < getValueCapacity(); + } + + /** + * Check if element at given index is null. + * + * @param index position of element + * @return true if element at given index is null, false otherwise + */ + @Override + public boolean isNull(int index) { + return (isSet(index) == 0); + } + + /** + * Counterpart of {@link #isNull(int)}: reads the validity bit of the element.
+ * + * @param index position of element + * @return 1 if element at given index is not null, 0 otherwise (the value of the element's validity bit) + */ + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return (b >> bitIndex) & 0x01; + } + + /** + * Mark the particular position in the vector as non-null, growing the vector first if the index is beyond the current capacity. + * + * @param index position of the element. + */ + @Override + public void setIndexDefined(int index) { + handleSafe(index); + BitVectorHelper.setBit(validityBuffer, index); + } + + public void set(int index, byte[] value, int start, int length) { + throw new UnsupportedOperationException(); + } + + public void setSafe(int index, byte[] value, int start, int length) { + throw new UnsupportedOperationException(); + } + + public void set(int index, ByteBuffer value, int start, int length) { + throw new UnsupportedOperationException(); + } + + public void setSafe(int index, ByteBuffer value, int start, int length) { + throw new UnsupportedOperationException(); + } + + + /*----------------------------------------------------------------* + | | + | helper methods for setters | + | | + *----------------------------------------------------------------*/ + + + protected void handleSafe(int index) { + while (index >= getValueCapacity()) { + decrementAllocationMonitor(); + reAlloc(); + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector. The source vector should be of the same type as this one (the minor types are compared).
+ * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector; this method does not grow the vector, see {@link #copyFromSafe(int, int, ValueVector)} + * @param from source vector + */ + @Override + public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { + Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); + if (from.isNull(fromIndex)) { + BitVectorHelper.unsetBit(this.getValidityBuffer(), thisIndex); + } else { + BitVectorHelper.setBit(this.getValidityBuffer(), thisIndex); + PlatformDependent.copyMemory(from.getDataBuffer().memoryAddress() + (long) fromIndex * typeWidth, + this.getDataBuffer().memoryAddress() + (long) thisIndex * typeWidth, typeWidth); + } + } + + /** + * Same as {@link #copyFrom(int, int, ValueVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before the copy. + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + @Override + public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) { + Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + /** + * Set the element at the given index to null, growing the vector first if the index is beyond the current capacity. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + // not strictly needed to set the bit to 0 as long as + // the validity buffer always starts out zeroed.
+ BitVectorHelper.unsetBit(validityBuffer, index); + } + + @Override + public ArrowBufPointer getDataPointer(int index) { + return getDataPointer(index, new ArrowBufPointer()); + } + + @Override + public ArrowBufPointer getDataPointer(int index, ArrowBufPointer reuse) { + if (isNull(index)) { + reuse.set(null, 0, 0); + } else { + reuse.set(valueBuffer, (long) index * typeWidth, typeWidth); + } + return reuse; + } + + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + if (isNull(index)) { + return ArrowBufPointer.NULL_HASH_CODE; + } + long start = (long) typeWidth * index; + long end = (long) typeWidth * (index + 1); + return ByteFunctionHelpers.hash(hasher, this.getDataBuffer(), start, end); + } + + @Override + public <OUT, IN> OUT accept(VectorVisitor<OUT, IN> visitor, IN value) { + return visitor.visit(this, value); + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseIntVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseIntVector.java new file mode 100644 index 000000000..556411c86 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseIntVector.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Interface for all int type vectors. + */ +public interface BaseIntVector extends FieldVector { + + /** + * Sets the value at index; note that this value may need to be truncated. + * Note this is the safe version (i.e. it calls the setSafe method of the vector). + */ + void setWithPossibleTruncate(int index, long value); + + /** + * Sets the value at index; note that this value may need to be truncated. + * Note this is the unsafe version (i.e. it calls the set method of the vector). + */ + void setUnsafeWithPossibleTruncate(int index, long value); + + /** + * Gets the value at index. + * This value may have been extended to long; the method will throw {@link NullPointerException} + * if the value is null. Note that the null check can be turned off via {@link NullCheckingForGet}. + */ + long getValueAsLong(int index); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java new file mode 100644 index 000000000..90694db83 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -0,0 +1,1370 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; + +/** + * BaseLargeVariableWidthVector is a base class providing functionality for large strings/large bytes types. 
+ */ +public abstract class BaseLargeVariableWidthVector extends BaseValueVector + implements VariableWidthVector, FieldVector, VectorDefinitionSetter { + private static final int DEFAULT_RECORD_BYTE_COUNT = 12; + private static final int INITIAL_BYTE_COUNT = INITIAL_VALUE_ALLOCATION * DEFAULT_RECORD_BYTE_COUNT; + private int lastValueCapacity; + private long lastValueAllocationSizeInBytes; + + /* public and protected members */ + public static final int OFFSET_WIDTH = 8; /* each offset entry is 8 bytes wide */ + protected static final byte[] emptyByteArray = new byte[]{}; + protected ArrowBuf validityBuffer; + protected ArrowBuf valueBuffer; + protected ArrowBuf offsetBuffer; + protected int valueCount; + protected int lastSet; + protected final Field field; + + /** + * Constructs a new instance whose offset, validity and data buffers are initially the allocator's empty buffer. + * + * @param field The field materialized by this vector. + * @param allocator The allocator to use for creating/resizing buffers + */ + public BaseLargeVariableWidthVector(Field field, final BufferAllocator allocator) { + super(allocator); + this.field = field; + lastValueAllocationSizeInBytes = INITIAL_BYTE_COUNT; + // -1 because we require one extra slot for the offset array. + lastValueCapacity = INITIAL_VALUE_ALLOCATION - 1; + valueCount = 0; + lastSet = -1; + offsetBuffer = allocator.getEmpty(); + validityBuffer = allocator.getEmpty(); + valueBuffer = allocator.getEmpty(); + } + + @Override + public String getName() { + return field.getName(); + } + + /** + * Get the buffer that manages the validity (NULL or NON-NULL nature) of + * elements in the vector. Consider it as a buffer for internal bit vector + * data structure. + * @return the validity buffer + */ + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + /** + * Get the buffer that stores the data for elements in the vector. + * @return the data buffer + */ + @Override + public ArrowBuf getDataBuffer() { + return valueBuffer; + } + + /** + * Get the buffer that stores the offsets for elements + * in the vector.
This operation is not supported for fixed-width vectors. + * @return buffer + */ + @Override + public ArrowBuf getOffsetBuffer() { + return offsetBuffer; + } + + /** + * Get the memory address of buffer that stores the offsets for elements + * in the vector. + * @return starting address of the buffer + */ + @Override + public long getOffsetBufferAddress() { + return offsetBuffer.memoryAddress(); + } + + /** + * Get the memory address of buffer that manages the validity + * (NULL or NON-NULL nature) of elements in the vector. + * @return starting address of the buffer + */ + @Override + public long getValidityBufferAddress() { + return validityBuffer.memoryAddress(); + } + + /** + * Get the memory address of buffer that stores the data for elements + * in the vector. + * @return starting address of the buffer + */ + @Override + public long getDataBufferAddress() { + return valueBuffer.memoryAddress(); + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector. + * @param valueCount desired number of elements in the vector + */ + @Override + public void setInitialCapacity(int valueCount) { + final long size = (long) valueCount * DEFAULT_RECORD_BYTE_COUNT; + checkDataBufferSize(size); + computeAndCheckOffsetsBufferSize(valueCount); + lastValueAllocationSizeInBytes = size; + lastValueCapacity = valueCount; + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector. + * @param valueCount desired number of elements in the vector + * @param density average number of bytes per variable width element + */ + @Override + public void setInitialCapacity(int valueCount, double density) { + long size = Math.max((long) (valueCount * density), 1L); + checkDataBufferSize(size); + computeAndCheckOffsetsBufferSize(valueCount); + lastValueAllocationSizeInBytes = size; + lastValueCapacity = valueCount; + } + + /** + * Get the density of this ListVector. 
+ * @return density + */ + public double getDensity() { + if (valueCount == 0) { + return 0.0D; + } + final long startOffset = offsetBuffer.getLong(0); + final long endOffset = offsetBuffer.getLong((long) valueCount * OFFSET_WIDTH); + final double totalListSize = endOffset - startOffset; + return totalListSize / valueCount; + } + + /** + * Get the current capacity which does not exceed either validity buffer or offset buffer. + * Note: Here the `getValueCapacity` has no relationship with the value buffer. + * @return number of elements that vector can hold. + */ + @Override + public int getValueCapacity() { + final long offsetValueCapacity = Math.max(getOffsetBufferValueCapacity() - 1, 0); + return capAtMaxInt(Math.min(offsetValueCapacity, getValidityBufferValueCapacity())); + } + + private long getValidityBufferValueCapacity() { + return validityBuffer.capacity() * 8; + } + + private long getOffsetBufferValueCapacity() { + return offsetBuffer.capacity() / OFFSET_WIDTH; + } + + /** + * zero out the vector and the data in associated buffers. + */ + public void zeroVector() { + initValidityBuffer(); + initOffsetBuffer(); + valueBuffer.setZero(0, valueBuffer.capacity()); + } + + /* zero out the validity buffer */ + private void initValidityBuffer() { + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + /* zero out the offset buffer */ + private void initOffsetBuffer() { + offsetBuffer.setZero(0, offsetBuffer.capacity()); + } + + /** + * Reset the vector to initial state. Same as {@link #zeroVector()}. + * Note that this method doesn't release any memory. + */ + public void reset() { + zeroVector(); + lastSet = -1; + valueCount = 0; + } + + /** + * Close the vector and release the associated buffers. + */ + @Override + public void close() { + clear(); + } + + /** + * Same as {@link #close()}. 
+ */ + @Override + public void clear() { + validityBuffer = releaseBuffer(validityBuffer); + valueBuffer = releaseBuffer(valueBuffer); + offsetBuffer = releaseBuffer(offsetBuffer); + lastSet = -1; + valueCount = 0; + } + + /** + * Get the inner vectors. + * + * @deprecated This API will be removed as the current implementations no longer support inner vectors. + * + * @return the inner vectors for this field as defined by the TypeLayout + */ + @Override + @Deprecated + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. Use getFieldBuffers"); + } + + /** + * Initialize the children in schema for this Field. This operation is a + * NO-OP for scalar types since they don't have any children. + * @param children the schema + * @throws IllegalArgumentException if children is a non-empty list for scalar types. + */ + @Override + public void initializeChildrenFromFields(List children) { + if (!children.isEmpty()) { + throw new IllegalArgumentException("primitive type vector can not have children"); + } + } + + /** + * Get the inner child vectors. + * @return list of child vectors for complex types, empty list for scalar vector types + */ + @Override + public List getChildrenFromFields() { + return Collections.emptyList(); + } + + + /** + * Load the buffers of this vector with provided source buffers. + * The caller manages the source buffers and populates them before invoking + * this method. 
+ * @param fieldNode the fieldNode indicating the value count + * @param ownBuffers the buffers for this Field (own buffers only, children not included) + */ + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + ArrowBuf bitBuffer = ownBuffers.get(0); + ArrowBuf offBuffer = ownBuffers.get(1); + ArrowBuf dataBuffer = ownBuffers.get(2); + + validityBuffer.getReferenceManager().release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + offsetBuffer.getReferenceManager().release(); + offsetBuffer = offBuffer.getReferenceManager().retain(offBuffer, allocator); + valueBuffer.getReferenceManager().release(); + valueBuffer = dataBuffer.getReferenceManager().retain(dataBuffer, allocator); + + lastSet = fieldNode.getLength() - 1; + valueCount = fieldNode.getLength(); + } + + /** + * Get the buffers belonging to this vector. + * @return the inner buffers. + */ + public List getFieldBuffers() { + // before flight/IPC, we must bring the vector to a consistent state. + // this is because, it is possible that the offset buffers of some trailing values + // are not updated. this may cause some data in the data buffer being lost. + // for details, please see TestValueVector#testUnloadVariableWidthVector. + fillHoles(valueCount); + + List result = new ArrayList<>(3); + setReaderAndWriterIndex(); + result.add(validityBuffer); + result.add(offsetBuffer); + result.add(valueBuffer); + + return result; + } + + /** + * Set the reader and writer indexes for the inner buffers. 
+ */ + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + offsetBuffer.readerIndex(0); + valueBuffer.readerIndex(0); + if (valueCount == 0) { + validityBuffer.writerIndex(0); + offsetBuffer.writerIndex(0); + valueBuffer.writerIndex(0); + } else { + final long lastDataOffset = getStartOffset(valueCount); + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH); + valueBuffer.writerIndex(lastDataOffset); + } + } + + /** + * Same as {@link #allocateNewSafe()}. + */ + @Override + public void allocateNew() { + allocateNew(lastValueAllocationSizeInBytes, lastValueCapacity); + } + + /** + * Allocate memory for the vector. We internally use a default value count + * of 4096 to allocate memory for at least these many elements in the + * vector. See {@link #allocateNew(long, int)} for allocating memory for specific + * number of elements in the vector. + * + * @return false if memory allocation fails, true otherwise. + */ + @Override + public boolean allocateNewSafe() { + try { + allocateNew(lastValueAllocationSizeInBytes, lastValueCapacity); + return true; + } catch (Exception e) { + return false; + } + } + + /** + * Allocate memory for the vector to support storing at least the provided number of + * elements in the vector. This method must be called prior to using the ValueVector. 
+   *
+   * @param totalBytes desired total memory capacity
+   * @param valueCount the desired number of elements in the vector
+   * @throws org.apache.arrow.memory.OutOfMemoryException if memory allocation fails
+   * @throws OversizedAllocationException if either requested size is out of bounds
+   */
+  @Override
+  public void allocateNew(long totalBytes, int valueCount) {
+    assert totalBytes >= 0;
+
+    /* validate both sizes first so that a bad request leaves the current buffers untouched */
+    checkDataBufferSize(totalBytes);
+    computeAndCheckOffsetsBufferSize(valueCount);
+
+    /* we are doing a new allocation -- release the current buffers */
+    clear();
+
+    try {
+      allocateBytes(totalBytes, valueCount);
+    } catch (Exception e) {
+      /* do not keep a partially allocated set of buffers */
+      clear();
+      throw e;
+    }
+  }
+
+  /**
+   * Allocate memory for the given number of elements, using the most recently recorded
+   * data buffer size for the data buffer.
+   *
+   * @param valueCount the desired number of elements in the vector
+   */
+  @Override
+  public void allocateNew(int valueCount) {
+    allocateNew(lastValueAllocationSizeInBytes, valueCount);
+  }
+
+  /* Check if the data buffer size is within bounds; a negative size is rejected as well. */
+  private void checkDataBufferSize(long size) {
+    if (size > MAX_ALLOCATION_SIZE || size < 0) {
+      throw new OversizedAllocationException("Memory required for vector " + " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")");
+    }
+  }
+
+  /**
+   * Compute the buffer size required for 'valueCount' offsets and validity, and check if it's
+   * within bounds.
+   *
+   * @param valueCount number of elements the buffers must be able to hold
+   * @return the value computed by computeCombinedBufferSize for valueCount + 1 offset slots
+   * @throws OversizedAllocationException if that size exceeds MAX_ALLOCATION_SIZE
+   */
+  private long computeAndCheckOffsetsBufferSize(int valueCount) {
+    /* to track the end offset of last data element in vector, we need
+     * an additional slot in offset buffer.
+     */
+    final long size = computeCombinedBufferSize(valueCount + 1, OFFSET_WIDTH);
+    if (size > MAX_ALLOCATION_SIZE) {
+      throw new OversizedAllocationException("Memory required for vector capacity " + valueCount + " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")");
+    }
+    return size;
+  }
+
+  /* allocate the inner buffers */
+  private void allocateBytes(final long valueBufferSize, final int valueCount) {
+    /* allocate data buffer */
+    long curSize = valueBufferSize;
+    valueBuffer = allocator.buffer(curSize);
+    valueBuffer.readerIndex(0);
+
+    /* allocate offset buffer and validity buffer */
+    DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(valueCount + 1, OFFSET_WIDTH);
+    offsetBuffer = buffers.getDataBuf();
+    validityBuffer = buffers.getValidityBuf();
+    initOffsetBuffer();
+    initValidityBuffer();
+
+    /* remember what was actually obtained so that the next allocateNew() asks for the same */
+    lastValueCapacity = getValueCapacity();
+    /* NOTE(review): the size hint is capped at Integer.MAX_VALUE here although the field is a
+     * long and reallocDataBuffer() stores the uncapped capacity -- confirm this is intended */
+    lastValueAllocationSizeInBytes = capAtMaxInt(valueBuffer.capacity());
+  }
+
+  /* allocate offset buffer of the given size in bytes and zero it */
+  private void allocateOffsetBuffer(final long size) {
+    offsetBuffer = allocator.buffer(size);
+    offsetBuffer.readerIndex(0);
+    initOffsetBuffer();
+  }
+
+  /* allocate validity buffer of the given size in bytes and zero it */
+  private void allocateValidityBuffer(final long size) {
+    validityBuffer = allocator.buffer(size);
+    validityBuffer.readerIndex(0);
+    initValidityBuffer();
+  }
+
+  /**
+   * Resize the vector to increase the capacity. The data buffer is reallocated first,
+   * then the validity and offset buffers.
+   */
+  public void reAlloc() {
+    reallocDataBuffer();
+    reallocValidityAndOffsetBuffers();
+  }
+
+  /**
+   * Reallocate the data buffer. Data Buffer stores the actual data for
+   * LARGEVARCHAR or LARGEVARBINARY elements in the vector. The behavior is to double
+   * the size of buffer.
+   * @throws OversizedAllocationException if the desired new size is more than
+   *                                      max allowed
+   * @throws OutOfMemoryException if the internal memory allocation fails
+   */
+  public void reallocDataBuffer() {
+    final long currentBufferCapacity = valueBuffer.capacity();
+    long newAllocationSize = currentBufferCapacity * 2;
+    /* nothing allocated yet: fall back to the recorded size hint, or to a default */
+    if (newAllocationSize == 0) {
+      if (lastValueAllocationSizeInBytes > 0) {
+        newAllocationSize = lastValueAllocationSizeInBytes;
+      } else {
+        newAllocationSize = INITIAL_BYTE_COUNT * 2;
+      }
+    }
+    newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize);
+    assert newAllocationSize >= 1;
+
+    checkDataBufferSize(newAllocationSize);
+
+    /* copy the old contents into the new buffer before releasing the old one */
+    final ArrowBuf newBuf = allocator.buffer(newAllocationSize);
+    newBuf.setBytes(0, valueBuffer, 0, currentBufferCapacity);
+    valueBuffer.getReferenceManager().release();
+    valueBuffer = newBuf;
+    lastValueAllocationSizeInBytes = valueBuffer.capacity();
+  }
+
+  /**
+   * Reallocate the validity and offset buffers for this vector. Validity
+   * buffer is used to track the NULL or NON-NULL nature of elements in
+   * the vector and offset buffer is used to store the lengths of variable
+   * width elements in the vector.
+   *
+   * <p>Note that data buffer for variable length vectors moves independent
+   * of the companion validity and offset buffers. This is in
+   * contrast to what we have for fixed width vectors.
+   *
+   * <p>So even though we may have setup an initial capacity of 1024
+   * elements in the vector, it is quite possible
+   * that we need to reAlloc() the data buffer when we are setting
+   * the 5th element in the vector simply because previous
+   * variable length elements have exhausted the buffer capacity.
+   * However, we really don't need to reAlloc() validity and
+   * offset buffers until we try to set the 1025th element
+   * This is why we do a separate check for safe methods to
+   * determine which buffer needs reallocation.
+   * @throws OversizedAllocationException if the desired new size is more than
+   *                                      max allowed
+   * @throws OutOfMemoryException if the internal memory allocation fails
+   */
+  public void reallocValidityAndOffsetBuffers() {
+    /* double the number of offset slots currently available */
+    int targetOffsetCount = capAtMaxInt((offsetBuffer.capacity() / OFFSET_WIDTH) * 2);
+    /* nothing allocated yet: fall back to the recorded capacity, or to a default */
+    if (targetOffsetCount == 0) {
+      if (lastValueCapacity > 0) {
+        targetOffsetCount = (lastValueCapacity + 1);
+      } else {
+        targetOffsetCount = 2 * (INITIAL_VALUE_ALLOCATION + 1);
+      }
+    }
+    computeAndCheckOffsetsBufferSize(targetOffsetCount);
+
+    /* copy the old offsets, zero the newly gained tail, then release the old buffer */
+    DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(targetOffsetCount, OFFSET_WIDTH);
+    final ArrowBuf newOffsetBuffer = buffers.getDataBuf();
+    newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity());
+    newOffsetBuffer.setZero(offsetBuffer.capacity(), newOffsetBuffer.capacity() - offsetBuffer.capacity());
+    offsetBuffer.getReferenceManager().release();
+    offsetBuffer = newOffsetBuffer;
+
+    /* same procedure for the validity bits */
+    final ArrowBuf newValidityBuffer = buffers.getValidityBuf();
+    newValidityBuffer.setBytes(0, validityBuffer, 0, validityBuffer.capacity());
+    newValidityBuffer.setZero(validityBuffer.capacity(), newValidityBuffer.capacity() - validityBuffer.capacity());
+    validityBuffer.getReferenceManager().release();
+    validityBuffer = newValidityBuffer;
+
+    lastValueCapacity = getValueCapacity();
+  }
+
+  /**
+   * Get the size (number of bytes) of underlying data buffer.
+ * @return number of bytes in the data buffer + */ + @Override + public int getByteCapacity() { + return capAtMaxInt(valueBuffer.capacity()); + } + + @Override + public int sizeOfValueBuffer() { + if (valueCount == 0) { + return 0; + } + return capAtMaxInt(offsetBuffer.getLong((long) valueCount * OFFSET_WIDTH)); + } + + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector. + * @return size of underlying buffers. + */ + @Override + public int getBufferSize() { + return getBufferSizeFor(this.valueCount); + } + + /** + * Get the potential buffer size for a particular number of records. + * @param valueCount desired number of elements in the vector + * @return estimated size of underlying buffers if the vector holds + * a given number of elements + */ + @Override + public int getBufferSizeFor(final int valueCount) { + if (valueCount == 0) { + return 0; + } + + final long validityBufferSize = getValidityBufferSizeFromCount(valueCount); + final long offsetBufferSize = (long) (valueCount + 1) * OFFSET_WIDTH; + /* get the end offset for this valueCount */ + final long dataBufferSize = offsetBuffer.getLong((long) valueCount * OFFSET_WIDTH); + return capAtMaxInt(validityBufferSize + offsetBufferSize + dataBufferSize); + } + + /** + * Get information about how this field is materialized. + * @return the field corresponding to this vector + */ + @Override + public Field getField() { + return field; + } + + /** + * Return the underlying buffers associated with this vector. Note that this doesn't + * impact the reference counts for this buffer so it only should be used for in-context + * access. Also note that this buffer changes regularly thus + * external classes shouldn't hold a reference to it (unless they change it). 
+ * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted + * but the returned array will be the only reference to them + * @return The underlying {@link io.netty.buffer.ArrowBuf buffers} that is used by this + * vector instance. + */ + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final ArrowBuf[] buffers; + setReaderAndWriterIndex(); + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + buffers = new ArrowBuf[3]; + buffers[0] = validityBuffer; + buffers[1] = offsetBuffer; + buffers[2] = valueBuffer; + } + if (clear) { + for (final ArrowBuf buffer : buffers) { + buffer.getReferenceManager().retain(); + } + clear(); + } + return buffers; + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @param callBack not used + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return getTransferPair(ref, allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * @param allocator allocator for the target vector + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return getTransferPair(getName(), allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return TransferPair + */ + public abstract TransferPair getTransferPair(String ref, BufferAllocator allocator); + + /** + * Transfer this vector'data to another vector. The memory associated + * with this vector is transferred to the allocator of target vector + * for accounting and management purposes. 
+ * @param target destination vector for transfer + */ + public void transferTo(BaseLargeVariableWidthVector target) { + compareTypes(target, "transferTo"); + target.clear(); + target.validityBuffer = transferBuffer(validityBuffer, target.allocator); + target.valueBuffer = transferBuffer(valueBuffer, target.allocator); + target.offsetBuffer = transferBuffer(offsetBuffer, target.allocator); + target.setLastSet(this.lastSet); + if (this.valueCount > 0) { + target.setValueCount(this.valueCount); + } + clear(); + } + + /** + * Slice this vector at desired index and length and transfer the + * corresponding data to the target vector. + * @param startIndex start position of the split in source vector. + * @param length length of the split. + * @param target destination vector + */ + public void splitAndTransferTo(int startIndex, int length, + BaseLargeVariableWidthVector target) { + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); + compareTypes(target, "splitAndTransferTo"); + target.clear(); + splitAndTransferValidityBuffer(startIndex, length, target); + splitAndTransferOffsetBuffer(startIndex, length, target); + target.setLastSet(length - 1); + if (length > 0) { + target.setValueCount(length); + } + } + + /** + * Transfer the offsets along with data. Unlike the data buffer, we cannot simply + * slice the offset buffer for split and transfer. The reason is that offsets + * in the target vector have to be adjusted and made relative to the staring + * offset in source vector from the start index of split. This is why, we + * need to explicitly allocate the offset buffer and set the adjusted offsets + * in the target vector. 
+   */
+  private void splitAndTransferOffsetBuffer(int startIndex, int length, BaseLargeVariableWidthVector target) {
+    /* data range [start, end) of the values being split off */
+    final long start = offsetBuffer.getLong((long) startIndex * OFFSET_WIDTH);
+    final long end = offsetBuffer.getLong((long) (startIndex + length) * OFFSET_WIDTH);
+    final long dataLength = end - start;
+    /* length values need length + 1 offset slots */
+    target.allocateOffsetBuffer((long) (length + 1) * OFFSET_WIDTH);
+    for (int i = 0; i < length + 1; i++) {
+      /* rebase every offset so that the first value of the target starts at 0 */
+      final long relativeSourceOffset = offsetBuffer.getLong((long) (startIndex + i) * OFFSET_WIDTH) - start;
+      target.offsetBuffer.setLong((long) i * OFFSET_WIDTH, relativeSourceOffset);
+    }
+    /* the data itself is not copied: the target gets a slice of this vector's data buffer */
+    final ArrowBuf slicedBuffer = valueBuffer.slice(start, dataLength);
+    target.valueBuffer = transferBuffer(slicedBuffer, target.allocator);
+  }
+
+  /*
+   * Transfer the validity bits of the values [startIndex, startIndex + length) to target.
+   * When startIndex is a multiple of 8 the source buffer is sliced, otherwise the bits
+   * are shifted into a newly allocated buffer. Nothing is done when length is 0.
+   */
+  private void splitAndTransferValidityBuffer(int startIndex, int length,
+      BaseLargeVariableWidthVector target) {
+    int firstByteSource = BitVectorHelper.byteIndex(startIndex);
+    int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1);
+    int byteSizeTarget = getValidityBufferSizeFromCount(length);
+    /* position of the first bit to copy inside its byte */
+    int offset = startIndex % 8;
+
+    if (length > 0) {
+      if (offset == 0) {
+        // slice
+        if (target.validityBuffer != null) {
+          target.validityBuffer.getReferenceManager().release();
+        }
+        target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget);
+        /* the slice is retained on behalf of the target */
+        target.validityBuffer.getReferenceManager().retain();
+      } else {
+        /* Copy data
+         * When the first bit starts from the middle of a byte (offset != 0),
+         * copy data from src BitVector.
+         * Each byte in the target is composed by a part in i-th byte,
+         * another part in (i+1)-th byte.
+         */
+        target.allocateValidityBuffer(byteSizeTarget);
+
+        for (int i = 0; i < byteSizeTarget - 1; i++) {
+          byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, firstByteSource + i, offset);
+          byte b2 = BitVectorHelper.getBitsFromNextByte(this.validityBuffer, firstByteSource + i + 1, offset);
+
+          target.validityBuffer.setByte(i, (b1 + b2));
+        }
+        /* Copying the last piece is done in the following manner:
+         * if the source vector has 1 or more bytes remaining, we copy
+         * the last piece as a byte formed by shifting data
+         * from the current byte and the next byte.
+         *
+         * if the source vector has no more bytes remaining
+         * (we are at the last byte), we copy the last piece as a byte
+         * by shifting data from the current byte.
+         */
+        if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) {
+          byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer,
+              firstByteSource + byteSizeTarget - 1, offset);
+          byte b2 = BitVectorHelper.getBitsFromNextByte(this.validityBuffer,
+              firstByteSource + byteSizeTarget, offset);
+
+          target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2);
+        } else {
+          byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer,
+              firstByteSource + byteSizeTarget - 1, offset);
+          target.validityBuffer.setByte(byteSizeTarget - 1, b1);
+        }
+      }
+    }
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          common getters and setters                            |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Get the number of elements that are null in the vector.
+   *
+   * @return the number of null elements.
+   */
+  public int getNullCount() {
+    return BitVectorHelper.getNullCount(validityBuffer, valueCount);
+  }
+
+  /**
+   * Check if the given index is within the current value capacity
+   * of the vector.
+ * + * @param index position to check + * @return true if index is within the current value capacity + */ + public boolean isSafe(int index) { + return index < getValueCapacity(); + } + + /** + * Check if element at given index is null. + * + * @param index position of element + * @return true if element at given index is null + */ + public boolean isNull(int index) { + return (isSet(index) == 0); + } + + /** + * Same as {@link #isNull(int)}. + * + * @param index position of element + * @return 1 if element at given index is not null, 0 otherwise + */ + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return (b >> bitIndex) & 0x01; + } + + /** + * Get the value count of vector. This will always be zero unless + * setValueCount(int) has been called prior to calling this. + * + * @return valueCount for the vector + */ + public int getValueCount() { + return valueCount; + } + + /** + * Sets the value count for the vector. + * + * @param valueCount value count + */ + public void setValueCount(int valueCount) { + assert valueCount >= 0; + this.valueCount = valueCount; + while (valueCount > getValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + fillHoles(valueCount); + lastSet = valueCount - 1; + setReaderAndWriterIndex(); + } + + /** + * Create holes in the vector upto the given index (exclusive). + * Holes will be created from the current last set position in + * the vector. + * + * @param index target index + */ + public void fillEmpties(int index) { + handleSafe(index, emptyByteArray.length); + fillHoles(index); + lastSet = index - 1; + } + + /** + * Set the index of last non-null element in the vector. + * It is important to call this method with appropriate value + * before calling {@link #setValueCount(int)}. + * + * @param value desired index of last non-null element. 
+ */ + public void setLastSet(int value) { + lastSet = value; + } + + /** + * Get the index of last non-null element in the vector. + * + * @return index of the last non-null element + */ + public int getLastSet() { + return lastSet; + } + + /** + * Mark the particular position in the vector as non-null. + * + * @param index position of the element. + */ + @Override + public void setIndexDefined(int index) { + // We need to check and realloc both validity and offset buffer + while (index >= getValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + BitVectorHelper.setBit(validityBuffer, index); + } + + /** + * Sets the value length for an element. + * + * @param index position of the element to set + * @param length length of the element + */ + public void setValueLengthSafe(int index, int length) { + assert index >= 0; + handleSafe(index, length); + fillHoles(index); + final long startOffset = getStartOffset(index); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + lastSet = index; + } + + /** + * Get the variable length element at specified index as Text. + * + * @param index position of element to get + * @return greater than 0 length for non-null element, 0 otherwise + */ + public int getValueLength(int index) { + assert index >= 0; + if (isSet(index) == 0) { + return 0; + } + final long startOffset = getStartOffset(index); + final int dataLength = + (int) (offsetBuffer.getLong((long) (index + 1) * OFFSET_WIDTH) - startOffset); + return dataLength; + } + + /** + * Set the variable length element at the specified index to the supplied + * byte array. 
This is same as using {@link #set(int, byte[], int, int)}
+   * with start as 0 and length as value.length. No capacity check is made; see
+   * {@link #setSafe(int, byte[])} for the variant that grows the buffers.
+   *
+   * @param index   position of the element to set
+   * @param value   array of bytes to write
+   */
+  public void set(int index, byte[] value) {
+    assert index >= 0;
+    /* write offsets for any positions skipped since the last set */
+    fillHoles(index);
+    BitVectorHelper.setBit(validityBuffer, index);
+    setBytes(index, value, 0, value.length);
+    lastSet = index;
+  }
+
+  /**
+   * Same as {@link #set(int, byte[])} except that it handles the
+   * case where index and length of new element are beyond the existing
+   * capacity of the vector.
+   *
+   * @param index   position of the element to set
+   * @param value   array of bytes to write
+   */
+  public void setSafe(int index, byte[] value) {
+    assert index >= 0;
+    /* make room before anything is written */
+    handleSafe(index, value.length);
+    fillHoles(index);
+    BitVectorHelper.setBit(validityBuffer, index);
+    setBytes(index, value, 0, value.length);
+    lastSet = index;
+  }
+
+  /**
+   * Set the variable length element at the specified index to the supplied
+   * byte array. No capacity check is made.
+   *
+   * @param index   position of the element to set
+   * @param value   array of bytes to write
+   * @param start   start index in array of bytes
+   * @param length  length of data in array of bytes
+   */
+  public void set(int index, byte[] value, int start, int length) {
+    assert index >= 0;
+    /* write offsets for any positions skipped since the last set */
+    fillHoles(index);
+    BitVectorHelper.setBit(validityBuffer, index);
+    setBytes(index, value, start, length);
+    lastSet = index;
+  }
+
+  /**
+   * Same as {@link #set(int, byte[], int, int)} except that it handles the
+   * case where index and length of new element are beyond the existing
+   * capacity of the vector.
+   *
+   * @param index   position of the element to set
+   * @param value   array of bytes to write
+   * @param start   start index in array of bytes
+   * @param length  length of data in array of bytes
+   */
+  public void setSafe(int index, byte[] value, int start, int length) {
+    assert index >= 0;
+    /* make room before anything is written */
+    handleSafe(index, length);
+    fillHoles(index);
+    BitVectorHelper.setBit(validityBuffer, index);
+    setBytes(index, value, start, length);
+    lastSet = index;
+  }
+
+  /**
+   * Set the variable length element at the specified index to the
+   * content in supplied ByteBuffer. No capacity check is made.
+   *
+   * @param index   position of the element to set
+   * @param value   ByteBuffer with data
+   * @param start   start index in ByteBuffer
+   * @param length  length of data in ByteBuffer
+   */
+  public void set(int index, ByteBuffer value, int start, int length) {
+    assert index >= 0;
+    /* write offsets for any positions skipped since the last set */
+    fillHoles(index);
+    BitVectorHelper.setBit(validityBuffer, index);
+    final long startOffset = getStartOffset(index);
+    /* end offset of this element = its start offset + its length */
+    offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + length);
+    valueBuffer.setBytes(startOffset, value, start, length);
+    lastSet = index;
+  }
+
+  /**
+   * Same as {@link #set(int, ByteBuffer, int, int)} except that it handles the
+   * case where index and length of new element are beyond the existing
+   * capacity of the vector.
+   *
+   * @param index   position of the element to set
+   * @param value   ByteBuffer with data
+   * @param start   start index in ByteBuffer
+   * @param length  length of data in ByteBuffer
+   */
+  public void setSafe(int index, ByteBuffer value, int start, int length) {
+    assert index >= 0;
+    /* make room before anything is written */
+    handleSafe(index, length);
+    fillHoles(index);
+    BitVectorHelper.setBit(validityBuffer, index);
+    final long startOffset = getStartOffset(index);
+    /* end offset of this element = its start offset + its length */
+    offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + length);
+    valueBuffer.setBytes(startOffset, value, start, length);
+    lastSet = index;
+  }
+
+  /**
+   * Set the element at the given index to null.
+ * + * @param index position of element + */ + public void setNull(int index) { + // We need to check and realloc both validity and offset buffer + while (index >= getValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + BitVectorHelper.unsetBit(validityBuffer, index); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param start start position of data in buffer + * @param end end position of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void set(int index, int isSet, long start, long end, ArrowBuf buffer) { + assert index >= 0; + final long dataLength = end - start; + fillHoles(index); + BitVectorHelper.setValidityBit(validityBuffer, index, isSet); + final long startOffset = offsetBuffer.getLong((long) index * OFFSET_WIDTH); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, end); + valueBuffer.setBytes(startOffset, buffer, start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, int, long, long, ArrowBuf)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. 
+ * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param start start position of data in buffer + * @param end end position of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void setSafe(int index, int isSet, long start, long end, ArrowBuf buffer) { + assert index >= 0; + final long dataLength = end - start; + handleSafe(index, (int) dataLength); + fillHoles(index); + BitVectorHelper.setValidityBit(validityBuffer, index, isSet); + final long startOffset = offsetBuffer.getLong((long) index * OFFSET_WIDTH); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, buffer, start, dataLength); + lastSet = index; + } + + /** + * Store the given value at a particular position in the vector. This overload + * always sets the validity bit of the element. + * @param index position of the new value + * @param start start position of data in buffer + * @param length length of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void set(int index, long start, int length, ArrowBuf buffer) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final long startOffset = offsetBuffer.getLong((long) index * OFFSET_WIDTH); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + final ArrowBuf bb = buffer.slice(start, length); + valueBuffer.setBytes(startOffset, bb); + lastSet = index; + } + + /** + * Same as {@link #set(int, long, int, ArrowBuf)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector.
+ * @param index position of the new value + * @param start start position of data in buffer + * @param length length of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void setSafe(int index, long start, int length, ArrowBuf buffer) { + assert index >= 0; + handleSafe(index, length); + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final long startOffset = offsetBuffer.getLong((long) index * OFFSET_WIDTH); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + final ArrowBuf bb = buffer.slice(start, length); + valueBuffer.setBytes(startOffset, bb); + lastSet = index; + } + + + /*----------------------------------------------------------------* + | | + | helper methods for setters | + | | + *----------------------------------------------------------------*/ + + + /** + * Write a zero-length entry for every position from lastSet + 1 (inclusive) up to + * index (exclusive), so that each skipped position gets an end offset equal to its + * start offset, then set lastSet to index - 1. + * + * @param index position about to be written by the caller + */ + protected final void fillHoles(int index) { + for (int i = lastSet + 1; i < index; i++) { + setBytes(i, emptyByteArray, 0, emptyByteArray.length); + } + lastSet = index - 1; + } + + /** + * Store length bytes of value, beginning at start, as the element at index. The end + * offset written to slot index + 1 is the element's start offset plus length. The + * validity buffer and lastSet are not modified here. + * + * @param index position of the element + * @param value array of bytes to copy from + * @param start start index in array of bytes + * @param length number of bytes to copy + */ + protected final void setBytes(int index, byte[] value, int start, int length) { + /* end offset of current last element in the vector. this will + * be the start offset of new element we are trying to store. + */ + final long startOffset = getStartOffset(index); + /* set new end offset */ + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + /* store the var length data in value buffer */ + valueBuffer.setBytes(startOffset, value, start, length); + } + + /** + * Gets the starting offset of a record, given its index. + * @param index index of the record. + * @return the starting offset of the record.
+ */ + protected final long getStartOffset(int index) { + return offsetBuffer.getLong((long) index * OFFSET_WIDTH); + } + + /** + * Grow the buffers until an element of dataLength bytes can be stored at index. The + * validity and offset buffers are reallocated while index is not below + * getValueCapacity(); the data buffer is reallocated while its capacity is smaller + * than the end offset of the last set element (0 when nothing has been set) plus + * dataLength. + * + * @param index position of the element about to be written + * @param dataLength number of data bytes the element needs + */ + protected final void handleSafe(int index, int dataLength) { + /* + * IMPORTANT: + * value buffer for variable length vectors moves independent + * of the companion validity and offset buffers. This is in + * contrast to what we have for fixed width vectors. + * + * Here there is no concept of getValueCapacity() in the + * data stream. getValueCapacity() is applicable only to validity + * and offset buffers. + * + * So even though we may have setup an initial capacity of 1024 + * elements in the vector, it is quite possible + * that we need to reAlloc() the data buffer when we are setting + * the 5th element in the vector simply because previous + * variable length elements have exhausted the buffer capacity. + * However, we really don't need to reAlloc() validity and + * offset buffers until we try to set the 1025th element + * This is why we do a separate check for safe methods to + * determine which buffer needs reallocation. + */ + while (index >= getValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + final long startOffset = lastSet < 0 ? 0L : getStartOffset(lastSet + 1); + while (valueBuffer.capacity() < (startOffset + dataLength)) { + reallocDataBuffer(); + } + } + + /** + * Method used by Json Writer to read a variable width element from + * the variable width vector and write to Json. + * + *

This method should not be used externally. + * + * @param data buffer storing the variable width vector elements + * @param offset buffer storing the offsets of variable width vector elements + * @param index position of the element in the vector + * @return newly allocated array of bytes holding a copy of the element's data + */ + public static byte[] get(final ArrowBuf data, final ArrowBuf offset, int index) { + final long currentStartOffset = offset.getLong((long) index * OFFSET_WIDTH); + /* element length = next element's start offset minus this element's start offset */ + final int dataLength = + (int) (offset.getLong((long) (index + 1) * OFFSET_WIDTH) - currentStartOffset); + final byte[] result = new byte[dataLength]; + data.getBytes(currentStartOffset, result, 0, dataLength); + return result; + } + + /** + * Method used by Json Reader to explicitly set the offsets of the variable + * width vector data. The method takes care of allocating the memory for + * offsets if the caller hasn't done so. + * + *

This method should not be used externally. + * + * @param buffer ArrowBuf to store offsets for variable width elements + * @param allocator memory allocator + * @param valueCount number of elements + * @param index position of the element + * @param value offset of the element + * @return the buffer that was passed in, or a newly allocated one when buffer was null + */ + public static ArrowBuf set(ArrowBuf buffer, BufferAllocator allocator, + int valueCount, int index, long value) { + if (buffer == null) { + buffer = allocator.buffer((long) valueCount * OFFSET_WIDTH); + } + buffer.setLong((long) index * OFFSET_WIDTH, value); + /* the writer index is only advanced once the last slot (valueCount - 1) is written */ + if (index == (valueCount - 1)) { + buffer.writerIndex((long) valueCount * OFFSET_WIDTH); + } + + return buffer; + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector. + * A null source cell is stored as a zero-length element with its validity bit + * cleared. This method does not call handleSafe, so no buffer is reallocated. + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + @Override + public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { + Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); + if (from.isNull(fromIndex)) { + fillHoles(thisIndex); + BitVectorHelper.unsetBit(this.validityBuffer, thisIndex); + final long copyStart = offsetBuffer.getLong((long) thisIndex * OFFSET_WIDTH); + offsetBuffer.setLong((long) (thisIndex + 1) * OFFSET_WIDTH, copyStart); + } else { + final long start = from.getOffsetBuffer().getLong((long) fromIndex * OFFSET_WIDTH); + final long end = from.getOffsetBuffer().getLong((long) (fromIndex + 1) * OFFSET_WIDTH); + final long length = end - start; + fillHoles(thisIndex); + BitVectorHelper.setBit(this.validityBuffer, thisIndex); + final long copyStart = offsetBuffer.getLong((long) thisIndex * OFFSET_WIDTH); + from.getDataBuffer().getBytes(start, this.valueBuffer, copyStart, (int) length); + offsetBuffer.setLong((long) (thisIndex + 1) * OFFSET_WIDTH, copyStart + length); + } + lastSet = thisIndex; + } + + /** + * Same
as {@link #copyFrom(int, int, ValueVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + @Override + public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) { + Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); + if (from.isNull(fromIndex)) { + handleSafe(thisIndex, 0); + fillHoles(thisIndex); + BitVectorHelper.unsetBit(this.validityBuffer, thisIndex); + final long copyStart = offsetBuffer.getLong((long) thisIndex * OFFSET_WIDTH); + offsetBuffer.setLong((long) (thisIndex + 1) * OFFSET_WIDTH, copyStart); + } else { + final long start = from.getOffsetBuffer().getLong((long) fromIndex * OFFSET_WIDTH); + final long end = from.getOffsetBuffer().getLong((long) (fromIndex + 1) * OFFSET_WIDTH); + final int length = (int) (end - start); + handleSafe(thisIndex, length); + fillHoles(thisIndex); + BitVectorHelper.setBit(this.validityBuffer, thisIndex); + final long copyStart = offsetBuffer.getLong((long) thisIndex * OFFSET_WIDTH); + from.getDataBuffer().getBytes(start, this.valueBuffer, copyStart, length); + offsetBuffer.setLong((long) (thisIndex + 1) * OFFSET_WIDTH, copyStart + length); + } + lastSet = thisIndex; + } + + /** + * Get a pointer to the data of the element at the given index, using a newly + * created ArrowBufPointer. + * + * @param index position of the element + * @return the new pointer + */ + @Override + public ArrowBufPointer getDataPointer(int index) { + return getDataPointer(index, new ArrowBufPointer()); + } + + /** + * Point reuse at the data of the element at the given index. For a null element + * reuse is set to (null, 0, 0); otherwise it is set to the data buffer, the + * element's start offset and the element's length. + * + * @param index position of the element + * @param reuse pointer object to populate + * @return the reuse object that was passed in + */ + @Override + public ArrowBufPointer getDataPointer(int index, ArrowBufPointer reuse) { + if (isNull(index)) { + reuse.set(null, 0, 0); + } else { + long offset = offsetBuffer.getLong((long) index * OFFSET_WIDTH); + int length = (int) (offsetBuffer.getLong((long) (index + 1) * OFFSET_WIDTH) - offset); + reuse.set(valueBuffer, offset, length); + } + return reuse; + } + + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + @Override + public int
hashCode(int index, ArrowBufHasher hasher) { + /* null elements all share ArrowBufPointer.NULL_HASH_CODE */ + if (isNull(index)) { + return ArrowBufPointer.NULL_HASH_CODE; + } + /* hash the data-buffer range between this element's start offset and the next element's start offset */ + final long start = getStartOffset(index); + final long end = getStartOffset(index + 1); + return ByteFunctionHelpers.hash(hasher, this.getDataBuffer(), start, end); + } + + /** + * Dispatch to the visitor by calling visitor.visit(this, value) and return its result. + */ + @Override + public OUT accept(VectorVisitor visitor, IN value) { + return visitor.visit(this, value); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java new file mode 100644 index 000000000..22fe4254f --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.arrow.vector; + +import java.util.Collections; +import java.util.Iterator; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.ReferenceManager; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.util.DataSizeRoundingUtil; +import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.util.ValueVectorUtility; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Base class for other Arrow Vector Types. Provides basic functionality around + * memory management. + */ +public abstract class BaseValueVector implements ValueVector { + private static final Logger logger = LoggerFactory.getLogger(BaseValueVector.class); + + public static final String MAX_ALLOCATION_SIZE_PROPERTY = "arrow.vector.max_allocation_bytes"; + public static final long MAX_ALLOCATION_SIZE = Long.getLong(MAX_ALLOCATION_SIZE_PROPERTY, Long.MAX_VALUE); + /* + * For all fixed width vectors, the value and validity buffers are sliced from a single buffer. + * Similarly, for variable width vectors, the offsets and validity buffers are sliced from a + * single buffer. To ensure the single buffer is power-of-2 size, the initial value allocation + * should be less than power-of-2. For IntVectors, this comes to 3970*4 (15880) for the data + * buffer and 504 bytes for the validity buffer, totalling to 16384 (2^14). + */ + public static final int INITIAL_VALUE_ALLOCATION = 3970; + + protected final BufferAllocator allocator; + + protected BaseValueVector(BufferAllocator allocator) { + this.allocator = Preconditions.checkNotNull(allocator, "allocator cannot be null"); + } + + @Override + public abstract String getName(); + + /** + * Representation of vector suitable for debugging.
+ */ + @Override + public String toString() { + return ValueVectorUtility.getToString(this, 0, getValueCount()); + } + + @Override + public void clear() { + } + + @Override + public void close() { + clear(); + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return getTransferPair(getName(), allocator); + } + + @Override + public Iterator iterator() { + return Collections.emptyIterator(); + } + + /** + * Checks to ensure that every buffer vv uses + * has a positive reference count, throws if this precondition + * isn't met. Returns true otherwise. + */ + public static boolean checkBufRefs(final ValueVector vv) { + for (final ArrowBuf buffer : vv.getBuffers(false)) { + if (buffer.refCnt() <= 0) { + throw new IllegalStateException("zero refcount"); + } + } + + return true; + } + + @Override + public BufferAllocator getAllocator() { + return allocator; + } + + /** + * Throw UnsupportedOperationException when target has a different minor type + * than this vector. + * + * @param target vector whose minor type is compared with this vector's + * @param caller text placed at the start of the exception message + */ + void compareTypes(BaseValueVector target, String caller) { + if (this.getMinorType() != target.getMinorType()) { + throw new UnsupportedOperationException(caller + " should have vectors of exact same type"); + } + } + + /** + * Call release() on the reference manager of the given buffer and return the + * allocator's empty buffer, which the caller can store in place of the released one. + * + * @param buffer buffer to release + * @return the buffer returned by allocator.getEmpty() + */ + protected ArrowBuf releaseBuffer(ArrowBuf buffer) { + buffer.getReferenceManager().release(); + buffer = allocator.getEmpty(); + return buffer; + } + + /* number of bytes for the validity buffer for the given valueCount */ + protected static int getValidityBufferSizeFromCount(final int valueCount) { + return DataSizeRoundingUtil.divideBy8Ceil(valueCount); + } + + /* bytes for the validity buffer for the given valueCount, rounded up to a multiple of 8 bytes (64 bits) */ + private static long roundUp8ForValidityBuffer(long valueCount) { + return ((valueCount + 63) >> 6) << 3; + } + + long computeCombinedBufferSize(int valueCount, int typeWidth) { + Preconditions.checkArgument(valueCount >= 0, "valueCount must be >= 0"); + Preconditions.checkArgument(typeWidth >= 0, "typeWidth must be >= 0"); + + // compute size of validity buffer.
+ long bufferSize = roundUp8ForValidityBuffer(valueCount); + + // add the size of the value buffer. + if (typeWidth == 0) { + // for boolean type, value-buffer and validity-buffer are of same size. + bufferSize *= 2; + } else { + bufferSize += DataSizeRoundingUtil.roundUpTo8Multiple((long) valueCount * typeWidth); + } + return allocator.getRoundingPolicy().getRoundedSize(bufferSize); + } + + /** + * Container for the two buffers of a primitive vector (one for the validity bit-mask + * and one to hold the values). + */ + class DataAndValidityBuffers { + private ArrowBuf dataBuf; + private ArrowBuf validityBuf; + + DataAndValidityBuffers(ArrowBuf dataBuf, ArrowBuf validityBuf) { + this.dataBuf = dataBuf; + this.validityBuf = validityBuf; + } + + ArrowBuf getDataBuf() { + return dataBuf; + } + + ArrowBuf getValidityBuf() { + return validityBuf; + } + } + + DataAndValidityBuffers allocFixedDataAndValidityBufs(int valueCount, int typeWidth) { + long bufferSize = computeCombinedBufferSize(valueCount, typeWidth); + assert bufferSize <= MAX_ALLOCATION_SIZE; + + long validityBufferSize; + long dataBufferSize; + if (typeWidth == 0) { + validityBufferSize = dataBufferSize = bufferSize / 2; + } else { + // Due to the rounding policy, the bufferSize could be greater than the + // requested size. Utilize the allocated buffer fully. + // Start from an estimated element count and decrement it until the rounded + // validity and data sizes fit in bufferSize. + long actualCount = (long) ((bufferSize * 8.0) / (8 * typeWidth + 1)); + do { + validityBufferSize = roundUp8ForValidityBuffer(actualCount); + dataBufferSize = DataSizeRoundingUtil.roundUpTo8Multiple(actualCount * typeWidth); + if (validityBufferSize + dataBufferSize <= bufferSize) { + break; + } + --actualCount; + } + while (true); + } + + + /* allocate combined buffer */ + ArrowBuf combinedBuffer = allocator.buffer(bufferSize); + + /* slice into requested lengths */ + ArrowBuf dataBuf = null; + ArrowBuf validityBuf = null; + long bufferOffset = 0; + for (int numBuffers = 0; numBuffers < 2; ++numBuffers) { + long len = (numBuffers == 0 ?
dataBufferSize : validityBufferSize); + ArrowBuf buf = combinedBuffer.slice(bufferOffset, len); + /* retain() is called for each slice before release() is called on the combined buffer below */ + buf.getReferenceManager().retain(); + buf.readerIndex(0); + buf.writerIndex(0); + + bufferOffset += len; + if (numBuffers == 0) { + dataBuf = buf; + } else { + validityBuf = buf; + } + } + combinedBuffer.getReferenceManager().release(); + return new DataAndValidityBuffers(dataBuf, validityBuf); + } + + /** + * Transfer ownership of srcBuffer to targetAllocator through the buffer's reference + * manager and return the transferred buffer. + * + * @param srcBuffer buffer to transfer + * @param targetAllocator allocator that receives ownership + * @return the buffer obtained from the ownership transfer + */ + public static ArrowBuf transferBuffer(final ArrowBuf srcBuffer, final BufferAllocator targetAllocator) { + final ReferenceManager referenceManager = srcBuffer.getReferenceManager(); + return referenceManager.transferOwnership(srcBuffer, targetAllocator).getTransferredBuffer(); + } + + /** + * Not supported by this base class; always throws UnsupportedOperationException. + */ + @Override + public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { + throw new UnsupportedOperationException(); + } + + /** + * Not supported by this base class; always throws UnsupportedOperationException. + */ + @Override + public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) { + throw new UnsupportedOperationException(); + } +} + diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java new file mode 100644 index 000000000..866dd9e21 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -0,0 +1,1410 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; + +/** + * BaseVariableWidthVector is a base class providing functionality for strings/bytes types. 
+ */ +public abstract class BaseVariableWidthVector extends BaseValueVector + implements VariableWidthVector, FieldVector, VectorDefinitionSetter { + private static final int DEFAULT_RECORD_BYTE_COUNT = 8; + private static final int INITIAL_BYTE_COUNT = INITIAL_VALUE_ALLOCATION * DEFAULT_RECORD_BYTE_COUNT; + /* value capacity and data-buffer size passed to allocateNew(long, int) by the no-argument allocation methods */ + private int lastValueCapacity; + private long lastValueAllocationSizeInBytes; + + /* public and protected members */ + public static final int OFFSET_WIDTH = 4; /* 4 byte unsigned int to track offsets */ + protected static final byte[] emptyByteArray = new byte[]{}; + protected ArrowBuf validityBuffer; + protected ArrowBuf valueBuffer; + protected ArrowBuf offsetBuffer; + protected int valueCount; + protected int lastSet; + protected final Field field; + + /** + * Constructs a new instance. No memory is allocated here: all three buffers start + * as the allocator's empty buffer and lastSet starts at -1. + * + * @param field The field materialized by this vector. + * @param allocator The allocator to use for creating/resizing buffers + */ + public BaseVariableWidthVector(Field field, final BufferAllocator allocator) { + super(allocator); + this.field = field; + lastValueAllocationSizeInBytes = INITIAL_BYTE_COUNT; + // -1 because we require one extra slot for the offset array. + lastValueCapacity = INITIAL_VALUE_ALLOCATION - 1; + valueCount = 0; + lastSet = -1; + offsetBuffer = allocator.getEmpty(); + validityBuffer = allocator.getEmpty(); + valueBuffer = allocator.getEmpty(); + } + + @Override + public String getName() { + return field.getName(); + } + + /* TODO: + * see if getNullCount() can be made faster -- O(1) + */ + + /* TODO: + * Once the entire hierarchy has been refactored, move common functions + * like getNullCount(), splitAndTransferValidityBuffer to top level + * base class BaseValueVector. + * + * Along with this, some class members (validityBuffer) can also be + * abstracted out to top level base class.
+ * + * Right now BaseValueVector is the top level base class for other + * vector types in ValueVector hierarchy (non-nullable) and those + * vectors have not yet been refactored/removed so moving things to + * the top class as of now is not a good idea. + */ + + /** + * Get buffer that manages the validity (NULL or NON-NULL nature) of + * elements in the vector. Consider it as a buffer for internal bit vector + * data structure. + * @return buffer + */ + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + /** + * Get the buffer that stores the data for elements in the vector. + * @return buffer + */ + @Override + public ArrowBuf getDataBuffer() { + return valueBuffer; + } + + /** + * Get the buffer that stores the offsets for elements + * in the vector. This operation is not supported for fixed-width vectors. + * @return buffer + */ + @Override + public ArrowBuf getOffsetBuffer() { + return offsetBuffer; + } + + /** + * Get the memory address of buffer that stores the offsets for elements + * in the vector. + * @return starting address of the buffer + */ + @Override + public long getOffsetBufferAddress() { + return offsetBuffer.memoryAddress(); + } + + /** + * Get the memory address of buffer that manages the validity + * (NULL or NON-NULL nature) of elements in the vector. + * @return starting address of the buffer + */ + @Override + public long getValidityBufferAddress() { + return validityBuffer.memoryAddress(); + } + + /** + * Get the memory address of buffer that stores the data for elements + * in the vector. + * @return starting address of the buffer + */ + @Override + public long getDataBufferAddress() { + return valueBuffer.memoryAddress(); + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector.
+ * @param valueCount desired number of elements in the vector + */ + @Override + public void setInitialCapacity(int valueCount) { + final long size = (long) valueCount * DEFAULT_RECORD_BYTE_COUNT; + checkDataBufferSize(size); + computeAndCheckOffsetsBufferSize(valueCount); + /* saturate instead of wrapping: a plain (int) cast turns sizes above Integer.MAX_VALUE into negative or tiny values */ + lastValueAllocationSizeInBytes = capAtMaxInt(size); + lastValueCapacity = valueCount; + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector. + * The data size recorded for the next allocation is valueCount * density, at least 1 + * byte and capped by capAtMaxInt. + * @param valueCount desired number of elements in the vector + * @param density average number of bytes per variable width element + */ + @Override + public void setInitialCapacity(int valueCount, double density) { + long size = Math.max((long) (valueCount * density), 1L); + checkDataBufferSize(size); + computeAndCheckOffsetsBufferSize(valueCount); + /* saturate instead of wrapping, same as in setInitialCapacity(int) */ + lastValueAllocationSizeInBytes = capAtMaxInt(size); + lastValueCapacity = valueCount; + } + + /** + * Get the density of this vector: the difference between the offset stored at slot + * valueCount and the offset stored at slot 0, divided by valueCount. + * @return density, or 0.0 when valueCount is 0 + */ + public double getDensity() { + if (valueCount == 0) { + return 0.0D; + } + final int startOffset = offsetBuffer.getInt(0); + final int endOffset = offsetBuffer.getInt((long) valueCount * OFFSET_WIDTH); + final double totalListSize = endOffset - startOffset; + return totalListSize / valueCount; + } + + /** + * Get the current capacity which does not exceed either validity buffer or offset buffer. + * Note: Here the `getValueCapacity` has no relationship with the value buffer. + * @return number of elements that vector can hold.
+ */ + @Override + public int getValueCapacity() { + /* one offset slot is reserved for the end offset of the last element */ + final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity() - 1, 0); + return Math.min(offsetValueCapacity, getValidityBufferValueCapacity()); + } + + /* validity buffer capacity in bits (8 per byte), passed through capAtMaxInt */ + private int getValidityBufferValueCapacity() { + return capAtMaxInt(validityBuffer.capacity() * 8); + } + + /* number of OFFSET_WIDTH-sized slots in the offset buffer, passed through capAtMaxInt */ + private int getOffsetBufferValueCapacity() { + return capAtMaxInt(offsetBuffer.capacity() / OFFSET_WIDTH); + } + + /** + * zero out the vector and the data in associated buffers. + */ + public void zeroVector() { + initValidityBuffer(); + initOffsetBuffer(); + valueBuffer.setZero(0, valueBuffer.capacity()); + } + + /* zero out the validity buffer */ + private void initValidityBuffer() { + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + /* zero out the offset buffer */ + private void initOffsetBuffer() { + offsetBuffer.setZero(0, offsetBuffer.capacity()); + } + + /** + * Reset the vector to initial state: zero the buffers via {@link #zeroVector()} and + * set lastSet back to -1 and valueCount back to 0. + * Note that this method doesn't release any memory. + */ + public void reset() { + zeroVector(); + lastSet = -1; + valueCount = 0; + } + + /** + * Close the vector and release the associated buffers. + */ + @Override + public void close() { + clear(); + } + + /** + * Same as {@link #close()}. + */ + @Override + public void clear() { + validityBuffer = releaseBuffer(validityBuffer); + valueBuffer = releaseBuffer(valueBuffer); + offsetBuffer = releaseBuffer(offsetBuffer); + lastSet = -1; + valueCount = 0; + } + + /** + * Get the inner vectors. + * + * @deprecated This API will be removed as the current implementations no longer support inner vectors. + * + * @return the inner vectors for this field as defined by the TypeLayout + */ + @Deprecated + @Override + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. Use getFieldBuffers"); + } + + /** + * Initialize the children in schema for this Field.
This operation is a + * NO-OP for scalar types since they don't have any children. + * @param children the schema + * @throws IllegalArgumentException if children is a non-empty list for scalar types. + */ + @Override + public void initializeChildrenFromFields(List children) { + if (!children.isEmpty()) { + throw new IllegalArgumentException("primitive type vector can not have children"); + } + } + + /** + * Get the inner child vectors. + * @return list of child vectors for complex types, empty list for scalar vector types + */ + @Override + public List getChildrenFromFields() { + return Collections.emptyList(); + } + + + /** + * Load the buffers of this vector with provided source buffers. + * The caller manages the source buffers and populates them before invoking + * this method. + * After loading, valueCount is the field node's length and lastSet is that length minus one. + * @param fieldNode the fieldNode indicating the value count + * @param ownBuffers the buffers for this Field (own buffers only, children not included), + * read in the order validity, offset, data + */ + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + ArrowBuf bitBuffer = ownBuffers.get(0); + ArrowBuf offBuffer = ownBuffers.get(1); + ArrowBuf dataBuffer = ownBuffers.get(2); + + validityBuffer.getReferenceManager().release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + offsetBuffer.getReferenceManager().release(); + offsetBuffer = offBuffer.getReferenceManager().retain(offBuffer, allocator); + valueBuffer.getReferenceManager().release(); + valueBuffer = dataBuffer.getReferenceManager().retain(dataBuffer, allocator); + + lastSet = fieldNode.getLength() - 1; + valueCount = fieldNode.getLength(); + } + + /** + * Get the buffers belonging to this vector. + * @return the inner buffers, in the order validity, offset, data. + */ + public List getFieldBuffers() { + // before flight/IPC, we must bring the vector to a consistent state. + // this is because, it is possible that the offset buffers of some trailing values + // are not updated.
this may cause some data in the data buffer being lost. + // for details, please see TestValueVector#testUnloadVariableWidthVector. + fillHoles(valueCount); + + List result = new ArrayList<>(3); + setReaderAndWriterIndex(); + result.add(validityBuffer); + result.add(offsetBuffer); + result.add(valueBuffer); + + return result; + } + + /** + * Set the reader and writer indexes for the inner buffers. + * Reader indexes are set to 0. Writer indexes are set to 0 when valueCount is 0; + * otherwise they are set from valueCount: the validity size for valueCount values, + * valueCount + 1 offset slots, and the start offset stored for slot valueCount. + */ + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + offsetBuffer.readerIndex(0); + valueBuffer.readerIndex(0); + if (valueCount == 0) { + validityBuffer.writerIndex(0); + offsetBuffer.writerIndex(0); + valueBuffer.writerIndex(0); + } else { + final int lastDataOffset = getStartOffset(valueCount); + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH); + valueBuffer.writerIndex(lastDataOffset); + } + } + + /** + * Same as {@link #allocateNewSafe()} except that an allocation failure is propagated + * to the caller as an exception instead of being reported through the return value. + */ + @Override + public void allocateNew() { + allocateNew(lastValueAllocationSizeInBytes, lastValueCapacity); + } + + /** + * Allocate memory for the vector. The most recently recorded data buffer size and + * value capacity are used; for a newly constructed vector these are + * INITIAL_BYTE_COUNT bytes and INITIAL_VALUE_ALLOCATION - 1 elements. + * See {@link #allocateNew(long, int)} for allocating memory for specific + * number of elements in the vector. + * + * @return false if memory allocation fails, true otherwise. + */ + @Override + public boolean allocateNewSafe() { + try { + allocateNew(lastValueAllocationSizeInBytes, lastValueCapacity); + return true; + } catch (Exception e) { + return false; + } + } + + /** + * Allocate memory for the vector to support storing at least the provided number of + * elements in the vector. This method must be called prior to using the ValueVector.
+ * + * @param totalBytes desired total memory capacity + * @param valueCount the desired number of elements in the vector + * @throws org.apache.arrow.memory.OutOfMemoryException if memory allocation fails + * @throws OversizedAllocationException if totalBytes is negative or larger than + * MAX_ALLOCATION_SIZE, or if the combined offset and validity size is larger than + * MAX_ALLOCATION_SIZE + */ + @Override + public void allocateNew(long totalBytes, int valueCount) { + assert totalBytes >= 0; + + checkDataBufferSize(totalBytes); + computeAndCheckOffsetsBufferSize(valueCount); + + /* we are doing a new allocation -- release the current buffers */ + clear(); + + try { + allocateBytes(totalBytes, valueCount); + } catch (Exception e) { + /* release whatever was allocated before the failure, then rethrow */ + clear(); + throw e; + } + } + + /** + * Allocate memory for the given number of elements, using the most recently recorded + * data buffer size for the data buffer. + * + * @param valueCount the desired number of elements in the vector + */ + @Override + public void allocateNew(int valueCount) { + allocateNew(lastValueAllocationSizeInBytes, valueCount); + } + + /* Check if the data buffer size is within bounds; a negative size is rejected as well. */ + private void checkDataBufferSize(long size) { + if (size > MAX_ALLOCATION_SIZE || size < 0) { + throw new OversizedAllocationException("Memory required for vector " + + " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); + } + } + + /* + * Compute the buffer size required for 'valueCount' offsets and validity, and check if it's + * within bounds. + */ + private long computeAndCheckOffsetsBufferSize(int valueCount) { + /* to track the end offset of last data element in vector, we need + * an additional slot in offset buffer.
+ */ + final long size = computeCombinedBufferSize(valueCount + 1, OFFSET_WIDTH); + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Memory required for vector capacity " + + valueCount + + " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); + } + return size; + } + + /* allocate the inner buffers */ + private void allocateBytes(final long valueBufferSize, final int valueCount) { + /* allocate data buffer */ + long curSize = valueBufferSize; + valueBuffer = allocator.buffer(curSize); + valueBuffer.readerIndex(0); + + /* allocate offset buffer and validity buffer */ + DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(valueCount + 1, OFFSET_WIDTH); + offsetBuffer = buffers.getDataBuf(); + validityBuffer = buffers.getValidityBuf(); + initOffsetBuffer(); + initValidityBuffer(); + + lastValueCapacity = getValueCapacity(); + lastValueAllocationSizeInBytes = capAtMaxInt(valueBuffer.capacity()); + } + + /* allocate offset buffer */ + private void allocateOffsetBuffer(final long size) { + final int curSize = (int) size; + offsetBuffer = allocator.buffer(curSize); + offsetBuffer.readerIndex(0); + initOffsetBuffer(); + } + + /* allocate validity buffer */ + private void allocateValidityBuffer(final long size) { + final int curSize = (int) size; + validityBuffer = allocator.buffer(curSize); + validityBuffer.readerIndex(0); + initValidityBuffer(); + } + + /** + * Resize the vector to increase the capacity. The internal behavior is to + * double the current value capacity. + */ + public void reAlloc() { + reallocDataBuffer(); + reallocValidityAndOffsetBuffers(); + } + + /** + * Reallocate the data buffer. Data Buffer stores the actual data for + * VARCHAR or VARBINARY elements in the vector. The behavior is to double + * the size of buffer. 
+ * @throws OversizedAllocationException if the desired new size is more than + * max allowed + * @throws OutOfMemoryException if the internal memory allocation fails + */ + public void reallocDataBuffer() { + final long currentBufferCapacity = valueBuffer.capacity(); + long newAllocationSize = currentBufferCapacity * 2; + if (newAllocationSize == 0) { + if (lastValueAllocationSizeInBytes > 0) { + newAllocationSize = lastValueAllocationSizeInBytes; + } else { + newAllocationSize = INITIAL_BYTE_COUNT * 2L; + } + } + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + assert newAllocationSize >= 1; + + checkDataBufferSize(newAllocationSize); + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + newBuf.setBytes(0, valueBuffer, 0, currentBufferCapacity); + valueBuffer.getReferenceManager().release(); + valueBuffer = newBuf; + lastValueAllocationSizeInBytes = valueBuffer.capacity(); + } + + /** + * Reallocate the validity and offset buffers for this vector. Validity + * buffer is used to track the NULL or NON-NULL nature of elements in + * the vector and offset buffer is used to store the lengths of variable + * width elements in the vector. + * + *

Note that data buffer for variable length vectors moves independent + * of the companion validity and offset buffers. This is in + * contrast to what we have for fixed width vectors. + * + *

So even though we may have setup an initial capacity of 1024 + * elements in the vector, it is quite possible + * that we need to reAlloc() the data buffer when we are setting + * the 5th element in the vector simply because previous + * variable length elements have exhausted the buffer capacity. + * However, we really don't need to reAlloc() validity and + * offset buffers until we try to set the 1025th element + * This is why we do a separate check for safe methods to + * determine which buffer needs reallocation. + * @throws OversizedAllocationException if the desired new size is more than + * max allowed + * @throws OutOfMemoryException if the internal memory allocation fails + */ + public void reallocValidityAndOffsetBuffers() { + int targetOffsetCount = capAtMaxInt((offsetBuffer.capacity() / OFFSET_WIDTH) * 2); + if (targetOffsetCount == 0) { + if (lastValueCapacity > 0) { + targetOffsetCount = (lastValueCapacity + 1); + } else { + targetOffsetCount = 2 * (INITIAL_VALUE_ALLOCATION + 1); + } + } + computeAndCheckOffsetsBufferSize(targetOffsetCount); + + DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(targetOffsetCount, OFFSET_WIDTH); + final ArrowBuf newOffsetBuffer = buffers.getDataBuf(); + newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity()); + newOffsetBuffer.setZero(offsetBuffer.capacity(), newOffsetBuffer.capacity() - offsetBuffer.capacity()); + offsetBuffer.getReferenceManager().release(); + offsetBuffer = newOffsetBuffer; + + final ArrowBuf newValidityBuffer = buffers.getValidityBuf(); + newValidityBuffer.setBytes(0, validityBuffer, 0, validityBuffer.capacity()); + newValidityBuffer.setZero(validityBuffer.capacity(), newValidityBuffer.capacity() - validityBuffer.capacity()); + validityBuffer.getReferenceManager().release(); + validityBuffer = newValidityBuffer; + + lastValueCapacity = getValueCapacity(); + } + + /** + * Get the size (number of bytes) of underlying data buffer. 
+ * @return number of bytes in the data buffer + */ + @Override + public int getByteCapacity() { + return capAtMaxInt(valueBuffer.capacity()); + } + + @Override + public int sizeOfValueBuffer() { + if (valueCount == 0) { + return 0; + } + return offsetBuffer.getInt((long) valueCount * OFFSET_WIDTH); + } + + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector. + * @return size of underlying buffers. + */ + @Override + public int getBufferSize() { + return getBufferSizeFor(this.valueCount); + } + + /** + * Get the potential buffer size for a particular number of records. + * @param valueCount desired number of elements in the vector + * @return estimated size of underlying buffers if the vector holds + * a given number of elements + */ + @Override + public int getBufferSizeFor(final int valueCount) { + if (valueCount == 0) { + return 0; + } + + final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); + final int offsetBufferSize = (valueCount + 1) * OFFSET_WIDTH; + /* get the end offset for this valueCount */ + final int dataBufferSize = offsetBuffer.getInt((long) valueCount * OFFSET_WIDTH); + return validityBufferSize + offsetBufferSize + dataBufferSize; + } + + /** + * Get information about how this field is materialized. + * @return the field corresponding to this vector + */ + @Override + public Field getField() { + return field; + } + + /** + * Return the underlying buffers associated with this vector. Note that this doesn't + * impact the reference counts for this buffer so it only should be used for in-context + * access. Also note that this buffer changes regularly thus + * external classes shouldn't hold a reference to it (unless they change it). + * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted + * but the returned array will be the only reference to them + * @return The underlying {@link ArrowBuf buffers} that is used by this + * vector instance. 
+ */ + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final ArrowBuf[] buffers; + setReaderAndWriterIndex(); + if (getBufferSize() == 0) { + /* getBufferSize() is 0: return an empty array instead of the three buffers */ + buffers = new ArrowBuf[0]; + } else { + buffers = new ArrowBuf[3]; + buffers[0] = validityBuffer; + buffers[1] = offsetBuffer; + buffers[2] = valueBuffer; + } + if (clear) { + /* take an extra reference on every returned buffer before clearing this vector */ + for (final ArrowBuf buffer : buffers) { + buffer.getReferenceManager().retain(); + } + clear(); + } + return buffers; + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * Delegates to {@link #getTransferPair(String, BufferAllocator)}. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @param callBack not used + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return getTransferPair(ref, allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * The target vector is given the name returned by getName(). + * @param allocator allocator for the target vector + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return getTransferPair(getName(), allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return TransferPair + */ + public abstract TransferPair getTransferPair(String ref, BufferAllocator allocator); + + /** + * Transfer this vector's data to another vector. The memory associated + * with this vector is transferred to the allocator of target vector + * for accounting and management purposes.
+ * @param target destination vector for transfer + */ + public void transferTo(BaseVariableWidthVector target) { + compareTypes(target, "transferTo"); + target.clear(); + target.validityBuffer = transferBuffer(validityBuffer, target.allocator); + target.valueBuffer = transferBuffer(valueBuffer, target.allocator); + target.offsetBuffer = transferBuffer(offsetBuffer, target.allocator); + target.setLastSet(this.lastSet); + if (this.valueCount > 0) { + target.setValueCount(this.valueCount); + } + clear(); + } + + /** + * Slice this vector at desired index and length and transfer the + * corresponding data to the target vector. The target vector is cleared + * before the transfer. + * @param startIndex start position of the split in source vector. + * @param length length of the split. + * @param target destination vector + */ + public void splitAndTransferTo(int startIndex, int length, + BaseVariableWidthVector target) { + /* compare length against the remaining element count instead of computing + * startIndex + length: that int sum can overflow and wrongly pass the check */ + Preconditions.checkArgument(startIndex >= 0 && length >= 0 && length <= valueCount - startIndex, + "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); + compareTypes(target, "splitAndTransferTo"); + target.clear(); + splitAndTransferValidityBuffer(startIndex, length, target); + splitAndTransferOffsetBuffer(startIndex, length, target); + target.setLastSet(length - 1); + if (length > 0) { + target.setValueCount(length); + } + } + + /** + * Transfer the offsets along with data. Unlike the data buffer, we cannot in general + * simply slice the offset buffer for split and transfer. The reason is that offsets + * in the target vector have to be adjusted and made relative to the starting + * offset in source vector from the start index of split. This is why we + * need to explicitly allocate the offset buffer and set the adjusted offsets + * in the target vector. Only when that starting offset is 0 are the source offsets + * already relative, and the offset buffer is sliced directly.
+ */ + private void splitAndTransferOffsetBuffer(int startIndex, int length, BaseVariableWidthVector target) { + final int start = offsetBuffer.getInt((long) startIndex * OFFSET_WIDTH); + final int end = offsetBuffer.getInt((long) (startIndex + length) * OFFSET_WIDTH); + final int dataLength = end - start; + + if (start == 0) { + /* offsets from startIndex onwards are already relative to 0, so the source offset + * buffer is sliced instead of copied; positions are computed as long to avoid + * int overflow for large indexes, like every other offset access in this method */ + final ArrowBuf slicedOffsetBuffer = + offsetBuffer.slice((long) startIndex * OFFSET_WIDTH, (long) (1 + length) * OFFSET_WIDTH); + target.offsetBuffer = transferBuffer(slicedOffsetBuffer, target.allocator); + } else { + /* rebase every offset (length + 1 entries) so that the first one becomes 0 */ + target.allocateOffsetBuffer((long) (length + 1) * OFFSET_WIDTH); + for (int i = 0; i < length + 1; i++) { + final int relativeSourceOffset = offsetBuffer.getInt((long) (startIndex + i) * OFFSET_WIDTH) - start; + target.offsetBuffer.setInt((long) i * OFFSET_WIDTH, relativeSourceOffset); + } + } + /* the data bytes [start, end) are sliced from the source value buffer */ + final ArrowBuf slicedBuffer = valueBuffer.slice(start, dataLength); + target.valueBuffer = transferBuffer(slicedBuffer, target.allocator); + } + + /* + * Transfer the validity. + */ + private void splitAndTransferValidityBuffer(int startIndex, int length, + BaseVariableWidthVector target) { + if (length <= 0) { + return; + } + + final int firstByteSource = BitVectorHelper.byteIndex(startIndex); + final int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); + final int byteSizeTarget = getValidityBufferSizeFromCount(length); + final int offset = startIndex % 8; + + if (offset == 0) { + // slice + if (target.validityBuffer != null) { + target.validityBuffer.getReferenceManager().release(); + } + final ArrowBuf slicedValidityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget); + target.validityBuffer = transferBuffer(slicedValidityBuffer, target.allocator); + return; + } + + /* Copy data + * When the first bit starts from the middle of a byte (offset != 0), + * copy data from src BitVector. + * Each byte in the target is composed by a part in i-th byte, + * another part in (i+1)-th byte.
+ */ + target.allocateValidityBuffer(byteSizeTarget); + + for (int i = 0; i < byteSizeTarget - 1; i++) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, firstByteSource + i, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(this.validityBuffer, firstByteSource + i + 1, offset); + + target.validityBuffer.setByte(i, (b1 + b2)); + } + /* Copying the last piece is done in the following manner: + * if the source vector has 1 or more bytes remaining, we copy + * the last piece as a byte formed by shifting data + * from the current byte and the next byte. + * + * if the source vector has no more bytes remaining + * (we are at the last byte), we copy the last piece as a byte + * by shifting data from the current byte. + */ + if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(this.validityBuffer, + firstByteSource + byteSizeTarget, offset); + + target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2); + } else { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + target.validityBuffer.setByte(byteSizeTarget - 1, b1); + } + } + + + /*----------------------------------------------------------------* + | | + | common getters and setters | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the number of elements that are null in the vector. + * + * @return the number of null elements. + */ + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } + + /** + * Check if the given index is within the current value capacity + * of the vector. 
+ * + * @param index position to check + * @return true if index is within the current value capacity + */ + public boolean isSafe(int index) { + return index < getValueCapacity(); + } + + /** + * Check if element at given index is null. + * + * @param index position of element + * @return true if element at given index is null + */ + public boolean isNull(int index) { + return (isSet(index) == 0); + } + + /** + * Read the validity bit of the element at the given index. This is the + * inverse of {@link #isNull(int)}: a set bit means the element is not null. + * + * @param index position of element + * @return 1 if element at given index is not null, 0 otherwise + */ + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return (b >> bitIndex) & 0x01; + } + + /** + * Get the value count of vector. This will always be zero unless + * setValueCount(int) has been called prior to calling this. + * + * @return valueCount for the vector + */ + public int getValueCount() { + return valueCount; + } + + /** + * Sets the value count for the vector. The validity and offset buffers are + * reallocated until the value capacity is at least valueCount, the positions + * between the last set element and valueCount are filled with empty values, + * the last set index becomes valueCount - 1 and the reader and writer indexes + * of the buffers are updated. + * + * @param valueCount value count + */ + public void setValueCount(int valueCount) { + assert valueCount >= 0; + this.valueCount = valueCount; + while (valueCount > getValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + fillHoles(valueCount); + lastSet = valueCount - 1; + setReaderAndWriterIndex(); + } + + /** + * Create holes in the vector up to the given index (exclusive). + * Holes will be created from the current last set position in + * the vector. + * + * @param index target index + */ + public void fillEmpties(int index) { + handleSafe(index, emptyByteArray.length); + fillHoles(index); + lastSet = index - 1; + } + + /** + * Set the index of last non-null element in the vector. + * It is important to call this method with appropriate value + * before calling {@link #setValueCount(int)}. + * + * @param value desired index of last non-null element.
+ */ + public void setLastSet(int value) { + lastSet = value; + } + + /** + * Get the index of last non-null element in the vector. + * + * @return index of the last non-null element + */ + public int getLastSet() { + return lastSet; + } + + /** + * Read, as a single long, the bytes of the offset buffer that begin at the + * offset entry of the given element. + * NOTE(review): presumably this packs the start offset of the element together + * with the following (end) offset; confirm against OFFSET_WIDTH and the callers. + * + * @param index position of the element in the vector + * @return the long read from the offset buffer at the element's offset entry + */ + public long getStartEnd(int index) { + return offsetBuffer.getLong((long) index * OFFSET_WIDTH); + } + + /** + * Mark the particular position in the vector as non-null. + * + * @param index position of the element. + */ + @Override + public void setIndexDefined(int index) { + // We need to check and realloc both validity and offset buffer + while (index >= getValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + BitVectorHelper.setBit(validityBuffer, index); + } + + /** + * Sets the value length for an element. Buffers are grown as needed and the + * end offset of the element is written; the validity bit is not touched. + * + * @param index position of the element to set + * @param length length of the element + */ + public void setValueLengthSafe(int index, int length) { + assert index >= 0; + handleSafe(index, length); + fillHoles(index); + final int startOffset = getStartOffset(index); + /* compute the buffer position as a long so that large indexes cannot overflow int */ + offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + lastSet = index; + } + + /** + * Get the length in bytes of the variable length element at the specified index. + * + * @param index position of element to get + * @return length of the element if it is non-null, 0 if it is null + */ + public int getValueLength(int index) { + assert index >= 0; + if (isSet(index) == 0) { + return 0; + } + final int startOffset = getStartOffset(index); + /* the end offset position is computed as a long to avoid int overflow */ + final int dataLength = + offsetBuffer.getInt((long) (index + 1) * OFFSET_WIDTH) - startOffset; + return dataLength; + } + + /** + * Set the variable length element at the specified index to the supplied + * byte array.
This is same as using {@link #set(int, byte[], int, int)} + * with start as 0 and length as value.length + * + * @param index position of the element to set + * @param value array of bytes to write + */ + public void set(int index, byte[] value) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, 0, value.length); + lastSet = index; + } + + /** + * Same as {@link #set(int, byte[])} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param value array of bytes to write + */ + public void setSafe(int index, byte[] value) { + assert index >= 0; + handleSafe(index, value.length); + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, 0, value.length); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the supplied + * byte array. + * + * @param index position of the element to set + * @param value array of bytes to write + * @param start start index in array of bytes + * @param length length of data in array of bytes + */ + public void set(int index, byte[] value, int start, int length) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, start, length); + lastSet = index; + } + + /** + * Same as {@link #set(int, byte[], int, int)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. 
+ * + * @param index position of the element to set + * @param value array of bytes to write + * @param start start index in array of bytes + * @param length length of data in array of bytes + */ + public void setSafe(int index, byte[] value, int start, int length) { + assert index >= 0; + handleSafe(index, length); + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, start, length); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the + * content in supplied ByteBuffer. The element is marked as non-null. + * + * @param index position of the element to set + * @param value ByteBuffer with data + * @param start start index in ByteBuffer + * @param length length of data in ByteBuffer + */ + public void set(int index, ByteBuffer value, int start, int length) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final int startOffset = getStartOffset(index); + /* position of the end offset, computed as a long so large indexes cannot overflow int */ + offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + valueBuffer.setBytes(startOffset, value, start, length); + lastSet = index; + } + + /** + * Same as {@link #set(int, ByteBuffer, int, int)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param value ByteBuffer with data + * @param start start index in ByteBuffer + * @param length length of data in ByteBuffer + */ + public void setSafe(int index, ByteBuffer value, int start, int length) { + assert index >= 0; + handleSafe(index, length); + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final int startOffset = getStartOffset(index); + /* position of the end offset, computed as a long so large indexes cannot overflow int */ + offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + valueBuffer.setBytes(startOffset, value, start, length); + lastSet = index; + } + + /** + * Set the element at the given index to null.
+ * + * @param index position of element + */ + public void setNull(int index) { + // We need to check and realloc both validity and offset buffer + while (index >= getValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + BitVectorHelper.unsetBit(validityBuffer, index); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. The bytes in the range [start, end) of the + * supplied buffer are copied into the value buffer. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param start start position of data in buffer + * @param end end position of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void set(int index, int isSet, int start, int end, ArrowBuf buffer) { + assert index >= 0; + final int dataLength = end - start; + fillHoles(index); + BitVectorHelper.setValidityBit(validityBuffer, index, isSet); + final int startOffset = offsetBuffer.getInt((long) index * OFFSET_WIDTH); + /* position of the end offset, computed as a long so large indexes cannot overflow int */ + offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, buffer, start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, int, int, int, ArrowBuf)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector.
+ * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param start start position of data in buffer + * @param end end position of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void setSafe(int index, int isSet, int start, int end, ArrowBuf buffer) { + assert index >= 0; + final int dataLength = end - start; + handleSafe(index, dataLength); + fillHoles(index); + BitVectorHelper.setValidityBit(validityBuffer, index, isSet); + final int startOffset = offsetBuffer.getInt((long) index * OFFSET_WIDTH); + offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, buffer, start, dataLength); + lastSet = index; + } + + /** + * Store the given value at a particular position in the vector and mark the + * position as non-null. + * @param index position of the new value + * @param start start position of data in buffer + * @param length length of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void set(int index, int start, int length, ArrowBuf buffer) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final int startOffset = offsetBuffer.getInt((long) index * OFFSET_WIDTH); + offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + final ArrowBuf bb = buffer.slice(start, length); + valueBuffer.setBytes(startOffset, bb); + lastSet = index; + } + + /** + * Same as {@link #set(int, int, int, ArrowBuf)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector.
+ * @param index position of the new value + * @param start start position of data in buffer + * @param length length of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void setSafe(int index, int start, int length, ArrowBuf buffer) { + assert index >= 0; + handleSafe(index, length); + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final int startOffset = offsetBuffer.getInt((long) index * OFFSET_WIDTH); + offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + final ArrowBuf bb = buffer.slice(start, length); + valueBuffer.setBytes(startOffset, bb); + lastSet = index; + } + + + /*----------------------------------------------------------------* + | | + | helper methods for setters | + | | + *----------------------------------------------------------------*/ + + + protected final void fillHoles(int index) { + for (int i = lastSet + 1; i < index; i++) { + setBytes(i, emptyByteArray, 0, emptyByteArray.length); + } + lastSet = index - 1; + } + + protected final void setBytes(int index, byte[] value, int start, int length) { + /* end offset of current last element in the vector. this will + * be the start offset of new element we are trying to store. + */ + final int startOffset = getStartOffset(index); + /* set new end offset */ + offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + /* store the var length data in value buffer */ + valueBuffer.setBytes(startOffset, value, start, length); + } + + public final int getStartOffset(int index) { + return offsetBuffer.getInt((long) index * OFFSET_WIDTH); + } + + protected final void handleSafe(int index, int dataLength) { + /* + * IMPORTANT: + * value buffer for variable length vectors moves independent + * of the companion validity and offset buffers. This is in + * contrast to what we have for fixed width vectors. 
+ * + * Here there is no concept of getValueCapacity() in the + * data stream. getValueCapacity() is applicable only to validity + * and offset buffers. + * + * So even though we may have setup an initial capacity of 1024 + * elements in the vector, it is quite possible + * that we need to reAlloc() the data buffer when we are setting + * the 5th element in the vector simply because previous + * variable length elements have exhausted the buffer capacity. + * However, we really don't need to reAlloc() validity and + * offset buffers until we try to set the 1025th element. + * This is why we do a separate check for safe methods to + * determine which buffer needs reallocation. + */ + while (index >= getValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + final int startOffset = lastSet < 0 ? 0 : getStartOffset(lastSet + 1); + /* compute the required capacity as a long: startOffset + dataLength can exceed + * Integer.MAX_VALUE, and an overflowed int sum would compare as small and + * skip the reallocation of the data buffer */ + while (valueBuffer.capacity() < ((long) startOffset + dataLength)) { + reallocDataBuffer(); + } + } + + /** + * Method used by Json Writer to read a variable width element from + * the variable width vector and write to Json. + * + *

This method should not be used externally. + * + * @param data buffer storing the variable width vector elements + * @param offset buffer storing the offsets of variable width vector elements + * @param index position of the element in the vector + * @return array of bytes + */ + public static byte[] get(final ArrowBuf data, final ArrowBuf offset, int index) { + final int currentStartOffset = offset.getInt((long) index * OFFSET_WIDTH); + final int dataLength = + offset.getInt((long) (index + 1) * OFFSET_WIDTH) - currentStartOffset; + final byte[] result = new byte[dataLength]; + data.getBytes(currentStartOffset, result, 0, dataLength); + return result; + } + + /** + * Method used by Json Reader to explicitly set the offsets of the variable + * width vector data. The method takes care of allocating the memory for + * offsets if the caller hasn't done so. + * + *

This method should not be used externally. + * + * @param buffer ArrowBuf to store offsets for variable width elements + * @param allocator memory allocator + * @param valueCount number of elements + * @param index position of the element + * @param value offset of the element + * @return buffer holding the offsets + */ + public static ArrowBuf set(ArrowBuf buffer, BufferAllocator allocator, + int valueCount, int index, int value) { + if (buffer == null) { + buffer = allocator.buffer((long) valueCount * OFFSET_WIDTH); + } + buffer.setInt((long) index * OFFSET_WIDTH, value); + if (index == (valueCount - 1)) { + buffer.writerIndex((long) valueCount * OFFSET_WIDTH); + } + + return buffer; + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector. + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + @Override + public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { + Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); + if (from.isNull(fromIndex)) { + fillHoles(thisIndex); + BitVectorHelper.unsetBit(this.validityBuffer, thisIndex); + final int copyStart = offsetBuffer.getInt((long) thisIndex * OFFSET_WIDTH); + offsetBuffer.setInt((long) (thisIndex + 1) * OFFSET_WIDTH, copyStart); + } else { + final int start = from.getOffsetBuffer().getInt((long) fromIndex * OFFSET_WIDTH); + final int end = from.getOffsetBuffer().getInt((long) (fromIndex + 1) * OFFSET_WIDTH); + final int length = end - start; + fillHoles(thisIndex); + BitVectorHelper.setBit(this.validityBuffer, thisIndex); + final int copyStart = offsetBuffer.getInt((long) thisIndex * OFFSET_WIDTH); + from.getDataBuffer().getBytes(start, this.valueBuffer, copyStart, length); + offsetBuffer.setInt((long) (thisIndex + 1) * OFFSET_WIDTH, copyStart + length); + } + lastSet = thisIndex; + } + + /** + * Same as {@link 
#copyFrom(int, int, ValueVector)} except that
+   * it handles the case when the capacity of the vector needs to be expanded
+   * before copy.
+   *
+   * @param fromIndex position to copy from in source vector
+   * @param thisIndex position to copy to in this vector
+   * @param from source vector
+   */
+  @Override
+  public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) {
+    Preconditions.checkArgument(this.getMinorType() == from.getMinorType());
+    if (from.isNull(fromIndex)) {
+      handleSafe(thisIndex, 0);
+      fillHoles(thisIndex);
+      BitVectorHelper.unsetBit(this.validityBuffer, thisIndex);
+      // widen to long before multiplying, like every other offset computation
+      // in this class, so that a large index cannot overflow int arithmetic
+      final int copyStart = offsetBuffer.getInt((long) thisIndex * OFFSET_WIDTH);
+      // a null element is zero-length: its end offset equals its start offset
+      offsetBuffer.setInt((long) (thisIndex + 1) * OFFSET_WIDTH, copyStart);
+    } else {
+      final int start = from.getOffsetBuffer().getInt((long) fromIndex * OFFSET_WIDTH);
+      final int end = from.getOffsetBuffer().getInt((long) (fromIndex + 1) * OFFSET_WIDTH);
+      final int length = end - start;
+      handleSafe(thisIndex, length);
+      fillHoles(thisIndex);
+      BitVectorHelper.setBit(this.validityBuffer, thisIndex);
+      final int copyStart = offsetBuffer.getInt((long) thisIndex * OFFSET_WIDTH);
+      from.getDataBuffer().getBytes(start, this.valueBuffer, copyStart, length);
+      offsetBuffer.setInt((long) (thisIndex + 1) * OFFSET_WIDTH, copyStart + length);
+    }
+    lastSet = thisIndex;
+  }
+
+  /**
+   * Get a pointer to the data of the element at the given index, using a
+   * newly created {@link ArrowBufPointer}.
+   *
+   * @param index position of the element
+   * @return pointer describing the element, see {@link #getDataPointer(int, ArrowBufPointer)}
+   */
+  @Override
+  public ArrowBufPointer getDataPointer(int index) {
+    return getDataPointer(index, new ArrowBufPointer());
+  }
+
+  /**
+   * Point the supplied {@link ArrowBufPointer} at the data of the element at
+   * the given index. For a null element the pointer is set to a null buffer
+   * with offset 0 and length 0.
+   *
+   * @param index position of the element
+   * @param reuse pointer object to populate
+   * @return the same pointer object that was passed in
+   */
+  @Override
+  public ArrowBufPointer getDataPointer(int index, ArrowBufPointer reuse) {
+    if (isNull(index)) {
+      reuse.set(null, 0, 0);
+    } else {
+      int offset = offsetBuffer.getInt((long) index * OFFSET_WIDTH);
+      int length = offsetBuffer.getInt((long) (index + 1) * OFFSET_WIDTH) - offset;
+      reuse.set(valueBuffer, offset, length);
+    }
+    return reuse;
+  }
+
+  /**
+   * Hash code of the element at the given index; delegates to
+   * {@link #hashCode(int, ArrowBufHasher)} with a null hasher.
+   *
+   * @param index position of the element
+   * @return hash code of the element
+   */
+  @Override
+  public int hashCode(int index) {
+    return hashCode(index, null);
+  }
+
+  /**
+   * Hash code of the element at the given index, computed over the bytes
+   * between the element's start offset and the next element's start offset.
+   *
+   * @param index position of the element
+   * @param hasher hasher handed to {@link ByteFunctionHelpers#hash}
+   * @return {@link ArrowBufPointer#NULL_HASH_CODE} for a null element,
+   *         otherwise the hash of the element's bytes
+   */
+  @Override
+  public int hashCode(int index, ArrowBufHasher hasher) {
+    if (isNull(index)) {
+      return ArrowBufPointer.NULL_HASH_CODE;
+    }
+    final int start = getStartOffset(index);
+    final int end = getStartOffset(index + 1);
+    return ByteFunctionHelpers.hash(hasher, this.getDataBuffer(), start, end);
+  }
+
+  /**
+   * Accept a visitor by handing this vector and the given value to
+   * {@code visitor.visit}.
+   *
+   * @param visitor the visitor to dispatch to
+   * @param value the value passed through to the visitor
+   * @return whatever the visitor returns
+   */
+  @Override
+  public <OUT, IN> OUT accept(VectorVisitor<OUT, IN> visitor, IN value) {
+    return visitor.visit(this, value);
+  }
+
+  /**
+   * Gets the ending offset of a record, given its index.
+   *
+   * @param index position of the record
+   * @return the value stored in offset slot index + 1
+   */
+  public final int getEndOffset(int index) {
+    return offsetBuffer.getInt((long) (index + 1) * OFFSET_WIDTH);
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BigIntVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BigIntVector.java
new file mode 100644
index 000000000..c19955b54
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BigIntVector.java
@@ -0,0 +1,358 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;
+
+import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.complex.impl.BigIntReaderImpl;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.holders.BigIntHolder;
+import org.apache.arrow.vector.holders.NullableBigIntHolder;
+import org.apache.arrow.vector.types.Types.MinorType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.util.TransferPair;
+
+/**
+ * BigIntVector implements a fixed width vector (8 bytes) of
+ * integer values which could be null. A validity buffer (bit vector) is
+ * maintained to track which elements in the vector are null.
+ */
+public final class BigIntVector extends BaseFixedWidthVector implements BaseIntVector {
+  /** Width of one element in bytes; element i lives at byte offset i * TYPE_WIDTH. */
+  public static final byte TYPE_WIDTH = 8;
+  /** Reader over this vector, created once in the constructor and returned by getReader(). */
+  private final FieldReader reader;
+
+  /**
+   * Instantiate a BigIntVector with a nullable BIGINT field type. This doesn't
+   * allocate any memory for the data in vector.
+   *
+   * @param name name of the vector
+   * @param allocator allocator for memory management.
+   */
+  public BigIntVector(String name, BufferAllocator allocator) {
+    this(name, FieldType.nullable(MinorType.BIGINT.getType()), allocator);
+  }
+
+  /**
+   * Instantiate a BigIntVector from a name and a field type; the field is
+   * created without children. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param name name of the vector
+   * @param fieldType type of Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public BigIntVector(String name, FieldType fieldType, BufferAllocator allocator) {
+    this(new Field(name, fieldType, null), allocator);
+  }
+
+  /**
+   * Instantiate a BigIntVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param field field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public BigIntVector(Field field, BufferAllocator allocator) {
+    super(field, allocator, TYPE_WIDTH);
+    this.reader = new BigIntReaderImpl(this);
+  }
+
+  /**
+   * Get the reader that was created for this vector at construction time.
+   *
+   * @return Field Reader for this vector
+   */
+  @Override
+  public FieldReader getReader() {
+    return this.reader;
+  }
+
+  /**
+   * Get the minor type of the values held by this vector.
+   *
+   * @return {@link org.apache.arrow.vector.types.Types.MinorType#BIGINT}
+   */
+  @Override
+  public MinorType getMinorType() {
+    return MinorType.BIGINT;
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value retrieval methods                        |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+  /**
+   * Get the element at the given index from the vector.
+   *
+   * @param index   position of element
+   * @return element at given index
+   * @throws IllegalStateException if null checking is enabled and the
+   *         element at the given index is null
+   */
+  public long get(int index) throws IllegalStateException {
+    if (NULL_CHECKING_ENABLED) {
+      if (isSet(index) == 0) {
+        throw new IllegalStateException("Value at index is null");
+      }
+    }
+    final long byteOffset = (long) index * TYPE_WIDTH;
+    return valueBuffer.getLong(byteOffset);
+  }
+
+  /**
+   * Read the element at the given index into the holder. For a null element
+   * only holder.isSet is written (as zero); otherwise holder.isSet is one and
+   * holder.value carries the element.
+   *
+   * @param index   position of element
+   * @param holder  holder that receives the state and the value
+   */
+  public void get(int index, NullableBigIntHolder holder) {
+    if (isSet(index) != 0) {
+      holder.isSet = 1;
+      holder.value = valueBuffer.getLong((long) index * TYPE_WIDTH);
+    } else {
+      holder.isSet = 0;
+    }
+  }
+
+  /**
+   * Get the element at the given index as a boxed Long, or null if the element is null.
+   *
+   * @param index   position of element
+   * @return element at given index, or null if it is not set
+   */
+  public Long getObject(int index) {
+    if (isSet(index) == 0) {
+      return null;
+    }
+    return Long.valueOf(valueBuffer.getLong((long) index * TYPE_WIDTH));
+  }
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value setter methods                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /* writes the raw value only; the validity bit is the caller's business */
+  private void setValue(int index, long value) {
+    final long byteOffset = (long) index * TYPE_WIDTH;
+    valueBuffer.setLong(byteOffset, value);
+  }
+
+  /**
+   * Mark the element at the given index as set and store the given value.
+   *
+   * @param index   position of element
+   * @param value   value of element
+   */
+  public void set(int index, long value) {
+    BitVectorHelper.setBit(validityBuffer, index);
+    setValue(index, value);
+  }
+
+  /**
+   * Set the element at the given index to the value set in data holder.
+   * If the value in holder is not indicated as set, the element
+   * at the given index will be null.
+   *
+   * @param index   position of element
+   * @param holder  nullable data holder for value of element
+   * @throws IllegalArgumentException if holder.isSet is negative
+   */
+  public void set(int index, NullableBigIntHolder holder) throws IllegalArgumentException {
+    final int state = holder.isSet;
+    if (state < 0) {
+      throw new IllegalArgumentException();
+    }
+    if (state == 0) {
+      BitVectorHelper.unsetBit(validityBuffer, index);
+      return;
+    }
+    // same steps as set(int, long): mark valid, then store the value
+    set(index, holder.value);
+  }
+
+  /**
+   * Mark the element at the given index as set and store the holder's value.
+   *
+   * @param index   position of element
+   * @param holder  data holder for value of element
+   */
+  public void set(int index, BigIntHolder holder) {
+    set(index, holder.value);
+  }
+
+  /**
+   * Same as {@link #set(int, long)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index   position of element
+   * @param value   value of element
+   */
+  public void setSafe(int index, long value) {
+    handleSafe(index);
+    this.set(index, value);
+  }
+
+  /**
+   * Capacity-checking variant of {@link #set(int, NullableBigIntHolder)}: it
+   * handles the case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index   position of element
+   * @param holder  nullable data holder for value of element
+   * @throws IllegalArgumentException if holder.isSet is negative
+   */
+  public void setSafe(int index, NullableBigIntHolder holder) throws IllegalArgumentException {
+    handleSafe(index);
+    this.set(index, holder);
+  }
+
+  /**
+   * Capacity-checking variant of {@link #set(int, BigIntHolder)}: it
+   * handles the case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index   position of element
+   * @param holder  data holder for value of element
+   */
+  public void setSafe(int index, BigIntHolder holder) {
+    handleSafe(index);
+    this.set(index, holder);
+  }
+
+  /**
+   * Store the given value at a particular position in the vector, or mark
+   * that position as null, depending on isSet.
+   *
+   * @param index position of the new value
+   * @param isSet greater than 0 to store the value; 0 or less marks the element as null
+   * @param value element value
+   */
+  public void set(int index, int isSet, long value) {
+    if (isSet <= 0) {
+      BitVectorHelper.unsetBit(validityBuffer, index);
+      return;
+    }
+    set(index, value);
+  }
+
+  /**
+   * Same as {@link #set(int, int, long)} except that it handles the case
+   * when index is greater than or equal to current value capacity of the
+   * vector.
+   *
+   * @param index position of the new value
+   * @param isSet greater than 0 to store the value; 0 or less marks the element as null
+   * @param value element value
+   */
+  public void setSafe(int index, int isSet, long value) {
+    handleSafe(index);
+    this.set(index, isSet, value);
+  }
+
+  /**
+   * Given a data buffer, get the value stored at a particular position
+   * in the vector.
+   *
+   *

This method should not be used externally.
+   *
+   * @param buffer data buffer
+   * @param index position of the element.
+   * @return the long read at byte offset index * TYPE_WIDTH of the buffer
+   */
+  public static long get(final ArrowBuf buffer, final int index) {
+    final long byteOffset = (long) index * TYPE_WIDTH;
+    return buffer.getLong(byteOffset);
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |                      vector transfer                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Build a TransferPair whose target is a new BigIntVector with the given
+   * name, this vector's field type and the given allocator.
+   *
+   * @param ref name of the target vector
+   * @param allocator allocator for the target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
+    return new TransferImpl(ref, allocator);
+  }
+
+  /**
+   * Build a TransferPair around an existing target vector, which is cast
+   * to BigIntVector.
+   *
+   * @param to target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair makeTransferPair(ValueVector to) {
+    final BigIntVector target = (BigIntVector) to;
+    return new TransferImpl(target);
+  }
+
+  /** Stores the value via {@link #setSafe(int, long)}; a long always fits, nothing is truncated. */
+  @Override
+  public void setWithPossibleTruncate(int index, long value) {
+    setSafe(index, value);
+  }
+
+  /** Stores the value via {@link #set(int, long)}, i.e. without the capacity handling. */
+  @Override
+  public void setUnsafeWithPossibleTruncate(int index, long value) {
+    set(index, value);
+  }
+
+  /** Returns the element at the given index, exactly as {@link #get(int)} does. */
+  @Override
+  public long getValueAsLong(int index) {
+    return get(index);
+  }
+
+  /* TransferPair from the enclosing vector (the source) to the vector held in "to". */
+  private class TransferImpl implements TransferPair {
+    BigIntVector to;
+
+    public TransferImpl(String ref, BufferAllocator allocator) {
+      this.to = new BigIntVector(ref, field.getFieldType(), allocator);
+    }
+
+    public TransferImpl(BigIntVector to) {
+      this.to = to;
+    }
+
+    @Override
+    public BigIntVector getTo() {
+      return this.to;
+    }
+
+    @Override
+    public void transfer() {
+      transferTo(this.to);
+    }
+
+    @Override
+    public void splitAndTransfer(int startIndex, int length) {
+      splitAndTransferTo(startIndex, length, this.to);
+    }
+
+    @Override
+    public void copyValueSafe(int fromIndex, int toIndex) {
+      this.to.copyFromSafe(fromIndex, toIndex, BigIntVector.this);
+    }
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
new file mode 100644
index 000000000..3bcfd983e
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
@@ -0,0 +1,599 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt;
+import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;
+
+import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.util.ArrowBufPointer;
+import org.apache.arrow.memory.util.hash.ArrowBufHasher;
+import org.apache.arrow.util.Preconditions;
+import org.apache.arrow.vector.complex.impl.BitReaderImpl;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.holders.BitHolder;
+import org.apache.arrow.vector.holders.NullableBitHolder;
+import org.apache.arrow.vector.types.Types.MinorType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.util.OversizedAllocationException;
+import org.apache.arrow.vector.util.TransferPair;
+
+/**
+ * BitVector implements a fixed width (1 bit) vector of
+ * boolean values which could be null. Each value in the vector corresponds
+ * to a single bit in the underlying data stream backing the vector.
+ */
+public final class BitVector extends BaseFixedWidthVector {
+
+  /** Value returned by hashCode(int) for a non-null element whose bit is 0. */
+  private static final int HASH_CODE_FOR_ZERO = 17;
+
+  /** Value returned by hashCode(int) for a non-null element whose bit is 1. */
+  private static final int HASH_CODE_FOR_ONE = 19;
+
+  /** Reader over this vector, created once in the constructor and returned by getReader(). */
+  private final FieldReader reader;
+
+  /**
+   * Instantiate a BitVector with a nullable BIT field type. This doesn't
+   * allocate any memory for the data in vector.
+   *
+   * @param name name of the vector
+   * @param allocator allocator for memory management.
+   */
+  public BitVector(String name, BufferAllocator allocator) {
+    this(name, FieldType.nullable(MinorType.BIT.getType()), allocator);
+  }
+
+  /**
+   * Instantiate a BitVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param name name of the vector
+   * @param fieldType type of Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public BitVector(String name, FieldType fieldType, BufferAllocator allocator) {
+    this(new Field(name, fieldType, null), allocator);
+  }
+
+  /**
+   * Instantiate a BitVector for the given field. The type width handed to the
+   * superclass is 0. This doesn't allocate any memory for the data in vector.
+   *
+   * @param field the Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public BitVector(Field field, BufferAllocator allocator) {
+    super(field, allocator, 0);
+    this.reader = new BitReaderImpl(this);
+  }
+
+  /**
+   * Get the reader that was created for this vector at construction time.
+   *
+   * @return Field Reader for this vector
+   */
+  @Override
+  public FieldReader getReader() {
+    return this.reader;
+  }
+
+  /**
+   * Get the minor type of the values held by this vector.
+   *
+   * @return {@link org.apache.arrow.vector.types.Types.MinorType#BIT}
+   */
+  @Override
+  public MinorType getMinorType() {
+    return MinorType.BIT;
+  }
+
+  /**
+   * Record the desired value capacity for the vector. Nothing is allocated
+   * here; the request is only checked against the maximum allocation size.
+   *
+   * @param valueCount desired number of elements in the vector
+   * @throws OversizedAllocationException if twice the validity buffer size for
+   *         valueCount elements exceeds MAX_ALLOCATION_SIZE
+   */
+  @Override
+  public void setInitialCapacity(int valueCount) {
+    final int validityBytes = getValidityBufferSizeFromCount(valueCount);
+    // same factor of two that getBufferSizeFor uses for the total buffer size
+    final boolean oversized = validityBytes * 2 > MAX_ALLOCATION_SIZE;
+    if (oversized) {
+      throw new OversizedAllocationException("Requested amount of memory is more than max allowed");
+    }
+    lastValueCapacity = valueCount;
+  }
+
+  /**
+   * Number of values the value buffer can hold: eight per byte of capacity,
+   * capped at the maximum int.
+   */
+  @Override
+  protected int getValueBufferValueCapacity() {
+    final long bitCapacity = valueBuffer.capacity() * 8;
+    return capAtMaxInt(bitCapacity);
+  }
+
+  /**
+   * Get the potential buffer size for a particular number of records.
+   *
+   * @param count desired number of elements in the vector
+   * @return estimated size of underlying buffers if the vector holds
+   *         a given number of elements: 0 for an empty vector, otherwise
+   *         twice the validity buffer size for that count
+   */
+  @Override
+  public int getBufferSizeFor(final int count) {
+    if (count == 0) {
+      return 0;
+    }
+    return 2 * getValidityBufferSizeFromCount(count);
+  }
+
+  /**
+   * Get the size (number of bytes) of underlying buffers used by this vector,
+   * computed from the current value count.
+   *
+   * @return size of underlying buffers.
+   */
+  @Override
+  public int getBufferSize() {
+    return getBufferSizeFor(valueCount);
+  }
+
+  /**
+   * Slice this vector at desired index and length and transfer the
+   * corresponding data to the target vector. The target is cleared first;
+   * afterwards its value count is set to length.
+   *
+   * @param startIndex start position of the split in source vector.
+   * @param length length of the split.
+   * @param target destination vector
+   */
+  public void splitAndTransferTo(int startIndex, int length, BaseFixedWidthVector target) {
+    Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount,
+        "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount);
+    compareTypes(target, "splitAndTransferTo");
+    target.clear();
+    // both buffers are bit-packed, so the same routine serves validity and values
+    target.validityBuffer = splitAndTransferBuffer(startIndex, length, target,
+            validityBuffer, target.validityBuffer);
+    target.valueBuffer = splitAndTransferBuffer(startIndex, length, target,
+            valueBuffer, target.valueBuffer);
+    target.refreshValueCapacity();
+
+    target.setValueCount(length);
+  }
+
+  /*
+   * Produces the buffer holding bits [startIndex, startIndex + length) of
+   * sourceBuffer. When startIndex is byte aligned the result is a retained
+   * slice of the source; otherwise a new buffer is allocated and filled by
+   * stitching each target byte together from two neighbouring source bytes.
+   * When length is 0 destBuffer is returned as it was passed in.
+   */
+  private ArrowBuf splitAndTransferBuffer(
+      int startIndex,
+      int length,
+      BaseFixedWidthVector target,
+      ArrowBuf sourceBuffer,
+      ArrowBuf destBuffer) {
+    int firstByteSource = BitVectorHelper.byteIndex(startIndex);
+    int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1);
+    int byteSizeTarget = getValidityBufferSizeFromCount(length);
+    // position of the first requested bit inside its byte
+    int offset = startIndex % 8;
+
+    if (length > 0) {
+      if (offset == 0) {
+        /* slice */
+        if (destBuffer != null) {
+          destBuffer.getReferenceManager().release();
+        }
+        destBuffer = sourceBuffer.slice(firstByteSource, byteSizeTarget);
+        destBuffer.getReferenceManager().retain(1);
+      } else {
+        /* Copy data
+         * When the first bit starts from the middle of a byte (offset != 0),
+         * copy data from src BitVector.
+         * Each byte in the target is composed by a part in i-th byte,
+         * another part in (i+1)-th byte.
+         */
+        destBuffer = allocator.buffer(byteSizeTarget);
+        destBuffer.readerIndex(0);
+        destBuffer.setZero(0, destBuffer.capacity());
+
+        // every target byte except the last one
+        for (int i = 0; i < byteSizeTarget - 1; i++) {
+          byte b1 = BitVectorHelper.getBitsFromCurrentByte(sourceBuffer, firstByteSource + i, offset);
+          byte b2 = BitVectorHelper.getBitsFromNextByte(sourceBuffer, firstByteSource + i + 1, offset);
+
+          destBuffer.setByte(i, (b1 + b2));
+        }
+
+        /* Copying the last piece is done in the following manner:
+         * if the source vector has 1 or more bytes remaining, we copy
+         * the last piece as a byte formed by shifting data
+         * from the current byte and the next byte.
+         *
+         * if the source vector has no more bytes remaining
+         * (we are at the last byte), we copy the last piece as a byte
+         * by shifting data from the current byte.
+         */
+        if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) {
+          byte b1 = BitVectorHelper.getBitsFromCurrentByte(sourceBuffer,
+                  firstByteSource + byteSizeTarget - 1, offset);
+          byte b2 = BitVectorHelper.getBitsFromNextByte(sourceBuffer,
+                  firstByteSource + byteSizeTarget, offset);
+
+          destBuffer.setByte(byteSizeTarget - 1, b1 + b2);
+        } else {
+          byte b1 = BitVectorHelper.getBitsFromCurrentByte(sourceBuffer,
+                  firstByteSource + byteSizeTarget - 1, offset);
+          destBuffer.setByte(byteSizeTarget - 1, b1);
+        }
+      }
+    }
+
+    return destBuffer;
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value retrieval methods                        |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+  /* reads bit "index" of the value buffer: byte index >> 3, bit index & 7 within it */
+  private int getBit(int index) {
+    final int byteIndex = index >> 3;
+    final byte b = valueBuffer.getByte(byteIndex);
+    final int bitIndex = index & 7;
+    return (b >> bitIndex) & 0x01;
+  }
+
+  /**
+   * Get the element at the given index from the vector.
+   *
+   * @param index   position of element
+   * @return element at given index, as 0 or 1
+   * @throws IllegalStateException if null checking is enabled and the
+   *         element at the given index is null
+   */
+  public int get(int index) throws IllegalStateException {
+    if (NULL_CHECKING_ENABLED && isSet(index) == 0) {
+      throw new IllegalStateException("Value at index is null");
+    }
+    return getBit(index);
+  }
+
+  /**
+   * Get the element at the given index from the vector and
+   * sets the state in holder. If element at given index
+   * is null, holder.isSet will be zero and holder.value is left untouched.
+   *
+   * @param index   position of element
+   * @param holder  holder that receives the state and the value
+   */
+  public void get(int index, NullableBitHolder holder) {
+    if (isSet(index) == 0) {
+      holder.isSet = 0;
+      return;
+    }
+    holder.isSet = 1;
+    holder.value = getBit(index);
+  }
+
+  /**
+   * Get the element at the given index as a Boolean, or null if the element is null.
+   *
+   * @param index   position of element
+   * @return element at given index, or null if it is not set
+   */
+  public Boolean getObject(int index) {
+    if (isSet(index) == 0) {
+      return null;
+    } else {
+      // Boolean.valueOf hands back the shared TRUE/FALSE instances; the
+      // Boolean(boolean) constructor is deprecated and allocates on every call
+      return Boolean.valueOf(getBit(index) != 0);
+    }
+  }
+
+  /**
+   * Copy a cell value from a particular index in source vector to a particular
+   * position in this vector. Both vectors must have the same minor type. When
+   * the source element is null only the validity bit of the target is cleared;
+   * the target's value bit is left as it was.
+   *
+   * @param fromIndex position to copy from in source vector
+   * @param thisIndex position to copy to in this vector
+   * @param from source vector
+   */
+  @Override
+  public void copyFrom(int fromIndex, int thisIndex, ValueVector from) {
+    Preconditions.checkArgument(this.getMinorType() == from.getMinorType());
+    boolean fromIsSet = BitVectorHelper.get(from.getValidityBuffer(), fromIndex) != 0;
+    if (fromIsSet) {
+      BitVectorHelper.setBit(validityBuffer, thisIndex);
+      // setValidityBit writes an arbitrary 0/1 into a bit-packed buffer,
+      // which is exactly what the value buffer of a BitVector needs
+      BitVectorHelper.setValidityBit(valueBuffer, thisIndex, ((BitVector) from).getBit(fromIndex));
+    } else {
+      BitVectorHelper.unsetBit(validityBuffer, thisIndex);
+    }
+  }
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value setter methods                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Set the element at the given index to the given value. The element is
+   * marked as set; any non-zero value is stored as a 1 bit.
+   *
+   * @param index   position of element
+   * @param value   value of element
+   */
+  public void set(int index, int value) {
+    BitVectorHelper.setBit(validityBuffer, index);
+    if (value != 0) {
+      BitVectorHelper.setBit(valueBuffer, index);
+    } else {
+      BitVectorHelper.unsetBit(valueBuffer, index);
+    }
+  }
+
+  /**
+   * Set the element at the given index to the value set in data holder.
+   * If the value in holder is not indicated as set, the element
+   * at the given index will be null.
+   *
+   * @param index   position of element
+   * @param holder  nullable data holder for value of element
+   * @throws IllegalArgumentException if holder.isSet is negative
+   */
+  public void set(int index, NullableBitHolder holder) throws IllegalArgumentException {
+    final int state = holder.isSet;
+    if (state < 0) {
+      throw new IllegalArgumentException();
+    }
+    if (state == 0) {
+      BitVectorHelper.unsetBit(validityBuffer, index);
+      return;
+    }
+    // same steps as set(int, int): mark valid, then store the bit
+    set(index, holder.value);
+  }
+
+  /**
+   * Mark the element at the given index as set and store the holder's value;
+   * any non-zero value is stored as a 1 bit.
+   *
+   * @param index   position of element
+   * @param holder  data holder for value of element
+   */
+  public void set(int index, BitHolder holder) {
+    set(index, holder.value);
+  }
+
+  /**
+   * Capacity-checking variant of {@link #set(int, int)}: it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index   position of element
+   * @param value   value of element
+   */
+  public void setSafe(int index, int value) {
+    handleSafe(index);
+    this.set(index, value);
+  }
+
+  /**
+   * Capacity-checking variant of {@link #set(int, NullableBitHolder)}: it
+   * handles the case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index   position of element
+   * @param holder  nullable data holder for value of element
+   * @throws IllegalArgumentException if holder.isSet is negative
+   */
+  public void setSafe(int index, NullableBitHolder holder) throws IllegalArgumentException {
+    handleSafe(index);
+    this.set(index, holder);
+  }
+
+  /**
+   * Same as {@link #set(int, BitHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index   position of element
+   * @param holder  data holder for value of element
+   */
+  public void setSafe(int index, BitHolder holder) {
+    handleSafe(index);
+    this.set(index, holder);
+  }
+
+  /**
+   * Store the given value at a particular position in the vector, or mark
+   * that position as null, depending on isSet.
+   *
+   * @param index position of the new value
+   * @param isSet greater than 0 to store the value; 0 or less marks the element as null
+   * @param value element value
+   */
+  public void set(int index, int isSet, int value) {
+    if (isSet <= 0) {
+      BitVectorHelper.unsetBit(validityBuffer, index);
+      return;
+    }
+    set(index, value);
+  }
+
+  /**
+   * Same as {@link #set(int, int, int)} except that it handles the case
+   * when index is greater than or equal to current value capacity of the
+   * vector.
+   *
+   * @param index position of the new value
+   * @param isSet greater than 0 to store the value; 0 or less marks the element as null
+   * @param value element value
+   */
+  public void setSafe(int index, int isSet, int value) {
+    handleSafe(index);
+    this.set(index, isSet, value);
+  }
+
+  /**
+   * Mark the element at the given index as set and store a 1 bit for it.
+   *
+   * @param index position of element
+   */
+  public void setToOne(int index) {
+    // set(int, int) with a non-zero value sets the validity bit, then the value bit
+    set(index, 1);
+  }
+
+  /**
+   * Same as {@link #setToOne(int)} except that it handles the case when
+   * index is greater than or equal to current value capacity of the vector.
+   *
+   * @param index position of the element
+   */
+  public void setSafeToOne(int index) {
+    handleSafe(index);
+    setToOne(index);
+  }
+
+  /**
+   * Not supported by BitVector.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public ArrowBufPointer getDataPointer(int index) {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * Not supported by BitVector.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public ArrowBufPointer getDataPointer(int index, ArrowBufPointer reuse) {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * Hash code of the element at the given index: one fixed constant for a
+   * null element, one for a 0 bit and one for a 1 bit.
+   *
+   * @param index position of the element
+   * @return hash code of the element
+   */
+  @Override
+  public int hashCode(int index) {
+    if (isNull(index)) {
+      return ArrowBufPointer.NULL_HASH_CODE;
+    } else {
+      if (get(index) == 0) {
+        return HASH_CODE_FOR_ZERO;
+      } else {
+        return HASH_CODE_FOR_ONE;
+      }
+    }
+  }
+
+  /**
+   * Same as {@link #hashCode(int)}; the hasher is ignored.
+   *
+   * @param index position of the element
+   * @param hasher unused
+   * @return hash code of the element
+   */
+  @Override
+  public int hashCode(int index, ArrowBufHasher hasher) {
+    return hashCode(index);
+  }
+
+  /**
+   * Set count bits to 1 in data starting at firstBitIndex. The same bits are
+   * set in the validity buffer, so the affected elements become non-null.
+   *
+   * @param firstBitIndex the index of the first bit to set
+   * @param count         the number of bits to set
+   */
+  public void setRangeToOne(int firstBitIndex, int count) {
+    int startByteIndex = BitVectorHelper.byteIndex(firstBitIndex);
+    // exclusive end of the range
+    final int lastBitIndex = firstBitIndex + count;
+    final int endByteIndex = BitVectorHelper.byteIndex(lastBitIndex);
+    final int startByteBitIndex = BitVectorHelper.bitIndex(firstBitIndex);
+    final int endBytebitIndex = BitVectorHelper.bitIndex(lastBitIndex);
+    if (count < 8 && startByteIndex == endByteIndex) {
+      // handles the case where we don't have a first and a last byte
+      byte bitMask = 0;
+      for (int i = startByteBitIndex; i < endBytebitIndex; ++i) {
+        bitMask |= (byte) (1L << i);
+      }
+      BitVectorHelper.setBitMaskedByte(validityBuffer, startByteIndex, bitMask);
+      BitVectorHelper.setBitMaskedByte(valueBuffer, startByteIndex, bitMask);
+    } else {
+      // fill in first byte (if it's not full)
+      if (startByteBitIndex != 0) {
+        // bits startByteBitIndex..7 of the first byte
+        final byte bitMask = (byte) (0xFFL << startByteBitIndex);
+        BitVectorHelper.setBitMaskedByte(validityBuffer, startByteIndex, bitMask);
+        BitVectorHelper.setBitMaskedByte(valueBuffer, startByteIndex, bitMask);
+        ++startByteIndex;
+      }
+
+      // fill in one full byte at a time
+      validityBuffer.setOne(startByteIndex, endByteIndex - startByteIndex);
+      valueBuffer.setOne(startByteIndex, endByteIndex - startByteIndex);
+
+      // fill in the last byte (if it's not full)
+      if (endBytebitIndex != 0) {
+        final int byteIndex = BitVectorHelper.byteIndex(lastBitIndex - endBytebitIndex);
+        // bits 0..endBytebitIndex-1 of the last byte
+        final byte bitMask = (byte) (0xFFL >>> ((8 - endBytebitIndex) & 7));
+        BitVectorHelper.setBitMaskedByte(validityBuffer, byteIndex, bitMask);
+        BitVectorHelper.setBitMaskedByte(valueBuffer, byteIndex, bitMask);
+      }
+    }
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |                      vector transfer                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Construct a TransferPair comprising of this and a target vector of
+   * the same type. The target is a new BitVector with the given name, this
+   * vector's field type and the given allocator.
+   *
+   * @param ref name of the target vector
+   * @param allocator allocator for the target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
+    return new TransferImpl(ref, allocator);
+  }
+
+  /**
+   * Construct a TransferPair with a desired target vector of the same type.
+   *
+   * @param to target vector; it is cast to BitVector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair makeTransferPair(ValueVector to) {
+    final BitVector target = (BitVector) to;
+    return new TransferImpl(target);
+  }
+
+  /* TransferPair from the enclosing vector (the source) to the vector held in "to". */
+  private class TransferImpl implements TransferPair {
+    BitVector to;
+
+    public TransferImpl(String ref, BufferAllocator allocator) {
+      this.to = new BitVector(ref, field.getFieldType(), allocator);
+    }
+
+    public TransferImpl(BitVector to) {
+      this.to = to;
+    }
+
+    @Override
+    public BitVector getTo() {
+      return this.to;
+    }
+
+    @Override
+    public void transfer() {
+      transferTo(this.to);
+    }
+
+    @Override
+    public void splitAndTransfer(int startIndex, int length) {
+      splitAndTransferTo(startIndex, length, this.to);
+    }
+
+    @Override
+    public void copyValueSafe(int fromIndex, int toIndex) {
+      this.to.copyFromSafe(fromIndex, toIndex, BitVector.this);
+    }
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java
new file mode 100644
index 000000000..3745c5a75
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java
@@ -0,0 +1,449 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.arrow.vector; + +import static io.netty.util.internal.PlatformDependent.getByte; +import static io.netty.util.internal.PlatformDependent.getInt; +import static io.netty.util.internal.PlatformDependent.getLong; +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BoundsChecking; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.util.DataSizeRoundingUtil; + +import io.netty.util.internal.PlatformDependent; + +/** + * Helper class for performing generic operations on a bit vector buffer. + * External use of this class is not recommended. + */ +public class BitVectorHelper { + + private BitVectorHelper() {} + + /** + * Get the index of byte corresponding to bit index in validity buffer. + */ + public static long byteIndex(long absoluteBitIndex) { + return absoluteBitIndex >> 3; + } + + /** + * Get the relative index of bit within the byte in validity buffer. + */ + public static int bitIndex(long absoluteBitIndex) { + return checkedCastToInt(absoluteBitIndex & 7); + } + + /** + * Get the index of byte corresponding to bit index in validity buffer. + */ + public static int byteIndex(int absoluteBitIndex) { + return absoluteBitIndex >> 3; + } + + /** + * Get the relative index of bit within the byte in validity buffer. + */ + public static int bitIndex(int absoluteBitIndex) { + return absoluteBitIndex & 7; + } + + /** + * Set the bit at provided index to 1. + * + * @param validityBuffer validity buffer of the vector + * @param index index to be set + */ + public static void setBit(ArrowBuf validityBuffer, long index) { + // it can be observed that some logic is duplicate of the logic in setValidityBit. + // this is because JIT cannot always remove the if branch in setValidityBit, + // so we give a dedicated implementation for setting bits. 
+ final long byteIndex = byteIndex(index); + final int bitIndex = bitIndex(index); + + // the byte is promoted to an int, because according to Java specification, + // bytes will be promoted to ints automatically, upon expression evaluation. + // by promoting it manually, we avoid the unnecessary conversions. + int currentByte = validityBuffer.getByte(byteIndex); + final int bitMask = 1 << bitIndex; + currentByte |= bitMask; + validityBuffer.setByte(byteIndex, currentByte); + } + + /** + * Set the bit at provided index to 0. + * + * @param validityBuffer validity buffer of the vector + * @param index index to be set + */ + public static void unsetBit(ArrowBuf validityBuffer, int index) { + // it can be observed that some logic is duplicate of the logic in setValidityBit. + // this is because JIT cannot always remove the if branch in setValidityBit, + // so we give a dedicated implementation for unsetting bits. + final int byteIndex = byteIndex(index); + final int bitIndex = bitIndex(index); + + // the byte is promoted to an int, because according to Java specification, + // bytes will be promoted to ints automatically, upon expression evaluation. + // by promoting it manually, we avoid the unnecessary conversions. + int currentByte = validityBuffer.getByte(byteIndex); + final int bitMask = 1 << bitIndex; + currentByte &= ~bitMask; + validityBuffer.setByte(byteIndex, currentByte); + } + + /** + * Set the bit at a given index to provided value (1 or 0). + * + * @param validityBuffer validity buffer of the vector + * @param index index to be set + * @param value value to set + */ + public static void setValidityBit(ArrowBuf validityBuffer, int index, int value) { + final int byteIndex = byteIndex(index); + final int bitIndex = bitIndex(index); + + // the byte is promoted to an int, because according to Java specification, + // bytes will be promoted to ints automatically, upon expression evaluation. + // by promoting it manually, we avoid the unnecessary conversions. 
+ int currentByte = validityBuffer.getByte(byteIndex); + final int bitMask = 1 << bitIndex; + if (value != 0) { + currentByte |= bitMask; + } else { + currentByte &= ~bitMask; + } + validityBuffer.setByte(byteIndex, currentByte); + } + + /** + * Set the bit at a given index to provided value (1 or 0). Internally + * takes care of allocating the buffer if the caller didn't do so. + * + * @param validityBuffer validity buffer of the vector + * @param allocator allocator for the buffer + * @param valueCount number of values to allocate/set + * @param index index to be set + * @param value value to set + * @return ArrowBuf + */ + public static ArrowBuf setValidityBit(ArrowBuf validityBuffer, BufferAllocator allocator, + int valueCount, int index, int value) { + if (validityBuffer == null) { + validityBuffer = allocator.buffer(getValidityBufferSize(valueCount)); + } + setValidityBit(validityBuffer, index, value); + if (index == (valueCount - 1)) { + validityBuffer.writerIndex(getValidityBufferSize(valueCount)); + } + + return validityBuffer; + } + + /** + * Check if a bit at a given index is set or not. + * + * @param buffer buffer to check + * @param index index of the buffer + * @return 1 if bit is set, 0 otherwise. + */ + public static int get(final ArrowBuf buffer, int index) { + final int byteIndex = index >> 3; + final byte b = buffer.getByte(byteIndex); + final int bitIndex = index & 7; + return (b >> bitIndex) & 0x01; + } + + /** + * Compute the size of validity buffer required to manage a given number + * of elements in a vector. + * + * @param valueCount number of elements in the vector + * @return buffer size + */ + public static int getValidityBufferSize(int valueCount) { + return DataSizeRoundingUtil.divideBy8Ceil(valueCount); + } + + /** + * Given a validity buffer, find the number of bits that are not set. + * This is used to compute the number of null elements in a nullable vector. 
+ * + * @param validityBuffer validity buffer of the vector + * @param valueCount number of values in the vector + * @return number of bits not set. + */ + public static int getNullCount(final ArrowBuf validityBuffer, final int valueCount) { + if (valueCount == 0) { + return 0; + } + int count = 0; + final int sizeInBytes = getValidityBufferSize(valueCount); + // If value count is not a multiple of 8, then calculate number of used bits in the last byte + final int remainder = valueCount % 8; + final int fullBytesCount = remainder == 0 ? sizeInBytes : sizeInBytes - 1; + + int index = 0; + while (index + 8 <= fullBytesCount) { + long longValue = validityBuffer.getLong(index); + count += Long.bitCount(longValue); + index += 8; + } + + if (index + 4 <= fullBytesCount) { + int intValue = validityBuffer.getInt(index); + count += Integer.bitCount(intValue); + index += 4; + } + + while (index < fullBytesCount) { + byte byteValue = validityBuffer.getByte(index); + count += Integer.bitCount(byteValue & 0xFF); + index += 1; + } + + // handling with the last bits + if (remainder != 0) { + byte byteValue = validityBuffer.getByte(sizeInBytes - 1); + + // making the remaining bits all 1s if it is not fully filled + byte mask = (byte) (0xFF << remainder); + byteValue = (byte) (byteValue | mask); + count += Integer.bitCount(byteValue & 0xFF); + } + + return 8 * sizeInBytes - count; + } + + /** + * Tests if all bits in a validity buffer are equal 0 or 1, according to the specified parameter. + * @param validityBuffer the validity buffer. + * @param valueCount the bit count. + * @param checkOneBits if set to true, the method checks if all bits are equal to 1; + * otherwise, it checks if all bits are equal to 0. + * @return true if all bits are 0 or 1 according to the parameter, and false otherwise. 
+ */ + public static boolean checkAllBitsEqualTo( + final ArrowBuf validityBuffer, final int valueCount, final boolean checkOneBits) { + if (valueCount == 0) { + return true; + } + final int sizeInBytes = getValidityBufferSize(valueCount); + + // boundary check + validityBuffer.checkBytes(0, sizeInBytes); + + // If value count is not a multiple of 8, then calculate number of used bits in the last byte + final int remainder = valueCount % 8; + final int fullBytesCount = remainder == 0 ? sizeInBytes : sizeInBytes - 1; + + // the integer number to compare against + final int intToCompare = checkOneBits ? -1 : 0; + + int index = 0; + while (index + 8 <= fullBytesCount) { + long longValue = getLong(validityBuffer.memoryAddress() + index); + if (longValue != (long) intToCompare) { + return false; + } + index += 8; + } + + if (index + 4 <= fullBytesCount) { + int intValue = getInt(validityBuffer.memoryAddress() + index); + if (intValue != intToCompare) { + return false; + } + index += 4; + } + + while (index < fullBytesCount) { + byte byteValue = getByte(validityBuffer.memoryAddress() + index); + if (byteValue != (byte) intToCompare) { + return false; + } + index += 1; + } + + // handling with the last bits + if (remainder != 0) { + byte byteValue = getByte(validityBuffer.memoryAddress() + sizeInBytes - 1); + byte mask = (byte) ((1 << remainder) - 1); + byteValue = (byte) (byteValue & mask); + if (checkOneBits) { + if ((mask & byteValue) != mask) { + return false; + } + } else { + if (byteValue != (byte) 0) { + return false; + } + } + } + return true; + } + + /** Returns the byte at index from data right-shifted by offset. */ + public static byte getBitsFromCurrentByte(final ArrowBuf data, final int index, final int offset) { + return (byte) ((data.getByte(index) & 0xFF) >>> offset); + } + + /** + * Returns the byte at index from data left-shifted by (8 - offset).
+ */ + public static byte getBitsFromNextByte(ArrowBuf data, int index, int offset) { + return (byte) ((data.getByte(index) << (8 - offset))); + } + + /** + * Returns a new buffer if the source validity buffer is either all null or all + * not-null, otherwise returns a buffer pointing to the same memory as source. + * + * @param fieldNode The fieldNode containing the null count + * @param sourceValidityBuffer The source validity buffer that will have its + * position copied if there is a mix of null and non-null values + * @param allocator The allocator to use for creating a new buffer if necessary. + * @return A new buffer that is either allocated or points to the same memory as sourceValidityBuffer. + */ + public static ArrowBuf loadValidityBuffer(final ArrowFieldNode fieldNode, + final ArrowBuf sourceValidityBuffer, + final BufferAllocator allocator) { + final int valueCount = fieldNode.getLength(); + ArrowBuf newBuffer = null; + /* either all NULLs or all non-NULLs */ + if (fieldNode.getNullCount() == 0 || fieldNode.getNullCount() == valueCount) { + newBuffer = allocator.buffer(getValidityBufferSize(valueCount)); + newBuffer.setZero(0, newBuffer.capacity()); + if (fieldNode.getNullCount() != 0) { + /* all NULLs */ + return newBuffer; + } + /* all non-NULLs */ + int fullBytesCount = valueCount / 8; + newBuffer.setOne(0, fullBytesCount); + int remainder = valueCount % 8; + if (remainder > 0) { + byte bitMask = (byte) (0xFFL >>> ((8 - remainder) & 7)); + newBuffer.setByte(fullBytesCount, bitMask); + } + } else { + /* mixed byte pattern -- create another ArrowBuf associated with the + * target allocator + */ + newBuffer = sourceValidityBuffer.getReferenceManager().retain(sourceValidityBuffer, allocator); + } + + return newBuffer; + } + + /** + * Set the byte of the given index in the data buffer by applying a bit mask to + * the current byte at that index. 
+ * + * @param data buffer to set + * @param byteIndex byteIndex within the buffer + * @param bitMask bit mask to be set + */ + static void setBitMaskedByte(ArrowBuf data, int byteIndex, byte bitMask) { + byte currentByte = data.getByte(byteIndex); + currentByte |= bitMask; + data.setByte(byteIndex, currentByte); + } + + /** + * Concat two validity buffers. + * @param input1 the first validity buffer. + * @param numBits1 the number of bits in the first validity buffer. + * @param input2 the second validity buffer. + * @param numBits2 the number of bits in the second validity buffer. + * @param output the output validity buffer. It can be the same one as the first input. + * The caller must make sure the output buffer has enough capacity. + */ + public static void concatBits(ArrowBuf input1, int numBits1, ArrowBuf input2, int numBits2, ArrowBuf output) { + int numBytes1 = DataSizeRoundingUtil.divideBy8Ceil(numBits1); + int numBytes2 = DataSizeRoundingUtil.divideBy8Ceil(numBits2); + int numBytesOut = DataSizeRoundingUtil.divideBy8Ceil(numBits1 + numBits2); + + if (BoundsChecking.BOUNDS_CHECKING_ENABLED) { + output.checkBytes(0, numBytesOut); + } + + // copy the first bit set + if (input1 != output) { + PlatformDependent.copyMemory(input1.memoryAddress(), output.memoryAddress(), numBytes1); + } + + if (bitIndex(numBits1) == 0) { + // The number of bits for the first bit set is a multiple of 8, so the boundary is at byte boundary. + // For this case, we have a shortcut to copy all bytes from the second set after the byte boundary. 
+ PlatformDependent.copyMemory(input2.memoryAddress(), output.memoryAddress() + numBytes1, numBytes2); + return; + } + + // the number of bits to fill a full byte after the first input is processed + int numBitsToFill = 8 - bitIndex(numBits1); + + // mask to clear high bits + int mask = (1 << (8 - numBitsToFill)) - 1; + + int numFullBytes = numBits2 / 8; + + int prevByte = output.getByte(numBytes1 - 1) & mask; + for (int i = 0; i < numFullBytes; i++) { + int curByte = input2.getByte(i) & 0xff; + + // first fill the bits to a full byte + int byteToFill = (curByte << (8 - numBitsToFill)) & 0xff; + output.setByte(numBytes1 + i - 1, byteToFill | prevByte); + + // fill remaining bits in the current byte + // note that it is also the previous byte for the next iteration + prevByte = curByte >>> numBitsToFill; + } + + int lastOutputByte = prevByte; + + // the number of extra bits for the second input, relative to full bytes + int numTrailingBits = bitIndex(numBits2); + + if (numTrailingBits == 0) { + output.setByte(numBytes1 + numFullBytes - 1, lastOutputByte); + return; + } + + // process remaining bits from input2 + int remByte = input2.getByte(numBytes2 - 1) & 0xff; + + int byteToFill = remByte << (8 - numBitsToFill); + lastOutputByte |= byteToFill; + + output.setByte(numBytes1 + numFullBytes - 1, lastOutputByte); + + if (numTrailingBits > numBitsToFill) { + // clear all bits for the last byte before writing + output.setByte(numBytes1 + numFullBytes, 0); + + // some remaining bits cannot be filled in the previous byte + int leftByte = remByte >>> numBitsToFill; + output.setByte(numBytes1 + numFullBytes, leftByte); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java new file mode 100644 index 000000000..ccba5b26c --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java @@ -0,0 +1,31 @@ +/* + * Licensed 
to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; + +/** + * Content is backed by a buffer and can be loaded/unloaded. + */ +public interface BufferBacked { + + void load(ArrowFieldNode fieldNode, ArrowBuf data); + + ArrowBuf unLoad(); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java new file mode 100644 index 000000000..09c874e39 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.util.Preconditions; + +/** + * Metadata class that captures the "type" of an Arrow buffer. + * (e.g. data buffers, offset buffers for variable width types and validity + * buffers). + */ +public class BufferLayout { + + /** + * Enumeration of the different logical types a buffer can have. + */ + public enum BufferType { + DATA("DATA"), + OFFSET("OFFSET"), + VALIDITY("VALIDITY"), + TYPE("TYPE_ID"); + + private final String name; + + BufferType(String name) { + this.name = name; + } + + public String getName() { + return name; + } + } + + private static final BufferLayout VALIDITY_BUFFER = new BufferLayout(BufferType.VALIDITY, 1); + private static final BufferLayout OFFSET_BUFFER = new BufferLayout(BufferType.OFFSET, 32); + private static final BufferLayout LARGE_OFFSET_BUFFER = new BufferLayout(BufferType.OFFSET, 64); + private static final BufferLayout TYPE_BUFFER = new BufferLayout(BufferType.TYPE, 32); + private static final BufferLayout BIT_BUFFER = new BufferLayout(BufferType.DATA, 1); + private static final BufferLayout VALUES_256 = new BufferLayout(BufferType.DATA, 256); + private static final BufferLayout VALUES_128 = new BufferLayout(BufferType.DATA, 128); + private static final BufferLayout VALUES_64 = new BufferLayout(BufferType.DATA, 64); + private static final BufferLayout VALUES_32 = new BufferLayout(BufferType.DATA, 32); + private static final BufferLayout VALUES_16 = new BufferLayout(BufferType.DATA, 16); + private static final BufferLayout VALUES_8 = new 
BufferLayout(BufferType.DATA, 8); + + public static BufferLayout typeBuffer() { + return TYPE_BUFFER; + } + + public static BufferLayout offsetBuffer() { + return OFFSET_BUFFER; + } + + public static BufferLayout largeOffsetBuffer() { + return LARGE_OFFSET_BUFFER; + } + + /** + * Returns a databuffer for the given bitwidth. Only supports powers of two between 8 and 256 + * inclusive. + */ + public static BufferLayout dataBuffer(int typeBitWidth) { + switch (typeBitWidth) { + case 8: + return VALUES_8; + case 16: + return VALUES_16; + case 32: + return VALUES_32; + case 64: + return VALUES_64; + case 128: + return VALUES_128; + case 256: + return VALUES_256; + default: + throw new IllegalArgumentException("only 8, 16, 32, 64, 128, or 256 bits supported"); + } + } + + public static BufferLayout booleanVector() { + return BIT_BUFFER; + } + + public static BufferLayout validityVector() { + return VALIDITY_BUFFER; + } + + public static BufferLayout byteVector() { + return dataBuffer(8); + } + + private final short typeBitWidth; + + private final BufferType type; + + BufferLayout(BufferType type, int typeBitWidth) { + super(); + this.type = Preconditions.checkNotNull(type); + this.typeBitWidth = (short) typeBitWidth; + if (typeBitWidth <= 0) { + throw new IllegalArgumentException("bitWidth invalid: " + typeBitWidth); + } + } + + public int getTypeBitWidth() { + return typeBitWidth; + } + + public BufferType getType() { + return type; + } + + @Override + public String toString() { + return String.format("%s(%s)", type, typeBitWidth); + } + + @Override + public int hashCode() { + return 31 * (31 + type.hashCode()) + typeBitWidth; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + BufferLayout other = (BufferLayout) obj; + return type.equals(other.type) && (typeBitWidth == other.typeBitWidth); + } +} diff --git
a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java new file mode 100644 index 000000000..3e8826845 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java @@ -0,0 +1,347 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.DateDayReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.DateDayHolder; +import org.apache.arrow.vector.holders.NullableDateDayHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * DateDayVector implements a fixed width (4 bytes) vector of + * date values which could be null. 
A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public final class DateDayVector extends BaseFixedWidthVector { + + public static final byte TYPE_WIDTH = 4; + private final FieldReader reader; + + /** + * Instantiate a DateDayVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public DateDayVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.DATEDAY.getType()), allocator); + } + + /** + * Instantiate a DateDayVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public DateDayVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a DateDayVector. This doesn't allocate any memory for + * the data in vector. + * + * @param field Field materialized by this vector + * @param allocator allocator for memory management. + */ + public DateDayVector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new DateDayReaderImpl(DateDayVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. 
+ * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.DATEDAY; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public int get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getInt((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableDateDayHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getInt((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. + * + * @param index position of element + * @return element at given index + */ + public Integer getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getInt((long) index * TYPE_WIDTH); + } + } + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + private void setValue(int index, int value) { + valueBuffer.setInt((long) index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. 
+ * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableDateDayHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, DateDayHolder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableDateDayHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableDateDayHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, DateDayHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, DateDayHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, int value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, int)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, int value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static int get(final ArrowBuf buffer, final int index) { + return buffer.getInt((long) index * TYPE_WIDTH); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((DateDayVector) to); + } + + private class TransferImpl implements TransferPair { + DateDayVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new DateDayVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(DateDayVector to) { + this.to = to; + } + + @Override + public DateDayVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, DateDayVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DateMilliVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DateMilliVector.java new file mode 100644 index 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.arrow.vector;

import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;

import java.time.LocalDateTime;

import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.complex.impl.DateMilliReaderImpl;
import org.apache.arrow.vector.complex.reader.FieldReader;
import org.apache.arrow.vector.holders.DateMilliHolder;
import org.apache.arrow.vector.holders.NullableDateMilliHolder;
import org.apache.arrow.vector.types.Types.MinorType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.DateUtility;
import org.apache.arrow.vector.util.TransferPair;

/**
 * Fixed width vector of nullable date values, each stored as an 8 byte
 * {@code long}. Nullness of every slot is tracked by a separate validity
 * buffer (one bit per element).
 */
public final class DateMilliVector extends BaseFixedWidthVector {
  public static final byte TYPE_WIDTH = 8;
  private final FieldReader reader;

  /**
   * Create a nullable DateMilliVector. No memory is allocated for the
   * vector data by this constructor.
   *
   * @param name name of the vector
   * @param allocator allocator used for memory management.
   */
  public DateMilliVector(String name, BufferAllocator allocator) {
    this(name, FieldType.nullable(MinorType.DATEMILLI.getType()), allocator);
  }

  /**
   * Create a DateMilliVector for the given field type. No memory is
   * allocated for the vector data by this constructor.
   *
   * @param name name of the vector
   * @param fieldType type of Field materialized by this vector
   * @param allocator allocator used for memory management.
   */
  public DateMilliVector(String name, FieldType fieldType, BufferAllocator allocator) {
    this(new Field(name, fieldType, null), allocator);
  }

  /**
   * Create a DateMilliVector backing the given field. No memory is
   * allocated for the vector data by this constructor.
   *
   * @param field field materialized by this vector
   * @param allocator allocator used for memory management.
   */
  public DateMilliVector(Field field, BufferAllocator allocator) {
    super(field, allocator, TYPE_WIDTH);
    this.reader = new DateMilliReaderImpl(this);
  }

  /**
   * Byte offset of an element inside a data buffer. The multiplication is
   * done in {@code long} arithmetic so large indexes cannot overflow.
   */
  private static long byteOffset(int index) {
    return (long) index * TYPE_WIDTH;
  }

  /**
   * Return the reader created for this vector at construction time.
   *
   * @return Field Reader for this vector
   */
  @Override
  public FieldReader getReader() {
    return reader;
  }

  /**
   * Return the minor type of the values held by this vector.
   *
   * @return {@link org.apache.arrow.vector.types.Types.MinorType}
   */
  @Override
  public MinorType getMinorType() {
    return MinorType.DATEMILLI;
  }


  /*----------------------------------------------------------------*
   |                                                                |
   |          vector value retrieval methods                        |
   |                                                                |
   *----------------------------------------------------------------*/


  /**
   * Read the raw {@code long} stored at the given index.
   *
   * @param index position of element
   * @return element at given index
   * @throws IllegalStateException if null checking is enabled and the
   *         element at the index is null
   */
  public long get(int index) throws IllegalStateException {
    final boolean missing = NULL_CHECKING_ENABLED && isSet(index) == 0;
    if (missing) {
      throw new IllegalStateException("Value at index is null");
    }
    return valueBuffer.getLong(byteOffset(index));
  }

  /**
   * Copy the element at the given index into the holder. When the element
   * is null only {@code holder.isSet} is written (as zero) and the holder
   * value is left untouched.
   *
   * @param index position of element
   * @param holder holder receiving the null flag and the value
   */
  public void get(int index, NullableDateMilliHolder holder) {
    final boolean present = isSet(index) != 0;
    holder.isSet = present ? 1 : 0;
    if (present) {
      holder.value = valueBuffer.getLong(byteOffset(index));
    }
  }

  /**
   * Object flavour of {@link #get(int)}: a null element yields {@code null},
   * any other element is converted with
   * {@link DateUtility#getLocalDateTimeFromEpochMilli(long)}.
   *
   * @param index position of element
   * @return element at given index, or {@code null}
   */
  public LocalDateTime getObject(int index) {
    if (isSet(index) == 0) {
      return null;
    }
    return DateUtility.getLocalDateTimeFromEpochMilli(valueBuffer.getLong(byteOffset(index)));
  }

  /*----------------------------------------------------------------*
   |                                                                |
   |          vector value setter methods                           |
   |                                                                |
   *----------------------------------------------------------------*/


  /** Write the raw value into the data buffer without touching validity. */
  private void setValue(int index, long value) {
    valueBuffer.setLong(byteOffset(index), value);
  }

  /**
   * Mark the element at the given index as non-null and store the value.
   *
   * @param index position of element
   * @param value value of element
   */
  public void set(int index, long value) {
    BitVectorHelper.setBit(validityBuffer, index);
    setValue(index, value);
  }

  /**
   * Store the holder content at the given index. A holder flagged as unset
   * (zero) makes the element null.
   *
   * @param index position of element
   * @param holder nullable data holder for value of element
   * @throws IllegalArgumentException if {@code holder.isSet} is negative
   */
  public void set(int index, NullableDateMilliHolder holder) throws IllegalArgumentException {
    if (holder.isSet < 0) {
      throw new IllegalArgumentException();
    }
    if (holder.isSet == 0) {
      BitVectorHelper.unsetBit(validityBuffer, index);
      return;
    }
    BitVectorHelper.setBit(validityBuffer, index);
    setValue(index, holder.value);
  }

  /**
   * Store the (always non-null) holder value at the given index.
   *
   * @param index position of element
   * @param holder data holder for value of element
   */
  public void set(int index, DateMilliHolder holder) {
    set(index, holder.value);
  }

  /**
   * Variant of {@link #set(int, long)} that first handles the case when
   * index is greater than or equal to the existing value capacity
   * {@link #getValueCapacity()}.
   *
   * @param index position of element
   * @param value value of element
   */
  public void setSafe(int index, long value) {
    handleSafe(index);
    set(index, value);
  }

  /**
   * Variant of {@link #set(int, NullableDateMilliHolder)} that first handles
   * the case when index is greater than or equal to the existing value
   * capacity {@link #getValueCapacity()}.
   *
   * @param index position of element
   * @param holder nullable data holder for value of element
   */
  public void setSafe(int index, NullableDateMilliHolder holder) throws IllegalArgumentException {
    handleSafe(index);
    set(index, holder);
  }

  /**
   * Variant of {@link #set(int, DateMilliHolder)} that first handles the
   * case when index is greater than or equal to the existing value
   * capacity {@link #getValueCapacity()}.
   *
   * @param index position of element
   * @param holder data holder for value of element
   */
  public void setSafe(int index, DateMilliHolder holder) {
    handleSafe(index);
    set(index, holder);
  }

  /**
   * Store a value together with an explicit null flag. Any {@code isSet}
   * that is not strictly positive makes the element null.
   *
   * @param index position of the new value
   * @param isSet 0 for NULL value, 1 otherwise
   * @param value element value
   */
  public void set(int index, int isSet, long value) {
    if (isSet <= 0) {
      BitVectorHelper.unsetBit(validityBuffer, index);
      return;
    }
    set(index, value);
  }

  /**
   * Variant of {@link #set(int, int, long)} that first handles the case
   * when index is greater than or equal to the current value capacity of
   * the vector.
   *
   * @param index position of the new value
   * @param isSet 0 for NULL value, 1 otherwise
   * @param value element value
   */
  public void setSafe(int index, int isSet, long value) {
    handleSafe(index);
    set(index, isSet, value);
  }

  /**
   * Read the value stored at a particular position of a raw data buffer.
   *
   * <p>This method should not be used externally.
   *
   * @param buffer data buffer
   * @param index position of the element.
   * @return value stored at the index.
   */
  public static long get(final ArrowBuf buffer, final int index) {
    return buffer.getLong(byteOffset(index));
  }


  /*----------------------------------------------------------------*
   |                                                                |
   |                      vector transfer                           |
   |                                                                |
   *----------------------------------------------------------------*/


  /**
   * Build a TransferPair from this vector to a newly created target vector
   * of the same type.
   *
   * @param ref name of the target vector
   * @param allocator allocator for the target vector
   * @return {@link TransferPair}
   */
  @Override
  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
    return new TransferImpl(ref, allocator);
  }

  /**
   * Build a TransferPair from this vector to the supplied target vector,
   * which is cast to {@link DateMilliVector}.
   *
   * @param to target vector
   * @return {@link TransferPair}
   */
  @Override
  public TransferPair makeTransferPair(ValueVector to) {
    return new TransferImpl((DateMilliVector) to);
  }

  /** Transfer pair whose source is the enclosing vector. */
  private class TransferImpl implements TransferPair {
    DateMilliVector to;

    /** Target is a fresh vector that reuses the source field type. */
    public TransferImpl(String ref, BufferAllocator allocator) {
      this.to = new DateMilliVector(ref, field.getFieldType(), allocator);
    }

    /** Target is an existing vector supplied by the caller. */
    public TransferImpl(DateMilliVector to) {
      this.to = to;
    }

    @Override
    public DateMilliVector getTo() {
      return to;
    }

    @Override
    public void transfer() {
      transferTo(to);
    }

    @Override
    public void splitAndTransfer(int startIndex, int length) {
      splitAndTransferTo(startIndex, length, to);
    }

    @Override
    public void copyValueSafe(int fromIndex, int toIndex) {
      to.copyFromSafe(fromIndex, toIndex, DateMilliVector.this);
    }
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.arrow.vector;

import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;

import java.math.BigDecimal;
import java.nio.ByteOrder;

import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.complex.impl.Decimal256ReaderImpl;
import org.apache.arrow.vector.complex.reader.FieldReader;
import org.apache.arrow.vector.holders.Decimal256Holder;
import org.apache.arrow.vector.holders.NullableDecimal256Holder;
import org.apache.arrow.vector.types.Types.MinorType;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.DecimalUtility;
import org.apache.arrow.vector.util.TransferPair;

import io.netty.util.internal.PlatformDependent;

/**
 * Decimal256Vector implements a fixed width vector (32 bytes) of
 * decimal values which could be null. A validity buffer (bit vector) is
 * maintained to track which elements in the vector are null.
 */
public final class Decimal256Vector extends BaseFixedWidthVector {
  public static final byte TYPE_WIDTH = 32;
  private static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;
  private final FieldReader reader;

  private final int precision;
  private final int scale;

  /**
   * Instantiate a Decimal256Vector. This doesn't allocate any memory for
   * the data in vector.
   *
   * @param name name of the vector
   * @param allocator allocator for memory management.
   * @param precision precision of the decimal type materialized by this vector
   * @param scale scale of the decimal type materialized by this vector
   */
  public Decimal256Vector(String name, BufferAllocator allocator,
                               int precision, int scale) {
    this(name, FieldType.nullable(new ArrowType.Decimal(precision, scale, /*bitWidth=*/TYPE_WIDTH * 8)), allocator);
  }

  /**
   * Instantiate a Decimal256Vector. This doesn't allocate any memory for
   * the data in vector.
   *
   * @param name name of the vector
   * @param fieldType type of Field materialized by this vector
   * @param allocator allocator for memory management.
   */
  public Decimal256Vector(String name, FieldType fieldType, BufferAllocator allocator) {
    this(new Field(name, fieldType, null), allocator);
  }

  /**
   * Instantiate a Decimal256Vector. This doesn't allocate any memory for
   * the data in vector. Precision and scale are taken from the field's
   * {@link ArrowType.Decimal} type.
   *
   * @param field field materialized by this vector
   * @param allocator allocator for memory management.
   */
  public Decimal256Vector(Field field, BufferAllocator allocator) {
    super(field, allocator, TYPE_WIDTH);
    ArrowType.Decimal arrowType = (ArrowType.Decimal) field.getFieldType().getType();
    reader = new Decimal256ReaderImpl(Decimal256Vector.this);
    this.precision = arrowType.getPrecision();
    this.scale = arrowType.getScale();
  }

  /**
   * Get a reader that supports reading values from this vector.
   *
   * @return Field Reader for this vector
   */
  @Override
  public FieldReader getReader() {
    return reader;
  }

  /**
   * Get minor type for this vector. The vector holds values belonging
   * to a particular type.
   *
   * @return {@link org.apache.arrow.vector.types.Types.MinorType}
   */
  @Override
  public MinorType getMinorType() {
    return MinorType.DECIMAL256;
  }


  /*----------------------------------------------------------------*
   |                                                                |
   |          vector value retrieval methods                        |
   |                                                                |
   *----------------------------------------------------------------*/


  /**
   * Get the element at the given index from the vector.
   *
   * @param index position of element
   * @return a {@link #TYPE_WIDTH}-byte slice of the value buffer holding the element
   * @throws IllegalStateException if null checking is enabled and the
   *         element at the index is null
   */
  public ArrowBuf get(int index) throws IllegalStateException {
    if (NULL_CHECKING_ENABLED && isSet(index) == 0) {
      throw new IllegalStateException("Value at index is null");
    }
    return valueBuffer.slice((long) index * TYPE_WIDTH, TYPE_WIDTH);
  }

  /**
   * Get the element at the given index from the vector and
   * sets the state in holder. If element at given index
   * is null, holder.isSet will be zero and no other holder field is written.
   *
   * @param index position of element
   * @param holder holder receiving the value buffer, start offset, precision and scale
   */
  public void get(int index, NullableDecimal256Holder holder) {
    if (isSet(index) == 0) {
      holder.isSet = 0;
      return;
    }
    holder.isSet = 1;
    holder.buffer = valueBuffer;
    holder.precision = precision;
    holder.scale = scale;
    holder.start = ((long) index) * TYPE_WIDTH;
  }

  /**
   * Object flavour of {@link #get(int)}: returns {@code null} for a null
   * element, otherwise the element decoded as a {@link BigDecimal}.
   *
   * @param index position of element
   * @return element at given index, or {@code null}
   */
  public BigDecimal getObject(int index) {
    if (isSet(index) == 0) {
      return null;
    } else {
      return DecimalUtility.getBigDecimalFromArrowBuf(valueBuffer, index, scale, TYPE_WIDTH);
    }
  }

  /**
   * Return precision for the decimal value.
   */
  public int getPrecision() {
    return precision;
  }

  /**
   * Return scale for the decimal value.
   */
  public int getScale() {
    return scale;
  }


  /*----------------------------------------------------------------*
   |                                                                |
   |          vector value setter methods                           |
   |                                                                |
   *----------------------------------------------------------------*/


  /**
   * Reject value lengths that do not fit in one element. Called before any
   * raw memory write so that an oversized input can never spill into
   * neighbouring elements.
   *
   * @param length number of value bytes supplied by the caller
   * @throws IllegalArgumentException if length is negative or larger than {@link #TYPE_WIDTH}
   */
  private static void checkValueLength(int length) {
    if (length < 0 || length > TYPE_WIDTH) {
      throw new IllegalArgumentException(
          "Invalid decimal value length. Valid length in [1 - 32], got " + length);
    }
  }

  /**
   * Set the element at the given index to the given value.
   *
   * @param index position of element
   * @param buffer ArrowBuf containing decimal value.
   */
  public void set(int index, ArrowBuf buffer) {
    BitVectorHelper.setBit(validityBuffer, index);
    valueBuffer.setBytes((long) index * TYPE_WIDTH, buffer, 0, TYPE_WIDTH);
  }

  /**
   * Set the decimal element at given index to the provided array of bytes.
   * Decimal256 is now implemented as Native Endian. This API allows the user
   * to pass a decimal value in the form of byte array in BE byte order.
   *
   * <p>Consumers of Arrow code can use this API instead of first swapping
   * the source bytes (doing a write and read) and then finally writing to
   * ArrowBuf of decimal vector.
   *
   * <p>This method takes care of adding the necessary padding if the length
   * of byte array is less than 32 (length of decimal type). An empty array
   * stores the value zero.
   *
   * @param index position of element
   * @param value array of bytes containing decimal in big endian byte order.
   * @throws IllegalArgumentException if the array is longer than 32 bytes;
   *         the vector is left unmodified in that case
   */
  public void setBigEndian(int index, byte[] value) {
    final int length = value.length;
    // Validate before touching memory: the writes below bypass bounds checks.
    checkValueLength(length);

    // do the bound check.
    valueBuffer.checkBytes((long) index * TYPE_WIDTH, (long) (index + 1) * TYPE_WIDTH);
    BitVectorHelper.setBit(validityBuffer, index);

    final long outAddress = valueBuffer.memoryAddress() + (long) index * TYPE_WIDTH;
    if (length == 0) {
      PlatformDependent.setMemory(outAddress, Decimal256Vector.TYPE_WIDTH, (byte) 0);
      return;
    }

    // value[0] is the most significant byte of the big endian input.
    final byte pad = (byte) (value[0] < 0 ? 0xFF : 0x00);
    if (LITTLE_ENDIAN) {
      // swap bytes to convert BE to LE
      for (int byteIdx = 0; byteIdx < length; ++byteIdx) {
        PlatformDependent.putByte(outAddress + byteIdx, value[length - 1 - byteIdx]);
      }
      if (length < TYPE_WIDTH) {
        // sign extend into the high-order (trailing) bytes
        PlatformDependent.setMemory(outAddress + length, Decimal256Vector.TYPE_WIDTH - length, pad);
      }
    } else {
      // copy data from value to the low-order end of the slot
      PlatformDependent.copyMemory(value, 0, outAddress + Decimal256Vector.TYPE_WIDTH - length, length);
      if (length < TYPE_WIDTH) {
        // sign extend into the high-order (leading) bytes
        PlatformDependent.setMemory(outAddress, Decimal256Vector.TYPE_WIDTH - length, pad);
      }
    }
  }

  /**
   * Set the element at the given index to the given value.
   *
   * @param index position of element
   * @param start start index of data in the buffer
   * @param buffer ArrowBuf containing decimal value.
   */
  public void set(int index, long start, ArrowBuf buffer) {
    BitVectorHelper.setBit(validityBuffer, index);
    valueBuffer.setBytes((long) index * TYPE_WIDTH, buffer, start, TYPE_WIDTH);
  }

  /**
   * Sets the element at given index using the buffer whose size maybe &lt;= 32 bytes.
   * Shorter values are sign extended to 32 bytes; a zero length stores the value zero.
   *
   * @param index index to write the decimal to
   * @param start start of value in the buffer
   * @param buffer contains the decimal in native endian bytes
   * @param length length of the value in the buffer
   * @throws IllegalArgumentException if length is negative or larger than 32
   */
  public void setSafe(int index, long start, ArrowBuf buffer, int length) {
    // Validate before touching memory: the copies below bypass bounds checks.
    checkValueLength(length);
    handleSafe(index);

    // do the bound checks.
    buffer.checkBytes(start, start + length);
    valueBuffer.checkBytes((long) index * TYPE_WIDTH, (long) (index + 1) * TYPE_WIDTH);
    BitVectorHelper.setBit(validityBuffer, index);

    final long inAddress = buffer.memoryAddress() + start;
    final long outAddress = valueBuffer.memoryAddress() + (long) index * TYPE_WIDTH;
    if (length == 0) {
      // No sign byte to read; store zero, as setBigEndian does for an empty array.
      PlatformDependent.setMemory(outAddress, Decimal256Vector.TYPE_WIDTH, (byte) 0);
      return;
    }
    if (LITTLE_ENDIAN) {
      PlatformDependent.copyMemory(inAddress, outAddress, length);
      // sign extend
      if (length < TYPE_WIDTH) {
        // most significant byte is the last one of a little endian value
        byte msb = PlatformDependent.getByte(inAddress + length - 1);
        final byte pad = (byte) (msb < 0 ? 0xFF : 0x00);
        PlatformDependent.setMemory(outAddress + length, Decimal256Vector.TYPE_WIDTH - length, pad);
      }
    } else {
      PlatformDependent.copyMemory(inAddress, outAddress + Decimal256Vector.TYPE_WIDTH - length, length);
      // sign extend
      if (length < TYPE_WIDTH) {
        // most significant byte is the first one of a big endian value
        byte msb = PlatformDependent.getByte(inAddress);
        final byte pad = (byte) (msb < 0 ? 0xFF : 0x00);
        PlatformDependent.setMemory(outAddress, Decimal256Vector.TYPE_WIDTH - length, pad);
      }
    }
  }


  /**
   * Sets the element at given index using the buffer whose size maybe &lt;= 32 bytes.
   * Shorter values are sign extended to 32 bytes; a zero length stores the value zero.
   *
   * @param index index to write the decimal to
   * @param start start of value in the buffer
   * @param buffer contains the decimal in big endian bytes
   * @param length length of the value in the buffer
   * @throws IllegalArgumentException if length is negative or larger than 32
   */
  public void setBigEndianSafe(int index, long start, ArrowBuf buffer, int length) {
    // Validate before touching memory: the copies below bypass bounds checks.
    checkValueLength(length);
    handleSafe(index);

    // do the bound checks.
    buffer.checkBytes(start, start + length);
    valueBuffer.checkBytes((long) index * TYPE_WIDTH, (long) (index + 1) * TYPE_WIDTH);
    BitVectorHelper.setBit(validityBuffer, index);

    // not using buffer.getByte() to avoid boundary checks for every byte.
    final long inAddress = buffer.memoryAddress() + start;
    final long outAddress = valueBuffer.memoryAddress() + (long) index * TYPE_WIDTH;
    if (length == 0) {
      // No sign byte to read; store zero, as setBigEndian does for an empty array.
      PlatformDependent.setMemory(outAddress, Decimal256Vector.TYPE_WIDTH, (byte) 0);
      return;
    }

    // first input byte is the most significant one of the big endian value
    final byte msb = PlatformDependent.getByte(inAddress);
    final byte pad = (byte) (msb < 0 ? 0xFF : 0x00);
    if (LITTLE_ENDIAN) {
      // swap bytes to convert BE to LE
      for (int byteIdx = 0; byteIdx < length; ++byteIdx) {
        byte val = PlatformDependent.getByte((inAddress + length - 1) - byteIdx);
        PlatformDependent.putByte(outAddress + byteIdx, val);
      }
      // sign extend
      if (length < TYPE_WIDTH) {
        PlatformDependent.setMemory(outAddress + length, Decimal256Vector.TYPE_WIDTH - length, pad);
      }
    } else {
      PlatformDependent.copyMemory(inAddress, outAddress + Decimal256Vector.TYPE_WIDTH - length, length);
      // sign extend
      if (length < TYPE_WIDTH) {
        PlatformDependent.setMemory(outAddress, Decimal256Vector.TYPE_WIDTH - length, pad);
      }
    }
  }

  /**
   * Set the element at the given index to the given value. The value is
   * checked against this vector's precision and scale before anything is
   * written, so a rejected value leaves the element untouched.
   *
   * @param index position of element
   * @param value BigDecimal containing decimal value.
   */
  public void set(int index, BigDecimal value) {
    DecimalUtility.checkPrecisionAndScale(value, precision, scale);
    BitVectorHelper.setBit(validityBuffer, index);
    DecimalUtility.writeBigDecimalToArrowBuf(value, valueBuffer, index, TYPE_WIDTH);
  }

  /**
   * Set the element at the given index to the given value.
   *
   * @param index position of element
   * @param value long value.
   */
  public void set(int index, long value) {
    BitVectorHelper.setBit(validityBuffer, index);
    DecimalUtility.writeLongToArrowBuf(value, valueBuffer, index, TYPE_WIDTH);
  }

  /**
   * Set the element at the given index to the value set in data holder.
   * If the value in holder is not indicated as set, element in the
   * at the given index will be null.
   *
   * @param index position of element
   * @param holder nullable data holder for value of element
   * @throws IllegalArgumentException if {@code holder.isSet} is negative
   */
  public void set(int index, NullableDecimal256Holder holder) throws IllegalArgumentException {
    if (holder.isSet < 0) {
      throw new IllegalArgumentException();
    } else if (holder.isSet > 0) {
      BitVectorHelper.setBit(validityBuffer, index);
      valueBuffer.setBytes((long) index * TYPE_WIDTH, holder.buffer, holder.start, TYPE_WIDTH);
    } else {
      BitVectorHelper.unsetBit(validityBuffer, index);
    }
  }

  /**
   * Set the element at the given index to the value set in data holder.
   *
   * @param index position of element
   * @param holder data holder for value of element
   */
  public void set(int index, Decimal256Holder holder) {
    BitVectorHelper.setBit(validityBuffer, index);
    valueBuffer.setBytes((long) index * TYPE_WIDTH, holder.buffer, holder.start, TYPE_WIDTH);
  }

  /**
   * Same as {@link #set(int, ArrowBuf)} except that it handles the
   * case when index is greater than or equal to existing
   * value capacity {@link #getValueCapacity()}.
   *
   * @param index position of element
   * @param buffer ArrowBuf containing decimal value.
   */
  public void setSafe(int index, ArrowBuf buffer) {
    handleSafe(index);
    set(index, buffer);
  }

  /**
   * Same as {@link #setBigEndian(int, byte[])} except that it handles the
   * case when index is greater than or equal to existing
   * value capacity {@link #getValueCapacity()}.
   *
   * @param index position of element
   * @param value array of bytes containing decimal in big endian byte order.
   */
  public void setBigEndianSafe(int index, byte[] value) {
    handleSafe(index);
    setBigEndian(index, value);
  }

  /**
   * Same as {@link #set(int, long, ArrowBuf)} except that it handles the
   * case when index is greater than or equal to existing
   * value capacity {@link #getValueCapacity()}.
   *
   * @param index position of element
   * @param start start index of data in the buffer
   * @param buffer ArrowBuf containing decimal value.
   */
  public void setSafe(int index, long start, ArrowBuf buffer) {
    handleSafe(index);
    set(index, start, buffer);
  }

  /**
   * Same as {@link #set(int, BigDecimal)} except that it handles the
   * case when index is greater than or equal to existing
   * value capacity {@link #getValueCapacity()}.
   *
   * @param index position of element
   * @param value BigDecimal containing decimal value.
   */
  public void setSafe(int index, BigDecimal value) {
    handleSafe(index);
    set(index, value);
  }

  /**
   * Same as {@link #set(int, long)} except that it handles the
   * case when index is greater than or equal to existing
   * value capacity {@link #getValueCapacity()}.
   *
   * @param index position of element
   * @param value long value.
   */
  public void setSafe(int index, long value) {
    handleSafe(index);
    set(index, value);
  }

  /**
   * Same as {@link #set(int, NullableDecimal256Holder)} except that it handles the
   * case when index is greater than or equal to existing
   * value capacity {@link #getValueCapacity()}.
   *
   * @param index position of element
   * @param holder nullable data holder for value of element
   */
  public void setSafe(int index, NullableDecimal256Holder holder) throws IllegalArgumentException {
    handleSafe(index);
    set(index, holder);
  }

  /**
   * Same as {@link #set(int, Decimal256Holder)} except that it handles the
   * case when index is greater than or equal to existing
   * value capacity {@link #getValueCapacity()}.
   *
   * @param index position of element
   * @param holder data holder for value of element
   */
  public void setSafe(int index, Decimal256Holder holder) {
    handleSafe(index);
    set(index, holder);
  }

  /**
   * Store the given value at a particular position in the vector. isSet indicates
   * whether the value is NULL or not.
   *
   * @param index position of the new value
   * @param isSet 0 for NULL value, 1 otherwise
   * @param start start position of the value in the buffer
   * @param buffer buffer containing the value to be stored in the vector
   */
  public void set(int index, int isSet, long start, ArrowBuf buffer) {
    if (isSet > 0) {
      set(index, start, buffer);
    } else {
      BitVectorHelper.unsetBit(validityBuffer, index);
    }
  }

  /**
   * Same as {@link #set(int, int, long, ArrowBuf)} except that it handles
   * the case when the position of new value is beyond the current value
   * capacity of the vector.
   *
   * @param index position of the new value
   * @param isSet 0 for NULL value, 1 otherwise
   * @param start start position of the value in the buffer
   * @param buffer buffer containing the value to be stored in the vector
   */
  public void setSafe(int index, int isSet, long start, ArrowBuf buffer) {
    handleSafe(index);
    set(index, isSet, start, buffer);
  }

  /*----------------------------------------------------------------*
   |                                                                |
   |                      vector transfer                           |
   |                                                                |
   *----------------------------------------------------------------*/


  /**
   * Construct a TransferPair comprising of this and a target vector of
   * the same type.
   *
   * @param ref name of the target vector
   * @param allocator allocator for the target vector
   * @return {@link TransferPair}
   */
  @Override
  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
    return new TransferImpl(ref, allocator);
  }

  /**
   * Construct a TransferPair with a desired target vector of the same type.
   *
   * @param to target vector
   * @return {@link TransferPair}
   */
  @Override
  public TransferPair makeTransferPair(ValueVector to) {
    return new TransferImpl((Decimal256Vector) to);
  }

  /** Transfer pair whose source is the enclosing vector. */
  private class TransferImpl implements TransferPair {
    Decimal256Vector to;

    /** Target is a fresh nullable vector with the source precision and scale. */
    public TransferImpl(String ref, BufferAllocator allocator) {
      to = new Decimal256Vector(ref, allocator, Decimal256Vector.this.precision,
              Decimal256Vector.this.scale);
    }

    /** Target is an existing vector supplied by the caller. */
    public TransferImpl(Decimal256Vector to) {
      this.to = to;
    }

    @Override
    public Decimal256Vector getTo() {
      return to;
    }

    @Override
    public void transfer() {
      transferTo(to);
    }

    @Override
    public void splitAndTransfer(int startIndex, int length) {
      splitAndTransferTo(startIndex, length, to);
    }

    @Override
    public void copyValueSafe(int fromIndex, int toIndex) {
      to.copyFromSafe(fromIndex, toIndex, Decimal256Vector.this);
    }
  }
}
a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java new file mode 100644 index 000000000..f988f4f94 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java @@ -0,0 +1,584 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import java.math.BigDecimal; +import java.nio.ByteOrder; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.DecimalReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.DecimalHolder; +import org.apache.arrow.vector.holders.NullableDecimalHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.DecimalUtility; +import org.apache.arrow.vector.util.TransferPair; + +import io.netty.util.internal.PlatformDependent; + +/** + * DecimalVector implements a fixed width vector (16 bytes) of + * decimal values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public final class DecimalVector extends BaseFixedWidthVector { + public static final byte TYPE_WIDTH = 16; + private static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; + private final FieldReader reader; + + private final int precision; + private final int scale; + + /** + * Instantiate a DecimalVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public DecimalVector(String name, BufferAllocator allocator, + int precision, int scale) { + this(name, FieldType.nullable(new ArrowType.Decimal(precision, scale, TYPE_WIDTH * 8)), allocator); + } + + /** + * Instantiate a DecimalVector. This doesn't allocate any memory for + * the data in vector. 
+ * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public DecimalVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a DecimalVector. This doesn't allocate any memory for + * the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public DecimalVector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + ArrowType.Decimal arrowType = (ArrowType.Decimal) field.getFieldType().getType(); + reader = new DecimalReaderImpl(DecimalVector.this); + this.precision = arrowType.getPrecision(); + this.scale = arrowType.getScale(); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.DECIMAL; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public ArrowBuf get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.slice((long) index * TYPE_WIDTH, TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. 
If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableDecimalHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.buffer = valueBuffer; + holder.precision = precision; + holder.scale = scale; + holder.start = (long) index * TYPE_WIDTH; + } + + /** + * Same as {@link #get(int)}. + * + * @param index position of element + * @return element at given index + */ + public BigDecimal getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return DecimalUtility.getBigDecimalFromArrowBuf(valueBuffer, index, scale, TYPE_WIDTH); + } + } + + /** + * Return precision for the decimal value. + */ + public int getPrecision() { + return precision; + } + + /** + * Return scale for the decimal value. + */ + public int getScale() { + return scale; + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param buffer ArrowBuf containing decimal value. + */ + public void set(int index, ArrowBuf buffer) { + BitVectorHelper.setBit(validityBuffer, index); + valueBuffer.setBytes((long) index * TYPE_WIDTH, buffer, 0, TYPE_WIDTH); + } + + /** + * Set the decimal element at given index to the provided array of bytes. + * Decimal is now implemented as Native Endian. This API allows the user + * to pass a decimal value in the form of byte array in BE byte order. + * + *

Consumers of Arrow code can use this API instead of first swapping + * the source bytes (doing a write and read) and then finally writing to + * ArrowBuf of decimal vector. + * + *

This method takes care of adding the necessary padding if the length + * of byte array is less then 16 (length of decimal type). + * + * @param index position of element + * @param value array of bytes containing decimal in big endian byte order. + */ + public void setBigEndian(int index, byte[] value) { + BitVectorHelper.setBit(validityBuffer, index); + final int length = value.length; + + // do the bound check. + valueBuffer.checkBytes((long) index * TYPE_WIDTH, (long) (index + 1) * TYPE_WIDTH); + + long outAddress = valueBuffer.memoryAddress() + (long) index * TYPE_WIDTH; + if (length == 0) { + PlatformDependent.setMemory(outAddress, DecimalVector.TYPE_WIDTH, (byte) 0); + return; + } + if (LITTLE_ENDIAN) { + // swap bytes to convert BE to LE + for (int byteIdx = 0; byteIdx < length; ++byteIdx) { + PlatformDependent.putByte(outAddress + byteIdx, value[length - 1 - byteIdx]); + } + + if (length == TYPE_WIDTH) { + return; + } + + if (length < TYPE_WIDTH) { + // sign extend + final byte pad = (byte) (value[0] < 0 ? 0xFF : 0x00); + PlatformDependent.setMemory(outAddress + length, DecimalVector.TYPE_WIDTH - length, pad); + return; + } + } else { + if (length <= TYPE_WIDTH) { + // copy data from value to outAddress + PlatformDependent.copyMemory(value, 0, outAddress + DecimalVector.TYPE_WIDTH - length, length); + // sign extend + final byte pad = (byte) (value[0] < 0 ? 0xFF : 0x00); + PlatformDependent.setMemory(outAddress, DecimalVector.TYPE_WIDTH - length, pad); + return; + } + } + throw new IllegalArgumentException( + "Invalid decimal value length. Valid length in [1 - 16], got " + length); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param start start index of data in the buffer + * @param buffer ArrowBuf containing decimal value. 
+ */ + public void set(int index, long start, ArrowBuf buffer) { + BitVectorHelper.setBit(validityBuffer, index); + valueBuffer.setBytes((long) index * TYPE_WIDTH, buffer, start, TYPE_WIDTH); + } + + /** + * Sets the element at given index using the buffer whose size maybe <= 16 bytes. + * @param index index to write the decimal to + * @param start start of value in the buffer + * @param buffer contains the decimal in native endian bytes + * @param length length of the value in the buffer + */ + public void setSafe(int index, long start, ArrowBuf buffer, int length) { + handleSafe(index); + BitVectorHelper.setBit(validityBuffer, index); + + // do the bound checks. + buffer.checkBytes(start, start + length); + valueBuffer.checkBytes((long) index * TYPE_WIDTH, (long) (index + 1) * TYPE_WIDTH); + + long inAddress = buffer.memoryAddress() + start; + long outAddress = valueBuffer.memoryAddress() + (long) index * TYPE_WIDTH; + if (LITTLE_ENDIAN) { + PlatformDependent.copyMemory(inAddress, outAddress, length); + // sign extend + if (length < TYPE_WIDTH) { + byte msb = PlatformDependent.getByte(inAddress + length - 1); + final byte pad = (byte) (msb < 0 ? 0xFF : 0x00); + PlatformDependent.setMemory(outAddress + length, DecimalVector.TYPE_WIDTH - length, pad); + } + } else { + PlatformDependent.copyMemory(inAddress, outAddress + DecimalVector.TYPE_WIDTH - length, length); + // sign extend + if (length < TYPE_WIDTH) { + byte msb = PlatformDependent.getByte(inAddress); + final byte pad = (byte) (msb < 0 ? 0xFF : 0x00); + PlatformDependent.setMemory(outAddress, DecimalVector.TYPE_WIDTH - length, pad); + } + } + } + + + /** + * Sets the element at given index using the buffer whose size maybe <= 16 bytes. 
+ * @param index index to write the decimal to + * @param start start of value in the buffer + * @param buffer contains the decimal in big endian bytes + * @param length length of the value in the buffer + */ + public void setBigEndianSafe(int index, long start, ArrowBuf buffer, int length) { + handleSafe(index); + BitVectorHelper.setBit(validityBuffer, index); + + // do the bound checks. + buffer.checkBytes(start, start + length); + valueBuffer.checkBytes((long) index * TYPE_WIDTH, (long) (index + 1) * TYPE_WIDTH); + + // not using buffer.getByte() to avoid boundary checks for every byte. + long inAddress = buffer.memoryAddress() + start; + long outAddress = valueBuffer.memoryAddress() + (long) index * TYPE_WIDTH; + if (LITTLE_ENDIAN) { + // swap bytes to convert BE to LE + for (int byteIdx = 0; byteIdx < length; ++byteIdx) { + byte val = PlatformDependent.getByte((inAddress + length - 1) - byteIdx); + PlatformDependent.putByte(outAddress + byteIdx, val); + } + // sign extend + if (length < TYPE_WIDTH) { + byte msb = PlatformDependent.getByte(inAddress); + final byte pad = (byte) (msb < 0 ? 0xFF : 0x00); + PlatformDependent.setMemory(outAddress + length, DecimalVector.TYPE_WIDTH - length, pad); + } + } else { + PlatformDependent.copyMemory(inAddress, outAddress + DecimalVector.TYPE_WIDTH - length, length); + // sign extend + if (length < TYPE_WIDTH) { + byte msb = PlatformDependent.getByte(inAddress); + final byte pad = (byte) (msb < 0 ? 0xFF : 0x00); + PlatformDependent.setMemory(outAddress, DecimalVector.TYPE_WIDTH - length, pad); + } + } + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value BigDecimal containing decimal value. 
+ */ + public void set(int index, BigDecimal value) { + BitVectorHelper.setBit(validityBuffer, index); + DecimalUtility.checkPrecisionAndScale(value, precision, scale); + DecimalUtility.writeBigDecimalToArrowBuf(value, valueBuffer, index, TYPE_WIDTH); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value long value. + */ + public void set(int index, long value) { + BitVectorHelper.setBit(validityBuffer, index); + DecimalUtility.writeLongToArrowBuf(value, valueBuffer, index, TYPE_WIDTH); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, element in the + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableDecimalHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + valueBuffer.setBytes((long) index * TYPE_WIDTH, holder.buffer, holder.start, TYPE_WIDTH); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, DecimalHolder holder) { + BitVectorHelper.setBit(validityBuffer, index); + valueBuffer.setBytes((long) index * TYPE_WIDTH, holder.buffer, holder.start, TYPE_WIDTH); + } + + /** + * Same as {@link #set(int, ArrowBuf)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param buffer ArrowBuf containing decimal value. 
+ */ + public void setSafe(int index, ArrowBuf buffer) { + handleSafe(index); + set(index, buffer); + } + + /** + * Same as {@link #setBigEndian(int, byte[])} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + */ + public void setBigEndianSafe(int index, byte[] value) { + handleSafe(index); + setBigEndian(index, value); + } + + /** + * Same as {@link #set(int, long, ArrowBuf)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param start start index of data in the buffer + * @param buffer ArrowBuf containing decimal value. + */ + public void setSafe(int index, long start, ArrowBuf buffer) { + handleSafe(index); + set(index, start, buffer); + } + + /** + * Same as {@link #set(int, BigDecimal)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value BigDecimal containing decimal value. + */ + public void setSafe(int index, BigDecimal value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, long)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value long value. + */ + public void setSafe(int index, long value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableDecimalHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableDecimalHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, DecimalHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, DecimalHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param start start position of the value in the buffer + * @param buffer buffer containing the value to be stored in the vector + */ + public void set(int index, int isSet, long start, ArrowBuf buffer) { + if (isSet > 0) { + set(index, start, buffer); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, long, ArrowBuf)} except that it handles + * the case when the position of new value is beyond the current value + * capacity of the vector. 
+ * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param start start position of the value in the buffer + * @param buffer buffer containing the value to be stored in the vector + */ + public void setSafe(int index, int isSet, long start, ArrowBuf buffer) { + handleSafe(index); + set(index, isSet, start, buffer); + } + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((DecimalVector) to); + } + + private class TransferImpl implements TransferPair { + DecimalVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new DecimalVector(ref, allocator, DecimalVector.this.precision, + DecimalVector.this.scale); + } + + public TransferImpl(DecimalVector to) { + this.to = to; + } + + @Override + public DecimalVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, DecimalVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java 
b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java new file mode 100644 index 000000000..c16db40f7 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + *

For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + *

For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are + * null (no lists) or empty lists. This helps in tightly controlling + * the memory we provision for inner data vector. + * + *

Similar analogy is applicable for VarCharVector where the capacity + * of the data buffer can be controlled using density multiplier + * instead of default multiplier of 8 (default size of average + * varchar length). + * + *

Also from container vectors, we propagate the density down + * the inner vectors so that they can use it appropriately. + */ +public interface DensityAwareVector { + + /** + * Set value with density. + * + * @param valueCount the number of values in this vector + * @param density the density of the vector + */ + void setInitialCapacity(int valueCount, double density); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DurationVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DurationVector.java new file mode 100644 index 000000000..9671b34e0 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/DurationVector.java @@ -0,0 +1,406 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static java.util.concurrent.TimeUnit.MICROSECONDS; +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import java.time.Duration; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.DurationReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.DurationHolder; +import org.apache.arrow.vector.holders.NullableDurationHolder; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * DurationVector implements a fixed width vector (8 bytes) of + * a configurable TimeUnit granularity duration values which could be null. + * A validity buffer (bit vector) is maintained to track which elements in the + * vector are null. + */ +public final class DurationVector extends BaseFixedWidthVector { + public static final byte TYPE_WIDTH = 8; + private final FieldReader reader; + + private final TimeUnit unit; + + /** + * Instantiate a DurationVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public DurationVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a DurationVector. This doesn't allocate any memory for + * the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public DurationVector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new DurationReaderImpl(DurationVector.this); + this.unit = ((ArrowType.Duration) field.getFieldType().getType()).getUnit(); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.DURATION; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static long get(final ArrowBuf buffer, final int index) { + return buffer.getLong((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public ArrowBuf get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + return null; + } + return valueBuffer.slice((long) index * TYPE_WIDTH, TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableDurationHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = get(valueBuffer, index); + } + + /** + * Same as {@link #get(int)}. + * + * @param index position of element + * @return element at given index + */ + public Duration getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + final long value = get(valueBuffer, index); + return toDuration(value, unit); + } + } + + /** + * Converts the given value and unit to the appropriate {@link Duration}. + */ + public static Duration toDuration(long value, TimeUnit unit) { + switch (unit) { + case SECOND: + return Duration.ofSeconds(value); + case MILLISECOND: + return Duration.ofMillis(value); + case NANOSECOND: + return Duration.ofNanos(value); + case MICROSECOND: + return Duration.ofNanos(MICROSECONDS.toNanos(value)); + default: + throw new IllegalArgumentException("Unknown timeunit: " + unit); + } + } + + /** + * Get the Interval value at a given index as a {@link StringBuilder} object. 
+ * + * @param index position of the element + * @return String Builder object with Interval in java.time.Duration format. + */ + public StringBuilder getAsStringBuilder(int index) { + if (isSet(index) == 0) { + return null; + } else { + return getAsStringBuilderHelper(index); + } + } + + private StringBuilder getAsStringBuilderHelper(int index) { + return new StringBuilder(getObject(index).toString()); + } + + /** + * Gets the time unit of the duration. + */ + public TimeUnit getUnit() { + return unit; + } + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, ArrowBuf value) { + BitVectorHelper.setBit(validityBuffer, index); + valueBuffer.setBytes((long) index * TYPE_WIDTH, value, 0, TYPE_WIDTH); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value The duration value (in the timeunit associated with this vector) + */ + public void set(int index, long value) { + final long offsetIndex = (long) index * TYPE_WIDTH; + BitVectorHelper.setBit(validityBuffer, index); + valueBuffer.setLong(offsetIndex, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, element in the + * at the given index will be null. 
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableDurationHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + set(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, DurationHolder holder) { + set(index, holder.value); + } + + /** + * Same as {@link #set(int, ArrowBuf)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, ArrowBuf value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, long)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value duration in the time unit this vector was constructed with + */ + public void setSafe(int index, long value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableDurationHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableDurationHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, DurationHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, DurationHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value The duration value (in the TimeUnit associated with this vector). + */ + public void set(int index, int isSet, long value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, long)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value The duration value (in the timeunit associated with this vector) + */ + public void setSafe(int index, int isSet, long value) { + handleSafe(index); + set(index, isSet, value); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. 
+ * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((DurationVector) to); + } + + private class TransferImpl implements TransferPair { + DurationVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new DurationVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(DurationVector to) { + this.to = to; + } + + @Override + public DurationVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, DurationVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ElementAddressableVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ElementAddressableVector.java new file mode 100644 index 000000000..f37a50100 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ElementAddressableVector.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.util.ArrowBufPointer; + +/** + * Vector for which each data element resides in a continuous memory region, + * so it can be pointed to by an {@link org.apache.arrow.memory.util.ArrowBufPointer}. + */ +public interface ElementAddressableVector extends ValueVector { + + /** + * Gets the pointer for the data at the given index. + * @param index the index for the data. + * @return the pointer to the data. + */ + ArrowBufPointer getDataPointer(int index); + + /** + * Gets the pointer for the data at the given index. + * @param index the index for the data. + * @param reuse the data pointer to fill, this avoids creating a new pointer object. + * @return the pointer to the data, it should be the same one as the input parameter + */ + ArrowBufPointer getDataPointer(int index, ArrowBufPointer reuse); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java new file mode 100644 index 000000000..2041227fc --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import java.util.Iterator; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.TransferPair; + +/** + * A vector that wraps an underlying vector, used to help implement extension types. + * Allocation, buffer access, value counts and transfer pairs are all delegated to the wrapped vector. + * @param <T> The wrapped vector type. + */ +public abstract class ExtensionTypeVector<T extends ValueVector & FieldVector> extends BaseValueVector implements + FieldVector { + + private final T underlyingVector; + private final String name; + + /** + * Instantiate an extension type vector. 
+ * @param name name of the vector + * @param allocator allocator for memory management + * @param underlyingVector underlying field vector + */ + public ExtensionTypeVector(String name, BufferAllocator allocator, T underlyingVector) { + super(allocator); + Preconditions.checkNotNull(underlyingVector, "underlyingVector can not be null."); + this.name = name; + this.underlyingVector = underlyingVector; + } + + /** + * Instantiate an extension type vector. + * @param field field materialized by this vector. + * @param allocator allocator for memory management + * @param underlyingVector underlying field vector + */ + public ExtensionTypeVector(Field field, BufferAllocator allocator, T underlyingVector) { + this(field.getName(), allocator, underlyingVector); + } + + @Override + public String getName() { + return name; + } + + /** Get the underlying vector. */ + public T getUnderlyingVector() { + return underlyingVector; + } + + @Override + public void allocateNew() throws OutOfMemoryException { + this.underlyingVector.allocateNew(); + } + + @Override + public boolean allocateNewSafe() { + return this.underlyingVector.allocateNewSafe(); + } + + @Override + public void reAlloc() { + this.underlyingVector.reAlloc(); + } + + @Override + public void setInitialCapacity(int numRecords) { + this.underlyingVector.setInitialCapacity(numRecords); + } + + @Override + public int getValueCapacity() { + return this.underlyingVector.getValueCapacity(); + } + + @Override + public void reset() { + this.underlyingVector.reset(); + } + + @Override + public Field getField() { + return this.underlyingVector.getField(); + } + + @Override + public MinorType getMinorType() { + return MinorType.EXTENSIONTYPE; + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return underlyingVector.getTransferPair(ref, allocator); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return 
underlyingVector.getTransferPair(ref, allocator, callBack); + } + + @Override + public TransferPair makeTransferPair(ValueVector target) { + return underlyingVector.makeTransferPair(target); + } + + @Override + public FieldReader getReader() { + return underlyingVector.getReader(); + } + + @Override + public int getBufferSize() { + return underlyingVector.getBufferSize(); + } + + @Override + public int getBufferSizeFor(int valueCount) { + return underlyingVector.getBufferSizeFor(valueCount); + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + return underlyingVector.getBuffers(clear); + } + + @Override + public ArrowBuf getValidityBuffer() { + return underlyingVector.getValidityBuffer(); + } + + @Override + public ArrowBuf getDataBuffer() { + return underlyingVector.getDataBuffer(); + } + + @Override + public ArrowBuf getOffsetBuffer() { + return underlyingVector.getOffsetBuffer(); + } + + @Override + public int getValueCount() { + return underlyingVector.getValueCount(); + } + + @Override + public void setValueCount(int valueCount) { + underlyingVector.setValueCount(valueCount); + } + + /** + * Get the extension object at the specified index. + * + *

<p>Generally, this should access the underlying vector and construct the corresponding Java object from the raw + * data. + */ + @Override + public abstract Object getObject(int index); + + @Override + public int getNullCount() { + return underlyingVector.getNullCount(); + } + + @Override + public boolean isNull(int index) { + return underlyingVector.isNull(index); + } + + @Override + public void initializeChildrenFromFields(List<Field> children) { + underlyingVector.initializeChildrenFromFields(children); + } + + @Override + public List<FieldVector> getChildrenFromFields() { + return underlyingVector.getChildrenFromFields(); + } + + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) { + underlyingVector.loadFieldBuffers(fieldNode, ownBuffers); + } + + @Override + public List<ArrowBuf> getFieldBuffers() { + return underlyingVector.getFieldBuffers(); + } + + /** + * Get the inner vectors. + * + * @deprecated This API will be removed as the current implementations no longer support inner vectors. 
+ * + * @return the inner vectors for this field as defined by the TypeLayout + */ + @Deprecated + @Override + public List<BufferBacked> getFieldInnerVectors() { + return underlyingVector.getFieldInnerVectors(); + } + + @Override + public long getValidityBufferAddress() { + return underlyingVector.getValidityBufferAddress(); + } + + @Override + public long getDataBufferAddress() { + return underlyingVector.getDataBufferAddress(); + } + + @Override + public long getOffsetBufferAddress() { + return underlyingVector.getOffsetBufferAddress(); + } + + @Override + public void clear() { + underlyingVector.clear(); + } + + @Override + public void close() { + underlyingVector.close(); + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return underlyingVector.getTransferPair(allocator); + } + + @Override + public Iterator<ValueVector> iterator() { + return underlyingVector.iterator(); + } + + @Override + public BufferAllocator getAllocator() { + return underlyingVector.getAllocator(); + } + + /** + * Dispatch to the visitor, passing this extension vector itself rather than the wrapped vector. + * + * @param visitor the visitor to invoke + * @param value the argument forwarded to the visitor + * @return whatever the visitor returns + */ + @Override + public <OUT, IN> OUT accept(VectorVisitor<OUT, IN> visitor, IN value) { + return visitor.visit(this, value); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java new file mode 100644 index 000000000..b00581a04 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.pojo.Field; + +/** + * A vector corresponding to a Field in the schema. + * It has inner vectors backed by buffers (validity, offsets, data, ...) + */ +public interface FieldVector extends ValueVector { + + /** + * Initializes the child vectors + * to be later loaded with loadBuffers. + * + * @param children the schema + */ + void initializeChildrenFromFields(List<Field> children); + + /** + * The returned list is the same size as the list passed to initializeChildrenFromFields. + * + * @return the children according to schema (empty for primitive types) + */ + List<FieldVector> getChildrenFromFields(); + + /** + * Loads data in the vectors. + * (ownBuffers must be the same size as getFieldVectors()) + * + * @param fieldNode the fieldNode + * @param ownBuffers the buffers for this Field (own buffers only, children not included) + */ + void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers); + + /** + * Get the buffers of the fields, (same size as getFieldVectors() since it is their content). + * + * @return the buffers containing the data for this vector (ready for reading) + */ + List<ArrowBuf> getFieldBuffers(); + + /** + * Get the inner vectors. + * + * @deprecated This API will be removed as the current implementations no longer support inner vectors. 
+ * + * @return the inner vectors for this field as defined by the TypeLayout + */ + @Deprecated + List<BufferBacked> getFieldInnerVectors(); + + /** + * Gets the starting address of the underlying buffer associated with validity vector. + * + * @return buffer address + */ + long getValidityBufferAddress(); + + /** + * Gets the starting address of the underlying buffer associated with data vector. + * + * @return buffer address + */ + long getDataBufferAddress(); + + /** + * Gets the starting address of the underlying buffer associated with offset vector. + * + * @return buffer address + */ + long getOffsetBufferAddress(); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java new file mode 100644 index 000000000..e1847e4bb --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java @@ -0,0 +1,386 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.complex.impl.FixedSizeBinaryReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.FixedSizeBinaryHolder; +import org.apache.arrow.vector.holders.NullableFixedSizeBinaryHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * FixedSizeBinaryVector implements a fixed width vector of + * binary values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class FixedSizeBinaryVector extends BaseFixedWidthVector { + private final int byteWidth; + private final FieldReader reader; + + /** + * Instantiate a FixedSizeBinaryVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + * @param byteWidth byte width of the binary values + */ + public FixedSizeBinaryVector(String name, BufferAllocator allocator, int byteWidth) { + this(name, FieldType.nullable(new FixedSizeBinary(byteWidth)), allocator); + } + + /** + * Instantiate a FixedSizeBinaryVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public FixedSizeBinaryVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a FixedSizeBinaryVector. This doesn't allocate any memory for + * the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public FixedSizeBinaryVector(Field field, BufferAllocator allocator) { + super(field, allocator, ((FixedSizeBinary) field.getFieldType().getType()).getByteWidth()); + reader = new FixedSizeBinaryReaderImpl(FixedSizeBinaryVector.this); + byteWidth = ((FixedSizeBinary) field.getFieldType().getType()).getByteWidth(); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.FIXEDSIZEBINARY; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public byte[] get(int index) { + assert index >= 0; + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + return null; + } + final byte[] dst = new byte[byteWidth]; + valueBuffer.getBytes((long) index * byteWidth, dst, 0, byteWidth); + return dst; + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. 
+ * + * @param index position of element + * @param holder nullable holder to carry the buffer + */ + public void get(int index, NullableFixedSizeBinaryHolder holder) { + assert index >= 0; + /* Always publish the width: set(int, NullableFixedSizeBinaryHolder) asserts on it before it looks at isSet. */ + holder.byteWidth = byteWidth; + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.buffer = valueBuffer.slice((long) index * byteWidth, byteWidth); + } + + /** + * Same as {@link #get(int)}. + * + * @param index position of element + * @return element at given index + */ + @Override + public byte[] getObject(int index) { + return get(index); + } + + /** + * Get the width, in bytes, of every value stored in this vector. + * + * @return byte width of the binary values + */ + public int getByteWidth() { + return byteWidth; + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + /** Sets the value at index to the provided one. */ + public void set(int index, byte[] value) { + assert index >= 0; + Preconditions.checkNotNull(value, "expecting a valid byte array"); + assert byteWidth <= value.length; + BitVectorHelper.setBit(validityBuffer, index); + valueBuffer.setBytes((long) index * byteWidth, value, 0, byteWidth); + } + + /** + * Same as {@link #set(int, byte[])} but reallocates if index + * is larger than capacity. + */ + public void setSafe(int index, byte[] value) { + handleSafe(index); + set(index, value); + } + + /** + * Sets the value if isSet is positive, otherwise sets the index to null/invalid. + */ + public void set(int index, int isSet, byte[] value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, byte[])} but calls {@code handleSafe(index)} first, like the other setSafe variants. 
+ */ + public void setSafe(int index, int isSet, byte[] value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param buffer ArrowBuf containing binary value. 
+ */ + public void set(int index, ArrowBuf buffer) { + assert index >= 0; + assert byteWidth <= buffer.capacity(); + BitVectorHelper.setBit(validityBuffer, index); + valueBuffer.setBytes((long) index * byteWidth, buffer, 0, byteWidth); + } + + /** + * Same as {@link #set(int, ArrowBuf)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param buffer ArrowBuf containing binary value. + */ + public void setSafe(int index, ArrowBuf buffer) { + handleSafe(index); + set(index, buffer); + } + + /** + * Set the element at the given index to the given value if isSet is + * positive, otherwise mark the element at the given index as null. + * + * @param index position of element + * @param isSet positive to store the value, zero or negative to mark the element as null + * @param buffer ArrowBuf containing binary value. + */ + public void set(int index, int isSet, ArrowBuf buffer) { + if (isSet > 0) { + set(index, buffer); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, ArrowBuf)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param isSet positive to store the value, zero or negative to mark the element as null + * @param buffer ArrowBuf containing binary value. + */ + public void setSafe(int index, int isSet, ArrowBuf buffer) { + handleSafe(index); + set(index, isSet, buffer); + } + + /** + * Set the fixed size element at the specified index to the data + * buffer supplied in the holder. The holder's byteWidth is asserted + * to equal the byte width of this vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. 
+ */ + public void set(int index, FixedSizeBinaryHolder holder) { + assert holder.byteWidth == byteWidth; + set(index, holder.buffer); + } + + /** + * Same as {@link #set(int, FixedSizeBinaryHolder)} except that it handles the + * case where index is beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. 
+ */ + public void setSafe(int index, FixedSizeBinaryHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the fixed size element at the specified index to the data + * buffer supplied in the holder, or mark the element as null when + * holder.isSet is zero. The holder's byteWidth is asserted to equal + * the byte width of this vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + * @throws IllegalArgumentException if holder.isSet is negative + */ + public void set(int index, NullableFixedSizeBinaryHolder holder) { + assert holder.byteWidth == byteWidth; + if (holder.isSet < 0) { + throw new IllegalArgumentException("holder has a negative isSet value"); + } else if (holder.isSet > 0) { + set(index, holder.buffer); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, NullableFixedSizeBinaryHolder)} except that it handles the + * case where index is beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + * @throws IllegalArgumentException if holder.isSet is negative + */ + public void setSafe(int index, NullableFixedSizeBinaryHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static byte[] get(final ArrowBuf buffer, final int index, final int byteWidth) { + final byte[] dst = new byte[byteWidth]; + buffer.getBytes((long) index * byteWidth, dst, 0, byteWidth); + return dst; + } + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. 
+ * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((FixedSizeBinaryVector) to); + } + + private class TransferImpl implements TransferPair { + FixedSizeBinaryVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new FixedSizeBinaryVector(ref, allocator, FixedSizeBinaryVector.this.byteWidth); + } + + public TransferImpl(FixedSizeBinaryVector to) { + this.to = to; + } + + @Override + public FixedSizeBinaryVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, FixedSizeBinaryVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java new file mode 100644 index 000000000..58effeecb --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Interface for all fixed width {@link ElementAddressableVector} (e.g. integer, fixed size binary, etc). + */ +public interface FixedWidthVector extends ElementAddressableVector { + + /** + * Allocate a new memory space for this vector. Must be called prior to using the ValueVector. + * + * @param valueCount Number of values in the vector. + */ + void allocateNew(int valueCount); + + /** + * Zero out the underlying buffer backing this vector. + */ + void zeroVector(); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/Float4Vector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/Float4Vector.java new file mode 100644 index 000000000..365a1529b --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/Float4Vector.java @@ -0,0 +1,361 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.Float4ReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.Float4Holder; +import org.apache.arrow.vector.holders.NullableFloat4Holder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * Float4Vector implements a fixed width vector (4 bytes) of + * float values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public final class Float4Vector extends BaseFixedWidthVector implements FloatingPointVector { + public static final byte TYPE_WIDTH = 4; + private final FieldReader reader; + + /** + * Instantiate a Float4Vector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public Float4Vector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.FLOAT4.getType()), allocator); + } + + /** + * Instantiate a Float4Vector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public Float4Vector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a Float4Vector. This doesn't allocate any memory for + * the data in vector. 
+ * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public Float4Vector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new Float4ReaderImpl(Float4Vector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.FLOAT4; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public float get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getFloat((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableFloat4Holder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getFloat((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + @Override + public Float getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getFloat((long) index * TYPE_WIDTH); + } + } + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + private void setValue(int index, float value) { + valueBuffer.setFloat((long) index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, float value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + * @throws IllegalArgumentException if holder.isSet is negative + */ + public void set(int index, NullableFloat4Holder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException("holder has a negative isSet value"); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. 
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, Float4Holder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, float)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, float value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableFloat4Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableFloat4Holder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, Float4Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, Float4Holder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, float value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, float)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, float value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static float get(final ArrowBuf buffer, final int index) { + return buffer.getFloat((long) index * TYPE_WIDTH); + } + + @Override + public void setWithPossibleTruncate(int index, double value) { + set(index, (float) value); + } + + @Override + public void setSafeWithPossibleTruncate(int index, double value) { + setSafe(index, (float) value); + } + + @Override + public double getValueAsDouble(int index) { + return get(index); + } + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. 
+ * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((Float4Vector) to); + } + + private class TransferImpl implements TransferPair { + Float4Vector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new Float4Vector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(Float4Vector to) { + this.to = to; + } + + @Override + public Float4Vector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, Float4Vector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/Float8Vector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/Float8Vector.java new file mode 100644 index 000000000..948390d46 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/Float8Vector.java @@ -0,0 +1,362 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.Float8ReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.Float8Holder; +import org.apache.arrow.vector.holders.NullableFloat8Holder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * Float8Vector implements a fixed width vector (8 bytes) of + * double values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public final class Float8Vector extends BaseFixedWidthVector implements FloatingPointVector { + public static final byte TYPE_WIDTH = 8; + private final FieldReader reader; + + /** + * Instantiate a Float8Vector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public Float8Vector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.FLOAT8.getType()), allocator); + } + + /** + * Instantiate a Float8Vector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public Float8Vector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a Float8Vector. This doesn't allocate any memory for + * the data in vector.
+ * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public Float8Vector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new Float8ReaderImpl(Float8Vector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.FLOAT8; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public double get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getDouble((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and set the state + * in holder. If the element is null, holder.isSet will be zero. + * + * @param index position of element + * @param holder nullable holder that receives the isSet flag and value + */ + public void get(int index, NullableFloat8Holder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getDouble((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}, except that a null element is returned as null.
+ * + * @param index position of element + * @return element at given index, or null if the element is not set + */ + public Double getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getDouble((long) index * TYPE_WIDTH); + } + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + private void setValue(int index, double value) { + valueBuffer.setDouble((long) index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, double value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableFloat8Holder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, Float8Holder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, double)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, double value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableFloat8Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableFloat8Holder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, Float8Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, Float8Holder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 (or negative) for NULL value, positive otherwise + * @param value element value + */ + public void set(int index, int isSet, double value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, double)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet 0 (or negative) for NULL value, positive otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, double value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

<p>This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static double get(final ArrowBuf buffer, final int index) { + return buffer.getDouble((long) index * TYPE_WIDTH); + } + + @Override + public void setWithPossibleTruncate(int index, double value) { + set(index, value); + } + + @Override + public void setSafeWithPossibleTruncate(int index, double value) { + setSafe(index, value); + } + + @Override + public double getValueAsDouble(int index) { + return get(index); + } + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair comprising this vector and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type.
+ * + * @param to target vector, which must be a Float8Vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((Float8Vector) to); + } + + private class TransferImpl implements TransferPair { + Float8Vector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new Float8Vector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(Float8Vector to) { + this.to = to; + } + + @Override + public Float8Vector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, Float8Vector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FloatingPointVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FloatingPointVector.java new file mode 100644 index 000000000..4c5143de6 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/FloatingPointVector.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * The interface for vectors with floating point values. + */ +public interface FloatingPointVector extends ValueVector { + + /** + * Sets the value at the given index; note that the value may be truncated internally. + * @param index the index to set. + * @param value the value to set. + */ + void setWithPossibleTruncate(int index, double value); + + /** + * Sets the value at the given index; note that the value may be truncated internally. + * Any expansion/reallocation is handled automatically. + * @param index the index to set. + * @param value the value to set. + */ + void setSafeWithPossibleTruncate(int index, double value); + + /** + * Gets the value at the given index as a double. + * @param index the index of the value to retrieve. + * @return the value at the index. + */ + double getValueAsDouble(int index); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java new file mode 100644 index 000000000..3da915541 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java @@ -0,0 +1,337 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import java.math.BigDecimal; +import java.nio.charset.Charset; + +/** + * Helper class to generate test data for Nullable fixed and variable + * width scalar vectors. Previous implementations of java vector classes + * provided a generateTestData (now deprecated) API to populate the vector + * with sample data. This class should be used for that purpose. + */ +public class GenerateSampleData { + private GenerateSampleData() {} + + /** Populates vector with valueCount alternating sample values; unsupported vector types are ignored. */ + public static void generateTestData(final ValueVector vector, final int valueCount) { + if (vector instanceof IntVector) { + writeIntData((IntVector) vector, valueCount); + } else if (vector instanceof DecimalVector) { + writeDecimalData((DecimalVector) vector, valueCount); + } else if (vector instanceof BitVector) { + writeBooleanData((BitVector) vector, valueCount); + } else if (vector instanceof VarCharVector) { + writeVarCharData((VarCharVector) vector, valueCount); + } else if (vector instanceof VarBinaryVector) { + writeVarBinaryData((VarBinaryVector) vector, valueCount); + } else if (vector instanceof BigIntVector) { + writeBigIntData((BigIntVector) vector, valueCount); + } else if (vector instanceof Float4Vector) { + writeFloatData((Float4Vector) vector, valueCount); + } else if (vector instanceof Float8Vector) { + writeDoubleData((Float8Vector) vector, valueCount); + } else if (vector instanceof DateDayVector) { + writeDateDayData((DateDayVector) vector, valueCount); + } else if (vector instanceof DateMilliVector) { +
writeDateMilliData((DateMilliVector) vector, valueCount); + } else if (vector instanceof IntervalDayVector) { + writeIntervalDayData((IntervalDayVector) vector, valueCount); + } else if (vector instanceof IntervalYearVector) { + writeIntervalYearData((IntervalYearVector) vector, valueCount); + } else if (vector instanceof SmallIntVector) { + writeSmallIntData((SmallIntVector) vector, valueCount); + } else if (vector instanceof TinyIntVector) { + writeTinyIntData((TinyIntVector) vector, valueCount); + } else if (vector instanceof TimeMicroVector) { + writeTimeMicroData((TimeMicroVector) vector, valueCount); + } else if (vector instanceof TimeMilliVector) { + writeTimeMilliData((TimeMilliVector) vector, valueCount); + } else if (vector instanceof TimeNanoVector) { + writeTimeNanoData((TimeNanoVector) vector, valueCount); + } else if (vector instanceof TimeSecVector) { + writeTimeSecData((TimeSecVector) vector, valueCount); + } else if (vector instanceof TimeStampSecVector) { + writeTimeStampData((TimeStampSecVector) vector, valueCount); + } else if (vector instanceof TimeStampMicroVector) { + writeTimeStampData((TimeStampMicroVector) vector, valueCount); + } else if (vector instanceof TimeStampMilliVector) { + writeTimeStampData((TimeStampMilliVector) vector, valueCount); + } else if (vector instanceof TimeStampNanoVector) { + writeTimeStampData((TimeStampNanoVector) vector, valueCount); + } else if (vector instanceof TimeStampSecTZVector) { + writeTimeStampData((TimeStampSecTZVector) vector, valueCount); + } else if (vector instanceof TimeStampMicroTZVector) { + writeTimeStampData((TimeStampMicroTZVector) vector, valueCount); + } else if (vector instanceof TimeStampMilliTZVector) { + writeTimeStampData((TimeStampMilliTZVector) vector, valueCount); + } else if (vector instanceof TimeStampNanoTZVector) { + writeTimeStampData((TimeStampNanoTZVector) vector, valueCount); + } + } + + private static void writeTimeStampData(TimeStampVector vector, int valueCount) { + final
long even = 100000; + final long odd = 200000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeDecimalData(DecimalVector vector, int valueCount) { + final BigDecimal even = new BigDecimal("0.0543278923"); /* String ctor: exact decimal value, scale 10 */ + final BigDecimal odd = new BigDecimal("2.0543278923"); + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeIntData(IntVector vector, int valueCount) { + final int even = 1000; + final int odd = 2000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeBooleanData(BitVector vector, int valueCount) { + final int even = 0; + final int odd = 1; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeIntervalYearData(IntervalYearVector vector, int valueCount) { + final int even = 1; + final int odd = 2; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeIntervalDayData(IntervalDayVector vector, int valueCount) { + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, 1, 50); + } else { + vector.setSafe(i, 2, 100); + } + } + vector.setValueCount(valueCount); + } + + private static void writeTimeSecData(TimeSecVector vector, int valueCount) { + final int even = 500; + final int odd = 900; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd);
+ } + } + vector.setValueCount(valueCount); + } + + private static void writeTimeMilliData(TimeMilliVector vector, int valueCount) { + final int even = 1000; + final int odd = 2000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeTimeMicroData(TimeMicroVector vector, int valueCount) { + final long even = 1000000000; + final long odd = 2000000000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + + } + + private static void writeTimeNanoData(TimeNanoVector vector, int valueCount) { + final long even = 1000000000; + final long odd = 2000000000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeDateDayData(DateDayVector vector, int valueCount) { + final int even = 1000; + final int odd = 2000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeDateMilliData(DateMilliVector vector, int valueCount) { + final long even = 1000000000; + final long odd = 2000000000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeSmallIntData(SmallIntVector vector, int valueCount) { + final short even = 10; + final short odd = 20; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeTinyIntData(TinyIntVector vector, int
valueCount) { + final byte even = 1; + final byte odd = 2; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeBigIntData(BigIntVector vector, int valueCount) { + final long even = 1000000000; + final long odd = 2000000000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeFloatData(Float4Vector vector, int valueCount) { + final float even = 20.3f; + final float odd = 40.2f; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeDoubleData(Float8Vector vector, int valueCount) { + final double even = 20.2373; + final double odd = 40.2378; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeVarBinaryData(VarBinaryVector vector, int valueCount) { + final Charset utf8Charset = java.nio.charset.StandardCharsets.UTF_8; + final byte[] even = "AAAAA1".getBytes(utf8Charset); + final byte[] odd = "BBBBBBBBB2".getBytes(utf8Charset); + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeVarCharData(VarCharVector vector, int valueCount) { + final Charset utf8Charset = java.nio.charset.StandardCharsets.UTF_8; + final byte[] even = "AAAAA1".getBytes(utf8Charset); + final byte[] odd = "BBBBBBBBB2".getBytes(utf8Charset); + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + }
+} + + diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntVector.java new file mode 100644 index 000000000..e591ec1e8 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntVector.java @@ -0,0 +1,362 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.IntReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.IntHolder; +import org.apache.arrow.vector.holders.NullableIntHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * IntVector implements a fixed width (4 bytes) vector of + * integer values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null.
+ */ +public final class IntVector extends BaseFixedWidthVector implements BaseIntVector { + public static final byte TYPE_WIDTH = 4; + private final FieldReader reader; + + /** + * Instantiate a IntVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public IntVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.INT.getType()), allocator); + } + + /** + * Instantiate a IntVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public IntVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a IntVector. This doesn't allocate any memory for + * the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public IntVector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new IntReaderImpl(IntVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.INT; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the element at the given index from the vector. 
+ * + * @param index position of element + * @return element at given index + */ + public int get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getInt((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableIntHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getInt((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. + * + * @param index position of element + * @return element at given index + */ + public Integer getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getInt((long) index * TYPE_WIDTH); + } + } + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + private void setValue(int index, int value) { + valueBuffer.setInt((long) index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, element in the + * at the given index will be null. 
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableIntHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, IntHolder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableIntHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableIntHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, IntHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, IntHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Store the given value at a particular position in the vector. 
isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, int value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, int)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, int value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static int get(final ArrowBuf buffer, final int index) { + return buffer.getInt((long) index * TYPE_WIDTH); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((IntVector) to); + } + + @Override + public void setWithPossibleTruncate(int index, long value) { + this.setSafe(index, (int) value); + } + + @Override + public void setUnsafeWithPossibleTruncate(int index, long value) { + this.set(index, (int) value); + } + + @Override + public long getValueAsLong(int index) { + return this.get(index); + } + + private class TransferImpl implements TransferPair { + IntVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new IntVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(IntVector to) { + this.to = to; + } + + @Override + public IntVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int 
toIndex) { + to.copyFromSafe(fromIndex, toIndex, IntVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntervalDayVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntervalDayVector.java new file mode 100644 index 000000000..0dc860e6b --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntervalDayVector.java @@ -0,0 +1,433 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import java.time.Duration; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.IntervalDayReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.IntervalDayHolder; +import org.apache.arrow.vector.holders.NullableIntervalDayHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * IntervalDayVector implements a fixed width vector (8 bytes) of + * interval (days and milliseconds) values which could be null. + * A validity buffer (bit vector) is maintained to track which elements in the + * vector are null. + */ +public final class IntervalDayVector extends BaseFixedWidthVector { + public static final byte TYPE_WIDTH = 8; + private static final byte MILLISECOND_OFFSET = 4; + private final FieldReader reader; + + /** + * Instantiate a IntervalDayVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public IntervalDayVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.INTERVALDAY.getType()), allocator); + } + + /** + * Instantiate a IntervalDayVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public IntervalDayVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a IntervalDayVector. 
This doesn't allocate any memory for + * the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public IntervalDayVector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new IntervalDayReaderImpl(IntervalDayVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.INTERVALDAY; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + /** + * Given a data buffer, get the number of days stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return day value stored at the index. + */ + public static int getDays(final ArrowBuf buffer, final int index) { + return buffer.getInt((long) index * TYPE_WIDTH); + } + + /** + * Given a data buffer, get the number of milliseconds stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return milliseconds value stored at the index. + */ + public static int getMilliseconds(final ArrowBuf buffer, final int index) { + return buffer.getInt((long) index * TYPE_WIDTH + MILLISECOND_OFFSET); + } + + /** + * Get the element at the given index from the vector as a slice of the value buffer. + * + * @param index position of element + * @return slice of the value buffer, {@code TYPE_WIDTH} bytes long, starting at the element, + * or {@code null} when null checking is enabled and the element is null + */ + public ArrowBuf get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + return null; + } + return valueBuffer.slice((long) index * TYPE_WIDTH, TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + * @param holder holder that receives isSet and, for a non-null element, days and milliseconds + */ + public void get(int index, NullableIntervalDayHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + final long startIndex = (long) index * TYPE_WIDTH; + holder.isSet = 1; + holder.days = valueBuffer.getInt(startIndex); + holder.milliseconds = valueBuffer.getInt(startIndex + MILLISECOND_OFFSET); + } + + /** + * Get the element at the given index as a {@link Duration}. + * + * @param index position of element + * @return Duration of the stored days plus the stored milliseconds, or {@code null} + * if the element is null + */ + public Duration getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + final long startIndex = (long) index * TYPE_WIDTH; + final int days = valueBuffer.getInt(startIndex); + final int milliseconds = valueBuffer.getInt(startIndex + MILLISECOND_OFFSET); + return Duration.ofDays(days).plusMillis(milliseconds); + } + } + + /** + * Get the Interval value at a given index as a {@link StringBuilder} object.
+ * + * @param index position of the element + * @return String Builder object with Interval value as + * [days, hours, minutes, seconds, millis] + */ + public StringBuilder getAsStringBuilder(int index) { + if (isSet(index) == 0) { + return null; + } else { + return getAsStringBuilderHelper(index); + } + } + + private StringBuilder getAsStringBuilderHelper(int index) { + final long startIndex = (long) index * TYPE_WIDTH; + + final int days = valueBuffer.getInt(startIndex); + int millis = valueBuffer.getInt(startIndex + MILLISECOND_OFFSET); + + final int hours = millis / (org.apache.arrow.vector.util.DateUtility.hoursToMillis); + millis = millis % (org.apache.arrow.vector.util.DateUtility.hoursToMillis); + + final int minutes = millis / (org.apache.arrow.vector.util.DateUtility.minutesToMillis); + millis = millis % (org.apache.arrow.vector.util.DateUtility.minutesToMillis); + + final int seconds = millis / (org.apache.arrow.vector.util.DateUtility.secondsToMillis); + millis = millis % (org.apache.arrow.vector.util.DateUtility.secondsToMillis); + + final String dayString = (Math.abs(days) == 1) ? " day " : " days "; + + return (new StringBuilder() + .append(days).append(dayString) + .append(hours).append(":") + .append(minutes).append(":") + .append(seconds).append(".") + .append(millis)); + } + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, ArrowBuf value) { + BitVectorHelper.setBit(validityBuffer, index); + valueBuffer.setBytes((long) index * TYPE_WIDTH, value, 0, TYPE_WIDTH); + } + + /** + * Set the element at the given index to the given value. 
 + * + * @param index position of element + * @param days days for the interval + * @param milliseconds milliseconds for the interval + */ + public void set(int index, int days, int milliseconds) { + final long offsetIndex = (long) index * TYPE_WIDTH; + BitVectorHelper.setBit(validityBuffer, index); + valueBuffer.setInt(offsetIndex, days); + valueBuffer.setInt((offsetIndex + MILLISECOND_OFFSET), milliseconds); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + * @throws IllegalArgumentException if holder.isSet is negative + */ + public void set(int index, NullableIntervalDayHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + set(index, holder.days, holder.milliseconds); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, IntervalDayHolder holder) { + set(index, holder.days, holder.milliseconds); + } + + /** + * Same as {@link #set(int, ArrowBuf)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, ArrowBuf value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param days days for the interval + * @param milliseconds milliseconds for the interval + */ + public void setSafe(int index, int days, int milliseconds) { + handleSafe(index); + set(index, days, milliseconds); + } + + /** + * Same as {@link #set(int, NullableIntervalDayHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableIntervalDayHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, IntervalDayHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, IntervalDayHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param days days component of interval + * @param milliseconds millisecond component of interval + */ + public void set(int index, int isSet, int days, int milliseconds) { + if (isSet > 0) { + set(index, days, milliseconds); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, int, int)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. 
+ * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param days days component of interval + * @param milliseconds millisecond component of interval + */ + public void setSafe(int index, int isSet, int days, int milliseconds) { + handleSafe(index); + set(index, isSet, days, milliseconds); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((IntervalDayVector) to); + } + + private class TransferImpl implements TransferPair { + IntervalDayVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new IntervalDayVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(IntervalDayVector to) { + this.to = to; + } + + @Override + public IntervalDayVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, IntervalDayVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntervalMonthDayNanoVector.java 
b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntervalMonthDayNanoVector.java new file mode 100644 index 000000000..ba3a26a89 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntervalMonthDayNanoVector.java @@ -0,0 +1,442 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import java.time.Duration; +import java.time.Period; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.IntervalMonthDayNanoReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.IntervalMonthDayNanoHolder; +import org.apache.arrow.vector.holders.NullableIntervalMonthDayNanoHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * IntervalMonthDayNanoVector implements a fixed width vector (16 bytes) of + * interval (month, days and nanoseconds) values which could be null. 
+ * A validity buffer (bit vector) is maintained to track which elements in the + * vector are null. + * + * Month, day and nanoseconds are indepndent from one another and there + * is no specific limits imposed on their values. + */ +public final class IntervalMonthDayNanoVector extends BaseFixedWidthVector { + public static final byte TYPE_WIDTH = 16; + private static final byte DAY_OFFSET = 4; + private static final byte NANOSECOND_OFFSET = 8; + private final FieldReader reader; + + + /** + * Instantiate a IntervalMonthDayNanoVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public IntervalMonthDayNanoVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.INTERVALDAY.getType()), allocator); + } + + /** + * Instantiate a IntervalMonthDayNanoVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public IntervalMonthDayNanoVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a IntervalMonthDayNanoVector. This doesn't allocate any memory for + * the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public IntervalMonthDayNanoVector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new IntervalMonthDayNanoReaderImpl(IntervalMonthDayNanoVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. 
The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.INTERVALMONTHDAYNANO; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + /** + * Given a data buffer, get the number of months stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return month value stored at the index. + */ + public static int getMonths(final ArrowBuf buffer, final int index) { + return buffer.getInt((long) index * TYPE_WIDTH); + } + + + /** + * Given a data buffer, get the number of days stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return day value stored at the index. + */ + public static int getDays(final ArrowBuf buffer, final int index) { + return buffer.getInt((long) index * TYPE_WIDTH + DAY_OFFSET); + } + + /** + * Given a data buffer, get the number of nanoseconds stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return nanoseconds value stored at the index. + */ + public static long getNanoseconds(final ArrowBuf buffer, final int index) { + return buffer.getLong((long) index * TYPE_WIDTH + NANOSECOND_OFFSET); + } + + /** + * Get the element at the given index from the vector as a slice of the value buffer. + * + * @param index position of element + * @return slice of the value buffer, {@code TYPE_WIDTH} bytes long, starting at the element, + * or {@code null} when null checking is enabled and the element is null + */ + public ArrowBuf get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + return null; + } + return valueBuffer.slice((long) index * TYPE_WIDTH, TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + * @param holder holder that receives isSet and, for a non-null element, months, days and nanoseconds + */ + public void get(int index, NullableIntervalMonthDayNanoHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + final long startIndex = (long) index * TYPE_WIDTH; + holder.isSet = 1; + holder.months = valueBuffer.getInt(startIndex); + holder.days = valueBuffer.getInt(startIndex + DAY_OFFSET); + holder.nanoseconds = valueBuffer.getLong(startIndex + NANOSECOND_OFFSET); + } + + /** + * Get the element at the given index as a {@link PeriodDuration}. + * + * @param index position of element + * @return PeriodDuration whose Period holds the stored months and days and whose Duration + * holds the stored nanoseconds, or {@code null} if the element is null + */ + public PeriodDuration getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + final long startIndex = (long) index * TYPE_WIDTH; + final int months = valueBuffer.getInt(startIndex); + final int days = valueBuffer.getInt(startIndex + DAY_OFFSET); + final long nanoseconds = valueBuffer.getLong(startIndex + NANOSECOND_OFFSET); + + return new PeriodDuration(Period.ofMonths(months).plusDays(days), + Duration.ofNanos(nanoseconds)); + } + } + + /** + * Get the Interval value at a given index as a {@link StringBuilder} object.
 + * + * @param index position of the element + * @return StringBuilder holding the string form of the element's {@link PeriodDuration} + * followed by a single space, or {@code null} if the element is null + */ + public StringBuilder getAsStringBuilder(int index) { + if (isSet(index) == 0) { + return null; + } else { + return getAsStringBuilderHelper(index); + } + } + + private StringBuilder getAsStringBuilderHelper(int index) { + return new StringBuilder().append(getObject(index).toString()).append(" "); + } + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, ArrowBuf value) { + BitVectorHelper.setBit(validityBuffer, index); + valueBuffer.setBytes((long) index * TYPE_WIDTH, value, 0, TYPE_WIDTH); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param months months component of interval + * @param days days component of interval + * @param nanoseconds nanosecond component of interval + */ + public void set(int index, int months, int days, long nanoseconds) { + final long offsetIndex = (long) index * TYPE_WIDTH; + BitVectorHelper.setBit(validityBuffer, index); + valueBuffer.setInt(offsetIndex, months); + valueBuffer.setInt(offsetIndex + DAY_OFFSET, days); + valueBuffer.setLong((offsetIndex + NANOSECOND_OFFSET), nanoseconds); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, element + * at the given index will be null.
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableIntervalMonthDayNanoHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + set(index, holder.months, holder.days, holder.nanoseconds); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, IntervalMonthDayNanoHolder holder) { + set(index, holder.months, holder.days, holder.nanoseconds); + } + + /** + * Same as {@link #set(int, ArrowBuf)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, ArrowBuf value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, int, int, long)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param months months for the interval + * @param days days for the interval + * @param nanoseconds nanoseconds for the interval + */ + public void setSafe(int index, int months, int days, long nanoseconds) { + handleSafe(index); + set(index, months, days, nanoseconds); + } + + /** + * Same as {@link #set(int, NullableIntervalMonthDayNanoHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableIntervalMonthDayNanoHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, IntervalMonthDayNanoHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, IntervalMonthDayNanoHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param months months component of interval + * @param days days component of interval + * @param nanoseconds nanosecond component of interval + */ + public void set(int index, int isSet, int months, int days, long nanoseconds) { + if (isSet > 0) { + set(index, months, days, nanoseconds); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, int, int, long)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. 
+ * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param months months component of interval + * @param days days component of interval + * @param nanoseconds nanosecond component of interval + */ + public void setSafe(int index, int isSet, int months, int days, + long nanoseconds) { + handleSafe(index); + set(index, isSet, months, days, nanoseconds); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. 
+ * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((IntervalMonthDayNanoVector) to); + } + + private class TransferImpl implements TransferPair { + IntervalMonthDayNanoVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new IntervalMonthDayNanoVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(IntervalMonthDayNanoVector to) { + this.to = to; + } + + @Override + public IntervalMonthDayNanoVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, IntervalMonthDayNanoVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntervalYearVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntervalYearVector.java new file mode 100644 index 000000000..7ddfe6b78 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/IntervalYearVector.java @@ -0,0 +1,382 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import java.time.Period; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.IntervalYearReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.IntervalYearHolder; +import org.apache.arrow.vector.holders.NullableIntervalYearHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * IntervalYearVector implements a fixed width (4 bytes) vector of + * interval (years and months) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. + */ +public final class IntervalYearVector extends BaseFixedWidthVector { + public static final byte TYPE_WIDTH = 4; + private final FieldReader reader; + + /** + * Instantiate a IntervalYearVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public IntervalYearVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.INTERVALYEAR.getType()), allocator); + } + + /** + * Instantiate a IntervalYearVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public IntervalYearVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a IntervalYearVector. This doesn't allocate any memory for + * the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public IntervalYearVector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new IntervalYearReaderImpl(IntervalYearVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.INTERVALYEAR; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static int getTotalMonths(final ArrowBuf buffer, final int index) { + return buffer.getInt((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public int get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getInt((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableIntervalYearHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getInt((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index as a {@link Period} of months. + * + * @param index position of element + * @return {@link Period} built from the stored month count, or null if the element is null + */ + public Period getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + final int interval = valueBuffer.getInt((long) index * TYPE_WIDTH); + // TODO: verify interval is in months + return Period.ofMonths(interval); + } + } + + /** + * Get the Interval value at a given index as a {@link StringBuilder} object.
+ * + * @param index position of the element + * @return StringBuilder holding the value as "Y year(s) M month(s) ", or + * null if the element is null + */ + public StringBuilder getAsStringBuilder(int index) { + if (isSet(index) == 0) { + return null; + } else { + return getAsStringBuilderHelper(index); + } + } + + private StringBuilder getAsStringBuilderHelper(int index) { + int value = valueBuffer.getInt((long) index * TYPE_WIDTH); + + final int years = (value / org.apache.arrow.vector.util.DateUtility.yearsToMonths); + final int months = (value % org.apache.arrow.vector.util.DateUtility.yearsToMonths); + + final String yearString = (Math.abs(years) == 1) ? " year " : " years "; + final String monthString = (Math.abs(months) == 1) ? " month " : " months "; + + return (new StringBuilder() + .append(years) + .append(yearString) + .append(months) + .append(monthString)); + } + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + private void setValue(int index, int value) { + valueBuffer.setInt((long) index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null.
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableIntervalYearHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, IntervalYearHolder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableIntervalYearHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableIntervalYearHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, IntervalYearHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, IntervalYearHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, int value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, int)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, int value) { + handleSafe(index); + set(index, isSet, value); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. 
+ * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((IntervalYearVector) to); + } + + private class TransferImpl implements TransferPair { + IntervalYearVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new IntervalYearVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(IntervalYearVector to) { + this.to = to; + } + + @Override + public IntervalYearVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, IntervalYearVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java new file mode 100644 index 000000000..e9d60b38e --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java @@ -0,0 +1,305 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.LargeVarBinaryReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.LargeVarBinaryHolder; +import org.apache.arrow.vector.holders.NullableLargeVarBinaryHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * LargeVarBinaryVector implements a large variable width vector of binary + * values which could be NULL. A validity buffer (bit vector) is maintained + * to track which elements in the vector are null. + * The size of the underlying buffer can be over 2GB. + */ +public final class LargeVarBinaryVector extends BaseLargeVariableWidthVector { + private final FieldReader reader; + + /** + * Instantiate a LargeVarBinaryVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public LargeVarBinaryVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.LARGEVARBINARY.getType()), allocator); + } + + /** + * Instantiate a LargeVarBinaryVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public LargeVarBinaryVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a LargeVarBinaryVector. This doesn't allocate any memory for + * the data in vector. 
+ * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public LargeVarBinaryVector(Field field, BufferAllocator allocator) { + super(field, allocator); + reader = new LargeVarBinaryReaderImpl(LargeVarBinaryVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.LARGEVARBINARY; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the variable length element at specified index as byte array. + * + * @param index position of element to get + * @return array of bytes for non-null element, null otherwise + */ + public byte[] get(int index) { + assert index >= 0; + if (isSet(index) == 0) { + return null; + } + final long startOffset = getStartOffset(index); + final int dataLength = + (int) (offsetBuffer.getLong((long) (index + 1) * OFFSET_WIDTH) - startOffset); + final byte[] result = new byte[dataLength]; + valueBuffer.getBytes(startOffset, result, 0, dataLength); + return result; + } + + /** + * Get the variable length element at specified index as a byte array; delegates to get(int). + * + * @param index position of element to get + * @return byte array for non-null element, null otherwise + */ + public byte[] getObject(int index) { + return get(index); + } + + /** + * Get the variable length element at specified index and sets the state + * in provided holder.
+ * + * @param index position of element to get + * @param holder data holder to be populated by this function + */ + public void get(int index, NullableLargeVarBinaryHolder holder) { + assert index >= 0; + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.start = getStartOffset(index); + holder.end = offsetBuffer.getLong((long) (index + 1) * OFFSET_WIDTH); + holder.buffer = valueBuffer; + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, LargeVarBinaryHolder holder) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final int dataLength = (int) (holder.end - holder.start); + final long startOffset = getStartOffset(index); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, LargeVarBinaryHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. 
+ */ + public void setSafe(int index, LargeVarBinaryHolder holder) { + assert index >= 0; + final int dataLength = (int) (holder.end - holder.start); + handleSafe(index, dataLength); + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final long startOffset = getStartOffset(index); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, NullableLargeVarBinaryHolder holder) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); + final long startOffset = getStartOffset(index); + if (holder.isSet != 0) { + final int dataLength = (int) (holder.end - holder.start); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + } else { + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset); + } + lastSet = index; + } + + /** + * Same as {@link #set(int, NullableLargeVarBinaryHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. 
+ */ + public void setSafe(int index, NullableLargeVarBinaryHolder holder) { + assert index >= 0; + if (holder.isSet != 0) { + final int dataLength = (int) (holder.end - holder.start); + handleSafe(index, dataLength); + fillHoles(index); + final long startOffset = getStartOffset(index); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + } else { + fillEmpties(index + 1); + } + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); + lastSet = index; + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. 
+ * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((LargeVarBinaryVector) to); + } + + private class TransferImpl implements TransferPair { + LargeVarBinaryVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new LargeVarBinaryVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(LargeVarBinaryVector to) { + this.to = to; + } + + @Override + public LargeVarBinaryVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, LargeVarBinaryVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java new file mode 100644 index 000000000..fd2057260 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java @@ -0,0 +1,331 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.LargeVarCharReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.LargeVarCharHolder; +import org.apache.arrow.vector.holders.NullableLargeVarCharHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.Text; +import org.apache.arrow.vector.util.TransferPair; + +/** + * LargeVarCharVector implements a variable width vector of VARCHAR + * values which could be NULL. A validity buffer (bit vector) is maintained + * to track which elements in the vector are null. + *

+ * The offset width of this vector is 8, so the underlying buffer can be larger than 2GB. + *

+ */ +public final class LargeVarCharVector extends BaseLargeVariableWidthVector { + private final FieldReader reader; + + /** + * Instantiate a LargeVarCharVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public LargeVarCharVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.LARGEVARCHAR.getType()), allocator); + } + + /** + * Instantiate a LargeVarCharVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public LargeVarCharVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a LargeVarCharVector. This doesn't allocate any memory for + * the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public LargeVarCharVector(Field field, BufferAllocator allocator) { + super(field, allocator); + reader = new LargeVarCharReaderImpl(LargeVarCharVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. 
+ * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.LARGEVARCHAR; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the variable length element at specified index as byte array. + * + * @param index position of element to get + * @return array of bytes for non-null element, null otherwise + */ + public byte[] get(int index) { + assert index >= 0; + if (isSet(index) == 0) { + return null; + } + final long startOffset = getStartOffset(index); + final int dataLength = + (int) (offsetBuffer.getLong((long) (index + 1) * OFFSET_WIDTH) - startOffset); + final byte[] result = new byte[dataLength]; + valueBuffer.getBytes(startOffset, result, 0, dataLength); + return result; + } + + /** + * Get the variable length element at specified index as Text. + * + * @param index position of element to get + * @return Text object for non-null element, null otherwise + */ + public Text getObject(int index) { + byte[] b = get(index); + if (b == null) { + return null; + } else { + return new Text(b); + } + } + + /** + * Get the variable length element at specified index and sets the state + * in provided holder. 
+ * + * @param index position of element to get + * @param holder data holder to be populated by this function + */ + public void get(int index, NullableLargeVarCharHolder holder) { + assert index >= 0; + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.start = getStartOffset(index); + holder.end = offsetBuffer.getLong((long) (index + 1) * OFFSET_WIDTH); + holder.buffer = valueBuffer; + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, LargeVarCharHolder holder) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final int dataLength = (int) (holder.end - holder.start); + final long startOffset = getStartOffset(index); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, LargeVarCharHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. 
+ */ + public void setSafe(int index, LargeVarCharHolder holder) { + assert index >= 0; + final int dataLength = (int) (holder.end - holder.start); + handleSafe(index, dataLength); + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final long startOffset = getStartOffset(index); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, NullableLargeVarCharHolder holder) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); + final long startOffset = getStartOffset(index); + if (holder.isSet != 0) { + final int dataLength = (int) (holder.end - holder.start); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + } else { + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset); + } + lastSet = index; + } + + /** + * Same as {@link #set(int, NullableLargeVarCharHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. 
+ */ + public void setSafe(int index, NullableLargeVarCharHolder holder) { + assert index >= 0; + if (holder.isSet != 0) { + final int dataLength = (int) (holder.end - holder.start); + handleSafe(index, dataLength); + fillHoles(index); + final long startOffset = getStartOffset(index); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + } else { + fillEmpties(index + 1); /* as in LargeVarBinaryVector; presumably reserves capacity first */ + } + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the + * content in supplied Text. + * + * @param index position of the element to set + * @param text Text object with data + */ + public void set(int index, Text text) { + set(index, text.getBytes(), 0, text.getLength()); + } + + /** + * Same as {@link #set(int, Text)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set. + * @param text Text object with data + */ + public void setSafe(int index, Text text) { + setSafe(index, text.getBytes(), 0, text.getLength()); + } + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new LargeVarCharVector.TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type.
+ * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new LargeVarCharVector.TransferImpl((LargeVarCharVector) to); + } + + private class TransferImpl implements TransferPair { + LargeVarCharVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new LargeVarCharVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(LargeVarCharVector to) { + this.to = to; + } + + @Override + public LargeVarCharVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, LargeVarCharVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/NullCheckingForGet.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/NullCheckingForGet.java new file mode 100644 index 000000000..9961c72a4 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/NullCheckingForGet.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/**
 * Process-wide switch that controls whether the "get" methods of vectors check for null
 * before reading a value. For example, the get method of
 * {@code org.apache.arrow.vector.Float8Vector} first checks whether the value at the given
 * index is null and only then retrieves it; this configuration turns such checks on or off.
 *
 * <p>Null checking is on by default. It can be disabled by setting either the system property
 * {@code arrow.enable_null_check_for_get} or the environment variable
 * {@code ARROW_ENABLE_NULL_CHECK_FOR_GET} to {@code "false"}. When both are set, the system
 * property takes precedence.
 *
 * <p>Disabling null checking in the "get" methods may improve performance. Consider the
 * following micro-benchmark:
 *
 * <pre>{@code
 *   Float8Vector vector = ...
 *
 *   public void test() {
 *     sum = 0;
 *     for (int i = 0; i < 1024; i++) {
 *       vector.set(i, i + 10.0);
 *       sum += vector.get(i);
 *     }
 *   }
 * }</pre>
 *
 * <p>Evaluating this micro-benchmark with the JMH framework shows that disabling null
 * checking has the following effects:
 * <ol>
 *   <li>The amounts of byte code and of assembly code generated by the JIT are both smaller.</li>
 *   <li>Performance improves by about 30% (2.819 ± 0.005 us/op vs. 4.069 ± 0.004 us/op).</li>
 * </ol>
 *
 * <p>Therefore, when the user can be sure that null checking is unnecessary, it is beneficial
 * to disable it with this configuration.
 */
public class NullCheckingForGet {

  /**
   * Whether null checking is enabled for "get" methods. Computed once, when this class is
   * initialized.
   */
  public static final boolean NULL_CHECKING_ENABLED;

  static {
    final String fromEnvironment = System.getenv("ARROW_ENABLE_NULL_CHECK_FOR_GET");
    final String fromSystemProperty = System.getProperty("arrow.enable_null_check_for_get");

    // The system property wins; the environment variable is only a fallback.
    final String effectiveSetting =
        fromSystemProperty != null ? fromSystemProperty : fromEnvironment;

    // Only the exact string "false" disables the check; anything else (including an
    // unset value) leaves it enabled.
    NULL_CHECKING_ENABLED = !"false".equals(effectiveSetting);
  }

  /** Not instantiable: this class only exposes a static flag. */
  private NullCheckingForGet() {
  }
}
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.complex.BaseRepeatedValueVector.DATA_VECTOR_NAME; + +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.impl.NullReader; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.TransferPair; + +/** + * A null type vector. + */ +public class NullVector implements FieldVector { + + private int valueCount; + + protected Field field; + + /** + * Instantiate a NullVector. + * + * @param name name of the vector + */ + public NullVector(String name) { + this(name, FieldType.nullable(Types.MinorType.NULL.getType())); + } + + /** + * Instantiate a NullVector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector. + */ + public NullVector(String name, FieldType fieldType) { + this(new Field(name, fieldType, null)); + } + + /** + * Instantiate a NullVector. + * + * @param field field materialized by this vector. 
+ */ + public NullVector(Field field) { + this.valueCount = 0; + this.field = field; + } + + @Deprecated + public NullVector() { + this(new Field(DATA_VECTOR_NAME, FieldType.nullable(new ArrowType.Null()), null)); + } + + @Override + public void close() { + } + + @Override + public void clear() { + } + + @Override + public void reset() { + } + + @Override + public Field getField() { + return field; + } + + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.NULL; + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return getTransferPair(null, allocator); + } + + @Override + public Iterator iterator() { + return Collections.emptyIterator(); + } + + @Override + public int getBufferSize() { + return 0; + } + + @Override + public int getBufferSizeFor(final int valueCount) { + return 0; + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + return new ArrowBuf[0]; + } + + @Override + public void allocateNew() throws OutOfMemoryException { + allocateNewSafe(); + } + + @Override + public boolean allocateNewSafe() { + return true; + } + + @Override + public void reAlloc() { + } + + @Override + public BufferAllocator getAllocator() { + throw new UnsupportedOperationException("Tried to get allocator from NullVector"); + } + + @Override + public void setInitialCapacity(int numRecords) { + } + + @Override + public int getValueCapacity() { + return this.valueCount; + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return getTransferPair(ref, allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector target) { + return new TransferImpl((NullVector) target); + } + + @Override + public FieldReader getReader() { + return NullReader.INSTANCE; + } + + @Override + public void 
initializeChildrenFromFields(List children) { + if (!children.isEmpty()) { + throw new IllegalArgumentException("Null vector has no children"); + } + } + + @Override + public List getChildrenFromFields() { + return Collections.emptyList(); + } + + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + Preconditions.checkArgument(ownBuffers.isEmpty(), "Null vector has no buffers"); + } + + @Override + public List getFieldBuffers() { + return Collections.emptyList(); + } + + /** + * Get the inner vectors. + * + * @deprecated This API will be removed as the current implementations no longer support inner vectors. + * + * @return the inner vectors for this field as defined by the TypeLayout + */ + @Deprecated + @Override + public List getFieldInnerVectors() { + return Collections.emptyList(); + } + + @Override + public long getValidityBufferAddress() { + throw new UnsupportedOperationException(); + } + + @Override + public long getDataBufferAddress() { + throw new UnsupportedOperationException(); + } + + @Override + public long getOffsetBufferAddress() { + throw new UnsupportedOperationException(); + } + + @Override + public ArrowBuf getValidityBuffer() { + throw new UnsupportedOperationException(); + } + + @Override + public ArrowBuf getDataBuffer() { + throw new UnsupportedOperationException(); + } + + @Override + public ArrowBuf getOffsetBuffer() { + throw new UnsupportedOperationException(); + } + + @Override + public int getValueCount() { + return this.valueCount; + } + + @Override + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + } + + @Override + public Object getObject(int index) { + return null; + } + + @Override + public int getNullCount() { + return this.valueCount; + } + + @Override + public boolean isNull(int index) { + return true; + } + + @Override + public int hashCode(int index) { + return 31 * valueCount; + } + + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + return 
31 * valueCount; + } + + @Override + public OUT accept(VectorVisitor visitor, IN value) { + return visitor.visit(this, value); + } + + @Override + public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { + throw new UnsupportedOperationException(); + } + + @Override + public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) { + throw new UnsupportedOperationException(); + } + + @Override + public String getName() { + return this.getField().getName(); + } + + private class TransferImpl implements TransferPair { + NullVector to; + + public TransferImpl(String ref) { + to = new NullVector(ref); + } + + @Deprecated + public TransferImpl() { + to = new NullVector(); + } + + public TransferImpl(NullVector to) { + this.to = to; + } + + @Override + public NullVector getTo() { + return to; + } + + @Override + public void transfer() { + to.valueCount = valueCount; + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + to.valueCount = length; + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + if (toIndex > to.valueCount) { + to.valueCount = toIndex; + } + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/PeriodDuration.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/PeriodDuration.java new file mode 100644 index 000000000..ee48fe797 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/PeriodDuration.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import java.time.Duration; +import java.time.Period; + +import org.apache.arrow.util.Preconditions; + +/** + * Combination of Period and Duration for representing this interval type + * as a POJO. + */ +public class PeriodDuration { + private final Period period; + private final Duration duration; + + public PeriodDuration(Period period, Duration duration) { + this.period = Preconditions.checkNotNull(period); + this.duration = Preconditions.checkNotNull(duration); + } + + public Period getPeriod() { + return period; + } + + public Duration getDuration() { + return duration; + } + + @Override + public String toString() { + return period.toString() + " " + duration.toString(); + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof PeriodDuration)) { + return false; + } + PeriodDuration other = (PeriodDuration) o; + return this.period.equals(other.period) && this.duration.equals(other.duration); + } + + @Override + public int hashCode() { + return this.period.hashCode() * 31 + this.duration.hashCode(); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java new file mode 100644 index 000000000..b61e4a160 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.vector.util.CallBack; + + +/** + * Callback for when the Schema for the Vector changes (generally happens when a vector is promoted to a union type + * from a single value type). + */ +public class SchemaChangeCallBack implements CallBack { + private boolean schemaChanged = false; + + /** + * Constructs a schema-change callback with the schema-changed state set to + * {@code false}. + */ + public SchemaChangeCallBack() { + } + + /** + * Sets the schema-changed state to {@code true}. + */ + @Override + public void doWork() { + schemaChanged = true; + } + + /** + * Returns the value of schema-changed state, resetting the + * schema-changed state to {@code false}. 
+ * + * @return the previous schema-changed state + */ + public boolean getSchemaChangedAndReset() { + final boolean current = schemaChanged; + schemaChanged = false; + return current; + } +} + diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/SmallIntVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/SmallIntVector.java new file mode 100644 index 000000000..1de6dea90 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/SmallIntVector.java @@ -0,0 +1,389 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.SmallIntReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableSmallIntHolder; +import org.apache.arrow.vector.holders.SmallIntHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * SmallIntVector implements a fixed width (2 bytes) vector of + * short values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public final class SmallIntVector extends BaseFixedWidthVector implements BaseIntVector { + public static final byte TYPE_WIDTH = 2; + private final FieldReader reader; + + /** + * Instantiate a SmallIntVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public SmallIntVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.SMALLINT.getType()), allocator); + } + + /** + * Instantiate a SmallIntVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public SmallIntVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a SmallIntVector. This doesn't allocate any memory for + * the data in vector. 
+ * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public SmallIntVector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new SmallIntReaderImpl(SmallIntVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.SMALLINT; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public short get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getShort((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableSmallIntHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getShort((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public Short getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getShort((long) index * TYPE_WIDTH); + } + } + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + private void setValue(int index, int value) { + valueBuffer.setShort((long) index * TYPE_WIDTH, value); + } + + private void setValue(int index, short value) { + valueBuffer.setShort((long) index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, short value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, element in the + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableSmallIntHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. 
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, SmallIntHolder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, short)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, short value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableSmallIntHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableSmallIntHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, SmallIntHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, SmallIntHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. 
+ * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, short value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, short)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, short value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static short get(final ArrowBuf buffer, final int index) { + return buffer.getShort((long) index * TYPE_WIDTH); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((SmallIntVector) to); + } + + @Override + public void setWithPossibleTruncate(int index, long value) { + this.setSafe(index, (int) value); + } + + @Override + public void setUnsafeWithPossibleTruncate(int index, long value) { + this.set(index, (int) value); + } + + @Override + public long getValueAsLong(int index) { + return this.get(index); + } + + private class TransferImpl implements TransferPair { + SmallIntVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new SmallIntVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(SmallIntVector to) { + this.to = to; + } + + @Override + public SmallIntVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void 
copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, SmallIntVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeMicroVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeMicroVector.java new file mode 100644 index 000000000..cf128859e --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeMicroVector.java @@ -0,0 +1,347 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeMicroReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableTimeMicroHolder; +import org.apache.arrow.vector.holders.TimeMicroHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TimeMicroVector implements a fixed width vector (8 bytes) of + * time (microsecond resolution) values which could be null. + * A validity buffer (bit vector) is maintained to track which elements in the + * vector are null. + */ +public final class TimeMicroVector extends BaseFixedWidthVector { + public static final byte TYPE_WIDTH = 8; + private final FieldReader reader; + + /** + * Instantiate a TimeMicroVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeMicroVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.TIMEMICRO.getType()), allocator); + } + + /** + * Instantiate a TimeMicroVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public TimeMicroVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a TimeMicroVector. This doesn't allocate any memory for + * the data in vector. 
+ * + * @param field Field materialized by this vector + * @param allocator allocator for memory management. + */ + public TimeMicroVector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new TimeMicroReaderImpl(TimeMicroVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.TIMEMICRO; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public long get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getLong((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableTimeMicroHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public Long getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getLong((long) index * TYPE_WIDTH); + } + } + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + private void setValue(int index, long value) { + valueBuffer.setLong((long) index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, long value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, element in the + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTimeMicroHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeMicroHolder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, long)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, long value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableTimeMicroHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeMicroHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeMicroHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeMicroHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, long value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, long)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, long value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static long get(final ArrowBuf buffer, int index) { + return buffer.getLong((long) index * TYPE_WIDTH); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeMicroVector) to); + } + + private class TransferImpl implements TransferPair { + TimeMicroVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new TimeMicroVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(TimeMicroVector to) { + this.to = to; + } + + @Override + public TimeMicroVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, TimeMicroVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeMilliVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeMilliVector.java new file mode 100644 index 
000000000..b96990b10
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeMilliVector.java
@@ -0,0 +1,357 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;
+
+import java.time.LocalDateTime;
+
+import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.complex.impl.TimeMilliReaderImpl;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.holders.NullableTimeMilliHolder;
+import org.apache.arrow.vector.holders.TimeMilliHolder;
+import org.apache.arrow.vector.types.Types.MinorType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.util.DateUtility;
+import org.apache.arrow.vector.util.TransferPair;
+
+/**
+ * TimeMilliVector implements a fixed width (4 bytes) vector of
+ * time (millisecond resolution) values which could be null. A validity buffer
+ * (bit vector) is maintained to track which elements in the vector are null.
+ */
+public final class TimeMilliVector extends BaseFixedWidthVector {
+  /** Size of a single element, in bytes. */
+  public static final byte TYPE_WIDTH = 4;
+  private final FieldReader reader;
+
+  /**
+   * Instantiate a TimeMilliVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param name name of the vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeMilliVector(String name, BufferAllocator allocator) {
+    this(name, FieldType.nullable(MinorType.TIMEMILLI.getType()), allocator);
+  }
+
+  /**
+   * Instantiate a TimeMilliVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param name name of the vector
+   * @param fieldType type of Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeMilliVector(String name, FieldType fieldType, BufferAllocator allocator) {
+    this(new Field(name, fieldType, null), allocator);
+  }
+
+  /**
+   * Instantiate a TimeMilliVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param field field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeMilliVector(Field field, BufferAllocator allocator) {
+    super(field, allocator, TYPE_WIDTH);
+    reader = new TimeMilliReaderImpl(TimeMilliVector.this);
+  }
+
+  /**
+   * Get a reader that supports reading values from this vector.
+   *
+   * @return Field Reader for this vector
+   */
+  @Override
+  public FieldReader getReader() {
+    return reader;
+  }
+
+  /**
+   * Get minor type for this vector. The vector holds values belonging
+   * to a particular type.
+   *
+   * @return {@link org.apache.arrow.vector.types.Types.MinorType}
+   */
+  @Override
+  public MinorType getMinorType() {
+    return MinorType.TIMEMILLI;
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value retrieval methods                        |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+  /**
+   * Get the element at the given index from the vector.
+   *
+   * @param index position of element
+   * @return element at given index
+   * @throws IllegalStateException if null checking is enabled and the element is null
+   */
+  public int get(int index) throws IllegalStateException {
+    if (NULL_CHECKING_ENABLED && isSet(index) == 0) {
+      throw new IllegalStateException("Value at index is null");
+    }
+    return valueBuffer.getInt((long) index * TYPE_WIDTH);
+  }
+
+  /**
+   * Get the element at the given index from the vector and
+   * set the state in holder. If the element at the given index
+   * is null, holder.isSet will be zero and holder.value is left untouched.
+   *
+   * @param index position of element
+   * @param holder nullable data holder that receives the element
+   */
+  public void get(int index, NullableTimeMilliHolder holder) {
+    if (isSet(index) == 0) {
+      holder.isSet = 0;
+      return;
+    }
+    holder.isSet = 1;
+    holder.value = valueBuffer.getInt((long) index * TYPE_WIDTH);
+  }
+
+  /**
+   * Same as {@link #get(int)}, except that the value is converted to a
+   * {@link LocalDateTime} and a null element yields {@code null} instead of an exception.
+   *
+   * @param index position of element
+   * @return element at given index, or {@code null} if the element is null
+   */
+  public LocalDateTime getObject(int index) {
+    if (isSet(index) == 0) {
+      return null;
+    }
+    final int millis = valueBuffer.getInt((long) index * TYPE_WIDTH);
+    // TODO: this doesn't seem right, time not from epoch
+    return DateUtility.getLocalDateTimeFromEpochMilli(millis);
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value setter methods                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  private void setValue(int index, int value) {
+    valueBuffer.setInt((long) index * TYPE_WIDTH, value);
+  }
+
+  /**
+   * Set the element at the given index to the given value.
+   *
+   * @param index position of element
+   * @param value value of element
+   */
+  public void set(int index, int value) {
+    BitVectorHelper.setBit(validityBuffer, index);
+    setValue(index, value);
+  }
+
+  /**
+   * Set the element at the given index to the value set in data holder.
+   * If the value in holder is not indicated as set, the element
+   * at the given index will be null.
+   *
+   * @param index position of element
+   * @param holder nullable data holder for value of element
+   * @throws IllegalArgumentException if holder.isSet is negative
+   */
+  public void set(int index, NullableTimeMilliHolder holder) throws IllegalArgumentException {
+    if (holder.isSet < 0) {
+      throw new IllegalArgumentException("holder.isSet must not be negative: " + holder.isSet);
+    } else if (holder.isSet > 0) {
+      BitVectorHelper.setBit(validityBuffer, index);
+      setValue(index, holder.value);
+    } else {
+      BitVectorHelper.unsetBit(validityBuffer, index);
+    }
+  }
+
+  /**
+   * Set the element at the given index to the value set in data holder.
+   *
+   * @param index position of element
+   * @param holder data holder for value of element
+   */
+  public void set(int index, TimeMilliHolder holder) {
+    BitVectorHelper.setBit(validityBuffer, index);
+    setValue(index, holder.value);
+  }
+
+  /**
+   * Same as {@link #set(int, int)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param value value of element
+   */
+  public void setSafe(int index, int value) {
+    handleSafe(index);
+    set(index, value);
+  }
+
+  /**
+   * Same as {@link #set(int, NullableTimeMilliHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param holder nullable data holder for value of element
+   */
+  public void setSafe(int index, NullableTimeMilliHolder holder) throws IllegalArgumentException {
+    handleSafe(index);
+    set(index, holder);
+  }
+
+  /**
+   * Same as {@link #set(int, TimeMilliHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param holder data holder for value of element
+   */
+  public void setSafe(int index, TimeMilliHolder holder) {
+    handleSafe(index);
+    set(index, holder);
+  }
+
+  /**
+   * Store the given value at a particular position in the vector. isSet indicates
+   * whether the value is NULL or not.
+   *
+   * @param index position of the new value
+   * @param isSet positive to store {@code value}; zero or negative marks the element null
+   * @param value element value
+   */
+  public void set(int index, int isSet, int value) {
+    if (isSet > 0) {
+      set(index, value);
+    } else {
+      BitVectorHelper.unsetBit(validityBuffer, index);
+    }
+  }
+
+  /**
+   * Same as {@link #set(int, int, int)} except that it handles the case
+   * when index is greater than or equal to current value capacity of the
+   * vector.
+   *
+   * @param index position of the new value
+   * @param isSet positive to store {@code value}; zero or negative marks the element null
+   * @param value element value
+   */
+  public void setSafe(int index, int isSet, int value) {
+    handleSafe(index);
+    set(index, isSet, value);
+  }
+
+
+  /**
+   * Given a data buffer, get the value stored at a particular position
+   * in the vector.
+   *
+   * <p>This method should not be used externally.
+   *
+   * @param buffer data buffer
+   * @param index position of the element.
+   * @return value stored at the index.
+   */
+  public static int get(final ArrowBuf buffer, final int index) {
+    return buffer.getInt((long) index * TYPE_WIDTH);
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |                      vector transfer                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+  /**
+   * Construct a TransferPair comprising of this and a target vector of
+   * the same type.
+   *
+   * @param ref name of the target vector
+   * @param allocator allocator for the target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
+    return new TransferImpl(ref, allocator);
+  }
+
+  /**
+   * Construct a TransferPair with a desired target vector of the same type.
+   *
+   * @param to target vector; it is cast to TimeMilliVector without a prior type check
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair makeTransferPair(ValueVector to) {
+    return new TransferImpl((TimeMilliVector) to);
+  }
+
+  /** {@link TransferPair} from the enclosing vector to a target TimeMilliVector. */
+  private class TransferImpl implements TransferPair {
+    private final TimeMilliVector to;
+
+    public TransferImpl(String ref, BufferAllocator allocator) {
+      to = new TimeMilliVector(ref, field.getFieldType(), allocator);
+    }
+
+    public TransferImpl(TimeMilliVector to) {
+      this.to = to;
+    }
+
+    @Override
+    public TimeMilliVector getTo() {
+      return to;
+    }
+
+    @Override
+    public void transfer() {
+      transferTo(to);
+    }
+
+    @Override
+    public void splitAndTransfer(int startIndex, int length) {
+      splitAndTransferTo(startIndex, length, to);
+    }
+
+    @Override
+    public void copyValueSafe(int fromIndex, int toIndex) {
+      to.copyFromSafe(fromIndex, toIndex, TimeMilliVector.this);
+    }
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeNanoVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeNanoVector.java
new file mode 100644
index
000000000..bc78a0264
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeNanoVector.java
@@ -0,0 +1,353 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;
+
+import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.complex.impl.TimeNanoReaderImpl;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.holders.NullableTimeNanoHolder;
+import org.apache.arrow.vector.holders.TimeNanoHolder;
+import org.apache.arrow.vector.types.Types.MinorType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.util.TransferPair;
+
+/**
+ * TimeNanoVector implements a fixed width vector (8 bytes) of
+ * time (nanosecond resolution) values which could be null. A validity buffer
+ * (bit vector) is maintained to track which elements in the vector are null.
+ */
+public final class TimeNanoVector extends BaseFixedWidthVector {
+  /** Size of a single element, in bytes. */
+  public static final byte TYPE_WIDTH = 8;
+  private final FieldReader reader;
+
+  /**
+   * Instantiate a TimeNanoVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param name name of the vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeNanoVector(String name, BufferAllocator allocator) {
+    this(name, FieldType.nullable(MinorType.TIMENANO.getType()), allocator);
+  }
+
+  /**
+   * Instantiate a TimeNanoVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param name name of the vector
+   * @param fieldType type of Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeNanoVector(String name, FieldType fieldType, BufferAllocator allocator) {
+    this(new Field(name, fieldType, null), allocator);
+  }
+
+  /**
+   * Instantiate a TimeNanoVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param field field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeNanoVector(Field field, BufferAllocator allocator) {
+    super(field, allocator, TYPE_WIDTH);
+    reader = new TimeNanoReaderImpl(TimeNanoVector.this);
+  }
+
+  /**
+   * Get a reader that supports reading values from this vector.
+   *
+   * @return Field Reader for this vector
+   */
+  @Override
+  public FieldReader getReader() {
+    return reader;
+  }
+
+  /**
+   * Get minor type for this vector. The vector holds values belonging
+   * to a particular type.
+   *
+   * @return {@link org.apache.arrow.vector.types.Types.MinorType}
+   */
+  @Override
+  public MinorType getMinorType() {
+    return MinorType.TIMENANO;
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value retrieval methods                        |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Get the element at the given index from the vector.
+   *
+   * @param index position of element
+   * @return element at given index
+   * @throws IllegalStateException if null checking is enabled and the element is null
+   */
+  public long get(int index) throws IllegalStateException {
+    if (NULL_CHECKING_ENABLED && isSet(index) == 0) {
+      throw new IllegalStateException("Value at index is null");
+    }
+    return valueBuffer.getLong((long) index * TYPE_WIDTH);
+  }
+
+  /**
+   * Get the element at the given index from the vector and
+   * set the state in holder. If the element at the given index
+   * is null, holder.isSet will be zero and holder.value is left untouched.
+   *
+   * @param index position of element
+   * @param holder nullable data holder that receives the element
+   */
+  public void get(int index, NullableTimeNanoHolder holder) {
+    if (isSet(index) == 0) {
+      holder.isSet = 0;
+      return;
+    }
+    holder.isSet = 1;
+    holder.value = valueBuffer.getLong((long) index * TYPE_WIDTH);
+  }
+
+  /**
+   * Same as {@link #get(int)}, except that the value is boxed and a null
+   * element yields {@code null} instead of an exception.
+   *
+   * @param index position of element
+   * @return element at given index, or {@code null} if the element is null
+   */
+  public Long getObject(int index) {
+    if (isSet(index) == 0) {
+      return null;
+    } else {
+      return valueBuffer.getLong((long) index * TYPE_WIDTH);
+    }
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value setter methods                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  private void setValue(int index, long value) {
+    valueBuffer.setLong((long) index * TYPE_WIDTH, value);
+  }
+
+  /**
+   * Set the element at the given index to the given value.
+   *
+   * @param index position of element
+   * @param value value of element
+   */
+  public void set(int index, long value) {
+    BitVectorHelper.setBit(validityBuffer, index);
+    setValue(index, value);
+  }
+
+  /**
+   * Set the element at the given index to the value set in data holder.
+   * If the value in holder is not indicated as set, the element
+   * at the given index will be null.
+   *
+   * @param index position of element
+   * @param holder nullable data holder for value of element
+   * @throws IllegalArgumentException if holder.isSet is negative
+   */
+  public void set(int index, NullableTimeNanoHolder holder) throws IllegalArgumentException {
+    if (holder.isSet < 0) {
+      throw new IllegalArgumentException("holder.isSet must not be negative: " + holder.isSet);
+    } else if (holder.isSet > 0) {
+      BitVectorHelper.setBit(validityBuffer, index);
+      setValue(index, holder.value);
+    } else {
+      BitVectorHelper.unsetBit(validityBuffer, index);
+    }
+  }
+
+  /**
+   * Set the element at the given index to the value set in data holder.
+   *
+   * @param index position of element
+   * @param holder data holder for value of element
+   */
+  public void set(int index, TimeNanoHolder holder) {
+    BitVectorHelper.setBit(validityBuffer, index);
+    setValue(index, holder.value);
+  }
+
+  /**
+   * Same as {@link #set(int, long)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param value value of element
+   */
+  public void setSafe(int index, long value) {
+    handleSafe(index);
+    set(index, value);
+  }
+
+  /**
+   * Same as {@link #set(int, NullableTimeNanoHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param holder nullable data holder for value of element
+   */
+  public void setSafe(int index, NullableTimeNanoHolder holder) throws IllegalArgumentException {
+    handleSafe(index);
+    set(index, holder);
+  }
+
+  /**
+   * Same as {@link #set(int, TimeNanoHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param holder data holder for value of element
+   */
+  public void setSafe(int index, TimeNanoHolder holder) {
+    handleSafe(index);
+    set(index, holder);
+  }
+
+  /**
+   * Store the given value at a particular position in the vector. isSet indicates
+   * whether the value is NULL or not.
+   *
+   * @param index position of the new value
+   * @param isSet positive to store {@code value}; zero or negative marks the element null
+   * @param value element value
+   */
+  public void set(int index, int isSet, long value) {
+    if (isSet > 0) {
+      set(index, value);
+    } else {
+      BitVectorHelper.unsetBit(validityBuffer, index);
+    }
+  }
+
+  /**
+   * Same as {@link #set(int, int, long)} except that it handles the case
+   * when index is greater than or equal to current value capacity of the
+   * vector.
+   *
+   * @param index position of the new value
+   * @param isSet positive to store {@code value}; zero or negative marks the element null
+   * @param value element value
+   */
+  public void setSafe(int index, int isSet, long value) {
+    handleSafe(index);
+    set(index, isSet, value);
+  }
+
+  /**
+   * Given a data buffer, get the value stored at a particular position
+   * in the vector.
+   *
+   * <p>This method should not be used externally.
+   *
+   * @param buffer data buffer
+   * @param index position of the element.
+   * @return value stored at the index.
+   */
+  public static long get(final ArrowBuf buffer, final int index) {
+    return buffer.getLong((long) index * TYPE_WIDTH);
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |                      vector transfer                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+  /**
+   * Construct a TransferPair comprising of this and a target vector of
+   * the same type.
+   *
+   * @param ref name of the target vector
+   * @param allocator allocator for the target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
+    return new TransferImpl(ref, allocator);
+  }
+
+  /**
+   * Construct a TransferPair with a desired target vector of the same type.
+   *
+   * @param to target vector; it is cast to TimeNanoVector without a prior type check
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair makeTransferPair(ValueVector to) {
+    return new TransferImpl((TimeNanoVector) to);
+  }
+
+  /** {@link TransferPair} from the enclosing vector to a target TimeNanoVector. */
+  private class TransferImpl implements TransferPair {
+    private final TimeNanoVector to;
+
+    public TransferImpl(String ref, BufferAllocator allocator) {
+      to = new TimeNanoVector(ref, field.getFieldType(), allocator);
+    }
+
+    public TransferImpl(TimeNanoVector to) {
+      this.to = to;
+    }
+
+    @Override
+    public TimeNanoVector getTo() {
+      return to;
+    }
+
+    @Override
+    public void transfer() {
+      transferTo(to);
+    }
+
+    @Override
+    public void splitAndTransfer(int startIndex, int length) {
+      splitAndTransferTo(startIndex, length, to);
+    }
+
+    @Override
+    public void copyValueSafe(int fromIndex, int toIndex) {
+      to.copyFromSafe(fromIndex, toIndex, TimeNanoVector.this);
+    }
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeSecVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeSecVector.java
new file mode 100644
index
000000000..29b7381be
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeSecVector.java
@@ -0,0 +1,354 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;
+
+import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.complex.impl.TimeSecReaderImpl;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.holders.NullableTimeSecHolder;
+import org.apache.arrow.vector.holders.TimeSecHolder;
+import org.apache.arrow.vector.types.Types.MinorType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.util.TransferPair;
+
+/**
+ * TimeSecVector implements a fixed width (4 bytes) vector of
+ * time (seconds resolution) values which could be null. A validity buffer (bit vector) is
+ * maintained to track which elements in the vector are null.
+ */
+public final class TimeSecVector extends BaseFixedWidthVector {
+  /** Size of a single element, in bytes. */
+  public static final byte TYPE_WIDTH = 4;
+  private final FieldReader reader;
+
+  /**
+   * Instantiate a TimeSecVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param name name of the vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeSecVector(String name, BufferAllocator allocator) {
+    this(name, FieldType.nullable(MinorType.TIMESEC.getType()), allocator);
+  }
+
+  /**
+   * Instantiate a TimeSecVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param name name of the vector
+   * @param fieldType type of Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeSecVector(String name, FieldType fieldType, BufferAllocator allocator) {
+    this(new Field(name, fieldType, null), allocator);
+  }
+
+  /**
+   * Instantiate a TimeSecVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param field field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeSecVector(Field field, BufferAllocator allocator) {
+    super(field, allocator, TYPE_WIDTH);
+    reader = new TimeSecReaderImpl(TimeSecVector.this);
+  }
+
+  /**
+   * Get a reader that supports reading values from this vector.
+   *
+   * @return Field Reader for this vector
+   */
+  @Override
+  public FieldReader getReader() {
+    return reader;
+  }
+
+  /**
+   * Get minor type for this vector. The vector holds values belonging
+   * to a particular type.
+   *
+   * @return {@link org.apache.arrow.vector.types.Types.MinorType}
+   */
+  @Override
+  public MinorType getMinorType() {
+    return MinorType.TIMESEC;
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value retrieval methods                        |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Get the element at the given index from the vector.
+   *
+   * @param index position of element
+   * @return element at given index
+   * @throws IllegalStateException if null checking is enabled and the element is null
+   */
+  public int get(int index) throws IllegalStateException {
+    if (NULL_CHECKING_ENABLED && isSet(index) == 0) {
+      throw new IllegalStateException("Value at index is null");
+    }
+    return valueBuffer.getInt((long) index * TYPE_WIDTH);
+  }
+
+  /**
+   * Get the element at the given index from the vector and
+   * set the state in holder. If the element at the given index
+   * is null, holder.isSet will be zero and holder.value is left untouched.
+   *
+   * @param index position of element
+   * @param holder nullable data holder that receives the element
+   */
+  public void get(int index, NullableTimeSecHolder holder) {
+    if (isSet(index) == 0) {
+      holder.isSet = 0;
+      return;
+    }
+    holder.isSet = 1;
+    holder.value = valueBuffer.getInt((long) index * TYPE_WIDTH);
+  }
+
+  /**
+   * Same as {@link #get(int)}, except that the value is boxed and a null
+   * element yields {@code null} instead of an exception.
+   *
+   * @param index position of element
+   * @return element at given index, or {@code null} if the element is null
+   */
+  public Integer getObject(int index) {
+    if (isSet(index) == 0) {
+      return null;
+    } else {
+      return valueBuffer.getInt((long) index * TYPE_WIDTH);
+    }
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value setter methods                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  private void setValue(int index, int value) {
+    valueBuffer.setInt((long) index * TYPE_WIDTH, value);
+  }
+
+  /**
+   * Set the element at the given index to the given value.
+   *
+   * @param index position of element
+   * @param value value of element
+   */
+  public void set(int index, int value) {
+    BitVectorHelper.setBit(validityBuffer, index);
+    setValue(index, value);
+  }
+
+  /**
+   * Set the element at the given index to the value set in data holder.
+   * If the value in holder is not indicated as set, the element
+   * at the given index will be null.
+   *
+   * @param index position of element
+   * @param holder nullable data holder for value of element
+   * @throws IllegalArgumentException if holder.isSet is negative
+   */
+  public void set(int index, NullableTimeSecHolder holder) throws IllegalArgumentException {
+    if (holder.isSet < 0) {
+      throw new IllegalArgumentException("holder.isSet must not be negative: " + holder.isSet);
+    } else if (holder.isSet > 0) {
+      BitVectorHelper.setBit(validityBuffer, index);
+      setValue(index, holder.value);
+    } else {
+      BitVectorHelper.unsetBit(validityBuffer, index);
+    }
+  }
+
+  /**
+   * Set the element at the given index to the value set in data holder.
+   *
+   * @param index position of element
+   * @param holder data holder for value of element
+   */
+  public void set(int index, TimeSecHolder holder) {
+    BitVectorHelper.setBit(validityBuffer, index);
+    setValue(index, holder.value);
+  }
+
+  /**
+   * Same as {@link #set(int, int)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param value value of element
+   */
+  public void setSafe(int index, int value) {
+    handleSafe(index);
+    set(index, value);
+  }
+
+  /**
+   * Same as {@link #set(int, NullableTimeSecHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param holder nullable data holder for value of element
+   */
+  public void setSafe(int index, NullableTimeSecHolder holder) throws IllegalArgumentException {
+    handleSafe(index);
+    set(index, holder);
+  }
+
+  /**
+   * Same as {@link #set(int, TimeSecHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param holder data holder for value of element
+   */
+  public void setSafe(int index, TimeSecHolder holder) {
+    handleSafe(index);
+    set(index, holder);
+  }
+
+  /**
+   * Store the given value at a particular position in the vector. isSet indicates
+   * whether the value is NULL or not.
+   *
+   * @param index position of the new value
+   * @param isSet positive to store {@code value}; zero or negative marks the element null
+   * @param value element value
+   */
+  public void set(int index, int isSet, int value) {
+    if (isSet > 0) {
+      set(index, value);
+    } else {
+      BitVectorHelper.unsetBit(validityBuffer, index);
+    }
+  }
+
+  /**
+   * Same as {@link #set(int, int, int)} except that it handles the case
+   * when index is greater than or equal to current value capacity of the
+   * vector.
+   *
+   * @param index position of the new value
+   * @param isSet positive to store {@code value}; zero or negative marks the element null
+   * @param value element value
+   */
+  public void setSafe(int index, int isSet, int value) {
+    handleSafe(index);
+    set(index, isSet, value);
+  }
+
+  /**
+   * Given a data buffer, get the value stored at a particular position
+   * in the vector.
+   *
+   * <p>This method should not be used externally.
+   *
+   * @param buffer data buffer
+   * @param index position of the element.
+   * @return value stored at the index.
+   */
+  public static int get(final ArrowBuf buffer, final int index) {
+    return buffer.getInt((long) index * TYPE_WIDTH);
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |                      vector transfer                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Construct a TransferPair comprising of this and a target vector of
+   * the same type.
+   *
+   * @param ref name of the target vector
+   * @param allocator allocator for the target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
+    return new TransferImpl(ref, allocator);
+  }
+
+  /**
+   * Construct a TransferPair with a desired target vector of the same type.
+   *
+   * @param to target vector; it is cast to TimeSecVector without a prior type check
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair makeTransferPair(ValueVector to) {
+    return new TransferImpl((TimeSecVector) to);
+  }
+
+  /** {@link TransferPair} from the enclosing vector to a target TimeSecVector. */
+  private class TransferImpl implements TransferPair {
+    private final TimeSecVector to;
+
+    public TransferImpl(String ref, BufferAllocator allocator) {
+      to = new TimeSecVector(ref, field.getFieldType(), allocator);
+    }
+
+    public TransferImpl(TimeSecVector to) {
+      this.to = to;
+    }
+
+    @Override
+    public TimeSecVector getTo() {
+      return to;
+    }
+
+    @Override
+    public void transfer() {
+      transferTo(to);
+    }
+
+    @Override
+    public void splitAndTransfer(int startIndex, int length) {
+      splitAndTransferTo(startIndex, length, to);
+    }
+
+    @Override
+    public void copyValueSafe(int fromIndex, int toIndex) {
+      to.copyFromSafe(fromIndex, toIndex, TimeSecVector.this);
+    }
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMicroTZVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMicroTZVector.java
new file mode 100644
index
000000000..17715780e --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMicroTZVector.java @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeStampMicroTZReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableTimeStampMicroTZHolder; +import org.apache.arrow.vector.holders.TimeStampMicroTZHolder; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TimeStampMicroTZVector implements a fixed width vector (8 bytes) of + * timestamp (microsecond resolution) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. 
+ */
+public final class TimeStampMicroTZVector extends TimeStampVector {
+  private final FieldReader reader;
+  private final String timeZone;
+
+  /**
+   * Instantiate a TimeStampMicroTZVector. This doesn't allocate any memory for the data in vector.
+   *
+   * @param name name of the vector
+   * @param allocator allocator for memory management.
+   * @param timeZone time zone used to build the vector's {@link ArrowType.Timestamp} type
+   */
+  public TimeStampMicroTZVector(String name, BufferAllocator allocator, String timeZone) {
+    this(name, FieldType.nullable(new ArrowType.Timestamp(TimeUnit.MICROSECOND, timeZone)), allocator);
+  }
+
+  /**
+   * Instantiate a TimeStampMicroTZVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param name name of the vector
+   * @param fieldType type of Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeStampMicroTZVector(String name, FieldType fieldType, BufferAllocator allocator) {
+    super(name, fieldType, allocator);
+    ArrowType.Timestamp arrowType = (ArrowType.Timestamp) fieldType.getType();
+    timeZone = arrowType.getTimezone();
+    reader = new TimeStampMicroTZReaderImpl(TimeStampMicroTZVector.this);
+  }
+
+  /**
+   * Instantiate a TimeStampMicroTZVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param field Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeStampMicroTZVector(Field field, BufferAllocator allocator) {
+    super(field, allocator);
+    ArrowType.Timestamp arrowType = (ArrowType.Timestamp) field.getFieldType().getType();
+    timeZone = arrowType.getTimezone();
+    reader = new TimeStampMicroTZReaderImpl(TimeStampMicroTZVector.this);
+  }
+
+  /**
+   * Get a reader that supports reading values from this vector.
+   *
+   * @return Field Reader for this vector
+   */
+  @Override
+  public FieldReader getReader() {
+    return reader;
+  }
+
+  /**
+   * Get minor type for this vector. The vector holds values belonging
+   * to a particular type.
+   *
+   * @return {@link org.apache.arrow.vector.types.Types.MinorType}
+   */
+  @Override
+  public MinorType getMinorType() {
+    return MinorType.TIMESTAMPMICROTZ;
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value retrieval methods                        |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Get the element at the given index and set the state in holder. The validity bit is always
+   * consulted (regardless of NULL_CHECKING_ENABLED), so holder.isSet is zero for a null element.
+   *
+   * @param index position of element
+   * @param holder holder that receives isSet and, for a non-null element, the value
+   */
+  public void get(int index, NullableTimeStampMicroTZHolder holder) {
+    if (isSet(index) == 0) {
+      holder.isSet = 0;
+      return;
+    }
+    holder.isSet = 1;
+    holder.value = valueBuffer.getLong((long) index * TYPE_WIDTH);
+  }
+
+  /**
+   * Same as {@link #get(int)}, except that a null element is returned as {@code null}.
+   *
+   * @param index position of element
+   * @return element at given index, or null if the element is null
+   */
+  public Long getObject(int index) {
+    if (isSet(index) == 0) {
+      return null;
+    } else {
+      return valueBuffer.getLong((long) index * TYPE_WIDTH);
+    }
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value setter methods                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Set the element at the given index to the value set in data holder. If the value in
+   * holder is not indicated as set, the element at the given index will be null.
+   *
+   * @param index position of element
+   * @param holder nullable data holder for value of element
+   * @throws IllegalArgumentException if holder.isSet is negative
+   */
+  public void set(int index, NullableTimeStampMicroTZHolder holder) throws IllegalArgumentException {
+    if (holder.isSet < 0) {
+      throw new IllegalArgumentException();
+    } else if (holder.isSet > 0) {
+      BitVectorHelper.setBit(validityBuffer, index);
+      setValue(index, holder.value);
+    } else {
+      BitVectorHelper.unsetBit(validityBuffer, index);
+    }
+  }
+
+  /**
+   * Set the element at the given index to the value set in data holder.
+   *
+   * @param index position of element
+   * @param holder data holder for value of element
+   */
+  public void set(int index, TimeStampMicroTZHolder holder) {
+    BitVectorHelper.setBit(validityBuffer, index);
+    setValue(index, holder.value);
+  }
+
+  /**
+   * Same as {@link #set(int, NullableTimeStampMicroTZHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param holder nullable data holder for value of element
+   */
+  public void setSafe(int index, NullableTimeStampMicroTZHolder holder) throws IllegalArgumentException {
+    handleSafe(index);
+    set(index, holder);
+  }
+
+  /**
+   * Same as {@link #set(int, TimeStampMicroTZHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param holder data holder for value of element
+   */
+  public void setSafe(int index, TimeStampMicroTZHolder holder) {
+    handleSafe(index);
+    set(index, holder);
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |                      vector transfer                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Construct a TransferPair comprising of this and a target vector of
+   * the same type.
+   *
+   * @param ref name of the target vector
+   * @param allocator allocator for the target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
+    TimeStampMicroTZVector to = new TimeStampMicroTZVector(ref,
+        field.getFieldType(), allocator);
+    return new TransferImpl(to);
+  }
+
+  /**
+   * Construct a TransferPair with a desired target vector of the same type.
+   *
+   * @param to target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair makeTransferPair(ValueVector to) {
+    return new TransferImpl((TimeStampMicroTZVector) to);
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMicroVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMicroVector.java
new file mode 100644
index 000000000..5cbef8962
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMicroVector.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;
+
+import java.time.LocalDateTime;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.complex.impl.TimeStampMicroReaderImpl;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.holders.NullableTimeStampMicroHolder;
+import org.apache.arrow.vector.holders.TimeStampMicroHolder;
+import org.apache.arrow.vector.types.Types.MinorType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.util.DateUtility;
+import org.apache.arrow.vector.util.TransferPair;
+
+/**
+ * TimeStampMicroVector implements a fixed width vector (8 bytes) of
+ * timestamp (microsecond resolution) values which could be null. A validity buffer
+ * (bit vector) is maintained to track which elements in the vector are null.
+ */
+public final class TimeStampMicroVector extends TimeStampVector {
+  private final FieldReader reader;
+
+  /**
+   * Instantiate a TimeStampMicroVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param name name of the vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeStampMicroVector(String name, BufferAllocator allocator) {
+    this(name, FieldType.nullable(MinorType.TIMESTAMPMICRO.getType()), allocator);
+  }
+
+  /**
+   * Instantiate a TimeStampMicroVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param name name of the vector
+   * @param fieldType type of Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeStampMicroVector(String name, FieldType fieldType, BufferAllocator allocator) {
+    super(name, fieldType, allocator);
+    reader = new TimeStampMicroReaderImpl(TimeStampMicroVector.this);
+  }
+
+  /**
+   * Instantiate a TimeStampMicroVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param field Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeStampMicroVector(Field field, BufferAllocator allocator) {
+    super(field, allocator);
+    reader = new TimeStampMicroReaderImpl(TimeStampMicroVector.this);
+  }
+
+  /**
+   * Get a reader that supports reading values from this vector.
+   *
+   * @return Field Reader for this vector
+   */
+  @Override
+  public FieldReader getReader() {
+    return reader;
+  }
+
+  /**
+   * Get minor type for this vector. The vector holds values belonging
+   * to a particular type.
+   *
+   * @return {@link org.apache.arrow.vector.types.Types.MinorType}
+   */
+  @Override
+  public MinorType getMinorType() {
+    return MinorType.TIMESTAMPMICRO;
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value retrieval methods                        |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Get the element at the given index and set the state in holder. The validity bit is always
+   * consulted (regardless of NULL_CHECKING_ENABLED), so holder.isSet is zero for a null element.
+   *
+   * @param index position of element
+   * @param holder holder that receives isSet and, for a non-null element, the value
+   */
+  public void get(int index, NullableTimeStampMicroHolder holder) {
+    if (isSet(index) == 0) {
+      holder.isSet = 0;
+      return;
+    }
+    holder.isSet = 1;
+    holder.value = valueBuffer.getLong((long) index * TYPE_WIDTH);
+  }
+
+  /**
+   * Same as {@link #get(int)}, except that the value is returned as a {@link LocalDateTime}.
+   *
+   * @param index position of element
+   * @return element at given index, or null if the element is null
+   */
+  public LocalDateTime getObject(int index) {
+    if (isSet(index) == 0) {
+      return null;
+    } else {
+      final long micros = valueBuffer.getLong((long) index * TYPE_WIDTH);
+      return DateUtility.getLocalDateTimeFromEpochMicro(micros);
+    }
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value setter methods                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Set the element at the given index to the value set in data holder. If the value in
+   * holder is not indicated as set, the element at the given index will be null.
+   *
+   * @param index position of element
+   * @param holder nullable data holder for value of element
+   * @throws IllegalArgumentException if holder.isSet is negative
+   */
+  public void set(int index, NullableTimeStampMicroHolder holder) throws IllegalArgumentException {
+    if (holder.isSet < 0) {
+      throw new IllegalArgumentException();
+    } else if (holder.isSet > 0) {
+      BitVectorHelper.setBit(validityBuffer, index);
+      setValue(index, holder.value);
+    } else {
+      BitVectorHelper.unsetBit(validityBuffer, index);
+    }
+  }
+
+  /**
+   * Set the element at the given index to the value set in data holder.
+   *
+   * @param index position of element
+   * @param holder data holder for value of element
+   */
+  public void set(int index, TimeStampMicroHolder holder) {
+    BitVectorHelper.setBit(validityBuffer, index);
+    setValue(index, holder.value);
+  }
+
+  /**
+   * Same as {@link #set(int, NullableTimeStampMicroHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param holder nullable data holder for value of element
+   */
+  public void setSafe(int index, NullableTimeStampMicroHolder holder) throws IllegalArgumentException {
+    handleSafe(index);
+    set(index, holder);
+  }
+
+  /**
+   * Same as {@link #set(int, TimeStampMicroHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param holder data holder for value of element
+   */
+  public void setSafe(int index, TimeStampMicroHolder holder) {
+    handleSafe(index);
+    set(index, holder);
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |                      vector transfer                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Construct a TransferPair comprising of this and a target vector of
+   * the same type.
+   *
+   * @param ref name of the target vector
+   * @param allocator allocator for the target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
+    TimeStampMicroVector to = new TimeStampMicroVector(ref,
+        field.getFieldType(), allocator);
+    return new TransferImpl(to);
+  }
+
+  /**
+   * Construct a TransferPair with a desired target vector of the same type.
+   *
+   * @param to target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair makeTransferPair(ValueVector to) {
+    return new TransferImpl((TimeStampMicroVector) to);
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMilliTZVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMilliTZVector.java
new file mode 100644
index 000000000..e66bbf450
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMilliTZVector.java
@@ -0,0 +1,238 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.complex.impl.TimeStampMilliTZReaderImpl;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.holders.NullableTimeStampMilliTZHolder;
+import org.apache.arrow.vector.holders.TimeStampMilliTZHolder;
+import org.apache.arrow.vector.types.TimeUnit;
+import org.apache.arrow.vector.types.Types.MinorType;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.util.TransferPair;
+
+/**
+ * TimeStampMilliTZVector implements a fixed width vector (8 bytes) of
+ * timestamp (millisecond resolution) values which could be null. A validity buffer
+ * (bit vector) is maintained to track which elements in the vector are null.
+ */
+public final class TimeStampMilliTZVector extends TimeStampVector {
+  private final FieldReader reader;
+  private final String timeZone;
+
+  /**
+   * Instantiate a TimeStampMilliTZVector. This doesn't allocate any memory for the data in vector.
+   *
+   * @param name name of the vector
+   * @param allocator allocator for memory management.
+   * @param timeZone time zone used to build the vector's {@link ArrowType.Timestamp} type
+   */
+  public TimeStampMilliTZVector(String name, BufferAllocator allocator, String timeZone) {
+    this(name, FieldType.nullable(new ArrowType.Timestamp(TimeUnit.MILLISECOND, timeZone)), allocator);
+  }
+
+  /**
+   * Instantiate a TimeStampMilliTZVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param name name of the vector
+   * @param fieldType type of Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeStampMilliTZVector(String name, FieldType fieldType, BufferAllocator allocator) {
+    super(name, fieldType, allocator);
+    ArrowType.Timestamp arrowType = (ArrowType.Timestamp) fieldType.getType();
+    timeZone = arrowType.getTimezone();
+    reader = new TimeStampMilliTZReaderImpl(TimeStampMilliTZVector.this);
+  }
+
+  /**
+   * Instantiate a TimeStampMilliTZVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param field Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeStampMilliTZVector(Field field, BufferAllocator allocator) {
+    super(field, allocator);
+    ArrowType.Timestamp arrowType = (ArrowType.Timestamp) field.getFieldType().getType();
+    timeZone = arrowType.getTimezone();
+    reader = new TimeStampMilliTZReaderImpl(TimeStampMilliTZVector.this);
+  }
+
+  /**
+   * Get a reader that supports reading values from this vector.
+   *
+   * @return Field Reader for this vector
+   */
+  @Override
+  public FieldReader getReader() {
+    return reader;
+  }
+
+  /**
+   * Get minor type for this vector. The vector holds values belonging
+   * to a particular type.
+   *
+   * @return {@link org.apache.arrow.vector.types.Types.MinorType}
+   */
+  @Override
+  public MinorType getMinorType() {
+    return MinorType.TIMESTAMPMILLITZ;
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value retrieval methods                        |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Get the element at the given index and set the state in holder. The validity bit is always
+   * consulted (regardless of NULL_CHECKING_ENABLED), so holder.isSet is zero for a null element.
+   *
+   * @param index position of element
+   * @param holder holder that receives isSet and, for a non-null element, the value
+   */
+  public void get(int index, NullableTimeStampMilliTZHolder holder) {
+    if (isSet(index) == 0) {
+      holder.isSet = 0;
+      return;
+    }
+    holder.isSet = 1;
+    holder.value = valueBuffer.getLong((long) index * TYPE_WIDTH);
+  }
+
+  /**
+   * Same as {@link #get(int)}, except that a null element is returned as {@code null}.
+   *
+   * @param index position of element
+   * @return element at given index, or null if the element is null
+   */
+  public Long getObject(int index) {
+    if (isSet(index) == 0) {
+      return null;
+    } else {
+      return valueBuffer.getLong((long) index * TYPE_WIDTH);
+    }
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value setter methods                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Set the element at the given index to the value set in data holder. If the value in
+   * holder is not indicated as set, the element at the given index will be null.
+   *
+   * @param index position of element
+   * @param holder nullable data holder for value of element
+   * @throws IllegalArgumentException if holder.isSet is negative
+   */
+  public void set(int index, NullableTimeStampMilliTZHolder holder) throws IllegalArgumentException {
+    if (holder.isSet < 0) {
+      throw new IllegalArgumentException();
+    } else if (holder.isSet > 0) {
+      BitVectorHelper.setBit(validityBuffer, index);
+      setValue(index, holder.value);
+    } else {
+      BitVectorHelper.unsetBit(validityBuffer, index);
+    }
+  }
+
+  /**
+   * Set the element at the given index to the value set in data holder.
+   *
+   * @param index position of element
+   * @param holder data holder for value of element
+   */
+  public void set(int index, TimeStampMilliTZHolder holder) {
+    BitVectorHelper.setBit(validityBuffer, index);
+    setValue(index, holder.value);
+  }
+
+  /**
+   * Same as {@link #set(int, NullableTimeStampMilliTZHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param holder nullable data holder for value of element
+   */
+  public void setSafe(int index, NullableTimeStampMilliTZHolder holder) throws IllegalArgumentException {
+    handleSafe(index);
+    set(index, holder);
+  }
+
+  /**
+   * Same as {@link #set(int, TimeStampMilliTZHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param holder data holder for value of element
+   */
+  public void setSafe(int index, TimeStampMilliTZHolder holder) {
+    handleSafe(index);
+    set(index, holder);
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |                      vector transfer                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+  /**
+   * Construct a TransferPair comprising of this and a target vector of
+   * the same type.
+   *
+   * @param ref name of the target vector
+   * @param allocator allocator for the target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
+    TimeStampMilliTZVector to = new TimeStampMilliTZVector(ref,
+        field.getFieldType(), allocator);
+    return new TransferImpl(to);
+  }
+
+  /**
+   * Construct a TransferPair with a desired target vector of the same type.
+   *
+   * @param to target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair makeTransferPair(ValueVector to) {
+    return new TransferImpl((TimeStampMilliTZVector) to);
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMilliVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMilliVector.java
new file mode 100644
index 000000000..8f46f5606
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMilliVector.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;
+
+import java.time.LocalDateTime;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.complex.impl.TimeStampMilliReaderImpl;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.holders.NullableTimeStampMilliHolder;
+import org.apache.arrow.vector.holders.TimeStampMilliHolder;
+import org.apache.arrow.vector.types.Types.MinorType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.util.DateUtility;
+import org.apache.arrow.vector.util.TransferPair;
+
+/**
+ * TimeStampMilliVector implements a fixed width vector (8 bytes) of
+ * timestamp (millisecond resolution) values which could be null. A validity buffer
+ * (bit vector) is maintained to track which elements in the vector are null.
+ */
+public final class TimeStampMilliVector extends TimeStampVector {
+  private final FieldReader reader;
+
+  /**
+   * Instantiate a TimeStampMilliVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param name name of the vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeStampMilliVector(String name, BufferAllocator allocator) {
+    this(name, FieldType.nullable(MinorType.TIMESTAMPMILLI.getType()), allocator);
+  }
+
+  /**
+   * Instantiate a TimeStampMilliVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param name name of the vector
+   * @param fieldType type of Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeStampMilliVector(String name, FieldType fieldType, BufferAllocator allocator) {
+    super(name, fieldType, allocator);
+    reader = new TimeStampMilliReaderImpl(TimeStampMilliVector.this);
+  }
+
+  /**
+   * Instantiate a TimeStampMilliVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param field field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public TimeStampMilliVector(Field field, BufferAllocator allocator) {
+    super(field, allocator);
+    reader = new TimeStampMilliReaderImpl(TimeStampMilliVector.this);
+  }
+
+  /**
+   * Get a reader that supports reading values from this vector.
+   *
+   * @return Field Reader for this vector
+   */
+  @Override
+  public FieldReader getReader() {
+    return reader;
+  }
+
+  /**
+   * Get minor type for this vector. The vector holds values belonging
+   * to a particular type.
+   *
+   * @return {@link org.apache.arrow.vector.types.Types.MinorType}
+   */
+  @Override
+  public MinorType getMinorType() {
+    return MinorType.TIMESTAMPMILLI;
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value retrieval methods                        |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Get the element at the given index and set the state in holder. The validity bit is always
+   * consulted (regardless of NULL_CHECKING_ENABLED), so holder.isSet is zero for a null element.
+   *
+   * @param index position of element
+   * @param holder holder that receives isSet and, for a non-null element, the value
+   */
+  public void get(int index, NullableTimeStampMilliHolder holder) {
+    if (isSet(index) == 0) {
+      holder.isSet = 0;
+      return;
+    }
+    holder.isSet = 1;
+    holder.value = valueBuffer.getLong((long) index * TYPE_WIDTH);
+  }
+
+  /**
+   * Same as {@link #get(int)}, except that the value is returned as a {@link LocalDateTime}.
+   *
+   * @param index position of element
+   * @return element at given index, or null if the element is null
+   */
+  public LocalDateTime getObject(int index) {
+    if (isSet(index) == 0) {
+      return null;
+    } else {
+      final long millis = valueBuffer.getLong((long) index * TYPE_WIDTH);
+      return DateUtility.getLocalDateTimeFromEpochMilli(millis);
+    }
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value setter methods                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Set the element at the given index to the value set in data holder. If the value in
+   * holder is not indicated as set, the element at the given index will be null.
+   *
+   * @param index position of element
+   * @param holder nullable data holder for value of element
+   * @throws IllegalArgumentException if holder.isSet is negative
+   */
+  public void set(int index, NullableTimeStampMilliHolder holder) throws IllegalArgumentException {
+    if (holder.isSet < 0) {
+      throw new IllegalArgumentException();
+    } else if (holder.isSet > 0) {
+      BitVectorHelper.setBit(validityBuffer, index);
+      setValue(index, holder.value);
+    } else {
+      BitVectorHelper.unsetBit(validityBuffer, index);
+    }
+  }
+
+  /**
+   * Set the element at the given index to the value set in data holder.
+   *
+   * @param index position of element
+   * @param holder data holder for value of element
+   */
+  public void set(int index, TimeStampMilliHolder holder) {
+    BitVectorHelper.setBit(validityBuffer, index);
+    setValue(index, holder.value);
+  }
+
+  /**
+   * Same as {@link #set(int, NullableTimeStampMilliHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param holder nullable data holder for value of element
+   */
+  public void setSafe(int index, NullableTimeStampMilliHolder holder) throws IllegalArgumentException {
+    handleSafe(index);
+    set(index, holder);
+  }
+
+  /**
+   * Same as {@link #set(int, TimeStampMilliHolder)} except that it handles the
+   * case when index is greater than or equal to existing
+   * value capacity {@link #getValueCapacity()}.
+   *
+   * @param index position of element
+   * @param holder data holder for value of element
+   */
+  public void setSafe(int index, TimeStampMilliHolder holder) {
+    handleSafe(index);
+    set(index, holder);
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |                      vector transfer                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Construct a TransferPair comprising of this and a target vector of
+   * the same type.
+   *
+   * @param ref name of the target vector
+   * @param allocator allocator for the target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
+    TimeStampMilliVector to = new TimeStampMilliVector(ref,
+        field.getFieldType(), allocator);
+    return new TransferImpl(to);
+  }
+
+  /**
+   * Construct a TransferPair with a desired target vector of the same type.
+   *
+   * @param to target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair makeTransferPair(ValueVector to) {
+    return new TransferImpl((TimeStampMilliVector) to);
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampNanoTZVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampNanoTZVector.java
new file mode 100644
index 000000000..a3e582a7c
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampNanoTZVector.java
@@ -0,0 +1,241 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeStampNanoTZReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableTimeStampNanoTZHolder; +import org.apache.arrow.vector.holders.TimeStampNanoTZHolder; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TimeStampNanoTZVector implements a fixed width vector (8 bytes) of + * timestamp (nanosecond resolution) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. + */ +public final class TimeStampNanoTZVector extends TimeStampVector { + private final FieldReader reader; + private final String timeZone; + + /** + * Instantiate a TimeStampNanoTZVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeStampNanoTZVector(String name, BufferAllocator allocator, String timeZone) { + this(name, FieldType.nullable(new ArrowType.Timestamp(TimeUnit.NANOSECOND, timeZone)), allocator); + } + + /** + * Instantiate a TimeStampNanoTZVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public TimeStampNanoTZVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, fieldType, allocator); + ArrowType.Timestamp arrowType = (ArrowType.Timestamp) fieldType.getType(); + timeZone = arrowType.getTimezone(); + reader = new TimeStampNanoTZReaderImpl(TimeStampNanoTZVector.this); + } + + /** + * Instantiate a TimeStampNanoTZVector. This doesn't allocate any memory for + * the data in vector. + * + * @param field Field materialized by this vector + * @param allocator allocator for memory management. + */ + public TimeStampNanoTZVector(Field field, BufferAllocator allocator) { + super(field, allocator); + ArrowType.Timestamp arrowType = (ArrowType.Timestamp) field.getFieldType().getType(); + timeZone = arrowType.getTimezone(); + reader = new TimeStampNanoTZReaderImpl(TimeStampNanoTZVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.TIMESTAMPNANOTZ; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableTimeStampNanoTZHolder holder) { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public Long getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getLong((long) index * TYPE_WIDTH); + } + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTimeStampNanoTZHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeStampNanoTZHolder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, NullableTimeStampNanoTZHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe( + int index, + NullableTimeStampNanoTZHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeStampNanoTZHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeStampNanoTZHolder holder) { + handleSafe(index); + set(index, holder); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + TimeStampNanoTZVector to = new TimeStampNanoTZVector(ref, + field.getFieldType(), allocator); + return new TransferImpl(to); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. 
+ * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeStampNanoTZVector) to); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampNanoVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampNanoVector.java new file mode 100644 index 000000000..7b87dac43 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampNanoVector.java @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import java.time.LocalDateTime; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeStampNanoReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableTimeStampNanoHolder; +import org.apache.arrow.vector.holders.TimeStampNanoHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.DateUtility; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TimeStampNanoVector implements a fixed width vector (8 bytes) of + * timestamp (nanosecond resolution) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. + */ +public final class TimeStampNanoVector extends TimeStampVector { + private final FieldReader reader; + + /** + * Instantiate a TimeStampNanoVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeStampNanoVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.TIMESTAMPNANO.getType()), allocator); + } + + /** + * Instantiate a TimeStampNanoVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public TimeStampNanoVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, fieldType, allocator); + reader = new TimeStampNanoReaderImpl(TimeStampNanoVector.this); + } + + /** + * Instantiate a TimeStampNanoVector. 
This doesn't allocate any memory for + * the data in vector. + * + * @param field Field materialized by this vector + * @param allocator allocator for memory management. + */ + public TimeStampNanoVector(Field field, BufferAllocator allocator) { + super(field, allocator); + reader = new TimeStampNanoReaderImpl(TimeStampNanoVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.TIMESTAMPNANO; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableTimeStampNanoHolder holder) { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public LocalDateTime getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + final long nanos = valueBuffer.getLong((long) index * TYPE_WIDTH); + return DateUtility.getLocalDateTimeFromEpochNano(nanos); + } + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTimeStampNanoHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeStampNanoHolder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, NullableTimeStampNanoHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeStampNanoHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeStampNanoHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeStampNanoHolder holder) { + handleSafe(index); + set(index, holder); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + TimeStampNanoVector to = new TimeStampNanoVector(ref, + field.getFieldType(), allocator); + return new TransferImpl(to); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. 
+ * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeStampNanoVector) to); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampSecTZVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampSecTZVector.java new file mode 100644 index 000000000..f5a0498fe --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampSecTZVector.java @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeStampSecTZReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableTimeStampSecTZHolder; +import org.apache.arrow.vector.holders.TimeStampSecTZHolder; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TimeStampSecTZVector implements a fixed width vector (8 bytes) of + * timestamp (seconds resolution) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. + */ +public final class TimeStampSecTZVector extends TimeStampVector { + private final FieldReader reader; + private final String timeZone; + + /** + * Instantiate a TimeStampSecTZVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeStampSecTZVector(String name, BufferAllocator allocator, String timeZone) { + this(name, FieldType.nullable(new ArrowType.Timestamp(TimeUnit.SECOND, timeZone)), allocator); + } + + /** + * Instantiate a TimeStampSecTZVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public TimeStampSecTZVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, fieldType, allocator); + ArrowType.Timestamp arrowType = (ArrowType.Timestamp) fieldType.getType(); + timeZone = arrowType.getTimezone(); + reader = new TimeStampSecTZReaderImpl(TimeStampSecTZVector.this); + } + + /** + * Instantiate a TimeStampSecTZVector. This doesn't allocate any memory for + * the data in vector. + * + * @param field Field materialized by this vector + * @param allocator allocator for memory management. + */ + public TimeStampSecTZVector(Field field, BufferAllocator allocator) { + super(field, allocator); + ArrowType.Timestamp arrowType = (ArrowType.Timestamp) field.getFieldType().getType(); + timeZone = arrowType.getTimezone(); + reader = new TimeStampSecTZReaderImpl(TimeStampSecTZVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.TIMESTAMPSECTZ; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableTimeStampSecTZHolder holder) { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public Long getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getLong((long) index * TYPE_WIDTH); + } + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTimeStampSecTZHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeStampSecTZHolder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, NullableTimeStampSecTZHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeStampSecTZHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeStampSecTZHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeStampSecTZHolder holder) { + handleSafe(index); + set(index, holder); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + TimeStampSecTZVector to = new TimeStampSecTZVector(ref, + field.getFieldType(), allocator); + return new TransferImpl(to); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. 
+ * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeStampSecTZVector) to); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampSecVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampSecVector.java new file mode 100644 index 000000000..f12e19684 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampSecVector.java @@ -0,0 +1,237 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import java.time.LocalDateTime; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeStampSecReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableTimeStampSecHolder; +import org.apache.arrow.vector.holders.TimeStampSecHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.DateUtility; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TimeStampSecVector implements a fixed width vector (8 bytes) of + * timestamp (seconds resolution) values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public final class TimeStampSecVector extends TimeStampVector { + private final FieldReader reader; + + /** + * Instantiate a TimeStampSecVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeStampSecVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.TIMESTAMPSEC.getType()), allocator); + } + + /** + * Instantiate a TimeStampSecVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public TimeStampSecVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, fieldType, allocator); + reader = new TimeStampSecReaderImpl(TimeStampSecVector.this); + } + + /** + * Instantiate a TimeStampSecVector. This doesn't allocate any memory for + * the data in vector. 
+ * + * @param field Field materialized by this vector + * @param allocator allocator for memory management. + */ + public TimeStampSecVector(Field field, BufferAllocator allocator) { + super(field, allocator); + reader = new TimeStampSecReaderImpl(TimeStampSecVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.TIMESTAMPSEC; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableTimeStampSecHolder holder) { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public LocalDateTime getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + final long secs = valueBuffer.getLong((long) index * TYPE_WIDTH); + final long millis = java.util.concurrent.TimeUnit.SECONDS.toMillis(secs); + return DateUtility.getLocalDateTimeFromEpochMilli(millis); + } + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTimeStampSecHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeStampSecHolder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, NullableTimeStampSecHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeStampSecHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeStampSecHolder)} except that it first calls + * handleSafe(index) to handle the case when index is greater than or + * equal to existing value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeStampSecHolder holder) { + handleSafe(index); + set(index, holder); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair consisting of this vector and a newly created + * target vector of the same type. The target is created with the given + * name and with the field type of this vector's field. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + TimeStampSecVector to = new TimeStampSecVector(ref, + field.getFieldType(), allocator); + return new TransferImpl(to); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. 
+ * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeStampSecVector) to); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampVector.java new file mode 100644 index 000000000..d85a793fb --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TimeStampVector.java @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TimeStampVector is an abstract base class for fixed width vectors (8 bytes + * per value) of timestamp values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. 
+ */ +public abstract class TimeStampVector extends BaseFixedWidthVector { + /** + * Width in bytes of a single value; used to compute the byte offset + * of an element in the value buffer. + */ + public static final byte TYPE_WIDTH = 8; + + /** + * Instantiate a TimeStampVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public TimeStampVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a TimeStampVector. This doesn't allocate any memory for + * the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public TimeStampVector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + * @throws IllegalStateException if null checking is enabled + * (NULL_CHECKING_ENABLED) and the element at the given index is null + */ + public long get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getLong((long) index * TYPE_WIDTH); + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Write the given value into the value buffer at the given index. 
+ * The validity buffer is not touched by this method. + * + * @param index position of element + * @param value value of element + */ + protected void setValue(int index, long value) { + valueBuffer.setLong((long) index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. 
+ * + * @param index position of element + * @param value value of element + */ + public void set(int index, long value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Same as {@link #set(int, long)} except that it first calls + * handleSafe(index) to handle the case when index is greater than or + * equal to existing value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, long value) { + handleSafe(index); + set(index, value); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet greater than 0 to store the value as non-null; 0 (or any + * negative number) to mark the element as NULL, in which case + * value is ignored + * @param value element value + */ + public void set(int index, int isSet, long value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, long)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet greater than 0 to store the value as non-null; 0 (or any + * negative number) to mark the element as NULL + * @param value element value + */ + public void setSafe(int index, int isSet, long value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static long get(final ArrowBuf buffer, final int index) { + return buffer.getLong((long) index * TYPE_WIDTH); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * {@link TransferPair} for {@link TimeStampVector}. Every operation + * delegates to the enclosing vector (transferTo, splitAndTransferTo) + * or to the target vector (copyFromSafe), with the enclosing vector as + * the source and {@code to} as the destination. + */ + public class TransferImpl implements TransferPair { + TimeStampVector to; + + /** + * Create a transfer pair whose destination is the given vector. + * + * @param to target vector + */ + public TransferImpl(TimeStampVector to) { + this.to = to; + } + + @Override + public TimeStampVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, TimeStampVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TinyIntVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TinyIntVector.java new file mode 100644 index 000000000..f08b0e02f --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TinyIntVector.java @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TinyIntReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableTinyIntHolder; +import org.apache.arrow.vector.holders.TinyIntHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TinyIntVector implements a fixed width (1 byte) vector of + * byte values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public final class TinyIntVector extends BaseFixedWidthVector implements BaseIntVector { + public static final byte TYPE_WIDTH = 1; + private final FieldReader reader; + + /** + * Instantiate a TinyIntVector with a nullable field type derived from + * {@link MinorType#TINYINT}. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TinyIntVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.TINYINT.getType()), allocator); + } + + /** + * Instantiate a TinyIntVector. This doesn't allocate any memory for + * the data in vector. 
+ * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public TinyIntVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a TinyIntVector and create the reader that is returned by + * {@link #getReader()}. This doesn't allocate any memory for + * the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public TinyIntVector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new TinyIntReaderImpl(TinyIntVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return the Field Reader that was created for this vector in the constructor + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType#TINYINT} + */ + @Override + public MinorType getMinorType() { + return MinorType.TINYINT; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + * @throws IllegalStateException if null checking is enabled + * (NULL_CHECKING_ENABLED) and the element at the given index is null + */ + public byte get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getByte(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. 
If element at given index + * is null, holder.isSet will be zero. 
+ * + * @param index position of element + */ + public void get(int index, NullableTinyIntHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getByte(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)} except that the value is returned boxed and a + * null element yields null rather than an IllegalStateException. + * + * @param index position of element + * @return element at given index, or null if the element is null + */ + public Byte getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getByte(index * TYPE_WIDTH); + } + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + /** + * Write the given int value into the value buffer at the given index + * via setByte. The validity buffer is not touched by this method. + */ + private void setValue(int index, int value) { + valueBuffer.setByte(index * TYPE_WIDTH, value); + } + + /** + * Write the given byte value into the value buffer at the given index. + * The validity buffer is not touched by this method. + */ + private void setValue(int index, byte value) { + valueBuffer.setByte(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value and mark it + * as non-null in the validity buffer. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the given value and mark it + * as non-null in the validity buffer. 
+ * + * @param index position of element + * @param value value of element + */ + public void set(int index, byte value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. 
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTinyIntHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * The validity bit for the index is always set, so the element is + * stored as non-null. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TinyIntHolder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it first calls + * handleSafe(index) to handle the case when index is greater than or + * equal to existing value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, byte)} except that it first calls + * handleSafe(index) to handle the case when index is greater than or + * equal to existing value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, byte value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableTinyIntHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTinyIntHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TinyIntHolder)} except that it first calls + * handleSafe(index) to handle the case when index is greater than or + * equal to existing value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TinyIntHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet greater than 0 to store the value as non-null; 0 (or any + * negative number) to mark the element as NULL, in which case + * value is ignored + * @param value element value + */ + public void set(int index, int isSet, byte value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, byte)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet greater than 0 to store the value as non-null; 0 (or any + * negative number) to mark the element as NULL + * @param value element value + */ + public void setSafe(int index, int isSet, byte value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static byte get(final ArrowBuf buffer, final int index) { + return buffer.getByte(index * TYPE_WIDTH); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + /** + * Construct a TransferPair consisting of this vector and a newly created + * target vector of the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * The target is cast to TinyIntVector. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TinyIntVector) to); + } + + /** + * Casts the long value to int and stores it with {@link #setSafe(int, int)}. + */ + @Override + public void setWithPossibleTruncate(int index, long value) { + this.setSafe(index, (int) value); + } + + /** + * Casts the long value to int and stores it with {@link #set(int, int)}, + * i.e. without the handleSafe(index) step of the safe variant. + */ + @Override + public void setUnsafeWithPossibleTruncate(int index, long value) { + this.set(index, (int) value); + } + + /** + * Returns the result of {@link #get(int)} widened to long. + */ + @Override + public long getValueAsLong(int index) { + return this.get(index); + } + + /** + * {@link TransferPair} for {@link TinyIntVector}. 
Every operation + * delegates to the enclosing vector (transferTo, splitAndTransferTo) + * or to the target vector (copyFromSafe), with the enclosing vector as + * the source and {@code to} as the destination. + */ + private class TransferImpl implements TransferPair { + TinyIntVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new TinyIntVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(TinyIntVector to) { + this.to = to; + } + + @Override + public TinyIntVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int 
fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, TinyIntVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java new file mode 100644 index 000000000..60fe2a6a6 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java @@ -0,0 +1,448 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static java.util.Arrays.asList; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.BufferLayout.BufferType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor; +import org.apache.arrow.vector.types.pojo.ArrowType.Binary; +import org.apache.arrow.vector.types.pojo.ArrowType.Bool; +import org.apache.arrow.vector.types.pojo.ArrowType.Date; +import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; +import org.apache.arrow.vector.types.pojo.ArrowType.Duration; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList; +import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; +import org.apache.arrow.vector.types.pojo.ArrowType.Int; +import org.apache.arrow.vector.types.pojo.ArrowType.Interval; +import org.apache.arrow.vector.types.pojo.ArrowType.LargeBinary; +import org.apache.arrow.vector.types.pojo.ArrowType.LargeUtf8; +import org.apache.arrow.vector.types.pojo.ArrowType.Map; +import org.apache.arrow.vector.types.pojo.ArrowType.Null; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct; +import org.apache.arrow.vector.types.pojo.ArrowType.Time; +import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; +import org.apache.arrow.vector.types.pojo.ArrowType.Union; +import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; + +/** + * The buffer layout of vectors for a given type. + * It defines its own buffers followed by the buffers for the children + * if it is a nested type (Struct_, List, Union) + */ +public class TypeLayout { + + /** + * Constructs a new {@TypeLayout} for the given arrowType. 
+ */ + public static TypeLayout getTypeLayout(final ArrowType arrowType) { + TypeLayout layout = arrowType.accept(new ArrowTypeVisitor() { + + @Override + public TypeLayout visit(Int type) { + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(type.getBitWidth())); + } + + @Override + public TypeLayout visit(Union type) { + List vectors; + switch (type.getMode()) { + case Dense: + vectors = asList( + BufferLayout.typeBuffer(), + BufferLayout.offsetBuffer() // offset to find the vector + ); + break; + case Sparse: + vectors = asList( + BufferLayout.typeBuffer() // type of the value at the index or 0 if null + ); + break; + default: + throw new UnsupportedOperationException("Unsupported Union Mode: " + type.getMode()); + } + return new TypeLayout(vectors); + } + + @Override + public TypeLayout visit(Struct type) { + List vectors = asList( + BufferLayout.validityVector() + ); + return new TypeLayout(vectors); + } + + @Override + public TypeLayout visit(Timestamp type) { + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(64)); + } + + @Override + public TypeLayout visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { + List vectors = asList( + BufferLayout.validityVector(), + BufferLayout.offsetBuffer() + ); + return new TypeLayout(vectors); + } + + @Override + public TypeLayout visit(ArrowType.LargeList type) { + List vectors = asList( + BufferLayout.validityVector(), + BufferLayout.largeOffsetBuffer() + ); + return new TypeLayout(vectors); + } + + @Override + public TypeLayout visit(FixedSizeList type) { + List vectors = asList( + BufferLayout.validityVector() + ); + return new TypeLayout(vectors); + } + + @Override + public TypeLayout visit(Map type) { + List vectors = asList( + BufferLayout.validityVector(), + BufferLayout.offsetBuffer() + ); + return new TypeLayout(vectors); + } + + @Override + public TypeLayout visit(FloatingPoint type) { + int bitWidth; + switch (type.getPrecision()) { + case HALF: + bitWidth = 16; + break; + case SINGLE: + 
bitWidth = 32; + break; + case DOUBLE: + bitWidth = 64; + break; + default: + throw new UnsupportedOperationException("Unsupported Precision: " + type.getPrecision()); + } + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(bitWidth)); + } + + @Override + public TypeLayout visit(Decimal type) { + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(type.getBitWidth())); + } + + @Override + public TypeLayout visit(FixedSizeBinary type) { + return newFixedWidthTypeLayout(new BufferLayout(BufferType.DATA, type.getByteWidth() * 8)); + } + + @Override + public TypeLayout visit(Bool type) { + return newFixedWidthTypeLayout(BufferLayout.booleanVector()); + } + + @Override + public TypeLayout visit(Binary type) { + return newVariableWidthTypeLayout(); + } + + @Override + public TypeLayout visit(Utf8 type) { + return newVariableWidthTypeLayout(); + } + + @Override + public TypeLayout visit(LargeUtf8 type) { + return newLargeVariableWidthTypeLayout(); + } + + @Override + public TypeLayout visit(LargeBinary type) { + return newLargeVariableWidthTypeLayout(); + } + + private TypeLayout newVariableWidthTypeLayout() { + return newPrimitiveTypeLayout(BufferLayout.validityVector(), BufferLayout.offsetBuffer(), + BufferLayout.byteVector()); + } + + private TypeLayout newLargeVariableWidthTypeLayout() { + return newPrimitiveTypeLayout(BufferLayout.validityVector(), BufferLayout.largeOffsetBuffer(), + BufferLayout.byteVector()); + } + + private TypeLayout newPrimitiveTypeLayout(BufferLayout... 
vectors) { + return new TypeLayout(asList(vectors)); + } + + public TypeLayout newFixedWidthTypeLayout(BufferLayout dataVector) { + return newPrimitiveTypeLayout(BufferLayout.validityVector(), dataVector); + } + + @Override + public TypeLayout visit(Null type) { + return new TypeLayout(Collections.emptyList()); + } + + @Override + public TypeLayout visit(Date type) { + switch (type.getUnit()) { + case DAY: + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(32)); + case MILLISECOND: + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(64)); + default: + throw new UnsupportedOperationException("Unknown unit " + type.getUnit()); + } + } + + @Override + public TypeLayout visit(Time type) { + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(type.getBitWidth())); + } + + @Override + public TypeLayout visit(Interval type) { + switch (type.getUnit()) { + case DAY_TIME: + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(64)); + case YEAR_MONTH: + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(32)); + case MONTH_DAY_NANO: + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(128)); + default: + throw new UnsupportedOperationException("Unknown unit " + type.getUnit()); + } + } + + @Override + public TypeLayout visit(Duration type) { + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(64)); + } + + }); + return layout; + } + + /** + * Gets the number of {@link BufferLayout}s for the given arrowType. + */ + public static int getTypeBufferCount(final ArrowType arrowType) { + return arrowType.accept(new ArrowTypeVisitor() { + + /** + * All fixed width vectors have a common number of buffers 2: one validity buffer, plus a data buffer. + */ + static final int FIXED_WIDTH_BUFFER_COUNT = 2; + + /** + * All variable width vectors have a common number of buffers 3: a validity buffer, + * an offset buffer, and a data buffer. 
+ */ + static final int VARIABLE_WIDTH_BUFFER_COUNT = 3; + + @Override + public Integer visit(Int type) { + return FIXED_WIDTH_BUFFER_COUNT; + } + + @Override + public Integer visit(Union type) { + switch (type.getMode()) { + case Dense: + // TODO: validate this + return 2; + case Sparse: + // type buffer + return 1; + default: + throw new UnsupportedOperationException("Unsupported Union Mode: " + type.getMode()); + } + } + + @Override + public Integer visit(Struct type) { + // validity buffer + return 1; + } + + @Override + public Integer visit(Timestamp type) { + return FIXED_WIDTH_BUFFER_COUNT; + } + + @Override + public Integer visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { + // validity buffer + offset buffer + return 2; + } + + @Override + public Integer visit(ArrowType.LargeList type) { + // validity buffer + offset buffer + return 2; + } + + @Override + public Integer visit(FixedSizeList type) { + // validity buffer + return 1; + } + + @Override + public Integer visit(Map type) { + // validity buffer + offset buffer + return 2; + } + + @Override + public Integer visit(FloatingPoint type) { + return FIXED_WIDTH_BUFFER_COUNT; + } + + @Override + public Integer visit(Decimal type) { + return FIXED_WIDTH_BUFFER_COUNT; + } + + @Override + public Integer visit(FixedSizeBinary type) { + return FIXED_WIDTH_BUFFER_COUNT; + } + + @Override + public Integer visit(Bool type) { + return FIXED_WIDTH_BUFFER_COUNT; + } + + @Override + public Integer visit(Binary type) { + return VARIABLE_WIDTH_BUFFER_COUNT; + } + + @Override + public Integer visit(Utf8 type) { + return VARIABLE_WIDTH_BUFFER_COUNT; + } + + @Override + public Integer visit(LargeUtf8 type) { + return VARIABLE_WIDTH_BUFFER_COUNT; + } + + @Override + public Integer visit(LargeBinary type) { + return VARIABLE_WIDTH_BUFFER_COUNT; + } + + @Override + public Integer visit(Null type) { + return 0; + } + + @Override + public Integer visit(Date type) { + return FIXED_WIDTH_BUFFER_COUNT; + } + + 
@Override + public Integer visit(Time type) { + return FIXED_WIDTH_BUFFER_COUNT; + } + + @Override + public Integer visit(Interval type) { + return FIXED_WIDTH_BUFFER_COUNT; + } + + @Override + public Integer visit(Duration type) { + return FIXED_WIDTH_BUFFER_COUNT; + } + + }); + } + + private final List bufferLayouts; + + public TypeLayout(List bufferLayouts) { + super(); + this.bufferLayouts = Preconditions.checkNotNull(bufferLayouts); + } + + public TypeLayout(BufferLayout... bufferLayouts) { + this(asList(bufferLayouts)); + } + + /** + * Returns the individual {@linkplain BufferLayout}s of this layout; the list handed back is the internal list, not a copy. + */ + public List getBufferLayouts() { + return bufferLayouts; + } + + /** + * Returns the type of each buffer in this layout, in layout order, as a newly built list. A layout can consist + * of multiple buffers, for example a validity bitmap buffer, a value buffer or + * an offset buffer. + */ + public List getBufferTypes() { + List types = new ArrayList<>(bufferLayouts.size()); + for (BufferLayout vector : bufferLayouts) { + types.add(vector.getType()); + } + return types; + } + + public String toString() { + return bufferLayouts.toString(); + } + + @Override + public int hashCode() { + return bufferLayouts.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + TypeLayout other = (TypeLayout) obj; + return bufferLayouts.equals(other.bufferLayouts); + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt1Vector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt1Vector.java new file mode 100644 index 000000000..bd9a732c1 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt1Vector.java @@ -0,0 +1,368 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.UInt1ReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableUInt1Holder; +import org.apache.arrow.vector.holders.UInt1Holder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.util.ValueVectorUtility; + +/** + * UInt1Vector implements a fixed width (1 byte) vector of + * unsigned integer values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public final class UInt1Vector extends BaseFixedWidthVector implements BaseIntVector { + /** + * The mask to use when promoting the unsigned byte value to an integer. + */ + public static final int PROMOTION_MASK = 0xFF; + + /** + * The maximum 8-bit unsigned integer, held as the byte with the same bit pattern (0xFF).
+ */ + public static final byte MAX_UINT1 = (byte) 0XFF; + + public static final byte TYPE_WIDTH = 1; + private final FieldReader reader; + + public UInt1Vector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.UINT1.getType()), allocator); + } + + public UInt1Vector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + public UInt1Vector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new UInt1ReaderImpl(UInt1Vector.this); + } + + @Override + public FieldReader getReader() { + return reader; + } + + @Override + public MinorType getMinorType() { + return MinorType.UINT1; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

To avoid overflow, the returned type is one step up from the signed + * type. + * + *

This method is mainly meant for integration tests. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static short getNoOverflow(final ArrowBuf buffer, final int index) { + byte b = buffer.getByte(index * TYPE_WIDTH); + return (short) (PROMOTION_MASK & b); + } + + + /** + * Get the element at the given index from the vector as a raw byte (not masked to an unsigned value). + * + * @param index position of element + * @return element at given index; an IllegalStateException is thrown instead if the element is null and null checking is enabled + */ + public byte get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getByte(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * set the state in holder. If the element at the given index + * is null, holder.isSet will be zero and holder.value is left untouched; otherwise holder.isSet is one. + * + * @param index position of element + */ + public void get(int index, NullableUInt1Holder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getByte(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}, except that the value is returned boxed and a null element yields null instead of an exception. + * + * @param index position of element + * @return element at given index, or null if the element is not set + */ + public Byte getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getByte(index * TYPE_WIDTH); + } + } + + /** + * Returns the value stored at index without the potential for overflow.
+ * + * @param index position of element + * @return element at given index + */ + public Short getObjectNoOverflow(int index) { + if (isSet(index) == 0) { + return null; + } else { + return getNoOverflow(valueBuffer, index); + } + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + private void setValue(int index, int value) { + valueBuffer.setByte(index * TYPE_WIDTH, value); + } + + private void setValue(int index, byte value) { + valueBuffer.setByte(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value and mark it as non-null in the validity buffer. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the given value and mark it as non-null in the validity buffer. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, byte value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set (isSet == 0), the element + * at the given index will be null; a negative isSet is rejected with an IllegalArgumentException. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableUInt1Holder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder.
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, UInt1Holder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it first calls handleSafe(index) to handle the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, byte)} except that it first calls handleSafe(index) to handle the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, byte value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableUInt1Holder)} except that it first calls handleSafe(index) to handle the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableUInt1Holder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, UInt1Holder)} except that it first calls handleSafe(index) to handle the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, UInt1Holder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Sets the value at index to value if isSet > 0, otherwise marks the index position + * as invalid/null (the stored byte is not modified in that case).
+ */ + public void set(int index, int isSet, byte value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, byte)} but will reallocate the buffer if index + * is larger than current capacity. + */ + public void setSafe(int index, int isSet, byte value) { + handleSafe(index); + set(index, isSet, value); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((UInt1Vector) to); + } + + @Override + public void setWithPossibleTruncate(int index, long value) { + this.setSafe(index, (int) value); + } + + @Override + public void setUnsafeWithPossibleTruncate(int index, long value) { + this.set(index, (int) value); + } + + @Override + public long getValueAsLong(int index) { + return this.get(index) & PROMOTION_MASK; + } + + @Override + public String toString() { + return ValueVectorUtility.getToString(this, 0, getValueCount(), (v, i) -> v.getObjectNoOverflow(i)); + } + + private class TransferImpl implements TransferPair { + UInt1Vector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new UInt1Vector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(UInt1Vector to) { + this.to = to; + } + + @Override + public UInt1Vector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, UInt1Vector.this); + } + } +}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt2Vector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt2Vector.java new file mode 100644 index 000000000..5c29ab6b3 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt2Vector.java @@ -0,0 +1,346 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.UInt2ReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableUInt2Holder; +import org.apache.arrow.vector.holders.UInt2Holder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.util.ValueVectorUtility; + +/** + * UInt2Vector implements a fixed width (2 bytes) vector of + * unsigned integer values which could be null.
A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public final class UInt2Vector extends BaseFixedWidthVector implements BaseIntVector { + + /** + * The maximum 16-bit unsigned integer, held in a char. + */ + public static final char MAX_UINT2 = (char) 0XFFFF; + + public static final byte TYPE_WIDTH = 2; + private final FieldReader reader; + + public UInt2Vector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.UINT2.getType()), allocator); + } + + public UInt2Vector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + public UInt2Vector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new UInt2ReaderImpl(UInt2Vector.this); + } + + @Override + public FieldReader getReader() { + return reader; + } + + @Override + public MinorType getMinorType() { + return MinorType.UINT2; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

This method is mainly meant for integration tests. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static char get(final ArrowBuf buffer, final int index) { + return buffer.getChar((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector as a char. + * + * @param index position of element + * @return element at given index; an IllegalStateException is thrown instead if the element is null and null checking is enabled + */ + public char get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getChar((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * set the state in holder. If the element at the given index + * is null, holder.isSet will be zero and holder.value is left untouched; otherwise holder.isSet is one. + * + * @param index position of element + */ + public void get(int index, NullableUInt2Holder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getChar((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}, except that the value is returned boxed and a null element yields null instead of an exception. + * + * @param index position of element + * @return element at given index, or null if the element is not set + */ + public Character getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getChar((long) index * TYPE_WIDTH); + } + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + private void setValue(int index, int value) { + valueBuffer.setChar((long) index * TYPE_WIDTH, value); + } + + private void setValue(int index, char value) { + valueBuffer.setChar((long) index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value.
+ * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the given value and mark it as non-null in the validity buffer. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, char value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set (isSet == 0), the element + * at the given index will be null; a negative isSet is rejected with an IllegalArgumentException. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableUInt2Holder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value in the data holder and mark it as non-null. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, UInt2Holder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, char)} except that it first calls handleSafe(index) to handle the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, char value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableUInt2Holder)} except that it first calls handleSafe(index) to handle the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableUInt2Holder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, UInt2Holder)} except that it first calls handleSafe(index) to handle the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, UInt2Holder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Sets the given index to value if isSet is positive, otherwise marks + * the position as invalid/null (the stored char is not modified in that case). + */ + public void set(int index, int isSet, char value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, char)} but will reallocate the buffer if index + * is larger than current capacity.
+ */ + public void setSafe(int index, int isSet, char value) { + handleSafe(index); + set(index, isSet, value); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((UInt2Vector) to); + } + + @Override + public void setWithPossibleTruncate(int index, long value) { + this.setSafe(index, (int) value); + } + + @Override + public void setUnsafeWithPossibleTruncate(int index, long value) { + this.set(index, (int) value); + } + + @Override + public long getValueAsLong(int index) { + return this.get(index); + } + + @Override + public String toString() { + return ValueVectorUtility.getToString(this, 0, getValueCount(), (v, i) -> + v.isNull(i) ?
"null" : Integer.toString(v.get(i) & 0x0000ffff)); /* char widens to a non-negative int, so this prints the unsigned number; the mask only makes that explicit */ + } + + private class TransferImpl implements TransferPair { + UInt2Vector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new UInt2Vector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(UInt2Vector to) { + this.to = to; + } + + @Override + public UInt2Vector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, UInt2Vector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt4Vector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt4Vector.java new file mode 100644 index 000000000..cc954d67d --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt4Vector.java @@ -0,0 +1,340 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.UInt4ReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableUInt4Holder; +import org.apache.arrow.vector.holders.UInt4Holder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.util.ValueVectorUtility; + +/** + * UInt4Vector implements a fixed width (4 bytes) vector of + * unsigned integer values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public final class UInt4Vector extends BaseFixedWidthVector implements BaseIntVector { + + /** + * The mask to use when promoting the unsigned int value to a long int. + */ + public static final long PROMOTION_MASK = 0x00000000FFFFFFFFL; + + /** + * The maximum 32-bit unsigned integer, held as the int with the same bit pattern (-1 when read as signed).
+ */ + public static final int MAX_UINT4 = 0XFFFFFFFF; + + public static final byte TYPE_WIDTH = 4; + private final FieldReader reader; + + public UInt4Vector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.UINT4.getType()), allocator); + } + + public UInt4Vector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + public UInt4Vector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new UInt4ReaderImpl(UInt4Vector.this); + } + + @Override + public FieldReader getReader() { + return reader; + } + + @Override + public MinorType getMinorType() { + return MinorType.UINT4; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

To avoid overflow, the returned type is one step up from the signed + * type. + * + *

This method is mainly meant for integration tests. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static long getNoOverflow(final ArrowBuf buffer, final int index) { + long l = buffer.getInt((long) index * TYPE_WIDTH); + return PROMOTION_MASK & l; + } + + /** + * Get the element at the given index from the vector as a raw int (not masked to an unsigned value). + * + * @param index position of element + * @return element at given index; an IllegalStateException is thrown instead if the element is null and null checking is enabled + */ + public int get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getInt((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * set the state in holder. If the element at the given index + * is null, holder.isSet will be zero and holder.value is left untouched; otherwise holder.isSet is one. + * + * @param index position of element + */ + public void get(int index, NullableUInt4Holder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getInt((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}, except that the value is returned boxed and a null element yields null instead of an exception. + * + * @param index position of element + * @return element at given index, or null if the element is not set + */ + public Integer getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getInt((long) index * TYPE_WIDTH); + } + } + + /** + * Returns the value stored at index widened to a long so the unsigned value cannot overflow, or null if the element is not set.
+ * + * @param index position of element + * @return element at given index + */ + public Long getObjectNoOverflow(int index) { + if (isSet(index) == 0) { + return null; + } else { + return getNoOverflow(valueBuffer, index); + } + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + private void setValue(int index, int value) { + valueBuffer.setInt((long) index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value and mark it as non-null in the validity buffer. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set (isSet == 0), the element + * at the given index will be null; a negative isSet is rejected with an IllegalArgumentException. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableUInt4Holder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value in the data holder and mark it as non-null. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, UInt4Holder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableUInt4Holder)} except that it first calls handleSafe(index) to handle the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableUInt4Holder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, UInt4Holder)} except that it first calls handleSafe(index) to handle the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, UInt4Holder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Sets the value at index to value if isSet > 0, otherwise marks the index position + * as invalid/null (the stored int is not modified in that case). + */ + public void set(int index, int isSet, int value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, int)} but will reallocate the buffer if index + * is larger than the current capacity.
+ */ + public void setSafe(int index, int isSet, int value) { + handleSafe(index); + set(index, isSet, value); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((UInt4Vector) to); + } + + @Override + public void setWithPossibleTruncate(int index, long value) { + this.setSafe(index, (int) value); + } + + @Override + public void setUnsafeWithPossibleTruncate(int index, long value) { + this.set(index, (int) value); + } + + @Override + public long getValueAsLong(int index) { + return this.get(index) & PROMOTION_MASK; + } + + @Override + public String toString() { + return ValueVectorUtility.getToString(this, 0, getValueCount(), (v, i) -> v.getObjectNoOverflow(i)); + } + + private class TransferImpl implements TransferPair { + UInt4Vector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new UInt4Vector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(UInt4Vector to) { + this.to = to; + } + + @Override + public UInt4Vector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, UInt4Vector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt8Vector.java
b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/UInt8Vector.java @@ -0,0 +1,336 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import java.math.BigInteger; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.UInt8ReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableUInt8Holder; +import org.apache.arrow.vector.holders.UInt8Holder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.util.ValueVectorUtility; + +/** + * UInt8Vector implements a fixed width vector (8 bytes) of + * integer values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. 
+ */ +public final class UInt8Vector extends BaseFixedWidthVector implements BaseIntVector { + + /** + * The maximum 64-bit unsigned long integer. + */ + public static final long MAX_UINT8 = 0XFFFFFFFFFFFFFFFFL; + + public static final byte TYPE_WIDTH = 8; + private final FieldReader reader; + + public UInt8Vector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.UINT8.getType()), allocator); + } + + public UInt8Vector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + public UInt8Vector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + reader = new UInt8ReaderImpl(UInt8Vector.this); + } + + @Override + public FieldReader getReader() { + return reader; + } + + @Override + public MinorType getMinorType() { + return MinorType.UINT8; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + private static final BigInteger SAFE_CONVERSION_MASK = new BigInteger("ffffffffffffffff", 16); + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

<p>To avoid overflow, the returned type is one step up from the signed + * type. + * + *
<p>
This method is mainly meant for integration tests. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static BigInteger getNoOverflow(final ArrowBuf buffer, final int index) { + BigInteger l = BigInteger.valueOf(buffer.getLong((long) index * TYPE_WIDTH)); + return SAFE_CONVERSION_MASK.and(l); + } + + + /** + * Get the element at the given index from the vector. Throws if the element is null and null checking is enabled. + * + * @param index position of element + * @return element at given index + */ + public long get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getLong((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableUInt8Holder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}, except that a null element is returned as null instead of throwing. + * + * @param index position of element + * @return element at given index, or null if the element is null + */ + public Long getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getLong((long) index * TYPE_WIDTH); + } + } + + /** + * Returns the value stored at index without the potential for overflow. 
+ * + * @param index position of element + * @return element at given index + */ + public BigInteger getObjectNoOverflow(int index) { + if (isSet(index) == 0) { + return null; + } else { + return getNoOverflow(valueBuffer, index); + } + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + private void setValue(int index, long value) { + valueBuffer.setLong((long) index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, long value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableUInt8Holder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, UInt8Holder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, long)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, long value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableUInt8Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableUInt8Holder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, UInt8Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, UInt8Holder holder) { + handleSafe(index); + set(index, holder); + } + + /** Sets value at index if isSet is positive, otherwise sets the index to invalid/null. */ + public void set(int index, int isSet, long value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, long)} but will reallocate if index is greater than current capacity. 
+ */ + public void setSafe(int index, int isSet, long value) { + handleSafe(index); + set(index, isSet, value); + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((UInt8Vector) to); + } + + @Override + public void setWithPossibleTruncate(int index, long value) { + this.setSafe(index, value); + } + + @Override + public void setUnsafeWithPossibleTruncate(int index, long value) { + this.set(index, value); + } + + @Override + public long getValueAsLong(int index) { + return this.get(index); + } + + @Override + public String toString() { + return ValueVectorUtility.getToString(this, 0, getValueCount(), (v, i) -> v.getObjectNoOverflow(i)); + } + + private class TransferImpl implements TransferPair { + UInt8Vector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new UInt8Vector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(UInt8Vector to) { + this.to = to; + } + + @Override + public UInt8Vector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, UInt8Vector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java new file mode 100644 index 000000000..aa29c2931 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -0,0 
+1,285 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import java.io.Closeable; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.TransferPair; + +/** + * An abstraction that is used to store a sequence of values in an individual column. + * + *

<p>A {@link ValueVector value vector} stores underlying data in-memory in a columnar fashion that is compact and + * efficient. The column whose data is stored, is referred by {@link #getField()}. + * + *
<p>It is important that vector is allocated before attempting to read or write. + * + *
<p>There are a few "rules" around vectors: + * + *
<ul> + *
<li>values need to be written in order (e.g. index 0, 1, 2, 5)</li> + *
<li>null vectors start with all values as null before writing anything</li> + *
<li>for variable width types, the offset vector should be all zeros before writing</li> + *
<li>you must call setValueCount before a vector can be read</li> + *
<li>you should never write to a vector once it has been read.</li> + *
</ul> + * + *
<p>Please note that the current implementation doesn't enforce those rules, hence we may find few places that + * deviate from these rules (e.g. offset vectors in Variable Length and Repeated vector) + * + *
<p>This interface "should" strive to guarantee this order of operation: + *
<blockquote> + * allocate &gt; mutate &gt; setvaluecount &gt; access &gt; clear (or allocate to start the process over). + *
</blockquote>
+ */ +public interface ValueVector extends Closeable, Iterable<ValueVector> { + /** + * Allocate new buffers. ValueVector implements logic to determine how much to allocate. + * + * @throws OutOfMemoryException Thrown if no memory can be allocated. + */ + void allocateNew() throws OutOfMemoryException; + + /** + * Allocates new buffers. ValueVector implements logic to determine how much to allocate. + * + * @return Returns true if allocation was successful. + */ + boolean allocateNewSafe(); + + /** + * Allocate new buffer with double capacity, and copy data into the new buffer. + * Replace vector's buffer with new buffer, and release old one + */ + void reAlloc(); + + BufferAllocator getAllocator(); + + /** + * Set the initial record capacity. + * + * @param numRecords the initial record capacity. + */ + void setInitialCapacity(int numRecords); + + /** + * Returns the maximum number of values that can be stored in this vector instance. + * + * @return the maximum number of values that can be stored in this vector instance. + */ + int getValueCapacity(); + + /** + * Alternative to clear(). Allows use as an AutoCloseable in try-with-resources. + */ + @Override + void close(); + + /** + * Release any owned ArrowBuf and reset the ValueVector to the initial state. If the + * vector has any child vectors, they will also be cleared. + */ + void clear(); + + /** + * Reset the ValueVector to the initial state without releasing any owned ArrowBuf. + * Buffer capacities will remain unchanged and any previous data will be zeroed out. + * This includes buffers for data, validity, offset, etc. If the vector has any + * child vectors, they will also be reset. + */ + void reset(); + + /** + * Get information about how this field is materialized. + * + * @return the field corresponding to this vector + */ + Field getField(); + + MinorType getMinorType(); + + /** + * To transfer quota responsibility. 
+ * + * @param allocator the target allocator + * @return a {@link org.apache.arrow.vector.util.TransferPair transfer pair}, creating a new target vector of + * the same type. + */ + TransferPair getTransferPair(BufferAllocator allocator); + + TransferPair getTransferPair(String ref, BufferAllocator allocator); + + TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack); + + /** + * Makes a new transfer pair used to transfer underlying buffers. + * + * @param target the target for the transfer + * @return a new {@link org.apache.arrow.vector.util.TransferPair transfer pair} that is used to transfer underlying + * buffers into the target vector. + */ + TransferPair makeTransferPair(ValueVector target); + + /** + * Get a reader for this vector. + * + * @return a {@link org.apache.arrow.vector.complex.reader.FieldReader field reader} that supports reading values + * from this vector. + */ + FieldReader getReader(); + + /** + * Get the number of bytes used by this vector. + * + * @return the number of bytes that is used by this vector instance. + */ + int getBufferSize(); + + /** + * Returns the number of bytes that is used by this vector if it holds the given number + * of values. The result will be the same as if setValueCount() were called, followed + * by calling getBufferSize(), but without any of the closing side-effects that setValueCount() + * implies wrt finishing off the population of a vector. Some operations might wish to use + * this to determine how much memory has been used by a vector so far, even though it is + * not finished being populated. + * + * @param valueCount the number of values to assume this vector contains + * @return the buffer size if this vector is holding valueCount values + */ + int getBufferSizeFor(int valueCount); + + /** + * Return the underlying buffers associated with this vector. Note that this doesn't impact the reference counts for + * this buffer so it only should be used for in-context access. 
Also note that this buffer changes regularly thus + * external classes shouldn't hold a reference to it (unless they change it). + * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted; + * but the returned array will be the only reference to them + * @return The underlying {@link ArrowBuf buffers} that is used by this vector instance. + */ + ArrowBuf[] getBuffers(boolean clear); + + /** + * Gets the underlying buffer associated with validity vector. + * + * @return buffer + */ + ArrowBuf getValidityBuffer(); + + /** + * Gets the underlying buffer associated with data vector. + * + * @return buffer + */ + ArrowBuf getDataBuffer(); + + /** + * Gets the underlying buffer associated with offset vector. + * + * @return buffer + */ + ArrowBuf getOffsetBuffer(); + + /** + * Gets the number of values. + * + * @return number of values in the vector + */ + int getValueCount(); + + /** + * Set number of values in the vector. + */ + void setValueCount(int valueCount); + + /** + * Get friendly type object from the vector. + * + * @param index index of object to get + * @return friendly type object + */ + Object getObject(int index); + + /** + * Returns number of null elements in the vector. + * + * @return number of null elements + */ + int getNullCount(); + + /** + * Check whether an element in the vector is null. + * + * @param index index to check for null + * @return true if element is null + */ + boolean isNull(int index); + + /** + * Returns hashCode of element in index with the default hasher. + */ + int hashCode(int index); + + /** + * Returns hashCode of element in index with the given hasher. + */ + int hashCode(int index, ArrowBufHasher hasher); + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector. 
+ * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + void copyFrom(int fromIndex, int thisIndex, ValueVector from); + + /** + * Same as {@link #copyFrom(int, int, ValueVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + void copyFromSafe(int fromIndex, int thisIndex, ValueVector from); + + /** + * Accept a generic {@link VectorVisitor} and return the result. + * @param <OUT> the output result type. + * @param <IN> the input data together with visitor. + */ + <OUT, IN> OUT accept(VectorVisitor<OUT, IN> visitor, IN value); + + /** + * Gets the name of the vector. + * @return the name of the vector. + */ + String getName(); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java new file mode 100644 index 000000000..798d30fe4 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.VarBinaryReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableVarBinaryHolder; +import org.apache.arrow.vector.holders.VarBinaryHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * VarBinaryVector implements a variable width vector of binary + * values which could be NULL. A validity buffer (bit vector) is maintained + * to track which elements in the vector are null. + */ +public final class VarBinaryVector extends BaseVariableWidthVector { + private final FieldReader reader; + + /** + * Instantiate a VarBinaryVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public VarBinaryVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.VARBINARY.getType()), allocator); + } + + /** + * Instantiate a VarBinaryVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public VarBinaryVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a VarBinaryVector. This doesn't allocate any memory for + * the data in vector. 
+ * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public VarBinaryVector(Field field, BufferAllocator allocator) { + super(field, allocator); + reader = new VarBinaryReaderImpl(VarBinaryVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.VARBINARY; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the variable length element at specified index as byte array. + * + * @param index position of element to get + * @return array of bytes for non-null element, null for a null element when null checking is enabled + */ + public byte[] get(int index) { + assert index >= 0; + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + return null; + } + final int startOffset = getStartOffset(index); + final int dataLength = + offsetBuffer.getInt((long) (index + 1) * OFFSET_WIDTH) - startOffset; + final byte[] result = new byte[dataLength]; + valueBuffer.getBytes(startOffset, result, 0, dataLength); + return result; + } + + /** + * Get the variable length element at specified index as byte array; delegates to {@link #get(int)}. + * + * @param index position of element to get + * @return byte array for non-null element, null otherwise + */ + public byte[] getObject(int index) { + return get(index); + } + + /** + * Get the variable length element at specified index and sets the state + * in provided holder. 
+ * + * @param index position of element to get + * @param holder data holder to be populated by this function + */ + public void get(int index, NullableVarBinaryHolder holder) { + assert index >= 0; + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.start = getStartOffset(index); + holder.end = offsetBuffer.getInt((long) (index + 1) * OFFSET_WIDTH); + holder.buffer = valueBuffer; + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, VarBinaryHolder holder) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final int dataLength = holder.end - holder.start; + final int startOffset = getStartOffset(index); + offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, VarBinaryHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. 
+ */ + public void setSafe(int index, VarBinaryHolder holder) { + assert index >= 0; + final int dataLength = holder.end - holder.start; + handleSafe(index, dataLength); + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final int startOffset = getStartOffset(index); + offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, NullableVarBinaryHolder holder) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); + final int startOffset = getStartOffset(index); + if (holder.isSet != 0) { + final int dataLength = holder.end - holder.start; + offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + } else { + offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset); + } + lastSet = index; + } + + /** + * Same as {@link #set(int, NullableVarBinaryHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. 
+ */ + public void setSafe(int index, NullableVarBinaryHolder holder) { + assert index >= 0; + if (holder.isSet != 0) { + final int dataLength = holder.end - holder.start; + handleSafe(index, dataLength); + fillHoles(index); + final int startOffset = getStartOffset(index); + offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + } else { + fillEmpties(index + 1); + } + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); + lastSet = index; + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. 
+ * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((VarBinaryVector) to); + } + + private class TransferImpl implements TransferPair { + VarBinaryVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new VarBinaryVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(VarBinaryVector to) { + this.to = to; + } + + @Override + public VarBinaryVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, VarBinaryVector.this); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java new file mode 100644 index 000000000..e725e2d28 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java @@ -0,0 +1,331 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.complex.impl.VarCharReaderImpl;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.holders.NullableVarCharHolder;
+import org.apache.arrow.vector.holders.VarCharHolder;
+import org.apache.arrow.vector.types.Types.MinorType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.util.Text;
+import org.apache.arrow.vector.util.TransferPair;
+
+/**
+ * VarCharVector implements a variable width vector of VARCHAR
+ * values which could be NULL. A validity buffer (bit vector) is maintained
+ * to track which elements in the vector are null.
+ */
+public final class VarCharVector extends BaseVariableWidthVector {
+  private final FieldReader reader;
+
+  /**
+   * Instantiate a VarCharVector. This doesn't allocate any memory for
+   * the data in vector.
+   * @param name name of the vector
+   * @param allocator allocator for memory management.
+   */
+  public VarCharVector(String name, BufferAllocator allocator) {
+    this(name, FieldType.nullable(MinorType.VARCHAR.getType()), allocator);
+  }
+
+  /**
+   * Instantiate a VarCharVector. This doesn't allocate any memory for
+   * the data in vector.
+   * @param name name of the vector
+   * @param fieldType type of Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public VarCharVector(String name, FieldType fieldType, BufferAllocator allocator) {
+    this(new Field(name, fieldType, null), allocator);
+  }
+
+  /**
+   * Instantiate a VarCharVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param field field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public VarCharVector(Field field, BufferAllocator allocator) {
+    super(field, allocator);
+    reader = new VarCharReaderImpl(VarCharVector.this);
+  }
+
+  /**
+   * Get a reader that supports reading values from this vector.
+   * @return Field Reader for this vector
+   */
+  @Override
+  public FieldReader getReader() {
+    return reader;
+  }
+
+  /**
+   * Get minor type for this vector. The vector holds values belonging
+   * to a particular type.
+   * @return {@link org.apache.arrow.vector.types.Types.MinorType}
+   */
+  @Override
+  public MinorType getMinorType() {
+    return MinorType.VARCHAR;
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value retrieval methods                        |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Get the variable length element at specified index as byte array.
+   *
+   * @param index   position of element to get
+   * @return array of bytes for non-null element, null otherwise
+   */
+  public byte[] get(int index) {
+    assert index >= 0;
+    if (NULL_CHECKING_ENABLED && isSet(index) == 0) {
+      return null;
+    }
+    final int startOffset = getStartOffset(index);
+    final int dataLength =
+            offsetBuffer.getInt((long) (index + 1) * OFFSET_WIDTH) - startOffset;
+    final byte[] result = new byte[dataLength];
+    valueBuffer.getBytes(startOffset, result, 0, dataLength);
+    return result;
+  }
+
+  /**
+   * Get the variable length element at specified index as Text.
+   *
+   * @param index   position of element to get
+   * @return Text object for non-null element, null otherwise
+   */
+  public Text getObject(int index) {
+    byte[] b = get(index);
+    if (b == null) {
+      return null;
+    } else {
+      return new Text(b);
+    }
+  }
+
+  /**
+   * Get the variable length element at specified index and set the state
+   * in provided holder. The end offset is read with a long byte index so it cannot overflow.
+   *
+   * @param index   position of element to get
+   * @param holder  data holder to be populated by this function
+   */
+  public void get(int index, NullableVarCharHolder holder) {
+    assert index >= 0;
+    if (isSet(index) == 0) {
+      holder.isSet = 0;
+      return;
+    }
+    holder.isSet = 1;
+    holder.start = getStartOffset(index);
+    holder.end = offsetBuffer.getInt((long) (index + 1) * OFFSET_WIDTH);
+    holder.buffer = valueBuffer;
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value setter methods                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Set the variable length element at the specified index to the data
+   * buffer supplied in the holder.
+   *
+   * @param index   position of the element to set
+   * @param holder  holder that carries data buffer.
+   */
+  public void set(int index, VarCharHolder holder) {
+    assert index >= 0;
+    fillHoles(index);
+    BitVectorHelper.setBit(validityBuffer, index);
+    final int dataLength = holder.end - holder.start;
+    final int startOffset = getStartOffset(index);
+    offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength);
+    valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength);
+    lastSet = index;
+  }
+
+  /**
+   * Same as {@link #set(int, VarCharHolder)} except that it handles the
+   * case where index and length of new element are beyond the existing
+   * capacity of the vector.
+   *
+   * @param index   position of the element to set
+   * @param holder  holder that carries data buffer.
+   */
+  public void setSafe(int index, VarCharHolder holder) {
+    assert index >= 0;
+    final int dataLength = holder.end - holder.start;
+    handleSafe(index, dataLength);
+    fillHoles(index);
+
+    BitVectorHelper.setBit(validityBuffer, index);
+    final int startOffset = getStartOffset(index);
+    offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength);
+    valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength);
+    lastSet = index;
+  }
+
+  /**
+   * Set the variable length element at the specified index to the data
+   * buffer supplied in the holder. A holder with isSet == 0 stores a null (zero-length) element.
+   *
+   * @param index   position of the element to set
+   * @param holder  holder that carries data buffer.
+   */
+  public void set(int index, NullableVarCharHolder holder) {
+    assert index >= 0;
+    fillHoles(index);
+    BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet);
+    final int startOffset = getStartOffset(index);
+    if (holder.isSet != 0) {
+      final int dataLength = holder.end - holder.start;
+      offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength);
+      valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength);
+    } else {
+      offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset);
+    }
+    lastSet = index;
+  }
+
+  /**
+   * Same as {@link #set(int, NullableVarCharHolder)} except that it handles the
+   * case where index and length of new element are beyond the existing
+   * capacity of the vector.
+   *
+   * @param index   position of the element to set
+   * @param holder  holder that carries data buffer.
+   */
+  public void setSafe(int index, NullableVarCharHolder holder) {
+    assert index >= 0;
+    if (holder.isSet != 0) {
+      final int dataLength = holder.end - holder.start;
+      handleSafe(index, dataLength);
+      fillHoles(index);
+      final int startOffset = getStartOffset(index);
+      offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength);
+      valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength);
+    } else {
+      fillEmpties(index + 1);
+    }
+    BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet);
+    lastSet = index;
+  }
+
+  /**
+   * Set the variable length element at the specified index to the
+   * content in supplied Text.
+   *
+   * @param index   position of the element to set
+   * @param text    Text object with data
+   */
+  public void set(int index, Text text) {
+    set(index, text.getBytes(), 0, text.getLength());
+  }
+
+  /**
+   * Same as {@link #set(int, Text)} except that it handles the
+   * case where index and length of new element are beyond the existing
+   * capacity of the vector.
+   *
+   * @param index   position of the element to set.
+   * @param text    Text object with data
+   */
+  public void setSafe(int index, Text text) {
+    setSafe(index, text.getBytes(), 0, text.getLength());
+  }
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |                      vector transfer                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+  /**
+   * Construct a TransferPair comprising of this and a target vector of
+   * the same type.
+   *
+   * @param ref name of the target vector
+   * @param allocator allocator for the target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
+    return new TransferImpl(ref, allocator);
+  }
+
+  /**
+   * Construct a TransferPair with a desired target vector of the same type.
+   *
+   * @param to target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair makeTransferPair(ValueVector to) {
+    return new TransferImpl((VarCharVector) to);
+  }
+
+  private class TransferImpl implements TransferPair {
+    VarCharVector to;
+
+    public TransferImpl(String ref, BufferAllocator allocator) {
+      to = new VarCharVector(ref, field.getFieldType(), allocator);
+    }
+
+    public TransferImpl(VarCharVector to) {
+      this.to = to;
+    }
+
+    @Override
+    public VarCharVector getTo() {
+      return to;
+    }
+
+    @Override
+    public void transfer() {
+      transferTo(to);
+    }
+
+    @Override
+    public void splitAndTransfer(int startIndex, int length) {
+      splitAndTransferTo(startIndex, length, to);
+    }
+
+    @Override
+    public void copyValueSafe(int fromIndex, int toIndex) {
+      to.copyFromSafe(fromIndex, toIndex, VarCharVector.this);
+    }
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java
new file mode 100644
index 000000000..f6b8364e3
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+/**
+ * Interface for vectors that contain variable width members (e.g. Strings, Lists, etc).
+ */
+public interface VariableWidthVector extends ElementAddressableVector, DensityAwareVector {
+
+  /**
+   * Allocate a new memory space for this vector. Must be called prior to using the ValueVector.
+   *
+   * @param totalBytes   Desired size of the underlying data buffer.
+   * @param valueCount   Number of values in the vector.
+   */
+  void allocateNew(long totalBytes, int valueCount);
+
+  /**
+   * Allocate a new memory space for this vector. Must be called prior to using the ValueVector.
+   * The initial size in bytes is either the default or reused from the previous allocation.
+   *
+   * @param valueCount   Number of values in the vector.
+   */
+  void allocateNew(int valueCount);
+
+  /**
+   * Provide the maximum amount of variable width bytes that can be stored in this vector.
+   *
+   * @return the byte capacity of this vector
+   */
+  int getByteCapacity();
+
+  /**
+   * Provide the number of bytes contained in the valueBuffer.
+   * @return the number of bytes in valueBuffer.
+   */
+  int sizeOfValueBuffer();
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorDefinitionSetter.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorDefinitionSetter.java
new file mode 100644
index 000000000..39804ee41
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorDefinitionSetter.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+/**
+ * Interface for marking the value at a specific index as defined/valid on a vector.
+ */
+public interface VectorDefinitionSetter {
+
+  void setIndexDefined(int index);
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java
new file mode 100644
index 000000000..ed5f3aef1
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.apache.arrow.util.Preconditions.checkArgument;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.util.Collections2;
+import org.apache.arrow.vector.compression.CompressionCodec;
+import org.apache.arrow.vector.compression.CompressionUtil;
+import org.apache.arrow.vector.compression.NoCompressionCodec;
+import org.apache.arrow.vector.ipc.message.ArrowFieldNode;
+import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
+import org.apache.arrow.vector.types.pojo.Field;
+
+/**
+ * Loads the buffers of an {@link ArrowRecordBatch} into the vectors of a {@link VectorSchemaRoot}.
+ */
+public class VectorLoader {
+
+  private final VectorSchemaRoot root;
+
+  private final CompressionCodec.Factory factory;
+
+  /**
+   * A flag indicating if decompression is needed.
+   * This will affect the behavior of releasing buffers.
+   */
+  private boolean decompressionNeeded;
+
+  /**
+   * Construct with a root to load and will create children in root based on schema.
+   *
+   * @param root the root to add vectors to based on schema
+   */
+  public VectorLoader(VectorSchemaRoot root) {
+    this(root, NoCompressionCodec.Factory.INSTANCE);
+  }
+
+  /**
+   * Construct with a root to load and will create children in root based on schema.
+   *
+   * @param root the root to add vectors to based on schema.
+   * @param factory the factory to create codec.
+   */
+  public VectorLoader(VectorSchemaRoot root, CompressionCodec.Factory factory) {
+    this.root = root;
+    this.factory = factory;
+  }
+
+  /**
+   * Loads the record batch in the vectors.
+   * Will not close the record batch; throws IllegalArgumentException if nodes or buffers are left over.
+   *
+   * @param recordBatch the batch to load
+   */
+  public void load(ArrowRecordBatch recordBatch) {
+    Iterator<ArrowBuf> buffers = recordBatch.getBuffers().iterator();
+    Iterator<ArrowFieldNode> nodes = recordBatch.getNodes().iterator();
+    CompressionUtil.CodecType codecType =
+        CompressionUtil.CodecType.fromCompressionType(recordBatch.getBodyCompression().getCodec());
+    decompressionNeeded = codecType != CompressionUtil.CodecType.NO_COMPRESSION;
+    CompressionCodec codec = decompressionNeeded ? factory.createCodec(codecType) : NoCompressionCodec.INSTANCE;
+    for (FieldVector fieldVector : root.getFieldVectors()) {
+      loadBuffers(fieldVector, fieldVector.getField(), buffers, nodes, codec);
+    }
+    root.setRowCount(recordBatch.getLength());
+    if (nodes.hasNext() || buffers.hasNext()) {
+      throw new IllegalArgumentException("not all nodes and buffers were consumed. nodes: " +
+          Collections2.toString(nodes) + " buffers: " + Collections2.toString(buffers));
+    }
+  }
+
+  private void loadBuffers(
+      FieldVector vector,
+      Field field,
+      Iterator<ArrowBuf> buffers,
+      Iterator<ArrowFieldNode> nodes,
+      CompressionCodec codec) {
+    checkArgument(nodes.hasNext(), "no more field nodes for field %s and vector %s", field, vector);
+    ArrowFieldNode fieldNode = nodes.next();
+    int bufferLayoutCount = TypeLayout.getTypeBufferCount(field.getType());
+    List<ArrowBuf> ownBuffers = new ArrayList<>(bufferLayoutCount);
+    for (int j = 0; j < bufferLayoutCount; j++) {
+      ArrowBuf nextBuf = buffers.next();
+      // for vectors without nulls, the buffer is empty, so there is no need to decompress it.
+      ArrowBuf bufferToAdd = nextBuf.writerIndex() > 0 ? codec.decompress(vector.getAllocator(), nextBuf) : nextBuf;
+      ownBuffers.add(bufferToAdd);
+      if (decompressionNeeded) {
+        // decompression performed
+        nextBuf.getReferenceManager().retain();
+      }
+    }
+    try {
+      vector.loadFieldBuffers(fieldNode, ownBuffers);
+      if (decompressionNeeded) {
+        for (ArrowBuf buf : ownBuffers) {
+          buf.close();
+        }
+      }
+    } catch (RuntimeException e) {
+      throw new IllegalArgumentException("Could not load buffers for field " +
+          field + ". error message: " + e.getMessage(), e);
+    }
+    List<Field> children = field.getChildren();
+    if (children.size() > 0) {
+      List<FieldVector> childrenFromFields = vector.getChildrenFromFields();
+      checkArgument(children.size() == childrenFromFields.size(),
+          "should have as many children as in the schema: found %s expected %s",
+          childrenFromFields.size(), children.size());
+      for (int i = 0; i < childrenFromFields.size(); i++) {
+        Field child = children.get(i);
+        FieldVector fieldVector = childrenFromFields.get(i);
+        loadBuffers(fieldVector, child, buffers, nodes, codec);
+      }
+    }
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java
new file mode 100644
index 000000000..623c77317
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java
@@ -0,0 +1,429 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.util.AutoCloseables;
+import org.apache.arrow.util.Preconditions;
+import org.apache.arrow.vector.compare.ApproxEqualsVisitor;
+import org.apache.arrow.vector.compare.Range;
+import org.apache.arrow.vector.compare.VectorEqualsVisitor;
+import org.apache.arrow.vector.compare.VectorValueEqualizer;
+import org.apache.arrow.vector.compare.util.ValueEpsilonEqualizers;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.apache.arrow.vector.util.TransferPair;
+
+/**
+ * Holder for a set of vectors to be loaded/unloaded.
+ * A VectorSchemaRoot is a container that can hold batches, batches flow through VectorSchemaRoot
+ * as part of a pipeline. Note this is different from other implementations (i.e. in C++ and Python,
+ * a RecordBatch is a collection of equal-length vector instances and was created each time for a new batch).
+ *
+ * The recommended usage for VectorSchemaRoot is creating a single VectorSchemaRoot based on the known
+ * schema and populated data over and over into the same VectorSchemaRoot in a stream of batches rather
+ * than create a new VectorSchemaRoot instance each time (see Flight or ArrowFileWriter for better understanding).
+ * Thus at any one point a VectorSchemaRoot may have data or may have no data (say it was transferred downstream
+ * or not yet populated).
+ */
+public class VectorSchemaRoot implements AutoCloseable {
+
+  private Schema schema;
+  private int rowCount;
+  private final List<FieldVector> fieldVectors;
+  private final Map<Field, FieldVector> fieldVectorsMap = new LinkedHashMap<>();
+
+
+  /**
+   * Constructs new instance containing each of the vectors.
+   */
+  public VectorSchemaRoot(Iterable<FieldVector> vectors) {
+    this(
+        StreamSupport.stream(vectors.spliterator(), false).map(t -> t.getField()).collect(Collectors.toList()),
+        StreamSupport.stream(vectors.spliterator(), false).collect(Collectors.toList())
+        );
+  }
+
+  /**
+   * Constructs a new instance containing the children of parent but not the parent itself.
+   */
+  public VectorSchemaRoot(FieldVector parent) {
+    this(parent.getField().getChildren(), parent.getChildrenFromFields(), parent.getValueCount());
+  }
+
+  /**
+   * Constructs a new instance. The row count is taken from the first vector, or 0 if there are none.
+   *
+   * @param fields The types of each vector.
+   * @param fieldVectors The data vectors (must be equal in size to fields).
+   */
+  public VectorSchemaRoot(List<Field> fields, List<FieldVector> fieldVectors) {
+    this(new Schema(fields), fieldVectors, fieldVectors.size() == 0 ? 0 : fieldVectors.get(0).getValueCount());
+  }
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param fields The types of each vector.
+   * @param fieldVectors The data vectors (must be equal in size to fields).
+   * @param rowCount The number of rows contained.
+   */
+  public VectorSchemaRoot(List<Field> fields, List<FieldVector> fieldVectors, int rowCount) {
+    this(new Schema(fields), fieldVectors, rowCount);
+  }
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param schema The schema for the vectors.
+   * @param fieldVectors The data vectors.
+   * @param rowCount  The number of rows
+   */
+  public VectorSchemaRoot(Schema schema, List<FieldVector> fieldVectors, int rowCount) {
+    if (schema.getFields().size() != fieldVectors.size()) {
+      throw new IllegalArgumentException("Fields must match field vectors. Found " +
+          fieldVectors.size() + " vectors and " + schema.getFields().size() + " fields");
+    }
+    this.schema = schema;
+    this.rowCount = rowCount;
+    this.fieldVectors = fieldVectors;
+    for (int i = 0; i < schema.getFields().size(); ++i) {
+      Field field = schema.getFields().get(i);
+      FieldVector vector = fieldVectors.get(i);
+      fieldVectorsMap.put(field, vector);
+    }
+  }
+
+  /**
+   * Creates a new set of empty vectors corresponding to the given schema.
+   */
+  public static VectorSchemaRoot create(Schema schema, BufferAllocator allocator) {
+    List<FieldVector> fieldVectors = new ArrayList<>();
+    for (Field field : schema.getFields()) {
+      FieldVector vector = field.createVector(allocator);
+      fieldVectors.add(vector);
+    }
+    if (fieldVectors.size() != schema.getFields().size()) {
+      throw new IllegalArgumentException("The root vector did not create the right number of children. found " +
+          fieldVectors.size() + " expected " + schema.getFields().size());
+    }
+    return new VectorSchemaRoot(schema, fieldVectors, 0);
+  }
+
+  /** Constructs a new instance from vectors. */
+  public static VectorSchemaRoot of(FieldVector... vectors) {
+    return new VectorSchemaRoot(Arrays.stream(vectors).collect(Collectors.toList()));
+  }
+
+  /**
+   * Do an adaptive allocation of each vector for memory purposes. Sizes will be based on previously
+   * defined initial allocation for each vector (and subsequent size learned).
+   */
+  public void allocateNew() {
+    for (FieldVector v : fieldVectors) {
+      v.allocateNew();
+    }
+    rowCount = 0;
+  }
+
+  /**
+   * Release all the memory for each vector held in this root. This DOES NOT remove vectors from the container.
+   */
+  public void clear() {
+    for (FieldVector v : fieldVectors) {
+      v.clear();
+    }
+    rowCount = 0;
+  }
+
+  public List<FieldVector> getFieldVectors() {
+    return fieldVectors.stream().collect(Collectors.toList());
+  }
+
+  /**
+   * Gets a vector by name.
+   *
+   * If name occurs multiple times this returns the first inserted entry for name; null if there is none.
+   */
+  public FieldVector getVector(String name) {
+    for (Map.Entry<Field, FieldVector> entry: fieldVectorsMap.entrySet()) {
+      if (entry.getKey().getName().equals(name)) {
+        return entry.getValue();
+      }
+    }
+    return null;
+  }
+
+  public FieldVector getVector(Field field) {
+    return fieldVectorsMap.get(field);
+  }
+
+  public FieldVector getVector(int index) {
+    Preconditions.checkArgument(index >= 0 && index < fieldVectors.size());
+    return fieldVectors.get(index);
+  }
+
+  /**
+   * Add vector to the record batch, producing a new VectorSchemaRoot.
+   * @param index field index
+   * @param vector vector to be added.
+   * @return out VectorSchemaRoot with vector added
+   */
+  public VectorSchemaRoot addVector(int index, FieldVector vector) {
+    Preconditions.checkNotNull(vector);
+    Preconditions.checkArgument(index >= 0 && index < fieldVectors.size());
+    List<FieldVector> newVectors = new ArrayList<>();
+    for (int i = 0; i < fieldVectors.size(); i++) {
+      if (i == index) {
+        newVectors.add(vector);
+      }
+      newVectors.add(fieldVectors.get(i));
+    }
+    return new VectorSchemaRoot(newVectors);
+  }
+
+  /**
+   * Remove vector from the record batch, producing a new VectorSchemaRoot.
+   * @param index field index
+   * @return out VectorSchemaRoot with vector removed
+   */
+  public VectorSchemaRoot removeVector(int index) {
+    Preconditions.checkArgument(index >= 0 && index < fieldVectors.size());
+    List<FieldVector> newVectors = new ArrayList<>();
+    for (int i = 0; i < fieldVectors.size(); i++) {
+      if (i != index) {
+        newVectors.add(fieldVectors.get(i));
+      }
+    }
+    return new VectorSchemaRoot(newVectors);
+  }
+
+  public Schema getSchema() {
+    return schema;
+  }
+
+  public int getRowCount() {
+    return rowCount;
+  }
+
+  /**
+   * Set the row count of all the vectors in this container. Also sets the value
+   * count for each root level contained FieldVector.
+   * @param rowCount Number of records.
+   */
+  public void setRowCount(int rowCount) {
+    this.rowCount = rowCount;
+    for (FieldVector v : getFieldVectors()) {
+      v.setValueCount(rowCount);
+    }
+  }
+
+  @Override
+  public void close() {
+    try {
+      AutoCloseables.close(fieldVectors);
+    } catch (RuntimeException ex) {
+      throw ex;
+    } catch (Exception ex) {
+      // should never happen since FieldVector.close() doesn't throw IOException
+      throw new RuntimeException(ex);
+    }
+  }
+
+  private void printRow(StringBuilder sb, List<Object> row) {
+    boolean first = true;
+    for (Object v : row) {
+      if (first) {
+        first = false;
+      } else {
+        sb.append("\t");
+      }
+      sb.append(v);
+    }
+    sb.append("\n");
+  }
+
+  /**
+   * Returns a tab separated value of vectors (based on their java object representation).
+   */
+  public String contentToTSVString() {
+    StringBuilder sb = new StringBuilder();
+    List<Object> row = new ArrayList<>(schema.getFields().size());
+    for (Field field : schema.getFields()) {
+      row.add(field.getName());
+    }
+    printRow(sb, row);
+    for (int i = 0; i < rowCount; i++) {
+      row.clear();
+      for (FieldVector v : fieldVectors) {
+        row.add(v.getObject(i));
+      }
+      printRow(sb, row);
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Synchronizes the schema from the current vectors.
+   * In some cases, the schema and the actual vector structure may be different.
+   * This can be caused by a promoted writer (For details, please see
+   * {@link org.apache.arrow.vector.complex.impl.PromotableWriter}).
+   * For example, when writing different types of data to a {@link org.apache.arrow.vector.complex.ListVector}
+   * may lead to such a case.
+   * When this happens, this method should be called to bring the schema and vector structure in a synchronized state.
+   * @return true if the schema is updated, false otherwise.
+   */
+  public boolean syncSchema() {
+    List<Field> oldFields = this.schema.getFields();
+    List<Field> newFields = this.fieldVectors.stream().map(ValueVector::getField).collect(Collectors.toList());
+    if (!oldFields.equals(newFields)) {
+      this.schema = new Schema(newFields);
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Slice this root from desired index.
+   * @param index start position of the slice
+   * @return the sliced root
+   */
+  public VectorSchemaRoot slice(int index) {
+    return slice(index, this.rowCount - index);
+  }
+
+  /**
+   * Slice this root at desired index and length. Returns this root itself when the slice covers every row.
+   * @param index start position of the slice
+   * @param length length of the slice
+   * @return the sliced root
+   */
+  public VectorSchemaRoot slice(int index, int length) {
+    Preconditions.checkArgument(index >= 0, "expecting non-negative index");
+    Preconditions.checkArgument(length >= 0, "expecting non-negative length");
+    Preconditions.checkArgument(length <= rowCount - index, // subtraction form: index + length could overflow int
+            "index + length should <= rowCount");
+
+    if (index == 0 && length == rowCount) {
+      return this;
+    }
+
+    List<FieldVector> sliceVectors = fieldVectors.stream().map(v -> {
+      TransferPair transferPair = v.getTransferPair(v.getAllocator());
+      transferPair.splitAndTransfer(index, length);
+      return (FieldVector) transferPair.getTo();
+    }).collect(Collectors.toList());
+
+    return new VectorSchemaRoot(sliceVectors);
+  }
+
+  /**
+   * Determine if two VectorSchemaRoots are exactly equal.
+   */
+  public boolean equals(VectorSchemaRoot other) {
+    if (other == null) {
+      return false;
+    }
+
+    if (!this.schema.equals(other.schema)) {
+      return false;
+    }
+
+    if (this.rowCount != other.rowCount) {
+      return false;
+    }
+
+    for (int i = 0; i < fieldVectors.size(); i++) {
+      FieldVector vector = fieldVectors.get(i);
+      FieldVector otherVector = other.fieldVectors.get(i);
+      if (!VectorEqualsVisitor.vectorEquals(vector, otherVector)) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  /**
+   * Determine if two VectorSchemaRoots are approximately equal using the given functions to
+   * calculate difference between float/double values.
+   * Note that approx equals are in regards to floating point values, other values are comparing
+   * to exactly equals.
+   *
+   * @param floatDiffFunction function to calculate difference between float values.
+   * @param doubleDiffFunction function to calculate difference between double values.
+   */
+  public boolean approxEquals(
+      VectorSchemaRoot other,
+      VectorValueEqualizer<Float4Vector> floatDiffFunction,
+      VectorValueEqualizer<Float8Vector> doubleDiffFunction) {
+
+    Preconditions.checkNotNull(floatDiffFunction);
+    Preconditions.checkNotNull(doubleDiffFunction);
+
+    if (other == null) {
+      return false;
+    }
+
+    if (!this.schema.equals(other.schema)) {
+      return false;
+    }
+
+    if (this.rowCount != other.rowCount) {
+      return false;
+    }
+
+    Range range = new Range(0, 0, 0);
+    for (int i = 0; i < fieldVectors.size(); i++) {
+      FieldVector vector = fieldVectors.get(i);
+      FieldVector otherVector = other.fieldVectors.get(i);
+      if (vector.getValueCount() != otherVector.getValueCount()) {
+        return false;
+      }
+      ApproxEqualsVisitor visitor =
+          new ApproxEqualsVisitor(vector, otherVector, floatDiffFunction, doubleDiffFunction);
+      range.setLength(vector.getValueCount());
+      if (!visitor.rangeEquals(range)) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  /**
+   * Determine if two VectorSchemaRoots are approximately equal using default functions to
+   * calculate difference between float/double values.
+   */
+  public boolean approxEquals(VectorSchemaRoot other) {
+    VectorValueEqualizer<Float4Vector> floatDiffFunction =
+        new ValueEpsilonEqualizers.Float4EpsilonEqualizer(ApproxEqualsVisitor.DEFAULT_FLOAT_EPSILON);
+    VectorValueEqualizer<Float8Vector> doubleDiffFunction =
+        new ValueEpsilonEqualizers.Float8EpsilonEqualizer(ApproxEqualsVisitor.DEFAULT_DOUBLE_EPSILON);
+    return approxEquals(other, floatDiffFunction, doubleDiffFunction);
+  }
+}
+
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java
new file mode 100644
index 000000000..e2cbf3ec1
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.arrow.vector; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.compression.CompressionCodec; +import org.apache.arrow.vector.compression.CompressionUtil; +import org.apache.arrow.vector.compression.NoCompressionCodec; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; + +/** + * Helper class that handles converting a {@link VectorSchemaRoot} + * to a {@link ArrowRecordBatch}. + */ +public class VectorUnloader { + + private final VectorSchemaRoot root; + private final boolean includeNullCount; + private final CompressionCodec codec; + private final boolean alignBuffers; + + /** + * Constructs a new instance of the given set of vectors. + */ + public VectorUnloader(VectorSchemaRoot root) { + this(root, true, NoCompressionCodec.INSTANCE, true); + } + + /** + * Constructs a new instance. + * + * @param root The set of vectors to serialize to an {@link ArrowRecordBatch}. + * @param includeNullCount Controls whether null count is copied to the {@link ArrowRecordBatch} + * @param alignBuffers Controls if buffers get aligned to 8-byte boundaries. + */ + public VectorUnloader( + VectorSchemaRoot root, boolean includeNullCount, boolean alignBuffers) { + this(root, includeNullCount, NoCompressionCodec.INSTANCE, alignBuffers); + } + + /** + * Constructs a new instance. + * + * @param root The set of vectors to serialize to an {@link ArrowRecordBatch}. + * @param includeNullCount Controls whether null count is copied to the {@link ArrowRecordBatch} + * @param codec the codec for compressing data. If it is null, then no compression is needed. + * @param alignBuffers Controls if buffers get aligned to 8-byte boundaries. 
+ */ + public VectorUnloader( + VectorSchemaRoot root, boolean includeNullCount, CompressionCodec codec, boolean alignBuffers) { + this.root = root; + this.includeNullCount = includeNullCount; + this.codec = codec; + this.alignBuffers = alignBuffers; + } + + /** + * Performs the depth first traversal of the Vectors to create an {@link ArrowRecordBatch} suitable + * for serialization. + */ + public ArrowRecordBatch getRecordBatch() { + List nodes = new ArrayList<>(); + List buffers = new ArrayList<>(); + for (FieldVector vector : root.getFieldVectors()) { + appendNodes(vector, nodes, buffers); + } + return new ArrowRecordBatch( + root.getRowCount(), nodes, buffers, CompressionUtil.createBodyCompression(codec), alignBuffers); + } + + private void appendNodes(FieldVector vector, List nodes, List buffers) { + nodes.add(new ArrowFieldNode(vector.getValueCount(), includeNullCount ? vector.getNullCount() : -1)); + List fieldBuffers = vector.getFieldBuffers(); + int expectedBufferCount = TypeLayout.getTypeBufferCount(vector.getField().getType()); + if (fieldBuffers.size() != expectedBufferCount) { + throw new IllegalArgumentException(String.format( + "wrong number of buffers for field %s in vector %s. found: %s", + vector.getField(), vector.getClass().getSimpleName(), fieldBuffers)); + } + for (ArrowBuf buf : fieldBuffers) { + buffers.add(codec.compress(vector.getAllocator(), buf)); + } + for (FieldVector child : vector.getChildrenFromFields()) { + appendNodes(child, nodes, buffers); + } + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java new file mode 100644 index 000000000..079b5c103 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.TransferPair; + +/** + * A zero length vector of any type. + */ +public final class ZeroVector extends NullVector { + public static final ZeroVector INSTANCE = new ZeroVector(); + + /** + * Instantiate a ZeroVector. + * + * @param name name of the vector + */ + public ZeroVector(String name) { + super(name); + } + + /** + * Instantiate a ZeroVector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector. + */ + public ZeroVector(String name, FieldType fieldType) { + super(name, fieldType); + } + + /** + * Instantiate a ZeroVector. + * + * @param field field materialized by this vector. 
+ */ + public ZeroVector(Field field) { + super(field); + } + + @Deprecated + public ZeroVector() { + } + + @Override + public int getValueCount() { + return 0; + } + + @Override + public void setValueCount(int valueCount) { + } + + @Override + public int getNullCount() { + return 0; + } + + @Override + public boolean isNull(int index) { + throw new IndexOutOfBoundsException(); + } + + @Override + public int hashCode(int index) { + return 0; + } + + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + return ArrowBufPointer.NULL_HASH_CODE; + } + + @Override + public int getValueCapacity() { + return 0; + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return defaultPair; + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return defaultPair; + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return defaultPair; + } + + @Override + public TransferPair makeTransferPair(ValueVector target) { + return defaultPair; + } + + private final TransferPair defaultPair = new TransferPair() { + @Override + public void transfer() { + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + } + + @Override + public ValueVector getTo() { + return ZeroVector.this; + } + + @Override + public void copyValueSafe(int from, int to) { + } + }; +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java new file mode 100644 index 000000000..bcf8c64e0 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.compare; + +import java.util.function.BiFunction; + +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.util.ValueEpsilonEqualizers; + +/** + * Visitor to compare floating point vectors approximately. + */ +public class ApproxEqualsVisitor extends RangeEqualsVisitor { + + /** + * Functions to calculate difference between float/double values. + */ + private final VectorValueEqualizer floatDiffFunction; + private final VectorValueEqualizer doubleDiffFunction; + + /** + * Default epsilons for diff functions. + */ + public static final float DEFAULT_FLOAT_EPSILON = 1.0E-6f; + public static final double DEFAULT_DOUBLE_EPSILON = 1.0E-6; + + /** + * Constructs a new instance with default tolerances. + * @param left left vector + * @param right right vector + */ + public ApproxEqualsVisitor(ValueVector left, ValueVector right) { + this (left, right, DEFAULT_FLOAT_EPSILON, DEFAULT_DOUBLE_EPSILON); + } + + /** + * Constructs a new instance. 
+ * + * @param left left vector + * @param right right vector + * @param floatEpsilon difference for float values + * @param doubleEpsilon difference for double values + */ + public ApproxEqualsVisitor(ValueVector left, ValueVector right, float floatEpsilon, double doubleEpsilon) { + this (left, right, + new ValueEpsilonEqualizers.Float4EpsilonEqualizer(floatEpsilon), + new ValueEpsilonEqualizers.Float8EpsilonEqualizer(doubleEpsilon)); + } + + /** + * Constructs a new instance. + */ + public ApproxEqualsVisitor(ValueVector left, ValueVector right, + VectorValueEqualizer floatDiffFunction, + VectorValueEqualizer doubleDiffFunction) { + this (left, right, floatDiffFunction, doubleDiffFunction, DEFAULT_TYPE_COMPARATOR); + } + + /** + * Constructs a new instance. + * @param left the left vector. + * @param right the right vector. + * @param floatDiffFunction the equalizer for float values. + * @param doubleDiffFunction the equalizer for double values. + * @param typeComparator type comparator to compare vector type. 
+ */ + public ApproxEqualsVisitor(ValueVector left, ValueVector right, + VectorValueEqualizer floatDiffFunction, + VectorValueEqualizer doubleDiffFunction, + BiFunction typeComparator) { + super(left, right, typeComparator); + this.floatDiffFunction = floatDiffFunction; + this.doubleDiffFunction = doubleDiffFunction; + } + + @Override + public Boolean visit(BaseFixedWidthVector left, Range range) { + if (left instanceof Float4Vector) { + if (!validate(left)) { + return false; + } + return float4ApproxEquals(range); + } else if (left instanceof Float8Vector) { + if (!validate(left)) { + return false; + } + return float8ApproxEquals(range); + } else { + return super.visit(left, range); + } + } + + @Override + protected ApproxEqualsVisitor createInnerVisitor( + ValueVector left, ValueVector right, + BiFunction typeComparator) { + return new ApproxEqualsVisitor(left, right, floatDiffFunction.clone(), doubleDiffFunction.clone(), typeComparator); + } + + private boolean float4ApproxEquals(Range range) { + Float4Vector leftVector = (Float4Vector) getLeft(); + Float4Vector rightVector = (Float4Vector) getRight(); + + for (int i = 0; i < range.getLength(); i++) { + int leftIndex = range.getLeftStart() + i; + int rightIndex = range.getRightStart() + i; + + if (!floatDiffFunction.valuesEqual(leftVector, leftIndex, rightVector, rightIndex)) { + return false; + } + } + return true; + } + + private boolean float8ApproxEquals(Range range) { + Float8Vector leftVector = (Float8Vector) getLeft(); + Float8Vector rightVector = (Float8Vector) getRight(); + + for (int i = 0; i < range.getLength(); i++) { + int leftIndex = range.getLeftStart() + i; + int rightIndex = range.getRightStart() + i; + + if (!doubleDiffFunction.valuesEqual(leftVector, leftIndex, rightVector, rightIndex)) { + return false; + } + } + return true; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java 
b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java new file mode 100644 index 000000000..0de99ab01 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.compare; + +/** + * Wrapper for the parameters of comparing a range of values in two vectors. + */ +public class Range { + + /** + * Start position in the left vector. + */ + private int leftStart = -1; + + /** + * Start position in the right vector. + */ + private int rightStart = -1; + + /** + * Length of the range. + */ + private int length = -1; + + + /** + * Constructs a new instance. + */ + public Range() {} + + /** + * Constructs a new instance. 
+ * + * @param leftStart start index in left vector + * @param rightStart start index in right vector + * @param length length of range + */ + public Range(int leftStart, int rightStart, int length) { + this.leftStart = leftStart; + this.rightStart = rightStart; + this.length = length; + } + + public int getLeftStart() { + return leftStart; + } + + public int getRightStart() { + return rightStart; + } + + public int getLength() { + return length; + } + + public Range setLeftStart(int leftStart) { + this.leftStart = leftStart; + return this; + } + + public Range setRightStart(int rightStart) { + this.rightStart = rightStart; + return this; + } + + public Range setLength(int length) { + this.length = length; + return this; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java new file mode 100644 index 000000000..35b4936e3 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -0,0 +1,563 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.compare; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; + +import java.util.List; +import java.util.function.BiFunction; + +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.ExtensionTypeVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.BaseRepeatedValueVector; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.UnionVector; + +/** + * Visitor to compare a range of values for vectors. + */ +public class RangeEqualsVisitor implements VectorVisitor { + private ValueVector left; + private ValueVector right; + + private BiFunction typeComparator; + private boolean typeCompareResult; + + /** + * Default type comparator. + */ + public static final BiFunction DEFAULT_TYPE_COMPARATOR = + (v1, v2) -> new TypeEqualsVisitor(v2).equals(v1); + + /** + * Constructs a new instance with default type comparator. + * @param left left vector + * @param right right vector + */ + public RangeEqualsVisitor(ValueVector left, ValueVector right) { + this (left, right, DEFAULT_TYPE_COMPARATOR); + } + + /** + * Constructs a new instance. + * + * @param left left vector + * @param right right vector + * @param typeComparator type comparator to compare vector type. 
+ */ + public RangeEqualsVisitor( + ValueVector left, + ValueVector right, + BiFunction typeComparator) { + this.left = left; + this.right = right; + this.typeComparator = typeComparator; + + Preconditions.checkArgument(left != null, + "left vector cannot be null"); + Preconditions.checkArgument(right != null, + "right vector cannot be null"); + + // type usually checks only once unless the left vector is changed. + checkType(); + } + + private void checkType() { + if (typeComparator == null || left == right) { + typeCompareResult = true; + } else { + typeCompareResult = typeComparator.apply(left, right); + } + } + + /** + * Validate the passed left vector, if it is changed, reset and check type. + */ + protected boolean validate(ValueVector left) { + if (left != this.left) { + this.left = left; + checkType(); + } + return typeCompareResult; + } + + /** + * Check range equals. + */ + public boolean rangeEquals(Range range) { + if (!typeCompareResult) { + return false; + } + + Preconditions.checkArgument(range.getLeftStart() >= 0, + "leftStart %s must be non negative.", range.getLeftStart()); + Preconditions.checkArgument(range.getRightStart() >= 0, + "rightStart %s must be non negative.", range.getRightStart()); + + Preconditions.checkArgument(range.getRightStart() + range.getLength() <= right.getValueCount(), + "(rightStart + length) %s out of range[0, %s].", 0, right.getValueCount()); + Preconditions.checkArgument(range.getLeftStart() + range.getLength() <= left.getValueCount(), + "(leftStart + length) %s out of range[0, %s].", 0, left.getValueCount()); + + return left.accept(this, range); + } + + public ValueVector getLeft() { + return left; + } + + public ValueVector getRight() { + return right; + } + + @Override + public Boolean visit(BaseFixedWidthVector left, Range range) { + if (!validate(left)) { + return false; + } + return compareBaseFixedWidthVectors(range); + } + + @Override + public Boolean visit(BaseVariableWidthVector left, Range range) { + if 
(!validate(left)) { + return false; + } + return compareBaseVariableWidthVectors(range); + } + + @Override + public Boolean visit(BaseLargeVariableWidthVector left, Range range) { + if (!validate(left)) { + return false; + } + return compareBaseLargeVariableWidthVectors(range); + } + + @Override + public Boolean visit(ListVector left, Range range) { + if (!validate(left)) { + return false; + } + return compareListVectors(range); + } + + @Override + public Boolean visit(FixedSizeListVector left, Range range) { + if (!validate(left)) { + return false; + } + return compareFixedSizeListVectors(range); + } + + @Override + public Boolean visit(LargeListVector left, Range range) { + if (!validate(left)) { + return false; + } + return compareLargeListVectors(range); + } + + @Override + public Boolean visit(NonNullableStructVector left, Range range) { + if (!validate(left)) { + return false; + } + return compareStructVectors(range); + } + + @Override + public Boolean visit(UnionVector left, Range range) { + if (!validate(left)) { + return false; + } + return compareUnionVectors(range); + } + + @Override + public Boolean visit(DenseUnionVector left, Range range) { + if (!validate(left)) { + return false; + } + return compareDenseUnionVectors(range); + } + + @Override + public Boolean visit(NullVector left, Range range) { + if (!validate(left)) { + return false; + } + return true; + } + + @Override + public Boolean visit(ExtensionTypeVector left, Range range) { + if (!(right instanceof ExtensionTypeVector) || !validate(left)) { + return false; + } + ValueVector rightUnderlying = ((ExtensionTypeVector) right).getUnderlyingVector(); + TypeEqualsVisitor typeVisitor = new TypeEqualsVisitor(rightUnderlying); + RangeEqualsVisitor underlyingVisitor = + createInnerVisitor(left.getUnderlyingVector(), rightUnderlying, (l, r) -> typeVisitor.equals(l)); + return underlyingVisitor.rangeEquals(range); + } + + protected RangeEqualsVisitor createInnerVisitor( + ValueVector leftInner, 
ValueVector rightInner, + BiFunction typeComparator) { + return new RangeEqualsVisitor(leftInner, rightInner, typeComparator); + } + + protected boolean compareUnionVectors(Range range) { + UnionVector leftVector = (UnionVector) left; + UnionVector rightVector = (UnionVector) right; + + Range subRange = new Range(0, 0, 1); + for (int i = 0; i < range.getLength(); i++) { + subRange.setLeftStart(range.getLeftStart() + i).setRightStart(range.getRightStart() + i); + ValueVector leftSubVector = leftVector.getVector(range.getLeftStart() + i); + ValueVector rightSubVector = rightVector.getVector(range.getRightStart() + i); + + if (leftSubVector == null || rightSubVector == null) { + if (leftSubVector == rightSubVector) { + continue; + } else { + return false; + } + } + TypeEqualsVisitor typeVisitor = new TypeEqualsVisitor(rightSubVector); + RangeEqualsVisitor visitor = + createInnerVisitor(leftSubVector, rightSubVector, (left, right) -> typeVisitor.equals(left)); + if (!visitor.rangeEquals(subRange)) { + return false; + } + } + return true; + } + + protected boolean compareDenseUnionVectors(Range range) { + DenseUnionVector leftVector = (DenseUnionVector) left; + DenseUnionVector rightVector = (DenseUnionVector) right; + + Range subRange = new Range(0, 0, 1); + for (int i = 0; i < range.getLength(); i++) { + boolean isLeftNull = leftVector.isNull(range.getLeftStart() + i); + boolean isRightNull = rightVector.isNull(range.getRightStart() + i); + + // compare nullabilities + if (isLeftNull || isRightNull) { + if (isLeftNull != isRightNull) { + // exactly one slot is null, unequal + return false; + } else { + // both slots are null, pass this iteration + continue; + } + } + + // compare type ids + byte leftTypeId = leftVector.getTypeId(range.getLeftStart() + i); + byte rightTypeId = rightVector.getTypeId(range.getRightStart() + i); + + if (leftTypeId != rightTypeId) { + return false; + } + + ValueVector leftSubVector = leftVector.getVectorByType(leftTypeId); + ValueVector 
rightSubVector = rightVector.getVectorByType(rightTypeId); + + if (leftSubVector == null || rightSubVector == null) { + if (leftSubVector != rightSubVector) { + // exactly one of the sub-vectors is null, unequal + return false; + } else { + // both sub-vectors are null, pass this iteration + continue; + } + } + + // compare values + int leftOffset = leftVector.getOffset(range.getLeftStart() + i); + int rightOffset = rightVector.getOffset(range.getRightStart() + i); + subRange.setLeftStart(leftOffset).setRightStart(rightOffset); + TypeEqualsVisitor typeVisitor = new TypeEqualsVisitor(rightSubVector); + RangeEqualsVisitor visitor = + createInnerVisitor(leftSubVector, rightSubVector, (left, right) -> typeVisitor.equals(left)); + if (!visitor.rangeEquals(subRange)) { + return false; + } + } + return true; + } + + protected boolean compareStructVectors(Range range) { + NonNullableStructVector leftVector = (NonNullableStructVector) left; + NonNullableStructVector rightVector = (NonNullableStructVector) right; + + List leftChildNames = leftVector.getChildFieldNames(); + if (!leftChildNames.equals(rightVector.getChildFieldNames())) { + return false; + } + + for (String name : leftChildNames) { + RangeEqualsVisitor visitor = + createInnerVisitor(leftVector.getChild(name), rightVector.getChild(name), /*type comparator*/ null); + if (!visitor.rangeEquals(range)) { + return false; + } + } + + return true; + } + + protected boolean compareBaseFixedWidthVectors(Range range) { + BaseFixedWidthVector leftVector = (BaseFixedWidthVector) left; + BaseFixedWidthVector rightVector = (BaseFixedWidthVector) right; + + for (int i = 0; i < range.getLength(); i++) { + int leftIndex = range.getLeftStart() + i; + int rightIndex = range.getRightStart() + i; + + boolean isNull = leftVector.isNull(leftIndex); + + if (isNull != rightVector.isNull(rightIndex)) { + return false; + } + + int typeWidth = leftVector.getTypeWidth(); + if (!isNull) { + int startIndexLeft = typeWidth * leftIndex; + int 
endIndexLeft = typeWidth * (leftIndex + 1); + + int startIndexRight = typeWidth * rightIndex; + int endIndexRight = typeWidth * (rightIndex + 1); + + int ret = ByteFunctionHelpers.equal(leftVector.getDataBuffer(), startIndexLeft, endIndexLeft, + rightVector.getDataBuffer(), startIndexRight, endIndexRight); + + if (ret == 0) { + return false; + } + } + } + return true; + } + + protected boolean compareBaseVariableWidthVectors(Range range) { + BaseVariableWidthVector leftVector = (BaseVariableWidthVector) left; + BaseVariableWidthVector rightVector = (BaseVariableWidthVector) right; + + for (int i = 0; i < range.getLength(); i++) { + int leftIndex = range.getLeftStart() + i; + int rightIndex = range.getRightStart() + i; + + boolean isNull = leftVector.isNull(leftIndex); + if (isNull != rightVector.isNull(rightIndex)) { + return false; + } + + int offsetWidth = BaseVariableWidthVector.OFFSET_WIDTH; + + if (!isNull) { + final int startIndexLeft = leftVector.getOffsetBuffer().getInt(leftIndex * offsetWidth); + final int endIndexLeft = leftVector.getOffsetBuffer().getInt((leftIndex + 1) * offsetWidth); + + final int startIndexRight = rightVector.getOffsetBuffer().getInt(rightIndex * offsetWidth); + final int endIndexRight = rightVector.getOffsetBuffer().getInt((rightIndex + 1) * offsetWidth); + + int ret = ByteFunctionHelpers.equal(leftVector.getDataBuffer(), startIndexLeft, endIndexLeft, + rightVector.getDataBuffer(), startIndexRight, endIndexRight); + + if (ret == 0) { + return false; + } + } + } + return true; + } + + protected boolean compareBaseLargeVariableWidthVectors(Range range) { + BaseLargeVariableWidthVector leftVector = (BaseLargeVariableWidthVector) left; + BaseLargeVariableWidthVector rightVector = (BaseLargeVariableWidthVector) right; + + for (int i = 0; i < range.getLength(); i++) { + int leftIndex = range.getLeftStart() + i; + int rightIndex = range.getRightStart() + i; + + boolean isNull = leftVector.isNull(leftIndex); + if (isNull != 
rightVector.isNull(rightIndex)) { + return false; + } + + int offsetWidth = BaseLargeVariableWidthVector.OFFSET_WIDTH; + + if (!isNull) { + final long startIndexLeft = leftVector.getOffsetBuffer().getLong((long) leftIndex * offsetWidth); + final long endIndexLeft = leftVector.getOffsetBuffer().getLong((long) (leftIndex + 1) * offsetWidth); + + final long startIndexRight = rightVector.getOffsetBuffer().getLong((long) rightIndex * offsetWidth); + final long endIndexRight = rightVector.getOffsetBuffer().getLong((long) (rightIndex + 1) * offsetWidth); + + int ret = ByteFunctionHelpers.equal(leftVector.getDataBuffer(), startIndexLeft, endIndexLeft, + rightVector.getDataBuffer(), startIndexRight, endIndexRight); + + if (ret == 0) { + return false; + } + } + } + return true; + } + + protected boolean compareListVectors(Range range) { + ListVector leftVector = (ListVector) left; + ListVector rightVector = (ListVector) right; + + RangeEqualsVisitor innerVisitor = + createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector(), /*type comparator*/ null); + Range innerRange = new Range(); + + for (int i = 0; i < range.getLength(); i++) { + int leftIndex = range.getLeftStart() + i; + int rightIndex = range.getRightStart() + i; + + boolean isNull = leftVector.isNull(leftIndex); + if (isNull != rightVector.isNull(rightIndex)) { + return false; + } + + int offsetWidth = BaseRepeatedValueVector.OFFSET_WIDTH; + + if (!isNull) { + final int startIndexLeft = leftVector.getOffsetBuffer().getInt(leftIndex * offsetWidth); + final int endIndexLeft = leftVector.getOffsetBuffer().getInt((leftIndex + 1) * offsetWidth); + + final int startIndexRight = rightVector.getOffsetBuffer().getInt(rightIndex * offsetWidth); + final int endIndexRight = rightVector.getOffsetBuffer().getInt((rightIndex + 1) * offsetWidth); + + if ((endIndexLeft - startIndexLeft) != (endIndexRight - startIndexRight)) { + return false; + } + + innerRange = innerRange + .setRightStart(startIndexRight) + 
.setLeftStart(startIndexLeft) + .setLength(endIndexLeft - startIndexLeft); + if (!innerVisitor.rangeEquals(innerRange)) { + return false; + } + } + } + return true; + } + + protected boolean compareFixedSizeListVectors(Range range) { + FixedSizeListVector leftVector = (FixedSizeListVector) left; + FixedSizeListVector rightVector = (FixedSizeListVector) right; + + if (leftVector.getListSize() != rightVector.getListSize()) { + return false; + } + + int listSize = leftVector.getListSize(); + RangeEqualsVisitor innerVisitor = + createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector(), /*type comparator*/ null); + Range innerRange = new Range(0, 0, listSize); + + for (int i = 0; i < range.getLength(); i++) { + int leftIndex = range.getLeftStart() + i; + int rightIndex = range.getRightStart() + i; + + boolean isNull = leftVector.isNull(leftIndex); + if (isNull != rightVector.isNull(rightIndex)) { + return false; + } + + if (!isNull) { + final int startIndexLeft = leftIndex * listSize; + final int endIndexLeft = (leftIndex + 1) * listSize; + + final int startIndexRight = rightIndex * listSize; + final int endIndexRight = (rightIndex + 1) * listSize; + + if ((endIndexLeft - startIndexLeft) != (endIndexRight - startIndexRight)) { + return false; + } + + innerRange = innerRange.setLeftStart(startIndexLeft) + .setRightStart(startIndexRight); + if (!innerVisitor.rangeEquals(innerRange)) { + return false; + } + } + } + return true; + } + + protected boolean compareLargeListVectors(Range range) { + LargeListVector leftVector = (LargeListVector) left; + LargeListVector rightVector = (LargeListVector) right; + + RangeEqualsVisitor innerVisitor = + createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector(), /*type comparator*/ null); + Range innerRange = new Range(); + + for (int i = 0; i < range.getLength(); i++) { + int leftIndex = range.getLeftStart() + i; + int rightIndex = range.getRightStart() + i; + + boolean isNull = 
leftVector.isNull(leftIndex); + if (isNull != rightVector.isNull(rightIndex)) { + return false; + } + + long offsetWidth = LargeListVector.OFFSET_WIDTH; + + if (!isNull) { + final long startIndexLeft = leftVector.getOffsetBuffer().getLong((long) leftIndex * offsetWidth); + final long endIndexLeft = leftVector.getOffsetBuffer().getLong((long) (leftIndex + 1) * offsetWidth); + + final long startIndexRight = rightVector.getOffsetBuffer().getLong((long) rightIndex * offsetWidth); + final long endIndexRight = rightVector.getOffsetBuffer().getLong((long) (rightIndex + 1) * offsetWidth); + + if ((endIndexLeft - startIndexLeft) != (endIndexRight - startIndexRight)) { + return false; + } + + innerRange = innerRange // TODO revisit these casts when long indexing is finished + .setRightStart(checkedCastToInt(startIndexRight)) + .setLeftStart(checkedCastToInt(startIndexLeft)) + .setLength(checkedCastToInt(endIndexLeft - startIndexLeft)); + if (!innerVisitor.rangeEquals(innerRange)) { + return false; + } + } + } + return true; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java new file mode 100644 index 000000000..443ee1f96 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.compare; + +import java.util.List; +import java.util.Objects; + +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.ExtensionTypeVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.types.pojo.Field; + +/** + * Visitor to compare type equals for vectors. + */ +public class TypeEqualsVisitor implements VectorVisitor { + + private final ValueVector right; + + private final boolean checkName; + private final boolean checkMetadata; + + /** + * Construct an instance. + */ + public TypeEqualsVisitor(ValueVector right) { + this (right, true, true); + } + + /** + * Construct an instance. 
+ * @param right right vector + * @param checkName whether checks names + * @param checkMetadata whether checks metadata + */ + public TypeEqualsVisitor(ValueVector right, boolean checkName, boolean checkMetadata) { + this.right = right; + this.checkName = checkName; + this.checkMetadata = checkMetadata; + } + + /** + * Check type equals without passing IN param in VectorVisitor. + */ + public boolean equals(ValueVector left) { + return left.accept(this, null); + } + + @Override + public Boolean visit(BaseFixedWidthVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + + @Override + public Boolean visit(BaseVariableWidthVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + + @Override + public Boolean visit(BaseLargeVariableWidthVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + + @Override + public Boolean visit(ListVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + + @Override + public Boolean visit(FixedSizeListVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + + @Override + public Boolean visit(LargeListVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + + @Override + public Boolean visit(NonNullableStructVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + + @Override + public Boolean visit(UnionVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + + @Override + public Boolean visit(DenseUnionVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + + @Override + public Boolean visit(NullVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + + @Override + public Boolean visit(ExtensionTypeVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + + private 
boolean compareField(Field leftField, Field rightField) { + + if (leftField == rightField) { + return true; + } + + return (!checkName || Objects.equals(leftField.getName(), rightField.getName())) && + Objects.equals(leftField.isNullable(), rightField.isNullable()) && + Objects.equals(leftField.getType(), rightField.getType()) && + Objects.equals(leftField.getDictionary(), rightField.getDictionary()) && + (!checkMetadata || Objects.equals(leftField.getMetadata(), rightField.getMetadata())) && + compareChildren(leftField.getChildren(), rightField.getChildren()); + } + + private boolean compareChildren(List leftChildren, List rightChildren) { + if (leftChildren.size() != rightChildren.size()) { + return false; + } + + for (int i = 0; i < leftChildren.size(); i++) { + if (!compareField(leftChildren.get(i), rightChildren.get(i))) { + return false; + } + } + return true; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorEqualsVisitor.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorEqualsVisitor.java new file mode 100644 index 000000000..390d13854 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorEqualsVisitor.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.compare; + +import static org.apache.arrow.vector.compare.RangeEqualsVisitor.DEFAULT_TYPE_COMPARATOR; + +import java.util.function.BiFunction; + +import org.apache.arrow.vector.ValueVector; + +/** + * Visitor to compare vectors equal. + */ +public class VectorEqualsVisitor { + + /** + * Checks if two vectors are equals with default type comparator. + * @param left the left vector to compare. + * @param right the right vector to compare. + * @return true if the vectors are equal, and false otherwise. + */ + public static boolean vectorEquals(ValueVector left, ValueVector right) { + return vectorEquals(left, right, DEFAULT_TYPE_COMPARATOR); + } + + /** + * Checks if two vectors are equals. + * @param left the left vector to compare. + * @param right the right vector to compare. + * @param typeComparator type comparator to compare vector type. + * @return true if the vectors are equal, and false otherwise. + */ + public static boolean vectorEquals( + ValueVector left, + ValueVector right, + BiFunction typeComparator) { + + if (left.getValueCount() != right.getValueCount()) { + return false; + } + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(left, right, typeComparator); + return visitor.rangeEquals(new Range(0, 0, left.getValueCount())); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorValueEqualizer.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorValueEqualizer.java new file mode 100644 index 000000000..4f9c1a95e --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorValueEqualizer.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.compare; + +import org.apache.arrow.vector.ValueVector; + +/** + * A function to determine if two vectors are equal at specified positions. + * @param the vector type. + */ +public interface VectorValueEqualizer extends Cloneable { + + /** + * Checks if the vectors are equal at the given positions, given that the values + * at both positions are non-null. + * @param vector1 the first vector. + * @param index1 index in the first vector. + * @param vector2 the second vector. + * @param index2 index in the second vector. + * @return true if the two values are considered to be equal, and false otherwise. + */ + boolean valuesEqual(V vector1, int index1, V vector2, int index2); + + /** + * Creates a equalizer of the same type. + * @return the newly created equalizer. 
+ */ + VectorValueEqualizer clone(); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java new file mode 100644 index 000000000..aee090706 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.compare; + +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.ExtensionTypeVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.UnionVector; + +/** + * Generic visitor to visit a {@link org.apache.arrow.vector.ValueVector}. + * @param the output result type. 
+ * @param the input data together with visitor. + */ +public interface VectorVisitor { + + OUT visit(BaseFixedWidthVector left, IN value); + + OUT visit(BaseVariableWidthVector left, IN value); + + OUT visit(BaseLargeVariableWidthVector left, IN value); + + OUT visit(ListVector left, IN value); + + OUT visit(FixedSizeListVector left, IN value); + + OUT visit(LargeListVector left, IN value); + + OUT visit(NonNullableStructVector left, IN value); + + OUT visit(UnionVector left, IN value); + + OUT visit(DenseUnionVector left, IN value); + + OUT visit(NullVector left, IN value); + + OUT visit(ExtensionTypeVector left, IN value); +} + diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/util/ValueEpsilonEqualizers.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/util/ValueEpsilonEqualizers.java new file mode 100644 index 000000000..a7b6a8ca4 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compare/util/ValueEpsilonEqualizers.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.compare.util; + +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.FloatingPointVector; +import org.apache.arrow.vector.compare.VectorValueEqualizer; + +/** + * Vector value equalizers that regard values as equal if their difference + * is within a small threshold (epsilon). + */ +public class ValueEpsilonEqualizers { + + private ValueEpsilonEqualizers() { + + } + + /** + * Difference function for floating point values. + */ + public static class FloatingPointEpsilonEqualizer implements VectorValueEqualizer { + private final double epsilon; + + public FloatingPointEpsilonEqualizer(double epsilon) { + this.epsilon = epsilon; + } + + @Override + public final boolean valuesEqual( + FloatingPointVector vector1, int index1, FloatingPointVector vector2, int index2) { + boolean isNull1 = vector1.isNull(index1); + boolean isNull2 = vector2.isNull(index2); + + if (isNull1 || isNull2) { + return isNull1 == isNull2; + } + + double d1 = vector1.getValueAsDouble(index1); + double d2 = vector2.getValueAsDouble(index2); + + if (Double.isNaN(d1)) { + return Double.isNaN(d2); + } + if (Double.isInfinite(d1)) { + return Double.isInfinite(d2) && Math.signum(d1) == Math.signum(d2); + } + + return Math.abs(d1 - d2) <= epsilon; + } + + @Override + public VectorValueEqualizer clone() { + return new FloatingPointEpsilonEqualizer(epsilon); + } + } + + /** + * Difference function for float values. 
+ */ + public static class Float4EpsilonEqualizer implements VectorValueEqualizer { + private final float epsilon; + + public Float4EpsilonEqualizer(float epsilon) { + this.epsilon = epsilon; + } + + @Override + public final boolean valuesEqual(Float4Vector vector1, int index1, Float4Vector vector2, int index2) { + boolean isNull1 = vector1.isNull(index1); + boolean isNull2 = vector2.isNull(index2); + + if (isNull1 || isNull2) { + return isNull1 == isNull2; + } + + float f1 = vector1.get(index1); + float f2 = vector2.get(index2); + + if (Float.isNaN(f1)) { + return Float.isNaN(f2); + } + if (Float.isInfinite(f1)) { + return Float.isInfinite(f2) && Math.signum(f1) == Math.signum(f2); + } + + return Math.abs(f1 - f2) <= epsilon; + } + + @Override + public VectorValueEqualizer clone() { + return new Float4EpsilonEqualizer(epsilon); + } + } + + /** + * Difference function for double values. + */ + public static class Float8EpsilonEqualizer implements VectorValueEqualizer { + private final double epsilon; + + public Float8EpsilonEqualizer(double epsilon) { + this.epsilon = epsilon; + } + + @Override + public final boolean valuesEqual(Float8Vector vector1, int index1, Float8Vector vector2, int index2) { + boolean isNull1 = vector1.isNull(index1); + boolean isNull2 = vector2.isNull(index2); + + if (isNull1 || isNull2) { + return isNull1 == isNull2; + } + + double d1 = vector1.get(index1); + double d2 = vector2.get(index2); + + if (Double.isNaN(d1)) { + return Double.isNaN(d2); + } + if (Double.isInfinite(d1)) { + return Double.isInfinite(d2) && Math.signum(d1) == Math.signum(d2); + } + + return Math.abs(d1 - d2) <= epsilon; + } + + @Override + public VectorValueEqualizer clone() { + return new Float8EpsilonEqualizer(epsilon); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java new file mode 100644 index 
000000000..898bfe3d3 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.vector.DensityAwareVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList; +import org.apache.arrow.vector.types.pojo.ArrowType.List; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; + +/** + * Base class for composite vectors. + * + *

This class implements common functionality of composite vectors. + */ +public abstract class AbstractContainerVector implements ValueVector, DensityAwareVector { + static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractContainerVector.class); + + protected final String name; + protected final BufferAllocator allocator; + protected final CallBack callBack; + + protected AbstractContainerVector(String name, BufferAllocator allocator, CallBack callBack) { + this.name = name; + this.allocator = allocator; + this.callBack = callBack; + } + + @Override + public void allocateNew() throws OutOfMemoryException { + if (!allocateNewSafe()) { + throw new OutOfMemoryException(); + } + } + + public BufferAllocator getAllocator() { + return allocator; + } + + /** + * Returns a {@link org.apache.arrow.vector.ValueVector} corresponding to the given field name if exists or null. + * + * @param name the name of the child to return + * @return the corresponding FieldVector + */ + public FieldVector getChild(String name) { + return getChild(name, FieldVector.class); + } + + /** + * Clears out all underlying child vectors. + */ + @Override + public void close() { + for (ValueVector vector : (Iterable) this) { + vector.close(); + } + } + + protected T typeify(ValueVector v, Class clazz) { + if (clazz.isAssignableFrom(v.getClass())) { + return clazz.cast(v); + } + throw new IllegalStateException(String.format("Vector requested [%s] was different than type stored [%s]. 
Arrow " + + "doesn't yet support heterogeneous types.", clazz.getSimpleName(), v.getClass().getSimpleName())); + } + + protected boolean supportsDirectRead() { + return false; + } + + // return the number of child vectors + public abstract int size(); + + // add a new vector with the input FieldType or return the existing vector if we already added one with the same name + public abstract T addOrGet(String name, FieldType fieldType, Class clazz); + + // return the child vector with the input name + public abstract T getChild(String name, Class clazz); + + // return the child vector's ordinal in the composite container + public abstract VectorWithOrdinal getChildVectorWithOrdinal(String name); + + public StructVector addOrGetStruct(String name) { + return addOrGet(name, FieldType.nullable(new Struct()), StructVector.class); + } + + public ListVector addOrGetList(String name) { + return addOrGet(name, FieldType.nullable(new List()), ListVector.class); + } + + public UnionVector addOrGetUnion(String name) { + return addOrGet(name, FieldType.nullable(MinorType.UNION.getType()), UnionVector.class); + } + + public FixedSizeListVector addOrGetFixedSizeList(String name, int listSize) { + return addOrGet(name, FieldType.nullable(new FixedSizeList(listSize)), FixedSizeListVector.class); + } + + public MapVector addOrGetMap(String name, boolean keysSorted) { + return addOrGet(name, FieldType.nullable(new ArrowType.Map(keysSorted)), MapVector.class); + } + + @Override + public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { + throw new UnsupportedOperationException(); + } + + @Override + public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) { + throw new UnsupportedOperationException(); + } + + @Override + public String getName() { + return name; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractStructVector.java 
b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractStructVector.java new file mode 100644 index 000000000..be6d99233 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractStructVector.java @@ -0,0 +1,425 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.PromotableMultiMapWithOrdinal; +import org.apache.arrow.vector.util.ValueVectorUtility; + +/** + * Base class for StructVectors. 
Currently used by NonNullableStructVector + */ +public abstract class AbstractStructVector extends AbstractContainerVector { + private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractContainerVector.class); + private static final String STRUCT_CONFLICT_POLICY_ENV = "ARROW_STRUCT_CONFLICT_POLICY"; + private static final String STRUCT_CONFLICT_POLICY_JVM = "arrow.struct.conflict.policy"; + private static final ConflictPolicy DEFAULT_CONFLICT_POLICY; + // Maintains a map with key as field name and value is the vector itself + private final PromotableMultiMapWithOrdinal vectors; + protected final boolean allowConflictPolicyChanges; + private ConflictPolicy conflictPolicy; + + + static { + String conflictPolicyStr = System.getProperty(STRUCT_CONFLICT_POLICY_JVM, + ConflictPolicy.CONFLICT_REPLACE.toString()); + if (conflictPolicyStr == null) { + conflictPolicyStr = System.getenv(STRUCT_CONFLICT_POLICY_ENV); + } + ConflictPolicy conflictPolicy; + try { + conflictPolicy = ConflictPolicy.valueOf(conflictPolicyStr.toUpperCase()); + } catch (Exception e) { + conflictPolicy = ConflictPolicy.CONFLICT_REPLACE; + } + DEFAULT_CONFLICT_POLICY = conflictPolicy; + } + + /** + * Policy to determine how to react when duplicate columns are encountered. + */ + public enum ConflictPolicy { + // Ignore the conflict and append the field. This is the default behaviour + CONFLICT_APPEND, + // Keep the existing field and ignore the newer one. + CONFLICT_IGNORE, + // Replace the existing field with the newer one. + CONFLICT_REPLACE, + // Refuse the new field and error out. + CONFLICT_ERROR + } + + /** + * Base coonstructor that sets default conflict policy to APPEND. + */ + protected AbstractStructVector(String name, + BufferAllocator allocator, + CallBack callBack, + ConflictPolicy conflictPolicy, + boolean allowConflictPolicyChanges) { + super(name, allocator, callBack); + this.conflictPolicy = conflictPolicy == null ? 
DEFAULT_CONFLICT_POLICY : conflictPolicy; + this.vectors = new PromotableMultiMapWithOrdinal<>(allowConflictPolicyChanges, this.conflictPolicy); + this.allowConflictPolicyChanges = allowConflictPolicyChanges; + } + + /** + * Set conflict policy and return last conflict policy state. + */ + public ConflictPolicy setConflictPolicy(ConflictPolicy conflictPolicy) { + ConflictPolicy tmp = this.conflictPolicy; + this.conflictPolicy = conflictPolicy; + this.vectors.setConflictPolicy(conflictPolicy); + return tmp; + } + + public ConflictPolicy getConflictPolicy() { + return conflictPolicy; + } + + @Override + public void close() { + for (final ValueVector valueVector : vectors.values()) { + valueVector.close(); + } + vectors.clear(); + + super.close(); + } + + @Override + public boolean allocateNewSafe() { + /* boolean to keep track if all the memory allocation were successful + * Used in the case of composite vectors when we need to allocate multiple + * buffers for multiple vectors. If one of the allocations failed we need to + * clear all the memory that we allocated + */ + boolean success = false; + try { + for (final ValueVector v : vectors.values()) { + if (!v.allocateNewSafe()) { + return false; + } + } + success = true; + } finally { + if (!success) { + clear(); + } + } + return true; + } + + @Override + public void reAlloc() { + for (final ValueVector v : vectors.values()) { + v.reAlloc(); + } + } + + /** + * Adds a new field with the given parameters or replaces the existing one and consequently returns the resultant + * {@link org.apache.arrow.vector.ValueVector}. + * + *

Execution takes place in the following order: + *

    + *
  • + * if field is new, create and insert a new vector of desired type. + *
  • + *
  • + * if field exists and existing vector is of desired vector type, return the vector. + *
  • + *
  • + * if field exists and null filled, clear the existing vector; create and insert a new vector of desired type. + *
  • + *
  • + * otherwise, throw an {@link java.lang.IllegalStateException} + *
  • + *
+ * + * @param childName the name of the field + * @param fieldType the type for the vector + * @param clazz class of expected vector type + * @param class type of expected vector type + * @return resultant {@link org.apache.arrow.vector.ValueVector} + * @throws java.lang.IllegalStateException raised if there is a hard schema change + */ + public T addOrGet(String childName, FieldType fieldType, Class clazz) { + final ValueVector existing = getChild(childName); + boolean create = false; + if (existing == null) { + create = true; + } else if (clazz.isAssignableFrom(existing.getClass())) { + return clazz.cast(existing); + } else if (nullFilled(existing)) { + existing.clear(); + create = true; + } + if (create) { + final T vector = clazz.cast(fieldType.createNewSingleVector(childName, allocator, callBack)); + putChild(childName, vector); + if (callBack != null) { + callBack.doWork(); + } + return vector; + } + final String message = "Arrow does not support schema change yet. Existing[%s] and desired[%s] vector types " + + "mismatch"; + throw new IllegalStateException(String.format(message, existing.getClass().getSimpleName(), clazz.getSimpleName())); + } + + private boolean nullFilled(ValueVector vector) { + return BitVectorHelper.checkAllBitsEqualTo(vector.getValidityBuffer(), vector.getValueCount(), false); + } + + /** + * Returns a {@link org.apache.arrow.vector.ValueVector} corresponding to the given ordinal identifier. + * + * @param id the ordinal of the child to return + * @return the corresponding child + */ + public ValueVector getChildByOrdinal(int id) { + return vectors.getByOrdinal(id); + } + + /** + * Returns a {@link org.apache.arrow.vector.ValueVector} instance of subtype of T corresponding to the given + * field name if exists or null. + * + * If there is more than one element for name this will return the first inserted. 
+ * + * @param name the name of the child to return + * @param clazz the expected type of the child + * @return the child corresponding to this name + */ + @Override + public T getChild(String name, Class clazz) { + final FieldVector f = vectors.get(name); + if (f == null) { + return null; + } + return typeify(f, clazz); + } + + protected ValueVector add(String childName, FieldType fieldType) { + FieldVector vector = fieldType.createNewSingleVector(childName, allocator, callBack); + putChild(childName, vector); + if (callBack != null) { + callBack.doWork(); + } + return vector; + } + + /** + * Inserts the vector with the given name if it does not exist else replaces it with the new value. + * + *

Note that this method does not enforce any vector type check nor throws a schema change exception. + * + * @param name the name of the child to add + * @param vector the vector to add as a child + */ + protected void putChild(String name, FieldVector vector) { + putVector(name, vector); + } + + private void put(String name, FieldVector vector, boolean overwrite) { + final boolean old = vectors.put( + Preconditions.checkNotNull(name, "field name cannot be null"), + Preconditions.checkNotNull(vector, "vector cannot be null"), + overwrite + ); + if (old) { + logger.debug("Field [{}] mutated to [{}] ", name, + vector.getClass().getSimpleName()); + } + } + + /** + * Inserts the input vector into the map if it does not exist. + * + *

+ * If the field name already exists the conflict is handled according to the currently set ConflictPolicy + *

+ * + * @param name field name + * @param vector vector to be inserted + */ + protected void putVector(String name, FieldVector vector) { + switch (conflictPolicy) { + case CONFLICT_APPEND: + put(name, vector, false); + break; + case CONFLICT_IGNORE: + if (!vectors.containsKey(name)) { + put(name, vector, false); + } + break; + case CONFLICT_REPLACE: + if (vectors.containsKey(name)) { + vectors.removeAll(name); + } + put(name, vector, true); + break; + case CONFLICT_ERROR: + if (vectors.containsKey(name)) { + throw new IllegalStateException(String.format("Vector already exists: Existing[%s], Requested[%s] ", + vector.getClass().getSimpleName(), vector.getField().getFieldType())); + } + put(name, vector, false); + break; + default: + throw new IllegalStateException(String.format("%s type not a valid conflict state", conflictPolicy)); + } + + } + + /** + * Get child vectors. + * @return a sequence of underlying child vectors. + */ + protected List getChildren() { + int size = vectors.size(); + List children = new ArrayList<>(); + for (int i = 0; i < size; i++) { + children.add(vectors.getByOrdinal(i)); + } + return children; + } + + /** + * Get child field names. + */ + public List getChildFieldNames() { + return getChildren().stream() + .map(child -> child.getField().getName()) + .collect(Collectors.toList()); + } + + /** + * Get the number of child vectors. + * @return the number of underlying child vectors. + */ + @Override + public int size() { + return vectors.size(); + } + + @Override + public Iterator iterator() { + return Collections.unmodifiableCollection(vectors.values()).iterator(); + } + + /** + * Get primitive child vectors. + * @return a list of scalar child vectors recursing the entire vector hierarchy. 
+ */ + public List getPrimitiveVectors() { + final List primitiveVectors = new ArrayList<>(); + for (final FieldVector v : vectors.values()) { + primitiveVectors.addAll(getPrimitiveVectors(v)); + } + return primitiveVectors; + } + + private List getPrimitiveVectors(FieldVector v) { + final List primitives = new ArrayList<>(); + if (v instanceof AbstractStructVector) { + AbstractStructVector structVector = (AbstractStructVector) v; + primitives.addAll(structVector.getPrimitiveVectors()); + } else if (v instanceof ListVector) { + ListVector listVector = (ListVector) v; + primitives.addAll(getPrimitiveVectors(listVector.getDataVector())); + } else if (v instanceof FixedSizeListVector) { + FixedSizeListVector listVector = (FixedSizeListVector) v; + primitives.addAll(getPrimitiveVectors(listVector.getDataVector())); + } else if (v instanceof UnionVector) { + UnionVector unionVector = (UnionVector) v; + for (final FieldVector vector : unionVector.getChildrenFromFields()) { + primitives.addAll(getPrimitiveVectors(vector)); + } + } else { + primitives.add(v); + } + return primitives; + } + + /** + * Get a child vector by name. If duplicate names this returns the first inserted. + * @param name the name of the child to return + * @return a vector with its corresponding ordinal mapping if field exists or null. 
+ */ + @Override + public VectorWithOrdinal getChildVectorWithOrdinal(String name) { + final int ordinal = vectors.getOrdinal(name); + if (ordinal < 0) { + return null; + } + final ValueVector vector = vectors.getByOrdinal(ordinal); + return new VectorWithOrdinal(vector, ordinal); + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final List buffers = new ArrayList<>(); + + for (final ValueVector vector : vectors.values()) { + for (final ArrowBuf buf : vector.getBuffers(false)) { + buffers.add(buf); + if (clear) { + buf.getReferenceManager().retain(1); + } + } + if (clear) { + vector.clear(); + } + } + + return buffers.toArray(new ArrowBuf[buffers.size()]); + } + + @Override + public int getBufferSize() { + int actualBufSize = 0; + + for (final ValueVector v : vectors.values()) { + for (final ArrowBuf buf : v.getBuffers(false)) { + actualBufSize += buf.writerIndex(); + } + } + return actualBufSize; + } + + @Override + public String toString() { + return ValueVectorUtility.getToString(this, 0 , getValueCount()); + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseListVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseListVector.java new file mode 100644 index 000000000..5f547b901 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseListVector.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex; + +import org.apache.arrow.vector.FieldVector; + +/** + * Abstraction for all list type vectors. + */ +public interface BaseListVector extends FieldVector { + + /** + * Get data vector start index with the given list index. + */ + int getElementStartIndex(int index); + + /** + * Get data vector end index with the given list index. + */ + int getElementEndIndex(int index); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java new file mode 100644 index 000000000..62d4a1299 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -0,0 +1,367 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.DensityAwareVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.SchemaChangeRuntimeException; + +/** Base class for Vectors that contain repeated values. 
*/ +public abstract class BaseRepeatedValueVector extends BaseValueVector implements RepeatedValueVector, BaseListVector { + + public static final FieldVector DEFAULT_DATA_VECTOR = ZeroVector.INSTANCE; + public static final String DATA_VECTOR_NAME = "$data$"; + + public static final byte OFFSET_WIDTH = 4; + protected ArrowBuf offsetBuffer; + protected FieldVector vector; + protected final CallBack callBack; + protected int valueCount; + protected long offsetAllocationSizeInBytes = INITIAL_VALUE_ALLOCATION * OFFSET_WIDTH; + private final String name; + + protected String defaultDataVectorName = DATA_VECTOR_NAME; + + protected BaseRepeatedValueVector(String name, BufferAllocator allocator, CallBack callBack) { + this(name, allocator, DEFAULT_DATA_VECTOR, callBack); + } + + protected BaseRepeatedValueVector(String name, BufferAllocator allocator, FieldVector vector, CallBack callBack) { + super(allocator); + this.name = name; + this.offsetBuffer = allocator.getEmpty(); + this.vector = Preconditions.checkNotNull(vector, "data vector cannot be null"); + this.callBack = callBack; + this.valueCount = 0; + } + + @Override + public String getName() { + return name; + } + + @Override + public boolean allocateNewSafe() { + boolean dataAlloc = false; + try { + allocateOffsetBuffer(offsetAllocationSizeInBytes); + dataAlloc = vector.allocateNewSafe(); + } catch (Exception e) { + e.printStackTrace(); + clear(); + return false; + } finally { + if (!dataAlloc) { + clear(); + } + } + return dataAlloc; + } + + protected void allocateOffsetBuffer(final long size) { + final int curSize = (int) size; + offsetBuffer = allocator.buffer(curSize); + offsetBuffer.readerIndex(0); + offsetAllocationSizeInBytes = curSize; + offsetBuffer.setZero(0, offsetBuffer.capacity()); + } + + @Override + public void reAlloc() { + reallocOffsetBuffer(); + vector.reAlloc(); + } + + protected void reallocOffsetBuffer() { + final long currentBufferCapacity = offsetBuffer.capacity(); + long newAllocationSize = 
currentBufferCapacity * 2; + if (newAllocationSize == 0) { + if (offsetAllocationSizeInBytes > 0) { + newAllocationSize = offsetAllocationSizeInBytes; + } else { + newAllocationSize = INITIAL_VALUE_ALLOCATION * OFFSET_WIDTH * 2; + } + } + + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + newAllocationSize = Math.min(newAllocationSize, (long) (OFFSET_WIDTH) * Integer.MAX_VALUE); + assert newAllocationSize >= 1; + + if (newAllocationSize > MAX_ALLOCATION_SIZE || newAllocationSize <= offsetBuffer.capacity()) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + newBuf.setBytes(0, offsetBuffer, 0, currentBufferCapacity); + newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + offsetBuffer.getReferenceManager().release(1); + offsetBuffer = newBuf; + offsetAllocationSizeInBytes = newAllocationSize; + } + + /** + * Get the offset vector. + * @deprecated This API will be removed, as the current implementations no longer hold inner offset vectors. + * + * @return the underlying offset vector or null if none exists. + */ + @Override + @Deprecated + public UInt4Vector getOffsetVector() { + throw new UnsupportedOperationException("There is no inner offset vector"); + } + + @Override + public FieldVector getDataVector() { + return vector; + } + + @Override + public void setInitialCapacity(int numRecords) { + offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH; + if (vector instanceof BaseFixedWidthVector || vector instanceof BaseVariableWidthVector) { + vector.setInitialCapacity(numRecords * RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD); + } else { + vector.setInitialCapacity(numRecords); + } + } + + /** + * Specialized version of setInitialCapacity() for ListVector. This is + * used by some callers when they want to explicitly control and be + * conservative about memory allocated for inner data vector. 
This is + * very useful when we are working with memory constraints for a query + * and have a fixed amount of memory reserved for the record batch. In + * such cases, we are likely to face OOM or related problems when + * we reserve memory for a record batch with value count x and + * do setInitialCapacity(x) such that each vector allocates only + * what is necessary and not the default amount but the multiplier + * forces the memory requirement to go beyond what was needed. + * + * @param numRecords value count + * @param density density of ListVector. Density is the average size of + * list per position in the List vector. For example, a + * density value of 10 implies each position in the list + * vector has a list of 10 values. + * A density value of 0.1 implies out of 10 positions in + * the list vector, 1 position has a list of size 1 and + * remaining positions are null (no lists) or empty lists. + * This helps in tightly controlling the memory we provision + * for inner data vector. 
+ */ + @Override + public void setInitialCapacity(int numRecords, double density) { + if ((numRecords * density) >= Integer.MAX_VALUE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); + } + + offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH; + + int innerValueCapacity = Math.max((int) (numRecords * density), 1); + + if (vector instanceof DensityAwareVector) { + ((DensityAwareVector) vector).setInitialCapacity(innerValueCapacity, density); + } else { + vector.setInitialCapacity(innerValueCapacity); + } + } + + @Override + public int getValueCapacity() { + final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity() - 1, 0); + if (vector == DEFAULT_DATA_VECTOR) { + return offsetValueCapacity; + } + return Math.min(vector.getValueCapacity(), offsetValueCapacity); + } + + protected int getOffsetBufferValueCapacity() { + return capAtMaxInt(offsetBuffer.capacity() / OFFSET_WIDTH); + } + + @Override + public int getBufferSize() { + if (valueCount == 0) { + return 0; + } + return ((valueCount + 1) * OFFSET_WIDTH) + vector.getBufferSize(); + } + + @Override + public int getBufferSizeFor(int valueCount) { + if (valueCount == 0) { + return 0; + } + + int innerVectorValueCount = offsetBuffer.getInt(valueCount * OFFSET_WIDTH); + + return ((valueCount + 1) * OFFSET_WIDTH) + vector.getBufferSizeFor(innerVectorValueCount); + } + + @Override + public Iterator iterator() { + return Collections.singleton(getDataVector()).iterator(); + } + + @Override + public void clear() { + offsetBuffer = releaseBuffer(offsetBuffer); + vector.clear(); + valueCount = 0; + super.clear(); + } + + @Override + public void reset() { + offsetBuffer.setZero(0, offsetBuffer.capacity()); + vector.reset(); + valueCount = 0; + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final ArrowBuf[] buffers; + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + List list = new ArrayList<>(); + 
list.add(offsetBuffer); + list.addAll(Arrays.asList(vector.getBuffers(false))); + buffers = list.toArray(new ArrowBuf[list.size()]); + } + if (clear) { + for (ArrowBuf buffer : buffers) { + buffer.getReferenceManager().retain(); + } + clear(); + } + return buffers; + } + + /** + * Get value indicating if inner vector is set. + * @return 1 if inner vector is explicitly set via #addOrGetVector else 0 + */ + public int size() { + return vector == DEFAULT_DATA_VECTOR ? 0 : 1; + } + + /** + * Initialize the data vector (and execute callback) if it hasn't already been done, + * returns the data vector. + */ + public AddOrGetResult addOrGetVector(FieldType fieldType) { + boolean created = false; + if (vector instanceof NullVector) { + vector = fieldType.createNewSingleVector(defaultDataVectorName, allocator, callBack); + // returned vector must have the same field + created = true; + if (callBack != null && + // not a schema change if changing from ZeroVector to ZeroVector + (fieldType.getType().getTypeID() != ArrowTypeID.Null)) { + callBack.doWork(); + } + } + + if (vector.getField().getType().getTypeID() != fieldType.getType().getTypeID()) { + final String msg = String.format("Inner vector type mismatch. Requested type: [%s], actual type: [%s]", + fieldType.getType().getTypeID(), vector.getField().getType().getTypeID()); + throw new SchemaChangeRuntimeException(msg); + } + + return new AddOrGetResult<>((T) vector, created); + } + + protected void replaceDataVector(FieldVector v) { + vector.clear(); + vector = v; + } + + @Override + public int getValueCount() { + return valueCount; + } + + /* returns the value count for inner data vector for this list vector */ + public int getInnerValueCount() { + return vector.getValueCount(); + } + + + /** Returns the value count for inner data vector at a particular index. 
*/ + public int getInnerValueCountAt(int index) { + return offsetBuffer.getInt((index + 1) * OFFSET_WIDTH) - + offsetBuffer.getInt(index * OFFSET_WIDTH); + } + + /** Return if value at index is null (this implementation is always false). */ + public boolean isNull(int index) { + return false; + } + + /** Return if value at index is empty (this implementation is always false). */ + public boolean isEmpty(int index) { + return false; + } + + /** Starts a new repeated value. */ + public int startNewValue(int index) { + while (index >= getOffsetBufferValueCapacity()) { + reallocOffsetBuffer(); + } + int offset = offsetBuffer.getInt(index * OFFSET_WIDTH); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, offset); + setValueCount(index + 1); + return offset; + } + + /** Preallocates the number of repeated values. */ + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + while (valueCount > getOffsetBufferValueCapacity()) { + reallocOffsetBuffer(); + } + final int childValueCount = valueCount == 0 ? 0 : + offsetBuffer.getInt(valueCount * OFFSET_WIDTH); + vector.setValueCount(childValueCount); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java new file mode 100644 index 000000000..b32dce367 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex; + +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.UInt4Vector; + +/** + * A helper class that is used to track and populate empty values in repeated value vectors. + */ +public class EmptyValuePopulator { + private final UInt4Vector offsets; + + public EmptyValuePopulator(UInt4Vector offsets) { + this.offsets = Preconditions.checkNotNull(offsets, "offsets cannot be null"); + } + + /** + * Marks all values since the last set as empty. The last set value is obtained from underlying offsets vector. + * + * @param lastIndex the last index (inclusive) in the offsets vector until which empty population takes place + * @throws java.lang.IndexOutOfBoundsException if lastIndex is negative or greater than offsets capacity. + */ + public void populate(int lastIndex) { + if (lastIndex < 0) { + throw new IndexOutOfBoundsException("index cannot be negative"); + } + final int lastSet = Math.max(offsets.getValueCount() - 1, 0); + final int previousEnd = offsets.get(lastSet); //0 ? 
0 : accessor.get(lastSet); + for (int i = lastSet; i < lastIndex; i++) { + offsets.setSafe(i + 1, previousEnd); + } + offsets.setValueCount(lastIndex + 1); + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java new file mode 100644 index 000000000..8d23f55fb --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java @@ -0,0 +1,675 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.complex; + +import static java.util.Collections.singletonList; +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; +import static org.apache.arrow.vector.complex.BaseRepeatedValueVector.DATA_VECTOR_NAME; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.BufferBacked; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.impl.UnionFixedSizeListReader; +import org.apache.arrow.vector.complex.impl.UnionFixedSizeListWriter; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.JsonStringArrayList; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.SchemaChangeRuntimeException; +import 
org.apache.arrow.vector.util.TransferPair; + +/** A ListVector where every list value is of the same size. */ +public class FixedSizeListVector extends BaseValueVector implements BaseListVector, PromotableVector { + + public static FixedSizeListVector empty(String name, int size, BufferAllocator allocator) { + FieldType fieldType = FieldType.nullable(new ArrowType.FixedSizeList(size)); + return new FixedSizeListVector(name, allocator, fieldType, null); + } + + private FieldVector vector; + private ArrowBuf validityBuffer; + private final int listSize; + private final FieldType fieldType; + private final String name; + + private UnionFixedSizeListReader reader; + private int valueCount; + private int validityAllocationSizeInBytes; + + /** + * Creates a new instance. + * + * @param name The name for the vector. + * @param allocator The allocator to use for creating/reallocating buffers for the vector. + * @param fieldType The underlying data type of the vector. + * @param unusedSchemaChangeCallback Currently unused. + */ + public FixedSizeListVector(String name, + BufferAllocator allocator, + FieldType fieldType, + CallBack unusedSchemaChangeCallback) { + super(allocator); + + this.name = name; + this.validityBuffer = allocator.getEmpty(); + this.vector = ZeroVector.INSTANCE; + this.fieldType = fieldType; + this.listSize = ((ArrowType.FixedSizeList) fieldType.getType()).getListSize(); + Preconditions.checkArgument(listSize >= 0, "list size must be non-negative"); + this.valueCount = 0; + this.validityAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); + } + + @Override + public Field getField() { + List children = Collections.singletonList(getDataVector().getField()); + return new Field(name, fieldType, children); + } + + @Override + public MinorType getMinorType() { + return MinorType.FIXED_SIZE_LIST; + } + + @Override + public String getName() { + return name; + } + + /** Get the fixed size for each list. 
*/ + public int getListSize() { + return listSize; + } + + @Override + public void initializeChildrenFromFields(List children) { + if (children.size() != 1) { + throw new IllegalArgumentException("Lists have only one child. Found: " + children); + } + Field field = children.get(0); + AddOrGetResult addOrGetVector = addOrGetVector(field.getFieldType()); + if (!addOrGetVector.isCreated()) { + throw new IllegalArgumentException("Child vector already existed: " + addOrGetVector.getVector()); + } + addOrGetVector.getVector().initializeChildrenFromFields(field.getChildren()); + } + + @Override + public List getChildrenFromFields() { + return singletonList(vector); + } + + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + if (ownBuffers.size() != 1) { + throw new IllegalArgumentException("Illegal buffer count, expected " + 1 + ", got: " + ownBuffers.size()); + } + + ArrowBuf bitBuffer = ownBuffers.get(0); + + validityBuffer.getReferenceManager().release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + valueCount = fieldNode.getLength(); + + validityAllocationSizeInBytes = checkedCastToInt(validityBuffer.capacity()); + } + + @Override + public List getFieldBuffers() { + List result = new ArrayList<>(1); + setReaderAndWriterIndex(); + result.add(validityBuffer); + + return result; + } + + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + } + + /** + * Get the inner vectors. + * + * @deprecated This API will be removed as the current implementations no longer support inner vectors. + * + * @return the inner vectors for this field as defined by the TypeLayout + */ + @Deprecated + @Override + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. 
Use getFieldBuffers"); + } + + @Override + public UnionFixedSizeListReader getReader() { + if (reader == null) { + reader = new UnionFixedSizeListReader(this); + } + return reader; + } + + private void invalidateReader() { + reader = null; + } + + @Override + public void allocateNew() throws OutOfMemoryException { + if (!allocateNewSafe()) { + throw new OutOfMemoryException("Failure while allocating memory"); + } + } + + @Override + public boolean allocateNewSafe() { + /* boolean to keep track if all the memory allocation were successful + * Used in the case of composite vectors when we need to allocate multiple + * buffers for multiple vectors. If one of the allocations failed we need to + * clear all the memory that we allocated + */ + boolean success = false; + try { + /* we are doing a new allocation -- release the current buffers */ + clear(); + /* allocate validity buffer */ + allocateValidityBuffer(validityAllocationSizeInBytes); + success = vector.allocateNewSafe(); + } finally { + if (!success) { + clear(); + return false; + } + } + + return true; + } + + private void allocateValidityBuffer(final long size) { + final int curSize = (int) size; + validityBuffer = allocator.buffer(curSize); + validityBuffer.readerIndex(0); + validityAllocationSizeInBytes = curSize; + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + @Override + public void reAlloc() { + reallocValidityBuffer(); + vector.reAlloc(); + } + + private void reallocValidityBuffer() { + final int currentBufferCapacity = checkedCastToInt(validityBuffer.capacity()); + long newAllocationSize = currentBufferCapacity * 2; + if (newAllocationSize == 0) { + if (validityAllocationSizeInBytes > 0) { + newAllocationSize = validityAllocationSizeInBytes; + } else { + newAllocationSize = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION) * 2; + } + } + + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + assert newAllocationSize >= 1; + + if (newAllocationSize > 
MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); + newBuf.setBytes(0, validityBuffer, 0, currentBufferCapacity); + newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + validityBuffer.getReferenceManager().release(1); + validityBuffer = newBuf; + validityAllocationSizeInBytes = (int) newAllocationSize; + } + + public FieldVector getDataVector() { + return vector; + } + + /** + * Start a new value in the list vector. + * + * @param index index of the value to start + */ + public int startNewValue(int index) { + while (index >= getValidityBufferValueCapacity()) { + reallocValidityBuffer(); + } + + BitVectorHelper.setBit(validityBuffer, index); + return index * listSize; + } + + public UnionFixedSizeListWriter getWriter() { + return new UnionFixedSizeListWriter(this); + } + + @Override + public void setInitialCapacity(int numRecords) { + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords); + vector.setInitialCapacity(numRecords * listSize); + } + + @Override + public int getValueCapacity() { + if (vector == ZeroVector.INSTANCE || listSize == 0) { + return 0; + } + return Math.min(vector.getValueCapacity() / listSize, getValidityBufferValueCapacity()); + } + + @Override + public int getBufferSize() { + if (getValueCount() == 0) { + return 0; + } + return getValidityBufferSizeFromCount(valueCount) + vector.getBufferSize(); + } + + @Override + public int getBufferSizeFor(int valueCount) { + if (valueCount == 0) { + return 0; + } + return getValidityBufferSizeFromCount(valueCount) + + vector.getBufferSizeFor(valueCount * listSize); + } + + @Override + public Iterator iterator() { + return Collections.singleton(vector).iterator(); + } + + @Override + public void clear() { + validityBuffer = releaseBuffer(validityBuffer); + vector.clear(); + valueCount = 0; + super.clear(); + } + + @Override + public 
void reset() { + validityBuffer.setZero(0, validityBuffer.capacity()); + vector.reset(); + valueCount = 0; + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + setReaderAndWriterIndex(); + final ArrowBuf[] buffers; + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + List list = new ArrayList<>(); + list.add(validityBuffer); + list.addAll(Arrays.asList(vector.getBuffers(false))); + buffers = list.toArray(new ArrowBuf[list.size()]); + } + if (clear) { + for (ArrowBuf buffer : buffers) { + buffer.getReferenceManager().retain(); + } + clear(); + } + return buffers; + } + + /** + * Get value indicating if inner vector is set. + * @return 1 if inner vector is explicitly set via #addOrGetVector else 0 + */ + public int size() { + return vector == ZeroVector.INSTANCE ? 0 : 1; + } + + @Override + @SuppressWarnings("unchecked") + public AddOrGetResult addOrGetVector(FieldType type) { + boolean created = false; + if (vector == ZeroVector.INSTANCE) { + vector = type.createNewSingleVector(DATA_VECTOR_NAME, allocator, null); + invalidateReader(); + created = true; + } + // returned vector must have the same field + if (!Objects.equals(vector.getField().getType(), type.getType())) { + final String msg = String.format("Inner vector type mismatch. 
Requested type: [%s], actual type: [%s]", + type.getType(), vector.getField().getType()); + throw new SchemaChangeRuntimeException(msg); + } + + return new AddOrGetResult<>((T) vector, created); + } + + @Override + public void copyFromSafe(int inIndex, int outIndex, ValueVector from) { + copyFrom(inIndex, outIndex, from); + } + + @Override + public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { + Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); + TransferPair pair = from.makeTransferPair(this); + pair.copyValueSafe(fromIndex, thisIndex); + } + + @Override + public UnionVector promoteToUnion() { + UnionVector vector = new UnionVector(name, allocator, /* field type */ null, /* call-back */ null); + this.vector.clear(); + this.vector = vector; + invalidateReader(); + return vector; + } + + @Override + public long getValidityBufferAddress() { + return validityBuffer.memoryAddress(); + } + + @Override + public long getDataBufferAddress() { + throw new UnsupportedOperationException(); + } + + @Override + public long getOffsetBufferAddress() { + throw new UnsupportedOperationException(); + } + + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + @Override + public ArrowBuf getDataBuffer() { + throw new UnsupportedOperationException(); + } + + @Override + public ArrowBuf getOffsetBuffer() { + throw new UnsupportedOperationException(); + } + + @Override + public List getObject(int index) { + if (isSet(index) == 0) { + return null; + } + final List vals = new JsonStringArrayList<>(listSize); + for (int i = 0; i < listSize; i++) { + vals.add(vector.getObject(index * listSize + i)); + } + return vals; + } + + /** + * Returns whether the value at index null. + */ + public boolean isNull(int index) { + return (isSet(index) == 0); + } + + /** + * Returns non-zero when the value at index is non-null. 
+ */ + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return (b >> bitIndex) & 0x01; + } + + @Override + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } + + @Override + public int getValueCount() { + return valueCount; + } + + /** + * Returns the number of elements the validity buffer can represent with its + * current capacity. + */ + private int getValidityBufferValueCapacity() { + return capAtMaxInt(validityBuffer.capacity() * 8); + } + + /** + * Sets the value at index to null. Reallocates if index is larger than capacity. + */ + public void setNull(int index) { + while (index >= getValidityBufferValueCapacity()) { + reallocValidityBuffer(); + } + BitVectorHelper.unsetBit(validityBuffer, index); + } + + /** Sets the value at index to not-null. Reallocates if index is larger than capacity. */ + public void setNotNull(int index) { + while (index >= getValidityBufferValueCapacity()) { + reallocValidityBuffer(); + } + BitVectorHelper.setBit(validityBuffer, index); + } + + @Override + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + while (valueCount > getValidityBufferValueCapacity()) { + reallocValidityBuffer(); + } + vector.setValueCount(valueCount * listSize); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return getTransferPair(ref, allocator, null); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return new TransferImpl(ref, allocator, callBack); + } + + @Override + public TransferPair makeTransferPair(ValueVector target) { + return new TransferImpl((FixedSizeListVector) target); + } + + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + if (isSet(index) 
== 0) { + return ArrowBufPointer.NULL_HASH_CODE; + } + int hash = 0; + for (int i = 0; i < listSize; i++) { + hash = ByteFunctionHelpers.combineHash(hash, vector.hashCode(index * listSize + i, hasher)); + } + return hash; + } + + @Override + public OUT accept(VectorVisitor visitor, IN value) { + return visitor.visit(this, value); + } + + @Override + public int getElementStartIndex(int index) { + return listSize * index; + } + + @Override + public int getElementEndIndex(int index) { + return listSize * (index + 1); + } + + private class TransferImpl implements TransferPair { + + FixedSizeListVector to; + TransferPair dataPair; + + public TransferImpl(String name, BufferAllocator allocator, CallBack callBack) { + this(new FixedSizeListVector(name, allocator, fieldType, callBack)); + } + + public TransferImpl(FixedSizeListVector to) { + this.to = to; + to.addOrGetVector(vector.getField().getFieldType()); + dataPair = vector.makeTransferPair(to.vector); + } + + @Override + public void transfer() { + to.clear(); + dataPair.transfer(); + to.validityBuffer = BaseValueVector.transferBuffer(validityBuffer, to.allocator); + to.setValueCount(valueCount); + clear(); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount, + "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); + final int startPoint = listSize * startIndex; + final int sliceLength = listSize * length; + to.clear(); + + /* splitAndTransfer validity buffer */ + splitAndTransferValidityBuffer(startIndex, length, to); + /* splitAndTransfer data buffer */ + dataPair.splitAndTransfer(startPoint, sliceLength); + to.setValueCount(length); + } + + /* + * transfer the validity. 
+ */ + private void splitAndTransferValidityBuffer(int startIndex, int length, FixedSizeListVector target) { + int firstByteSource = BitVectorHelper.byteIndex(startIndex); + int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); + int byteSizeTarget = getValidityBufferSizeFromCount(length); + int offset = startIndex % 8; + + if (length > 0) { + if (offset == 0) { + // slice + if (target.validityBuffer != null) { + target.validityBuffer.getReferenceManager().release(); + } + target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget); + target.validityBuffer.getReferenceManager().retain(1); + } else { + /* Copy data + * When the first bit starts from the middle of a byte (offset != 0), + * copy data from src BitVector. + * Each byte in the target is composed by a part in i-th byte, + * another part in (i+1)-th byte. + */ + target.allocateValidityBuffer(byteSizeTarget); + + for (int i = 0; i < byteSizeTarget - 1; i++) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, firstByteSource + i, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(validityBuffer, firstByteSource + i + 1, offset); + + target.validityBuffer.setByte(i, (b1 + b2)); + } + + /* Copying the last piece is done in the following manner: + * if the source vector has 1 or more bytes remaining, we copy + * the last piece as a byte formed by shifting data + * from the current byte and the next byte. + * + * if the source vector has no more bytes remaining + * (we are at the last byte), we copy the last piece as a byte + * by shifting data from the current byte. 
+ */ + if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(validityBuffer, + firstByteSource + byteSizeTarget, offset); + + target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2); + } else { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + target.validityBuffer.setByte(byteSizeTarget - 1, b1); + } + } + } + } + + @Override + public ValueVector getTo() { + return to; + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + while (toIndex >= to.getValueCapacity()) { + to.reAlloc(); + } + BitVectorHelper.setValidityBit(to.validityBuffer, toIndex, isSet(fromIndex)); + int fromOffset = fromIndex * listSize; + int toOffset = toIndex * listSize; + for (int i = 0; i < listSize; i++) { + dataPair.copyValueSafe(fromOffset + i, toOffset + i); + } + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java new file mode 100644 index 000000000..6fbdda277 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java @@ -0,0 +1,1036 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex; + +import static java.util.Collections.singletonList; +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; +import static org.apache.arrow.util.Preconditions.checkNotNull; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.BufferBacked; +import org.apache.arrow.vector.DensityAwareVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.impl.ComplexCopier; +import 
org.apache.arrow.vector.complex.impl.UnionLargeListReader; +import org.apache.arrow.vector.complex.impl.UnionLargeListWriter; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.JsonStringArrayList; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.SchemaChangeRuntimeException; +import org.apache.arrow.vector.util.TransferPair; + +/** + * A list vector contains lists of a specific type of elements. Its structure contains 3 elements. + *
<ol>
+ * <li>A validity buffer.</li>
+ * <li>An offset buffer, that denotes lists boundaries.</li>
+ * <li>A child data vector that contains the elements of lists.</li>
+ * </ol>
 + * + * This is the LargeList variant of list; it has a 64-bit wide offset. + * + * <p>
 + * WARNING: Currently Arrow in Java doesn't support 64-bit vectors. This class + * follows the expected behaviour of a LargeList but doesn't actually support allocating + * a 64-bit vector. It has little use until 64-bit vectors are supported and should be used + * with caution. + * TODO: review checkedCastToInt usage in this class. + * Once int64 indexed vectors are supported these checks aren't needed. + * </p>
+ */ +public class LargeListVector extends BaseValueVector implements RepeatedValueVector, FieldVector, PromotableVector { + + public static LargeListVector empty(String name, BufferAllocator allocator) { + return new LargeListVector(name, allocator, FieldType.nullable(ArrowType.LargeList.INSTANCE), null); + } + + public static final FieldVector DEFAULT_DATA_VECTOR = ZeroVector.INSTANCE; + public static final String DATA_VECTOR_NAME = "$data$"; + + public static final byte OFFSET_WIDTH = 8; + protected ArrowBuf offsetBuffer; + protected FieldVector vector; + protected final CallBack callBack; + protected int valueCount; + protected long offsetAllocationSizeInBytes = INITIAL_VALUE_ALLOCATION * OFFSET_WIDTH; + private final String name; + + protected String defaultDataVectorName = DATA_VECTOR_NAME; + protected ArrowBuf validityBuffer; + protected UnionLargeListReader reader; + private final FieldType fieldType; + private int validityAllocationSizeInBytes; + + /** + * The maximum index that is actually set. + */ + private int lastSet; + + /** + * Constructs a new instance. + * + * @param name The name of the instance. + * @param allocator The allocator to use for allocating/reallocating buffers. + * @param fieldType The type of this list. + * @param callBack A schema change callback. + */ + public LargeListVector(String name, BufferAllocator allocator, FieldType fieldType, CallBack callBack) { + super(allocator); + this.name = name; + this.validityBuffer = allocator.getEmpty(); + this.fieldType = checkNotNull(fieldType); + this.callBack = callBack; + this.validityAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); + this.lastSet = -1; + this.offsetBuffer = allocator.getEmpty(); + this.vector = vector == null ? DEFAULT_DATA_VECTOR : vector; + this.valueCount = 0; + } + + @Override + public void initializeChildrenFromFields(List children) { + if (children.size() != 1) { + throw new IllegalArgumentException("Lists have only one child. 
Found: " + children); + } + Field field = children.get(0); + AddOrGetResult addOrGetVector = addOrGetVector(field.getFieldType()); + if (!addOrGetVector.isCreated()) { + throw new IllegalArgumentException("Child vector already existed: " + addOrGetVector.getVector()); + } + + addOrGetVector.getVector().initializeChildrenFromFields(field.getChildren()); + } + + @Override + public void setInitialCapacity(int numRecords) { + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords); + offsetAllocationSizeInBytes = (long) (numRecords + 1) * OFFSET_WIDTH; + if (vector instanceof BaseFixedWidthVector || vector instanceof BaseVariableWidthVector) { + vector.setInitialCapacity(numRecords * RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD); + } else { + vector.setInitialCapacity(numRecords); + } + } + + /** + * Specialized version of setInitialCapacity() for ListVector. This is + * used by some callers when they want to explicitly control and be + * conservative about memory allocated for inner data vector. This is + * very useful when we are working with memory constraints for a query + * and have a fixed amount of memory reserved for the record batch. In + * such cases, we are likely to face OOM or related problems when + * we reserve memory for a record batch with value count x and + * do setInitialCapacity(x) such that each vector allocates only + * what is necessary and not the default amount but the multiplier + * forces the memory requirement to go beyond what was needed. + * + * @param numRecords value count + * @param density density of ListVector. Density is the average size of + * list per position in the List vector. For example, a + * density value of 10 implies each position in the list + * vector has a list of 10 values. + * A density value of 0.1 implies out of 10 positions in + * the list vector, 1 position has a list of size 1 and + * remaining positions are null (no lists) or empty lists. 
+ * This helps in tightly controlling the memory we provision + * for inner data vector. + */ + @Override + public void setInitialCapacity(int numRecords, double density) { + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords); + if ((numRecords * density) >= Integer.MAX_VALUE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); + } + + offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH; + + int innerValueCapacity = Math.max((int) (numRecords * density), 1); + + if (vector instanceof DensityAwareVector) { + ((DensityAwareVector) vector).setInitialCapacity(innerValueCapacity, density); + } else { + vector.setInitialCapacity(innerValueCapacity); + } + } + + /** + * Get the density of this ListVector. + * @return density + */ + public double getDensity() { + if (valueCount == 0) { + return 0.0D; + } + final long startOffset = offsetBuffer.getLong(0L); + final long endOffset = offsetBuffer.getLong((long) valueCount * OFFSET_WIDTH); + final double totalListSize = endOffset - startOffset; + return totalListSize / valueCount; + } + + @Override + public List getChildrenFromFields() { + return singletonList(getDataVector()); + } + + /** + * Load the buffers of this vector with provided source buffers. + * The caller manages the source buffers and populates them before invoking + * this method. 
+ * @param fieldNode the fieldNode indicating the value count + * @param ownBuffers the buffers for this Field (own buffers only, children not included) + */ + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + if (ownBuffers.size() != 2) { + throw new IllegalArgumentException("Illegal buffer count, expected " + 2 + ", got: " + ownBuffers.size()); + } + + ArrowBuf bitBuffer = ownBuffers.get(0); + ArrowBuf offBuffer = ownBuffers.get(1); + + validityBuffer.getReferenceManager().release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + offsetBuffer.getReferenceManager().release(); + offsetBuffer = offBuffer.getReferenceManager().retain(offBuffer, allocator); + + validityAllocationSizeInBytes = checkedCastToInt(validityBuffer.capacity()); + offsetAllocationSizeInBytes = offsetBuffer.capacity(); + + lastSet = fieldNode.getLength() - 1; + valueCount = fieldNode.getLength(); + } + + /** + * Get the buffers belonging to this vector. + * @return the inner buffers. + */ + @Override + public List getFieldBuffers() { + List result = new ArrayList<>(2); + setReaderAndWriterIndex(); + result.add(validityBuffer); + result.add(offsetBuffer); + + return result; + } + + /** + * Set the reader and writer indexes for the inner buffers. + */ + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + offsetBuffer.readerIndex(0); + if (valueCount == 0) { + validityBuffer.writerIndex(0); + offsetBuffer.writerIndex(0); + } else { + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + offsetBuffer.writerIndex((valueCount + 1) * OFFSET_WIDTH); + } + } + + /** + * Get the inner vectors. + * + * @deprecated This API will be removed as the current implementations no longer support inner vectors. 
+ * + * @return the inner vectors for this field as defined by the TypeLayout + */ + @Deprecated + @Override + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. Use getFieldBuffers"); + } + + /** + * Same as {@link #allocateNewSafe()}. + */ + @Override + public void allocateNew() throws OutOfMemoryException { + if (!allocateNewSafe()) { + throw new OutOfMemoryException("Failure while allocating memory"); + } + } + + /** + * Allocate memory for the vector. We internally use a default value count + * of 4096 to allocate memory for at least these many elements in the + * vector. + * + * @return false if memory allocation fails, true otherwise. + */ + public boolean allocateNewSafe() { + boolean success = false; + try { + /* we are doing a new allocation -- release the current buffers */ + clear(); + /* allocate validity buffer */ + allocateValidityBuffer(validityAllocationSizeInBytes); + /* allocate offset and data buffer */ + boolean dataAlloc = false; + try { + allocateOffsetBuffer(offsetAllocationSizeInBytes); + dataAlloc = vector.allocateNewSafe(); + } catch (Exception e) { + e.printStackTrace(); + clear(); + return false; + } finally { + if (!dataAlloc) { + clear(); + } + } + success = dataAlloc; + } finally { + if (!success) { + clear(); + return false; + } + } + return true; + } + + private void allocateValidityBuffer(final long size) { + final int curSize = (int) size; + validityBuffer = allocator.buffer(curSize); + validityBuffer.readerIndex(0); + validityAllocationSizeInBytes = curSize; + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + protected void allocateOffsetBuffer(final long size) { + offsetBuffer = allocator.buffer(size); + offsetBuffer.readerIndex(0); + offsetAllocationSizeInBytes = size; + offsetBuffer.setZero(0, offsetBuffer.capacity()); + } + + /** + * Resize the vector to increase the capacity. The internal behavior is to + * double the current value capacity. 
+ */ + @Override + public void reAlloc() { + /* reallocate the validity buffer */ + reallocValidityBuffer(); + /* reallocate the offset and data */ + reallocOffsetBuffer(); + vector.reAlloc(); + } + + private void reallocValidityAndOffsetBuffers() { + reallocOffsetBuffer(); + reallocValidityBuffer(); + } + + protected void reallocOffsetBuffer() { + final long currentBufferCapacity = offsetBuffer.capacity(); + long newAllocationSize = currentBufferCapacity * 2; + if (newAllocationSize == 0) { + if (offsetAllocationSizeInBytes > 0) { + newAllocationSize = offsetAllocationSizeInBytes; + } else { + newAllocationSize = INITIAL_VALUE_ALLOCATION * OFFSET_WIDTH * 2; + } + } + + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + newAllocationSize = Math.min(newAllocationSize, (long) (OFFSET_WIDTH) * Integer.MAX_VALUE); + assert newAllocationSize >= 1; + + if (newAllocationSize > MAX_ALLOCATION_SIZE || newAllocationSize <= offsetBuffer.capacity()) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + newBuf.setBytes(0, offsetBuffer, 0, currentBufferCapacity); + newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + offsetBuffer.getReferenceManager().release(1); + offsetBuffer = newBuf; + offsetAllocationSizeInBytes = newAllocationSize; + } + + private void reallocValidityBuffer() { + final int currentBufferCapacity = checkedCastToInt(validityBuffer.capacity()); + long newAllocationSize = currentBufferCapacity * 2; + if (newAllocationSize == 0) { + if (validityAllocationSizeInBytes > 0) { + newAllocationSize = validityAllocationSizeInBytes; + } else { + newAllocationSize = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION) * 2; + } + } + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + assert newAllocationSize >= 1; + + if (newAllocationSize > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable 
to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); + newBuf.setBytes(0, validityBuffer, 0, currentBufferCapacity); + newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + validityBuffer.getReferenceManager().release(1); + validityBuffer = newBuf; + validityAllocationSizeInBytes = (int) newAllocationSize; + } + + /** + * Same as {@link #copyFrom(int, int, ValueVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param inIndex position to copy from in source vector + * @param outIndex position to copy to in this vector + * @param from source vector + */ + @Override + public void copyFromSafe(int inIndex, int outIndex, ValueVector from) { + copyFrom(inIndex, outIndex, from); + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector. + * @param inIndex position to copy from in source vector + * @param outIndex position to copy to in this vector + * @param from source vector + */ + @Override + public void copyFrom(int inIndex, int outIndex, ValueVector from) { + Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); + FieldReader in = from.getReader(); + in.setPosition(inIndex); + UnionLargeListWriter out = getWriter(); + out.setPosition(outIndex); + ComplexCopier.copy(in, out); + } + + /** + * Get the offset vector. + * @deprecated This API will be removed, as the current implementations no longer hold inner offset vectors. + * + * @return the underlying offset vector or null if none exists. + */ + @Override + @Deprecated + public UInt4Vector getOffsetVector() { + throw new UnsupportedOperationException("There is no inner offset vector"); + } + + /** + * Get the inner data vector for this list vector. 
+ * @return data vector + */ + @Override + public FieldVector getDataVector() { + return vector; + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return getTransferPair(ref, allocator, null); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return new TransferImpl(ref, allocator, callBack); + } + + @Override + public TransferPair makeTransferPair(ValueVector target) { + return new TransferImpl((LargeListVector) target); + } + + @Override + public long getValidityBufferAddress() { + return (validityBuffer.memoryAddress()); + } + + @Override + public long getDataBufferAddress() { + throw new UnsupportedOperationException(); + } + + @Override + public long getOffsetBufferAddress() { + return (offsetBuffer.memoryAddress()); + } + + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + @Override + public ArrowBuf getDataBuffer() { + throw new UnsupportedOperationException(); + } + + @Override + public ArrowBuf getOffsetBuffer() { + return offsetBuffer; + } + + @Override + public int getValueCount() { + return valueCount; + } + + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + if (isSet(index) == 0) { + return ArrowBufPointer.NULL_HASH_CODE; + } + int hash = 0; + final long start = offsetBuffer.getLong((long) index * OFFSET_WIDTH); + final long end = offsetBuffer.getLong(((long) index + 1L) * OFFSET_WIDTH); + for (long i = start; i < end; i++) { + hash = ByteFunctionHelpers.combineHash(hash, vector.hashCode(checkedCastToInt(i), hasher)); + } + return hash; + } + + @Override + public OUT accept(VectorVisitor visitor, IN value) { + return visitor.visit(this, value); + } + + public UnionLargeListWriter getWriter() { + return new UnionLargeListWriter(this); + } + + protected void replaceDataVector(FieldVector v) { + 
vector.clear();
+    vector = v;
+  }
+
+  /**
+   * Replace the data vector with a new {@link UnionVector} named {@code "$data$"},
+   * drop the cached reader and, when a callback is registered, notify it.
+   *
+   * @return the newly created union vector that now backs this list
+   */
+  @Override
+  public UnionVector promoteToUnion() {
+    UnionVector vector = new UnionVector("$data$", allocator, /* field type */ null, callBack);
+    replaceDataVector(vector);
+    invalidateReader();
+    if (callBack != null) {
+      callBack.doWork();
+    }
+    return vector;
+  }
+
+  /**
+   * {@link TransferPair} that moves or copies the contents of the enclosing
+   * vector into a target {@link LargeListVector}.
+   */
+  private class TransferImpl implements TransferPair {
+
+    LargeListVector to;
+    TransferPair dataTransferPair;
+
+    public TransferImpl(String name, BufferAllocator allocator, CallBack callBack) {
+      this(new LargeListVector(name, allocator, fieldType, callBack));
+    }
+
+    public TransferImpl(LargeListVector to) {
+      this.to = to;
+      to.addOrGetVector(vector.getField().getFieldType());
+      if (to.getDataVector() instanceof ZeroVector) {
+        to.addOrGetVector(vector.getField().getFieldType());
+      }
+      dataTransferPair = getDataVector().makeTransferPair(to.getDataVector());
+    }
+
+    /**
+     * Transfer this vector's data to another vector. The memory associated
+     * with this vector is transferred to the allocator of target vector
+     * for accounting and management purposes. The source vector is cleared
+     * once the transfer is done.
+     */
+    @Override
+    public void transfer() {
+      to.clear();
+      dataTransferPair.transfer();
+      to.validityBuffer = transferBuffer(validityBuffer, to.allocator);
+      to.offsetBuffer = transferBuffer(offsetBuffer, to.allocator);
+      to.lastSet = lastSet;
+      if (valueCount > 0) {
+        to.setValueCount(valueCount);
+      }
+      clear();
+    }
+
+    /**
+     * Slice this vector at desired index and length and transfer the
+     * corresponding data to the target vector.
+     * @param startIndex start position of the split in source vector.
+     * @param length length of the split.
+     *     Offsets written to the target are rebased so that the first one is 0.
+     *     The child range is narrowed with {@code checkedCastToInt} before it is
+     *     handed to the data transfer pair.
+     */
+    @Override
+    public void splitAndTransfer(int startIndex, int length) {
+      Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount,
+          "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount);
+      final long startPoint = offsetBuffer.getLong((long) startIndex * OFFSET_WIDTH);
+      final long sliceLength = offsetBuffer.getLong((long) (startIndex + length) * OFFSET_WIDTH) - startPoint;
+      to.clear();
+      to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH);
+      /* splitAndTransfer offset buffer */
+      for (int i = 0; i < length + 1; i++) {
+        final long relativeOffset = offsetBuffer.getLong((long) (startIndex + i) * OFFSET_WIDTH) - startPoint;
+        to.offsetBuffer.setLong((long) i * OFFSET_WIDTH, relativeOffset);
+      }
+      /* splitAndTransfer validity buffer */
+      splitAndTransferValidityBuffer(startIndex, length, to);
+      /* splitAndTransfer data buffer */
+      dataTransferPair.splitAndTransfer(checkedCastToInt(startPoint), checkedCastToInt(sliceLength));
+      to.lastSet = length - 1;
+      to.setValueCount(length);
+    }
+
+    /*
+     * transfer the validity.
+     * When startIndex is byte-aligned the target shares a retained slice of the
+     * source validity buffer; otherwise a new buffer is allocated and filled byte
+     * by byte. Nothing is done when length is 0.
+     */
+    private void splitAndTransferValidityBuffer(int startIndex, int length, LargeListVector target) {
+      int firstByteSource = BitVectorHelper.byteIndex(startIndex);
+      int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1);
+      int byteSizeTarget = getValidityBufferSizeFromCount(length);
+      int offset = startIndex % 8;
+
+      if (length > 0) {
+        if (offset == 0) {
+          // slice
+          if (target.validityBuffer != null) {
+            target.validityBuffer.getReferenceManager().release();
+          }
+          target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget);
+          target.validityBuffer.getReferenceManager().retain(1);
+        } else {
+          /* Copy data
+           * When the first bit starts from the middle of a byte (offset != 0),
+           * copy data from src BitVector.
+           * Each byte in the target is composed by a part in i-th byte,
+           * another part in (i+1)-th byte.
+           */
+          target.allocateValidityBuffer(byteSizeTarget);
+
+          for (int i = 0; i < byteSizeTarget - 1; i++) {
+            byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, firstByteSource + i, offset);
+            byte b2 = BitVectorHelper.getBitsFromNextByte(validityBuffer, firstByteSource + i + 1, offset);
+
+            target.validityBuffer.setByte(i, (b1 + b2));
+          }
+
+          /* Copying the last piece is done in the following manner:
+           * if the source vector has 1 or more bytes remaining, we copy
+           * the last piece as a byte formed by shifting data
+           * from the current byte and the next byte.
+           *
+           * if the source vector has no more bytes remaining
+           * (we are at the last byte), we copy the last piece as a byte
+           * by shifting data from the current byte.
+           */
+          if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) {
+            byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer,
+                firstByteSource + byteSizeTarget - 1, offset);
+            byte b2 = BitVectorHelper.getBitsFromNextByte(validityBuffer,
+                firstByteSource + byteSizeTarget, offset);
+
+            target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2);
+          } else {
+            byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer,
+                firstByteSource + byteSizeTarget - 1, offset);
+            target.validityBuffer.setByte(byteSizeTarget - 1, b1);
+          }
+        }
+      }
+    }
+
+    @Override
+    public ValueVector getTo() {
+      return to;
+    }
+
+    @Override
+    public void copyValueSafe(int from, int to) {
+      this.to.copyFrom(from, to, LargeListVector.this);
+    }
+  }
+
+  @Override
+  public UnionLargeListReader getReader() {
+    if (reader == null) {
+      reader = new UnionLargeListReader(this);
+    }
+    return reader;
+  }
+
+  /**
+   * Initialize the data vector (and execute callback) if it hasn't already been done,
+   * returns the data vector.
+ */ + public AddOrGetResult addOrGetVector(FieldType fieldType) { + boolean created = false; + if (vector instanceof NullVector) { + vector = fieldType.createNewSingleVector(defaultDataVectorName, allocator, callBack); + // returned vector must have the same field + created = true; + if (callBack != null && + // not a schema change if changing from ZeroVector to ZeroVector + (fieldType.getType().getTypeID() != ArrowType.ArrowTypeID.Null)) { + callBack.doWork(); + } + } + + if (vector.getField().getType().getTypeID() != fieldType.getType().getTypeID()) { + final String msg = String.format("Inner vector type mismatch. Requested type: [%s], actual type: [%s]", + fieldType.getType().getTypeID(), vector.getField().getType().getTypeID()); + throw new SchemaChangeRuntimeException(msg); + } + + invalidateReader(); + return new AddOrGetResult<>((T) vector, created); + } + + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector. + * @return size of underlying buffers. 
+ */ + @Override + public int getBufferSize() { + if (valueCount == 0) { + return 0; + } + final int offsetBufferSize = (valueCount + 1) * OFFSET_WIDTH; + final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); + return offsetBufferSize + validityBufferSize + vector.getBufferSize(); + } + + @Override + public int getBufferSizeFor(int valueCount) { + if (valueCount == 0) { + return 0; + } + final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); + long innerVectorValueCount = offsetBuffer.getLong((long) valueCount * OFFSET_WIDTH); + + return ((valueCount + 1) * OFFSET_WIDTH) + + vector.getBufferSizeFor(checkedCastToInt(innerVectorValueCount)) + + validityBufferSize; + } + + @Override + public Field getField() { + return new Field(getName(), fieldType, Collections.singletonList(getDataVector().getField())); + } + + @Override + public MinorType getMinorType() { + return MinorType.LARGELIST; + } + + @Override + public String getName() { + return name; + } + + @Override + public void clear() { + offsetBuffer = releaseBuffer(offsetBuffer); + vector.clear(); + valueCount = 0; + super.clear(); + validityBuffer = releaseBuffer(validityBuffer); + lastSet = -1; + } + + @Override + public void reset() { + offsetBuffer.setZero(0, offsetBuffer.capacity()); + vector.reset(); + valueCount = 0; + validityBuffer.setZero(0, validityBuffer.capacity()); + lastSet = -1; + } + + /** + * Return the underlying buffers associated with this vector. Note that this doesn't + * impact the reference counts for this buffer so it only should be used for in-context + * access. Also note that this buffer changes regularly thus + * external classes shouldn't hold a reference to it (unless they change it). + * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted + * but the returned array will be the only reference to them + * @return The underlying {@link ArrowBuf buffers} that is used by this + * vector instance. 
+ */ + @Override + public ArrowBuf[] getBuffers(boolean clear) { + setReaderAndWriterIndex(); + final ArrowBuf[] buffers; + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + List list = new ArrayList<>(); + list.add(offsetBuffer); + list.add(validityBuffer); + list.addAll(Arrays.asList(vector.getBuffers(false))); + buffers = list.toArray(new ArrowBuf[list.size()]); + } + if (clear) { + for (ArrowBuf buffer : buffers) { + buffer.getReferenceManager().retain(); + } + clear(); + } + return buffers; + } + + protected void invalidateReader() { + reader = null; + } + + /** + * Get the element in the list vector at a particular index. + * @param index position of the element + * @return Object at given position + */ + @Override + public List getObject(int index) { + if (isSet(index) == 0) { + return null; + } + final List vals = new JsonStringArrayList<>(); + final long start = offsetBuffer.getLong((long) index * OFFSET_WIDTH); + final long end = offsetBuffer.getLong(((long) index + 1L) * OFFSET_WIDTH); + final ValueVector vv = getDataVector(); + for (long i = start; i < end; i++) { + vals.add(vv.getObject(checkedCastToInt(i))); + } + + return vals; + } + + /** + * Check if element at given index is null. + * + * @param index position of element + * @return true if element at given index is null, false otherwise + */ + @Override + public boolean isNull(int index) { + return (isSet(index) == 0); + } + + /** + * Check if element at given index is empty list. + * @param index position of element + * @return true if element at given index is empty list or NULL, false otherwise + */ + public boolean isEmpty(int index) { + if (isNull(index)) { + return true; + } else { + final long start = offsetBuffer.getLong((long) index * OFFSET_WIDTH); + final long end = offsetBuffer.getLong(((long) index + 1L) * OFFSET_WIDTH); + return start == end; + } + } + + /** + * Same as {@link #isNull(int)}. 
+ * + * @param index position of element + * @return 1 if element at given index is not null, 0 otherwise + */ + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return (b >> bitIndex) & 0x01; + } + + /** + * Get the number of elements that are null in the vector. + * + * @return the number of null elements. + */ + @Override + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } + + /** + * Get the current value capacity for the vector. + * @return number of elements that vector can hold. + */ + @Override + public int getValueCapacity() { + return getValidityAndOffsetValueCapacity(); + } + + protected int getOffsetBufferValueCapacity() { + return checkedCastToInt(offsetBuffer.capacity() / OFFSET_WIDTH); + } + + private int getValidityAndOffsetValueCapacity() { + final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity() - 1, 0); + return Math.min(offsetValueCapacity, getValidityBufferValueCapacity()); + } + + private int getValidityBufferValueCapacity() { + return capAtMaxInt(validityBuffer.capacity() * 8); + } + + /** + * Sets the list at index to be not-null. Reallocates validity buffer if index + * is larger than current capacity. + */ + public void setNotNull(int index) { + while (index >= getValidityAndOffsetValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + BitVectorHelper.setBit(validityBuffer, index); + lastSet = index; + } + + /** + * Sets list at index to be null. 
+ * @param index position in vector + */ + public void setNull(int index) { + while (index >= getValidityAndOffsetValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + if (lastSet >= index) { + lastSet = index - 1; + } + for (int i = lastSet + 1; i <= index; i++) { + final int currentOffset = offsetBuffer.getInt(i * OFFSET_WIDTH); + offsetBuffer.setInt((i + 1) * OFFSET_WIDTH, currentOffset); + } + BitVectorHelper.unsetBit(validityBuffer, index); + } + + /** + * Start a new value in the list vector. + * + * @param index index of the value to start + */ + public long startNewValue(long index) { + while (index >= getValidityAndOffsetValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + for (int i = lastSet + 1; i <= index; i++) { + final long currentOffset = offsetBuffer.getLong((long) i * OFFSET_WIDTH); + offsetBuffer.setLong(((long) i + 1L) * OFFSET_WIDTH, currentOffset); + } + BitVectorHelper.setBit(validityBuffer, index); + lastSet = checkedCastToInt(index); + return offsetBuffer.getLong(((long) lastSet + 1L) * OFFSET_WIDTH); + } + + /** + * End the current value. + * + * @param index index of the value to end + * @param size number of elements in the list that was written + */ + public void endValue(int index, long size) { + final long currentOffset = offsetBuffer.getLong(((long) index + 1L) * OFFSET_WIDTH); + offsetBuffer.setLong(((long) index + 1L) * OFFSET_WIDTH, currentOffset + size); + } + + /** + * Sets the value count for the vector. + * + *

+ * Important note: The underlying vector does not support 64-bit + * allocations yet. This may throw if attempting to hold larger + * than what a 32-bit vector can store. + *

+ * + * @param valueCount value count + */ + @Override + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + if (valueCount > 0) { + while (valueCount > getValidityAndOffsetValueCapacity()) { + /* check if validity and offset buffers need to be re-allocated */ + reallocValidityAndOffsetBuffers(); + } + for (int i = lastSet + 1; i < valueCount; i++) { + /* fill the holes with offsets */ + final long currentOffset = offsetBuffer.getLong((long) i * OFFSET_WIDTH); + offsetBuffer.setLong(((long) i + 1L) * OFFSET_WIDTH, currentOffset); + } + } + /* valueCount for the data vector is the current end offset */ + final long childValueCount = (valueCount == 0) ? 0 : + offsetBuffer.getLong(((long) lastSet + 1L) * OFFSET_WIDTH); + /* set the value count of data vector and this will take care of + * checking whether data buffer needs to be reallocated. + * TODO: revisit when 64-bit vectors are supported + */ + Preconditions.checkArgument(childValueCount <= Integer.MAX_VALUE || childValueCount >= Integer.MIN_VALUE, + "LargeListVector doesn't yet support 64-bit allocations: %s", childValueCount); + vector.setValueCount((int) childValueCount); + } + + public void setLastSet(int value) { + lastSet = value; + } + + public int getLastSet() { + return lastSet; + } + + public long getElementStartIndex(int index) { + return offsetBuffer.getLong((long) index * OFFSET_WIDTH); + } + + public long getElementEndIndex(int index) { + return offsetBuffer.getLong(((long) index + 1L) * OFFSET_WIDTH); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java new file mode 100644 index 000000000..b5b32951a --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -0,0 +1,879 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex; + +import static java.util.Collections.singletonList; +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; +import static org.apache.arrow.util.Preconditions.checkNotNull; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.BufferBacked; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.impl.ComplexCopier; +import org.apache.arrow.vector.complex.impl.UnionListReader; +import 
org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.JsonStringArrayList; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; + +/** + * A list vector contains lists of a specific type of elements. Its structure contains 3 elements. + *
+ * <ol>
+ * <li>A validity buffer.</li>
+ * <li>An offset buffer, that denotes lists boundaries.</li>
+ * <li>A child data vector that contains the elements of lists.</li>
+ * </ol>
+ * The latter two are managed by its superclass. + */ +public class ListVector extends BaseRepeatedValueVector implements PromotableVector { + + public static ListVector empty(String name, BufferAllocator allocator) { + return new ListVector(name, allocator, FieldType.nullable(ArrowType.List.INSTANCE), null); + } + + protected ArrowBuf validityBuffer; + protected UnionListReader reader; + private CallBack callBack; + private final FieldType fieldType; + private int validityAllocationSizeInBytes; + + /** + * The maximum index that is actually set. + */ + private int lastSet; + + /** + * Constructs a new instance. + * + * @param name The name of the instance. + * @param allocator The allocator to use for allocating/reallocating buffers. + * @param fieldType The type of this list. + * @param callBack A schema change callback. + */ + public ListVector(String name, BufferAllocator allocator, FieldType fieldType, CallBack callBack) { + super(name, allocator, callBack); + this.validityBuffer = allocator.getEmpty(); + this.fieldType = checkNotNull(fieldType); + this.callBack = callBack; + this.validityAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); + this.lastSet = -1; + } + + @Override + public void initializeChildrenFromFields(List children) { + if (children.size() != 1) { + throw new IllegalArgumentException("Lists have only one child. 
Found: " + children); + } + Field field = children.get(0); + AddOrGetResult addOrGetVector = addOrGetVector(field.getFieldType()); + if (!addOrGetVector.isCreated()) { + throw new IllegalArgumentException("Child vector already existed: " + addOrGetVector.getVector()); + } + + addOrGetVector.getVector().initializeChildrenFromFields(field.getChildren()); + } + + @Override + public void setInitialCapacity(int numRecords) { + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords); + super.setInitialCapacity(numRecords); + } + + /** + * Specialized version of setInitialCapacity() for ListVector. This is + * used by some callers when they want to explicitly control and be + * conservative about memory allocated for inner data vector. This is + * very useful when we are working with memory constraints for a query + * and have a fixed amount of memory reserved for the record batch. In + * such cases, we are likely to face OOM or related problems when + * we reserve memory for a record batch with value count x and + * do setInitialCapacity(x) such that each vector allocates only + * what is necessary and not the default amount but the multiplier + * forces the memory requirement to go beyond what was needed. + * + * @param numRecords value count + * @param density density of ListVector. Density is the average size of + * list per position in the List vector. For example, a + * density value of 10 implies each position in the list + * vector has a list of 10 values. + * A density value of 0.1 implies out of 10 positions in + * the list vector, 1 position has a list of size 1 and + * remaining positions are null (no lists) or empty lists. + * This helps in tightly controlling the memory we provision + * for inner data vector. 
+ */ + @Override + public void setInitialCapacity(int numRecords, double density) { + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords); + super.setInitialCapacity(numRecords, density); + } + + /** + * Get the density of this ListVector. + * @return density + */ + public double getDensity() { + if (valueCount == 0) { + return 0.0D; + } + final int startOffset = offsetBuffer.getInt(0); + final int endOffset = offsetBuffer.getInt(valueCount * OFFSET_WIDTH); + final double totalListSize = endOffset - startOffset; + return totalListSize / valueCount; + } + + @Override + public List getChildrenFromFields() { + return singletonList(getDataVector()); + } + + /** + * Load the buffers of this vector with provided source buffers. + * The caller manages the source buffers and populates them before invoking + * this method. + * @param fieldNode the fieldNode indicating the value count + * @param ownBuffers the buffers for this Field (own buffers only, children not included) + */ + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + if (ownBuffers.size() != 2) { + throw new IllegalArgumentException("Illegal buffer count, expected " + 2 + ", got: " + ownBuffers.size()); + } + + ArrowBuf bitBuffer = ownBuffers.get(0); + ArrowBuf offBuffer = ownBuffers.get(1); + + validityBuffer.getReferenceManager().release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + offsetBuffer.getReferenceManager().release(); + offsetBuffer = offBuffer.getReferenceManager().retain(offBuffer, allocator); + + validityAllocationSizeInBytes = checkedCastToInt(validityBuffer.capacity()); + offsetAllocationSizeInBytes = offsetBuffer.capacity(); + + lastSet = fieldNode.getLength() - 1; + valueCount = fieldNode.getLength(); + } + + /** + * Get the buffers belonging to this vector. + * @return the inner buffers. 
+ */ + @Override + public List getFieldBuffers() { + List result = new ArrayList<>(2); + setReaderAndWriterIndex(); + result.add(validityBuffer); + result.add(offsetBuffer); + + return result; + } + + /** + * Set the reader and writer indexes for the inner buffers. + */ + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + offsetBuffer.readerIndex(0); + if (valueCount == 0) { + validityBuffer.writerIndex(0); + offsetBuffer.writerIndex(0); + } else { + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + offsetBuffer.writerIndex((valueCount + 1) * OFFSET_WIDTH); + } + } + + /** + * Get the inner vectors. + * + * @deprecated This API will be removed as the current implementations no longer support inner vectors. + * + * @return the inner vectors for this field as defined by the TypeLayout + */ + @Deprecated + @Override + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. Use getFieldBuffers"); + } + + public UnionListWriter getWriter() { + return new UnionListWriter(this); + } + + /** + * Same as {@link #allocateNewSafe()}. + */ + @Override + public void allocateNew() throws OutOfMemoryException { + if (!allocateNewSafe()) { + throw new OutOfMemoryException("Failure while allocating memory"); + } + } + + /** + * Allocate memory for the vector. We internally use a default value count + * of 4096 to allocate memory for at least these many elements in the + * vector. + * + * @return false if memory allocation fails, true otherwise. 
+ */ + public boolean allocateNewSafe() { + boolean success = false; + try { + /* we are doing a new allocation -- release the current buffers */ + clear(); + /* allocate validity buffer */ + allocateValidityBuffer(validityAllocationSizeInBytes); + /* allocate offset and data buffer */ + success = super.allocateNewSafe(); + } finally { + if (!success) { + clear(); + return false; + } + } + return true; + } + + private void allocateValidityBuffer(final long size) { + final int curSize = (int) size; + validityBuffer = allocator.buffer(curSize); + validityBuffer.readerIndex(0); + validityAllocationSizeInBytes = curSize; + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + /** + * Resize the vector to increase the capacity. The internal behavior is to + * double the current value capacity. + */ + @Override + public void reAlloc() { + /* reallocate the validity buffer */ + reallocValidityBuffer(); + /* reallocate the offset and data */ + super.reAlloc(); + } + + private void reallocValidityAndOffsetBuffers() { + reallocOffsetBuffer(); + reallocValidityBuffer(); + } + + private void reallocValidityBuffer() { + final int currentBufferCapacity = checkedCastToInt(validityBuffer.capacity()); + long newAllocationSize = currentBufferCapacity * 2; + if (newAllocationSize == 0) { + if (validityAllocationSizeInBytes > 0) { + newAllocationSize = validityAllocationSizeInBytes; + } else { + newAllocationSize = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION) * 2; + } + } + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + assert newAllocationSize >= 1; + + if (newAllocationSize > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); + newBuf.setBytes(0, validityBuffer, 0, currentBufferCapacity); + newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + validityBuffer.getReferenceManager().release(1); + 
validityBuffer = newBuf; + validityAllocationSizeInBytes = (int) newAllocationSize; + } + + /** + * Same as {@link #copyFrom(int, int, ValueVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param inIndex position to copy from in source vector + * @param outIndex position to copy to in this vector + * @param from source vector + */ + @Override + public void copyFromSafe(int inIndex, int outIndex, ValueVector from) { + copyFrom(inIndex, outIndex, from); + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector. + * @param inIndex position to copy from in source vector + * @param outIndex position to copy to in this vector + * @param from source vector + */ + @Override + public void copyFrom(int inIndex, int outIndex, ValueVector from) { + Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); + FieldReader in = from.getReader(); + in.setPosition(inIndex); + FieldWriter out = getWriter(); + out.setPosition(outIndex); + ComplexCopier.copy(in, out); + } + + /** + * Get the inner data vector for this list vector. 
+ * @return data vector + */ + @Override + public FieldVector getDataVector() { + return vector; + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return getTransferPair(ref, allocator, null); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return new TransferImpl(ref, allocator, callBack); + } + + @Override + public TransferPair makeTransferPair(ValueVector target) { + return new TransferImpl((ListVector) target); + } + + @Override + public long getValidityBufferAddress() { + return (validityBuffer.memoryAddress()); + } + + @Override + public long getDataBufferAddress() { + throw new UnsupportedOperationException(); + } + + @Override + public long getOffsetBufferAddress() { + return (offsetBuffer.memoryAddress()); + } + + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + @Override + public ArrowBuf getDataBuffer() { + throw new UnsupportedOperationException(); + } + + @Override + public ArrowBuf getOffsetBuffer() { + return offsetBuffer; + } + + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + if (isSet(index) == 0) { + return ArrowBufPointer.NULL_HASH_CODE; + } + int hash = 0; + final int start = offsetBuffer.getInt(index * OFFSET_WIDTH); + final int end = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH); + for (int i = start; i < end; i++) { + hash = ByteFunctionHelpers.combineHash(hash, vector.hashCode(i, hasher)); + } + return hash; + } + + @Override + public OUT accept(VectorVisitor visitor, IN value) { + return visitor.visit(this, value); + } + + private class TransferImpl implements TransferPair { + + ListVector to; + TransferPair dataTransferPair; + + public TransferImpl(String name, BufferAllocator allocator, CallBack callBack) { + this(new ListVector(name, allocator, fieldType, callBack)); + } + + 
public TransferImpl(ListVector to) { + this.to = to; + to.addOrGetVector(vector.getField().getFieldType()); + if (to.getDataVector() instanceof ZeroVector) { + to.addOrGetVector(vector.getField().getFieldType()); + } + dataTransferPair = getDataVector().makeTransferPair(to.getDataVector()); + } + + /** + * Transfer this vector's data to another vector. The memory associated + * with this vector is transferred to the allocator of the target vector + * for accounting and management purposes. + */ + @Override + public void transfer() { + to.clear(); + dataTransferPair.transfer(); + to.validityBuffer = transferBuffer(validityBuffer, to.allocator); + to.offsetBuffer = transferBuffer(offsetBuffer, to.allocator); + to.lastSet = lastSet; + if (valueCount > 0) { + to.setValueCount(valueCount); + } + clear(); + } + + /** + * Slice this vector at desired index and length and transfer the + * corresponding data to the target vector. + * @param startIndex start position of the split in source vector. + * @param length length of the split. 
+ */ + @Override + public void splitAndTransfer(int startIndex, int length) { + Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount, + "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); + final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); + final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint; + to.clear(); + to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); + /* splitAndTransfer offset buffer */ + for (int i = 0; i < length + 1; i++) { + final int relativeOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - startPoint; + to.offsetBuffer.setInt(i * OFFSET_WIDTH, relativeOffset); + } + /* splitAndTransfer validity buffer */ + splitAndTransferValidityBuffer(startIndex, length, to); + /* splitAndTransfer data buffer */ + dataTransferPair.splitAndTransfer(startPoint, sliceLength); + to.lastSet = length - 1; + to.setValueCount(length); + } + + /* + * transfer the validity. + */ + private void splitAndTransferValidityBuffer(int startIndex, int length, ListVector target) { + int firstByteSource = BitVectorHelper.byteIndex(startIndex); + int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); + int byteSizeTarget = getValidityBufferSizeFromCount(length); + int offset = startIndex % 8; + + if (length > 0) { + if (offset == 0) { + // slice + if (target.validityBuffer != null) { + target.validityBuffer.getReferenceManager().release(); + } + target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget); + target.validityBuffer.getReferenceManager().retain(1); + } else { + /* Copy data + * When the first bit starts from the middle of a byte (offset != 0), + * copy data from src BitVector. + * Each byte in the target is composed by a part in i-th byte, + * another part in (i+1)-th byte. 
+ */ + target.allocateValidityBuffer(byteSizeTarget); + + for (int i = 0; i < byteSizeTarget - 1; i++) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, firstByteSource + i, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(validityBuffer, firstByteSource + i + 1, offset); + + target.validityBuffer.setByte(i, (b1 + b2)); + } + + /* Copying the last piece is done in the following manner: + * if the source vector has 1 or more bytes remaining, we copy + * the last piece as a byte formed by shifting data + * from the current byte and the next byte. + * + * if the source vector has no more bytes remaining + * (we are at the last byte), we copy the last piece as a byte + * by shifting data from the current byte. + */ + if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(validityBuffer, + firstByteSource + byteSizeTarget, offset); + + target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2); + } else { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + target.validityBuffer.setByte(byteSizeTarget - 1, b1); + } + } + } + } + + @Override + public ValueVector getTo() { + return to; + } + + @Override + public void copyValueSafe(int from, int to) { + this.to.copyFrom(from, to, ListVector.this); + } + } + + @Override + public UnionListReader getReader() { + if (reader == null) { + reader = new UnionListReader(this); + } + return reader; + } + + /** Initialize the child data vector to field type. */ + public AddOrGetResult addOrGetVector(FieldType fieldType) { + AddOrGetResult result = super.addOrGetVector(fieldType); + invalidateReader(); + return result; + } + + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector. + * @return size of underlying buffers. 
+ */ + @Override + public int getBufferSize() { + if (valueCount == 0) { + return 0; + } + final int offsetBufferSize = (valueCount + 1) * OFFSET_WIDTH; + final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); + return offsetBufferSize + validityBufferSize + vector.getBufferSize(); + } + + @Override + public int getBufferSizeFor(int valueCount) { + if (valueCount == 0) { + return 0; + } + final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); + + return super.getBufferSizeFor(valueCount) + validityBufferSize; + } + + @Override + public Field getField() { + return new Field(getName(), fieldType, Collections.singletonList(getDataVector().getField())); + } + + @Override + public MinorType getMinorType() { + return MinorType.LIST; + } + + @Override + public void clear() { + super.clear(); + validityBuffer = releaseBuffer(validityBuffer); + lastSet = -1; + } + + @Override + public void reset() { + super.reset(); + validityBuffer.setZero(0, validityBuffer.capacity()); + lastSet = -1; + } + + /** + * Return the underlying buffers associated with this vector. Note that this doesn't + * impact the reference counts for this buffer so it only should be used for in-context + * access. Also note that this buffer changes regularly thus + * external classes shouldn't hold a reference to it (unless they change it). + * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted + * but the returned array will be the only reference to them + * @return The underlying {@link ArrowBuf buffers} that is used by this + * vector instance. 
+ */ + @Override + public ArrowBuf[] getBuffers(boolean clear) { + setReaderAndWriterIndex(); + final ArrowBuf[] buffers; + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + List list = new ArrayList<>(); + list.add(offsetBuffer); + list.add(validityBuffer); + list.addAll(Arrays.asList(vector.getBuffers(false))); + buffers = list.toArray(new ArrowBuf[list.size()]); + } + if (clear) { + for (ArrowBuf buffer : buffers) { + buffer.getReferenceManager().retain(); + } + clear(); + } + return buffers; + } + + @Override + public UnionVector promoteToUnion() { + UnionVector vector = new UnionVector("$data$", allocator, /* field type*/ null, callBack); + replaceDataVector(vector); + invalidateReader(); + if (callBack != null) { + callBack.doWork(); + } + return vector; + } + + protected void invalidateReader() { + reader = null; + } + + /** + * Get the element in the list vector at a particular index. + * @param index position of the element + * @return Object at given position + */ + @Override + public List getObject(int index) { + if (isSet(index) == 0) { + return null; + } + final List vals = new JsonStringArrayList<>(); + final int start = offsetBuffer.getInt(index * OFFSET_WIDTH); + final int end = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH); + final ValueVector vv = getDataVector(); + for (int i = start; i < end; i++) { + vals.add(vv.getObject(i)); + } + + return vals; + } + + /** + * Check if element at given index is null. + * + * @param index position of element + * @return true if element at given index is null, false otherwise + */ + @Override + public boolean isNull(int index) { + return (isSet(index) == 0); + } + + /** + * Check if element at given index is empty list. 
+ * @param index position of element + * @return true if element at given index is empty list or NULL, false otherwise + */ + @Override + public boolean isEmpty(int index) { + if (isNull(index)) { + return true; + } else { + final int start = offsetBuffer.getInt(index * OFFSET_WIDTH); + final int end = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH); + return start == end; + } + } + + /** + * Read the validity bit of the element at the given index. + * + * @param index position of element + * @return 1 if element at given index is not null, 0 otherwise + */ + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return (b >> bitIndex) & 0x01; + } + + /** + * Get the number of elements that are null in the vector. + * + * @return the number of null elements. + */ + @Override + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } + + /** + * Get the current value capacity for the vector. + * @return number of elements that vector can hold. + */ + @Override + public int getValueCapacity() { + return getValidityAndOffsetValueCapacity(); + } + + private int getValidityAndOffsetValueCapacity() { + final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity() - 1, 0); + return Math.min(offsetValueCapacity, getValidityBufferValueCapacity()); + } + + private int getValidityBufferValueCapacity() { + return capAtMaxInt(validityBuffer.capacity() * 8); + } + + /** + * Sets the list at index to be not-null. Reallocates the validity and offset buffers + * while index is at or beyond the current capacity. + */ + public void setNotNull(int index) { + while (index >= getValidityAndOffsetValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + BitVectorHelper.setBit(validityBuffer, index); + lastSet = index; + } + + /** + * Sets list at index to be null. 
+ * @param index position in vector + */ + public void setNull(int index) { + while (index >= getValidityAndOffsetValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + if (lastSet >= index) { + lastSet = index - 1; + } + for (int i = lastSet + 1; i <= index; i++) { + final int currentOffset = offsetBuffer.getInt(i * OFFSET_WIDTH); + offsetBuffer.setInt((i + 1) * OFFSET_WIDTH, currentOffset); + } + BitVectorHelper.unsetBit(validityBuffer, index); + } + + /** + * Start a new value in the list vector. + * + * @param index index of the value to start + */ + @Override + public int startNewValue(int index) { + while (index >= getValidityAndOffsetValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + if (lastSet >= index) { + lastSet = index - 1; + } + for (int i = lastSet + 1; i <= index; i++) { + final int currentOffset = offsetBuffer.getInt(i * OFFSET_WIDTH); + offsetBuffer.setInt((i + 1) * OFFSET_WIDTH, currentOffset); + } + BitVectorHelper.setBit(validityBuffer, index); + lastSet = index; + return offsetBuffer.getInt((lastSet + 1) * OFFSET_WIDTH); + } + + /** + * End the current value. + * + * @param index index of the value to end + * @param size number of elements in the list that was written + */ + public void endValue(int index, int size) { + final int currentOffset = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, currentOffset + size); + } + + /** + * Sets the value count for the vector. 
+ * + * @param valueCount value count + */ + @Override + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + if (valueCount > 0) { + while (valueCount > getValidityAndOffsetValueCapacity()) { + /* check if validity and offset buffers need to be re-allocated */ + reallocValidityAndOffsetBuffers(); + } + for (int i = lastSet + 1; i < valueCount; i++) { + /* fill the holes with offsets */ + final int currentOffset = offsetBuffer.getInt(i * OFFSET_WIDTH); + offsetBuffer.setInt((i + 1) * OFFSET_WIDTH, currentOffset); + } + } + /* valueCount for the data vector is the current end offset */ + final int childValueCount = (valueCount == 0) ? 0 : + offsetBuffer.getInt((lastSet + 1) * OFFSET_WIDTH); + /* set the value count of data vector and this will take care of + * checking whether data buffer needs to be reallocated. + */ + vector.setValueCount(childValueCount); + } + + public void setLastSet(int value) { + lastSet = value; + } + + public int getLastSet() { + return lastSet; + } + + @Override + public int getElementStartIndex(int index) { + return offsetBuffer.getInt(index * OFFSET_WIDTH); + } + + @Override + public int getElementEndIndex(int index) { + return offsetBuffer.getInt((index + 1) * OFFSET_WIDTH); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java new file mode 100644 index 000000000..d4275e6fe --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex; + +import static org.apache.arrow.util.Preconditions.checkArgument; + +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.complex.impl.UnionMapReader; +import org.apache.arrow.vector.complex.impl.UnionMapWriter; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType.Map; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; + +/** + * A MapVector is used to store entries of key/value pairs. It is a container vector that is + * composed of a list of struct values with "key" and "value" fields. The MapVector is nullable, + * but if a map is set at a given index, there must be an entry. In other words, the StructVector + * data is non-nullable. Also for a given entry, the "key" is non-nullable, however the "value" can + * be null. + */ +public class MapVector extends ListVector { + + public static final String KEY_NAME = "key"; + public static final String VALUE_NAME = "value"; + public static final String DATA_VECTOR_NAME = "entries"; + + /** + * Construct an empty MapVector with no data. Child vectors must be added subsequently. + * + * @param name The name of the vector. + * @param allocator The allocator used for allocating/reallocating buffers. 
+ * @param keysSorted True if the map keys have been pre-sorted. + * @return a new instance of MapVector. + */ + public static MapVector empty(String name, BufferAllocator allocator, boolean keysSorted) { + return new MapVector(name, allocator, FieldType.nullable(new Map(keysSorted)), null); + } + + /** + * Construct a MapVector instance. + * + * @param name The name of the vector. + * @param allocator The allocator used for allocating/reallocating buffers. + * @param fieldType The type definition of the MapVector. + * @param callBack A schema change callback. + */ + public MapVector(String name, BufferAllocator allocator, FieldType fieldType, CallBack callBack) { + super(name, allocator, fieldType, callBack); + defaultDataVectorName = DATA_VECTOR_NAME; + } + + /** + * Initialize child vectors of the map from the given list of fields. + * + * @param children List of fields that will be children of this MapVector. + */ + @Override + public void initializeChildrenFromFields(List children) { + checkArgument(children.size() == 1, "Maps have one List child. Found: %s", children); + + Field structField = children.get(0); + MinorType minorType = Types.getMinorTypeForArrowType(structField.getType()); + checkArgument(minorType == MinorType.STRUCT && !structField.isNullable(), + "Map data should be a non-nullable struct type"); + checkArgument(structField.getChildren().size() == 2, + "Map data should be a struct with 2 children. Found: %s", children); + + Field keyField = structField.getChildren().get(0); + checkArgument(!keyField.isNullable(), "Map data key type should be a non-nullable"); + + AddOrGetResult addOrGetVector = addOrGetVector(structField.getFieldType()); + checkArgument(addOrGetVector.isCreated(), "Child vector already existed: %s", addOrGetVector.getVector()); + + addOrGetVector.getVector().initializeChildrenFromFields(structField.getChildren()); + } + + /** + * Get the writer for this MapVector instance. 
+ */ + @Override + public UnionMapWriter getWriter() { + return new UnionMapWriter(this); + } + + /** + * Get the reader for this MapVector instance. + */ + @Override + public UnionMapReader getReader() { + if (reader == null) { + reader = new UnionMapReader(this); + } + return (UnionMapReader) reader; + } + + @Override + public MinorType getMinorType() { + return MinorType.MAP; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java new file mode 100644 index 000000000..4da266812 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java @@ -0,0 +1,440 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.complex; + +import static org.apache.arrow.util.Preconditions.checkNotNull; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.DensityAwareVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.impl.SingleStructReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.ComplexHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.JsonStringHashMap; +import org.apache.arrow.vector.util.TransferPair; + +/** + * A struct vector that has no null values (and no validity buffer). + * Child Vectors are handled in {@link AbstractStructVector}. 
+ */ +public class NonNullableStructVector extends AbstractStructVector { + + public static NonNullableStructVector empty(String name, BufferAllocator allocator) { + FieldType fieldType = new FieldType(false, ArrowType.Struct.INSTANCE, null, null); + return new NonNullableStructVector(name, allocator, fieldType, null, ConflictPolicy.CONFLICT_REPLACE, false); + } + + public static NonNullableStructVector emptyWithDuplicates(String name, BufferAllocator allocator) { + FieldType fieldType = new FieldType(false, ArrowType.Struct.INSTANCE, null, null); + return new NonNullableStructVector(name, allocator, fieldType, null, ConflictPolicy.CONFLICT_APPEND, true); + } + + private final SingleStructReaderImpl reader = new SingleStructReaderImpl(this); + protected final FieldType fieldType; + public int valueCount; + + /** + * Constructs a new instance. + * + * @param name The name of the instance. + * @param allocator The allocator to use for allocating/reallocating buffers. + * @param fieldType The type of this struct vector; must not be null. + */ + public NonNullableStructVector(String name, + BufferAllocator allocator, + FieldType fieldType, + CallBack callBack) { + super(name, + allocator, + callBack, + null, + true); + this.fieldType = checkNotNull(fieldType); + this.valueCount = 0; + } + + /** + * Constructs a new instance. + * + * @param name The name of the instance. + * @param allocator The allocator to use for allocating/reallocating buffers. + * @param fieldType The type of this struct vector; must not be null. + * @param callBack A schema change callback. + * @param conflictPolicy How to handle duplicate field names in the struct. 
+ */ + public NonNullableStructVector(String name, + BufferAllocator allocator, + FieldType fieldType, + CallBack callBack, + ConflictPolicy conflictPolicy, + boolean allowConflictPolicyChanges) { + super(name, allocator, callBack, conflictPolicy, allowConflictPolicyChanges); + this.fieldType = checkNotNull(fieldType); + this.valueCount = 0; + } + + @Override + public FieldReader getReader() { + return reader; + } + + private transient StructTransferPair ephPair; + + /** + * Copies the element at fromIndex in the provided vector to thisIndex. Reallocates buffers + * if thisIndex is larger than current capacity. + */ + @Override + public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { + Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); + if (ephPair == null || ephPair.from != from) { + ephPair = (StructTransferPair) from.makeTransferPair(this); + } + ephPair.copyValueSafe(fromIndex, thisIndex); + } + + @Override + public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) { + copyFrom(fromIndex, thisIndex, from); + } + + @Override + protected boolean supportsDirectRead() { + return true; + } + + public Iterator fieldNameIterator() { + return getChildFieldNames().iterator(); + } + + @Override + public void setInitialCapacity(int numRecords) { + for (final ValueVector v : this) { + v.setInitialCapacity(numRecords); + } + } + + @Override + public void setInitialCapacity(int valueCount, double density) { + for (final ValueVector vector : this) { + if (vector instanceof DensityAwareVector) { + ((DensityAwareVector) vector).setInitialCapacity(valueCount, density); + } else { + vector.setInitialCapacity(valueCount); + } + } + } + + @Override + public int getBufferSize() { + if (valueCount == 0 || size() == 0) { + return 0; + } + long buffer = 0; + for (final ValueVector v : this) { + buffer += v.getBufferSize(); + } + + return (int) buffer; + } + + @Override + public int getBufferSizeFor(final int valueCount) { + if 
(valueCount == 0) { + return 0; + } + + long bufferSize = 0; + for (final ValueVector v : this) { + bufferSize += v.getBufferSizeFor(valueCount); + } + + return (int) bufferSize; + } + + @Override + public ArrowBuf getValidityBuffer() { + throw new UnsupportedOperationException(); + } + + @Override + public ArrowBuf getDataBuffer() { + throw new UnsupportedOperationException(); + } + + @Override + public ArrowBuf getOffsetBuffer() { + throw new UnsupportedOperationException(); + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return getTransferPair(name, allocator, null); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return new StructTransferPair(this, new NonNullableStructVector(ref, + allocator, + fieldType, + callBack, + getConflictPolicy(), + allowConflictPolicyChanges), false); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new StructTransferPair(this, (NonNullableStructVector) to); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new StructTransferPair(this, new NonNullableStructVector(ref, + allocator, + fieldType, + callBack, + getConflictPolicy(), + allowConflictPolicyChanges), false); + } + + /** + * {@link TransferPair} for this class. 
+ */ + protected static class StructTransferPair implements TransferPair { + private final TransferPair[] pairs; + private final NonNullableStructVector from; + private final NonNullableStructVector to; + + public StructTransferPair(NonNullableStructVector from, NonNullableStructVector to) { + this(from, to, true); + } + + protected StructTransferPair(NonNullableStructVector from, NonNullableStructVector to, boolean allocate) { + this.from = from; + this.to = to; + this.pairs = new TransferPair[from.size()]; + this.to.ephPair = null; + + int i = 0; + FieldVector vector; + for (String child : from.getChildFieldNames()) { + int preSize = to.size(); + vector = from.getChild(child); + if (vector == null) { + continue; + } + //DRILL-1872: we add the child fields for the vector, looking up the field by name. For a map vector, + // the child fields may be nested fields of the top level child. For example if the structure + // of a child field is oa.oab.oabc then we add oa, then add oab to oa then oabc to oab. + // But the children member of a Materialized field is a HashSet. If the fields are added in the + // children HashSet, and the hashCode of the Materialized field includes the hash code of the + // children, the hashCode value of oa changes *after* the field has been added to the HashSet. + // (This is similar to what happens in ScanBatch where the children cannot be added till they are + // read). To take care of this, we ensure that the hashCode of the MaterializedField does not + // include the hashCode of the children but is based only on MaterializedField$key. 
+ final FieldVector newVector = to.addOrGet(child, vector.getField().getFieldType(), vector.getClass()); + if (allocate && to.size() != preSize) { + newVector.allocateNew(); + } + pairs[i++] = vector.makeTransferPair(newVector); + } + } + + @Override + public void transfer() { + for (final TransferPair p : pairs) { + p.transfer(); + } + to.valueCount = from.valueCount; + from.clear(); + } + + @Override + public ValueVector getTo() { + return to; + } + + @Override + public void copyValueSafe(int from, int to) { + for (TransferPair p : pairs) { + p.copyValueSafe(from, to); + } + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + for (TransferPair p : pairs) { + p.splitAndTransfer(startIndex, length); + } + to.setValueCount(length); + } + } + + @Override + public int getValueCapacity() { + if (size() == 0) { + return 0; + } + + return getChildren().stream() + .mapToInt(child -> child.getValueCapacity()) + .min() + .getAsInt(); + } + + @Override + public Map getObject(int index) { + Map vv = new JsonStringHashMap<>(); + for (String child : getChildFieldNames()) { + ValueVector v = getChild(child); + if (v != null && index < v.getValueCount()) { + Object value = v.getObject(index); + if (value != null) { + vv.put(child, value); + } + } + } + return vv; + } + + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + int hash = 0; + for (FieldVector v : getChildren()) { + if (index < v.getValueCount()) { + hash = ByteFunctionHelpers.combineHash(hash, v.hashCode(index, hasher)); + } + } + return hash; + } + + @Override + public OUT accept(VectorVisitor visitor, IN value) { + return visitor.visit(this, value); + } + + @Override + public boolean isNull(int index) { + return false; + } + + @Override + public int getNullCount() { + return 0; + } + + public void get(int index, ComplexHolder holder) { + reader.setPosition(index); + holder.reader = 
reader; + } + + @Override + public int getValueCount() { + return valueCount; + } + + public ValueVector getVectorById(int id) { + return getChildByOrdinal(id); + } + + @Override + public void setValueCount(int valueCount) { + for (final ValueVector v : getChildren()) { + v.setValueCount(valueCount); + } + NonNullableStructVector.this.valueCount = valueCount; + } + + @Override + public void clear() { + for (final ValueVector v : getChildren()) { + v.clear(); + } + valueCount = 0; + } + + @Override + public void reset() { + for (final ValueVector v : getChildren()) { + v.reset(); + } + valueCount = 0; + } + + @Override + public Field getField() { + List children = new ArrayList<>(); + for (ValueVector child : getChildren()) { + children.add(child.getField()); + } + return new Field(name, fieldType, children); + } + + @Override + public MinorType getMinorType() { + return MinorType.STRUCT; + } + + @Override + public void close() { + final Collection vectors = getChildren(); + for (final FieldVector v : vectors) { + v.close(); + } + vectors.clear(); + + valueCount = 0; + + super.close(); + } + + /** Initializes the struct's members from the given Fields. */ + public void initializeChildrenFromFields(List children) { + for (Field field : children) { + FieldVector vector = (FieldVector) this.add(field.getName(), field.getFieldType()); + vector.initializeChildrenFromFields(field.getChildren()); + } + } + + public List getChildrenFromFields() { + return getChildren(); + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java new file mode 100644 index 000000000..dda495408 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex; + +/** + * Get and set position in a particular data structure. + * + */ +@SuppressWarnings("unused") // Used when instantiating freemarker templates. +public interface Positionable { + int getPosition(); + + void setPosition(int index); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/PromotableVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/PromotableVector.java new file mode 100644 index 000000000..d4dd94acb --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/PromotableVector.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex; + +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.types.pojo.FieldType; + +/** + * Vector that can store multiple {@linkplain FieldType} vectors as children. + */ +public interface PromotableVector { + + AddOrGetResult addOrGetVector(FieldType type); + + UnionVector promoteToUnion(); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java new file mode 100644 index 000000000..e754f6913 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.complex;
+
+/**
+ * A {@link org.apache.arrow.vector.ValueVector} mix-in that can be used in conjunction with
+ * {@link RepeatedValueVector} subtypes.
+ */
+public interface RepeatedFixedWidthVectorLike {
+  /**
+   * Allocates new memory for this vector. Must be called before the ValueVector is used.
+   *
+   * @param valueCount Number of separate repeating groupings.
+   * @param innerValueCount Number of supported values in the vector.
+   */
+  void allocateNew(int valueCount, int innerValueCount);
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java
new file mode 100644
index 000000000..1cae881dd
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.complex;
+
+import org.apache.arrow.vector.DensityAwareVector;
+import org.apache.arrow.vector.UInt4Vector;
+import org.apache.arrow.vector.ValueVector;
+
+/**
+ * An abstraction representing repeated value vectors.
+ *
+ * <p>A repeated vector contains values that may either be flat or nested. A value consists of zero or more
+ * cells (inner values). Current design maintains data and offsets vectors. Each cell is stored in the data vector.
+ * Repeated vector uses the offset vector to determine the sequence of cells pertaining to an individual value.
+ */
+public interface RepeatedValueVector extends ValueVector, DensityAwareVector {
+
+  // NOTE(review): presumably the assumed number of inner values per record used when sizing
+  // allocations - confirm against the users of this constant.
+  int DEFAULT_REPEAT_PER_RECORD = 5;
+
+  /**
+   * Get the offset vector.
+   * @deprecated This API will be removed, as the current implementations no longer hold inner offset vectors.
+   *
+   * @return the underlying offset vector or null if none exists.
+   */
+  @Deprecated
+  UInt4Vector getOffsetVector();
+
+  /**
+   * Get the data vector, i.e. the vector in which the individual cells are stored.
+   * @return the underlying data vector or null if none exists.
+   */
+  ValueVector getDataVector();
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java
new file mode 100644
index 000000000..5f5324138
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.complex;
+
+/**
+ * A {@link org.apache.arrow.vector.ValueVector} mix-in that can be used in conjunction with
+ * variable {@link RepeatedValueVector} subtypes (e.g. Strings, Lists, etc).
+ */
+public interface RepeatedVariableWidthVectorLike {
+  /**
+   * Allocates new memory for this vector. Must be called before the ValueVector is used.
+   *
+   * @param totalBytes Desired size of the underlying data buffer.
+   * @param parentValueCount Number of separate repeating groupings.
+   * @param childValueCount Number of supported values in the vector.
+   */
+  void allocateNew(int totalBytes, int parentValueCount, int childValueCount);
+
+  /**
+   * Provide the maximum amount of variable width bytes that can be stored in this vector.
+   *
+   * @return the byte capacity
+   */
+  int getByteCapacity();
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java
new file mode 100644
index 000000000..0098f6836
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex; + +import java.util.Arrays; + +/** + * Utility methods for state machines based on enums. + */ +public class StateTool { + private StateTool() {} + + static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(StateTool.class); + + /** + * Verifies currentState is in one of expectedStates, + * throws an IllegalArgumentException if it isn't. + */ + public static > void check(T currentState, T... expectedStates) { + for (T s : expectedStates) { + if (s == currentState) { + return; + } + } + throw new IllegalArgumentException(String.format("Expected to be in one of these states %s but was actually in " + + "state %s", Arrays.toString(expectedStates), currentState)); + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java new file mode 100644 index 000000000..2dabc6e01 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java @@ -0,0 +1,608 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; +import static org.apache.arrow.util.Preconditions.checkNotNull; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.BufferBacked; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.impl.NullableStructReaderImpl; +import org.apache.arrow.vector.complex.impl.NullableStructWriter; +import org.apache.arrow.vector.holders.ComplexHolder; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; + +/** + * A Struct vector consists of nullability/validity buffer and children vectors + * that make up the struct's fields. The children vectors are handled by the + * parent class. 
+ */
+public class StructVector extends NonNullableStructVector implements FieldVector {
+
+  /**
+   * Creates an empty struct vector with a nullable struct field type, no callback,
+   * {@code ConflictPolicy.CONFLICT_REPLACE} and conflict-policy changes disabled.
+   *
+   * @param name the name of the vector
+   * @param allocator the allocator backing the vector's buffers
+   */
+  public static StructVector empty(String name, BufferAllocator allocator) {
+    FieldType fieldType = FieldType.nullable(Struct.INSTANCE);
+    return new StructVector(name, allocator, fieldType, null, ConflictPolicy.CONFLICT_REPLACE, false);
+  }
+
+  /**
+   * Creates an empty struct vector that uses {@code ConflictPolicy.CONFLICT_APPEND} and allows
+   * conflict-policy changes.
+   *
+   * @param name the name of the vector
+   * @param allocator the allocator backing the vector's buffers
+   */
+  public static StructVector emptyWithDuplicates(String name, BufferAllocator allocator) {
+    // NOTE(review): unlike empty(), the first FieldType argument is false here (presumably the
+    // nullable flag) - confirm this is intended for a vector that carries a validity buffer.
+    FieldType fieldType = new FieldType(false, ArrowType.Struct.INSTANCE, null, null);
+    return new StructVector(name, allocator, fieldType, null, ConflictPolicy.CONFLICT_APPEND, true);
+  }
+
+  private final NullableStructReaderImpl reader = new NullableStructReaderImpl(this);
+  private final NullableStructWriter writer = new NullableStructWriter(this);
+
+  // Validity bitmap: one bit per value, read by isSet(). Starts as the allocator's empty buffer.
+  protected ArrowBuf validityBuffer;
+  // Size in bytes used for the next fresh allocation of the validity buffer.
+  private int validityAllocationSizeInBytes;
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param name The name of the instance.
+   * @param allocator The allocator to use for allocating/reallocating buffers; must not be null.
+   * @param fieldType The field type of this struct vector.
+   * @param callBack A schema change callback.
+   */
+  public StructVector(String name,
+      BufferAllocator allocator,
+      FieldType fieldType,
+      CallBack callBack) {
+    super(name,
+        checkNotNull(allocator),
+        fieldType,
+        callBack);
+    this.validityBuffer = allocator.getEmpty();
+    this.validityAllocationSizeInBytes =
+        BitVectorHelper.getValidityBufferSize(BaseValueVector.INITIAL_VALUE_ALLOCATION);
+  }
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param name The name of the instance.
+   * @param allocator The allocator to use for allocating/reallocating buffers; must not be null.
+   * @param fieldType The field type of this struct vector.
+   * @param callBack A schema change callback.
+   * @param conflictPolicy policy to determine how duplicate names are handled.
+   * @param allowConflictPolicyChanges whether duplicate names are allowed at all.
+ */ + public StructVector(String name, + BufferAllocator allocator, + FieldType fieldType, + CallBack callBack, + ConflictPolicy conflictPolicy, + boolean allowConflictPolicyChanges) { + super(name, checkNotNull(allocator), fieldType, callBack, conflictPolicy, allowConflictPolicyChanges); + this.validityBuffer = allocator.getEmpty(); + this.validityAllocationSizeInBytes = + BitVectorHelper.getValidityBufferSize(BaseValueVector.INITIAL_VALUE_ALLOCATION); + } + + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + if (ownBuffers.size() != 1) { + throw new IllegalArgumentException("Illegal buffer count, expected " + 1 + ", got: " + ownBuffers.size()); + } + + ArrowBuf bitBuffer = ownBuffers.get(0); + + validityBuffer.getReferenceManager().release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + valueCount = fieldNode.getLength(); + validityAllocationSizeInBytes = checkedCastToInt(validityBuffer.capacity()); + } + + @Override + public List getFieldBuffers() { + List result = new ArrayList<>(1); + setReaderAndWriterIndex(); + result.add(validityBuffer); + + return result; + } + + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSize(valueCount)); + } + + /** + * Get the inner vectors. + * + * @deprecated This API will be removed as the current implementations no longer support inner vectors. + * + * @return the inner vectors for this field as defined by the TypeLayout + */ + @Deprecated + @Override + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. 
Use getFieldBuffers"); + } + + @Override + public NullableStructReaderImpl getReader() { + return reader; + } + + public NullableStructWriter getWriter() { + return writer; + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return new NullableStructTransferPair(this, new StructVector(name, + allocator, + fieldType, + null, + getConflictPolicy(), + allowConflictPolicyChanges), false); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new NullableStructTransferPair(this, (StructVector) to, false); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new NullableStructTransferPair(this, new StructVector(ref, + allocator, + fieldType, + null, + getConflictPolicy(), + allowConflictPolicyChanges), false); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return new NullableStructTransferPair(this, new StructVector(ref, + allocator, + fieldType, + callBack, + getConflictPolicy(), + allowConflictPolicyChanges), false); + } + + /** + * {@link TransferPair} for this (nullable) {@link StructVector}. 
+   */
+  protected class NullableStructTransferPair extends StructTransferPair {
+
+    private StructVector target;
+
+    protected NullableStructTransferPair(StructVector from, StructVector to, boolean allocate) {
+      super(from, to, allocate);
+      this.target = to;
+    }
+
+    /**
+     * Clears the target, hands it the validity buffer via {@code BaseValueVector.transferBuffer},
+     * lets the superclass transfer the rest, and finally calls {@code clear()}.
+     * NOTE(review): the unqualified {@code validityBuffer} and {@code clear()} presumably resolve
+     * to the enclosing StructVector - confirm StructTransferPair declares no members of those names.
+     */
+    @Override
+    public void transfer() {
+      target.clear();
+      target.validityBuffer = BaseValueVector.transferBuffer(validityBuffer, target.allocator);
+      super.transfer();
+      clear();
+    }
+
+    /**
+     * Copies one value, growing the target's validity buffer until {@code toIndex} fits and
+     * copying the validity bit before delegating the value copy to the superclass.
+     */
+    @Override
+    public void copyValueSafe(int fromIndex, int toIndex) {
+      while (toIndex >= target.getValidityBufferValueCapacity()) {
+        target.reallocValidityBuffer();
+      }
+      BitVectorHelper.setValidityBit(target.validityBuffer, toIndex, isSet(fromIndex));
+      super.copyValueSafe(fromIndex, toIndex);
+    }
+
+    /**
+     * Transfers the range {@code [startIndex, startIndex + length)} into the (cleared) target:
+     * validity bits first, then the rest via the superclass.
+     *
+     * @throws IllegalArgumentException if the range is negative or exceeds {@code valueCount}
+     */
+    @Override
+    public void splitAndTransfer(int startIndex, int length) {
+      Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount,
+          "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount);
+      target.clear();
+      splitAndTransferValidityBuffer(startIndex, length, target);
+      super.splitAndTransfer(startIndex, length);
+    }
+  }
+
+  /*
+   * Transfers the validity bits for [startIndex, startIndex + length) into target.
+   * Does nothing when length is 0. When startIndex is byte-aligned the target shares a retained
+   * slice of this vector's validity buffer; otherwise a new buffer is allocated on the target
+   * and filled byte by byte from two neighbouring source bytes.
+   */
+  private void splitAndTransferValidityBuffer(int startIndex, int length, StructVector target) {
+    int firstByteSource = BitVectorHelper.byteIndex(startIndex);
+    int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1);
+    int byteSizeTarget = BitVectorHelper.getValidityBufferSize(length);
+    // bit position of startIndex inside its byte
+    int offset = startIndex % 8;
+
+    if (length > 0) {
+      if (offset == 0) {
+        // slice
+        if (target.validityBuffer != null) {
+          target.validityBuffer.getReferenceManager().release();
+        }
+        target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget);
+        // the slice is now referenced by the target as well, so take an extra reference
+        target.validityBuffer.getReferenceManager().retain(1);
+      } else {
+        /* Copy data
+         * When the first bit starts from the middle of a byte (offset != 0),
+         * copy data from src BitVector.
+         * Each byte in the target is composed by a part in i-th byte,
+         * another part in (i+1)-th byte.
+         */
+        target.allocateValidityBuffer(byteSizeTarget);
+
+        // all target bytes except the last; the last needs the bounds check below
+        for (int i = 0; i < byteSizeTarget - 1; i++) {
+          byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, firstByteSource + i, offset);
+          byte b2 = BitVectorHelper.getBitsFromNextByte(validityBuffer, firstByteSource + i + 1, offset);
+
+          target.validityBuffer.setByte(i, (b1 + b2));
+        }
+
+        /* Copying the last piece is done in the following manner:
+         * if the source vector has 1 or more bytes remaining, we copy
+         * the last piece as a byte formed by shifting data
+         * from the current byte and the next byte.
+         *
+         * if the source vector has no more bytes remaining
+         * (we are at the last byte), we copy the last piece as a byte
+         * by shifting data from the current byte.
+         */
+        if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) {
+          byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer,
+              firstByteSource + byteSizeTarget - 1, offset);
+          byte b2 = BitVectorHelper.getBitsFromNextByte(validityBuffer,
+              firstByteSource + byteSizeTarget, offset);
+
+          target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2);
+        } else {
+          byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer,
+              firstByteSource + byteSizeTarget - 1, offset);
+          target.validityBuffer.setByte(byteSizeTarget - 1, b1);
+        }
+      }
+    }
+  }
+
+  /**
+   * Get the value capacity of the internal validity buffer (8 values per byte of capacity).
+   * @return number of elements that validity buffer can hold
+   */
+  private int getValidityBufferValueCapacity() {
+    return checkedCastToInt(validityBuffer.capacity() * 8);
+  }
+
+  /**
+   * Get the current value capacity for the vector: the smaller of the validity buffer's
+   * capacity and the superclass's capacity.
+   * @return number of elements that vector can hold.
+   */
+  @Override
+  public int getValueCapacity() {
+    return Math.min(getValidityBufferValueCapacity(),
+        super.getValueCapacity());
+  }
+
+  /**
+   * Return the underlying buffers associated with this vector.
Note that this doesn't + * impact the reference counts for this buffer so it only should be used for in-context + * access. Also note that this buffer changes regularly thus + * external classes shouldn't hold a reference to it (unless they change it). + * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted + * but the returned array will be the only reference to them + * @return The underlying {@link ArrowBuf buffers} that is used by this + * vector instance. + */ + @Override + public ArrowBuf[] getBuffers(boolean clear) { + setReaderAndWriterIndex(); + final ArrowBuf[] buffers; + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + List list = new ArrayList<>(); + list.add(validityBuffer); + list.addAll(Arrays.asList(super.getBuffers(false))); + buffers = list.toArray(new ArrowBuf[list.size()]); + } + if (clear) { + for (ArrowBuf buffer : buffers) { + buffer.getReferenceManager().retain(); + } + clear(); + } + + return buffers; + } + + /** + * Close the vector and release the associated buffers. + */ + @Override + public void close() { + clearValidityBuffer(); + super.close(); + } + + /** + * Same as {@link #close()}. + */ + @Override + public void clear() { + clearValidityBuffer(); + super.clear(); + } + + /** + * Reset this vector to empty, does not release buffers. + */ + @Override + public void reset() { + super.reset(); + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + /** + * Release the validity buffer. + */ + private void clearValidityBuffer() { + validityBuffer.getReferenceManager().release(); + validityBuffer = allocator.getEmpty(); + } + + /** + * Get the size (number of bytes) of underlying buffers used by this vector. + * + * @return size of underlying buffers. 
+ */ + @Override + public int getBufferSize() { + if (valueCount == 0) { + return 0; + } + return super.getBufferSize() + + BitVectorHelper.getValidityBufferSize(valueCount); + } + + /** + * Get the potential buffer size for a particular number of records. + * + * @param valueCount desired number of elements in the vector + * @return estimated size of underlying buffers if the vector holds + * a given number of elements + */ + @Override + public int getBufferSizeFor(final int valueCount) { + if (valueCount == 0) { + return 0; + } + return super.getBufferSizeFor(valueCount) + + BitVectorHelper.getValidityBufferSize(valueCount); + } + + @Override + public void setInitialCapacity(int numRecords) { + validityAllocationSizeInBytes = BitVectorHelper.getValidityBufferSize(numRecords); + super.setInitialCapacity(numRecords); + } + + @Override + public void setInitialCapacity(int numRecords, double density) { + validityAllocationSizeInBytes = BitVectorHelper.getValidityBufferSize(numRecords); + super.setInitialCapacity(numRecords, density); + } + + @Override + public boolean allocateNewSafe() { + /* Boolean to keep track if all the memory allocations were successful + * Used in the case of composite vectors when we need to allocate multiple + * buffers for multiple vectors. 
If one of the allocations failed we need to + * clear all the memory that we allocated + */ + boolean success = false; + try { + clear(); + allocateValidityBuffer(validityAllocationSizeInBytes); + success = super.allocateNewSafe(); + } finally { + if (!success) { + clear(); + return false; + } + } + return true; + } + + private void allocateValidityBuffer(final long size) { + final int curSize = (int) size; + validityBuffer = allocator.buffer(curSize); + validityBuffer.readerIndex(0); + validityAllocationSizeInBytes = curSize; + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + @Override + public void reAlloc() { + /* reallocate the validity buffer */ + reallocValidityBuffer(); + super.reAlloc(); + } + + private void reallocValidityBuffer() { + final int currentBufferCapacity = checkedCastToInt(validityBuffer.capacity()); + long newAllocationSize = currentBufferCapacity * 2; + if (newAllocationSize == 0) { + if (validityAllocationSizeInBytes > 0) { + newAllocationSize = validityAllocationSizeInBytes; + } else { + newAllocationSize = BitVectorHelper.getValidityBufferSize(BaseValueVector.INITIAL_VALUE_ALLOCATION) * 2; + } + } + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + assert newAllocationSize >= 1; + + if (newAllocationSize > BaseValueVector.MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); + newBuf.setBytes(0, validityBuffer, 0, currentBufferCapacity); + newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + validityBuffer.getReferenceManager().release(1); + validityBuffer = newBuf; + validityAllocationSizeInBytes = (int) newAllocationSize; + } + + @Override + public long getValidityBufferAddress() { + return validityBuffer.memoryAddress(); + } + + @Override + public long getDataBufferAddress() { + throw new UnsupportedOperationException(); + } + + @Override + public long 
getOffsetBufferAddress() { + throw new UnsupportedOperationException(); + } + + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + @Override + public ArrowBuf getDataBuffer() { + throw new UnsupportedOperationException(); + } + + @Override + public ArrowBuf getOffsetBuffer() { + throw new UnsupportedOperationException(); + } + + @Override + public Map getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return super.getObject(index); + } + } + + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + if (isSet(index) == 0) { + return ArrowBufPointer.NULL_HASH_CODE; + } else { + return super.hashCode(index, hasher); + } + } + + @Override + public void get(int index, ComplexHolder holder) { + holder.isSet = isSet(index); + if (holder.isSet == 0) { + holder.reader = null; + return; + } + super.get(index, holder); + } + + /** + * Return the number of null values in the vector. + */ + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } + + /** + * Returns true if the value at the provided index is null. + */ + public boolean isNull(int index) { + return isSet(index) == 0; + } + + /** + * Returns true the value at the given index is set (i.e. not null). + */ + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return (b >> bitIndex) & 0x01; + } + + /** + * Marks the value at index as being set. Reallocates the validity buffer + * if index is larger than current capacity. + */ + public void setIndexDefined(int index) { + while (index >= getValidityBufferValueCapacity()) { + /* realloc the inner buffers if needed */ + reallocValidityBuffer(); + } + BitVectorHelper.setBit(validityBuffer, index); + } + + /** + * Marks the value at index as null/not set. 
+   */
+  public void setNull(int index) {
+    // grow the validity buffer until it can address this index
+    while (index >= getValidityBufferValueCapacity()) {
+      /* realloc the inner buffers if needed */
+      reallocValidityBuffer();
+    }
+    BitVectorHelper.unsetBit(validityBuffer, index);
+  }
+
+  /**
+   * Sets the number of values, growing the validity buffer first if it cannot hold that many
+   * bits, then delegating to the superclass.
+   *
+   * @param valueCount the new value count; must not be negative
+   * @throws IllegalArgumentException if {@code valueCount} is negative
+   */
+  @Override
+  public void setValueCount(int valueCount) {
+    Preconditions.checkArgument(valueCount >= 0);
+    while (valueCount > getValidityBufferValueCapacity()) {
+      /* realloc the inner buffers if needed */
+      reallocValidityBuffer();
+    }
+    super.setValueCount(valueCount);
+    this.valueCount = valueCount;
+  }
+
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java
new file mode 100644
index 000000000..fa00f4b63
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.complex;
+
+import org.apache.arrow.vector.ValueVector;
+
+/**
+ * Tuple of a {@link ValueVector} and an index into a data structure containing the {@link ValueVector}.
+ * Useful for composite types to determine the index of a child.
+ */
+public class VectorWithOrdinal {
+  public final ValueVector vector;
+  public final int ordinal;
+
+  /**
+   * Pairs a vector with its ordinal.
+   *
+   * @param v the vector
+   * @param ordinal the index associated with {@code v}
+   */
+  public VectorWithOrdinal(ValueVector v, int ordinal) {
+    this.vector = v;
+    this.ordinal = ordinal;
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java
new file mode 100644
index 000000000..c80fcb89d
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.complex.impl;
+
+import java.util.Iterator;
+
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter;
+import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter;
+import org.apache.arrow.vector.complex.writer.FieldWriter;
+import org.apache.arrow.vector.holders.DenseUnionHolder;
+import org.apache.arrow.vector.holders.UnionHolder;
+
+/**
+ * Base class providing common functionality for {@link FieldReader} implementations.
+ *
+ * <p>This includes tracking the current index and throwing implementations of optional methods.
+ */
+abstract class AbstractBaseReader implements FieldReader {
+
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractBaseReader.class);
+
+  // current read position; starts at 0 and is set back to 0 by reset()
+  private int index;
+
+  public AbstractBaseReader() {
+    super();
+  }
+
+  /** Returns the current read position. */
+  @Override
+  public int getPosition() {
+    return index;
+  }
+
+  /** Moves the reader to {@code index}. */
+  public void setPosition(int index) {
+    this.index = index;
+  }
+
+  /** Shorthand for subclasses to read the current position. */
+  protected int idx() {
+    return index;
+  }
+
+  /** Moves the reader back to position 0. */
+  @Override
+  public void reset() {
+    index = 0;
+  }
+
+  /** Unsupported by default: always throws {@link IllegalStateException}. */
+  @Override
+  public Iterator iterator() {
+    throw new IllegalStateException("The current reader doesn't support reading as a map.");
+  }
+
+  /** Unsupported by default: always throws {@link IllegalStateException}. */
+  @Override
+  public boolean next() {
+    throw new IllegalStateException("The current reader doesn't support getting next information.");
+  }
+
+  /** Unsupported by default: always throws {@link IllegalStateException}. */
+  @Override
+  public int size() {
+    throw new IllegalStateException("The current reader doesn't support getting size information.");
+  }
+
+  /** Points the holder at this reader and sets its {@code isSet} to 1 or 0 from {@code isSet()}. */
+  @Override
+  public void read(UnionHolder holder) {
+    holder.reader = this;
+    holder.isSet = this.isSet() ? 1 : 0;
+  }
+
+  /** Unsupported by default: always throws {@link IllegalStateException}. */
+  @Override
+  public void read(int index, UnionHolder holder) {
+    throw new IllegalStateException("The current reader doesn't support reading union type");
+  }
+
+  /** Unsupported by default: always throws {@link IllegalStateException}. */
+  @Override
+  public void copyAsValue(UnionWriter writer) {
+    throw new IllegalStateException("The current reader doesn't support reading union type");
+  }
+
+  /** Points the holder at this reader and sets its {@code isSet} to 1 or 0 from {@code isSet()}. */
+  @Override
+  public void read(DenseUnionHolder holder) {
+    holder.reader = this;
+    holder.isSet = this.isSet() ? 1 : 0;
+  }
+
+  /** Unsupported by default: always throws {@link IllegalStateException}. */
+  @Override
+  public void read(int index, DenseUnionHolder holder) {
+    throw new IllegalStateException("The current reader doesn't support reading dense union type");
+  }
+
+  /** Unsupported by default: always throws {@link IllegalStateException}. */
+  @Override
+  public void copyAsValue(DenseUnionWriter writer) {
+    throw new IllegalStateException("The current reader doesn't support reading dense union type");
+  }
+
+  /** Copies this reader's value into the list writer via {@code ComplexCopier}. */
+  @Override
+  public void copyAsValue(ListWriter writer) {
+    ComplexCopier.copy(this, (FieldWriter) writer);
+  }
+
+  /** Copies this reader's value into the map writer via {@code ComplexCopier}. */
+  @Override
+  public void copyAsValue(MapWriter writer) {
+    ComplexCopier.copy(this, (FieldWriter) writer);
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java
new file mode 100644
index 000000000..cc3c5deed
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.complex.impl;
+
+import org.apache.arrow.vector.complex.writer.FieldWriter;
+
+
+/**
+ * Base class providing common functionality for {@link FieldWriter} implementations.
+ * + *

Currently this only includes index tracking. + */ +abstract class AbstractBaseWriter implements FieldWriter { + //private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractBaseWriter.class); + + private int index; + + @Override + public String toString() { + return super.toString() + "[index = " + index + "]"; + } + + int idx() { + return index; + } + + @Override + public int getPosition() { + return index; + } + + @Override + public void setPosition(int index) { + this.index = index; + } + + @Override + public void end() { + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java new file mode 100644 index 000000000..13b26bb67 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.arrow.vector.complex.impl;
+
+import org.apache.arrow.util.Preconditions;
+import org.apache.arrow.vector.complex.ListVector;
+import org.apache.arrow.vector.complex.NonNullableStructVector;
+import org.apache.arrow.vector.complex.StateTool;
+import org.apache.arrow.vector.complex.StructVector;
+import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter;
+import org.apache.arrow.vector.types.pojo.Field;
+
+/**
+ * Concrete implementation of {@link ComplexWriter}.
+ *
+ * <p>The writer starts in {@code INIT} mode and switches to {@code STRUCT} or {@code LIST} mode the
+ * first time a root writer is requested; the root writer is created at that point.
+ */
+public class ComplexWriterImpl extends AbstractFieldWriter implements ComplexWriter {
+
+  /** Root writer used in STRUCT mode; null until a struct root has been requested. */
+  private NullableStructWriter structRoot;
+  /** Root writer used in LIST mode; null until a list root has been requested. */
+  private UnionListWriter listRoot;
+  private final NonNullableStructVector container;
+
+  Mode mode = Mode.INIT;
+  private final String name;
+  private final boolean unionEnabled;
+  private final NullableStructWriterFactory nullableStructWriterFactory;
+
+  private enum Mode { INIT, STRUCT, LIST }
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param name The name of the writer (for tracking).
+   * @param container A container for the data field to be written.
+   * @param unionEnabled Unused.
+   * @param caseSensitive Whether field names are case sensitive (if false field names will be lowercase).
+   */
+  public ComplexWriterImpl(
+      String name,
+      NonNullableStructVector container,
+      boolean unionEnabled,
+      boolean caseSensitive) {
+    this.name = name;
+    this.container = container;
+    this.unionEnabled = unionEnabled;
+    nullableStructWriterFactory = caseSensitive ?
+        NullableStructWriterFactory.getNullableCaseSensitiveStructWriterFactoryInstance() :
+        NullableStructWriterFactory.getNullableStructWriterFactoryInstance();
+  }
+
+  /** Constructs a new instance that is not case sensitive. */
+  public ComplexWriterImpl(String name, NonNullableStructVector container, boolean unionEnabled) {
+    this(name, container, unionEnabled, false);
+  }
+
+  /** Constructs a new instance with {@code unionEnabled} false that is not case sensitive. */
+  public ComplexWriterImpl(String name, NonNullableStructVector container) {
+    this(name, container, false);
+  }
+
+  /** Returns the field of the container. */
+  @Override
+  public Field getField() {
+    return container.getField();
+  }
+
+  /** Returns the value capacity of the container. */
+  @Override
+  public int getValueCapacity() {
+    return container.getValueCapacity();
+  }
+
+  /** Validates the current mode against the accepted ones through {@link StateTool#check}. */
+  private void check(Mode... modes) {
+    StateTool.check(mode, modes);
+  }
+
+  /** Moves this writer, and the active root writer if there is one, back to position 0. */
+  @Override
+  public void reset() {
+    setPosition(0);
+  }
+
+  /**
+   * Clears the active root writer and closes every root writer that has been created.
+   *
+   * <p>Closing a writer on which no root was requested, or only a list root, is safe: a root
+   * writer that was never created is skipped.
+   */
+  @Override
+  public void close() throws Exception {
+    clear();
+    // structRoot is only assigned once a struct root is requested, so it may still be null here.
+    if (structRoot != null) {
+      structRoot.close();
+    }
+    if (listRoot != null) {
+      listRoot.close();
+    }
+  }
+
+  /** Clears the root writer of the current mode; does nothing in INIT mode. */
+  @Override
+  public void clear() {
+    switch (mode) {
+      case STRUCT:
+        structRoot.clear();
+        break;
+      case LIST:
+        listRoot.clear();
+        break;
+      default:
+        break;
+    }
+  }
+
+  /** Forwards the value count to the root writer of the current mode; does nothing in INIT mode. */
+  @Override
+  public void setValueCount(int count) {
+    switch (mode) {
+      case STRUCT:
+        structRoot.setValueCount(count);
+        break;
+      case LIST:
+        listRoot.setValueCount(count);
+        break;
+      default:
+        break;
+    }
+  }
+
+  /** Sets the position of this writer and of the root writer of the current mode, if any. */
+  @Override
+  public void setPosition(int index) {
+    super.setPosition(index);
+    switch (mode) {
+      case STRUCT:
+        structRoot.setPosition(index);
+        break;
+      case LIST:
+        listRoot.setPosition(index);
+        break;
+      default:
+        break;
+    }
+  }
+
+  /**
+   * Returns a StructWriter that writes directly into the container, initializing it if necessary.
+   *
+   * <p>This requires that the instance was constructed with a null name (enforced with
+   * {@link Preconditions#checkArgument}); the container is cast to {@link StructVector}.
+   */
+  public StructWriter directStruct() {
+    Preconditions.checkArgument(name == null);
+
+    switch (mode) {
+
+      case INIT:
+        structRoot = nullableStructWriterFactory.build((StructVector) container);
+        structRoot.setPosition(idx());
+        mode = Mode.STRUCT;
+        break;
+
+      case STRUCT:
+        break;
+
+      default:
+        check(Mode.INIT, Mode.STRUCT);
+    }
+
+    return structRoot;
+  }
+
+  /**
+   * Returns the root struct writer. On first use the struct child named {@code name} is added to
+   * (or fetched from) the container and the writer switches to STRUCT mode.
+   */
+  @Override
+  public StructWriter rootAsStruct() {
+    switch (mode) {
+
+      case INIT:
+        // TODO allow dictionaries in complex types
+        StructVector struct = container.addOrGetStruct(name);
+        structRoot = nullableStructWriterFactory.build(struct);
+        structRoot.setPosition(idx());
+        mode = Mode.STRUCT;
+        break;
+
+      case STRUCT:
+        break;
+
+      default:
+        check(Mode.INIT, Mode.STRUCT);
+    }
+
+    return structRoot;
+  }
+
+  /** Allocates the struct root if one exists, otherwise the list root if one exists. */
+  @Override
+  public void allocate() {
+    if (structRoot != null) {
+      structRoot.allocate();
+    } else if (listRoot != null) {
+      listRoot.allocate();
+    }
+  }
+
+  /**
+   * Returns the root list writer. On first use the list child named {@code name} is added to (or
+   * fetched from) the container and the writer switches to LIST mode; the list vector is allocated
+   * only when the container grew, i.e. when the child was newly added.
+   */
+  @Override
+  public ListWriter rootAsList() {
+    switch (mode) {
+
+      case INIT:
+        int vectorCount = container.size();
+        // TODO allow dictionaries in complex types
+        ListVector listVector = container.addOrGetList(name);
+        if (container.size() > vectorCount) {
+          listVector.allocateNew();
+        }
+        listRoot = new UnionListWriter(listVector, nullableStructWriterFactory);
+        listRoot.setPosition(idx());
+        mode = Mode.LIST;
+        break;
+
+      case LIST:
+        break;
+
+      default:
+        // Reached only in STRUCT mode. Validate against the modes this method accepts so the
+        // mismatch is reported instead of silently returning a null listRoot.
+        check(Mode.INIT, Mode.LIST);
+    }
+
+    return listRoot;
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableStructReaderImpl.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableStructReaderImpl.java
new file mode 100644
index 000000000..5c098f627
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableStructReaderImpl.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex.impl; + +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter; +import org.apache.arrow.vector.types.pojo.Field; + +/** + * An {@link org.apache.arrow.vector.complex.reader.FieldReader} for + * reading nullable struct vectors. 
+ */
+public class NullableStructReaderImpl extends SingleStructReaderImpl {
+
+  /** The vector given to the constructor, kept as a {@link StructVector}. */
+  private StructVector nullableStructVector;
+
+  /**
+   * Creates a reader over the given vector.
+   *
+   * @param vector the vector to read; it is cast to {@link StructVector}.
+   */
+  public NullableStructReaderImpl(NonNullableStructVector vector) {
+    super(vector);
+    nullableStructVector = (StructVector) vector;
+  }
+
+  /** Returns the field of the underlying struct vector. */
+  @Override
+  public Field getField() {
+    return nullableStructVector.getField();
+  }
+
+  /**
+   * Copies the value at this reader's index into the given writer at the writer's index.
+   *
+   * @param writer the destination; it is cast to {@link NullableStructWriter}.
+   */
+  @Override
+  public void copyAsValue(StructWriter writer) {
+    final NullableStructWriter target = (NullableStructWriter) writer;
+    copyCurrentInto(target);
+  }
+
+  /**
+   * Copies the value at this reader's index into the struct child {@code name} of the given writer.
+   *
+   * @param name the name passed to {@code writer.struct(name)} to obtain the destination.
+   * @param writer the parent writer; the struct writer it returns is cast to
+   *     {@link NullableStructWriter}.
+   */
+  @Override
+  public void copyAsField(String name, StructWriter writer) {
+    final NullableStructWriter target = (NullableStructWriter) writer.struct(name);
+    copyCurrentInto(target);
+  }
+
+  /** Copies the value at this reader's index into the target's container at the target's index. */
+  private void copyCurrentInto(NullableStructWriter target) {
+    final int from = idx();
+    final int to = target.idx();
+    target.container.copyFromSafe(from, to, nullableStructVector);
+  }
+
+  /** Returns true when the underlying vector is not null at this reader's index. */
+  @Override
+  public boolean isSet() {
+    final boolean isNull = nullableStructVector.isNull(idx());
+    return !isNull;
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableStructWriterFactory.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableStructWriterFactory.java
new file mode 100644
index 000000000..458aa7b61
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableStructWriterFactory.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.complex.impl;
+
+import org.apache.arrow.vector.complex.StructVector;
+
+/**
+ * A factory for {@link NullableStructWriter} instances. The factory allows for configuring if field
+ * names should be considered case sensitive.
+ */
+public class NullableStructWriterFactory {
+  /** Shared instance created with {@code caseSensitive == false}. */
+  private static final NullableStructWriterFactory CASE_INSENSITIVE_INSTANCE =
+      new NullableStructWriterFactory(false);
+  /** Shared instance created with {@code caseSensitive == true}. */
+  private static final NullableStructWriterFactory CASE_SENSITIVE_INSTANCE =
+      new NullableStructWriterFactory(true);
+
+  /** Selects which writer class {@link #build} instantiates. */
+  private final boolean caseSensitive;
+
+  /**
+   * Creates a factory.
+   *
+   * @param caseSensitive whether {@link #build} should create case sensitive writers.
+   */
+  public NullableStructWriterFactory(boolean caseSensitive) {
+    this.caseSensitive = caseSensitive;
+  }
+
+  /**
+   * Creates a writer for the given struct vector.
+   *
+   * @param container the vector handed to the new writer.
+   * @return a {@link NullableCaseSensitiveStructWriter} when this factory is case sensitive,
+   *     otherwise a {@link NullableStructWriter}.
+   */
+  public NullableStructWriter build(StructVector container) {
+    if (caseSensitive) {
+      return new NullableCaseSensitiveStructWriter(container);
+    }
+    return new NullableStructWriter(container);
+  }
+
+  /** Returns the shared factory that is not case sensitive. */
+  public static NullableStructWriterFactory getNullableStructWriterFactoryInstance() {
+    return CASE_INSENSITIVE_INSTANCE;
+  }
+
+  /** Returns the shared factory that is case sensitive. */
+  public static NullableStructWriterFactory getNullableCaseSensitiveStructWriterFactoryInstance() {
+    return CASE_SENSITIVE_INSTANCE;
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java
new file mode 100644
index 000000000..06b064fda
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java
@@ -0,0 +1,398 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex.impl; + +import java.math.BigDecimal; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.AbstractStructVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.holders.Decimal256Holder; +import org.apache.arrow.vector.holders.DecimalHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * This FieldWriter implementation delegates all FieldWriter API calls to an inner FieldWriter. This inner field writer + * can start as a specific type, and this class will promote the writer to a UnionWriter if a call is made that the + * specifically typed writer cannot handle. 
A new UnionVector is created, wrapping the original vector, and replaces the
+ * original vector in the parent vector: an AbstractStructVector, ListVector, FixedSizeListVector or LargeListVector.
+ *
+ * <p>The writer used can either be for single elements (struct) or lists.</p>
+ */
+public class PromotableWriter extends AbstractPromotableFieldWriter {
+
+  // Each constructor stores its parent argument in exactly one of the next four fields and
+  // assigns null to the other three.
+  private final AbstractStructVector parentContainer;
+  private final ListVector listVector;
+  private final FixedSizeListVector fixedListVector;
+  private final LargeListVector largeListVector;
+  private final NullableStructWriterFactory nullableStructWriterFactory;
+  // Position remembered by setPosition while there is no delegate writer yet; it is applied to
+  // the delegate when getWriter(MinorType, ArrowType) creates it.
+  private int position;
+  private static final int MAX_DECIMAL_PRECISION = 38;
+  private static final int MAX_DECIMAL256_PRECISION = 76;
+
+  // UNTYPED: initialized from a NullVector, no delegate writer exists yet.
+  // SINGLE: a delegate writer for one concrete type is installed (see setWriter).
+  // UNION: the delegate is a UnionWriter.
+  private enum State {
+    UNTYPED, SINGLE, UNION
+  }
+
+  private MinorType type;
+  private ValueVector vector;
+  private UnionVector unionVector;
+  private State state;
+  private FieldWriter writer;
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param v The vector to write.
+   * @param parentContainer The parent container for the vector.
+   */
+  public PromotableWriter(ValueVector v, AbstractStructVector parentContainer) {
+    this(v, parentContainer, NullableStructWriterFactory.getNullableStructWriterFactoryInstance());
+  }
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param v The vector to initialize the writer with.
+   * @param parentContainer The parent container for the vector.
+   * @param nullableStructWriterFactory The factory to create the delegate writer.
+   */
+  public PromotableWriter(
+      ValueVector v,
+      AbstractStructVector parentContainer,
+      NullableStructWriterFactory nullableStructWriterFactory) {
+    this.parentContainer = parentContainer;
+    this.listVector = null;
+    this.fixedListVector = null;
+    this.largeListVector = null;
+    this.nullableStructWriterFactory = nullableStructWriterFactory;
+    init(v);
+  }
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param v The vector to initialize the writer with.
+   * @param listVector The vector that serves as a parent of v.
+   */
+  public PromotableWriter(ValueVector v, ListVector listVector) {
+    this(v, listVector, NullableStructWriterFactory.getNullableStructWriterFactoryInstance());
+  }
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param v The vector to initialize the writer with.
+   * @param fixedListVector The vector that serves as a parent of v.
+   */
+  public PromotableWriter(ValueVector v, FixedSizeListVector fixedListVector) {
+    this(v, fixedListVector, NullableStructWriterFactory.getNullableStructWriterFactoryInstance());
+  }
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param v The vector to initialize the writer with.
+   * @param largeListVector The vector that serves as a parent of v.
+   */
+  public PromotableWriter(ValueVector v, LargeListVector largeListVector) {
+    this(v, largeListVector, NullableStructWriterFactory.getNullableStructWriterFactoryInstance());
+  }
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param v The vector to initialize the writer with.
+   * @param listVector The vector that serves as a parent of v.
+   * @param nullableStructWriterFactory The factory to create the delegate writer.
+   */
+  public PromotableWriter(
+      ValueVector v,
+      ListVector listVector,
+      NullableStructWriterFactory nullableStructWriterFactory) {
+    this.listVector = listVector;
+    this.parentContainer = null;
+    this.fixedListVector = null;
+    this.largeListVector = null;
+    this.nullableStructWriterFactory = nullableStructWriterFactory;
+    init(v);
+  }
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param v The vector to initialize the writer with.
+   * @param fixedListVector The vector that serves as a parent of v.
+   * @param nullableStructWriterFactory The factory to create the delegate writer.
+   */
+  public PromotableWriter(
+      ValueVector v,
+      FixedSizeListVector fixedListVector,
+      NullableStructWriterFactory nullableStructWriterFactory) {
+    this.fixedListVector = fixedListVector;
+    this.parentContainer = null;
+    this.listVector = null;
+    this.largeListVector = null;
+    this.nullableStructWriterFactory = nullableStructWriterFactory;
+    init(v);
+  }
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param v The vector to initialize the writer with.
+   * @param largeListVector The vector that serves as a parent of v.
+   * @param nullableStructWriterFactory The factory to create the delegate writer.
+   */
+  public PromotableWriter(
+      ValueVector v,
+      LargeListVector largeListVector,
+      NullableStructWriterFactory nullableStructWriterFactory) {
+    this.largeListVector = largeListVector;
+    this.fixedListVector = null;
+    this.parentContainer = null;
+    this.listVector = null;
+    this.nullableStructWriterFactory = nullableStructWriterFactory;
+    init(v);
+  }
+
+  /**
+   * Picks the initial state from the kind of vector: a UnionVector starts in UNION state with a
+   * UnionWriter, a NullVector starts UNTYPED with no writer, anything else goes through setWriter.
+   */
+  private void init(ValueVector v) {
+    if (v instanceof UnionVector) {
+      state = State.UNION;
+      unionVector = (UnionVector) v;
+      writer = new UnionWriter(unionVector, nullableStructWriterFactory);
+    } else if (v instanceof NullVector) {
+      state = State.UNTYPED;
+    } else {
+      setWriter(v);
+    }
+  }
+
+  /** Applies the flag to this writer and, when it is an AbstractFieldWriter, to the delegate. */
+  @Override
+  public void setAddVectorAsNullable(boolean nullable) {
+    super.setAddVectorAsNullable(nullable);
+    if (writer instanceof AbstractFieldWriter) {
+      ((AbstractFieldWriter) writer).setAddVectorAsNullable(nullable);
+    }
+  }
+
+  /**
+   * Switches to SINGLE state for the given vector and installs a delegate writer chosen by the
+   * vector's minor type.
+   */
+  private void setWriter(ValueVector v) {
+    state = State.SINGLE;
+    vector = v;
+    type = v.getMinorType();
+    switch (type) {
+      case STRUCT:
+        writer = nullableStructWriterFactory.build((StructVector) vector);
+        break;
+      case LIST:
+        writer = new UnionListWriter((ListVector) vector, nullableStructWriterFactory);
+        break;
+      case MAP:
+        writer = new UnionMapWriter((MapVector) vector);
+        break;
+      case UNION:
+        writer = new UnionWriter((UnionVector) vector, nullableStructWriterFactory);
+        break;
+      default:
+        writer = type.getNewFieldWriter(vector);
+        break;
+    }
+  }
+
+  /** Writes a null through the delegate when one exists, then advances the position by one. */
+  @Override
+  public void writeNull() {
+    FieldWriter w = getWriter();
+    if (w != null) {
+      w.writeNull();
+    }
+    setPosition(idx() + 1);
+  }
+
+  /**
+   * Sets the position of this writer and of the delegate. While there is no delegate the index is
+   * kept in {@code position} instead.
+   */
+  @Override
+  public void setPosition(int index) {
+    super.setPosition(index);
+    FieldWriter w = getWriter();
+    if (w == null) {
+      position = index;
+    } else {
+      w.setPosition(index);
+    }
+  }
+
+  /**
+   * Returns the delegate writer to use for the given type, creating or promoting it when needed.
+   *
+   * <p>UNION state: the UnionWriter is asked for a writer of that type (the arrow type is passed
+   * along only for DECIMAL and MAP). UNTYPED state: a child vector of that type is added to the
+   * list parent, allocated, and a typed delegate is installed at the remembered position.
+   * SINGLE state with a different type: the delegate is promoted to a union first.
+   *
+   * @return the delegate writer, or null when the state is UNTYPED and {@code type} is null.
+   */
+  @Override
+  protected FieldWriter getWriter(MinorType type, ArrowType arrowType) {
+    if (state == State.UNION) {
+      // The value returned by the UnionWriter is not used; this method returns the UnionWriter.
+      if (type == MinorType.DECIMAL || type == MinorType.MAP) {
+        ((UnionWriter) writer).getWriter(type, arrowType);
+      } else {
+        ((UnionWriter) writer).getWriter(type);
+      }
+    } else if (state == State.UNTYPED) {
+      if (type == null) {
+        // ???
+        return null;
+      }
+      if (arrowType == null) {
+        arrowType = type.getType();
+      }
+      FieldType fieldType = new FieldType(addVectorAsNullable, arrowType, null, null);
+      ValueVector v;
+      // NOTE(review): the final else assumes largeListVector is set. An instance built with the
+      // parentContainer constructor and a NullVector would reach it with largeListVector == null
+      // -- confirm that combination cannot occur.
+      if (listVector != null) {
+        v = listVector.addOrGetVector(fieldType).getVector();
+      } else if (fixedListVector != null) {
+        v = fixedListVector.addOrGetVector(fieldType).getVector();
+      } else {
+        v = largeListVector.addOrGetVector(fieldType).getVector();
+      }
+      v.allocateNew();
+      setWriter(v);
+      writer.setPosition(position);
+    } else if (type != this.type) {
+      promoteToUnion();
+      if (type == MinorType.DECIMAL || type == MinorType.MAP) {
+        ((UnionWriter) writer).getWriter(type, arrowType);
+      } else {
+        ((UnionWriter) writer).getWriter(type);
+      }
+    }
+    return writer;
+  }
+
+  // NOTE(review): dereferences the delegate directly; it is null while the state is UNTYPED.
+  @Override
+  public boolean isEmptyStruct() {
+    return writer.isEmptyStruct();
+  }
+
+  /** Returns the current delegate writer; null while the state is UNTYPED. */
+  protected FieldWriter getWriter() {
+    return writer;
+  }
+
+  /**
+   * Replaces the single-typed vector with a union vector obtained from the parent, moves the
+   * existing data into it and switches the delegate to a UnionWriter.
+   *
+   * @return the new delegate writer.
+   */
+  private FieldWriter promoteToUnion() {
+    String name = vector.getField().getName();
+    // The transfer target is named after the lower-cased minor type of the current vector.
+    TransferPair tp = vector.getTransferPair(vector.getMinorType().name().toLowerCase(), vector.getAllocator());
+    tp.transfer();
+    if (parentContainer != null) {
+      // TODO allow dictionaries in complex types
+      unionVector = parentContainer.addOrGetUnion(name);
+      unionVector.allocateNew();
+    } else if (listVector != null) {
+      unionVector = listVector.promoteToUnion();
+    } else if (fixedListVector != null) {
+      unionVector = fixedListVector.promoteToUnion();
+    } else if (largeListVector != null) {
+      unionVector = largeListVector.promoteToUnion();
+    }
+    unionVector.addVector((FieldVector) tp.getTo());
+    writer = new UnionWriter(unionVector, nullableStructWriterFactory);
+    writer.setPosition(idx());
+    // Tag every index from 0 through the current one (inclusive) with the previous minor type.
+    for (int i = 0; i <= idx(); i++) {
+      unionVector.setType(i, vector.getMinorType());
+    }
+    vector = null;
+    state = State.UNION;
+    return writer;
+  }
+
+  // The decimal writers below request a DECIMAL writer with the maximum precision (38), a bit
+  // width of 128 and the scale taken from the value or arrow type being written.
+  @Override
+  public void write(DecimalHolder holder) {
+    getWriter(MinorType.DECIMAL,
+        new ArrowType.Decimal(MAX_DECIMAL_PRECISION, holder.scale, /*bitWidth=*/128)).write(holder);
+  }
+
+  @Override
+  public void writeDecimal(long start, ArrowBuf buffer, ArrowType arrowType) {
+    getWriter(MinorType.DECIMAL, new ArrowType.Decimal(MAX_DECIMAL_PRECISION,
+        ((ArrowType.Decimal) arrowType).getScale(), /*bitWidth=*/128)).writeDecimal(start, buffer, arrowType);
+  }
+
+  @Override
+  public void writeDecimal(BigDecimal value) {
+    getWriter(MinorType.DECIMAL,
+        new ArrowType.Decimal(MAX_DECIMAL_PRECISION, value.scale(), /*bitWidth=*/128)).writeDecimal(value);
+  }
+
+  @Override
+  public void writeBigEndianBytesToDecimal(byte[] value, ArrowType arrowType) {
+    getWriter(MinorType.DECIMAL, new ArrowType.Decimal(MAX_DECIMAL_PRECISION,
+        ((ArrowType.Decimal) arrowType).getScale(), /*bitWidth=*/128)).writeBigEndianBytesToDecimal(value, arrowType);
+  }
+
+  // The decimal256 writers below do the same with precision 76 and a bit width of 256.
+  @Override
+  public void write(Decimal256Holder holder) {
+    getWriter(MinorType.DECIMAL256,
+        new ArrowType.Decimal(MAX_DECIMAL256_PRECISION, holder.scale, /*bitWidth=*/256)).write(holder);
+  }
+
+  @Override
+  public void writeDecimal256(long start, ArrowBuf buffer, ArrowType arrowType) {
+    getWriter(MinorType.DECIMAL256, new ArrowType.Decimal(MAX_DECIMAL256_PRECISION,
+        ((ArrowType.Decimal) arrowType).getScale(), /*bitWidth=*/256)).writeDecimal256(start, buffer, arrowType);
+  }
+
+  @Override
+  public void writeDecimal256(BigDecimal value) {
+    getWriter(MinorType.DECIMAL256,
+        new ArrowType.Decimal(MAX_DECIMAL256_PRECISION, value.scale(), /*bitWidth=*/256)).writeDecimal256(value);
+  }
+
+  @Override
+  public void writeBigEndianBytesToDecimal256(byte[] value, ArrowType arrowType) {
+    getWriter(MinorType.DECIMAL256, new ArrowType.Decimal(MAX_DECIMAL256_PRECISION,
+        ((ArrowType.Decimal) arrowType).getScale(),
+        /*bitWidth=*/256)).writeBigEndianBytesToDecimal256(value, arrowType);
+  }
+
+
+  // The remaining methods forward to the current delegate writer without a null check.
+  @Override
+  public void allocate() {
+    getWriter().allocate();
+  }
+
+  @Override
+  public void clear() {
+    getWriter().clear();
+  }
+
+  @Override
+  public Field getField() {
+    return getWriter().getField();
+  }
+
+  @Override
+  public int getValueCapacity() {
+    return getWriter().getValueCapacity();
+  }
+
+  @Override
+  public void close() throws Exception {
+    getWriter().close();
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java
new file mode 100644
index 000000000..9bbe60421
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.complex.impl;
+
+
+import org.apache.arrow.vector.complex.AbstractContainerVector;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter;
+import org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter;
+import org.apache.arrow.vector.types.Types.MinorType;
+
+/**
+ * An implementation of {@link AbstractFieldReader} for lists vectors.
+ *
+ * <p>The reader of the named child is looked up lazily, the first time it is needed.
+ */
+@SuppressWarnings("unused")
+public class SingleListReaderImpl extends AbstractFieldReader {
+
+  private final String name;
+  private final AbstractContainerVector container;
+  /** Reader of the child named {@code name}; null until {@link #reader()} has resolved it. */
+  private FieldReader reader;
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param name The name of field to read in container.
+   * @param container The container holding a list.
+   */
+  public SingleListReaderImpl(String name, AbstractContainerVector container) {
+    super();
+    this.name = name;
+    this.container = container;
+  }
+
+  /** Sets the position of this reader and of the child reader once it has been resolved. */
+  @Override
+  public void setPosition(int index) {
+    super.setPosition(index);
+    if (reader != null) {
+      reader.setPosition(index);
+    }
+  }
+
+  /**
+   * Reads the current object through the child reader.
+   *
+   * <p>The child reader is resolved on demand, so this works even when {@link #reader()} has not
+   * been called before.
+   */
+  @Override
+  public Object readObject() {
+    // Go through reader() rather than the field: the field is null until first resolved.
+    return reader().readObject();
+  }
+
+  /**
+   * Returns the reader of the child named {@code name}, resolving it from the container on first
+   * use and moving it to this reader's current position.
+   */
+  @Override
+  public FieldReader reader() {
+    if (reader == null) {
+      reader = container.getChild(name).getReader();
+      setPosition(idx());
+    }
+    return reader;
+  }
+
+  @Override
+  public MinorType getMinorType() {
+    return MinorType.LIST;
+  }
+
+  /** Always returns false. */
+  @Override
+  public boolean isSet() {
+    return false;
+  }
+
+  /**
+   * Not supported by this reader.
+   *
+   * @throws UnsupportedOperationException always.
+   */
+  @Override
+  public void copyAsValue(ListWriter writer) {
+    throw new UnsupportedOperationException("Generic list copying not yet supported. Please resolve to typed list.");
+  }
+
+  /**
+   * Not supported by this reader.
+   *
+   * @throws UnsupportedOperationException always.
+   */
+  @Override
+  public void copyAsField(String name, StructWriter writer) {
+    throw new UnsupportedOperationException("Generic list copying not yet supported. Please resolve to typed list.");
+  }
+
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleStructReaderImpl.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleStructReaderImpl.java
new file mode 100644
index 000000000..3590e40ce
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleStructReaderImpl.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.complex.impl;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.arrow.vector.ValueVector;
+import org.apache.arrow.vector.complex.NonNullableStructVector;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter;
+import org.apache.arrow.vector.types.Types.MinorType;
+import org.apache.arrow.vector.types.pojo.Field;
+
+/**
+ * {@link FieldReader} for a single {@link org.apache.arrow.vector.complex.NonNullableStructVector}.
+ */
+@SuppressWarnings("unused")
+public class SingleStructReaderImpl extends AbstractFieldReader {
+
+  private final NonNullableStructVector vector;
+  /** Child readers handed out so far, keyed by the field name they were requested with. */
+  private final Map<String, FieldReader> fields = new HashMap<>();
+
+  /**
+   * Creates a reader over the given struct vector.
+   *
+   * @param vector the vector to read.
+   */
+  public SingleStructReaderImpl(NonNullableStructVector vector) {
+    this.vector = vector;
+  }
+
+  /** Moves every child reader handed out so far to the given index. */
+  private void setChildrenPosition(int index) {
+    for (FieldReader r : fields.values()) {
+      r.setPosition(index);
+    }
+  }
+
+  /** Returns the field of the underlying vector. */
+  @Override
+  public Field getField() {
+    return vector.getField();
+  }
+
+  /**
+   * Returns the reader for the child with the given name, creating and caching it on first use.
+   *
+   * @param name the child name to look up in the vector.
+   * @return the child's reader, or {@code NullReader.INSTANCE} when the vector has no such child;
+   *     a newly created entry is moved to this reader's current position.
+   */
+  @Override
+  public FieldReader reader(String name) {
+    FieldReader reader = fields.get(name);
+    if (reader == null) {
+      ValueVector child = vector.getChild(name);
+      if (child == null) {
+        reader = NullReader.INSTANCE;
+      } else {
+        reader = child.getReader();
+      }
+      fields.put(name, reader);
+      reader.setPosition(idx());
+    }
+    return reader;
+  }
+
+  /** Sets the position of this reader and of every child reader handed out so far. */
+  @Override
+  public void setPosition(int index) {
+    super.setPosition(index);
+    setChildrenPosition(index);
+  }
+
+  /** Returns the object stored in the vector at the current index. */
+  @Override
+  public Object readObject() {
+    return vector.getObject(idx());
+  }
+
+  @Override
+  public MinorType getMinorType() {
+    return MinorType.STRUCT;
+  }
+
+  /** Always returns true. */
+  @Override
+  public boolean isSet() {
+    return true;
+  }
+
+  /** Returns the vector's field name iterator. */
+  @Override
+  public java.util.Iterator<String> iterator() {
+    return vector.fieldNameIterator();
+  }
+
+  /**
+   * Copies the value at this reader's index into the given writer at the writer's index.
+   *
+   * @param writer the destination; it is cast to {@link SingleStructWriter}.
+   */
+  @Override
+  public void copyAsValue(StructWriter writer) {
+    SingleStructWriter impl = (SingleStructWriter) writer;
+    impl.container.copyFromSafe(idx(), impl.idx(), vector);
+  }
+
+  /**
+   * Copies the value at this reader's index into the struct child {@code name} of the given writer.
+   *
+   * @param name the name passed to {@code writer.struct(name)} to obtain the destination.
+   * @param writer the parent writer; the struct writer it returns is cast to
+   *     {@link SingleStructWriter}.
+   */
+  @Override
+  public void copyAsField(String name, StructWriter writer) {
+    SingleStructWriter impl = (SingleStructWriter) writer.struct(name);
+    impl.container.copyFromSafe(idx(), impl.idx(), vector);
+  }
+}
+
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/StructOrListWriterImpl.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/StructOrListWriterImpl.java
new file mode 100644
index 000000000..e9c0825dd
--- /dev/null
+++ 
b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/StructOrListWriterImpl.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex.impl; + +import org.apache.arrow.vector.complex.writer.BaseWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.StructOrListWriter; +import org.apache.arrow.vector.complex.writer.BigIntWriter; +import org.apache.arrow.vector.complex.writer.BitWriter; +import org.apache.arrow.vector.complex.writer.Float4Writer; +import org.apache.arrow.vector.complex.writer.Float8Writer; +import org.apache.arrow.vector.complex.writer.IntWriter; +import org.apache.arrow.vector.complex.writer.VarBinaryWriter; +import org.apache.arrow.vector.complex.writer.VarCharWriter; + +/** + * Concrete implementation of {@link StructOrListWriter}. + */ +public class StructOrListWriterImpl implements StructOrListWriter { + + public final BaseWriter.StructWriter struct; + public final BaseWriter.ListWriter list; + + /** + * Constructs a new instance using a {@link BaseWriter.StructWriter} + * (instead of an {@link BaseWriter.ListWriter}). 
+ */ + public StructOrListWriterImpl(final BaseWriter.StructWriter writer) { + this.struct = writer; + this.list = null; + } + + /** + * Constructs a new instance using a {@link BaseWriter.ListWriter} + * (instead of a {@link BaseWriter.StructWriter}). + */ + public StructOrListWriterImpl(final BaseWriter.ListWriter writer) { + this.struct = null; + this.list = writer; + } + + /** + * Start writing to either the list or the struct. + */ + public void start() { + if (struct != null) { + struct.start(); + } else { + list.startList(); + } + } + + /** + * Finish writing to the list or struct. + */ + public void end() { + if (struct != null) { + struct.end(); + } else { + list.endList(); + } + } + + /** + * Creates a new writer for a struct with the given name. + */ + public StructOrListWriter struct(final String name) { + assert struct != null; + return new StructOrListWriterImpl(struct.struct(name)); + } + + /** + * Creates a new writer for a list of structs. + * + * @param name Unused. + */ + public StructOrListWriter listoftstruct(final String name) { + assert list != null; + return new StructOrListWriterImpl(list.struct()); + } + + public StructOrListWriter list(final String name) { + assert struct != null; + return new StructOrListWriterImpl(struct.list(name)); + } + + public boolean isStructWriter() { + return struct != null; + } + + public boolean isListWriter() { + return list != null; + } + + public VarCharWriter varChar(final String name) { + return (struct != null) ? struct.varChar(name) : list.varChar(); + } + + public IntWriter integer(final String name) { + return (struct != null) ? struct.integer(name) : list.integer(); + } + + public BigIntWriter bigInt(final String name) { + return (struct != null) ? struct.bigInt(name) : list.bigInt(); + } + + public Float4Writer float4(final String name) { + return (struct != null) ? struct.float4(name) : list.float4(); + } + + public Float8Writer float8(final String name) { + return (struct != null) ? 
struct.float8(name) : list.float8(); + } + + public BitWriter bit(final String name) { + return (struct != null) ? struct.bit(name) : list.bit(); + } + + public VarBinaryWriter binary(final String name) { + return (struct != null) ? struct.varBinary(name) : list.varBinary(); + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java new file mode 100644 index 000000000..ece729ae5 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.complex.impl; + +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.holders.UnionHolder; +import org.apache.arrow.vector.types.Types.MinorType; + +/** + * Reader for fixed size list vectors. + */ +public class UnionFixedSizeListReader extends AbstractFieldReader { + + private final FixedSizeListVector vector; + private final ValueVector data; + private final int listSize; + + private int currentOffset; + + /** + * Constructs a new instance that reads data in vector. + */ + public UnionFixedSizeListReader(FixedSizeListVector vector) { + this.vector = vector; + this.data = vector.getDataVector(); + this.listSize = vector.getListSize(); + } + + @Override + public boolean isSet() { + return !vector.isNull(idx()); + } + + @Override + public FieldReader reader() { + return data.getReader(); + } + + @Override + public Object readObject() { + return vector.getObject(idx()); + } + + @Override + public MinorType getMinorType() { + return vector.getMinorType(); + } + + @Override + public void setPosition(int index) { + super.setPosition(index); + data.getReader().setPosition(index * listSize); + currentOffset = 0; + } + + @Override + public void read(int index, UnionHolder holder) { + setPosition(idx()); + for (int i = -1; i < index; i++) { + if (!next()) { + throw new IndexOutOfBoundsException("Requested " + index + ", size " + listSize); + } + } + holder.reader = data.getReader(); + holder.isSet = vector.isNull(idx()) ? 
0 : 1; + } + + @Override + public int size() { + return listSize; + } + + @Override + public boolean next() { + if (currentOffset < listSize) { + data.getReader().setPosition(idx() * listSize + currentOffset++); + return true; + } else { + return false; + } + } + + public void copyAsValue(ListWriter writer) { + ComplexCopier.copy(this, (FieldWriter) writer); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionLargeListReader.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionLargeListReader.java new file mode 100644 index 000000000..faf088b55 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionLargeListReader.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.complex.impl; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; + +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.holders.UnionHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; + +/** + * {@link FieldReader} for list of union types. + */ +public class UnionLargeListReader extends AbstractFieldReader { + + private LargeListVector vector; + private ValueVector data; + private long index; + private static final long OFFSET_WIDTH = 8L; + + public UnionLargeListReader(LargeListVector vector) { + this.vector = vector; + this.data = vector.getDataVector(); + } + + @Override + public Field getField() { + return vector.getField(); + } + + @Override + public boolean isSet() { + return !vector.isNull(idx()); + } + + private long currentOffset; + private long maxOffset; + + @Override + public void setPosition(int index) { + super.setPosition(index); + currentOffset = vector.getOffsetBuffer().getLong((long) index * OFFSET_WIDTH) - 1; + maxOffset = vector.getOffsetBuffer().getLong(((long) index + 1L) * OFFSET_WIDTH); + } + + @Override + public FieldReader reader() { + return data.getReader(); + } + + @Override + public Object readObject() { + return vector.getObject(idx()); + } + + @Override + public MinorType getMinorType() { + return MinorType.LARGELIST; + } + + @Override + public void read(int index, UnionHolder holder) { + setPosition(index); + for (int i = -1; i < index; i++) { + next(); + } + holder.reader = data.getReader(); + holder.isSet = data.getReader().isSet() ? 1 : 0; + } + + @Override + public int size() { + int size = checkedCastToInt(maxOffset - currentOffset - 1); //todo revisit when int64 vectors are done + return size < 0 ? 
0 : size; + } + + @Override + public boolean next() { + if (currentOffset + 1 < maxOffset) { + data.getReader().setPosition(checkedCastToInt(++currentOffset)); // todo revisit when int64 vectors are done + return true; + } else { + return false; + } + } + + public void copyAsValue(UnionLargeListWriter writer) { + ComplexCopier.copy(this, (FieldWriter) writer); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java new file mode 100644 index 000000000..a8c185aef --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.complex.impl; + +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.holders.UnionHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; + +/** + * {@link FieldReader} for list of union types. + */ +public class UnionListReader extends AbstractFieldReader { + + private ListVector vector; + private ValueVector data; + private static final int OFFSET_WIDTH = 4; + + public UnionListReader(ListVector vector) { + this.vector = vector; + this.data = vector.getDataVector(); + } + + @Override + public Field getField() { + return vector.getField(); + } + + @Override + public boolean isSet() { + return !vector.isNull(idx()); + } + + private int currentOffset; + private int maxOffset; + + @Override + public void setPosition(int index) { + super.setPosition(index); + currentOffset = vector.getOffsetBuffer().getInt(index * OFFSET_WIDTH) - 1; + maxOffset = vector.getOffsetBuffer().getInt((index + 1) * OFFSET_WIDTH); + } + + @Override + public FieldReader reader() { + return data.getReader(); + } + + @Override + public Object readObject() { + return vector.getObject(idx()); + } + + @Override + public MinorType getMinorType() { + return MinorType.LIST; + } + + @Override + public void read(int index, UnionHolder holder) { + setPosition(idx()); + for (int i = -1; i < index; i++) { + next(); + } + holder.reader = data.getReader(); + holder.isSet = data.getReader().isSet() ? 1 : 0; + } + + @Override + public int size() { + int size = maxOffset - currentOffset - 1; + return size < 0 ? 
0 : size; + } + + @Override + public boolean next() { + if (currentOffset + 1 < maxOffset) { + data.getReader().setPosition(++currentOffset); + return true; + } else { + return false; + } + } + + public void copyAsValue(ListWriter writer) { + ComplexCopier.copy(this, (FieldWriter) writer); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionMapReader.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionMapReader.java new file mode 100644 index 000000000..7a1bdce9b --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionMapReader.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex.impl; + +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.types.Types.MinorType; + +/** + * Reader for a MapVector. + */ +public class UnionMapReader extends UnionListReader { + + private String keyName = MapVector.KEY_NAME; + private String valueName = MapVector.VALUE_NAME; + + /** + * Construct a new reader for the given vector. 
+ * + * @param vector Vector to read from. + */ + public UnionMapReader(MapVector vector) { + super(vector); + } + + /** + * Set the key, value field names to read. + * + * @param key Field name for key. + * @param value Field name for value. + */ + public void setKeyValueNames(String key, String value) { + keyName = key; + valueName = value; + } + + /** + * Start reading a key from the map entry. + * + * @return reader that can be used to read the key. + */ + public FieldReader key() { + return reader().reader(keyName); + } + + /** + * Start reading a value element from the map entry. + * + * @return reader that can be used to read the value. + */ + public FieldReader value() { + return reader().reader(valueName); + } + + /** + * Return the MinorType of the reader as MAP. + */ + @Override + public MinorType getMinorType() { + return MinorType.MAP; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java new file mode 100644 index 000000000..a888abbaa --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex.reader; + +import org.apache.arrow.vector.complex.reader.BaseReader.ListReader; +import org.apache.arrow.vector.complex.reader.BaseReader.MapReader; +import org.apache.arrow.vector.complex.reader.BaseReader.RepeatedListReader; +import org.apache.arrow.vector.complex.reader.BaseReader.RepeatedMapReader; +import org.apache.arrow.vector.complex.reader.BaseReader.RepeatedStructReader; +import org.apache.arrow.vector.complex.reader.BaseReader.ScalarReader; +import org.apache.arrow.vector.complex.reader.BaseReader.StructReader; + + +/** + * Composite of all Reader types (e.g. {@link StructReader}, {@link ScalarReader}, etc). Each reader type + * is in essence a way of iterating over a {@link org.apache.arrow.vector.ValueVector}. + */ +public interface FieldReader extends StructReader, ListReader, MapReader, ScalarReader, + RepeatedStructReader, RepeatedListReader, RepeatedMapReader { +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java new file mode 100644 index 000000000..a3cb7108a --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex.writer; + +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.ScalarWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter; + +/** + * Composite of all writer types. Writers are convenience classes for incrementally + * adding values to {@linkplain org.apache.arrow.vector.ValueVector}s. + */ +public interface FieldWriter extends StructWriter, ListWriter, MapWriter, ScalarWriter { + void allocate(); + + void clear(); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/AbstractCompressionCodec.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/AbstractCompressionCodec.java new file mode 100644 index 000000000..39b32968d --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/AbstractCompressionCodec.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.compression; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.MemoryUtil; +import org.apache.arrow.util.Preconditions; + +/** + * The base class for concrete compression codecs, providing + * common logic for all compression codecs. + */ +public abstract class AbstractCompressionCodec implements CompressionCodec { + + @Override + public ArrowBuf compress(BufferAllocator allocator, ArrowBuf uncompressedBuffer) { + if (uncompressedBuffer.writerIndex() == 0L) { + // shortcut for empty buffer + ArrowBuf compressedBuffer = allocator.buffer(CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH); + compressedBuffer.setLong(0, 0); + compressedBuffer.writerIndex(CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH); + uncompressedBuffer.close(); + return compressedBuffer; + } + + ArrowBuf compressedBuffer = doCompress(allocator, uncompressedBuffer); + long compressedLength = compressedBuffer.writerIndex() - CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH; + long uncompressedLength = uncompressedBuffer.writerIndex(); + + if (compressedLength > uncompressedLength) { + // compressed buffer is larger, send the raw buffer + compressedBuffer.close(); + compressedBuffer = CompressionUtil.packageRawBuffer(allocator, uncompressedBuffer); + } else { + writeUncompressedLength(compressedBuffer, uncompressedLength); + } + + uncompressedBuffer.close(); + return compressedBuffer; + } + + @Override + public ArrowBuf decompress(BufferAllocator allocator, ArrowBuf compressedBuffer) 
{ + Preconditions.checkArgument(compressedBuffer.writerIndex() >= CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH, + "Not enough data to decompress."); + + long decompressedLength = readUncompressedLength(compressedBuffer); + + if (decompressedLength == 0L) { + // shortcut for empty buffer + compressedBuffer.close(); + return allocator.getEmpty(); + } + + if (decompressedLength == CompressionUtil.NO_COMPRESSION_LENGTH) { + // no compression + return CompressionUtil.extractUncompressedBuffer(compressedBuffer); + } + + ArrowBuf decompressedBuffer = doDecompress(allocator, compressedBuffer); + compressedBuffer.close(); + return decompressedBuffer; + } + + protected void writeUncompressedLength(ArrowBuf compressedBuffer, long uncompressedLength) { + if (!MemoryUtil.LITTLE_ENDIAN) { + uncompressedLength = Long.reverseBytes(uncompressedLength); + } + // first 8 bytes reserved for uncompressed length, according to the specification + compressedBuffer.setLong(0, uncompressedLength); + } + + protected long readUncompressedLength(ArrowBuf compressedBuffer) { + long decompressedLength = compressedBuffer.getLong(0); + if (!MemoryUtil.LITTLE_ENDIAN) { + decompressedLength = Long.reverseBytes(decompressedLength); + } + return decompressedLength; + } + + /** + * The method that actually performs the data compression. + * The layout of the returned compressed buffer is the compressed data, + * plus 8 bytes reserved at the beginning of the buffer for the uncompressed data size. + *

+ * Please note that this method is not responsible for releasing the uncompressed buffer. + *

+ */ + protected abstract ArrowBuf doCompress(BufferAllocator allocator, ArrowBuf uncompressedBuffer); + + /** + * The method that actually performs the data decompression. + * The layout of the compressed buffer is the compressed data, + * plus 8 bytes at the beginning of the buffer storing the uncompressed data size. + *

+ * Please note that this method is not responsible for releasing the compressed buffer. + *

+ */ + protected abstract ArrowBuf doDecompress(BufferAllocator allocator, ArrowBuf compressedBuffer); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/CompressionCodec.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/CompressionCodec.java new file mode 100644 index 000000000..a6dd8b51f --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/CompressionCodec.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.compression; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; + +/** + * The codec for compression/decompression. + */ +public interface CompressionCodec { + + /** + * Compress a buffer. + * @param allocator the allocator for allocating memory for compressed buffer. + * @param uncompressedBuffer the buffer to compress. + * Implementation of this method should take care of releasing this buffer. + * @return the compressed buffer + */ + ArrowBuf compress(BufferAllocator allocator, ArrowBuf uncompressedBuffer); + + /** + * Decompress a buffer. 
+ * @param allocator the allocator for allocating memory for decompressed buffer. + * @param compressedBuffer the buffer to be decompressed. + * Implementation of this method should take care of releasing this buffer. + * @return the decompressed buffer. + */ + ArrowBuf decompress(BufferAllocator allocator, ArrowBuf compressedBuffer); + + /** + * Gets the type of the codec. + * @return the type of the codec. + */ + CompressionUtil.CodecType getCodecType(); + + /** + * Factory to create compression codec. + */ + interface Factory { + + /** + * Creates the codec based on the codec type. + */ + CompressionCodec createCodec(CompressionUtil.CodecType codecType); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/CompressionUtil.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/CompressionUtil.java new file mode 100644 index 000000000..1deb38c84 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/CompressionUtil.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.compression; + +import org.apache.arrow.flatbuf.BodyCompressionMethod; +import org.apache.arrow.flatbuf.CompressionType; +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.message.ArrowBodyCompression; + +/** + * Utilities for data compression/decompression. + */ +public class CompressionUtil { + + /** + * Compression codec types corresponding to flat buffer implementation in {@link CompressionType}. + */ + public enum CodecType { + + NO_COMPRESSION(NoCompressionCodec.COMPRESSION_TYPE), + + LZ4_FRAME(org.apache.arrow.flatbuf.CompressionType.LZ4_FRAME), + + ZSTD(org.apache.arrow.flatbuf.CompressionType.ZSTD); + + private final byte type; + + CodecType(byte type) { + this.type = type; + } + + public byte getType() { + return type; + } + + /** + * Gets the codec type from the compression type defined in {@link CompressionType}. + */ + public static CodecType fromCompressionType(byte type) { + for (CodecType codecType : values()) { + if (codecType.type == type) { + return codecType; + } + } + return NO_COMPRESSION; + } + } + + public static final long SIZE_OF_UNCOMPRESSED_LENGTH = 8L; + + /** + * Special flag to indicate no compression. + * (e.g. when the compressed buffer has a larger size.) + */ + public static final long NO_COMPRESSION_LENGTH = -1L; + + private CompressionUtil() { + } + + /** + * Creates the {@link ArrowBodyCompression} object, given the {@link CompressionCodec}. + * The implementation of this method should depend on the values of + * {@link org.apache.arrow.flatbuf.CompressionType#names}. + */ + public static ArrowBodyCompression createBodyCompression(CompressionCodec codec) { + return new ArrowBodyCompression(codec.getCodecType().getType(), BodyCompressionMethod.BUFFER); + } + + /** + * Process compression by compressing the buffer as is. 
+ */ + public static ArrowBuf packageRawBuffer(BufferAllocator allocator, ArrowBuf inputBuffer) { + ArrowBuf compressedBuffer = allocator.buffer(SIZE_OF_UNCOMPRESSED_LENGTH + inputBuffer.writerIndex()); + compressedBuffer.setLong(0, NO_COMPRESSION_LENGTH); + compressedBuffer.setBytes(SIZE_OF_UNCOMPRESSED_LENGTH, inputBuffer, 0, inputBuffer.writerIndex()); + compressedBuffer.writerIndex(SIZE_OF_UNCOMPRESSED_LENGTH + inputBuffer.writerIndex()); + return compressedBuffer; + } + + /** + * Process decompression by slicing the buffer that contains the uncompressed bytes. + */ + public static ArrowBuf extractUncompressedBuffer(ArrowBuf inputBuffer) { + return inputBuffer.slice(SIZE_OF_UNCOMPRESSED_LENGTH, + inputBuffer.writerIndex() - SIZE_OF_UNCOMPRESSED_LENGTH); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/NoCompressionCodec.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/NoCompressionCodec.java new file mode 100644 index 000000000..e5e8e9d46 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/compression/NoCompressionCodec.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * The default compression codec that does no compression.
 *
 * <p>Both {@link #compress} and {@link #decompress} hand back the very buffer they were given;
 * the allocator argument is not used by either method.
 */
public class NoCompressionCodec implements CompressionCodec {

  /** The shared instance; the constructor is private, so this is the only one. */
  public static final NoCompressionCodec INSTANCE = new NoCompressionCodec();

  /** The byte value used to identify "no compression". */
  public static final byte COMPRESSION_TYPE = -1;

  /** Body compression descriptor built from {@link #COMPRESSION_TYPE} and the BUFFER method. */
  public static final ArrowBodyCompression DEFAULT_BODY_COMPRESSION =
      new ArrowBodyCompression(COMPRESSION_TYPE, BodyCompressionMethod.BUFFER);

  private NoCompressionCodec() {
  }

  /**
   * Returns {@code uncompressedBuffer} itself, unchanged.
   */
  @Override
  public ArrowBuf compress(BufferAllocator allocator, ArrowBuf uncompressedBuffer) {
    return uncompressedBuffer;
  }

  /**
   * Returns {@code compressedBuffer} itself, unchanged.
   */
  @Override
  public ArrowBuf decompress(BufferAllocator allocator, ArrowBuf compressedBuffer) {
    return compressedBuffer;
  }

  /**
   * Always reports {@link CompressionUtil.CodecType#NO_COMPRESSION}.
   */
  @Override
  public CompressionUtil.CodecType getCodecType() {
    return CompressionUtil.CodecType.NO_COMPRESSION;
  }

  /**
   * The default factory that creates a {@link NoCompressionCodec}.
   */
  public static class Factory implements CompressionCodec.Factory {

    /** The shared factory instance. */
    public static final NoCompressionCodec.Factory INSTANCE = new NoCompressionCodec.Factory();

    /**
     * Returns {@link NoCompressionCodec#INSTANCE} whatever {@code codecType} is requested;
     * the argument is ignored.
     */
    @Override
    public CompressionCodec createCodec(CompressionUtil.CodecType codecType) {
      return NoCompressionCodec.INSTANCE;
    }
  }
}
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.dictionary; + +import java.util.Objects; + +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.compare.VectorEqualsVisitor; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; + +/** + * A dictionary (integer to Value mapping) that is used to facilitate + * dictionary encoding compression. 
+ */ +public class Dictionary { + + private final DictionaryEncoding encoding; + private final FieldVector dictionary; + + public Dictionary(FieldVector dictionary, DictionaryEncoding encoding) { + this.dictionary = dictionary; + this.encoding = encoding; + } + + public FieldVector getVector() { + return dictionary; + } + + public DictionaryEncoding getEncoding() { + return encoding; + } + + public ArrowType getVectorType() { + return dictionary.getField().getType(); + } + + @Override + public String toString() { + return "Dictionary " + encoding + " " + dictionary; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Dictionary that = (Dictionary) o; + return Objects.equals(encoding, that.encoding) && + new VectorEqualsVisitor().vectorEquals(that.dictionary, dictionary); + } + + @Override + public int hashCode() { + return Objects.hash(encoding, dictionary); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java new file mode 100644 index 000000000..babb0dbd3 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.dictionary; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.memory.util.hash.SimpleHasher; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.BaseIntVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * Encoder/decoder for Dictionary encoded {@link ValueVector}. Dictionary encoding produces an + * integer {@link ValueVector}. Each entry in the Vector is index into the dictionary which can hold + * values of any type. + */ +public class DictionaryEncoder { + + private final DictionaryHashTable hashTable; + private final Dictionary dictionary; + private final BufferAllocator allocator; + + /** + * Construct an instance. + */ + public DictionaryEncoder(Dictionary dictionary, BufferAllocator allocator) { + this (dictionary, allocator, SimpleHasher.INSTANCE); + } + + /** + * Construct an instance. + */ + public DictionaryEncoder(Dictionary dictionary, BufferAllocator allocator, ArrowBufHasher hasher) { + this.dictionary = dictionary; + this.allocator = allocator; + hashTable = new DictionaryHashTable(dictionary.getVector(), hasher); + } + + /** + * Dictionary encodes a vector with a provided dictionary. 
The dictionary must contain all values in the vector. + * + * @param vector vector to encode + * @param dictionary dictionary used for encoding + * @return dictionary encoded vector + */ + public static ValueVector encode(ValueVector vector, Dictionary dictionary) { + DictionaryEncoder encoder = new DictionaryEncoder(dictionary, vector.getAllocator()); + return encoder.encode(vector); + } + + /** + * Decodes a dictionary encoded array using the provided dictionary. + * + * @param indices dictionary encoded values, must be int type + * @param dictionary dictionary used to decode the values + * @return vector with values restored from dictionary + */ + public static ValueVector decode(ValueVector indices, Dictionary dictionary) { + DictionaryEncoder encoder = new DictionaryEncoder(dictionary, indices.getAllocator()); + return encoder.decode(indices); + } + + /** + * Get the indexType according to the dictionary vector valueCount. + * @param valueCount dictionary vector valueCount. + * @return index type. + */ + public static ArrowType.Int getIndexType(int valueCount) { + Preconditions.checkArgument(valueCount >= 0); + if (valueCount <= Byte.MAX_VALUE) { + return new ArrowType.Int(8, true); + } else if (valueCount <= Character.MAX_VALUE) { + return new ArrowType.Int(16, true); + } else if (valueCount <= Integer.MAX_VALUE) { + return new ArrowType.Int(32, true); + } else { + return new ArrowType.Int(64, true); + } + } + + /** + * Populates indices between start and end with the encoded values of vector. 
+ * @param vector the vector to encode + * @param indices the index vector + * @param encoding the hash table for encoding + * @param start the start index + * @param end the end index + */ + static void buildIndexVector( + ValueVector vector, + BaseIntVector indices, + DictionaryHashTable encoding, + int start, + int end) { + + for (int i = start; i < end; i++) { + if (!vector.isNull(i)) { + // if it's null leave it null + // note: this may fail if value was not included in the dictionary + int encoded = encoding.getIndex(i, vector); + if (encoded == -1) { + throw new IllegalArgumentException("Dictionary encoding not defined for value:" + vector.getObject(i)); + } + indices.setWithPossibleTruncate(i, encoded); + } + } + } + + /** + * Retrieve values to target vector from index vector. + * @param indices the index vector + * @param transfer the {@link TransferPair} to copy dictionary data into target vector. + * @param dictionaryCount the value count of dictionary vector. + * @param start the start index + * @param end the end index + */ + static void retrieveIndexVector( + BaseIntVector indices, + TransferPair transfer, + int dictionaryCount, + int start, + int end) { + for (int i = start; i < end; i++) { + if (!indices.isNull(i)) { + int indexAsInt = (int) indices.getValueAsLong(i); + if (indexAsInt > dictionaryCount) { + throw new IllegalArgumentException("Provided dictionary does not contain value for index " + indexAsInt); + } + transfer.copyValueSafe(indexAsInt, i); + } + } + } + + /** + * Encodes a vector with the built hash table in this encoder. 
+ */ + public ValueVector encode(ValueVector vector) { + + Field valueField = vector.getField(); + FieldType indexFieldType = new FieldType(valueField.isNullable(), dictionary.getEncoding().getIndexType(), + dictionary.getEncoding(), valueField.getMetadata()); + Field indexField = new Field(valueField.getName(), indexFieldType, null); + + // vector to hold our indices (dictionary encoded values) + FieldVector createdVector = indexField.createVector(allocator); + if (! (createdVector instanceof BaseIntVector)) { + throw new IllegalArgumentException("Dictionary encoding does not have a valid int type:" + + createdVector.getClass()); + } + + BaseIntVector indices = (BaseIntVector) createdVector; + indices.allocateNew(); + + buildIndexVector(vector, indices, hashTable, 0, vector.getValueCount()); + indices.setValueCount(vector.getValueCount()); + return indices; + } + + /** + * Decodes a vector with the built hash table in this encoder. + */ + public ValueVector decode(ValueVector indices) { + int count = indices.getValueCount(); + ValueVector dictionaryVector = dictionary.getVector(); + int dictionaryCount = dictionaryVector.getValueCount(); + // copy the dictionary values into the decoded vector + TransferPair transfer = dictionaryVector.getTransferPair(allocator); + transfer.getTo().allocateNewSafe(); + + BaseIntVector baseIntVector = (BaseIntVector) indices; + retrieveIndexVector(baseIntVector, transfer, dictionaryCount, 0, count); + ValueVector decoded = transfer.getTo(); + decoded.setValueCount(count); + return decoded; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java new file mode 100644 index 000000000..9926a8e2a --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java @@ -0,0 +1,295 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.dictionary; + +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.memory.util.hash.SimpleHasher; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.Range; +import org.apache.arrow.vector.compare.RangeEqualsVisitor; + +/** + * HashTable used for Dictionary encoding. It holds two vectors (the vector to encode and dictionary vector) + * It stores the index in dictionary vector and for a given index in encode vector, + * it could return dictionary index. + */ +public class DictionaryHashTable { + + /** + * Represents a null value in map. + */ + static final int NULL_VALUE = -1; + + /** + * The default initial capacity - MUST be a power of two. + */ + static final int DEFAULT_INITIAL_CAPACITY = 1 << 4; + + /** + * The maximum capacity, used if a higher value is implicitly specified + * by either of the constructors with arguments. + */ + static final int MAXIMUM_CAPACITY = 1 << 30; + + /** + * The load factor used when none specified in constructor. 
+ */ + static final float DEFAULT_LOAD_FACTOR = 0.75f; + + static final DictionaryHashTable.Entry[] EMPTY_TABLE = {}; + + /** + * The table, initialized on first use, and resized as + * necessary. When allocated, length is always a power of two. + */ + transient DictionaryHashTable.Entry[] table = EMPTY_TABLE; + + /** + * The number of key-value mappings contained in this map. + */ + transient int size; + + /** + * The next size value at which to resize (capacity * load factor). + */ + int threshold; + + /** + * The load factor for the hash table. + */ + final float loadFactor; + + private final ValueVector dictionary; + + private final ArrowBufHasher hasher; + + /** + * Constructs an empty map with the specified initial capacity and load factor. + */ + public DictionaryHashTable(int initialCapacity, ValueVector dictionary, ArrowBufHasher hasher) { + if (initialCapacity < 0) { + throw new IllegalArgumentException("Illegal initial capacity: " + + initialCapacity); + } + if (initialCapacity > MAXIMUM_CAPACITY) { + initialCapacity = MAXIMUM_CAPACITY; + } + this.loadFactor = DEFAULT_LOAD_FACTOR; + this.threshold = initialCapacity; + + this.dictionary = dictionary; + + this.hasher = hasher; + + // build hash table + for (int i = 0; i < this.dictionary.getValueCount(); i++) { + put(i); + } + } + + public DictionaryHashTable(ValueVector dictionary, ArrowBufHasher hasher) { + this(DEFAULT_INITIAL_CAPACITY, dictionary, hasher); + } + + public DictionaryHashTable(ValueVector dictionary) { + this(dictionary, SimpleHasher.INSTANCE); + } + + /** + * Compute the capacity with given threshold and create init table. + */ + private void inflateTable(int threshold) { + int capacity = roundUpToPowerOf2(threshold); + this.threshold = (int) Math.min(capacity * loadFactor, MAXIMUM_CAPACITY + 1); + table = new DictionaryHashTable.Entry[capacity]; + } + + /** + * Computes the storage location in an array for the given hashCode. 
+ */ + static int indexFor(int h, int length) { + return h & (length - 1); + } + + /** + * Returns a power of two size for the given size. + */ + static final int roundUpToPowerOf2(int size) { + int n = size - 1; + n |= n >>> 1; + n |= n >>> 2; + n |= n >>> 4; + n |= n >>> 8; + n |= n >>> 16; + return (n < 0) ? 1 : (n >= MAXIMUM_CAPACITY) ? MAXIMUM_CAPACITY : n + 1; + } + + /** + * get the corresponding dictionary index with the given index in vector which to encode. + * @param indexInArray index in vector. + * @return dictionary vector index or -1 if no value equals. + */ + public int getIndex(int indexInArray, ValueVector toEncode) { + int hash = toEncode.hashCode(indexInArray, this.hasher); + int index = indexFor(hash, table.length); + + RangeEqualsVisitor equalVisitor = new RangeEqualsVisitor(dictionary, toEncode, null); + Range range = new Range(0, 0, 1); + + for (DictionaryHashTable.Entry e = table[index]; e != null ; e = e.next) { + if (e.hash == hash) { + int dictIndex = e.index; + + range = range.setRightStart(indexInArray) + .setLeftStart(dictIndex); + if (equalVisitor.rangeEquals(range)) { + return dictIndex; + } + } + } + return NULL_VALUE; + } + + /** + * put the index of dictionary vector to build hash table. + */ + private void put(int indexInDictionary) { + if (table == EMPTY_TABLE) { + inflateTable(threshold); + } + + int hash = dictionary.hashCode(indexInDictionary, this.hasher); + int i = indexFor(hash, table.length); + for (DictionaryHashTable.Entry e = table[i]; e != null; e = e.next) { + if (e.hash == hash && e.index == indexInDictionary) { + //already has this index, return + return; + } + } + + addEntry(hash, indexInDictionary, i); + } + + /** + * Create a new Entry at the specific position of table. 
+ */ + void createEntry(int hash, int index, int bucketIndex) { + DictionaryHashTable.Entry e = table[bucketIndex]; + table[bucketIndex] = new DictionaryHashTable.Entry(hash, index, e); + size++; + } + + /** + * Add Entry at the specified location of the table. + */ + void addEntry(int hash, int index, int bucketIndex) { + if ((size >= threshold) && (null != table[bucketIndex])) { + resize(2 * table.length); + bucketIndex = indexFor(hash, table.length); + } + + createEntry(hash, index, bucketIndex); + } + + /** + * Resize table with given new capacity. + */ + void resize(int newCapacity) { + DictionaryHashTable.Entry[] oldTable = table; + int oldCapacity = oldTable.length; + if (oldCapacity == MAXIMUM_CAPACITY) { + threshold = Integer.MAX_VALUE; + return; + } + + DictionaryHashTable.Entry[] newTable = new DictionaryHashTable.Entry[newCapacity]; + transfer(newTable); + table = newTable; + threshold = (int) Math.min(newCapacity * loadFactor, MAXIMUM_CAPACITY + 1); + } + + /** + * Transfer entries into new table from old table. + * @param newTable new table + */ + void transfer(DictionaryHashTable.Entry[] newTable) { + int newCapacity = newTable.length; + for (DictionaryHashTable.Entry e : table) { + while (null != e) { + DictionaryHashTable.Entry next = e.next; + int i = indexFor(e.hash, newCapacity); + e.next = newTable[i]; + newTable[i] = e; + e = next; + } + } + } + + /** + * Returns the number of mappings in this Map. + */ + public int size() { + return size; + } + + /** + * Removes all elements from this map, leaving it empty. + */ + public void clear() { + size = 0; + for (int i = 0; i < table.length; i++) { + table[i] = null; + } + } + + /** + * Class to keep dictionary index data within hash table. 
+ */ + static class Entry { + //dictionary index + int index; + DictionaryHashTable.Entry next; + int hash; + + Entry(int hash, int index, DictionaryHashTable.Entry next) { + this.index = index; + this.hash = hash; + this.next = next; + } + + public final int getIndex() { + return this.index; + } + + @Override + public int hashCode() { + return hash; + } + + public final boolean equals(Object o) { + if (!(o instanceof DictionaryHashTable.Entry)) { + return false; + } + DictionaryHashTable.Entry e = (DictionaryHashTable.Entry) o; + if (index == e.getIndex()) { + return true; + } + return false; + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryProvider.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryProvider.java new file mode 100644 index 000000000..21165c07d --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryProvider.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.dictionary; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +/** + * A manager for association of dictionary IDs to their corresponding {@link Dictionary}. + */ +public interface DictionaryProvider { + + /** Return the dictionary for the given ID. */ + Dictionary lookup(long id); + + /** + * Implementation of {@link DictionaryProvider} that is backed by a hash-map. + */ + class MapDictionaryProvider implements DictionaryProvider { + + private final Map map; + + /** + * Constructs a new instance from the given dictionaries. + */ + public MapDictionaryProvider(Dictionary... dictionaries) { + this.map = new HashMap<>(); + for (Dictionary dictionary : dictionaries) { + put(dictionary); + } + } + + public void put(Dictionary dictionary) { + map.put(dictionary.getEncoding().getId(), dictionary); + } + + public final Set getDictionaryIds() { + return map.keySet(); + } + + @Override + public Dictionary lookup(long id) { + return map.get(id); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ListSubfieldEncoder.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ListSubfieldEncoder.java new file mode 100644 index 000000000..dd2bb26e3 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ListSubfieldEncoder.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.dictionary; + +import java.util.Collections; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.memory.util.hash.SimpleHasher; +import org.apache.arrow.vector.BaseIntVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.BaseListVector; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * Sub fields encoder/decoder for Dictionary encoded {@link BaseListVector}. + */ +public class ListSubfieldEncoder { + + private final DictionaryHashTable hashTable; + private final Dictionary dictionary; + private final BufferAllocator allocator; + + public ListSubfieldEncoder(Dictionary dictionary, BufferAllocator allocator) { + this (dictionary, allocator, SimpleHasher.INSTANCE); + } + + /** + * Construct an instance. 
+ */ + public ListSubfieldEncoder(Dictionary dictionary, BufferAllocator allocator, ArrowBufHasher hasher) { + this.dictionary = dictionary; + this.allocator = allocator; + BaseListVector dictVector = (BaseListVector) dictionary.getVector(); + hashTable = new DictionaryHashTable(getDataVector(dictVector), hasher); + } + + private FieldVector getDataVector(BaseListVector vector) { + return vector.getChildrenFromFields().get(0); + } + + private BaseListVector cloneVector(BaseListVector vector) { + + final FieldType fieldType = vector.getField().getFieldType(); + BaseListVector cloned = (BaseListVector) fieldType.createNewSingleVector(vector.getField().getName(), + allocator, /*schemaCallBack=*/null); + + final ArrowFieldNode fieldNode = new ArrowFieldNode(vector.getValueCount(), vector.getNullCount()); + cloned.loadFieldBuffers(fieldNode, vector.getFieldBuffers()); + + return cloned; + } + + /** + * Dictionary encodes subfields for complex vector with a provided dictionary. + * The dictionary must contain all values in the sub fields vector. 
+ * @param vector vector to encode + * @return dictionary encoded vector + */ + public BaseListVector encodeListSubField(BaseListVector vector) { + final int valueCount = vector.getValueCount(); + + FieldType indexFieldType = new FieldType(vector.getField().isNullable(), + dictionary.getEncoding().getIndexType(), dictionary.getEncoding(), vector.getField().getMetadata()); + Field valueField = new Field(vector.getField().getName(), indexFieldType, null); + + // clone list vector and initialize data vector + BaseListVector encoded = cloneVector(vector); + encoded.initializeChildrenFromFields(Collections.singletonList(valueField)); + BaseIntVector indices = (BaseIntVector) getDataVector(encoded); + + ValueVector dataVector = getDataVector(vector); + for (int i = 0; i < valueCount; i++) { + if (!vector.isNull(i)) { + int start = vector.getElementStartIndex(i); + int end = vector.getElementEndIndex(i); + + DictionaryEncoder.buildIndexVector(dataVector, indices, hashTable, start, end); + } + } + + return encoded; + } + + /** + * Decodes a dictionary subfields encoded vector using the provided dictionary. 
+ * @param vector dictionary encoded vector, its data vector must be int type + * @return vector with values restored from dictionary + */ + public BaseListVector decodeListSubField(BaseListVector vector) { + + int valueCount = vector.getValueCount(); + BaseListVector dictionaryVector = (BaseListVector) dictionary.getVector(); + int dictionaryValueCount = getDataVector(dictionaryVector).getValueCount(); + + // clone list vector and initialize data vector + BaseListVector decoded = cloneVector(vector); + Field dataVectorField = getDataVector(dictionaryVector).getField(); + decoded.initializeChildrenFromFields(Collections.singletonList(dataVectorField)); + + // get data vector + ValueVector dataVector = getDataVector(decoded); + + TransferPair transfer = getDataVector(dictionaryVector).makeTransferPair(dataVector); + BaseIntVector indices = (BaseIntVector) getDataVector(vector); + + for (int i = 0; i < valueCount; i++) { + + if (!vector.isNull(i)) { + int start = vector.getElementStartIndex(i); + int end = vector.getElementEndIndex(i); + + DictionaryEncoder.retrieveIndexVector(indices, transfer, dictionaryValueCount, start, end); + } + } + return decoded; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/StructSubfieldEncoder.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/StructSubfieldEncoder.java new file mode 100644 index 000000000..6542b298d --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/StructSubfieldEncoder.java @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.dictionary;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.util.hash.ArrowBufHasher;
+import org.apache.arrow.memory.util.hash.SimpleHasher;
+import org.apache.arrow.util.Preconditions;
+import org.apache.arrow.vector.BaseIntVector;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.complex.StructVector;
+import org.apache.arrow.vector.ipc.message.ArrowFieldNode;
+import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.util.TransferPair;
+
+/**
+ * Sub fields encoder/decoder for Dictionary encoded {@link StructVector}.
+ * Note that child vectors within a struct vector can either be dictionary encodable or not.
+ */
+public class StructSubfieldEncoder {
+
+  private final BufferAllocator allocator;
+
+  private final DictionaryProvider.MapDictionaryProvider provider;
+
+  /** One hash table per dictionary id known to {@link #provider} at construction time. */
+  private final Map<Long, DictionaryHashTable> dictionaryIdToHashTable;
+
+  /**
+   * Construct an instance that hashes dictionary values with {@link SimpleHasher#INSTANCE}.
+   *
+   * @param allocator allocator used for the encoded/decoded vectors
+   * @param provider  source of the dictionaries, looked up by id
+   */
+  public StructSubfieldEncoder(BufferAllocator allocator, DictionaryProvider.MapDictionaryProvider provider) {
+    this(allocator, provider, SimpleHasher.INSTANCE);
+  }
+
+  /**
+   * Construct an instance.
+   *
+   * <p>A {@link DictionaryHashTable} is built eagerly for every dictionary id the provider
+   * reports at this point.
+   *
+   * @param allocator allocator used for the encoded/decoded vectors
+   * @param provider  source of the dictionaries, looked up by id
+   * @param hasher    hasher handed to each {@link DictionaryHashTable}
+   */
+  public StructSubfieldEncoder(
+      BufferAllocator allocator,
+      DictionaryProvider.MapDictionaryProvider provider,
+      ArrowBufHasher hasher) {
+
+    this.allocator = allocator;
+    this.provider = provider;
+
+    this.dictionaryIdToHashTable = new HashMap<>();
+
+    provider.getDictionaryIds().forEach(id ->
+        dictionaryIdToHashTable.put(id, new DictionaryHashTable(provider.lookup(id).getVector(), hasher)));
+  }
+
+  private FieldVector getChildVector(StructVector vector, int index) {
+    return vector.getChildrenFromFields().get(index);
+  }
+
+  /**
+   * Creates a new struct vector with the same name and field type as {@code vector} and loads
+   * the source vector's own field buffers into it. Children are initialized by the caller.
+   */
+  private StructVector cloneVector(StructVector vector) {
+
+    final FieldType fieldType = vector.getField().getFieldType();
+    StructVector cloned = (StructVector) fieldType.createNewSingleVector(
+        vector.getField().getName(), allocator, /*schemaCallback=*/null);
+
+    final ArrowFieldNode fieldNode = new ArrowFieldNode(vector.getValueCount(), vector.getNullCount());
+    cloned.loadFieldBuffers(fieldNode, vector.getFieldBuffers());
+
+    return cloned;
+  }
+
+  /**
+   * Dictionary encodes subfields for complex vector with a provided dictionary.
+   * The dictionary must contain all values in the sub fields vector.
+   * @param vector vector to encode
+   * @param columnToDictionaryId the mappings between child vector index and dictionary id. A null dictionary
+   *        id indicates the child vector is not encodable.
+   * @return dictionary encoded vector
+   */
+  public StructVector encode(StructVector vector, Map<Integer, Long> columnToDictionaryId) {
+    final int valueCount = vector.getValueCount();
+    final int childCount = vector.getChildrenFromFields().size();
+
+    List<Field> childrenFields = new ArrayList<>();
+
+    // initialize child fields.
+    for (int i = 0; i < childCount; i++) {
+      FieldVector childVector = getChildVector(vector, i);
+      Long dictionaryId = columnToDictionaryId.get(i);
+      // A null dictionaryId indicates the child vector shouldn't be encoded.
+      if (dictionaryId == null) {
+        childrenFields.add(childVector.getField());
+      } else {
+        Dictionary dictionary = provider.lookup(dictionaryId);
+        Preconditions.checkNotNull(dictionary, "Dictionary not found with id:" + dictionaryId);
+        FieldType indexFieldType = new FieldType(childVector.getField().isNullable(),
+            dictionary.getEncoding().getIndexType(), dictionary.getEncoding());
+        childrenFields.add(new Field(childVector.getField().getName(), indexFieldType, /*children=*/null));
+      }
+    }
+
+    // clone struct vector and initialize child vectors
+    StructVector encoded = cloneVector(vector);
+    encoded.initializeChildrenFromFields(childrenFields);
+    encoded.setValueCount(valueCount);
+
+    for (int index = 0; index < childCount; index++) {
+      FieldVector childVector = getChildVector(vector, index);
+      FieldVector encodedChildVector = getChildVector(encoded, index);
+      Long dictionaryId = columnToDictionaryId.get(index);
+      if (dictionaryId != null) {
+        BaseIntVector indices = (BaseIntVector) encodedChildVector;
+        DictionaryEncoder.buildIndexVector(childVector, indices, dictionaryIdToHashTable.get(dictionaryId),
+            0, valueCount);
+      } else {
+        // not encodable: move the child's data across unchanged
+        childVector.makeTransferPair(encodedChildVector).splitAndTransfer(0, valueCount);
+      }
+    }
+
+    return encoded;
+  }
+
+  /**
+   * Decodes a dictionary subfields encoded vector using the provided dictionary.
+   * @param vector dictionary encoded vector, its child vector must be int type
+   * @return vector with values restored from dictionary
+   */
+  public StructVector decode(StructVector vector) {
+
+    final int valueCount = vector.getValueCount();
+    final int childCount = vector.getChildrenFromFields().size();
+
+    // clone struct vector and initialize child vectors
+    StructVector decoded = cloneVector(vector);
+    List<Field> childFields = new ArrayList<>();
+    for (int i = 0; i < childCount; i++) {
+      FieldVector childVector = getChildVector(vector, i);
+      Dictionary dictionary = getChildVectorDictionary(childVector);
+      // childVector is not encoded.
+      if (dictionary == null) {
+        childFields.add(childVector.getField());
+      } else {
+        childFields.add(dictionary.getVector().getField());
+      }
+    }
+    decoded.initializeChildrenFromFields(childFields);
+    decoded.setValueCount(valueCount);
+
+    for (int index = 0; index < childCount; index++) {
+      // get child vector
+      FieldVector childVector = getChildVector(vector, index);
+      FieldVector decodedChildVector = getChildVector(decoded, index);
+      Dictionary dictionary = getChildVectorDictionary(childVector);
+      if (dictionary == null) {
+        childVector.makeTransferPair(decodedChildVector).splitAndTransfer(0, valueCount);
+      } else {
+        TransferPair transfer = dictionary.getVector().makeTransferPair(decodedChildVector);
+        BaseIntVector indices = (BaseIntVector) childVector;
+
+        DictionaryEncoder.retrieveIndexVector(indices, transfer, valueCount, 0, valueCount);
+      }
+    }
+
+    return decoded;
+  }
+
+  /**
+   * Get the child vector dictionary, return null if not dictionary encoded.
+   *
+   * @throws NullPointerException if the child is dictionary encoded but the provider has no
+   *         dictionary registered under the encoding's id
+   */
+  private Dictionary getChildVectorDictionary(FieldVector childVector) {
+    DictionaryEncoding dictionaryEncoding = childVector.getField().getDictionary();
+    if (dictionaryEncoding != null) {
+      Dictionary dictionary = provider.lookup(dictionaryEncoding.getId());
+      // Report the id that was looked up; the dictionary itself is null on this path.
+      Preconditions.checkNotNull(dictionary, "Dictionary not found with id:" + dictionaryEncoding.getId());
+      return dictionary;
+    }
+    return null;
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java
new file mode 100644
index 000000000..b4cb4882f
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
 + * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.holders;
+
+import org.apache.arrow.vector.complex.reader.FieldReader;
+
+/**
+ * Represents a single value of a complex type (e.g. Union, Struct).
+ */
+public class ComplexHolder implements ValueHolder {
+  /** Reader giving access to the held complex value. */
+  public FieldReader reader;
+  /**
+   * Presence flag.
+   * NOTE(review): UnionHolder and DenseUnionHolder treat {@code isSet == 1} as "value present";
+   * presumably the same convention applies here - confirm.
+   */
+  public int isSet;
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/DenseUnionHolder.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/DenseUnionHolder.java
new file mode 100644
index 000000000..c3052711e
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/DenseUnionHolder.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.holders;
+
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.types.Types;
+
+/**
+ * {@link ValueHolder} for Dense Union Vectors.
+ */
+public class DenseUnionHolder implements ValueHolder {
+  /** Reader giving access to the held value; {@link #getMinorType()} delegates to it. */
+  public FieldReader reader;
+  /** Presence flag: {@link #isSet()} reports true only when this equals 1. */
+  public int isSet;
+  /** NOTE(review): presumably the dense-union type id of the held value - confirm against the writer. */
+  public byte typeId;
+
+  /**
+   * Returns the minor type reported by {@link #reader}.
+   * Fails with a NullPointerException if {@code reader} has not been assigned.
+   */
+  public Types.MinorType getMinorType() {
+    return reader.getMinorType();
+  }
+
+  /** Returns true when the {@code isSet} field equals 1. */
+  public boolean isSet() {
+    return isSet == 1;
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java
new file mode 100644
index 000000000..fc17704f0
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.holders;
+
+/**
+ * {@link ValueHolder} for a nested {@link org.apache.arrow.vector.complex.ListVector}.
 + */
+public final class RepeatedListHolder implements ValueHolder {
+  // NOTE(review): start/end presumably delimit the held list's element range in the
+  // underlying data vector (inclusive/exclusive not visible here) - confirm with callers.
+  public int start;
+  public int end;
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedStructHolder.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedStructHolder.java
new file mode 100644
index 000000000..32c590a8a
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedStructHolder.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.holders;
+
+/**
+ * {@link ValueHolder} for a list of structs.
+ */
+public final class RepeatedStructHolder implements ValueHolder {
+  // NOTE(review): start/end presumably delimit the range of struct entries belonging to
+  // the held list (inclusive/exclusive not visible here) - confirm with callers.
+  public int start;
+  public int end;
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java
new file mode 100644
index 000000000..e67a0e941
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.holders;
+
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.types.Types.MinorType;
+
+/**
+ * {@link ValueHolder} for Union Vectors.
+ */
+public class UnionHolder implements ValueHolder {
+  /** Reader giving access to the held value; {@link #getMinorType()} delegates to it. */
+  public FieldReader reader;
+  /** Presence flag: {@link #isSet()} reports true only when this equals 1. */
+  public int isSet;
+
+  /**
+   * Returns the minor type reported by {@link #reader}.
+   * Fails with a NullPointerException if {@code reader} has not been assigned.
+   */
+  public MinorType getMinorType() {
+    return reader.getMinorType();
+  }
+
+  /** Returns true when the {@code isSet} field equals 1. */
+  public boolean isSet() {
+    return isSet == 1;
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java
new file mode 100644
index 000000000..a809e6bb8
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.holders;
+
+/**
+ * Wrapper object for an individual value in Arrow.
+ *
+ * <p>ValueHolders are designed to be mutable wrapper objects for defining clean
+ * APIs that access data in Arrow. For performance, object creation is avoided
+ * at all costs throughout execution. For this reason, ValueHolders are
+ * disallowed from implementing any methods; this allows for them to be
+ * replaced by their java primitive inner members during optimization of
+ * run-time generated code.
+ */
+public interface ValueHolder {
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileReader.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileReader.java
new file mode 100644
index 000000000..f4e9e0db1
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileReader.java
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.arrow.vector.ipc; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.SeekableByteChannel; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.arrow.flatbuf.Footer; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.VisibleForTesting; +import org.apache.arrow.vector.compression.CompressionCodec; +import org.apache.arrow.vector.compression.NoCompressionCodec; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.message.ArrowDictionaryBatch; +import org.apache.arrow.vector.ipc.message.ArrowFooter; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.ipc.message.MessageSerializer; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.validate.MetadataV4UnionChecker; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * An implementation of {@link ArrowReader} that reads the standard arrow binary + * file format. 
+ */ +public class ArrowFileReader extends ArrowReader { + + private static final Logger LOGGER = LoggerFactory.getLogger(ArrowFileReader.class); + + private SeekableReadChannel in; + private ArrowFooter footer; + private int currentDictionaryBatch = 0; + private int currentRecordBatch = 0; + + public ArrowFileReader( + SeekableReadChannel in, BufferAllocator allocator, CompressionCodec.Factory compressionFactory) { + super(allocator, compressionFactory); + this.in = in; + } + + public ArrowFileReader( + SeekableByteChannel in, BufferAllocator allocator, CompressionCodec.Factory compressionFactory) { + this(new SeekableReadChannel(in), allocator, compressionFactory); + } + + public ArrowFileReader(SeekableReadChannel in, BufferAllocator allocator) { + this(in, allocator, NoCompressionCodec.Factory.INSTANCE); + } + + public ArrowFileReader(SeekableByteChannel in, BufferAllocator allocator) { + this(new SeekableReadChannel(in), allocator); + } + + @Override + public long bytesRead() { + return in.bytesRead(); + } + + @Override + protected void closeReadSource() throws IOException { + in.close(); + } + + @Override + protected Schema readSchema() throws IOException { + if (footer == null) { + if (in.size() <= (ArrowMagic.MAGIC_LENGTH * 2 + 4)) { + throw new InvalidArrowFileException("file too small: " + in.size()); + } + ByteBuffer buffer = ByteBuffer.allocate(4 + ArrowMagic.MAGIC_LENGTH); + long footerLengthOffset = in.size() - buffer.remaining(); + in.setPosition(footerLengthOffset); + in.readFully(buffer); + buffer.flip(); + byte[] array = buffer.array(); + if (!ArrowMagic.validateMagic(Arrays.copyOfRange(array, 4, array.length))) { + throw new InvalidArrowFileException("missing Magic number " + Arrays.toString(buffer.array())); + } + int footerLength = MessageSerializer.bytesToInt(array); + if (footerLength <= 0 || footerLength + ArrowMagic.MAGIC_LENGTH * 2 + 4 > in.size()) { + throw new InvalidArrowFileException("invalid footer length: " + footerLength); + } + 
long footerOffset = footerLengthOffset - footerLength; + LOGGER.debug("Footer starts at {}, length: {}", footerOffset, footerLength); + ByteBuffer footerBuffer = ByteBuffer.allocate(footerLength); + in.setPosition(footerOffset); + in.readFully(footerBuffer); + footerBuffer.flip(); + Footer footerFB = Footer.getRootAsFooter(footerBuffer); + this.footer = new ArrowFooter(footerFB); + } + MetadataV4UnionChecker.checkRead(footer.getSchema(), footer.getMetadataVersion()); + return footer.getSchema(); + } + + @Override + public void initialize() throws IOException { + super.initialize(); + + // empty stream, has no dictionaries in IPC. + if (footer.getRecordBatches().size() == 0) { + return; + } + // Read and load all dictionaries from schema + for (int i = 0; i < dictionaries.size(); i++) { + ArrowDictionaryBatch dictionaryBatch = readDictionary(); + loadDictionary(dictionaryBatch); + } + } + + /** + * Get custom metadata. + */ + public Map getMetaData() { + if (footer != null) { + return footer.getMetaData(); + } + return new HashMap<>(); + } + + /** + * Read a dictionary batch from the source, will be invoked after the schema has been read and + * called N times, where N is the number of dictionaries indicated by the schema Fields. + * + * @return the read ArrowDictionaryBatch + * @throws IOException on error + */ + public ArrowDictionaryBatch readDictionary() throws IOException { + if (currentDictionaryBatch >= footer.getDictionaries().size()) { + throw new IOException("Requested more dictionaries than defined in footer: " + currentDictionaryBatch); + } + ArrowBlock block = footer.getDictionaries().get(currentDictionaryBatch++); + return readDictionaryBatch(in, block, allocator); + } + + /** Returns true if a batch was read, false if no more batches. 
*/ + @Override + public boolean loadNextBatch() throws IOException { + prepareLoadNextBatch(); + + if (currentRecordBatch < footer.getRecordBatches().size()) { + ArrowBlock block = footer.getRecordBatches().get(currentRecordBatch++); + ArrowRecordBatch batch = readRecordBatch(in, block, allocator); + loadRecordBatch(batch); + return true; + } else { + return false; + } + } + + + public List getDictionaryBlocks() throws IOException { + ensureInitialized(); + return footer.getDictionaries(); + } + + /** + * Returns the {@link ArrowBlock} metadata from the file. + */ + public List getRecordBlocks() throws IOException { + ensureInitialized(); + return footer.getRecordBatches(); + } + + /** + * Loads record batch for the given block. + */ + public boolean loadRecordBatch(ArrowBlock block) throws IOException { + ensureInitialized(); + int blockIndex = footer.getRecordBatches().indexOf(block); + if (blockIndex == -1) { + throw new IllegalArgumentException("Arrow block does not exist in record batches: " + block); + } + currentRecordBatch = blockIndex; + return loadNextBatch(); + } + + @VisibleForTesting + ArrowFooter getFooter() { + return footer; + } + + private ArrowDictionaryBatch readDictionaryBatch(SeekableReadChannel in, + ArrowBlock block, + BufferAllocator allocator) throws IOException { + LOGGER.debug("DictionaryRecordBatch at {}, metadata: {}, body: {}", + block.getOffset(), block.getMetadataLength(), block.getBodyLength()); + in.setPosition(block.getOffset()); + ArrowDictionaryBatch batch = MessageSerializer.deserializeDictionaryBatch(in, block, allocator); + if (batch == null) { + throw new IOException("Invalid file. 
No batch at offset: " + block.getOffset()); + } + return batch; + } + + private ArrowRecordBatch readRecordBatch(SeekableReadChannel in, + ArrowBlock block, + BufferAllocator allocator) throws IOException { + LOGGER.debug("RecordBatch at {}, metadata: {}, body: {}", + block.getOffset(), block.getMetadataLength(), + block.getBodyLength()); + in.setPosition(block.getOffset()); + ArrowRecordBatch batch = MessageSerializer.deserializeRecordBatch(in, block, allocator); + if (batch == null) { + throw new IOException("Invalid file. No batch at offset: " + block.getOffset()); + } + return batch; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileWriter.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileWriter.java new file mode 100644 index 000000000..55cd26285 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileWriter.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc; + +import java.io.IOException; +import java.nio.channels.WritableByteChannel; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.arrow.util.VisibleForTesting; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.message.ArrowDictionaryBatch; +import org.apache.arrow.vector.ipc.message.ArrowFooter; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.ipc.message.IpcOption; +import org.apache.arrow.vector.ipc.message.MessageSerializer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * {@link ArrowWriter} that writes out a Arrow files (https://arrow.apache.org/docs/format/IPC.html#file-format). + */ +public class ArrowFileWriter extends ArrowWriter { + + private static final Logger LOGGER = LoggerFactory.getLogger(ArrowFileWriter.class); + + // All ArrowBlocks written are saved in these lists to be passed to ArrowFooter in endInternal. 
+ private final List dictionaryBlocks = new ArrayList<>(); + private final List recordBlocks = new ArrayList<>(); + + private Map metaData; + + public ArrowFileWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out) { + super(root, provider, out); + } + + public ArrowFileWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out, + Map metaData) { + super(root, provider, out); + this.metaData = metaData; + } + + public ArrowFileWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out, + IpcOption option) { + super(root, provider, out, option); + } + + public ArrowFileWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out, + Map metaData, IpcOption option) { + super(root, provider, out, option); + this.metaData = metaData; + } + + @Override + protected void startInternal(WriteChannel out) throws IOException { + ArrowMagic.writeMagic(out, true); + } + + @Override + protected ArrowBlock writeDictionaryBatch(ArrowDictionaryBatch batch) throws IOException { + ArrowBlock block = super.writeDictionaryBatch(batch); + dictionaryBlocks.add(block); + return block; + } + + @Override + protected ArrowBlock writeRecordBatch(ArrowRecordBatch batch) throws IOException { + ArrowBlock block = super.writeRecordBatch(batch); + recordBlocks.add(block); + return block; + } + + @Override + protected void endInternal(WriteChannel out) throws IOException { + if (!option.write_legacy_ipc_format) { + out.writeIntLittleEndian(MessageSerializer.IPC_CONTINUATION_TOKEN); + } + out.writeIntLittleEndian(0); + + long footerStart = out.getCurrentPosition(); + out.write(new ArrowFooter(schema, dictionaryBlocks, recordBlocks, metaData, option.metadataVersion), false); + int footerLength = (int) (out.getCurrentPosition() - footerStart); + if (footerLength <= 0) { + throw new InvalidArrowFileException("invalid footer"); + } + out.writeIntLittleEndian(footerLength); + LOGGER.debug("Footer starts at 
{}, length: {}", footerStart, footerLength); + ArrowMagic.writeMagic(out, false); + LOGGER.debug("magic written, now at {}", out.getCurrentPosition()); + } + + @VisibleForTesting + public List getRecordBlocks() { + return recordBlocks; + } + + @VisibleForTesting + public List getDictionaryBlocks() { + return dictionaryBlocks; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java new file mode 100644 index 000000000..9c399669a --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; + +/** + * Magic header/footer helpers for {@link ArrowFileWriter} and {@link ArrowFileReader} formatted files. 
+ */ +class ArrowMagic { + private ArrowMagic(){} + + private static final byte[] MAGIC = "ARROW1".getBytes(StandardCharsets.UTF_8); + + public static final int MAGIC_LENGTH = MAGIC.length; + + public static void writeMagic(WriteChannel out, boolean align) throws IOException { + out.write(MAGIC); + if (align) { + out.align(); + } + } + + public static boolean validateMagic(byte[] array) { + return Arrays.equals(MAGIC, array); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java new file mode 100644 index 000000000..9d940deec --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java @@ -0,0 +1,255 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.VectorLoader; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.compression.CompressionCodec; +import org.apache.arrow.vector.compression.NoCompressionCodec; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.ipc.message.ArrowDictionaryBatch; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.DictionaryUtility; +import org.apache.arrow.vector.util.VectorBatchAppender; + +
/**
 * Abstract class to read Schema and ArrowRecordBatches.
 *
 * Subclasses provide the schema ({@link #readSchema()}), the batches ({@link #loadNextBatch()})
 * and the clean-up of the underlying source ({@link #closeReadSource()}). This class owns the
 * {@link VectorSchemaRoot}, the {@link VectorLoader} and the dictionary map, all of which are
 * created lazily by {@link #ensureInitialized()}.
 */
public abstract class ArrowReader implements DictionaryProvider, AutoCloseable {

  protected final BufferAllocator allocator;
  private VectorLoader loader;
  private VectorSchemaRoot root;
  // Dictionary id -> dictionary; assigned in initialize() as an unmodifiable map.
  protected Map<Long, Dictionary> dictionaries;
  private boolean initialized = false;

  private final CompressionCodec.Factory compressionFactory;

  /**
   * Creates a reader that uses {@link NoCompressionCodec.Factory#INSTANCE}.
   *
   * @param allocator allocator used for the vectors created by this reader
   */
  protected ArrowReader(BufferAllocator allocator) {
    this(allocator, NoCompressionCodec.Factory.INSTANCE);
  }

  /**
   * Creates a reader with an explicit compression codec factory.
   *
   * @param allocator allocator used for the vectors created by this reader
   * @param compressionFactory factory handed to every {@link VectorLoader} this reader creates
   */
  protected ArrowReader(BufferAllocator allocator, CompressionCodec.Factory compressionFactory) {
    this.allocator = allocator;
    this.compressionFactory = compressionFactory;
  }

  /**
   * Returns the vector schema root. This will be loaded with new values on every call to loadNextBatch.
   *
   * @return the vector schema root
   * @throws IOException if reading of schema fails
   */
  public VectorSchemaRoot getVectorSchemaRoot() throws IOException {
    ensureInitialized();
    return root;
  }

  /**
   * Returns any dictionaries that were loaded along with ArrowRecordBatches.
   *
   * @return map from dictionary id to dictionary, empty if the schema declares no dictionaries
   * @throws IOException if reading of schema fails
   */
  public Map<Long, Dictionary> getDictionaryVectors() throws IOException {
    ensureInitialized();
    return dictionaries;
  }

  /**
   * Lookup a dictionary that has been loaded using the dictionary id.
   *
   * @param id Unique identifier for a dictionary
   * @return the requested dictionary or null if not found
   * @throws IllegalStateException if the reader has not been initialized yet
   */
  @Override
  public Dictionary lookup(long id) {
    if (!initialized) {
      throw new IllegalStateException("Unable to lookup until reader has been initialized");
    }

    return dictionaries.get(id);
  }

  /**
   * Load the next ArrowRecordBatch to the vector schema root if available.
   *
   * @return true if a batch was read, false on EOS
   * @throws IOException on error
   */
  public abstract boolean loadNextBatch() throws IOException;

  /**
   * Return the number of bytes read from the ReadChannel.
   *
   * @return number of bytes read
   */
  public abstract long bytesRead();

  /**
   * Close resources, including vector schema root and dictionary vectors, and the
   * underlying read source.
   *
   * @throws IOException on error
   */
  @Override
  public void close() throws IOException {
    close(true);
  }

  /**
   * Close resources, including vector schema root and dictionary vectors. If the flag
   * closeReadSource is true then close the underlying read source, otherwise leave it open.
   *
   * @param closeReadSource Flag to control if closing the underlying read source
   * @throws IOException on error
   */
  public void close(boolean closeReadSource) throws IOException {
    try {
      // root and dictionaries only exist once initialize() has run.
      if (initialized) {
        root.close();
        for (Dictionary dictionary : dictionaries.values()) {
          dictionary.getVector().close();
        }
      }
    } finally {
      // Release the read source even when closing a vector failed.
      if (closeReadSource) {
        closeReadSource();
      }
    }
  }

  /**
   * Close the underlying read source.
   *
   * @throws IOException on error
   */
  protected abstract void closeReadSource() throws IOException;

  /**
   * Read the Schema from the source, will be invoked at the beginning the initialization.
   *
   * @return the read Schema
   * @throws IOException on error
   */
  protected abstract Schema readSchema() throws IOException;

  /**
   * Initialize if not done previously.
   *
   * @throws IOException on error
   */
  protected void ensureInitialized() throws IOException {
    if (!initialized) {
      initialize();
      initialized = true;
    }
  }

  /**
   * Reads the schema and initializes the vectors.
   *
   * @throws IOException if reading the schema fails
   */
  protected void initialize() throws IOException {
    Schema originalSchema = readSchema();
    int fieldCount = originalSchema.getFields().size();
    List<Field> fields = new ArrayList<>(fieldCount);
    List<FieldVector> vectors = new ArrayList<>(fieldCount);
    Map<Long, Dictionary> dictionaryById = new HashMap<>();

    // Convert fields with dictionaries to have the index type
    for (Field field : originalSchema.getFields()) {
      Field updated = DictionaryUtility.toMemoryFormat(field, allocator, dictionaryById);
      fields.add(updated);
      vectors.add(updated.createVector(allocator));
    }
    Schema schema = new Schema(fields, originalSchema.getCustomMetadata());

    this.root = new VectorSchemaRoot(schema, vectors, 0);
    this.loader = new VectorLoader(root, compressionFactory);
    this.dictionaries = Collections.unmodifiableMap(dictionaryById);
  }

  /**
   * Ensure the reader has been initialized and reset the VectorSchemaRoot row count to 0.
   *
   * @throws IOException on error
   */
  protected void prepareLoadNextBatch() throws IOException {
    ensureInitialized();
    root.setRowCount(0);
  }

  /**
   * Load an ArrowRecordBatch to the readers VectorSchemaRoot. The batch is closed afterwards,
   * whether or not loading succeeded.
   *
   * @param batch the record batch to load
   */
  protected void loadRecordBatch(ArrowRecordBatch batch) {
    try {
      loader.load(batch);
    } finally {
      batch.close();
    }
  }

  /**
   * Load an ArrowDictionaryBatch to the readers dictionary vectors. A delta batch is loaded into
   * a temporary vector and appended to the existing dictionary vector; any other batch is loaded
   * directly into the dictionary vector.
   *
   * @param dictionaryBatch dictionary batch to load
   * @throws IllegalArgumentException if the batch's dictionary id is not defined in the schema
   */
  protected void loadDictionary(ArrowDictionaryBatch dictionaryBatch) {
    long id = dictionaryBatch.getDictionaryId();
    Dictionary dictionary = dictionaries.get(id);
    if (dictionary == null) {
      // The batch is never handed to load(), which would close it, so release it here.
      dictionaryBatch.close();
      throw new IllegalArgumentException("Dictionary ID " + id + " not defined in schema");
    }
    FieldVector vector = dictionary.getVector();
    // if is deltaVector, concat it with non-delta vector with the same ID.
    if (dictionaryBatch.isDelta()) {
      try (FieldVector deltaVector = vector.getField().createVector(allocator)) {
        load(dictionaryBatch, deltaVector);
        VectorBatchAppender.batchAppend(vector, deltaVector);
      }
      return;
    }

    load(dictionaryBatch, vector);
  }

  /**
   * Loads the record batch carried by a dictionary batch into the given vector and closes the
   * dictionary batch afterwards.
   */
  private void load(ArrowDictionaryBatch dictionaryBatch, FieldVector vector) {
    VectorSchemaRoot dictionaryRoot = new VectorSchemaRoot(
        Collections.singletonList(vector.getField()),
        Collections.singletonList(vector), 0);
    // Use the reader's codec factory, exactly as the record batch loader does, so that
    // dictionary batches are read with the same compression settings.
    VectorLoader dictionaryLoader = new VectorLoader(dictionaryRoot, compressionFactory);
    try {
      dictionaryLoader.load(dictionaryBatch.getDictionary());
    } finally {
      dictionaryBatch.close();
    }
  }
}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamReader.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamReader.java new file mode 100644 index 000000000..a0096aaf3 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamReader.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.channels.Channels; +import java.nio.channels.ReadableByteChannel; + +import org.apache.arrow.flatbuf.MessageHeader; +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.compression.CompressionCodec; +import org.apache.arrow.vector.compression.NoCompressionCodec; +import org.apache.arrow.vector.ipc.message.ArrowDictionaryBatch; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.ipc.message.MessageChannelReader; +import org.apache.arrow.vector.ipc.message.MessageResult; +import org.apache.arrow.vector.ipc.message.MessageSerializer; +import org.apache.arrow.vector.types.MetadataVersion; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.validate.MetadataV4UnionChecker; + +/** + * This class reads from an input stream and produces ArrowRecordBatches. 
+ */ +public class ArrowStreamReader extends ArrowReader { + + private MessageChannelReader messageReader; + + private int loadedDictionaryCount; + + /** + * Constructs a streaming reader using a MessageChannelReader. Non-blocking. + * + * @param messageReader reader used to get messages from a ReadChannel + * @param allocator to allocate new buffers + * @param compressionFactory the factory to create compression codec. + */ + public ArrowStreamReader( + MessageChannelReader messageReader, BufferAllocator allocator, CompressionCodec.Factory compressionFactory) { + super(allocator, compressionFactory); + this.messageReader = messageReader; + } + + /** + * Constructs a streaming reader using a MessageChannelReader. Non-blocking. + * + * @param messageReader reader used to get messages from a ReadChannel + * @param allocator to allocate new buffers + */ + public ArrowStreamReader(MessageChannelReader messageReader, BufferAllocator allocator) { + this(messageReader, allocator, NoCompressionCodec.Factory.INSTANCE); + } + + /** + * Constructs a streaming reader from a ReadableByteChannel input. Non-blocking. + * + * @param in ReadableByteChannel to read messages from + * @param allocator to allocate new buffers + * @param compressionFactory the factory to create compression codec. + */ + public ArrowStreamReader( + ReadableByteChannel in, BufferAllocator allocator, CompressionCodec.Factory compressionFactory) { + this(new MessageChannelReader(new ReadChannel(in), allocator), allocator, compressionFactory); + } + + /** + * Constructs a streaming reader from a ReadableByteChannel input. Non-blocking. + * + * @param in ReadableByteChannel to read messages from + * @param allocator to allocate new buffers + */ + public ArrowStreamReader(ReadableByteChannel in, BufferAllocator allocator) { + this(new MessageChannelReader(new ReadChannel(in), allocator), allocator); + } + + /** + * Constructs a streaming reader from a ReadableByteChannel input. Non-blocking. 
+ * + * @param in InputStream to read messages from + * @param allocator to allocate new buffers + * @param compressionFactory the factory to create compression codec. + */ + public ArrowStreamReader( + InputStream in, BufferAllocator allocator, CompressionCodec.Factory compressionFactory) { + this(Channels.newChannel(in), allocator, compressionFactory); + } + + /** + * Constructs a streaming reader from an InputStream. Non-blocking. + * + * @param in InputStream to read messages from + * @param allocator to allocate new buffers + */ + public ArrowStreamReader(InputStream in, BufferAllocator allocator) { + this(Channels.newChannel(in), allocator); + } + + /** + * Get the number of bytes read from the stream since constructing the reader. + * + * @return number of bytes + */ + @Override + public long bytesRead() { + return messageReader.bytesRead(); + } + + /** + * Closes the underlying read source. + * + * @throws IOException on error + */ + @Override + protected void closeReadSource() throws IOException { + messageReader.close(); + } + + /** + * Load the next ArrowRecordBatch to the vector schema root if available. 
+ * + * @return true if a batch was read, false on EOS + * @throws IOException on error + */ + public boolean loadNextBatch() throws IOException { + prepareLoadNextBatch(); + MessageResult result = messageReader.readNext(); + + // Reached EOS + if (result == null) { + return false; + } + + if (result.getMessage().headerType() == MessageHeader.RecordBatch) { + ArrowBuf bodyBuffer = result.getBodyBuffer(); + + // For zero-length batches, need an empty buffer to deserialize the batch + if (bodyBuffer == null) { + bodyBuffer = allocator.getEmpty(); + } + + ArrowRecordBatch batch = MessageSerializer.deserializeRecordBatch(result.getMessage(), bodyBuffer); + loadRecordBatch(batch); + checkDictionaries(); + return true; + } else if (result.getMessage().headerType() == MessageHeader.DictionaryBatch) { + // if it's dictionary message, read dictionary message out and continue to read unless get a batch or eos. + ArrowDictionaryBatch dictionaryBatch = readDictionary(result); + loadDictionary(dictionaryBatch); + loadedDictionaryCount++; + return loadNextBatch(); + } else { + throw new IOException("Expected RecordBatch or DictionaryBatch but header was " + + result.getMessage().headerType()); + } + } + + /** + * When read a record batch, check whether its dictionaries are available. + */ + private void checkDictionaries() throws IOException { + // if all dictionaries are loaded, return. + if (loadedDictionaryCount == dictionaries.size()) { + return; + } + for (FieldVector vector : getVectorSchemaRoot().getFieldVectors()) { + DictionaryEncoding encoding = vector.getField().getDictionary(); + if (encoding != null) { + // if the dictionaries it needs is not available and the vector is not all null, something was wrong. 
+ if (!dictionaries.containsKey(encoding.getId()) && vector.getNullCount() < vector.getValueCount()) { + throw new IOException("The dictionary was not available, id was:" + encoding.getId()); + } + } + } + } + + /** + * Reads the schema message from the beginning of the stream. + * + * @return the deserialized arrow schema + */ + @Override + protected Schema readSchema() throws IOException { + MessageResult result = messageReader.readNext(); + + if (result == null) { + throw new IOException("Unexpected end of input. Missing schema."); + } + + if (result.getMessage().headerType() != MessageHeader.Schema) { + throw new IOException("Expected schema but header was " + result.getMessage().headerType()); + } + + final Schema schema = MessageSerializer.deserializeSchema(result.getMessage()); + MetadataV4UnionChecker.checkRead(schema, MetadataVersion.fromFlatbufID(result.getMessage().version())); + return schema; + } + + + private ArrowDictionaryBatch readDictionary(MessageResult result) throws IOException { + + ArrowBuf bodyBuffer = result.getBodyBuffer(); + + // For zero-length batches, need an empty buffer to deserialize the batch + if (bodyBuffer == null) { + bodyBuffer = allocator.getEmpty(); + } + + return MessageSerializer.deserializeDictionaryBatch(result.getMessage(), bodyBuffer); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamWriter.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamWriter.java new file mode 100644 index 000000000..deb98580f --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamWriter.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc; + +import java.io.IOException; +import java.io.OutputStream; +import java.nio.channels.Channels; +import java.nio.channels.WritableByteChannel; + +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.ipc.message.IpcOption; +import org.apache.arrow.vector.ipc.message.MessageSerializer; + +/** + * Writer for the Arrow stream format to send ArrowRecordBatches over a WriteChannel. + */ +public class ArrowStreamWriter extends ArrowWriter { + + /** + * Construct an ArrowStreamWriter with an optional DictionaryProvider for the OutputStream. + * + * @param root Existing VectorSchemaRoot with vectors to be written. + * @param provider DictionaryProvider for any vectors that are dictionary encoded. + * (Optional, can be null) + * @param out OutputStream for writing. + */ + public ArrowStreamWriter(VectorSchemaRoot root, DictionaryProvider provider, OutputStream out) { + this(root, provider, Channels.newChannel(out)); + } + + /** + * Construct an ArrowStreamWriter with an optional DictionaryProvider for the WritableByteChannel. 
+ */ + public ArrowStreamWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out) { + this(root, provider, out, IpcOption.DEFAULT); + } + + /** + * Construct an ArrowStreamWriter with an optional DictionaryProvider for the WritableByteChannel. + * + * @param root Existing VectorSchemaRoot with vectors to be written. + * @param provider DictionaryProvider for any vectors that are dictionary encoded. + * (Optional, can be null) + * @param option IPC write options + * @param out WritableByteChannel for writing. + */ + public ArrowStreamWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out, + IpcOption option) { + super(root, provider, out, option); + } + + /** + * Write an EOS identifier to the WriteChannel. + * + * @param out Open WriteChannel with an active Arrow stream. + * @param option IPC write option + * @throws IOException on error + */ + public static void writeEndOfStream(WriteChannel out, IpcOption option) throws IOException { + if (!option.write_legacy_ipc_format) { + out.writeIntLittleEndian(MessageSerializer.IPC_CONTINUATION_TOKEN); + } + out.writeIntLittleEndian(0); + } + + @Override + protected void endInternal(WriteChannel out) throws IOException { + writeEndOfStream(out, option); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java new file mode 100644 index 000000000..7bc9a306f --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc; + +import java.io.IOException; +import java.nio.channels.WritableByteChannel; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.arrow.util.AutoCloseables; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.VectorUnloader; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.message.ArrowDictionaryBatch; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.ipc.message.IpcOption; +import org.apache.arrow.vector.ipc.message.MessageSerializer; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.DictionaryUtility; +import org.apache.arrow.vector.validate.MetadataV4UnionChecker; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Abstract base class for implementing Arrow writers for IPC over a WriteChannel. 
+ */ +public abstract class ArrowWriter implements AutoCloseable { + + protected static final Logger LOGGER = LoggerFactory.getLogger(ArrowWriter.class); + + // schema with fields in message format, not memory format + protected final Schema schema; + protected final WriteChannel out; + + private final VectorUnloader unloader; + private final List dictionaries; + + private boolean started = false; + private boolean ended = false; + + private boolean dictWritten = false; + + protected IpcOption option; + + protected ArrowWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out) { + this (root, provider, out, IpcOption.DEFAULT); + } + + /** + * Note: fields are not closed when the writer is closed. + * + * @param root the vectors to write to the output + * @param provider where to find the dictionaries + * @param out the output where to write + * @param option IPC write options + */ + protected ArrowWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out, IpcOption option) { + this.unloader = new VectorUnloader(root); + this.out = new WriteChannel(out); + this.option = option; + + List fields = new ArrayList<>(root.getSchema().getFields().size()); + Set dictionaryIdsUsed = new HashSet<>(); + + MetadataV4UnionChecker.checkForUnion(root.getSchema().getFields().iterator(), option.metadataVersion); + // Convert fields with dictionaries to have dictionary type + for (Field field : root.getSchema().getFields()) { + fields.add(DictionaryUtility.toMessageFormat(field, provider, dictionaryIdsUsed)); + } + + // Create a record batch for each dictionary + this.dictionaries = new ArrayList<>(dictionaryIdsUsed.size()); + for (long id : dictionaryIdsUsed) { + Dictionary dictionary = provider.lookup(id); + FieldVector vector = dictionary.getVector(); + int count = vector.getValueCount(); + VectorSchemaRoot dictRoot = new VectorSchemaRoot( + Collections.singletonList(vector.getField()), + Collections.singletonList(vector), + 
count); + VectorUnloader unloader = new VectorUnloader(dictRoot); + ArrowRecordBatch batch = unloader.getRecordBatch(); + this.dictionaries.add(new ArrowDictionaryBatch(id, batch)); + } + + this.schema = new Schema(fields, root.getSchema().getCustomMetadata()); + } + + public void start() throws IOException { + ensureStarted(); + } + + /** + * Writes the record batch currently loaded in this instance's VectorSchemaRoot. + */ + public void writeBatch() throws IOException { + ensureStarted(); + ensureDictionariesWritten(); + try (ArrowRecordBatch batch = unloader.getRecordBatch()) { + writeRecordBatch(batch); + } + } + + protected ArrowBlock writeDictionaryBatch(ArrowDictionaryBatch batch) throws IOException { + ArrowBlock block = MessageSerializer.serialize(out, batch, option); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("DictionaryRecordBatch at {}, metadata: {}, body: {}", + block.getOffset(), block.getMetadataLength(), block.getBodyLength()); + } + return block; + } + + protected ArrowBlock writeRecordBatch(ArrowRecordBatch batch) throws IOException { + ArrowBlock block = MessageSerializer.serialize(out, batch, option); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("RecordBatch at {}, metadata: {}, body: {}", + block.getOffset(), block.getMetadataLength(), block.getBodyLength()); + } + return block; + } + + public void end() throws IOException { + ensureStarted(); + ensureEnded(); + } + + public long bytesWritten() { + return out.getCurrentPosition(); + } + + private void ensureStarted() throws IOException { + if (!started) { + started = true; + startInternal(out); + // write the schema - for file formats this is duplicated in the footer, but matches + // the streaming format + MessageSerializer.serialize(out, schema, option); + } + } + + /** + * Write dictionaries after schema and before recordBatches, dictionaries won't be + * written if empty stream (only has schema data in IPC). 
+ */ + private void ensureDictionariesWritten() throws IOException { + if (!dictWritten) { + dictWritten = true; + // write out any dictionaries + try { + for (ArrowDictionaryBatch batch : dictionaries) { + writeDictionaryBatch(batch); + } + } finally { + try { + AutoCloseables.close(dictionaries); + } catch (Exception e) { + throw new RuntimeException("Error occurred while closing dictionaries.", e); + } + } + } + } + + private void ensureEnded() throws IOException { + if (!ended) { + ended = true; + endInternal(out); + } + } + + protected void startInternal(WriteChannel out) throws IOException { + } + + protected void endInternal(WriteChannel out) throws IOException { + } + + @Override + public void close() { + try { + end(); + out.close(); + if (!dictWritten) { + AutoCloseables.close(dictionaries); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/InvalidArrowFileException.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/InvalidArrowFileException.java new file mode 100644 index 000000000..e234058e6 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/InvalidArrowFileException.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc; + +/** + * Exception indicating a problem with an Arrow File (https://arrow.apache.org/docs/format/IPC.html#file-format). + */ +public class InvalidArrowFileException extends RuntimeException { + private static final long serialVersionUID = 1L; + + public InvalidArrowFileException(String message) { + super(message); + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java new file mode 100644 index 000000000..d093e840a --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java @@ -0,0 +1,806 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc; + +import static com.fasterxml.jackson.core.JsonToken.END_ARRAY; +import static com.fasterxml.jackson.core.JsonToken.END_OBJECT; +import static com.fasterxml.jackson.core.JsonToken.START_ARRAY; +import static com.fasterxml.jackson.core.JsonToken.START_OBJECT; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.arrow.vector.BufferLayout.BufferType.DATA; +import static org.apache.arrow.vector.BufferLayout.BufferType.OFFSET; +import static org.apache.arrow.vector.BufferLayout.BufferType.TYPE; +import static org.apache.arrow.vector.BufferLayout.BufferType.VALIDITY; + +import java.io.File; +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.BufferLayout.BufferType; +import org.apache.arrow.vector.Decimal256Vector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalMonthDayNanoVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.TypeLayout; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import 
org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.DecimalUtility; +import org.apache.arrow.vector.util.DictionaryUtility; +import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.binary.Hex; + +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.databind.MapperFeature; +import com.fasterxml.jackson.databind.MappingJsonFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +/** + * A reader for JSON files that translates them into vectors. This reader is used for integration tests. + * + *

This class uses a streaming parser API, method naming tends to reflect this implementation + * detail. + */ +public class JsonFileReader implements AutoCloseable, DictionaryProvider { + private final JsonParser parser; + private final BufferAllocator allocator; + private Schema schema; + private Map dictionaries; + private Boolean started = false; + + /** + * Constructs a new instance. + * @param inputFile The file to read. + * @param allocator The allocator to use for allocating buffers. + */ + public JsonFileReader(File inputFile, BufferAllocator allocator) throws JsonParseException, IOException { + super(); + this.allocator = allocator; + MappingJsonFactory jsonFactory = new MappingJsonFactory(new ObjectMapper() + //ignore case for enums + .configure(MapperFeature.ACCEPT_CASE_INSENSITIVE_ENUMS, true) + ); + this.parser = jsonFactory.createParser(inputFile); + // Allow reading NaN for floating point values + this.parser.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, true); + } + + @Override + public Dictionary lookup(long id) { + if (!started) { + throw new IllegalStateException("Unable to lookup until after read() has started"); + } + + return dictionaries.get(id); + } + + /** Reads the beginning (schema section) of the json file and returns it. 
*/ + public Schema start() throws JsonParseException, IOException { + readToken(START_OBJECT); + { + Schema originalSchema = readNextField("schema", Schema.class); + List fields = new ArrayList<>(); + dictionaries = new HashMap<>(); + + // Convert fields with dictionaries to have the index type + for (Field field : originalSchema.getFields()) { + fields.add(DictionaryUtility.toMemoryFormat(field, allocator, dictionaries)); + } + this.schema = new Schema(fields, originalSchema.getCustomMetadata()); + + if (!dictionaries.isEmpty()) { + nextFieldIs("dictionaries"); + readDictionaryBatches(); + } + + nextFieldIs("batches"); + readToken(START_ARRAY); + started = true; + return this.schema; + } + } + + private void readDictionaryBatches() throws JsonParseException, IOException { + readToken(START_ARRAY); + JsonToken token = parser.nextToken(); + boolean haveDictionaryBatch = token == START_OBJECT; + while (haveDictionaryBatch) { + + // Lookup what dictionary for the batch about to be read + long id = readNextField("id", Long.class); + Dictionary dict = dictionaries.get(id); + if (dict == null) { + throw new IllegalArgumentException("Dictionary with id: " + id + " missing encoding from schema Field"); + } + + // Read the dictionary record batch + nextFieldIs("data"); + FieldVector vector = dict.getVector(); + List fields = Collections.singletonList(vector.getField()); + List vectors = Collections.singletonList(vector); + VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors, vector.getValueCount()); + read(root); + + readToken(END_OBJECT); + token = parser.nextToken(); + haveDictionaryBatch = token == START_OBJECT; + } + + if (token != END_ARRAY) { + throw new IllegalArgumentException("Invalid token: " + token + " expected end of array at " + + parser.getTokenLocation()); + } + } + + /** + * Reads the next record batch from the file into root. 
+ */ + public boolean read(VectorSchemaRoot root) throws IOException { + JsonToken t = parser.nextToken(); + if (t == START_OBJECT) { + { + int count = readNextField("count", Integer.class); + nextFieldIs("columns"); + readToken(START_ARRAY); + { + for (Field field : root.getSchema().getFields()) { + FieldVector vector = root.getVector(field); + readFromJsonIntoVector(field, vector); + } + } + readToken(END_ARRAY); + root.setRowCount(count); + } + readToken(END_OBJECT); + return true; + } else if (t == END_ARRAY) { + root.setRowCount(0); + return false; + } else { + throw new IllegalArgumentException("Invalid token: " + t); + } + } + + /** + * Returns the next record batch from the file. + */ + public VectorSchemaRoot read() throws IOException { + JsonToken t = parser.nextToken(); + if (t == START_OBJECT) { + VectorSchemaRoot recordBatch = VectorSchemaRoot.create(schema, allocator); + { + int count = readNextField("count", Integer.class); + recordBatch.setRowCount(count); + nextFieldIs("columns"); + readToken(START_ARRAY); + { + for (Field field : schema.getFields()) { + FieldVector vector = recordBatch.getVector(field); + readFromJsonIntoVector(field, vector); + } + } + readToken(END_ARRAY); + } + readToken(END_OBJECT); + return recordBatch; + } else if (t == END_ARRAY) { + return null; + } else { + throw new IllegalArgumentException("Invalid token: " + t); + } + } + + private abstract class BufferReader { + protected abstract ArrowBuf read(BufferAllocator allocator, int count) throws IOException; + + ArrowBuf readBuffer(BufferAllocator allocator, int count) throws IOException { + readToken(START_ARRAY); + ArrowBuf buf = read(allocator, count); + readToken(END_ARRAY); + return buf; + } + } + + private class BufferHelper { + BufferReader BIT = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final int bufferSize = BitVectorHelper.getValidityBufferSize(count); + ArrowBuf buf = 
allocator.buffer(bufferSize); + + // C++ integration test fails without this. + buf.setZero(0, bufferSize); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + BitVectorHelper.setValidityBit(buf, i, parser.readValueAs(Boolean.class) ? 1 : 0); + } + + buf.writerIndex(bufferSize); + return buf; + } + }; + + BufferReader DAY_MILLIS = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final long size = (long) count * IntervalDayVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + readToken(START_OBJECT); + buf.writeInt(readNextField("days", Integer.class)); + buf.writeInt(readNextField("milliseconds", Integer.class)); + readToken(END_OBJECT); + } + + return buf; + } + }; + + BufferReader MONTH_DAY_NANOS = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final long size = (long) count * IntervalMonthDayNanoVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + readToken(START_OBJECT); + buf.writeInt(readNextField("months", Integer.class)); + buf.writeInt(readNextField("days", Integer.class)); + buf.writeLong(readNextField("nanoseconds", Long.class)); + readToken(END_OBJECT); + } + + return buf; + } + }; + + + BufferReader INT1 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final long size = (long) count * TinyIntVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + buf.writeByte(parser.getByteValue()); + } + + return buf; + } + }; + + BufferReader INT2 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final long size = (long) count * SmallIntVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i 
< count; i++) { + parser.nextToken(); + buf.writeShort(parser.getShortValue()); + } + + return buf; + } + }; + + BufferReader INT4 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final long size = (long) count * IntVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + buf.writeInt(parser.getIntValue()); + } + + return buf; + } + }; + + BufferReader INT8 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final long size = (long) count * BigIntVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + String value = parser.getValueAsString(); + buf.writeLong(Long.valueOf(value)); + } + + return buf; + } + }; + + BufferReader UINT1 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final long size = (long) count * TinyIntVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + buf.writeByte(parser.getShortValue() & 0xFF); + } + + return buf; + } + }; + + BufferReader UINT2 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final long size = (long) count * SmallIntVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + buf.writeShort(parser.getIntValue() & 0xFFFF); + } + + return buf; + } + }; + + BufferReader UINT4 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final long size = (long) count * IntVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + buf.writeInt((int) 
parser.getLongValue()); + } + + return buf; + } + }; + + BufferReader UINT8 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final long size = (long) count * BigIntVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + BigInteger value = new BigInteger(parser.getValueAsString()); + buf.writeLong(value.longValue()); + } + + return buf; + } + }; + + BufferReader FLOAT4 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final long size = (long) count * Float4Vector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + buf.writeFloat(parser.getFloatValue()); + } + + return buf; + } + }; + + BufferReader FLOAT8 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final long size = (long) count * Float8Vector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + buf.writeDouble(parser.getDoubleValue()); + } + + return buf; + } + }; + + BufferReader DECIMAL = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final long size = (long) count * DecimalVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + BigDecimal decimalValue = new BigDecimal(parser.readValueAs(String.class)); + DecimalUtility.writeBigDecimalToArrowBuf(decimalValue, buf, i, DecimalVector.TYPE_WIDTH); + } + + buf.writerIndex(size); + return buf; + } + }; + + BufferReader DECIMAL256 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final long size = (long) count * Decimal256Vector.TYPE_WIDTH; + ArrowBuf buf = 
allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + BigDecimal decimalValue = new BigDecimal(parser.readValueAs(String.class)); + DecimalUtility.writeBigDecimalToArrowBuf(decimalValue, buf, i, Decimal256Vector.TYPE_WIDTH); + } + + buf.writerIndex(size); + return buf; + } + }; + + ArrowBuf readBinaryValues( + BufferAllocator allocator, int count) throws IOException { + ArrayList values = new ArrayList<>(count); + long bufferSize = 0L; + for (int i = 0; i < count; i++) { + parser.nextToken(); + final byte[] value = decodeHexSafe(parser.readValueAs(String.class)); + values.add(value); + bufferSize += value.length; + } + + ArrowBuf buf = allocator.buffer(bufferSize); + + for (byte[] value : values) { + buf.writeBytes(value); + } + + return buf; + } + + ArrowBuf readStringValues( + BufferAllocator allocator, int count) throws IOException { + ArrayList values = new ArrayList<>(count); + long bufferSize = 0L; + for (int i = 0; i < count; i++) { + parser.nextToken(); + final byte[] value = parser.getValueAsString().getBytes(UTF_8); + values.add(value); + bufferSize += value.length; + } + + ArrowBuf buf = allocator.buffer(bufferSize); + + for (byte[] value : values) { + buf.writeBytes(value); + } + + return buf; + } + + BufferReader FIXEDSIZEBINARY = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + return readBinaryValues(allocator, count); + } + }; + + BufferReader VARCHAR = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + return readStringValues(allocator, count); + } + }; + + BufferReader LARGEVARCHAR = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + return readStringValues(allocator, count); + } + }; + + BufferReader VARBINARY = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) 
throws IOException { + return readBinaryValues(allocator, count); + } + }; + + BufferReader LARGEVARBINARY = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + return readBinaryValues(allocator, count); + } + }; + } + + private ArrowBuf readIntoBuffer(BufferAllocator allocator, BufferType bufferType, + Types.MinorType type, int count) throws IOException { + ArrowBuf buf; + + BufferHelper helper = new BufferHelper(); + + BufferReader reader = null; + + if (bufferType.equals(VALIDITY)) { + reader = helper.BIT; + } else if (bufferType.equals(OFFSET)) { + if (type == Types.MinorType.LARGELIST || + type == Types.MinorType.LARGEVARCHAR || + type == Types.MinorType.LARGEVARBINARY) { + reader = helper.INT8; + } else { + reader = helper.INT4; + } + } else if (bufferType.equals(TYPE)) { + reader = helper.INT1; + } else if (bufferType.equals(DATA)) { + switch (type) { + case BIT: + reader = helper.BIT; + break; + case TINYINT: + reader = helper.INT1; + break; + case SMALLINT: + reader = helper.INT2; + break; + case INT: + reader = helper.INT4; + break; + case BIGINT: + reader = helper.INT8; + break; + case UINT1: + reader = helper.UINT1; + break; + case UINT2: + reader = helper.UINT2; + break; + case UINT4: + reader = helper.UINT4; + break; + case UINT8: + reader = helper.UINT8; + break; + case FLOAT4: + reader = helper.FLOAT4; + break; + case FLOAT8: + reader = helper.FLOAT8; + break; + case DECIMAL: + reader = helper.DECIMAL; + break; + case DECIMAL256: + reader = helper.DECIMAL256; + break; + case FIXEDSIZEBINARY: + reader = helper.FIXEDSIZEBINARY; + break; + case VARCHAR: + reader = helper.VARCHAR; + break; + case LARGEVARCHAR: + reader = helper.LARGEVARCHAR; + break; + case VARBINARY: + reader = helper.VARBINARY; + break; + case LARGEVARBINARY: + reader = helper.LARGEVARBINARY; + break; + case DATEDAY: + reader = helper.INT4; + break; + case DATEMILLI: + reader = helper.INT8; + break; + case TIMESEC: + 
case TIMEMILLI: + reader = helper.INT4; + break; + case TIMEMICRO: + case TIMENANO: + reader = helper.INT8; + break; + case TIMESTAMPNANO: + case TIMESTAMPMICRO: + case TIMESTAMPMILLI: + case TIMESTAMPSEC: + case TIMESTAMPNANOTZ: + case TIMESTAMPMICROTZ: + case TIMESTAMPMILLITZ: + case TIMESTAMPSECTZ: + reader = helper.INT8; + break; + case INTERVALYEAR: + reader = helper.INT4; + break; + case INTERVALDAY: + reader = helper.DAY_MILLIS; + break; + case INTERVALMONTHDAYNANO: + reader = helper.MONTH_DAY_NANOS; + break; + case DURATION: + reader = helper.INT8; + break; + default: + throw new UnsupportedOperationException("Cannot read array of type " + type); + } + } else { + throw new InvalidArrowFileException("Unrecognized buffer type " + bufferType); + } + + buf = reader.readBuffer(allocator, count); + + Preconditions.checkNotNull(buf); + return buf; + } + + private void readFromJsonIntoVector(Field field, FieldVector vector) throws JsonParseException, IOException { + TypeLayout typeLayout = TypeLayout.getTypeLayout(field.getType()); + List vectorTypes = typeLayout.getBufferTypes(); + ArrowBuf[] vectorBuffers = new ArrowBuf[vectorTypes.size()]; + /* + * The order of inner buffers is : + * Fixed width vector: + * -- validity buffer + * -- data buffer + * Variable width vector: + * -- validity buffer + * -- offset buffer + * -- data buffer + * + * This is similar to what getFieldInnerVectors() used to give but now that we don't have + * inner vectors anymore, we will work directly at the buffer level -- populate buffers + * locally as we read from Json parser and do loadFieldBuffers on the vector followed by + * releasing the local buffers. 
+ */ + readToken(START_OBJECT); + { + // If currently reading dictionaries, field name is not important so don't check + String name = readNextField("name", String.class); + if (started && !Objects.equals(field.getName(), name)) { + throw new IllegalArgumentException("Expected field " + field.getName() + " but got " + name); + } + + /* Initialize the vector with required capacity but don't allocateNew since we would + * be doing loadFieldBuffers. + */ + int valueCount = readNextField("count", Integer.class); + vector.setInitialCapacity(valueCount); + + for (int v = 0; v < vectorTypes.size(); v++) { + BufferType bufferType = vectorTypes.get(v); + nextFieldIs(bufferType.getName()); + int innerBufferValueCount = valueCount; + if (bufferType.equals(OFFSET) && !field.getType().getTypeID().equals(ArrowType.ArrowTypeID.Union)) { + /* offset buffer has 1 additional value capacity */ + innerBufferValueCount = valueCount + 1; + } + + vectorBuffers[v] = readIntoBuffer(allocator, bufferType, vector.getMinorType(), innerBufferValueCount); + } + + if (vectorBuffers.length == 0) { + readToken(END_OBJECT); + return; + } + + int nullCount = 0; + if (!(vector.getField().getFieldType().getType() instanceof ArrowType.Union)) { + nullCount = BitVectorHelper.getNullCount(vectorBuffers[0], valueCount); + } + final ArrowFieldNode fieldNode = new ArrowFieldNode(valueCount, nullCount); + vector.loadFieldBuffers(fieldNode, Arrays.asList(vectorBuffers)); + + /* read child vectors (if any) */ + List fields = field.getChildren(); + if (!fields.isEmpty()) { + List vectorChildren = vector.getChildrenFromFields(); + if (fields.size() != vectorChildren.size()) { + throw new IllegalArgumentException( + "fields and children are not the same size: " + fields.size() + " != " + vectorChildren.size()); + } + nextFieldIs("children"); + readToken(START_ARRAY); + for (int i = 0; i < fields.size(); i++) { + Field childField = fields.get(i); + FieldVector childVector = vectorChildren.get(i); + 
readFromJsonIntoVector(childField, childVector); + } + readToken(END_ARRAY); + } + } + readToken(END_OBJECT); + + for (ArrowBuf buffer: vectorBuffers) { + buffer.getReferenceManager().release(); + } + } + + private byte[] decodeHexSafe(String hexString) throws IOException { + try { + return Hex.decodeHex(hexString.toCharArray()); + } catch (DecoderException e) { + throw new IOException("Unable to decode hex string: " + hexString, e); + } + } + + @Override + public void close() throws IOException { + parser.close(); + for (Dictionary dictionary : dictionaries.values()) { + dictionary.getVector().close(); + } + } + + private T readNextField(String expectedFieldName, Class c) throws IOException, JsonParseException { + nextFieldIs(expectedFieldName); + parser.nextToken(); + return parser.readValueAs(c); + } + + private void nextFieldIs(String expectedFieldName) throws IOException, JsonParseException { + String name = parser.nextFieldName(); + if (name == null || !name.equals(expectedFieldName)) { + throw new IllegalStateException("Expected " + expectedFieldName + " but got " + name); + } + } + + private void readToken(JsonToken expected) throws JsonParseException, IOException { + JsonToken t = parser.nextToken(); + if (t != expected) { + throw new IllegalStateException("Expected " + expected + " but got " + t); + } + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java new file mode 100644 index 000000000..58760c1a9 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java @@ -0,0 +1,417 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc; + +import static org.apache.arrow.vector.BufferLayout.BufferType.*; + +import java.io.File; +import java.io.IOException; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.BufferLayout.BufferType; +import org.apache.arrow.vector.DateDayVector; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.Decimal256Vector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.DurationVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalMonthDayNanoVector; +import org.apache.arrow.vector.IntervalYearVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TimeMicroVector; +import org.apache.arrow.vector.TimeMilliVector; 
+import org.apache.arrow.vector.TimeNanoVector; +import org.apache.arrow.vector.TimeSecVector; +import org.apache.arrow.vector.TimeStampMicroTZVector; +import org.apache.arrow.vector.TimeStampMicroVector; +import org.apache.arrow.vector.TimeStampMilliTZVector; +import org.apache.arrow.vector.TimeStampMilliVector; +import org.apache.arrow.vector.TimeStampNanoTZVector; +import org.apache.arrow.vector.TimeStampNanoVector; +import org.apache.arrow.vector.TimeStampSecTZVector; +import org.apache.arrow.vector.TimeStampSecVector; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.TypeLayout; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.DecimalUtility; +import org.apache.arrow.vector.util.DictionaryUtility; +import org.apache.commons.codec.binary.Hex; + +import com.fasterxml.jackson.core.JsonEncoding; +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.core.util.DefaultPrettyPrinter; +import com.fasterxml.jackson.core.util.DefaultPrettyPrinter.NopIndenter; +import com.fasterxml.jackson.databind.MappingJsonFactory; + +/** + * A writer that converts binary Vectors into a JSON format suitable + * for integration testing. + */ +public class JsonFileWriter implements AutoCloseable { + + /** + * Configuration POJO for writing JSON files. 
+ */ + public static final class JSONWriteConfig { + private final boolean pretty; + + private JSONWriteConfig(boolean pretty) { + this.pretty = pretty; + } + + private JSONWriteConfig() { + this.pretty = false; + } + + public JSONWriteConfig pretty(boolean pretty) { + return new JSONWriteConfig(pretty); + } + } + + public static JSONWriteConfig config() { + return new JSONWriteConfig(); + } + + private final JsonGenerator generator; + private Schema schema; + + /** + * Constructs a new writer that will output to outputFile. + */ + public JsonFileWriter(File outputFile) throws IOException { + this(outputFile, config()); + } + + /** + * Constructs a new writer that will output to outputFile with the given options. + */ + public JsonFileWriter(File outputFile, JSONWriteConfig config) throws IOException { + MappingJsonFactory jsonFactory = new MappingJsonFactory(); + this.generator = jsonFactory.createGenerator(outputFile, JsonEncoding.UTF8); + if (config.pretty) { + DefaultPrettyPrinter prettyPrinter = new DefaultPrettyPrinter(); + prettyPrinter.indentArraysWith(NopIndenter.instance); + this.generator.setPrettyPrinter(prettyPrinter); + } + // Allow writing of floating point NaN values not as strings + this.generator.configure(JsonGenerator.Feature.QUOTE_NON_NUMERIC_NUMBERS, false); + } + + /** + * Writes out the "header" of the file including the schema and any dictionaries required. 
+ */ + public void start(Schema schema, DictionaryProvider provider) throws IOException { + List fields = new ArrayList<>(schema.getFields().size()); + Set dictionaryIdsUsed = new HashSet<>(); + this.schema = schema; // Store original Schema to ensure batches written match + + // Convert fields with dictionaries to have dictionary type + for (Field field : schema.getFields()) { + fields.add(DictionaryUtility.toMessageFormat(field, provider, dictionaryIdsUsed)); + } + Schema updatedSchema = new Schema(fields, schema.getCustomMetadata()); + + generator.writeStartObject(); + generator.writeObjectField("schema", updatedSchema); + + // Write all dictionaries that were used + if (!dictionaryIdsUsed.isEmpty()) { + writeDictionaryBatches(generator, dictionaryIdsUsed, provider); + } + + // Start writing of record batches + generator.writeArrayFieldStart("batches"); + } + + private void writeDictionaryBatches(JsonGenerator generator, Set dictionaryIdsUsed, DictionaryProvider provider) + throws IOException { + generator.writeArrayFieldStart("dictionaries"); + for (Long id : dictionaryIdsUsed) { + generator.writeStartObject(); + generator.writeObjectField("id", id); + + generator.writeFieldName("data"); + Dictionary dictionary = provider.lookup(id); + FieldVector vector = dictionary.getVector(); + List fields = Collections.singletonList(vector.getField()); + List vectors = Collections.singletonList(vector); + VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors, vector.getValueCount()); + writeBatch(root); + + generator.writeEndObject(); + } + generator.writeEndArray(); + } + + /** Writes the record batch to the JSON file. 
*/ + public void write(VectorSchemaRoot recordBatch) throws IOException { + if (!recordBatch.getSchema().equals(schema)) { + throw new IllegalArgumentException("record batches must have the same schema: " + schema); + } + writeBatch(recordBatch); + } + + private void writeBatch(VectorSchemaRoot recordBatch) throws IOException { + generator.writeStartObject(); + { + generator.writeObjectField("count", recordBatch.getRowCount()); + generator.writeArrayFieldStart("columns"); + for (Field field : recordBatch.getSchema().getFields()) { + FieldVector vector = recordBatch.getVector(field); + writeFromVectorIntoJson(field, vector); + } + generator.writeEndArray(); + } + generator.writeEndObject(); + } + + private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOException { + List vectorTypes = TypeLayout.getTypeLayout(field.getType()).getBufferTypes(); + List vectorBuffers = vector.getFieldBuffers(); + if (vectorTypes.size() != vectorBuffers.size()) { + throw new IllegalArgumentException("vector types and inner vector buffers are not the same size: " + + vectorTypes.size() + " != " + vectorBuffers.size()); + } + generator.writeStartObject(); + { + generator.writeObjectField("name", field.getName()); + int valueCount = vector.getValueCount(); + generator.writeObjectField("count", valueCount); + + for (int v = 0; v < vectorTypes.size(); v++) { + BufferType bufferType = vectorTypes.get(v); + ArrowBuf vectorBuffer = vectorBuffers.get(v); + generator.writeArrayFieldStart(bufferType.getName()); + final int bufferValueCount = (bufferType.equals(OFFSET) && vector.getMinorType() != MinorType.DENSEUNION) ? 
+ valueCount + 1 : valueCount; + for (int i = 0; i < bufferValueCount; i++) { + if (bufferType.equals(DATA) && (vector.getMinorType() == MinorType.VARCHAR || + vector.getMinorType() == MinorType.VARBINARY)) { + writeValueToGenerator(bufferType, vectorBuffer, vectorBuffers.get(v - 1), vector, i); + } else if (bufferType.equals(OFFSET) && vector.getValueCount() == 0 && + (vector.getMinorType() == MinorType.VARBINARY || vector.getMinorType() == MinorType.VARCHAR)) { + ArrowBuf vectorBufferTmp = vector.getAllocator().buffer(4); + vectorBufferTmp.setInt(0, 0); + writeValueToGenerator(bufferType, vectorBufferTmp, null, vector, i); + vectorBufferTmp.close(); + } else { + writeValueToGenerator(bufferType, vectorBuffer, null, vector, i); + } + } + generator.writeEndArray(); + } + List fields = field.getChildren(); + List children = vector.getChildrenFromFields(); + if (fields.size() != children.size()) { + throw new IllegalArgumentException("fields and children are not the same size: " + fields.size() + " != " + + children.size()); + } + if (fields.size() > 0) { + generator.writeArrayFieldStart("children"); + for (int i = 0; i < fields.size(); i++) { + Field childField = fields.get(i); + FieldVector childVector = children.get(i); + writeFromVectorIntoJson(childField, childVector); + } + generator.writeEndArray(); + } + } + generator.writeEndObject(); + } + + private void writeValueToGenerator( + BufferType bufferType, + ArrowBuf buffer, + ArrowBuf offsetBuffer, + FieldVector vector, + final int index) throws IOException { + if (bufferType.equals(TYPE)) { + generator.writeNumber(buffer.getByte(index * TinyIntVector.TYPE_WIDTH)); + } else if (bufferType.equals(OFFSET)) { + generator.writeNumber(buffer.getInt(index * BaseVariableWidthVector.OFFSET_WIDTH)); + } else if (bufferType.equals(VALIDITY)) { + generator.writeNumber(vector.isNull(index) ? 
0 : 1); + } else if (bufferType.equals(DATA)) { + switch (vector.getMinorType()) { + case TINYINT: + generator.writeNumber(TinyIntVector.get(buffer, index)); + break; + case SMALLINT: + generator.writeNumber(SmallIntVector.get(buffer, index)); + break; + case INT: + generator.writeNumber(IntVector.get(buffer, index)); + break; + case BIGINT: + generator.writeString(String.valueOf(BigIntVector.get(buffer, index))); + break; + case UINT1: + generator.writeNumber(UInt1Vector.getNoOverflow(buffer, index)); + break; + case UINT2: + generator.writeNumber(UInt2Vector.get(buffer, index)); + break; + case UINT4: + generator.writeNumber(UInt4Vector.getNoOverflow(buffer, index)); + break; + case UINT8: + generator.writeString(UInt8Vector.getNoOverflow(buffer, index).toString()); + break; + case FLOAT4: + generator.writeNumber(Float4Vector.get(buffer, index)); + break; + case FLOAT8: + generator.writeNumber(Float8Vector.get(buffer, index)); + break; + case DATEDAY: + generator.writeNumber(DateDayVector.get(buffer, index)); + break; + case DATEMILLI: + generator.writeNumber(DateMilliVector.get(buffer, index)); + break; + case TIMESEC: + generator.writeNumber(TimeSecVector.get(buffer, index)); + break; + case TIMEMILLI: + generator.writeNumber(TimeMilliVector.get(buffer, index)); + break; + case TIMEMICRO: + generator.writeNumber(TimeMicroVector.get(buffer, index)); + break; + case TIMENANO: + generator.writeNumber(TimeNanoVector.get(buffer, index)); + break; + case TIMESTAMPSEC: + generator.writeNumber(TimeStampSecVector.get(buffer, index)); + break; + case TIMESTAMPMILLI: + generator.writeNumber(TimeStampMilliVector.get(buffer, index)); + break; + case TIMESTAMPMICRO: + generator.writeNumber(TimeStampMicroVector.get(buffer, index)); + break; + case TIMESTAMPNANO: + generator.writeNumber(TimeStampNanoVector.get(buffer, index)); + break; + case TIMESTAMPSECTZ: + generator.writeNumber(TimeStampSecTZVector.get(buffer, index)); + break; + case TIMESTAMPMILLITZ: + 
generator.writeNumber(TimeStampMilliTZVector.get(buffer, index)); + break; + case TIMESTAMPMICROTZ: + generator.writeNumber(TimeStampMicroTZVector.get(buffer, index)); + break; + case TIMESTAMPNANOTZ: + generator.writeNumber(TimeStampNanoTZVector.get(buffer, index)); + break; + case DURATION: + generator.writeNumber(DurationVector.get(buffer, index)); + break; + case INTERVALYEAR: + generator.writeNumber(IntervalYearVector.getTotalMonths(buffer, index)); + break; + case INTERVALDAY: + generator.writeStartObject(); + generator.writeObjectField("days", IntervalDayVector.getDays(buffer, index)); + generator.writeObjectField("milliseconds", IntervalDayVector.getMilliseconds(buffer, index)); + generator.writeEndObject(); + break; + case INTERVALMONTHDAYNANO: + generator.writeStartObject(); + generator.writeObjectField("months", IntervalMonthDayNanoVector.getMonths(buffer, index)); + generator.writeObjectField("days", IntervalMonthDayNanoVector.getDays(buffer, index)); + generator.writeObjectField("nanoseconds", IntervalMonthDayNanoVector.getNanoseconds(buffer, index)); + generator.writeEndObject(); + break; + case BIT: + generator.writeNumber(BitVectorHelper.get(buffer, index)); + break; + case VARBINARY: { + Preconditions.checkNotNull(offsetBuffer); + String hexString = Hex.encodeHexString(BaseVariableWidthVector.get(buffer, + offsetBuffer, index)); + generator.writeObject(hexString); + break; + } + case FIXEDSIZEBINARY: + int byteWidth = ((FixedSizeBinaryVector) vector).getByteWidth(); + String fixedSizeHexString = Hex.encodeHexString(FixedSizeBinaryVector.get(buffer, index, byteWidth)); + generator.writeObject(fixedSizeHexString); + break; + case VARCHAR: { + Preconditions.checkNotNull(offsetBuffer); + byte[] b = (BaseVariableWidthVector.get(buffer, offsetBuffer, index)); + generator.writeString(new String(b, "UTF-8")); + break; + } + case DECIMAL: { + int scale = ((DecimalVector) vector).getScale(); + BigDecimal decimalValue = 
DecimalUtility.getBigDecimalFromArrowBuf(buffer, index, scale, + DecimalVector.TYPE_WIDTH); + // We write the unscaled value, because the scale is stored in the type metadata. + generator.writeString(decimalValue.unscaledValue().toString()); + break; + } + case DECIMAL256: { + int scale = ((Decimal256Vector) vector).getScale(); + BigDecimal decimalValue = DecimalUtility.getBigDecimalFromArrowBuf(buffer, index, scale, + Decimal256Vector.TYPE_WIDTH); + // We write the unscaled value, because the scale is stored in the type metadata. + generator.writeString(decimalValue.unscaledValue().toString()); + break; + } + + default: + throw new UnsupportedOperationException("minor type: " + vector.getMinorType()); + } + } + } + + @Override + public void close() throws IOException { + generator.writeEndArray(); + generator.writeEndObject(); + generator.close(); + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ReadChannel.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ReadChannel.java new file mode 100644 index 000000000..db79661a8 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/ReadChannel.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.ReadableByteChannel; + +import org.apache.arrow.memory.ArrowBuf; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Adapter around {@link ReadableByteChannel} that reads into {@linkplain ArrowBuf}s. + */ +public class ReadChannel implements AutoCloseable { + + private static final Logger LOGGER = LoggerFactory.getLogger(ReadChannel.class); + + private ReadableByteChannel in; + private long bytesRead = 0; + + public ReadChannel(ReadableByteChannel in) { + this.in = in; + } + + public long bytesRead() { + return bytesRead; + } + + /** + * Reads bytes into buffer until it is full (buffer.remaining() == 0). Returns the + * number of bytes read which can be less than full if there are no more. + * + * @param buffer The buffer to read to + * @return the number of byte read + * @throws IOException if nit enough bytes left to read + */ + public int readFully(ByteBuffer buffer) throws IOException { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("Reading buffer with size: {}", buffer.remaining()); + } + int totalRead = 0; + while (buffer.remaining() != 0) { + int read = in.read(buffer); + if (read == -1) { + this.bytesRead += totalRead; + return totalRead; + } + totalRead += read; + if (read == 0) { + break; + } + } + this.bytesRead += totalRead; + return totalRead; + } + + /** + * Reads up to len into buffer. Returns bytes read. 
+ * + * @param buffer the buffer to read to + * @param length the amount of bytes to read + * @return the number of bytes read + * @throws IOException if nit enough bytes left to read + */ + public long readFully(ArrowBuf buffer, long length) throws IOException { + boolean fullRead = true; + long bytesLeft = length; + while (fullRead && bytesLeft > 0) { + int bytesToRead = (int) Math.min(bytesLeft, Integer.MAX_VALUE); + int n = readFully(buffer.nioBuffer(buffer.writerIndex(), bytesToRead)); + buffer.writerIndex(buffer.writerIndex() + n); + fullRead = n == bytesToRead; + bytesLeft -= n; + } + return length - bytesLeft; + } + + @Override + public void close() throws IOException { + if (this.in != null) { + in.close(); + in = null; + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/SeekableReadChannel.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/SeekableReadChannel.java new file mode 100644 index 000000000..4b6e0ed76 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/SeekableReadChannel.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc; + +import java.io.IOException; +import java.nio.channels.SeekableByteChannel; + +/** + * An {@link ReadChannel} that supports seeking to a + * random position. + */ +public class SeekableReadChannel extends ReadChannel { + + private final SeekableByteChannel in; + + public SeekableReadChannel(SeekableByteChannel in) { + super(in); + this.in = in; + } + + public void setPosition(long position) throws IOException { + in.position(position); + } + + public long size() throws IOException { + return in.size(); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java new file mode 100644 index 000000000..9ad71f6fe --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.WritableByteChannel; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.ipc.message.FBSerializable; +import org.apache.arrow.vector.ipc.message.MessageSerializer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.flatbuffers.FlatBufferBuilder; + +/** + * Wrapper around a WritableByteChannel that maintains the position as well adding + * some common serialization utilities. + * + *

All write methods in this class follow full write semantics, i.e., write calls + * only return after requested data has been fully written. Note this is different + * from java WritableByteChannel interface where partial write is allowed + *

+ *

+ * Please note that objects of this class are not thread-safe. + *

+ */ +public class WriteChannel implements AutoCloseable { + private static final Logger LOGGER = LoggerFactory.getLogger(WriteChannel.class); + + private static final byte[] ZERO_BYTES = new byte[8]; + + private final byte[] intBuf = new byte[4]; + + private long currentPosition = 0; + + private final WritableByteChannel out; + + public WriteChannel(WritableByteChannel out) { + this.out = out; + } + + @Override + public void close() throws IOException { + out.close(); + } + + public long getCurrentPosition() { + return currentPosition; + } + + public long write(byte[] buffer) throws IOException { + return write(ByteBuffer.wrap(buffer)); + } + + long write(byte[] buffer, int offset, int length) throws IOException { + return write(ByteBuffer.wrap(buffer, offset, length)); + } + + /** + * Writes zeroCount zeros the underlying channel. + */ + public long writeZeros(long zeroCount) throws IOException { + long bytesWritten = 0; + long wholeWordsEnd = zeroCount - 8; + while (bytesWritten <= wholeWordsEnd) { + bytesWritten += write(ZERO_BYTES); + } + + if (bytesWritten < zeroCount) { + bytesWritten += write(ZERO_BYTES, 0, (int) (zeroCount - bytesWritten)); + } + return bytesWritten; + } + + /** + * Writes enough bytes to align the channel to an 8-byte boundary. + */ + public long align() throws IOException { + int trailingByteSize = (int) (currentPosition % 8); + if (trailingByteSize != 0) { // align on 8 byte boundaries + return writeZeros(8 - trailingByteSize); + } + return 0; + } + + /** + * Writes all data from buffer to the underlying channel. + */ + public long write(ByteBuffer buffer) throws IOException { + long length = buffer.remaining(); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("Writing buffer with size: {}", length); + } + while (buffer.hasRemaining()) { + out.write(buffer); + } + currentPosition += length; + return length; + } + + /** + * Writes v in little-endian format to the underlying channel. 
+ */ + public long writeIntLittleEndian(int v) throws IOException { + MessageSerializer.intToBytes(v, intBuf); + return write(intBuf); + } + + /** + * Writes the buffer to the underlying channel. + */ + public void write(ArrowBuf buffer) throws IOException { + long bytesWritten = 0; + while (bytesWritten < buffer.readableBytes()) { + int bytesToWrite = (int) Math.min(Integer.MAX_VALUE, buffer.readableBytes() - bytesWritten); + ByteBuffer nioBuffer = buffer.nioBuffer(buffer.readerIndex() + bytesWritten, + bytesToWrite); + write(nioBuffer); + bytesWritten += bytesToWrite; + } + + } + + /** + * Writes the serialized flatbuffer to the underlying channel. If withSizePrefix + * is true then the length in bytes of the buffer will first be written in little endian format. + */ + public long write(FBSerializable writer, boolean withSizePrefix) throws IOException { + ByteBuffer buffer = serialize(writer); + if (withSizePrefix) { + writeIntLittleEndian(buffer.remaining()); + } + return write(buffer); + } + + /** + * Serializes writer to a ByteBuffer. + */ + public static ByteBuffer serialize(FBSerializable writer) { + FlatBufferBuilder builder = new FlatBufferBuilder(); + int root = writer.writeTo(builder); + builder.finish(root); + return builder.dataBuffer(); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBlock.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBlock.java new file mode 100644 index 000000000..a235102ce --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBlock.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc.message; + +import org.apache.arrow.flatbuf.Block; + +import com.google.flatbuffers.FlatBufferBuilder; + +/** Metadata for an arrow message in a channel. */ +public class ArrowBlock implements FBSerializable { + + private final long offset; + private final int metadataLength; + private final long bodyLength; + + /** + * Constructs a new instance. + * + * @param offset The offset into the channel file where the block was written. + * @param metadataLength The length of the flatbuffer metadata in the block. + * @param bodyLength The length of data in the block. 
+ */ + public ArrowBlock(long offset, int metadataLength, long bodyLength) { + super(); + this.offset = offset; + this.metadataLength = metadataLength; + this.bodyLength = bodyLength; + } + + public long getOffset() { + return offset; + } + + public int getMetadataLength() { + return metadataLength; + } + + public long getBodyLength() { + return bodyLength; + } + + @Override + public int writeTo(FlatBufferBuilder builder) { + return Block.createBlock(builder, offset, metadataLength, bodyLength); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + (int) (bodyLength ^ (bodyLength >>> 32)); + result = prime * result + metadataLength; + result = prime * result + (int) (offset ^ (offset >>> 32)); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + ArrowBlock other = (ArrowBlock) obj; + if (bodyLength != other.bodyLength) { + return false; + } + if (metadataLength != other.metadataLength) { + return false; + } + if (offset != other.offset) { + return false; + } + return true; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBodyCompression.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBodyCompression.java new file mode 100644 index 000000000..5370ddfa0 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBodyCompression.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc.message; + +import org.apache.arrow.flatbuf.BodyCompression; + +import com.google.flatbuffers.FlatBufferBuilder; + +/** + * Compression information about data written to a channel. + */ +public class ArrowBodyCompression implements FBSerializable { + + private final byte codec; + + private final byte method; + + public ArrowBodyCompression(byte codec, byte method) { + this.codec = codec; + this.method = method; + } + + @Override + public int writeTo(FlatBufferBuilder builder) { + return BodyCompression.createBodyCompression(builder, codec, method); + } + + public byte getCodec() { + return codec; + } + + public byte getMethod() { + return method; + } + + @Override + public String toString() { + return "ArrowBodyCompression [codec=" + codec + ", method=" + method + "]"; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBuffer.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBuffer.java new file mode 100644 index 000000000..d3aec6fb7 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBuffer.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc.message; + +import org.apache.arrow.flatbuf.Buffer; + +import com.google.flatbuffers.FlatBufferBuilder; + +/** Metadata for a buffer written to a channel. */ +public class ArrowBuffer implements FBSerializable { + + private long offset; + private long size; + + /** + * Constructs a new instance. + * + * @param offset The offset to the start of the buffer in the channel. + * @param size The size of the buffer. 
+ */ + public ArrowBuffer(long offset, long size) { + super(); + this.offset = offset; + this.size = size; + } + + public long getOffset() { + return offset; + } + + public long getSize() { + return size; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + (int) (offset ^ (offset >>> 32)); + result = prime * result + (int) (size ^ (size >>> 32)); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + ArrowBuffer other = (ArrowBuffer) obj; + if (offset != other.offset) { + return false; + } + if (size != other.size) { + return false; + } + return true; + } + + @Override + public int writeTo(FlatBufferBuilder builder) { + return Buffer.createBuffer(builder, offset, size); + } + + @Override + public String toString() { + return "ArrowBuffer [offset=" + offset + ", size=" + size + "]"; + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowDictionaryBatch.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowDictionaryBatch.java new file mode 100644 index 000000000..cac2a1cb8 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowDictionaryBatch.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc.message; + +import org.apache.arrow.flatbuf.DictionaryBatch; +import org.apache.arrow.flatbuf.MessageHeader; + +import com.google.flatbuffers.FlatBufferBuilder; + +/** + * POJO wrapper around a Dictionary Batch IPC messages + * (https://arrow.apache.org/docs/format/IPC.html#dictionary-batches) + */ +public class ArrowDictionaryBatch implements ArrowMessage { + + private final long dictionaryId; + private final ArrowRecordBatch dictionary; + private final boolean isDelta; + + @Deprecated + public ArrowDictionaryBatch(long dictionaryId, ArrowRecordBatch dictionary) { + this (dictionaryId, dictionary, false); + } + + /** + * Constructs new instance. 
+ */ + public ArrowDictionaryBatch(long dictionaryId, ArrowRecordBatch dictionary, boolean isDelta) { + this.dictionaryId = dictionaryId; + this.dictionary = dictionary; + this.isDelta = isDelta; + } + + public boolean isDelta() { + return isDelta; + } + + public byte getMessageType() { + return MessageHeader.DictionaryBatch; + } + + public long getDictionaryId() { + return dictionaryId; + } + + public ArrowRecordBatch getDictionary() { + return dictionary; + } + + @Override + public int writeTo(FlatBufferBuilder builder) { + int dataOffset = dictionary.writeTo(builder); + DictionaryBatch.startDictionaryBatch(builder); + DictionaryBatch.addId(builder, dictionaryId); + DictionaryBatch.addData(builder, dataOffset); + DictionaryBatch.addIsDelta(builder, isDelta); + return DictionaryBatch.endDictionaryBatch(builder); + } + + @Override + public long computeBodyLength() { + return dictionary.computeBodyLength(); + } + + @Override + public T accepts(ArrowMessageVisitor visitor) { + return visitor.visit(this); + } + + @Override + public String toString() { + return "ArrowDictionaryBatch [dictionaryId=" + dictionaryId + ", dictionary=" + dictionary + "]"; + } + + @Override + public void close() { + dictionary.close(); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowFieldNode.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowFieldNode.java new file mode 100644 index 000000000..9ce5e2e4d --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowFieldNode.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc.message; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; + +import org.apache.arrow.flatbuf.FieldNode; + +import com.google.flatbuffers.FlatBufferBuilder; + +/** + * Metadata about Vectors/Arrays that is written to a channel. + */ +public class ArrowFieldNode implements FBSerializable { + + private final int length; + private final int nullCount; + + /** + * Constructs a new instance. + * + * @param length The number of values written. + * @param nullCount The number of null values. + */ + public ArrowFieldNode(long length, long nullCount) { + super(); + this.length = checkedCastToInt(length); + this.nullCount = checkedCastToInt(nullCount); + } + + @Override + public int writeTo(FlatBufferBuilder builder) { + return FieldNode.createFieldNode(builder, length, nullCount); + } + + public int getNullCount() { + return nullCount; + } + + public int getLength() { + return length; + } + + @Override + public String toString() { + return "ArrowFieldNode [length=" + length + ", nullCount=" + nullCount + "]"; + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowFooter.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowFooter.java new file mode 100644 index 000000000..567fabc1d --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowFooter.java @@ -0,0 +1,226 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc.message; + +import static org.apache.arrow.vector.ipc.message.FBSerializables.writeAllStructsToVector; +import static org.apache.arrow.vector.ipc.message.FBSerializables.writeKeyValues; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.arrow.flatbuf.Block; +import org.apache.arrow.flatbuf.Footer; +import org.apache.arrow.flatbuf.KeyValue; +import org.apache.arrow.vector.types.MetadataVersion; +import org.apache.arrow.vector.types.pojo.Schema; + +import com.google.flatbuffers.FlatBufferBuilder; + +/** Footer metadata for the arrow file format. */ +public class ArrowFooter implements FBSerializable { + + private final Schema schema; + + private final List dictionaries; + + private final List recordBatches; + + private final Map metaData; + + private final MetadataVersion metadataVersion; + + public ArrowFooter(Schema schema, List dictionaries, List recordBatches) { + this(schema, dictionaries, recordBatches, null); + } + + /** + * Constructs a new instance. + * + * @param schema The schema for record batches in the file. + * @param dictionaries The dictionaries relevant to the file. + * @param recordBatches The recordBatches written to the file. 
+ * @param metaData user-defined k-v meta data. + */ + public ArrowFooter( + Schema schema, + List dictionaries, + List recordBatches, + Map metaData) { + this(schema, dictionaries, recordBatches, metaData, MetadataVersion.DEFAULT); + } + + /** + * Constructs a new instance. + * + * @param schema The schema for record batches in the file. + * @param dictionaries The dictionaries relevant to the file. + * @param recordBatches The recordBatches written to the file. + * @param metaData user-defined k-v meta data. + * @param metadataVersion The Arrow metadata version. + */ + public ArrowFooter( + Schema schema, + List dictionaries, + List recordBatches, + Map metaData, + MetadataVersion metadataVersion) { + this.schema = schema; + this.dictionaries = dictionaries; + this.recordBatches = recordBatches; + this.metaData = metaData; + this.metadataVersion = metadataVersion; + } + + /** + * Constructs from the corresponding Flatbuffer message. + */ + public ArrowFooter(Footer footer) { + this( + Schema.convertSchema(footer.schema()), + dictionaries(footer), + recordBatches(footer), + metaData(footer), + MetadataVersion.fromFlatbufID(footer.version()) + ); + } + + private static List recordBatches(Footer footer) { + List recordBatches = new ArrayList<>(); + Block tempBlock = new Block(); + int recordBatchesLength = footer.recordBatchesLength(); + for (int i = 0; i < recordBatchesLength; i++) { + Block block = footer.recordBatches(tempBlock, i); + recordBatches.add(new ArrowBlock(block.offset(), block.metaDataLength(), block.bodyLength())); + } + return recordBatches; + } + + private static List dictionaries(Footer footer) { + List dictionaries = new ArrayList<>(); + Block tempBlock = new Block(); + + int dictionariesLength = footer.dictionariesLength(); + for (int i = 0; i < dictionariesLength; i++) { + Block block = footer.dictionaries(tempBlock, i); + dictionaries.add(new ArrowBlock(block.offset(), block.metaDataLength(), block.bodyLength())); + } + return dictionaries; + 
} + + private static Map metaData(Footer footer) { + Map metaData = new HashMap<>(); + + int metaDataLength = footer.customMetadataLength(); + for (int i = 0; i < metaDataLength; i++) { + KeyValue kv = footer.customMetadata(i); + metaData.put(kv.key(), kv.value()); + } + + return metaData; + } + + public Schema getSchema() { + return schema; + } + + public List getDictionaries() { + return dictionaries; + } + + public List getRecordBatches() { + return recordBatches; + } + + public Map getMetaData() { + return metaData; + } + + public MetadataVersion getMetadataVersion() { + return metadataVersion; + } + + @Override + public int writeTo(FlatBufferBuilder builder) { + int schemaIndex = schema.getSchema(builder); + Footer.startDictionariesVector(builder, dictionaries.size()); + int dicsOffset = writeAllStructsToVector(builder, dictionaries); + Footer.startRecordBatchesVector(builder, recordBatches.size()); + int rbsOffset = writeAllStructsToVector(builder, recordBatches); + + int metaDataOffset = 0; + if (metaData != null) { + metaDataOffset = writeKeyValues(builder, metaData); + } + + Footer.startFooter(builder); + Footer.addSchema(builder, schemaIndex); + Footer.addDictionaries(builder, dicsOffset); + Footer.addRecordBatches(builder, rbsOffset); + Footer.addCustomMetadata(builder, metaDataOffset); + Footer.addVersion(builder, metadataVersion.toFlatbufID()); + return Footer.endFooter(builder); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((dictionaries == null) ? 0 : dictionaries.hashCode()); + result = prime * result + ((recordBatches == null) ? 0 : recordBatches.hashCode()); + result = prime * result + ((schema == null) ? 
0 : schema.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + ArrowFooter other = (ArrowFooter) obj; + if (dictionaries == null) { + if (other.dictionaries != null) { + return false; + } + } else if (!dictionaries.equals(other.dictionaries)) { + return false; + } + if (recordBatches == null) { + if (other.recordBatches != null) { + return false; + } + } else if (!recordBatches.equals(other.recordBatches)) { + return false; + } + if (schema == null) { + if (other.schema != null) { + return false; + } + } else if (!schema.equals(other.schema)) { + return false; + } + return true; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowMessage.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowMessage.java new file mode 100644 index 000000000..4cbc87b4e --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowMessage.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc.message; + +/** + * Interface for Arrow IPC messages (https://arrow.apache.org/docs/format/IPC.html). + */ +public interface ArrowMessage extends FBSerializable, AutoCloseable { + + long computeBodyLength(); + + T accepts(ArrowMessageVisitor visitor); + + /** Returns the flatbuffer enum value indicating the type of the message. */ + byte getMessageType(); + + /** + * Visitor interface for implementations of {@link ArrowMessage}. + * + * @param The type of value to return after visiting. + */ + interface ArrowMessageVisitor { + T visit(ArrowDictionaryBatch message); + + T visit(ArrowRecordBatch message); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatch.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatch.java new file mode 100644 index 000000000..dbf2774fb --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatch.java @@ -0,0 +1,259 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc.message; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.arrow.flatbuf.RecordBatch; +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.compression.NoCompressionCodec; +import org.apache.arrow.vector.util.DataSizeRoundingUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.flatbuffers.FlatBufferBuilder; + +/** + * POJO representation of a RecordBatch IPC message (https://arrow.apache.org/docs/format/IPC.html). + */ +public class ArrowRecordBatch implements ArrowMessage { + + private static final Logger LOGGER = LoggerFactory.getLogger(ArrowRecordBatch.class); + + /** + * Number of records. + */ + private final int length; + + /** + * Nodes correspond to the pre-ordered flattened logical schema. + */ + private final List nodes; + + /** + * Body buffers; the public constructors retain each one and {@link #close()} releases them. + */ + private final List buffers; + + private final ArrowBodyCompression bodyCompression; + + /** + * Offset and size of each entry of {@code buffers} within the serialized body, in the same order. + */ + private final List buffersLayout; + + /** + * Set by {@link #close()}; once true, {@link #getBuffers()} throws. + */ + private boolean closed = false; + + /** + * Construct a record batch with {@link NoCompressionCodec#DEFAULT_BODY_COMPRESSION} and 8-byte aligned buffer offsets. + */ + public ArrowRecordBatch( + int length, List nodes, List buffers) { + this(length, nodes, buffers, NoCompressionCodec.DEFAULT_BODY_COMPRESSION, true); + } + + /** + * Construct a record batch with the given body compression and 8-byte aligned buffer offsets. + */ + public ArrowRecordBatch( + int length, List nodes, List buffers, + ArrowBodyCompression bodyCompression) { + this(length, nodes, buffers, bodyCompression, true); + } + + /** + * Construct a record batch from nodes. + * + * @param length how many rows in this batch + * @param nodes field level info + * @param buffers will be retained until this recordBatch is closed + * @param bodyCompression compression info; must not be null + * @param alignBuffers whether each buffer's offset in the layout is rounded up to a multiple of 8 bytes
+ */ + public ArrowRecordBatch( + int length, List nodes, List buffers, + ArrowBodyCompression bodyCompression, boolean alignBuffers) { + super(); + this.length = length; + this.nodes = nodes; + this.buffers = buffers; + Preconditions.checkArgument(bodyCompression != null, "body compression cannot be null"); + this.bodyCompression = bodyCompression; + List arrowBuffers = new ArrayList<>(buffers.size()); + long offset = 0; + for (ArrowBuf arrowBuf : buffers) { + // this batch takes its own reference on each buffer; close() releases it + arrowBuf.getReferenceManager().retain(); + long size = arrowBuf.readableBytes(); + arrowBuffers.add(new ArrowBuffer(offset, size)); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("Buffer in RecordBatch at {}, length: {}", offset, size); + } + offset += size; + if (alignBuffers) { // align on 8 byte boundaries + offset = DataSizeRoundingUtil.roundUpTo8Multiple(offset); + } + } + this.buffersLayout = Collections.unmodifiableList(arrowBuffers); + } + + // clone constructor + // this constructor is different from the public ones in that the reference manager's + // retain method is not called, so the first dummy parameter is used + // to distinguish this from the public constructor.
+ private ArrowRecordBatch( + boolean dummy, int length, List nodes, + List buffers, ArrowBodyCompression bodyCompression) { + this.length = length; + this.nodes = nodes; + this.buffers = buffers; + Preconditions.checkArgument(bodyCompression != null, "body compression cannot be null"); + this.bodyCompression = bodyCompression; + this.closed = false; + List arrowBuffers = new ArrayList<>(buffers.size()); + long offset = 0; + // NOTE(review): unlike the public constructor, offsets here are never rounded up to a multiple of 8 + // -- confirm that this is intended for batches produced by cloneWithTransfer. + for (ArrowBuf arrowBuf : buffers) { + long size = arrowBuf.readableBytes(); + arrowBuffers.add(new ArrowBuffer(offset, size)); + offset += size; + } + this.buffersLayout = Collections.unmodifiableList(arrowBuffers); + } + + /** + * Returns the flatbuffer {@code MessageHeader} value for record batches. + */ + @Override + public byte getMessageType() { + return org.apache.arrow.flatbuf.MessageHeader.RecordBatch; + } + + /** + * Returns the number of records in this batch. + */ + public int getLength() { + return length; + } + + /** + * Returns the body compression info given at construction; never null. + */ + public ArrowBodyCompression getBodyCompression() { + return bodyCompression; + } + + /** + * Get the nodes in this record batch. + * + * @return the FieldNodes corresponding to the schema + */ + public List getNodes() { + return nodes; + } + + /** + * Get the record batch buffers. + * + * @return the buffers containing the data + * @throws IllegalStateException if this batch has already been closed + */ + public List getBuffers() { + if (closed) { + throw new IllegalStateException("already closed"); + } + return buffers; + } + + /** + * Create a new ArrowRecordBatch which has the same information as this batch but whose buffers + * are owned by that Allocator. + * + *

This will also close this record batch and make it no longer useful. + * + * @param allocator the allocator that takes ownership of the buffers + * @return A cloned ArrowRecordBatch + */ + public ArrowRecordBatch cloneWithTransfer(final BufferAllocator allocator) { + final List newBufs = buffers.stream() + .map(buf -> + (buf.getReferenceManager().transferOwnership(buf, allocator) + .getTransferredBuffer()) + .writerIndex(buf.writerIndex())) + .collect(Collectors.toList()); + // release the references this batch holds on the original buffers and mark it closed + close(); + return new ArrowRecordBatch(false, length, nodes, newBufs, bodyCompression); + } + + /** + * Get the serialized layout. + * + * @return the serialized layout if we send the buffers on the wire + */ + public List getBuffersLayout() { + return buffersLayout; + } + + /** + * Writes the nodes, the buffer layout and, unless the codec is the no-compression type, the body + * compression into the builder as a RecordBatch table. + * + * @return the value returned by {@code RecordBatch.endRecordBatch} + */ + @Override + public int writeTo(FlatBufferBuilder builder) { + RecordBatch.startNodesVector(builder, nodes.size()); + int nodesOffset = FBSerializables.writeAllStructsToVector(builder, nodes); + RecordBatch.startBuffersVector(builder, buffers.size()); + int buffersOffset = FBSerializables.writeAllStructsToVector(builder, buffersLayout); + int compressOffset = 0; + if (bodyCompression.getCodec() != NoCompressionCodec.COMPRESSION_TYPE) { + compressOffset = bodyCompression.writeTo(builder); + } + RecordBatch.startRecordBatch(builder); + RecordBatch.addLength(builder, length); + RecordBatch.addNodes(builder, nodesOffset); + RecordBatch.addBuffers(builder, buffersOffset); + if (bodyCompression.getCodec() != NoCompressionCodec.COMPRESSION_TYPE) { + RecordBatch.addCompression(builder, compressOffset); + } + return RecordBatch.endRecordBatch(builder); + } + + @Override + public T accepts(ArrowMessageVisitor visitor) { + return visitor.visit(this); + } + + /** + * Releases the buffers.
+ */ + @Override + public void close() { + if (!closed) { + closed = true; + for (ArrowBuf arrowBuf : buffers) { + arrowBuf.getReferenceManager().release(); + } + } + } + + @Override + public String toString() { + return "ArrowRecordBatch [length=" + length + ", nodes=" + nodes + ", #buffers=" + buffers.size() + + ", buffersLayout=" + buffersLayout + ", closed=" + closed + "]"; + } + + /** + * Computes the size of the serialized body for this recordBatch. + * + * @return the end of the last buffer (layout offset plus readable bytes) rounded up to a multiple of 8, + * or 0 when there are no buffers + * @throws IllegalStateException if this batch is closed or the buffer and layout counts differ + */ + @Override + public long computeBodyLength() { + long size = 0; + + List buffers = getBuffers(); + List buffersLayout = getBuffersLayout(); + if (buffers.size() != buffersLayout.size()) { + throw new IllegalStateException("the layout does not match: " + + buffers.size() + " != " + buffersLayout.size()); + } + + for (int i = 0; i < buffers.size(); i++) { + ArrowBuf buffer = buffers.get(i); + ArrowBuffer layout = buffersLayout.get(i); + size = layout.getOffset() + buffer.readableBytes(); + + // round up size to the next multiple of 8 + size = DataSizeRoundingUtil.roundUpTo8Multiple(size); + } + return size; + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializable.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializable.java new file mode 100644 index 000000000..6b406b594 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializable.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc.message; + +import com.google.flatbuffers.FlatBufferBuilder; + +/** + * Interface for serializing to FlatBuffers. + */ +public interface FBSerializable { + /** + * Returns the number of bytes taken to serialize the data in builder after writing to it. + */ + int writeTo(FlatBufferBuilder builder); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java new file mode 100644 index 000000000..26736ed91 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc.message; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.apache.arrow.flatbuf.KeyValue; + +import com.google.flatbuffers.FlatBufferBuilder; + +/** + * Utility methods for {@linkplain org.apache.arrow.vector.ipc.message.FBSerializable}s. + */ +public class FBSerializables { + private FBSerializables() {} + + /** + * Writes every element of {@code all} to {@code builder}, last element first, and calls + * {@link FlatBufferBuilder#endVector()} afterwards. This method does not start the vector itself. + * Returns the result of calling endVector. + */ + public static int writeAllStructsToVector(FlatBufferBuilder builder, List all) { + // struct vectors have to be created in reverse order + List reversed = new ArrayList<>(all); + Collections.reverse(reversed); + for (FBSerializable element : reversed) { + element.writeTo(builder); + } + return builder.endVector(); + } + + /** + * Writes each entry of {@code metaData} as a KeyValue table whose key and value are strings. + * Returns the result of {@code Field.createCustomMetadataVector} over the entries' offsets. + */ + public static int writeKeyValues(FlatBufferBuilder builder, Map metaData) { + int[] metadataOffsets = new int[metaData.size()]; + Iterator> metadataIterator = metaData.entrySet().iterator(); + for (int i = 0; i < metadataOffsets.length; i++) { + Map.Entry kv = metadataIterator.next(); + int keyOffset = builder.createString(kv.getKey()); + int valueOffset = builder.createString(kv.getValue()); + KeyValue.startKeyValue(builder); + KeyValue.addKey(builder, keyOffset); + KeyValue.addValue(builder, valueOffset); + metadataOffsets[i] = KeyValue.endKeyValue(builder); + } + return org.apache.arrow.flatbuf.Field.createCustomMetadataVector(builder, metadataOffsets); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/IpcOption.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/IpcOption.java new file mode 100644 index 000000000..51207584f --- /dev/null +++
b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/IpcOption.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc.message; + +import org.apache.arrow.vector.types.MetadataVersion; + +/** + * IPC options; currently only used when writing. + */ +public class IpcOption { + + // Write the pre-0.15.0 encapsulated IPC message format, + // which uses a 4-byte prefix instead of an 8-byte one + public final boolean write_legacy_ipc_format; + + // The metadata version. Defaults to V5.
+ public final MetadataVersion metadataVersion; + + public IpcOption() { + this(false, MetadataVersion.DEFAULT); + } + + public IpcOption(boolean writeLegacyIpcFormat, MetadataVersion metadataVersion) { + this.write_legacy_ipc_format = writeLegacyIpcFormat; + this.metadataVersion = metadataVersion; + } + + public static final IpcOption DEFAULT = new IpcOption(); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageChannelReader.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageChannelReader.java new file mode 100644 index 000000000..1c7968d7f --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageChannelReader.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc.message; + +import java.io.IOException; + +import org.apache.arrow.flatbuf.Message; +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.ReadChannel; + +/** + * Reads a sequence of messages using a ReadChannel. 
+ */ +public class MessageChannelReader implements AutoCloseable { + protected ReadChannel in; + protected BufferAllocator allocator; + + /** + * Construct a MessageReader to read streaming messages from an existing ReadChannel. + * + * @param in Channel to read messages from + * @param allocator BufferAllocator used to read Message body into an ArrowBuf. + */ + public MessageChannelReader(ReadChannel in, BufferAllocator allocator) { + this.in = in; + this.allocator = allocator; + } + + /** + * Read a message from the ReadChannel and return a MessageResult containing the Message + * metadata and optional message body data. Once the end-of-stream has been reached, a null + * value will be returned. If the message has no body, then MessageResult.getBodyBuffer() + * returns null. + * + * @return MessageResult or null if reached end-of-stream + * @throws IOException on error + */ + public MessageResult readNext() throws IOException { + + // Read the flatbuf message and check for end-of-stream + MessageMetadataResult result = MessageSerializer.readMessage(in); + if (result == null) { + return null; + } + Message message = result.getMessage(); + ArrowBuf bodyBuffer = null; + + // Read message body data if defined in message + if (result.messageHasBody()) { + long bodyLength = result.getMessageBodyLength(); + bodyBuffer = MessageSerializer.readMessageBody(in, bodyLength, allocator); + } + + return new MessageResult(message, bodyBuffer); + } + + /** + * Get the number of bytes read from the ReadChannel. + * + * @return number of bytes + */ + public long bytesRead() { + return in.bytesRead(); + } + + /** + * Close the ReadChannel. 
+ * + * @throws IOException on error + */ + @Override + public void close() throws IOException { + in.close(); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageMetadataResult.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageMetadataResult.java new file mode 100644 index 000000000..e4728822d --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageMetadataResult.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc.message; + +import java.nio.ByteBuffer; + +import org.apache.arrow.flatbuf.Message; + +/** + * Class to hold resulting Message metadata and buffer containing the serialized Flatbuffer + * message when reading messages from a ReadChannel. This handles Message metadata only and + * does not include the message body data, which should be subsequently read into an ArrowBuf. + */ +public class MessageMetadataResult { + + /** + * Construct a container to hold a deserialized Message metadata, and buffer + * with the serialized Message as read from a ReadChannel. 
+ * + * @param messageLength the length of the serialized Flatbuffer message in bytes + * @param messageBuffer contains the serialized Flatbuffer Message metadata + * @param message the deserialized Flatbuffer Message metadata description + */ + MessageMetadataResult(int messageLength, ByteBuffer messageBuffer, Message message) { + this.messageLength = messageLength; + this.messageBuffer = messageBuffer; + this.message = message; + } + + /** + * Creates a new {@link MessageMetadataResult} by parsing it from the beginning of the buffer. + * + * @param buffer The buffer holding the serialized flatbuffer message; it is kept as the message buffer. + * @param messageLength The length of the serialized flatbuffer message in bytes (might not be equal to the buffer + * size). + * @return a result wrapping the buffer and the Message parsed from it + */ + public static MessageMetadataResult create(ByteBuffer buffer, int messageLength) { + return new MessageMetadataResult(messageLength, buffer, Message.getRootAsMessage(buffer)); + } + + /** + * Get the length of the message metadata in bytes, not including the body length. + * + * @return number of bytes in the message metadata buffer. + */ + public int getMessageLength() { + return messageLength; + } + + /** + * Get the buffer containing the raw message metadata bytes, not including the message body data. + * + * @return buffer containing the message metadata. + */ + public ByteBuffer getMessageBuffer() { + return messageBuffer; + } + + /** + * Returns the bytes remaining in the buffer after parsing the message from it. + */ + public int bytesAfterMessage() { + return message.getByteBuffer().remaining(); + } + + /** + * Returns the header type of the wrapped Message. + */ + public byte headerType() { + return message.headerType(); + } + + /** + * Check if the message is followed by a body. This will be true if the message has a body + * length > 0, which indicates that a message body needs to be read from the input source. + * + * @return true if message has a defined body + */ + public boolean messageHasBody() { + return message.bodyLength() > 0; + } + + /** + * Get the length of the message body.
+ * + * @return number of bytes of the message body + */ + public long getMessageBodyLength() { + return message.bodyLength(); + } + + /** + * Get the realized flatbuf Message metadata description. + * + * @return Message metadata + */ + public Message getMessage() { + return message; + } + + private final int messageLength; + private final ByteBuffer messageBuffer; + private final Message message; +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageResult.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageResult.java new file mode 100644 index 000000000..591fbf106 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageResult.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc.message; + +import org.apache.arrow.flatbuf.Message; +import org.apache.arrow.memory.ArrowBuf; + +/** + * Class to hold the Message metadata and body data when reading messages through a + * MessageChannelReader. + */ +public class MessageResult { + + /** + * Construct with a valid Message metadata and optional ArrowBuf containing message body + * data, if any. 
+ * + * @param message Deserialized Flatbuffer Message metadata description + * @param bodyBuffer Optional ArrowBuf containing message body data, null if message has no body + */ + MessageResult(Message message, ArrowBuf bodyBuffer) { + this.message = message; + this.bodyBuffer = bodyBuffer; + } + + /** + * Get the Message metadata. + * + * @return the Flatbuffer Message metadata + */ + public Message getMessage() { + return message; + } + + /** + * Get the message body data. + * + * @return an ArrowBuf containing the message body data or null if the message has no body + */ + public ArrowBuf getBodyBuffer() { + return bodyBuffer; + } + + private final Message message; + private final ArrowBuf bodyBuffer; +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageSerializer.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageSerializer.java new file mode 100644 index 000000000..6597e0302 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageSerializer.java @@ -0,0 +1,736 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc.message; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; + +import org.apache.arrow.flatbuf.Buffer; +import org.apache.arrow.flatbuf.DictionaryBatch; +import org.apache.arrow.flatbuf.FieldNode; +import org.apache.arrow.flatbuf.Message; +import org.apache.arrow.flatbuf.MessageHeader; +import org.apache.arrow.flatbuf.MetadataVersion; +import org.apache.arrow.flatbuf.RecordBatch; +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.compression.NoCompressionCodec; +import org.apache.arrow.vector.ipc.ReadChannel; +import org.apache.arrow.vector.ipc.WriteChannel; +import org.apache.arrow.vector.types.pojo.Schema; + +import com.google.flatbuffers.FlatBufferBuilder; + +/** + * Utility class for serializing Messages. Messages are all serialized in a similar way. + * 1. 4 byte little endian message header prefix + * 2. FB serialized Message: This includes the body length, which is the length of the serialized + * body, and the type of the message. + * 3. Serialized message. + * + *

For schema messages, the serialization is simply the FB serialized Schema. + * + *

For RecordBatch messages the serialization is: + * 1. 4 byte little endian batch metadata header + * 2. FB serialized RowBatch + * 3. Padding to align to 8 byte boundary. + * 4. serialized RowBatch buffers. + */ +public class MessageSerializer { + + // This 0xFFFFFFFF value is the first 4 bytes of a valid IPC message + public static final int IPC_CONTINUATION_TOKEN = -1; + + /** + * Convert an array of 4 bytes in little-endian order to a native-endian i32 value. + * + * @param bytes byte array with minimum length of 4 in little-endian order; only the first 4 bytes are read + * @return the converted native-endian 32-bit integer + */ + public static int bytesToInt(byte[] bytes) { + return ((bytes[3] & 255) << 24) + + ((bytes[2] & 255) << 16) + + ((bytes[1] & 255) << 8) + + ((bytes[0] & 255)); + } + + /** + * Convert an integer to a little endian 4 byte array. + * + * @param value integer value input + * @param bytes existing byte array with minimum length of 4 to contain the conversion output; + * only the first 4 bytes are written + */ + public static void intToBytes(int value, byte[] bytes) { + bytes[3] = (byte) (value >>> 24); + bytes[2] = (byte) (value >>> 16); + bytes[1] = (byte) (value >>> 8); + bytes[0] = (byte) (value); + } + + /** + * Convert a long to a little-endian 8 byte array.
+ * + * @param value long value input + * @param bytes existing byte array with minimum length of 8 to contain the conversion output + */ + public static void longToBytes(long value, byte[] bytes) { + bytes[7] = (byte) (value >>> 56); + bytes[6] = (byte) (value >>> 48); + bytes[5] = (byte) (value >>> 40); + bytes[4] = (byte) (value >>> 32); + bytes[3] = (byte) (value >>> 24); + bytes[2] = (byte) (value >>> 16); + bytes[1] = (byte) (value >>> 8); + bytes[0] = (byte) (value); + } + + public static int writeMessageBuffer(WriteChannel out, int messageLength, ByteBuffer messageBuffer) + throws IOException { + return writeMessageBuffer(out, messageLength, messageBuffer, IpcOption.DEFAULT); + } + + /** + * Write the serialized Message metadata, prefixed by the length, to the output Channel. This + * ensures that it aligns to an 8 byte boundary and will adjust the message length to include + * any padding used for alignment. + * + * @param out Output Channel + * @param messageLength Number of bytes in the message buffer, written as little Endian prefix + * @param messageBuffer Message metadata buffer to be written, this does not include any + * message body data which should be subsequently written to the Channel + * @param option IPC write options + * @return Number of bytes written + * @throws IOException on error + */ + public static int writeMessageBuffer(WriteChannel out, int messageLength, ByteBuffer messageBuffer, IpcOption option) + throws IOException { + + // if write the pre-0.15.0 encapsulated IPC message format consisting of a 4-byte prefix instead of 8 byte + int prefixSize = option.write_legacy_ipc_format ? 
4 : 8; + + // ensure that message aligns to 8 byte padding - prefix_size bytes, then message body + if ((messageLength + prefixSize ) % 8 != 0) { + messageLength += 8 - (messageLength + prefixSize) % 8; + } + if (!option.write_legacy_ipc_format) { + out.writeIntLittleEndian(IPC_CONTINUATION_TOKEN); + } + out.writeIntLittleEndian(messageLength); + out.write(messageBuffer); + out.align(); + + // any bytes written are already captured by our size modification above + return messageLength + prefixSize; + } + + /** + * Serialize a schema object using {@link IpcOption#DEFAULT}. + */ + public static long serialize(WriteChannel out, Schema schema) throws IOException { + return serialize(out, schema, IpcOption.DEFAULT); + } + + /** + * Serialize a schema object. The current position of {@code out} must be a multiple of 8. + * + * @param out where to write the schema + * @param schema the object to serialize to out + * @param option IPC write options + * @return the number of bytes written + * @throws IOException if something went wrong + */ + public static long serialize(WriteChannel out, Schema schema, IpcOption option) throws IOException { + long start = out.getCurrentPosition(); + Preconditions.checkArgument(start % 8 == 0, "out is not aligned"); + + ByteBuffer serializedMessage = serializeMetadata(schema, option); + + int messageLength = serializedMessage.remaining(); + + int bytesWritten = writeMessageBuffer(out, messageLength, serializedMessage, option); + Preconditions.checkArgument(bytesWritten % 8 == 0, "out is not aligned"); + return bytesWritten; + } + + /** + * Returns the serialized flatbuffer bytes of the schema wrapped in a message table. + * + * @deprecated use {@link #serializeMetadata(Schema, IpcOption)}; this overload passes {@link IpcOption#DEFAULT}. + */ + @Deprecated + public static ByteBuffer serializeMetadata(Schema schema) { + return serializeMetadata(schema, IpcOption.DEFAULT); + } + + /** + * Returns the serialized flatbuffer bytes of the schema wrapped in a message table.
+ */ + public static ByteBuffer serializeMetadata(Schema schema, IpcOption writeOption) { + FlatBufferBuilder builder = new FlatBufferBuilder(); + int schemaOffset = schema.getSchema(builder); + return MessageSerializer.serializeMessage(builder, org.apache.arrow.flatbuf.MessageHeader.Schema, schemaOffset, 0, + writeOption); + } + + /** + * Deserializes an Arrow Schema object from a schema message. Format is from serialize(). + * + * @param schemaMessage a Message of type MessageHeader.Schema + * @return the deserialized Arrow Schema + */ + public static Schema deserializeSchema(Message schemaMessage) { + Preconditions.checkArgument(schemaMessage.headerType() == MessageHeader.Schema, + "Expected schema but result was: %s", schemaMessage.headerType()); + return Schema.convertSchema((org.apache.arrow.flatbuf.Schema) + schemaMessage.header(new org.apache.arrow.flatbuf.Schema())); + } + + /** + * Deserializes an Arrow Schema read from the input channel. Format is from serialize(). + * + * @param in the channel to deserialize from + * @return the deserialized Arrow Schema + * @throws IOException if something went wrong + */ + public static Schema deserializeSchema(ReadChannel in) throws IOException { + MessageMetadataResult result = readMessage(in); + if (result == null) { + throw new IOException("Unexpected end of input when reading Schema"); + } + if (result.getMessage().headerType() != MessageHeader.Schema) { + throw new IOException("Expected schema but header was " + result.getMessage().headerType()); + } + return deserializeSchema(result); + } + + /** + * Deserializes an Arrow Schema object from a {@link MessageMetadataResult}. Format is from serialize(). + * + * @param message a Message of type MessageHeader.Schema + * @return the deserialized Arrow Schema + */ + public static Schema deserializeSchema(MessageMetadataResult message) { + return deserializeSchema(message.getMessage()); + } + + /** + * Serializes an ArrowRecordBatch. 
Returns the offset and length of the written batch. + */ + public static ArrowBlock serialize(WriteChannel out, ArrowRecordBatch batch) throws IOException { + return serialize(out, batch, IpcOption.DEFAULT); + } + + /** + * Serializes an ArrowRecordBatch. Returns the offset and length of the written batch. + * + * @param out where to write the batch + * @param batch the object to serialize to out + * @param option IPC write options + * @return the serialized block metadata + * @throws IOException if something went wrong + */ + public static ArrowBlock serialize(WriteChannel out, ArrowRecordBatch batch, IpcOption option) throws IOException { + + long start = out.getCurrentPosition(); + long bodyLength = batch.computeBodyLength(); + Preconditions.checkArgument(bodyLength % 8 == 0, "batch is not aligned"); + + ByteBuffer serializedMessage = serializeMetadata(batch, option); + + int metadataLength = serializedMessage.remaining(); + + int prefixSize = 4; + if (!option.write_legacy_ipc_format) { + out.writeIntLittleEndian(IPC_CONTINUATION_TOKEN); + prefixSize = 8; + } + + // calculate alignment bytes so that metadata length points to the correct location after alignment + int padding = (int) ((start + metadataLength + prefixSize) % 8); + if (padding != 0) { + metadataLength += (8 - padding); + } + + out.writeIntLittleEndian(metadataLength); + out.write(serializedMessage); + + // Align the output to 8 byte boundary. + out.align(); + + long bufferLength = writeBatchBuffers(out, batch); + Preconditions.checkArgument(bufferLength % 8 == 0, "out is not aligned"); + + // Metadata size in the Block accounts for the size prefix + return new ArrowBlock(start, metadataLength + prefixSize, bufferLength); + } + + /** + * Write the Arrow buffers of the record batch to the output channel.
+ * + * @param out the output channel to write the buffers to + * @param batch an ArrowRecordBatch containing buffers to be written + * @return the number of bytes written + * @throws IOException on error + */ + public static long writeBatchBuffers(WriteChannel out, ArrowRecordBatch batch) throws IOException { + long bufferStart = out.getCurrentPosition(); + List buffers = batch.getBuffers(); + List buffersLayout = batch.getBuffersLayout(); + + for (int i = 0; i < buffers.size(); i++) { + ArrowBuf buffer = buffers.get(i); + ArrowBuffer layout = buffersLayout.get(i); + long startPosition = bufferStart + layout.getOffset(); + if (startPosition != out.getCurrentPosition()) { + out.writeZeros(startPosition - out.getCurrentPosition()); + } + out.write(buffer); + if (out.getCurrentPosition() != startPosition + layout.getSize()) { + throw new IllegalStateException("wrong buffer size: " + out.getCurrentPosition() + + " != " + startPosition + layout.getSize()); + } + } + out.align(); + return out.getCurrentPosition() - bufferStart; + } + + /** + * Returns the serialized form of {@link RecordBatch} wrapped in a {@link org.apache.arrow.flatbuf.Message}. + */ + @Deprecated + public static ByteBuffer serializeMetadata(ArrowMessage message) { + return serializeMetadata(message, IpcOption.DEFAULT); + } + + /** + * Returns the serialized form of {@link RecordBatch} wrapped in a {@link org.apache.arrow.flatbuf.Message}. + */ + public static ByteBuffer serializeMetadata(ArrowMessage message, IpcOption writeOption) { + FlatBufferBuilder builder = new FlatBufferBuilder(); + int batchOffset = message.writeTo(builder); + return serializeMessage(builder, message.getMessageType(), batchOffset, + message.computeBodyLength(), writeOption); + } + + /** + * Deserializes an ArrowRecordBatch from a record batch message and data in an ArrowBuf. 
+ * + * @param recordBatchMessage a Message of type MessageHeader.RecordBatch + * @param bodyBuffer Arrow buffer containing the RecordBatch data + * @return the deserialized ArrowRecordBatch + * @throws IOException if something went wrong + */ + public static ArrowRecordBatch deserializeRecordBatch(Message recordBatchMessage, ArrowBuf bodyBuffer) + throws IOException { + RecordBatch recordBatchFB = (RecordBatch) recordBatchMessage.header(new RecordBatch()); + return deserializeRecordBatch(recordBatchFB, bodyBuffer); + } + + /** + * Deserializes an ArrowRecordBatch read from the input channel. This uses the given allocator + * to create an ArrowBuf for the batch body data. + * + * @param in Channel to read a RecordBatch message and data from + * @param allocator BufferAllocator to allocate an Arrow buffer to read message body data + * @return the deserialized ArrowRecordBatch + * @throws IOException on error + */ + public static ArrowRecordBatch deserializeRecordBatch(ReadChannel in, BufferAllocator allocator) throws IOException { + MessageMetadataResult result = readMessage(in); + if (result == null) { + throw new IOException("Unexpected end of input when reading a RecordBatch"); + } + if (result.getMessage().headerType() != MessageHeader.RecordBatch) { + throw new IOException("Expected RecordBatch but header was " + result.getMessage().headerType()); + } + long bodyLength = result.getMessageBodyLength(); + ArrowBuf bodyBuffer = readMessageBody(in, bodyLength, allocator); + return deserializeRecordBatch(result.getMessage(), bodyBuffer); + } + + /** + * Deserializes an ArrowRecordBatch knowing the size of the entire message up front. This + * minimizes the number of reads to the underlying stream. 
+ * + * @param in the channel to deserialize from + * @param block the object to deserialize to + * @param alloc to allocate buffers + * @return the deserialized ArrowRecordBatch + * @throws IOException if something went wrong + */ + public static ArrowRecordBatch deserializeRecordBatch(ReadChannel in, ArrowBlock block, BufferAllocator alloc) + throws IOException { + // Metadata length contains prefix_size bytes plus byte padding + long totalLen = block.getMetadataLength() + block.getBodyLength(); + + ArrowBuf buffer = alloc.buffer(totalLen); + if (in.readFully(buffer, totalLen) != totalLen) { + throw new IOException("Unexpected end of input trying to read batch."); + } + + int prefixSize = buffer.getInt(0) == IPC_CONTINUATION_TOKEN ? 8 : 4; + + ArrowBuf metadataBuffer = buffer.slice(prefixSize, block.getMetadataLength() - prefixSize); + + Message messageFB = + Message.getRootAsMessage(metadataBuffer.nioBuffer().asReadOnlyBuffer()); + + RecordBatch recordBatchFB = (RecordBatch) messageFB.header(new RecordBatch()); + + // Now read the body + final ArrowBuf body = buffer.slice(block.getMetadataLength(), + totalLen - block.getMetadataLength()); + return deserializeRecordBatch(recordBatchFB, body); + } + + /** + * Deserializes an ArrowRecordBatch given the Flatbuffer metadata and in-memory body. 
+ * + * @param recordBatchFB Deserialized FlatBuffer record batch + * @param body Read body of the record batch + * @return ArrowRecordBatch from metadata and in-memory body + * @throws IOException on error + */ + public static ArrowRecordBatch deserializeRecordBatch(RecordBatch recordBatchFB, ArrowBuf body) throws IOException { + // Now read the body + int nodesLength = recordBatchFB.nodesLength(); + List nodes = new ArrayList<>(); + for (int i = 0; i < nodesLength; ++i) { + FieldNode node = recordBatchFB.nodes(i); + if ((int) node.length() != node.length() || + (int) node.nullCount() != node.nullCount()) { + throw new IOException("Cannot currently deserialize record batches with " + + "node length larger than INT_MAX records."); + } + nodes.add(new ArrowFieldNode(node.length(), node.nullCount())); + } + List buffers = new ArrayList<>(); + for (int i = 0; i < recordBatchFB.buffersLength(); ++i) { + Buffer bufferFB = recordBatchFB.buffers(i); + ArrowBuf vectorBuffer = body.slice(bufferFB.offset(), bufferFB.length()); + buffers.add(vectorBuffer); + } + + ArrowBodyCompression bodyCompression = recordBatchFB.compression() == null ? + NoCompressionCodec.DEFAULT_BODY_COMPRESSION + : new ArrowBodyCompression(recordBatchFB.compression().codec(), recordBatchFB.compression().method()); + + if ((int) recordBatchFB.length() != recordBatchFB.length()) { + throw new IOException("Cannot currently deserialize record batches with more than INT_MAX records."); + } + ArrowRecordBatch arrowRecordBatch = + new ArrowRecordBatch(checkedCastToInt(recordBatchFB.length()), nodes, buffers, bodyCompression); + body.getReferenceManager().release(); + return arrowRecordBatch; + } + + /** + * Reads a record batch based on the metadata in serializedMessage and the underlying data buffer. 
+ */ + public static ArrowRecordBatch deserializeRecordBatch(MessageMetadataResult serializedMessage, + ArrowBuf underlying) throws + IOException { + return deserializeRecordBatch(serializedMessage.getMessage(), underlying); + } + + public static ArrowBlock serialize(WriteChannel out, ArrowDictionaryBatch batch) throws IOException { + return serialize(out, batch, IpcOption.DEFAULT); + } + + /** + * Serializes a dictionary ArrowRecordBatch. Returns the offset and length of the written batch. + * + * @param out where to serialize + * @param batch the batch to serialize + * @param option options for IPC + * @return the metadata of the serialized block + * @throws IOException if something went wrong + */ + public static ArrowBlock serialize(WriteChannel out, ArrowDictionaryBatch batch, IpcOption option) + throws IOException { + long start = out.getCurrentPosition(); + + long bodyLength = batch.computeBodyLength(); + Preconditions.checkArgument(bodyLength % 8 == 0, "batch is not aligned"); + + ByteBuffer serializedMessage = serializeMetadata(batch, option); + + int metadataLength = serializedMessage.remaining(); + + int prefixSize = 4; + if (!option.write_legacy_ipc_format) { + out.writeIntLittleEndian(IPC_CONTINUATION_TOKEN); + prefixSize = 8; + } + + // calculate alignment bytes so that metadata length points to the correct location after alignment + int padding = (int) ((start + metadataLength + prefixSize) % 8); + if (padding != 0) { + metadataLength += (8 - padding); + } + + out.writeIntLittleEndian(metadataLength); + out.write(serializedMessage); + + // Align the output to 8 byte boundary. 
+ out.align(); + + // write the embedded record batch + long bufferLength = writeBatchBuffers(out, batch.getDictionary()); + Preconditions.checkArgument(bufferLength % 8 == 0, "out is not aligned"); + + // Metadata size in the Block account for the size prefix + return new ArrowBlock(start, metadataLength + prefixSize, bufferLength); + } + + /** + * Deserializes an ArrowDictionaryBatch from a dictionary batch Message and data in an ArrowBuf. + * + * @param message a message of type MessageHeader.DictionaryBatch + * @param bodyBuffer Arrow buffer containing the DictionaryBatch data + * of type MessageHeader.DictionaryBatch + * @return the deserialized ArrowDictionaryBatch + * @throws IOException if something went wrong + */ + public static ArrowDictionaryBatch deserializeDictionaryBatch(Message message, ArrowBuf bodyBuffer) + throws IOException { + DictionaryBatch dictionaryBatchFB = (DictionaryBatch) message.header(new DictionaryBatch()); + ArrowRecordBatch recordBatch = deserializeRecordBatch(dictionaryBatchFB.data(), bodyBuffer); + return new ArrowDictionaryBatch(dictionaryBatchFB.id(), recordBatch, dictionaryBatchFB.isDelta()); + } + + /** + * Deserializes an ArrowDictionaryBatch from a dictionary batch Message and data in an ArrowBuf. + * + * @param message a message of type MessageHeader.DictionaryBatch + * @param bodyBuffer Arrow buffer containing the DictionaryBatch data + * of type MessageHeader.DictionaryBatch + * @return the deserialized ArrowDictionaryBatch + * @throws IOException if something went wrong + */ + public static ArrowDictionaryBatch deserializeDictionaryBatch(MessageMetadataResult message, ArrowBuf bodyBuffer) + throws IOException { + return deserializeDictionaryBatch(message.getMessage(), bodyBuffer); + } + + /** + * Deserializes an ArrowDictionaryBatch read from the input channel. This uses the given allocator + * to create an ArrowBuf for the batch body data. 
+ * + * @param in Channel to read a DictionaryBatch message and data from + * @param allocator BufferAllocator to allocate an Arrow buffer to read message body data + * @return the deserialized ArrowDictionaryBatch + * @throws IOException on error + */ + public static ArrowDictionaryBatch deserializeDictionaryBatch(ReadChannel in, BufferAllocator allocator) + throws IOException { + MessageMetadataResult result = readMessage(in); + if (result == null) { + throw new IOException("Unexpected end of input when reading a DictionaryBatch"); + } + if (result.getMessage().headerType() != MessageHeader.DictionaryBatch) { + throw new IOException("Expected DictionaryBatch but header was " + result.getMessage().headerType()); + } + long bodyLength = result.getMessageBodyLength(); + ArrowBuf bodyBuffer = readMessageBody(in, bodyLength, allocator); + return deserializeDictionaryBatch(result.getMessage(), bodyBuffer); + } + + /** + * Deserializes a DictionaryBatch knowing the size of the entire message up front. This + * minimizes the number of reads to the underlying stream. + * + * @param in where to read from + * @param block block metadata for deserializing + * @param alloc to allocate new buffers + * @return the deserialized ArrowDictionaryBatch + * @throws IOException if something went wrong + */ + public static ArrowDictionaryBatch deserializeDictionaryBatch( + ReadChannel in, + ArrowBlock block, + BufferAllocator alloc) throws IOException { + // Metadata length contains integer prefix plus byte padding + long totalLen = block.getMetadataLength() + block.getBodyLength(); + + ArrowBuf buffer = alloc.buffer(totalLen); + if (in.readFully(buffer, totalLen) != totalLen) { + throw new IOException("Unexpected end of input trying to read batch."); + } + + int prefixSize = buffer.getInt(0) == IPC_CONTINUATION_TOKEN ? 
8 : 4; + + ArrowBuf metadataBuffer = buffer.slice(prefixSize, block.getMetadataLength() - prefixSize); + + Message messageFB = + Message.getRootAsMessage(metadataBuffer.nioBuffer().asReadOnlyBuffer()); + + DictionaryBatch dictionaryBatchFB = (DictionaryBatch) messageFB.header(new DictionaryBatch()); + + // Now read the body + final ArrowBuf body = buffer.slice(block.getMetadataLength(), + totalLen - block.getMetadataLength()); + ArrowRecordBatch recordBatch = deserializeRecordBatch(dictionaryBatchFB.data(), body); + return new ArrowDictionaryBatch(dictionaryBatchFB.id(), recordBatch, dictionaryBatchFB.isDelta()); + } + + /** + * Deserialize a message that is either an ArrowDictionaryBatch or ArrowRecordBatch. + * + * @param reader MessageChannelReader to read a sequence of messages from a ReadChannel + * @return The deserialized record batch + * @throws IOException if the message is not an ArrowDictionaryBatch or ArrowRecordBatch + */ + public static ArrowMessage deserializeMessageBatch(MessageChannelReader reader) throws IOException { + MessageResult result = reader.readNext(); + if (result == null) { + return null; + } else if (result.getMessage().bodyLength() > Integer.MAX_VALUE) { + throw new IOException("Cannot currently deserialize record batches over 2GB"); + } + + if (result.getMessage().version() != MetadataVersion.V4 && + result.getMessage().version() != MetadataVersion.V5) { + throw new IOException("Received metadata with an incompatible version number: " + result.getMessage().version()); + } + + switch (result.getMessage().headerType()) { + case MessageHeader.RecordBatch: + return deserializeRecordBatch(result.getMessage(), result.getBodyBuffer()); + case MessageHeader.DictionaryBatch: + return deserializeDictionaryBatch(result.getMessage(), result.getBodyBuffer()); + default: + throw new IOException("Unexpected message header type " + result.getMessage().headerType()); + } + } + + /** + * Deserialize a message that is either an ArrowDictionaryBatch or 
ArrowRecordBatch. + * + * @param in ReadChannel to read messages from + * @param alloc Allocator for message data + * @return The deserialized record batch + * @throws IOException if the message is not an ArrowDictionaryBatch or ArrowRecordBatch + */ + public static ArrowMessage deserializeMessageBatch(ReadChannel in, BufferAllocator alloc) throws IOException { + return deserializeMessageBatch(new MessageChannelReader(in, alloc)); + } + + @Deprecated + public static ByteBuffer serializeMessage( + FlatBufferBuilder builder, + byte headerType, + int headerOffset, + long bodyLength) { + return serializeMessage(builder, headerType, headerOffset, bodyLength, IpcOption.DEFAULT); + } + + /** + * Serializes a message header. + * + * @param builder to write the flatbuf to + * @param headerType headerType field + * @param headerOffset header offset field + * @param bodyLength body length field + * @param writeOption IPC write options + * @return the corresponding ByteBuffer + */ + public static ByteBuffer serializeMessage( + FlatBufferBuilder builder, + byte headerType, + int headerOffset, + long bodyLength, + IpcOption writeOption) { + Message.startMessage(builder); + Message.addHeaderType(builder, headerType); + Message.addHeader(builder, headerOffset); + Message.addVersion(builder, writeOption.metadataVersion.toFlatbufID()); + Message.addBodyLength(builder, bodyLength); + builder.finish(Message.endMessage(builder)); + return builder.dataBuffer(); + } + + /** + * Read a Message from the input channel and return a MessageMetadataResult that contains the + * Message metadata, buffer containing the serialized Message metadata as read, and length of the + * Message in bytes. Returns null if the end-of-stream has been reached. 
+ * + * @param in ReadChannel to read messages from + * @return MessageMetadataResult with deserialized Message metadata and message information if + * a valid Message was read, or null if end-of-stream + * @throws IOException on error + */ + public static MessageMetadataResult readMessage(ReadChannel in) throws IOException { + + // Read the message size. There is an i32 little endian prefix. + ByteBuffer buffer = ByteBuffer.allocate(4); + if (in.readFully(buffer) == 4) { + + int messageLength = MessageSerializer.bytesToInt(buffer.array()); + if (messageLength == IPC_CONTINUATION_TOKEN) { + buffer.clear(); + // ARROW-6313, if the first 4 bytes are continuation message, read the next 4 for the length + if (in.readFully(buffer) == 4) { + messageLength = MessageSerializer.bytesToInt(buffer.array()); + } + } + + // Length of 0 indicates end of stream + if (messageLength != 0) { + + // Read the message into the buffer. + ByteBuffer messageBuffer = ByteBuffer.allocate(messageLength); + if (in.readFully(messageBuffer) != messageLength) { + throw new IOException( + "Unexpected end of stream trying to read message."); + } + messageBuffer.rewind(); + + // Load the message. + Message message = Message.getRootAsMessage(messageBuffer); + + return new MessageMetadataResult(messageLength, messageBuffer, message); + } + } + return null; + } + + /** + * Read a Message body from the in channel into an ArrowBuf. 
+ * + * @param in ReadChannel to read message body from + * @param bodyLength Length in bytes of the message body to read + * @param allocator Allocate the ArrowBuf to contain message body data + * @return an ArrowBuf containing the message body data + * @throws IOException on error + */ + public static ArrowBuf readMessageBody(ReadChannel in, long bodyLength, + BufferAllocator allocator) throws IOException { + ArrowBuf bodyBuffer = allocator.buffer(bodyLength); + try { + if (in.readFully(bodyBuffer, bodyLength) != bodyLength) { + throw new IOException("Unexpected end of input trying to read batch."); + } + } catch (RuntimeException | IOException e) { + bodyBuffer.close(); + throw e; + } + return bodyBuffer; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/DateUnit.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/DateUnit.java new file mode 100644 index 000000000..8fce12e83 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/DateUnit.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.types; + +/** + * Resolutions that dates can be stored at. 
+ */ +public enum DateUnit { + /** Days since epoch. */ + DAY(org.apache.arrow.flatbuf.DateUnit.DAY), + /** Milliseconds since epoch. */ + MILLISECOND(org.apache.arrow.flatbuf.DateUnit.MILLISECOND); + + private static final DateUnit[] valuesByFlatbufId = new DateUnit[DateUnit.values().length]; + + static { + for (DateUnit v : DateUnit.values()) { + valuesByFlatbufId[v.flatbufID] = v; + } + } + + private final short flatbufID; + + DateUnit(short flatbufID) { + this.flatbufID = flatbufID; + } + + public short getFlatbufID() { + return flatbufID; + } + + public static DateUnit fromFlatbufID(short id) { + return valuesByFlatbufId[id]; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/FloatingPointPrecision.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/FloatingPointPrecision.java new file mode 100644 index 000000000..c52fc1243 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/FloatingPointPrecision.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.types; + +import org.apache.arrow.flatbuf.Precision; + +/** + * Precisions of primitive floating point numbers. 
+ */ +public enum FloatingPointPrecision { + /** 16-bit (not a standard java type). */ + HALF(Precision.HALF), + /** 32-bit (i.e. float in java). */ + SINGLE(Precision.SINGLE), + /** 64-bit (i.e. double in java). */ + DOUBLE(Precision.DOUBLE); + + private static final FloatingPointPrecision[] valuesByFlatbufId = + new FloatingPointPrecision[FloatingPointPrecision.values().length]; + + static { + for (FloatingPointPrecision v : FloatingPointPrecision.values()) { + valuesByFlatbufId[v.flatbufID] = v; + } + } + + private short flatbufID; + + private FloatingPointPrecision(short flatbufID) { + this.flatbufID = flatbufID; + } + + public short getFlatbufID() { + return flatbufID; + } + + public static FloatingPointPrecision fromFlatbufID(short id) { + return valuesByFlatbufId[id]; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/IntervalUnit.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/IntervalUnit.java new file mode 100644 index 000000000..1b17240d0 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/IntervalUnit.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.types; + +/** + * Resolutions for Interval Vectors. + */ +public enum IntervalUnit { + /** Values are stored as number of months (which can be converted into years and months via division). */ + YEAR_MONTH(org.apache.arrow.flatbuf.IntervalUnit.YEAR_MONTH), + /** Values are stored as some number of days and some number of milliseconds within that day. */ + DAY_TIME(org.apache.arrow.flatbuf.IntervalUnit.DAY_TIME), + /** Values are stored as number of months, days and nanoseconds. */ + MONTH_DAY_NANO(org.apache.arrow.flatbuf.IntervalUnit.MONTH_DAY_NANO); + + private static final IntervalUnit[] valuesByFlatbufId = new IntervalUnit[IntervalUnit.values().length]; + + static { + for (IntervalUnit v : IntervalUnit.values()) { + valuesByFlatbufId[v.flatbufID] = v; + } + } + + private short flatbufID; + + private IntervalUnit(short flatbufID) { + this.flatbufID = flatbufID; + } + + public short getFlatbufID() { + return flatbufID; + } + + public static IntervalUnit fromFlatbufID(short id) { + return valuesByFlatbufId[id]; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/MetadataVersion.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/MetadataVersion.java new file mode 100644 index 000000000..a0e281960 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/MetadataVersion.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.types; + +/** + * Metadata version for Arrow metadata. + */ +public enum MetadataVersion { + /// 0.1.0 + V1(org.apache.arrow.flatbuf.MetadataVersion.V1), + + /// 0.2.0 + V2(org.apache.arrow.flatbuf.MetadataVersion.V2), + + /// 0.3.0 to 0.7.1 + V3(org.apache.arrow.flatbuf.MetadataVersion.V3), + + /// 0.8.0 to 0.17.1 + V4(org.apache.arrow.flatbuf.MetadataVersion.V4), + + /// >= 1.0.0 + V5(org.apache.arrow.flatbuf.MetadataVersion.V5), + + ; + + public static final MetadataVersion DEFAULT = V5; + + private static final MetadataVersion[] valuesByFlatbufId = + new MetadataVersion[MetadataVersion.values().length]; + + static { + for (MetadataVersion v : MetadataVersion.values()) { + valuesByFlatbufId[v.flatbufID] = v; + } + } + + private final short flatbufID; + + MetadataVersion(short flatbufID) { + this.flatbufID = flatbufID; + } + + public short toFlatbufID() { + return flatbufID; + } + + public static MetadataVersion fromFlatbufID(short id) { + return valuesByFlatbufId[id]; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/TimeUnit.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/TimeUnit.java new file mode 100644 index 000000000..dcaebba48 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/TimeUnit.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.types; + +/** + * Resolutions that times can be stored with. + */ +public enum TimeUnit { + SECOND(org.apache.arrow.flatbuf.TimeUnit.SECOND), + MILLISECOND(org.apache.arrow.flatbuf.TimeUnit.MILLISECOND), + MICROSECOND(org.apache.arrow.flatbuf.TimeUnit.MICROSECOND), + NANOSECOND(org.apache.arrow.flatbuf.TimeUnit.NANOSECOND); + + private static final TimeUnit[] valuesByFlatbufId = new TimeUnit[TimeUnit.values().length]; + + static { + for (TimeUnit v : TimeUnit.values()) { + valuesByFlatbufId[v.flatbufID] = v; + } + } + + private final short flatbufID; + + TimeUnit(short flatbufID) { + this.flatbufID = flatbufID; + } + + public short getFlatbufID() { + return flatbufID; + } + + public static TimeUnit fromFlatbufID(short id) { + return valuesByFlatbufId[id]; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java new file mode 100644 index 000000000..d4c827859 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -0,0 +1,1016 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.types; + +import static org.apache.arrow.vector.types.FloatingPointPrecision.DOUBLE; +import static org.apache.arrow.vector.types.FloatingPointPrecision.SINGLE; +import static org.apache.arrow.vector.types.UnionMode.Dense; +import static org.apache.arrow.vector.types.UnionMode.Sparse; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.DateDayVector; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.Decimal256Vector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.DurationVector; +import org.apache.arrow.vector.ExtensionTypeVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalMonthDayNanoVector; +import org.apache.arrow.vector.IntervalYearVector; +import org.apache.arrow.vector.LargeVarBinaryVector; +import org.apache.arrow.vector.LargeVarCharVector; +import 
org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TimeMicroVector; +import org.apache.arrow.vector.TimeMilliVector; +import org.apache.arrow.vector.TimeNanoVector; +import org.apache.arrow.vector.TimeSecVector; +import org.apache.arrow.vector.TimeStampMicroTZVector; +import org.apache.arrow.vector.TimeStampMicroVector; +import org.apache.arrow.vector.TimeStampMilliTZVector; +import org.apache.arrow.vector.TimeStampMilliVector; +import org.apache.arrow.vector.TimeStampNanoTZVector; +import org.apache.arrow.vector.TimeStampNanoVector; +import org.apache.arrow.vector.TimeStampSecTZVector; +import org.apache.arrow.vector.TimeStampSecVector; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.complex.impl.BigIntWriterImpl; +import org.apache.arrow.vector.complex.impl.BitWriterImpl; +import org.apache.arrow.vector.complex.impl.DateDayWriterImpl; +import org.apache.arrow.vector.complex.impl.DateMilliWriterImpl; +import org.apache.arrow.vector.complex.impl.Decimal256WriterImpl; +import org.apache.arrow.vector.complex.impl.DecimalWriterImpl; +import org.apache.arrow.vector.complex.impl.DenseUnionWriter; +import org.apache.arrow.vector.complex.impl.DurationWriterImpl; +import 
org.apache.arrow.vector.complex.impl.FixedSizeBinaryWriterImpl; +import org.apache.arrow.vector.complex.impl.Float4WriterImpl; +import org.apache.arrow.vector.complex.impl.Float8WriterImpl; +import org.apache.arrow.vector.complex.impl.IntWriterImpl; +import org.apache.arrow.vector.complex.impl.IntervalDayWriterImpl; +import org.apache.arrow.vector.complex.impl.IntervalMonthDayNanoWriterImpl; +import org.apache.arrow.vector.complex.impl.IntervalYearWriterImpl; +import org.apache.arrow.vector.complex.impl.LargeVarBinaryWriterImpl; +import org.apache.arrow.vector.complex.impl.LargeVarCharWriterImpl; +import org.apache.arrow.vector.complex.impl.NullableStructWriter; +import org.apache.arrow.vector.complex.impl.SmallIntWriterImpl; +import org.apache.arrow.vector.complex.impl.TimeMicroWriterImpl; +import org.apache.arrow.vector.complex.impl.TimeMilliWriterImpl; +import org.apache.arrow.vector.complex.impl.TimeNanoWriterImpl; +import org.apache.arrow.vector.complex.impl.TimeSecWriterImpl; +import org.apache.arrow.vector.complex.impl.TimeStampMicroTZWriterImpl; +import org.apache.arrow.vector.complex.impl.TimeStampMicroWriterImpl; +import org.apache.arrow.vector.complex.impl.TimeStampMilliTZWriterImpl; +import org.apache.arrow.vector.complex.impl.TimeStampMilliWriterImpl; +import org.apache.arrow.vector.complex.impl.TimeStampNanoTZWriterImpl; +import org.apache.arrow.vector.complex.impl.TimeStampNanoWriterImpl; +import org.apache.arrow.vector.complex.impl.TimeStampSecTZWriterImpl; +import org.apache.arrow.vector.complex.impl.TimeStampSecWriterImpl; +import org.apache.arrow.vector.complex.impl.TinyIntWriterImpl; +import org.apache.arrow.vector.complex.impl.UInt1WriterImpl; +import org.apache.arrow.vector.complex.impl.UInt2WriterImpl; +import org.apache.arrow.vector.complex.impl.UInt4WriterImpl; +import org.apache.arrow.vector.complex.impl.UInt8WriterImpl; +import org.apache.arrow.vector.complex.impl.UnionLargeListWriter; +import 
org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.complex.impl.UnionWriter; +import org.apache.arrow.vector.complex.impl.VarBinaryWriterImpl; +import org.apache.arrow.vector.complex.impl.VarCharWriterImpl; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor; +import org.apache.arrow.vector.types.pojo.ArrowType.Binary; +import org.apache.arrow.vector.types.pojo.ArrowType.Bool; +import org.apache.arrow.vector.types.pojo.ArrowType.Date; +import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; +import org.apache.arrow.vector.types.pojo.ArrowType.Duration; +import org.apache.arrow.vector.types.pojo.ArrowType.ExtensionType; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList; +import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; +import org.apache.arrow.vector.types.pojo.ArrowType.Int; +import org.apache.arrow.vector.types.pojo.ArrowType.Interval; +import org.apache.arrow.vector.types.pojo.ArrowType.LargeBinary; +import org.apache.arrow.vector.types.pojo.ArrowType.LargeUtf8; +import org.apache.arrow.vector.types.pojo.ArrowType.List; +import org.apache.arrow.vector.types.pojo.ArrowType.Map; +import org.apache.arrow.vector.types.pojo.ArrowType.Null; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct; +import org.apache.arrow.vector.types.pojo.ArrowType.Time; +import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; +import org.apache.arrow.vector.types.pojo.ArrowType.Union; +import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; + +/** An enumeration of all logical types supported by this library. 
*/ +public class Types { + + /** + * The actual enumeration of types. + */ + public enum MinorType { + NULL(Null.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new NullVector(field.getName()); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return null; + } + }, + STRUCT(Struct.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new StructVector(field.getName(), allocator, field.getFieldType(), schemaChangeCallback); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new NullableStructWriter((StructVector) vector); + } + }, + TINYINT(new Int(8, true)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new TinyIntVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TinyIntWriterImpl((TinyIntVector) vector); + } + }, + SMALLINT(new Int(16, true)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new SmallIntVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new SmallIntWriterImpl((SmallIntVector) vector); + } + }, + INT(new Int(32, true)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new IntVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new IntWriterImpl((IntVector) vector); + } + }, + BIGINT(new Int(64, true)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new 
BigIntVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new BigIntWriterImpl((BigIntVector) vector); + } + }, + DATEDAY(new Date(DateUnit.DAY)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new DateDayVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new DateDayWriterImpl((DateDayVector) vector); + } + }, + DATEMILLI(new Date(DateUnit.MILLISECOND)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new DateMilliVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new DateMilliWriterImpl((DateMilliVector) vector); + } + }, + TIMESEC(new Time(TimeUnit.SECOND, 32)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new TimeSecVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TimeSecWriterImpl((TimeSecVector) vector); + } + }, + TIMEMILLI(new Time(TimeUnit.MILLISECOND, 32)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new TimeMilliVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TimeMilliWriterImpl((TimeMilliVector) vector); + } + }, + TIMEMICRO(new Time(TimeUnit.MICROSECOND, 64)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new TimeMicroVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TimeMicroWriterImpl((TimeMicroVector) vector); + } + }, + 
TIMENANO(new Time(TimeUnit.NANOSECOND, 64)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new TimeNanoVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TimeNanoWriterImpl((TimeNanoVector) vector); + } + }, + // time in second from the Unix epoch, 00:00:00.000000 on 1 January 1970, UTC. + TIMESTAMPSEC(new Timestamp(org.apache.arrow.vector.types.TimeUnit.SECOND, null)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new TimeStampSecVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TimeStampSecWriterImpl((TimeStampSecVector) vector); + } + }, + // time in millis from the Unix epoch, 00:00:00.000 on 1 January 1970, UTC. + TIMESTAMPMILLI(new Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, null)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new TimeStampMilliVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TimeStampMilliWriterImpl((TimeStampMilliVector) vector); + } + }, + // time in microsecond from the Unix epoch, 00:00:00.000000 on 1 January 1970, UTC. + TIMESTAMPMICRO(new Timestamp(org.apache.arrow.vector.types.TimeUnit.MICROSECOND, null)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new TimeStampMicroVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TimeStampMicroWriterImpl((TimeStampMicroVector) vector); + } + }, + // time in nanosecond from the Unix epoch, 00:00:00.000000000 on 1 January 1970, UTC. 
+ TIMESTAMPNANO(new Timestamp(org.apache.arrow.vector.types.TimeUnit.NANOSECOND, null)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new TimeStampNanoVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TimeStampNanoWriterImpl((TimeStampNanoVector) vector); + } + }, + INTERVALDAY(new Interval(IntervalUnit.DAY_TIME)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new IntervalDayVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new IntervalDayWriterImpl((IntervalDayVector) vector); + } + }, + INTERVALMONTHDAYNANO(new Interval(IntervalUnit.MONTH_DAY_NANO)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new IntervalMonthDayNanoVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new IntervalMonthDayNanoWriterImpl((IntervalMonthDayNanoVector) vector); + } + }, + DURATION(null) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new DurationVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new DurationWriterImpl((DurationVector) vector); + } + }, + + + INTERVALYEAR(new Interval(IntervalUnit.YEAR_MONTH)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new IntervalYearVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new IntervalYearWriterImpl((IntervalYearVector) vector); + } + }, + // 4 byte ieee 754 + FLOAT4(new 
FloatingPoint(SINGLE)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new Float4Vector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new Float4WriterImpl((Float4Vector) vector); + } + }, + // 8 byte ieee 754 + FLOAT8(new FloatingPoint(DOUBLE)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new Float8Vector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new Float8WriterImpl((Float8Vector) vector); + } + }, + BIT(Bool.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new BitVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new BitWriterImpl((BitVector) vector); + } + }, + VARCHAR(Utf8.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new VarCharVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new VarCharWriterImpl((VarCharVector) vector); + } + }, + LARGEVARCHAR(LargeUtf8.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new LargeVarCharVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new LargeVarCharWriterImpl((LargeVarCharVector) vector); + } + }, + LARGEVARBINARY(LargeBinary.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new LargeVarBinaryVector(field, allocator); + } + + @Override + public 
FieldWriter getNewFieldWriter(ValueVector vector) { + return new LargeVarBinaryWriterImpl((LargeVarBinaryVector) vector); + } + }, + VARBINARY(Binary.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new VarBinaryVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new VarBinaryWriterImpl((VarBinaryVector) vector); + } + }, + DECIMAL(null) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new DecimalVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new DecimalWriterImpl((DecimalVector) vector); + } + }, + DECIMAL256(null) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new Decimal256Vector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new Decimal256WriterImpl((Decimal256Vector) vector); + } + }, + FIXEDSIZEBINARY(null) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new FixedSizeBinaryVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new FixedSizeBinaryWriterImpl((FixedSizeBinaryVector) vector); + } + }, + UINT1(new Int(8, false)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new UInt1Vector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UInt1WriterImpl((UInt1Vector) vector); + } + }, + UINT2(new Int(16, false)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + 
CallBack schemaChangeCallback) { + return new UInt2Vector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UInt2WriterImpl((UInt2Vector) vector); + } + }, + UINT4(new Int(32, false)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new UInt4Vector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UInt4WriterImpl((UInt4Vector) vector); + } + }, + UINT8(new Int(64, false)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new UInt8Vector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UInt8WriterImpl((UInt8Vector) vector); + } + }, + LIST(List.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new ListVector(field.getName(), allocator, field.getFieldType(), schemaChangeCallback); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UnionListWriter((ListVector) vector); + } + }, + LARGELIST(ArrowType.LargeList.INSTANCE) { + @Override + public FieldVector getNewVector(Field field, BufferAllocator allocator, CallBack schemaChangeCallback) { + return new LargeListVector(field.getName(), allocator, field.getFieldType(), schemaChangeCallback); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UnionLargeListWriter((LargeListVector) vector); + } + }, + FIXED_SIZE_LIST(null) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new FixedSizeListVector(field.getName(), allocator, field.getFieldType(), schemaChangeCallback); + } + + @Override + public FieldWriter 
getNewFieldWriter(ValueVector vector) { + throw new UnsupportedOperationException("FieldWriter not implemented for FixedSizeList " + + "type"); + } + }, + UNION(new Union(Sparse, null)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + if (field.getFieldType().getDictionary() != null) { + throw new UnsupportedOperationException("Dictionary encoding not supported for complex " + + "types"); + } + return new UnionVector(field.getName(), allocator, field.getFieldType(), schemaChangeCallback); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UnionWriter((UnionVector) vector); + } + }, + DENSEUNION(new Union(Dense, null)) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + if (field.getFieldType().getDictionary() != null) { + throw new UnsupportedOperationException("Dictionary encoding not supported for complex " + + "types"); + } + return new DenseUnionVector(field.getName(), allocator, field.getFieldType(), schemaChangeCallback); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new DenseUnionWriter((DenseUnionVector) vector); + } + }, + MAP(null) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new MapVector(field.getName(), allocator, field.getFieldType(), schemaChangeCallback); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UnionListWriter((MapVector) vector); + } + }, + TIMESTAMPSECTZ(null) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new TimeStampSecTZVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new 
TimeStampSecTZWriterImpl((TimeStampSecTZVector) vector); + } + }, + TIMESTAMPMILLITZ(null) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new TimeStampMilliTZVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TimeStampMilliTZWriterImpl((TimeStampMilliTZVector) vector); + } + }, + TIMESTAMPMICROTZ(null) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new TimeStampMicroTZVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TimeStampMicroTZWriterImpl((TimeStampMicroTZVector) vector); + } + }, + TIMESTAMPNANOTZ(null) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new TimeStampNanoTZVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TimeStampNanoTZWriterImpl((TimeStampNanoTZVector) vector); + } + }, + EXTENSIONTYPE(null) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return ((ExtensionType) field.getFieldType().getType()).getNewVector(field.getName(), + field.getFieldType(), allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return ((ExtensionTypeVector) vector).getUnderlyingVector().getMinorType().getNewFieldWriter(vector); + } + }, + ; + + private final ArrowType type; + + MinorType(ArrowType type) { + this.type = type; + } + + /** + * Returns the {@link ArrowType} equivalent of this type. 
+ */ + public final ArrowType getType() { + if (type == null) { + throw new UnsupportedOperationException("Cannot get simple type for type " + name()); + } + return type; + } + + /** Constructs a new vector for the given type. */ + public final FieldVector getNewVector( + String name, + FieldType fieldType, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return getNewVector(new Field(name, fieldType, null), allocator, schemaChangeCallback); + } + + /** Constructs a new vector for the given type. */ + public abstract FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback); + + public abstract FieldWriter getNewFieldWriter(ValueVector vector); + } + + /** + * Maps the ArrowType to the java implementations MinorType. + */ + public static MinorType getMinorTypeForArrowType(ArrowType arrowType) { + return arrowType.accept(new ArrowTypeVisitor() { + @Override + public MinorType visit(Null type) { + return MinorType.NULL; + } + + @Override + public MinorType visit(Struct type) { + return MinorType.STRUCT; + } + + @Override + public MinorType visit(List type) { + return MinorType.LIST; + } + + @Override + public MinorType visit(FixedSizeList type) { + return MinorType.FIXED_SIZE_LIST; + } + + @Override + public MinorType visit(Union type) { + switch (type.getMode()) { + case Sparse: + return MinorType.UNION; + case Dense: + return MinorType.DENSEUNION; + default: + throw new IllegalArgumentException("only Dense or Sparse unions supported: " + type); + } + } + + @Override + public MinorType visit(Map type) { + return MinorType.MAP; + } + + @Override + public MinorType visit(ArrowType.LargeList type) { + return MinorType.LARGELIST; + } + + @Override + public MinorType visit(Int type) { + switch (type.getBitWidth()) { + case 8: + return type.getIsSigned() ? MinorType.TINYINT : MinorType.UINT1; + case 16: + return type.getIsSigned() ? MinorType.SMALLINT : MinorType.UINT2; + case 32: + return type.getIsSigned() ? 
MinorType.INT : MinorType.UINT4; + case 64: + return type.getIsSigned() ? MinorType.BIGINT : MinorType.UINT8; + default: + throw new IllegalArgumentException("only 8, 16, 32, 64 supported: " + type); + } + } + + @Override + public MinorType visit(FloatingPoint type) { + switch (type.getPrecision()) { + case HALF: + throw new UnsupportedOperationException("NYI: " + type); + case SINGLE: + return MinorType.FLOAT4; + case DOUBLE: + return MinorType.FLOAT8; + default: + throw new IllegalArgumentException("unknown precision: " + type); + } + } + + @Override + public MinorType visit(Utf8 type) { + return MinorType.VARCHAR; + } + + @Override + public Types.MinorType visit(LargeUtf8 type) { + return MinorType.LARGEVARCHAR; + } + + @Override + public MinorType visit(Binary type) { + return MinorType.VARBINARY; + } + + @Override + public MinorType visit(LargeBinary type) { + return MinorType.LARGEVARBINARY; + } + + @Override + public MinorType visit(Bool type) { + return MinorType.BIT; + } + + @Override + public MinorType visit(Decimal type) { + if (type.getBitWidth() == 256) { + return MinorType.DECIMAL256; + } + return MinorType.DECIMAL; + } + + @Override + public MinorType visit(FixedSizeBinary type) { + return MinorType.FIXEDSIZEBINARY; + } + + @Override + public MinorType visit(Date type) { + switch (type.getUnit()) { + case DAY: + return MinorType.DATEDAY; + case MILLISECOND: + return MinorType.DATEMILLI; + default: + throw new IllegalArgumentException("unknown unit: " + type); + } + } + + @Override + public MinorType visit(Time type) { + switch (type.getUnit()) { + case SECOND: + return MinorType.TIMESEC; + case MILLISECOND: + return MinorType.TIMEMILLI; + case MICROSECOND: + return MinorType.TIMEMICRO; + case NANOSECOND: + return MinorType.TIMENANO; + default: + throw new IllegalArgumentException("unknown unit: " + type); + } + } + + @Override + public MinorType visit(Timestamp type) { + String tz = type.getTimezone(); + switch (type.getUnit()) { + case SECOND: + 
return tz == null ? MinorType.TIMESTAMPSEC : MinorType.TIMESTAMPSECTZ; + case MILLISECOND: + return tz == null ? MinorType.TIMESTAMPMILLI : MinorType.TIMESTAMPMILLITZ; + case MICROSECOND: + return tz == null ? MinorType.TIMESTAMPMICRO : MinorType.TIMESTAMPMICROTZ; + case NANOSECOND: + return tz == null ? MinorType.TIMESTAMPNANO : MinorType.TIMESTAMPNANOTZ; + default: + throw new IllegalArgumentException("unknown unit: " + type); + } + } + + @Override + public MinorType visit(Interval type) { + switch (type.getUnit()) { + case DAY_TIME: + return MinorType.INTERVALDAY; + case YEAR_MONTH: + return MinorType.INTERVALYEAR; + case MONTH_DAY_NANO: + return MinorType.INTERVALMONTHDAYNANO; + default: + throw new IllegalArgumentException("unknown unit: " + type); + } + } + + @Override + public MinorType visit(Duration type) { + return MinorType.DURATION; + } + + @Override + public MinorType visit(ExtensionType type) { + return MinorType.EXTENSIONTYPE; + } + }); + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/UnionMode.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/UnionMode.java new file mode 100644 index 000000000..19956ac6a --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/UnionMode.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.types; + +/** + * Different memory layouts for Union Vectors. + */ +public enum UnionMode { + /** + * Each child vector is the same length as the overall vector, and there is one 8-bit integer buffer to indicate + * the index of a child vector to use at any given position. + */ + Sparse(org.apache.arrow.flatbuf.UnionMode.Sparse), + /** + * Each child vector is of variable width. The parent vector contains both a child index vector (like in + * {@link #Sparse}) and in addition a slot index buffer to determine the offset into the child vector indicated + * by the index vector. 
+ */ + Dense(org.apache.arrow.flatbuf.UnionMode.Dense); + + private static final UnionMode[] valuesByFlatbufId = new UnionMode[UnionMode.values().length]; + + static { + for (UnionMode v : UnionMode.values()) { + valuesByFlatbufId[v.flatbufID] = v; + } + } + + private final short flatbufID; + + private UnionMode(short flatbufID) { + this.flatbufID = flatbufID; + } + + public short getFlatbufID() { + return flatbufID; + } + + public static UnionMode fromFlatbufID(short id) { + return valuesByFlatbufId[id]; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/DictionaryEncoding.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/DictionaryEncoding.java new file mode 100644 index 000000000..8d41b92d8 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/DictionaryEncoding.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.types.pojo; + +import java.util.Objects; + +import org.apache.arrow.vector.types.pojo.ArrowType.Int; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonGetter; +import com.fasterxml.jackson.annotation.JsonProperty; + +/** + * A POJO representation of Arrow Dictionary metadata. + */ +public class DictionaryEncoding { + + private final long id; + private final boolean ordered; + private final Int indexType; + + /** + * Constructs a new instance. + * + * @param id The ID of the dictionary to use for encoding. + * @param ordered Whether the keys in values in the dictionary are ordered. + * @param indexType (nullable). The integer type to use for indexing in the dictionary. Defaults to a signed + * 32 bit integer. + */ + @JsonCreator + public DictionaryEncoding( + @JsonProperty("id") long id, + @JsonProperty("isOrdered") boolean ordered, + @JsonProperty("indexType") Int indexType) { + this.id = id; + this.ordered = ordered; + this.indexType = indexType == null ? 
new Int(32, true) : indexType; + } + + public long getId() { + return id; + } + + @JsonGetter("isOrdered") + public boolean isOrdered() { + return ordered; + } + + public Int getIndexType() { + return indexType; + } + + @Override + public String toString() { + return "DictionaryEncoding[id=" + id + ",ordered=" + ordered + ",indexType=" + indexType + "]"; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } else if (o == null || getClass() != o.getClass()) { + return false; + } + DictionaryEncoding that = (DictionaryEncoding) o; + return id == that.id && ordered == that.ordered && Objects.equals(indexType, that.indexType); + } + + @Override + public int hashCode() { + return Objects.hash(id, ordered, indexType); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/ExtensionTypeRegistry.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/ExtensionTypeRegistry.java new file mode 100644 index 000000000..f347008b4 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/ExtensionTypeRegistry.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.types.pojo; + +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; + +import org.apache.arrow.vector.types.pojo.ArrowType.ExtensionType; + +/** + * A registry of recognized extension types. + */ +public final class ExtensionTypeRegistry { + private static final ConcurrentMap registry = new ConcurrentHashMap<>(); + + public static void register(ExtensionType type) { + registry.put(type.extensionName(), type); + } + + public static void unregister(ExtensionType type) { + registry.remove(type.extensionName()); + } + + public static ExtensionType lookup(String name) { + return registry.get(name); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java new file mode 100644 index 000000000..3a5ef1153 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.types.pojo; + +import static org.apache.arrow.util.Preconditions.checkNotNull; +import static org.apache.arrow.vector.complex.BaseRepeatedValueVector.DATA_VECTOR_NAME; +import static org.apache.arrow.vector.types.pojo.ArrowType.getTypeForField; +import static org.apache.arrow.vector.types.pojo.Schema.convertMetadata; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Objects; +import java.util.stream.Collectors; + +import org.apache.arrow.flatbuf.KeyValue; +import org.apache.arrow.flatbuf.Type; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.Collections2; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.TypeLayout; +import org.apache.arrow.vector.types.pojo.ArrowType.ExtensionType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonInclude.Include; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.flatbuffers.FlatBufferBuilder; + +/** + * A POJO abstraction for the Flatbuffer description of Vector Type. 
+ */ +public class Field { + + private static final Logger logger = LoggerFactory.getLogger(Field.class); + + public static Field nullablePrimitive(String name, ArrowType.PrimitiveType type) { + return nullable(name, type); + } + + public static Field nullable(String name, ArrowType type) { + return new Field(name, FieldType.nullable(type), null); + } + + private final String name; + private final FieldType fieldType; + private final List children; + + private Field( + String name, + boolean nullable, + ArrowType type, + DictionaryEncoding dictionary, + List children, + Map metadata) { + this(name, new FieldType(nullable, type, dictionary, metadata), children); + } + + @JsonCreator + private Field( + @JsonProperty("name") String name, + @JsonProperty("nullable") boolean nullable, + @JsonProperty("type") ArrowType type, + @JsonProperty("dictionary") DictionaryEncoding dictionary, + @JsonProperty("children") List children, + @JsonProperty("metadata") List> metadata) { + this(name, new FieldType(nullable, type, dictionary, convertMetadata(metadata)), children); + } + + private Field(String name, FieldType fieldType, List children, TypeLayout typeLayout) { + this.name = name; + this.fieldType = checkNotNull(fieldType); + this.children = children == null ? Collections.emptyList() : Collections2.toImmutableList(children); + } + + public Field(String name, FieldType fieldType, List children) { + this(name, fieldType, children, fieldType == null ? null : TypeLayout.getTypeLayout(fieldType.getType())); + } + + /** + * Construct a new vector of this type using the given allocator. + */ + public FieldVector createVector(BufferAllocator allocator) { + FieldVector vector = fieldType.createNewSingleVector(this, allocator, null); + vector.initializeChildrenFromFields(children); + return vector; + } + + /** + * Constructs a new instance from a flatbuffer representation of the field. 
+ */ + public static Field convertField(org.apache.arrow.flatbuf.Field field) { + Map metadata = new HashMap<>(); + for (int i = 0; i < field.customMetadataLength(); i++) { + KeyValue kv = field.customMetadata(i); + String key = kv.key(); + String value = kv.value(); + metadata.put(key == null ? "" : key, value == null ? "" : value); + } + metadata = Collections.unmodifiableMap(metadata); + + String name = field.name(); + boolean nullable = field.nullable(); + ArrowType type = getTypeForField(field); + + if (metadata.containsKey(ExtensionType.EXTENSION_METADATA_KEY_NAME)) { + final String extensionName = metadata.get(ExtensionType.EXTENSION_METADATA_KEY_NAME); + final String extensionMetadata = metadata.getOrDefault(ExtensionType.EXTENSION_METADATA_KEY_METADATA, ""); + ExtensionType extensionType = ExtensionTypeRegistry.lookup(extensionName); + if (extensionType != null) { + type = extensionType.deserialize(type, extensionMetadata); + } else { + // Otherwise, we haven't registered the type + logger.info("Unrecognized extension type: {}", extensionName); + } + } + + DictionaryEncoding dictionary = null; + org.apache.arrow.flatbuf.DictionaryEncoding dictionaryFB = field.dictionary(); + if (dictionaryFB != null) { + ArrowType.Int indexType = null; + org.apache.arrow.flatbuf.Int indexTypeFB = dictionaryFB.indexType(); + if (indexTypeFB != null) { + indexType = new ArrowType.Int(indexTypeFB.bitWidth(), indexTypeFB.isSigned()); + } + dictionary = new DictionaryEncoding(dictionaryFB.id(), dictionaryFB.isOrdered(), indexType); + } + List children = new ArrayList<>(); + for (int i = 0; i < field.childrenLength(); i++) { + Field childField = convertField(field.children(i)); + childField = mutateOriginalNameIfNeeded(field, childField); + children.add(childField); + } + children = Collections.unmodifiableList(children); + return new Field(name, nullable, type, dictionary, children, metadata); + } + + /** + * Helper method to ensure backward compatibility with schemas generated 
prior to ARROW-1347, ARROW-1663. + * + * @param field the field to check + * @param originalChildField original field which name might be mutated + * @return original or mutated field + */ + private static Field mutateOriginalNameIfNeeded(org.apache.arrow.flatbuf.Field field, Field originalChildField) { + if ((field.typeType() == Type.List || field.typeType() == Type.FixedSizeList) && + originalChildField.getName().equals("[DEFAULT]")) { + return + new Field(DATA_VECTOR_NAME, + originalChildField.isNullable(), + originalChildField.getType(), + originalChildField.getDictionary(), + originalChildField.getChildren(), + originalChildField.getMetadata()); + } + return originalChildField; + } + + /** + * Puts this object into builder and returns the length of the serialized flatbuffer. + */ + public int getField(FlatBufferBuilder builder) { + int nameOffset = name == null ? -1 : builder.createString(name); + int typeOffset = getType().getType(builder); + int dictionaryOffset = -1; + DictionaryEncoding dictionary = getDictionary(); + if (dictionary != null) { + int dictionaryType = dictionary.getIndexType().getType(builder); + org.apache.arrow.flatbuf.DictionaryEncoding.startDictionaryEncoding(builder); + org.apache.arrow.flatbuf.DictionaryEncoding.addId(builder, dictionary.getId()); + org.apache.arrow.flatbuf.DictionaryEncoding.addIsOrdered(builder, dictionary.isOrdered()); + org.apache.arrow.flatbuf.DictionaryEncoding.addIndexType(builder, dictionaryType); + dictionaryOffset = org.apache.arrow.flatbuf.DictionaryEncoding.endDictionaryEncoding(builder); + } + int[] childrenData = new int[children.size()]; + for (int i = 0; i < children.size(); i++) { + childrenData[i] = children.get(i).getField(builder); + } + int childrenOffset = org.apache.arrow.flatbuf.Field.createChildrenVector(builder, childrenData); + int[] metadataOffsets = new int[getMetadata().size()]; + Iterator> metadataIterator = getMetadata().entrySet().iterator(); + for (int i = 0; i < 
metadataOffsets.length; i++) { + Entry kv = metadataIterator.next(); + int keyOffset = builder.createString(kv.getKey()); + int valueOffset = builder.createString(kv.getValue()); + KeyValue.startKeyValue(builder); + KeyValue.addKey(builder, keyOffset); + KeyValue.addValue(builder, valueOffset); + metadataOffsets[i] = KeyValue.endKeyValue(builder); + } + int metadataOffset = org.apache.arrow.flatbuf.Field.createCustomMetadataVector(builder, metadataOffsets); + org.apache.arrow.flatbuf.Field.startField(builder); + if (name != null) { + org.apache.arrow.flatbuf.Field.addName(builder, nameOffset); + } + org.apache.arrow.flatbuf.Field.addNullable(builder, isNullable()); + org.apache.arrow.flatbuf.Field.addTypeType(builder, getType().getTypeID().getFlatbufID()); + org.apache.arrow.flatbuf.Field.addType(builder, typeOffset); + org.apache.arrow.flatbuf.Field.addChildren(builder, childrenOffset); + org.apache.arrow.flatbuf.Field.addCustomMetadata(builder, metadataOffset); + if (dictionary != null) { + org.apache.arrow.flatbuf.Field.addDictionary(builder, dictionaryOffset); + } + return org.apache.arrow.flatbuf.Field.endField(builder); + } + + public String getName() { + return name; + } + + public boolean isNullable() { + return fieldType.isNullable(); + } + + public ArrowType getType() { + return fieldType.getType(); + } + + @JsonIgnore + public FieldType getFieldType() { + return fieldType; + } + + @JsonInclude(Include.NON_NULL) + public DictionaryEncoding getDictionary() { + return fieldType.getDictionary(); + } + + public List getChildren() { + return children; + } + + @JsonIgnore + public Map getMetadata() { + return fieldType.getMetadata(); + } + + @JsonProperty("metadata") + @JsonInclude(Include.NON_EMPTY) + List> getMetadataForJson() { + return convertMetadata(getMetadata()); + } + + @Override + public int hashCode() { + return Objects.hash(name, isNullable(), getType(), getDictionary(), getMetadata(), children); + } + + @Override + public boolean equals(Object obj) 
{ + if (!(obj instanceof Field)) { + return false; + } + Field that = (Field) obj; + return Objects.equals(this.name, that.name) && + Objects.equals(this.isNullable(), that.isNullable()) && + Objects.equals(this.getType(), that.getType()) && + Objects.equals(this.getDictionary(), that.getDictionary()) && + Objects.equals(this.getMetadata(), that.getMetadata()) && + Objects.equals(this.children, that.children); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + if (name != null) { + sb.append(name).append(": "); + } + sb.append(getType()); + if (getDictionary() != null) { + sb.append("[dictionary: ").append(getDictionary().getId()).append("]"); + } + if (!children.isEmpty()) { + sb.append("<").append(children.stream() + .map(t -> t.toString()) + .collect(Collectors.joining(", "))) + .append(">"); + } + if (!isNullable()) { + sb.append(" not null"); + } + return sb.toString(); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/FieldType.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/FieldType.java new file mode 100644 index 000000000..bb3250ef1 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/FieldType.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.types.pojo; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.Collections2; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType.ExtensionType; +import org.apache.arrow.vector.util.CallBack; + +/** + * POJO representation of an Arrow field type. It consists of a logical type, nullability and whether the field + * (column) is dictionary encoded. + */ +public class FieldType { + + public static FieldType nullable(ArrowType type) { + return new FieldType(true, type, null, null); + } + + private final boolean nullable; + private final ArrowType type; + private final DictionaryEncoding dictionary; + private final Map metadata; + + public FieldType(boolean nullable, ArrowType type, DictionaryEncoding dictionary) { + this(nullable, type, dictionary, null); + } + + /** + * Constructs a new instance. + * + * @param nullable Whether the Vector is nullable + * @param type The logical arrow type of the field. + * @param dictionary The dictionary encoding of the field. + * @param metadata Custom metadata for the field. 
+ */ + public FieldType(boolean nullable, ArrowType type, DictionaryEncoding dictionary, Map metadata) { + super(); + this.nullable = nullable; + this.type = Preconditions.checkNotNull(type); + this.dictionary = dictionary; + if (type instanceof ExtensionType) { + // Save the extension type name/metadata + final Map extensionMetadata = new HashMap<>(); + extensionMetadata.put(ExtensionType.EXTENSION_METADATA_KEY_NAME, ((ExtensionType) type).extensionName()); + extensionMetadata.put(ExtensionType.EXTENSION_METADATA_KEY_METADATA, ((ExtensionType) type).serialize()); + if (metadata != null) { + extensionMetadata.putAll(metadata); + } + this.metadata = Collections.unmodifiableMap(extensionMetadata); + } else { + this.metadata = metadata == null ? java.util.Collections.emptyMap() : Collections2.immutableMapCopy(metadata); + } + } + + public boolean isNullable() { + return nullable; + } + + public ArrowType getType() { + return type; + } + + public DictionaryEncoding getDictionary() { + return dictionary; + } + + public Map getMetadata() { + return metadata; + } + + public FieldVector createNewSingleVector(String name, BufferAllocator allocator, CallBack schemaCallBack) { + MinorType minorType = Types.getMinorTypeForArrowType(type); + return minorType.getNewVector(name, this, allocator, schemaCallBack); + } + + public FieldVector createNewSingleVector(Field field, BufferAllocator allocator, CallBack schemaCallBack) { + MinorType minorType = Types.getMinorTypeForArrowType(type); + return minorType.getNewVector(field, allocator, schemaCallBack); + } + + @Override + public int hashCode() { + return Objects.hash(nullable, type, dictionary, metadata); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof FieldType)) { + return false; + } + FieldType that = (FieldType) obj; + return Objects.equals(this.isNullable(), that.isNullable()) && + Objects.equals(this.getType(), that.getType()) && + Objects.equals(this.getDictionary(), that.getDictionary()) && 
+ Objects.equals(this.getMetadata(), that.getMetadata()); + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java new file mode 100644 index 000000000..d377b395c --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java @@ -0,0 +1,247 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.types.pojo; + + +import static org.apache.arrow.vector.types.pojo.Field.convertField; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +import org.apache.arrow.flatbuf.KeyValue; +import org.apache.arrow.util.Collections2; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.ipc.message.FBSerializables; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonInclude.Include; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; +import com.fasterxml.jackson.databind.ObjectWriter; +import com.google.flatbuffers.FlatBufferBuilder; + +/** + * An Arrow Schema. + */ +public class Schema { + + /** + * Search for a field by name in given the list of fields. 
+ * + * @param fields the list of the fields + * @param name the name of the field to return + * @return the corresponding field + * @throws IllegalArgumentException if the field was not found + */ + public static Field findField(List fields, String name) { + for (Field field : fields) { + if (field.getName().equals(name)) { + return field; + } + } + throw new IllegalArgumentException(String.format("field %s not found in %s", name, fields)); + } + + static final String METADATA_KEY = "key"; + static final String METADATA_VALUE = "value"; + + private static final ObjectMapper mapper = new ObjectMapper(); + private static final ObjectWriter writer = mapper.writerWithDefaultPrettyPrinter(); + private static final ObjectReader reader = mapper.readerFor(Schema.class); + + public static Schema fromJSON(String json) throws IOException { + return reader.readValue(Preconditions.checkNotNull(json)); + } + + public static Schema deserialize(ByteBuffer buffer) { + return convertSchema(org.apache.arrow.flatbuf.Schema.getRootAsSchema(buffer)); + } + + /** Converts a flatbuffer schema to its POJO representation. */ + public static Schema convertSchema(org.apache.arrow.flatbuf.Schema schema) { + List fields = new ArrayList<>(); + for (int i = 0; i < schema.fieldsLength(); i++) { + fields.add(convertField(schema.fields(i))); + } + Map metadata = new HashMap<>(); + for (int i = 0; i < schema.customMetadataLength(); i++) { + KeyValue kv = schema.customMetadata(i); + String key = kv.key(); + String value = kv.value(); + metadata.put(key == null ? "" : key, value == null ? "" : value); + } + return new Schema(true, Collections.unmodifiableList(fields), Collections.unmodifiableMap(metadata)); + } + + private final List fields; + private final Map metadata; + + public Schema(Iterable fields) { + this(fields, (Map) null); + } + + /** + * Constructor with metadata. 
+ */ + public Schema(Iterable fields, + Map metadata) { + this(true, + Collections2.toImmutableList(fields), + metadata == null ? Collections.emptyMap() : Collections2.immutableMapCopy(metadata)); + } + + + /** + * Constructor used for JSON deserialization. + */ + @JsonCreator + private Schema(@JsonProperty("fields") Iterable fields, + @JsonProperty("metadata") List> metadata) { + this(fields, convertMetadata(metadata)); + } + + + /** + * Private constructor to bypass automatic collection copy. + * @param unsafe a ignored argument. Its only purpose is to prevent using the constructor + * by accident because of type collisions (List vs Iterable). + */ + private Schema(boolean unsafe, List fields, Map metadata) { + this.fields = fields; + this.metadata = metadata; + } + + static Map convertMetadata(List> metadata) { + return (metadata == null) ? null : metadata.stream() + .map(e -> new AbstractMap.SimpleImmutableEntry<>(e.get(METADATA_KEY), e.get(METADATA_VALUE))) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + + static List> convertMetadata(Map metadata) { + return (metadata == null) ? null : metadata.entrySet() + .stream() + .map(Schema::convertEntryToKeyValueMap) + .collect(Collectors.toList()); + } + + private static Map convertEntryToKeyValueMap(Map.Entry entry) { + Map map = new HashMap<>(2); + map.put(METADATA_KEY, entry.getKey()); + map.put(METADATA_VALUE, entry.getValue()); + return Collections.unmodifiableMap(map); + } + + public List getFields() { + return fields; + } + + @JsonIgnore + public Map getCustomMetadata() { + return metadata; + } + + @JsonProperty("metadata") + @JsonInclude(Include.NON_EMPTY) + List> getCustomMetadataForJson() { + return convertMetadata(getCustomMetadata()); + } + + /** + * Search for a field by name in this Schema. 
+ * + * @param name the name of the field to return + * @return the corresponding field + * @throws IllegalArgumentException if the field was not found + */ + public Field findField(String name) { + return findField(getFields(), name); + } + + /** + * Returns the JSON string representation of this schema. + */ + public String toJson() { + try { + return writer.writeValueAsString(this); + } catch (JsonProcessingException e) { + // this should not happen + throw new RuntimeException(e); + } + } + + /** + * Adds this schema to the builder returning the size of the builder after adding. + */ + public int getSchema(FlatBufferBuilder builder) { + int[] fieldOffsets = new int[fields.size()]; + for (int i = 0; i < fields.size(); i++) { + fieldOffsets[i] = fields.get(i).getField(builder); + } + int fieldsOffset = org.apache.arrow.flatbuf.Schema.createFieldsVector(builder, fieldOffsets); + int metadataOffset = FBSerializables.writeKeyValues(builder, metadata); + org.apache.arrow.flatbuf.Schema.startSchema(builder); + org.apache.arrow.flatbuf.Schema.addFields(builder, fieldsOffset); + org.apache.arrow.flatbuf.Schema.addCustomMetadata(builder, metadataOffset); + return org.apache.arrow.flatbuf.Schema.endSchema(builder); + } + + /** + * Returns the serialized flatbuffer representation of this schema. + */ + public byte[] toByteArray() { + FlatBufferBuilder builder = new FlatBufferBuilder(); + int schemaOffset = this.getSchema(builder); + builder.finish(schemaOffset); + ByteBuffer bb = builder.dataBuffer(); + byte[] bytes = new byte[bb.remaining()]; + bb.get(bytes); + return bytes; + } + + @Override + public int hashCode() { + return Objects.hash(fields, metadata); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof Schema)) { + return false; + } + return Objects.equals(this.fields, ((Schema) obj).fields) && + Objects.equals(this.metadata, ((Schema) obj).metadata); + } + + @Override + public String toString() { + String meta = metadata.isEmpty() ? 
"" : "(metadata: " + metadata.toString() + ")"; + return "Schema<" + fields.stream().map(t -> t.toString()).collect(Collectors.joining(", ")) + ">" + meta; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/ByteArrayReadableSeekableByteChannel.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/ByteArrayReadableSeekableByteChannel.java new file mode 100644 index 000000000..b8ce9bde4 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/ByteArrayReadableSeekableByteChannel.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.SeekableByteChannel; + +/** + * A {@link SeekableByteChannel} backed by a byte array. + */ +public class ByteArrayReadableSeekableByteChannel implements SeekableByteChannel { + private byte[] byteArray; + private int position = 0; + + /** + * Construct a new object using the given byteArray as a backing store. 
/**
 * A read-only {@link SeekableByteChannel} backed by a byte array.
 *
 * <p>The array passed to the constructor is used directly (it is not copied). Closing the channel
 * drops the reference to the array. The class performs no synchronization.
 */
public class ByteArrayReadableSeekableByteChannel implements SeekableByteChannel {
  private byte[] byteArray;
  // Kept as a long so that any non-negative position accepted by position(long) is stored exactly.
  private long position = 0;

  /**
   * Construct a new object using the given byteArray as a backing store.
   *
   * @param byteArray the bytes to read from
   * @throws NullPointerException if {@code byteArray} is {@code null}
   */
  public ByteArrayReadableSeekableByteChannel(byte[] byteArray) {
    if (byteArray == null) {
      throw new NullPointerException();
    }
    this.byteArray = byteArray;
  }

  @Override
  public boolean isOpen() {
    return byteArray != null;
  }

  @Override
  public void close() throws IOException {
    byteArray = null;
  }

  /**
   * Copies bytes from the current position into {@code dst} and advances the position.
   *
   * @param dst the buffer to fill
   * @return the number of bytes copied; {@code 0} if {@code dst} has no room; {@code -1} if the
   *     position is at or beyond the end of the array
   * @throws java.nio.channels.ClosedChannelException if the channel has been closed
   */
  @Override
  public int read(final ByteBuffer dst) throws IOException {
    final byte[] data = openArray();
    if (!dst.hasRemaining()) {
      return 0;
    }
    if (this.position >= data.length) {
      // End of stream, including the case where the position was moved past the end of the array.
      return -1;
    }
    final int offset = (int) this.position; // safe: position < data.length <= Integer.MAX_VALUE
    final int length = Math.min(dst.remaining(), data.length - offset);
    dst.put(data, offset, length);
    this.position += length;
    return length;
  }

  @Override
  public long position() throws IOException {
    return this.position;
  }

  /**
   * Sets the position of the next read.
   *
   * @param newPosition the new position; may be larger than {@link #size()}, in which case
   *     subsequent reads report end of stream
   * @return this channel
   * @throws IllegalArgumentException if {@code newPosition} is negative
   */
  @Override
  public SeekableByteChannel position(final long newPosition) throws IOException {
    if (newPosition < 0) {
      throw new IllegalArgumentException("Position must be non-negative: " + newPosition);
    }
    this.position = newPosition;
    return this;
  }

  /**
   * Returns the length of the backing array.
   *
   * @throws java.nio.channels.ClosedChannelException if the channel has been closed
   */
  @Override
  public long size() throws IOException {
    return openArray().length;
  }

  @Override
  public int write(final ByteBuffer src) throws IOException {
    throw new UnsupportedOperationException("Read only");
  }

  @Override
  public SeekableByteChannel truncate(final long size) throws IOException {
    throw new UnsupportedOperationException("Read only");
  }

  /** Returns the backing array, or throws ClosedChannelException once close() has cleared it. */
  private byte[] openArray() throws java.nio.channels.ClosedChannelException {
    final byte[] data = this.byteArray;
    if (data == null) {
      throw new java.nio.channels.ClosedChannelException();
    }
    return data;
  }

}
/**
 * Generic callback interface to be notified of events on value vectors.
 *
 * <p>The interface has a single abstract method, so it can be implemented with a lambda or a
 * method reference.
 */
@FunctionalInterface
public interface CallBack {
  /** Invoked to notify the callback; takes no arguments and returns nothing. */
  void doWork();
}
/**
 * Utilities for rounding data size.
 *
 * <p>All methods use plain bit arithmetic and perform no range checks; see the individual methods
 * for the behaviour near the limits of {@code int} and {@code long}.
 */
public final class DataSizeRoundingUtil {

  /**
   * The mask for rounding an integer to a multiple of 8.
   * (i.e. clear the lowest 3 bits)
   */
  public static final int ROUND_8_MASK_INT = 0xFFFFFFF8;

  /**
   * The mask for rounding a long integer to a multiple of 8.
   * (i.e. clear the lowest 3 bits)
   */
  public static final long ROUND_8_MASK_LONG = 0xFFFFFFFFFFFFFFF8L;

  /**
   * The number of bits to shift for dividing by 8.
   */
  public static final int DIVIDE_BY_8_SHIFT_BITS = 3;

  /**
   * Round up the number to the nearest multiple of 8.
   *
   * <p>{@code input + 7} is ordinary {@code int} addition, so it wraps for inputs greater than
   * {@code Integer.MAX_VALUE - 7}.
   *
   * @param input the number to round.
   * @return the rounded number.
   */
  public static int roundUpTo8Multiple(int input) {
    return (input + 7) & ROUND_8_MASK_INT;
  }

  /**
   * Round up the number to the nearest multiple of 8.
   *
   * <p>{@code input + 7} is ordinary {@code long} addition, so it wraps for inputs greater than
   * {@code Long.MAX_VALUE - 7}.
   *
   * @param input the number to round.
   * @return the rounded number
   */
  public static long roundUpTo8Multiple(long input) {
    return (input + 7L) & ROUND_8_MASK_LONG;
  }

  /**
   * Round down the number to the nearest multiple of 8.
   * @param input the number to round.
   * @return the rounded number.
   */
  public static int roundDownTo8Multiple(int input) {
    return input & ROUND_8_MASK_INT;
  }

  /**
   * Round down the number to the nearest multiple of 8.
   * @param input the number to round.
   * @return the rounded number
   */
  public static long roundDownTo8Multiple(long input) {
    return input & ROUND_8_MASK_LONG;
  }

  /**
   * A fast way to compute Math.ceil(input / 8.0).
   *
   * <p>The shift is unsigned, so the result is also correct when {@code input + 7} wraps past
   * {@code Integer.MAX_VALUE}; negative inputs are not meaningful.
   *
   * @param input the input number.
   * @return the computed number.
   */
  public static int divideBy8Ceil(int input) {
    return (input + 7) >>> DIVIDE_BY_8_SHIFT_BITS;
  }

  /**
   * A fast way to compute Math.ceil(input / 8.0).
   *
   * <p>The shift is unsigned, so the result is also correct when {@code input + 7} wraps past
   * {@code Long.MAX_VALUE}; negative inputs are not meaningful.
   *
   * @param input the input number.
   * @return the computed number.
   */
  public static long divideBy8Ceil(long input) {
    return (input + 7) >>> (long) DIVIDE_BY_8_SHIFT_BITS;
  }

  private DataSizeRoundingUtil() {

  }
}
*/ +public class DateUtility { + private DateUtility() {} + + private static final String UTC = "UTC"; + + public static final DateTimeFormatter formatDate = DateTimeFormatter.ofPattern("yyyy-MM-dd"); + public static final DateTimeFormatter formatTimeStampMilli = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS"); + public static final DateTimeFormatter formatTimeStampTZ = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS ZZZ"); + public static final DateTimeFormatter formatTime = DateTimeFormatter.ofPattern("HH:mm:ss.SSS"); + + public static DateTimeFormatter dateTimeTZFormat = null; + public static DateTimeFormatter timeFormat = null; + + public static final int yearsToMonths = 12; + public static final int hoursToMillis = 60 * 60 * 1000; + public static final int minutesToMillis = 60 * 1000; + public static final int secondsToMillis = 1000; + public static final int monthToStandardDays = 30; + public static final long monthsToMillis = 2592000000L; // 30 * 24 * 60 * 60 * 1000 + public static final int daysToStandardMillis = 24 * 60 * 60 * 1000; + + /** Returns the date time formatter used to parse date strings. */ + public static DateTimeFormatter getDateTimeFormatter() { + + if (dateTimeTZFormat == null) { + DateTimeFormatter dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd"); + DateTimeFormatter optionalTime = DateTimeFormatter.ofPattern(" HH:mm:ss"); + DateTimeFormatter optionalSec = DateTimeFormatter.ofPattern(".SSS"); + DateTimeFormatter optionalZone = DateTimeFormatter.ofPattern(" ZZZ"); + + dateTimeTZFormat = new DateTimeFormatterBuilder().append(dateFormatter).appendOptional(optionalTime) + .appendOptional(optionalSec).appendOptional(optionalZone).toFormatter(); + } + + return dateTimeTZFormat; + } + + /** Returns time formatter used to parse time strings. 
*/ + public static DateTimeFormatter getTimeFormatter() { + if (timeFormat == null) { + DateTimeFormatter timeFormatter = DateTimeFormatter.ofPattern("HH:mm:ss"); + DateTimeFormatter optionalSec = DateTimeFormatter.ofPattern(".SSS"); + timeFormat = new DateTimeFormatterBuilder().append(timeFormatter).appendOptional(optionalSec).toFormatter(); + } + return timeFormat; + } + + /** + * Convert milliseconds from epoch to a LocalDateTime with timeZone offset. + * + * @param epochMillis milliseconds from epoch + * @param timeZone current timeZone + * @return LocalDateTime object with timeZone offset + */ + public static LocalDateTime getLocalDateTimeFromEpochMilli(long epochMillis, String timeZone) { + final LocalDateTime localDateTime = LocalDateTime.ofInstant( + Instant.ofEpochMilli(epochMillis), TimeZone.getTimeZone(timeZone).toZoneId()); + return localDateTime; + } + + /** + * Convert milliseconds from epoch to a LocalDateTime with UTC offset. + */ + public static LocalDateTime getLocalDateTimeFromEpochMilli(long epochMillis) { + return getLocalDateTimeFromEpochMilli(epochMillis, UTC); + } + + /** + * Convert microseconds from epoch to a LocalDateTime with timeZone offset. + * + * @param epochMicros microseconds from epoch + * @param timeZone current timeZone + * @return LocalDateTime object with timeZone offset + */ + public static LocalDateTime getLocalDateTimeFromEpochMicro(long epochMicros, String timeZone) { + final long millis = java.util.concurrent.TimeUnit.MICROSECONDS.toMillis(epochMicros); + final long addl_micros = epochMicros - (millis * 1000); + return DateUtility.getLocalDateTimeFromEpochMilli(millis, timeZone).plus(addl_micros, ChronoUnit.MICROS); + } + + /** + * Convert microseconds from epoch to a LocalDateTime with UTC offset. 
+ */ + public static LocalDateTime getLocalDateTimeFromEpochMicro(long epochMicros) { + return getLocalDateTimeFromEpochMicro(epochMicros, UTC); + } + + /** + * Convert nanoseconds from epoch to a LocalDateTime with timeZone offset. + * + * @param epochNanos nanoseconds from epoch + * @param timeZone current timeZone + * @return LocalDateTime object with timeZone offset + */ + public static LocalDateTime getLocalDateTimeFromEpochNano(long epochNanos, String timeZone) { + final long millis = java.util.concurrent.TimeUnit.NANOSECONDS.toMillis(epochNanos); + final long addl_nanos = epochNanos - (millis * 1000 * 1000); + return DateUtility.getLocalDateTimeFromEpochMilli(millis, timeZone).plusNanos(addl_nanos); + } + + /** + * Convert nanoseconds from epoch to a LocalDateTime with UTC offset. + */ + public static LocalDateTime getLocalDateTimeFromEpochNano(long epochNanos) { + return getLocalDateTimeFromEpochNano(epochNanos, UTC); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java new file mode 100644 index 000000000..f778bcb20 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import org.apache.arrow.memory.ArrowBuf; + +import io.netty.util.internal.PlatformDependent; + +/** + * Utility methods for configurable precision Decimal values (e.g. {@link BigDecimal}). + */ +public class DecimalUtility { + private DecimalUtility() {} + + public static final byte [] zeroes = new byte[] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + public static final byte [] minus_one = new byte[] {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; + private static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; + + /** + * Read an ArrowType.Decimal at the given value index in the ArrowBuf and convert to a BigDecimal + * with the given scale. 
+ */ + public static BigDecimal getBigDecimalFromArrowBuf(ArrowBuf bytebuf, int index, int scale, int byteWidth) { + byte[] value = new byte[byteWidth]; + byte temp; + final long startIndex = (long) index * byteWidth; + + bytebuf.getBytes(startIndex, value, 0, byteWidth); + if (LITTLE_ENDIAN) { + // Decimal stored as native endian, need to swap bytes to make BigDecimal if native endian is LE + int stop = byteWidth / 2; + for (int i = 0, j; i < stop; i++) { + temp = value[i]; + j = (byteWidth - 1) - i; + value[i] = value[j]; + value[j] = temp; + } + } + BigInteger unscaledValue = new BigInteger(value); + return new BigDecimal(unscaledValue, scale); + } + + /** + * Read an ArrowType.Decimal from the ByteBuffer and convert to a BigDecimal with the given + * scale. + */ + public static BigDecimal getBigDecimalFromByteBuffer(ByteBuffer bytebuf, int scale, int byteWidth) { + byte[] value = new byte[byteWidth]; + bytebuf.get(value); + BigInteger unscaledValue = new BigInteger(value); + return new BigDecimal(unscaledValue, scale); + } + + /** + * Read an ArrowType.Decimal from the ArrowBuf at the given value index and return it as a byte + * array. + */ + public static byte[] getByteArrayFromArrowBuf(ArrowBuf bytebuf, int index, int byteWidth) { + final byte[] value = new byte[byteWidth]; + final long startIndex = (long) index * byteWidth; + bytebuf.getBytes(startIndex, value, 0, byteWidth); + return value; + } + + /** + * Check that the BigDecimal scale equals the vectorScale and that the BigDecimal precision is + * less than or equal to the vectorPrecision. If not, then an UnsupportedOperationException is + * thrown, otherwise returns true. 
+ */ + public static boolean checkPrecisionAndScale(BigDecimal value, int vectorPrecision, int vectorScale) { + if (value.scale() != vectorScale) { + throw new UnsupportedOperationException("BigDecimal scale must equal that in the Arrow vector: " + + value.scale() + " != " + vectorScale); + } + if (value.precision() > vectorPrecision) { + throw new UnsupportedOperationException("BigDecimal precision can not be greater than that in the Arrow " + + "vector: " + value.precision() + " > " + vectorPrecision); + } + return true; + } + + /** + * Check that the decimal scale equals the vectorScale and that the decimal precision is + * less than or equal to the vectorPrecision. If not, then an UnsupportedOperationException is + * thrown, otherwise returns true. + */ + public static boolean checkPrecisionAndScale(int decimalPrecision, int decimalScale, int vectorPrecision, + int vectorScale) { + if (decimalScale != vectorScale) { + throw new UnsupportedOperationException("BigDecimal scale must equal that in the Arrow vector: " + + decimalScale + " != " + vectorScale); + } + if (decimalPrecision > vectorPrecision) { + throw new UnsupportedOperationException("BigDecimal precision can not be greater than that in the Arrow " + + "vector: " + decimalPrecision + " > " + vectorPrecision); + } + return true; + } + + /** + * Write the given BigDecimal to the ArrowBuf at the given value index. Will throw an + * UnsupportedOperationException if the decimal size is greater than the Decimal vector byte + * width. + */ + public static void writeBigDecimalToArrowBuf(BigDecimal value, ArrowBuf bytebuf, int index, int byteWidth) { + final byte[] bytes = value.unscaledValue().toByteArray(); + writeByteArrayToArrowBufHelper(bytes, bytebuf, index, byteWidth); + } + + /** + * Write the given long to the ArrowBuf at the given value index. + * This routine extends the original sign bit to a new upper area in 128-bit or 256-bit. 
+ */ + public static void writeLongToArrowBuf(long value, ArrowBuf bytebuf, int index, int byteWidth) { + if (byteWidth != 16 && byteWidth != 32) { + throw new UnsupportedOperationException("DecimalUtility.writeLongToArrowBuf() currently supports " + + "128-bit or 256-bit width data"); + } + final long addressOfValue = bytebuf.memoryAddress() + (long) index * byteWidth; + final long padValue = Long.signum(value) == -1 ? -1L : 0L; + if (LITTLE_ENDIAN) { + PlatformDependent.putLong(addressOfValue, value); + for (int i = 1; i <= (byteWidth - 8) / 8; i++) { + PlatformDependent.putLong(addressOfValue + Long.BYTES * i, padValue); + } + } else { + for (int i = 0; i < (byteWidth - 8) / 8; i++) { + PlatformDependent.putLong(addressOfValue + Long.BYTES * i, padValue); + } + PlatformDependent.putLong(addressOfValue + Long.BYTES * (byteWidth - 8) / 8, value); + } + } + + /** + * Write the given byte array to the ArrowBuf at the given value index. Will throw an + * UnsupportedOperationException if the decimal size is greater than the Decimal vector byte + * width. + */ + public static void writeByteArrayToArrowBuf(byte[] bytes, ArrowBuf bytebuf, int index, int byteWidth) { + writeByteArrayToArrowBufHelper(bytes, bytebuf, index, byteWidth); + } + + private static void writeByteArrayToArrowBufHelper(byte[] bytes, ArrowBuf bytebuf, int index, int byteWidth) { + final long startIndex = (long) index * byteWidth; + if (bytes.length > byteWidth) { + throw new UnsupportedOperationException("Decimal size greater than " + byteWidth + " bytes: " + bytes.length); + } + + byte [] padBytes = bytes[0] < 0 ? 
minus_one : zeroes; + if (LITTLE_ENDIAN) { + // Decimal stored as native-endian, need to swap data bytes before writing to ArrowBuf if LE + byte[] bytesLE = new byte[bytes.length]; + for (int i = 0; i < bytes.length; i++) { + bytesLE[i] = bytes[bytes.length - 1 - i]; + } + + // Write LE data + bytebuf.setBytes(startIndex, bytesLE, 0, bytes.length); + bytebuf.setBytes(startIndex + bytes.length, padBytes, 0, byteWidth - bytes.length); + } else { + // Write BE data + bytebuf.setBytes(startIndex + byteWidth - bytes.length, bytes, 0, bytes.length); + bytebuf.setBytes(startIndex, padBytes, 0, byteWidth - bytes.length); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java new file mode 100644 index 000000000..9592f3975 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.util; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; + +/** + * Utility methods for working with Dictionaries used in Dictionary encodings. + */ +public class DictionaryUtility { + private DictionaryUtility() {} + + /** + * Convert field and child fields that have a dictionary encoding to message format, so fields + * have the dictionary type. + * + *

NOTE: in the message format, fields have the dictionary type + * in the memory format, they have the index type + */ + public static Field toMessageFormat(Field field, DictionaryProvider provider, Set dictionaryIdsUsed) { + if (!needConvertToMessageFormat(field)) { + return field; + } + DictionaryEncoding encoding = field.getDictionary(); + List children; + + + ArrowType type; + if (encoding == null) { + type = field.getType(); + children = field.getChildren(); + } else { + long id = encoding.getId(); + Dictionary dictionary = provider.lookup(id); + if (dictionary == null) { + throw new IllegalArgumentException("Could not find dictionary with ID " + id); + } + type = dictionary.getVectorType(); + children = dictionary.getVector().getField().getChildren(); + + dictionaryIdsUsed.add(id); + } + + final List updatedChildren = new ArrayList<>(children.size()); + for (Field child : children) { + updatedChildren.add(toMessageFormat(child, provider, dictionaryIdsUsed)); + } + + return new Field(field.getName(), new FieldType(field.isNullable(), type, encoding, field.getMetadata()), + updatedChildren); + } + + /** + * Checks if it is required to convert the field to message format. + * @param field the field to check. + * @return true if a conversion is required, and false otherwise. + */ + public static boolean needConvertToMessageFormat(Field field) { + DictionaryEncoding encoding = field.getDictionary(); + + if (encoding != null) { + // when encoding is not null, the type must be determined from the + // dictionary, so conversion must be performed. + return true; + } + + List children = field.getChildren(); + for (Field child : children) { + if (needConvertToMessageFormat(child)) { + return true; + } + } + return false; + } + + /** + * Convert field and child fields that have a dictionary encoding to memory format, so fields + * have the index type. 
+ */ + public static Field toMemoryFormat(Field field, BufferAllocator allocator, Map dictionaries) { + DictionaryEncoding encoding = field.getDictionary(); + List children = field.getChildren(); + + if (encoding == null && children.isEmpty()) { + return field; + } + + List updatedChildren = new ArrayList<>(children.size()); + for (Field child : children) { + updatedChildren.add(toMemoryFormat(child, allocator, dictionaries)); + } + + ArrowType type; + List fieldChildren = null; + if (encoding == null) { + type = field.getType(); + fieldChildren = updatedChildren; + } else { + // re-type the field for in-memory format + type = encoding.getIndexType(); + if (type == null) { + type = new ArrowType.Int(32, true); + } + // get existing or create dictionary vector + if (!dictionaries.containsKey(encoding.getId())) { + // create a new dictionary vector for the values + String dictName = "DICT" + encoding.getId(); + Field dictionaryField = new Field(dictName, + new FieldType(field.isNullable(), field.getType(), null, null), updatedChildren); + FieldVector dictionaryVector = dictionaryField.createVector(allocator); + dictionaries.put(encoding.getId(), new Dictionary(dictionaryVector, encoding)); + } + } + + return new Field(field.getName(), new FieldType(field.isNullable(), type, encoding, field.getMetadata()), + fieldChildren); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/ElementAddressableVectorIterator.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/ElementAddressableVectorIterator.java new file mode 100644 index 000000000..89c100779 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/ElementAddressableVectorIterator.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import java.util.Iterator; + +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.memory.util.hash.SimpleHasher; +import org.apache.arrow.vector.ElementAddressableVector; + +/** + * Iterator for traversing elements of a {@link ElementAddressableVector}. + * @param vector type. + */ +public class ElementAddressableVectorIterator + implements Iterator { + + private final T vector; + + /** + * Index of the next element to access. + */ + private int index = 0; + + private final ArrowBufPointer reusablePointer; + + /** + * Constructs an iterator for the {@link ElementAddressableVector}. + * @param vector the vector to iterate. + */ + public ElementAddressableVectorIterator(T vector) { + this(vector, SimpleHasher.INSTANCE); + } + + /** + * Constructs an iterator for the {@link ElementAddressableVector}. + * @param vector the vector to iterate. + * @param hasher the hasher to calculate the hash code. + */ + public ElementAddressableVectorIterator(T vector, ArrowBufHasher hasher) { + this.vector = vector; + reusablePointer = new ArrowBufPointer(hasher); + } + + @Override + public boolean hasNext() { + return index < vector.getValueCount(); + } + + /** + * Retrieves the next pointer from the vector. + * @return the pointer pointing to the next element in the vector. 
+ * Note that the returned pointer is only valid before the next call to this method. + */ + @Override + public ArrowBufPointer next() { + vector.getDataPointer(index, reusablePointer); + index += 1; + return reusablePointer; + } + + /** + * Retrieves the next pointer from the vector. + * @param outPointer the pointer to populate. + */ + public void next(ArrowBufPointer outPointer) { + vector.getDataPointer(index, outPointer); + index += 1; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java new file mode 100644 index 000000000..2ca71ec63 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.util; + +import java.util.ArrayList; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +/** + * Extension of {@link ArrayList} that {@link #toString()} method returns the serialized JSON + * version of its members (or throws an exception if they can't be converted to JSON). + * + * @param Type of value held in the list. + */ +public class JsonStringArrayList extends ArrayList { + + private static ObjectMapper mapper; + + static { + mapper = new ObjectMapper(); + } + + public JsonStringArrayList() { + super(); + } + + public JsonStringArrayList(int size) { + super(size); + } + + @Override + public final String toString() { + try { + return mapper.writeValueAsString(this); + } catch (JsonProcessingException e) { + throw new IllegalStateException("Cannot serialize array list to JSON string", e); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java new file mode 100644 index 000000000..f41ae4ee2 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import java.util.LinkedHashMap; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +/** + * Simple class that extends the regular java.util.HashMap but overrides the + * toString() method of the HashMap class to produce a JSON string instead + * + * @param The type of the key for the map. + * @param The type of the value for the map. + */ +public class JsonStringHashMap extends LinkedHashMap { + + private static ObjectMapper mapper; + + static { + mapper = new ObjectMapper(); + } + + @Override + public final String toString() { + try { + return mapper.writeValueAsString(this); + } catch (JsonProcessingException e) { + throw new IllegalStateException("Cannot serialize hash map to JSON string", e); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java new file mode 100644 index 000000000..cf157031b --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import java.util.Collection; +import java.util.Map; +import java.util.Set; + +/** + * An implementation of a map that supports constant time look-up by a generic key or an ordinal. + * + *

This class extends the functionality of a regular {@link Map} with ordinal lookup support. + * Upon insertion an unused ordinal is assigned to the inserted (key, value) tuple. + * Upon update the same ordinal id is re-used while value is replaced. + * Upon deletion of an existing item, its corresponding ordinal is recycled and could be used by another item. + * + *

For any instance with N items, this implementation guarantees that ordinals are in the range of [0, N). However, + * the ordinal assignment is dynamic and may change after an insertion or deletion. Consumers of this class are + * responsible for explicitly checking the ordinal corresponding to a key via + * {@link MultiMapWithOrdinal#getOrdinal(Object)} before attempting to execute a lookup + * with an ordinal. + * + * @param key type + * @param value type + */ +public interface MapWithOrdinal { + V getByOrdinal(int id); + + int getOrdinal(K key); + + int size(); + + boolean isEmpty(); + + V get(K key); + + Collection getAll(K key); + + boolean put(K key, V value, boolean overwrite); + + Collection values(); + + boolean containsKey(K key); + + boolean remove(K key, V value); + + boolean removeAll(K key); + + void clear(); + + Set keys(); +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinalImpl.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinalImpl.java new file mode 100644 index 000000000..41ce1fc0d --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinalImpl.java @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +import org.apache.arrow.util.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import io.netty.util.collection.IntObjectHashMap; +import io.netty.util.collection.IntObjectMap; + +/** + * An implementation of map that supports constant time look-up by a generic key or an ordinal. + * + *

This class extends the functionality of a regular {@link Map} with ordinal lookup support. + * Upon insertion an unused ordinal is assigned to the inserted (key, value) tuple. + * Upon update the same ordinal id is re-used while value is replaced. + * Upon deletion of an existing item, its corresponding ordinal is recycled and could be used by another item. + * + *

For any instance with N items, this implementation guarantees that ordinals are in the range of [0, N). However, + * the ordinal assignment is dynamic and may change after an insertion or deletion. Consumers of this class are + * responsible for explicitly checking the ordinal corresponding to a key via + * {@link MapWithOrdinalImpl#getOrdinal(Object)} before attempting to execute a lookup + * with an ordinal. + * + * @param key type + * @param value type + */ +public class MapWithOrdinalImpl implements MapWithOrdinal { + private static final Logger logger = LoggerFactory.getLogger(MapWithOrdinalImpl.class); + + private final Map> primary = new HashMap<>(); + private final IntObjectHashMap secondary = new IntObjectHashMap<>(); + + private final Map delegate = new Map() { + @Override + public boolean isEmpty() { + return size() == 0; + } + + @Override + public int size() { + return primary.size(); + } + + @Override + public boolean containsKey(Object key) { + return primary.containsKey(key); + } + + @Override + public boolean containsValue(Object value) { + return primary.containsValue(value); + } + + @Override + public V get(Object key) { + Entry pair = primary.get(key); + if (pair != null) { + return pair.getValue(); + } + return null; + } + + @Override + public V put(K key, V value) { + final Entry oldPair = primary.get(key); + // if key exists try replacing otherwise, assign a new ordinal identifier + final int ordinal = oldPair == null ? primary.size() : oldPair.getKey(); + primary.put(key, new AbstractMap.SimpleImmutableEntry<>(ordinal, value)); + secondary.put(ordinal, value); + return oldPair == null ? 
null : oldPair.getValue(); + } + + public boolean put(K key, V value, boolean override) { + return put(key, value) != null; + } + + @Override + public V remove(Object key) { + final Entry oldPair = primary.remove(key); + if (oldPair != null) { + final int lastOrdinal = secondary.size(); + final V last = secondary.get(lastOrdinal); + // normalize mappings so that all numbers until primary.size() is assigned + // swap the last element with the deleted one + secondary.put(oldPair.getKey(), last); + primary.put((K) key, new AbstractMap.SimpleImmutableEntry<>(oldPair.getKey(), last)); + } + return oldPair == null ? null : oldPair.getValue(); + } + + @Override + public void putAll(Map m) { + throw new UnsupportedOperationException(); + } + + @Override + public void clear() { + primary.clear(); + secondary.clear(); + } + + @Override + public Set keySet() { + return primary.keySet(); + } + + @Override + public Collection values() { + return StreamSupport.stream(secondary.entries().spliterator(), false) + .map((IntObjectMap.PrimitiveEntry t) -> Preconditions.checkNotNull(t).value()) + .collect(Collectors.toList()); + } + + @Override + public Set> entrySet() { + return primary.entrySet().stream() + .map(entry -> new AbstractMap.SimpleImmutableEntry<>(entry.getKey(), entry.getValue().getValue())) + .collect(Collectors.toSet()); + } + }; + + /** + * Returns the value corresponding to the given ordinal. + * + * @param id ordinal value for lookup + * @return an instance of V + */ + public V getByOrdinal(int id) { + return secondary.get(id); + } + + /** + * Returns the ordinal corresponding to the given key. 
+ * + * @param key key for ordinal lookup + * @return ordinal value corresponding to key if it exists or -1 + */ + public int getOrdinal(K key) { + Map.Entry pair = primary.get(key); + if (pair != null) { + return pair.getKey(); + } + return -1; + } + + @Override + public int size() { + return delegate.size(); + } + + @Override + public boolean isEmpty() { + return delegate.isEmpty(); + } + + @Override + public Collection getAll(K key) { + if (delegate.containsKey(key)) { + List list = new ArrayList<>(1); + list.add(get(key)); + return list; + } + return null; + } + + @Override + public V get(K key) { + return delegate.get(key); + } + + /** + * Inserts the tuple (key, value) into the map extending the semantics of {@link Map#put} with automatic ordinal + * assignment. A new ordinal is assigned if key does not exists. Otherwise the same ordinal is re-used but the value + * is replaced. + * + * @see java.util.Map#put + */ + @Override + public boolean put(K key, V value, boolean overwrite) { + return delegate.put(key, value) != null; + } + + @Override + public Collection values() { + return delegate.values(); + } + + @Override + public boolean remove(K key, V value) { + return false; + } + + @Override + public boolean containsKey(Object key) { + return delegate.containsKey(key); + } + + /** + * Removes the element corresponding to the key if exists extending the semantics of {@link java.util.Map#remove} + * with ordinal re-cycling. The ordinal corresponding to the given key may be re-assigned to another tuple. It is + * important that consumer checks the ordinal value via + * {@link MapWithOrdinalImpl#getOrdinal(Object)} before attempting to look-up by ordinal. 
+ * + * @see java.util.Map#remove + */ + @Override + public boolean removeAll(K key) { + return delegate.remove(key) != null; + } + + @Override + public void clear() { + delegate.clear(); + } + + @Override + public Set keys() { + return delegate.keySet(); + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/MultiMapWithOrdinal.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/MultiMapWithOrdinal.java new file mode 100644 index 000000000..5fbb45a7a --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/MultiMapWithOrdinal.java @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import java.util.Collection; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import io.netty.util.collection.IntObjectHashMap; + +/** + * An implementation of a multimap that supports constant time look-up by a generic key or an ordinal. + * + *

This class extends the functionality of a regular {@link Map} with ordinal lookup support. + * Upon insertion an unused ordinal is assigned to the inserted (key, value) tuple. + * Upon update the same ordinal id is re-used while value is replaced. + * Upon deletion of an existing item, its corresponding ordinal is recycled and could be used by another item. + * + *

For any instance with N items, this implementation guarantees that ordinals are in the range of [0, N). However, + * the ordinal assignment is dynamic and may change after an insertion or deletion. Consumers of this class are + * responsible for explicitly checking the ordinal corresponding to a key via + * {@link MultiMapWithOrdinal#getOrdinal(Object)} before attempting to execute a lookup + * with an ordinal. + * + * @param key type + * @param value type + */ +public class MultiMapWithOrdinal implements MapWithOrdinal { + + private final Map> keyToOrdinal = new LinkedHashMap<>(); + private final IntObjectHashMap ordinalToValue = new IntObjectHashMap<>(); + + /** + * Returns the value corresponding to the given ordinal. + * + * @param id ordinal value for lookup + * @return an instance of V + */ + @Override + public V getByOrdinal(int id) { + return ordinalToValue.get(id); + } + + /** + * Returns the ordinal corresponding to the given key. + * + * @param key key for ordinal lookup + * @return ordinal value corresponding to key if it exists or -1 + */ + @Override + public int getOrdinal(K key) { + Set pair = getOrdinals(key); + if (!pair.isEmpty()) { + return pair.iterator().next(); + } + return -1; + } + + private Set getOrdinals(K key) { + return keyToOrdinal.getOrDefault(key, new HashSet<>()); + } + + @Override + public int size() { + return ordinalToValue.size(); + } + + @Override + public boolean isEmpty() { + return ordinalToValue.isEmpty(); + } + + /** + * get set of values for key. + */ + @Override + public V get(K key) { + Set ordinals = keyToOrdinal.get(key); + if (ordinals == null) { + return null; + } + return ordinals.stream().map(ordinalToValue::get).collect(Collectors.toList()).get(0); + } + + /** + * get set of values for key. 
+ */ + @Override + public Collection getAll(K key) { + Set ordinals = keyToOrdinal.get(key); + if (ordinals == null) { + return null; + } + return ordinals.stream().map(ordinalToValue::get).collect(Collectors.toList()); + } + + /** + * Inserts the tuple (key, value) into the multimap with automatic ordinal assignment. + * + * A new ordinal is assigned if key/value pair does not exists. + * + * If overwrite is true the existing key will be overwritten with value else value will be appended to the multimap. + */ + @Override + public boolean put(K key, V value, boolean overwrite) { + if (overwrite) { + removeAll(key); + } + Set ordinalSet = getOrdinals(key); + int nextOrdinal = ordinalToValue.size(); + ordinalToValue.put(nextOrdinal, value); + boolean changed = ordinalSet.add(nextOrdinal); + keyToOrdinal.put(key, ordinalSet); + return changed; + } + + @Override + public Collection values() { + return ordinalToValue.values(); + } + + @Override + public boolean containsKey(K key) { + return keyToOrdinal.containsKey(key); + } + + /** + * Removes the element corresponding to the key/value if exists with ordinal re-cycling. + * + * The ordinal corresponding to the given key may be re-assigned to another tuple. It is + * important that consumer checks the ordinal value via + * {@link MultiMapWithOrdinal#getOrdinal(Object)} before attempting to look-up by ordinal. + * + * If the multimap is changed return true. 
+ */ + @Override + public synchronized boolean remove(K key, V value) { + Set removalSet = getOrdinals(key); + if (removalSet.isEmpty()) { + return false; + } + Optional removeValue = removalSet.stream().map(ordinalToValue::get).filter(value::equals).findFirst(); + if (!removeValue.isPresent()) { + return false; + } + int removalOrdinal = removeKv(removalSet, key, value); + int lastOrdinal = ordinalToValue.size(); + if (lastOrdinal != removalOrdinal) { //we didn't remove the last ordinal + swapOrdinal(lastOrdinal, removalOrdinal); + } + return true; + } + + private void swapOrdinal(int lastOrdinal, int removalOrdinal) { + V swapOrdinalValue = ordinalToValue.remove(lastOrdinal); + ordinalToValue.put(removalOrdinal, swapOrdinalValue); + K swapOrdinalKey = keyToOrdinal.entrySet() + .stream() + .filter(kv -> kv.getValue().stream().anyMatch(o -> o == lastOrdinal)) + .map(Map.Entry::getKey) + .findFirst() + .orElseThrow(() -> new IllegalStateException("MultimapWithOrdinal in bad state")); + ordinalToValue.put(removalOrdinal, swapOrdinalValue); + Set swapSet = getOrdinals(swapOrdinalKey); + swapSet.remove(lastOrdinal); + swapSet.add(removalOrdinal); + keyToOrdinal.put(swapOrdinalKey, swapSet); + } + + private int removeKv(Set removalSet, K key, V value) { + Integer removalOrdinal = removalSet.stream() + .filter(i -> ordinalToValue.get(i).equals(value)) + .findFirst() + .orElseThrow(() -> new IllegalStateException("MultimapWithOrdinal in bad state")); + ordinalToValue.remove(removalOrdinal); + removalSet.remove(removalOrdinal); + if (removalSet.isEmpty()) { + keyToOrdinal.remove(key); + } else { + keyToOrdinal.put(key, removalSet); + } + return removalOrdinal; + } + + /** + * remove all entries of key. 
+ */ + @Override + public synchronized boolean removeAll(K key) { + Collection values = this.getAll(key); + if (values == null) { + return false; + } + for (V v: values) { + this.remove(key, v); + } + return true; + } + + @Override + public void clear() { + ordinalToValue.clear(); + keyToOrdinal.clear(); + } + + @Override + public Set keys() { + return keyToOrdinal.keySet(); + } + +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java new file mode 100644 index 000000000..a47d3ade0 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + + +/** + * An exception that is used to signal that allocation request in bytes is greater than the maximum allowed by + * {@link org.apache.arrow.memory.BufferAllocator#buffer(int) allocator}. + * + *

Operators should handle this exception to split the batch and later resume the execution on the next + * iteration.

/**
 * Unchecked exception signalling that an allocation request, in bytes, is greater than the
 * maximum the allocator allows.
 *
 * <p>Operators should handle this exception to split the batch and later resume the execution on
 * the next iteration.
 */
public class OversizedAllocationException extends RuntimeException {

  /** Creates an exception with neither a detail message nor a cause. */
  public OversizedAllocationException() {
  }

  /**
   * Creates an exception with a detail message.
   *
   * @param message the detail message
   */
  public OversizedAllocationException(String message) {
    super(message);
  }

  /**
   * Creates an exception wrapping a cause; the detail message is derived from the cause.
   *
   * @param cause the underlying cause, may be null
   */
  public OversizedAllocationException(Throwable cause) {
    super(cause);
  }

  /**
   * Creates an exception with a detail message and a cause.
   *
   * @param message the detail message
   * @param cause the underlying cause, may be null
   */
  public OversizedAllocationException(String message, Throwable cause) {
    super(message, cause);
  }

  /**
   * Creates an exception with full control over suppression and stack-trace capture.
   *
   * @param message the detail message
   * @param cause the underlying cause, may be null
   * @param enableSuppression whether suppressed exceptions are recorded
   * @param writableStackTrace whether the stack trace is captured and writable
   */
  public OversizedAllocationException(
      String message,
      Throwable cause,
      boolean enableSuppression,
      boolean writableStackTrace) {
    super(message, cause, enableSuppression, writableStackTrace);
  }
}
+ */ + +package org.apache.arrow.vector.util; + +import java.util.Collection; +import java.util.Set; + +import org.apache.arrow.vector.complex.AbstractStructVector; + +/** + * Implementation of MapWithOrdinal that allows for promotion to multimap when duplicate fields exist. + * @param key type + * @param value type + */ +public class PromotableMultiMapWithOrdinal implements MapWithOrdinal { + private final MapWithOrdinalImpl mapWithOrdinal = new MapWithOrdinalImpl<>(); + private final MultiMapWithOrdinal multiMapWithOrdinal = new MultiMapWithOrdinal<>(); + private final boolean promotable; + private AbstractStructVector.ConflictPolicy conflictPolicy; + private MapWithOrdinal delegate; + + /** + * Create promotable map. + * @param promotable if promotion is allowed, otherwise delegate to MapWithOrdinal. + * @param conflictPolicy how to handle name conflicts. + */ + public PromotableMultiMapWithOrdinal(boolean promotable, AbstractStructVector.ConflictPolicy conflictPolicy) { + this.promotable = promotable; + this.conflictPolicy = conflictPolicy; + delegate = mapWithOrdinal; + } + + private void promote() { + if (delegate == multiMapWithOrdinal || + !promotable || + conflictPolicy.equals(AbstractStructVector.ConflictPolicy.CONFLICT_REPLACE)) { + return; + } + for (K key : mapWithOrdinal.keys()) { + V value = mapWithOrdinal.get(key); + multiMapWithOrdinal.put(key, value, false); + } + mapWithOrdinal.clear(); + delegate = multiMapWithOrdinal; + } + + @Override + public V getByOrdinal(int id) { + return delegate.getByOrdinal(id); + } + + @Override + public int getOrdinal(K key) { + return delegate.getOrdinal(key); + } + + @Override + public int size() { + return delegate.size(); + } + + @Override + public boolean isEmpty() { + return delegate.isEmpty(); + } + + @Override + public V get(K key) { + return delegate.get(key); + } + + @Override + public Collection getAll(K key) { + return delegate.getAll(key); + } + + @Override + public boolean put(K key, V value, boolean 
overwrite) { + if (delegate.containsKey(key)) { + promote(); + } + return delegate.put(key, value, overwrite); + } + + @Override + public Collection values() { + return delegate.values(); + } + + @Override + public boolean containsKey(K key) { + return delegate.containsKey(key); + } + + @Override + public boolean remove(K key, V value) { + return delegate.remove(key, value); + } + + @Override + public boolean removeAll(K key) { + return delegate.removeAll(key); + } + + @Override + public void clear() { + delegate.clear(); + } + + @Override + public Set keys() { + return delegate.keys(); + } + + public void setConflictPolicy(AbstractStructVector.ConflictPolicy conflictPolicy) { + this.conflictPolicy = conflictPolicy; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java new file mode 100644 index 000000000..c29eb6ad3 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Unchecked exception thrown when child vectors (e.g. in lists) don't match the expected type.
 */
public class SchemaChangeRuntimeException extends RuntimeException {

  /** Creates an exception with neither a detail message nor a cause. */
  public SchemaChangeRuntimeException() {
  }

  /**
   * Creates an exception with a detail message.
   *
   * @param message the detail message
   */
  public SchemaChangeRuntimeException(String message) {
    super(message);
  }

  /**
   * Creates an exception wrapping a cause; the detail message is derived from the cause.
   *
   * @param cause the underlying cause, may be null
   */
  public SchemaChangeRuntimeException(Throwable cause) {
    super(cause);
  }

  /**
   * Creates an exception with a detail message and a cause.
   *
   * @param message the detail message
   * @param cause the underlying cause, may be null
   */
  public SchemaChangeRuntimeException(String message, Throwable cause) {
    super(message, cause);
  }

  /**
   * Creates an exception with full control over suppression and stack-trace capture.
   *
   * @param message the detail message
   * @param cause the underlying cause, may be null
   * @param enableSuppression whether suppressed exceptions are recorded
   * @param writableStackTrace whether the stack trace is captured and writable
   */
  public SchemaChangeRuntimeException(
      String message,
      Throwable cause,
      boolean enableSuppression,
      boolean writableStackTrace) {
    super(message, cause, enableSuppression, writableStackTrace);
  }
}
+ */ + +package org.apache.arrow.vector.util; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.channels.Channels; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.ReadChannel; +import org.apache.arrow.vector.ipc.WriteChannel; +import org.apache.arrow.vector.ipc.message.MessageChannelReader; +import org.apache.arrow.vector.ipc.message.MessageResult; +import org.apache.arrow.vector.ipc.message.MessageSerializer; +import org.apache.arrow.vector.types.pojo.Schema; + +/** + * Schema utility class including serialization and deserialization. + */ +public class SchemaUtility { + private SchemaUtility() {} + + /** + * Deserialize Arrow schema from byte array. + */ + public static Schema deserialize(byte[] bytes, BufferAllocator allocator) throws IOException { + try (MessageChannelReader schemaReader = + new MessageChannelReader( + new ReadChannel( + new ByteArrayReadableSeekableByteChannel(bytes)), allocator)) { + + MessageResult result = schemaReader.readNext(); + if (result == null) { + throw new IOException("Unexpected end of input. Missing schema."); + } + return MessageSerializer.deserializeSchema(result.getMessage()); + } + } + + /** + * Serialize Arrow schema into byte array. + */ + public static byte[] serialize(Schema schema) throws IOException { + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + MessageSerializer.serialize(new WriteChannel(Channels.newChannel(out)), schema); + return out.toByteArray(); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java new file mode 100644 index 000000000..b479305c6 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java @@ -0,0 +1,688 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import java.io.DataInput; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.MalformedInputException; +import java.text.CharacterIterator; +import java.text.StringCharacterIterator; +import java.util.Arrays; + +import com.fasterxml.jackson.core.JsonGenerationException; +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.SerializerProvider; +import com.fasterxml.jackson.databind.annotation.JsonSerialize; +import com.fasterxml.jackson.databind.ser.std.StdSerializer; + +/** + * A simplified byte wrapper similar to Hadoop's Text class without all the dependencies. 
+ * Lifted from Hadoop 2.7.1 + */ +@JsonSerialize(using = Text.TextSerializer.class) +public class Text { + + private static ThreadLocal ENCODER_FACTORY = + new ThreadLocal() { + @Override + protected CharsetEncoder initialValue() { + return Charset.forName("UTF-8").newEncoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + } + }; + + private static ThreadLocal DECODER_FACTORY = + new ThreadLocal() { + @Override + protected CharsetDecoder initialValue() { + return Charset.forName("UTF-8").newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + } + }; + + private static final byte[] EMPTY_BYTES = new byte[0]; + + private byte[] bytes; + private int length; + + public Text() { + bytes = EMPTY_BYTES; + } + + /** + * Construct from a string. + * + * @param string initialize from that string + */ + public Text(String string) { + set(string); + } + + /** + * Construct from another text. + * + * @param utf8 initialize from that Text + */ + public Text(Text utf8) { + set(utf8); + } + + /** + * Construct from a byte array. + * + * @param utf8 initialize from that byte array + */ + public Text(byte[] utf8) { + set(utf8); + } + + /** + * Get a copy of the bytes that is exactly the length of the data. See {@link #getBytes()} for + * faster access to the underlying array. + * + * @return a copy of the underlying array + */ + public byte[] copyBytes() { + byte[] result = new byte[length]; + System.arraycopy(bytes, 0, result, 0, length); + return result; + } + + /** + * Returns the raw bytes; however, only data up to {@link #getLength()} is valid. Please use + * {@link #copyBytes()} if you need the returned array to be precisely the length of the data. + * + * @return the underlying array + */ + public byte[] getBytes() { + return bytes; + } + + /** + * Get the number of bytes in the byte array. 
+ * + * @return the number of bytes in the byte array + */ + public int getLength() { + return length; + } + + /** + * Returns the Unicode Scalar Value (32-bit integer value) for the character at + * position. Note that this method avoids using the converter or doing String + * instantiation. + * + * @param position the index of the char we want to retrieve + * @return the Unicode scalar value at position or -1 if the position is invalid or points to a + * trailing byte + */ + public int charAt(int position) { + if (position > this.length) { + return -1; // too long + } + if (position < 0) { + return -1; // duh. + } + + ByteBuffer bb = (ByteBuffer) ByteBuffer.wrap(bytes).position(position); + return bytesToCodePoint(bb.slice()); + } + + public int find(String what) { + return find(what, 0); + } + + /** + * Finds any occurrence of what in the backing buffer, starting as position + * start. The starting position is measured in bytes and the return value is in terms + * of byte position in the buffer. The backing buffer is not converted to a string for this + * operation. 
+ * + * @param what the string to search for + * @param start where to start from + * @return byte position of the first occurrence of the search string in the UTF-8 buffer or -1 + * if not found + */ + public int find(String what, int start) { + try { + ByteBuffer src = ByteBuffer.wrap(this.bytes, 0, this.length); + ByteBuffer tgt = encode(what); + byte b = tgt.get(); + src.position(start); + + while (src.hasRemaining()) { + if (b == src.get()) { // matching first byte + src.mark(); // save position in loop + tgt.mark(); // save position in target + boolean found = true; + int pos = src.position() - 1; + while (tgt.hasRemaining()) { + if (!src.hasRemaining()) { // src expired first + tgt.reset(); + src.reset(); + found = false; + break; + } + if (!(tgt.get() == src.get())) { + tgt.reset(); + src.reset(); + found = false; + break; // no match + } + } + if (found) { + return pos; + } + } + } + return -1; // not found + } catch (CharacterCodingException e) { + // can't get here + e.printStackTrace(); + return -1; + } + } + + /** + * Set to contain the contents of a string. + * + * @param string the string to initialize from + */ + public void set(String string) { + try { + ByteBuffer bb = encode(string, true); + bytes = bb.array(); + length = bb.limit(); + } catch (CharacterCodingException e) { + throw new RuntimeException("Should not have happened ", e); + } + } + + /** + * Set to a utf8 byte array. + * + * @param utf8 the byte array to initialize from + */ + public void set(byte[] utf8) { + set(utf8, 0, utf8.length); + } + + /** + * copy a text. + * + * @param other the text to initialize from + */ + public void set(Text other) { + set(other.getBytes(), 0, other.getLength()); + } + + /** + * Set the Text to range of bytes. 
+ * + * @param utf8 the data to copy from + * @param start the first position of the new string + * @param len the number of bytes of the new string + */ + public void set(byte[] utf8, int start, int len) { + setCapacity(len, false); + System.arraycopy(utf8, start, bytes, 0, len); + this.length = len; + } + + /** + * Append a range of bytes to the end of the given text. + * + * @param utf8 the data to copy from + * @param start the first position to append from utf8 + * @param len the number of bytes to append + */ + public void append(byte[] utf8, int start, int len) { + setCapacity(length + len, true); + System.arraycopy(utf8, start, bytes, length, len); + length += len; + } + + /** + * Clear the string to empty. + * + * Note: For performance reasons, this call does not clear the underlying byte array that + * is retrievable via {@link #getBytes()}. In order to free the byte-array memory, call + * {@link #set(byte[])} with an empty byte array (For example, new byte[0]). + */ + public void clear() { + length = 0; + } + + /** + * Sets the capacity of this Text object to at least len bytes. If the + * current buffer is longer, then the capacity and existing content of the buffer are unchanged. + * If len is larger than the current capacity, the Text object's capacity is + * increased to match. + * + * @param len the number of bytes we need + * @param keepData should the old data be kept + */ + private void setCapacity(int len, boolean keepData) { + if (bytes == null || bytes.length < len) { + if (bytes != null && keepData) { + bytes = Arrays.copyOf(bytes, Math.max(len, length << 1)); + } else { + bytes = new byte[len]; + } + } + } + + @Override + public String toString() { + try { + return decode(bytes, 0, length); + } catch (CharacterCodingException e) { + throw new RuntimeException("Should not have happened ", e); + } + } + + /** + * Read a Text object whose length is already known. 
This allows creating Text from a stream which + * uses a different serialization format. + * + * @param in the input to initialize from + * @param len how many bytes to read from in + * @throws IOException if something bad happens + */ + public void readWithKnownLength(DataInput in, int len) throws IOException { + setCapacity(len, false); + in.readFully(bytes, 0, len); + length = len; + } + + @Override + public boolean equals(Object o) { + if (o == this) { + return true; + } else if (o == null) { + return false; + } + if (!(o instanceof Text)) { + return false; + } + + final Text that = (Text) o; + if (this.getLength() != that.getLength()) { + return false; + } + + // copied from Arrays.equals so we don'thave to copy the byte arrays + for (int i = 0; i < length; i++) { + if (bytes[i] != that.bytes[i]) { + return false; + } + } + + return true; + } + + /** + * Copied from Arrays.hashCode so we don't have to copy the byte array. + * + * @return hashCode + */ + @Override + public int hashCode() { + if (bytes == null) { + return 0; + } + + int result = 1; + for (int i = 0; i < length; i++) { + result = 31 * result + bytes[i]; + } + + return result; + } + + // / STATIC UTILITIES FROM HERE DOWN + + /** + * Converts the provided byte array to a String using the UTF-8 encoding. If the input is + * malformed, replace by a default value. + * + * @param utf8 bytes to decode + * @return the decoded string + * @throws CharacterCodingException if this is not valid UTF-8 + */ + public static String decode(byte[] utf8) throws CharacterCodingException { + return decode(ByteBuffer.wrap(utf8), true); + } + + public static String decode(byte[] utf8, int start, int length) + throws CharacterCodingException { + return decode(ByteBuffer.wrap(utf8, start, length), true); + } + + /** + * Converts the provided byte array to a String using the UTF-8 encoding. If replace + * is true, then malformed input is replaced with the substitution character, which is U+FFFD. 
+ * Otherwise the method throws a MalformedInputException. + * + * @param utf8 the bytes to decode + * @param start where to start from + * @param length length of the bytes to decode + * @param replace whether to replace malformed characters with U+FFFD + * @return the decoded string + * @throws CharacterCodingException if the input could not be decoded + */ + public static String decode(byte[] utf8, int start, int length, boolean replace) + throws CharacterCodingException { + return decode(ByteBuffer.wrap(utf8, start, length), replace); + } + + private static String decode(ByteBuffer utf8, boolean replace) + throws CharacterCodingException { + CharsetDecoder decoder = DECODER_FACTORY.get(); + if (replace) { + decoder.onMalformedInput( + java.nio.charset.CodingErrorAction.REPLACE); + decoder.onUnmappableCharacter(CodingErrorAction.REPLACE); + } + String str = decoder.decode(utf8).toString(); + // set decoder back to its default value: REPORT + if (replace) { + decoder.onMalformedInput(CodingErrorAction.REPORT); + decoder.onUnmappableCharacter(CodingErrorAction.REPORT); + } + return str; + } + + /** + * Converts the provided String to bytes using the UTF-8 encoding. If the input is malformed, + * invalid chars are replaced by a default value. + * + * @param string the string to encode + * @return ByteBuffer: bytes stores at ByteBuffer.array() and length is ByteBuffer.limit() + * @throws CharacterCodingException if the string could not be encoded + */ + public static ByteBuffer encode(String string) + throws CharacterCodingException { + return encode(string, true); + } + + /** + * Converts the provided String to bytes using the UTF-8 encoding. If replace is + * true, then malformed input is replaced with the substitution character, which is U+FFFD. + * Otherwise the method throws a MalformedInputException. 
+ * + * @param string the string to encode + * @param replace whether to replace malformed characters with U+FFFD + * @return ByteBuffer: bytes stores at ByteBuffer.array() and length is ByteBuffer.limit() + * @throws CharacterCodingException if the string could not be encoded + */ + public static ByteBuffer encode(String string, boolean replace) + throws CharacterCodingException { + CharsetEncoder encoder = ENCODER_FACTORY.get(); + if (replace) { + encoder.onMalformedInput(CodingErrorAction.REPLACE); + encoder.onUnmappableCharacter(CodingErrorAction.REPLACE); + } + ByteBuffer bytes = + encoder.encode(CharBuffer.wrap(string.toCharArray())); + if (replace) { + encoder.onMalformedInput(CodingErrorAction.REPORT); + encoder.onUnmappableCharacter(CodingErrorAction.REPORT); + } + return bytes; + } + + public static final int DEFAULT_MAX_LEN = 1024 * 1024; + + // //// states for validateUTF8 + + private static final int LEAD_BYTE = 0; + + private static final int TRAIL_BYTE_1 = 1; + + private static final int TRAIL_BYTE = 2; + + /** + * Check if a byte array contains valid utf-8. + * + * @param utf8 byte array + * @throws MalformedInputException if the byte array contains invalid utf-8 + */ + public static void validateUTF8(byte[] utf8) throws MalformedInputException { + validateUTF8(utf8, 0, utf8.length); + } + + /** + * Check to see if a byte array is valid utf-8. 
+ * + * @param utf8 the array of bytes + * @param start the offset of the first byte in the array + * @param len the length of the byte sequence + * @throws MalformedInputException if the byte array contains invalid bytes + */ + public static void validateUTF8(byte[] utf8, int start, int len) + throws MalformedInputException { + int count = start; + int leadByte = 0; + int length = 0; + int state = LEAD_BYTE; + while (count < start + len) { + int aByte = utf8[count] & 0xFF; + + switch (state) { + case LEAD_BYTE: + leadByte = aByte; + length = bytesFromUTF8[aByte]; + + switch (length) { + case 0: // check for ASCII + if (leadByte > 0x7F) { + throw new MalformedInputException(count); + } + break; + case 1: + if (leadByte < 0xC2 || leadByte > 0xDF) { + throw new MalformedInputException(count); + } + state = TRAIL_BYTE_1; + break; + case 2: + if (leadByte < 0xE0 || leadByte > 0xEF) { + throw new MalformedInputException(count); + } + state = TRAIL_BYTE_1; + break; + case 3: + if (leadByte < 0xF0 || leadByte > 0xF4) { + throw new MalformedInputException(count); + } + state = TRAIL_BYTE_1; + break; + default: + // too long! Longest valid UTF-8 is 4 bytes (lead + three) + // or if < 0 we got a trail byte in the lead byte position + throw new MalformedInputException(count); + } // switch (length) + break; + + case TRAIL_BYTE_1: + if (leadByte == 0xF0 && aByte < 0x90) { + throw new MalformedInputException(count); + } + if (leadByte == 0xF4 && aByte > 0x8F) { + throw new MalformedInputException(count); + } + if (leadByte == 0xE0 && aByte < 0xA0) { + throw new MalformedInputException(count); + } + if (leadByte == 0xED && aByte > 0x9F) { + throw new MalformedInputException(count); + } + // falls through to regular trail-byte test!! 
+ case TRAIL_BYTE: + if (aByte < 0x80 || aByte > 0xBF) { + throw new MalformedInputException(count); + } + if (--length == 0) { + state = LEAD_BYTE; + } else { + state = TRAIL_BYTE; + } + break; + default: + break; + } // switch (state) + count++; + } + } + + /** + * Magic numbers for UTF-8. These are the number of bytes that follow a given lead byte. + * Trailing bytes have the value -1. The values 4 and 5 are presented in this table, even though + * valid UTF-8 cannot include the five and six byte sequences. + */ + static final int[] bytesFromUTF8 = + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + // trail bytes + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, + 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5}; + + /** + * Returns the next code point at the current position in the buffer. The buffer's position will + * be incremented. Any mark set on this buffer will be changed by this method! + * + * @param bytes the incoming bytes + * @return the corresponding unicode codepoint + */ + public static int bytesToCodePoint(ByteBuffer bytes) { + bytes.mark(); + byte b = bytes.get(); + bytes.reset(); + int extraBytesToRead = bytesFromUTF8[(b & 0xFF)]; + if (extraBytesToRead < 0) { + return -1; // trailing byte! 
+ } + int ch = 0; + + switch (extraBytesToRead) { + case 5: + ch += (bytes.get() & 0xFF); + ch <<= 6; /* remember, illegal UTF-8 */ + // fall through + case 4: + ch += (bytes.get() & 0xFF); + ch <<= 6; /* remember, illegal UTF-8 */ + // fall through + case 3: + ch += (bytes.get() & 0xFF); + ch <<= 6; + // fall through + case 2: + ch += (bytes.get() & 0xFF); + ch <<= 6; + // fall through + case 1: + ch += (bytes.get() & 0xFF); + ch <<= 6; + // fall through + case 0: + ch += (bytes.get() & 0xFF); + break; + default: // do nothing + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + return ch; + } + + static final int[] offsetsFromUTF8 = + {0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080}; + + /** + * For the given string, returns the number of UTF-8 bytes required to encode the string. + * + * @param string text to encode + * @return number of UTF-8 bytes required to encode + */ + public static int utf8Length(String string) { + CharacterIterator iter = new StringCharacterIterator(string); + char ch = iter.first(); + int size = 0; + while (ch != CharacterIterator.DONE) { + if ((ch >= 0xD800) && (ch < 0xDC00)) { + // surrogate pair? + char trail = iter.next(); + if ((trail > 0xDBFF) && (trail < 0xE000)) { + // valid pair + size += 4; + } else { + // invalid pair + size += 3; + iter.previous(); // rewind one + } + } else if (ch < 0x80) { + size++; + } else if (ch < 0x800) { + size += 2; + } else { + // ch < 0x10000, that is, the largest char value + size += 3; + } + ch = iter.next(); + } + return size; + } + + /** + * JSON serializer for {@link Text}. 
+ */ + public static class TextSerializer extends StdSerializer { + + public TextSerializer() { + super(Text.class); + } + + @Override + public void serialize( + Text text, + JsonGenerator jsonGenerator, + SerializerProvider serializerProvider) throws IOException, JsonGenerationException { + jsonGenerator.writeString(text.toString()); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java new file mode 100644 index 000000000..ca3876c7b --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import org.apache.arrow.vector.ValueVector; + +/** + * Interface for copying values between a pair of two vectors of the same type. 
 */
public interface TransferPair {
  // NOTE(review): no implementation is visible here; the per-method notes below are inferred
  // from the names and the interface summary and should be confirmed against implementers.

  /** Presumably moves the source vector's data to the target vector - TODO confirm. */
  void transfer();

  /**
   * Presumably transfers the {@code length} values beginning at {@code startIndex} to the
   * target vector - TODO confirm.
   */
  void splitAndTransfer(int startIndex, int length);

  /** Returns a {@link ValueVector}; presumably the target of this pair - TODO confirm. */
  ValueVector getTo();

  /**
   * Presumably copies the value at index {@code from} of the source vector to index {@code to}
   * of the target vector - TODO confirm.
   */
  void copyValueSafe(int from, int to);
}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java
new file mode 100644
index 000000000..741972b4a
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java
@@ -0,0 +1,190 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.arrow.vector.util;

import java.util.Arrays;
import java.util.List;
import java.util.Objects;

import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.dictionary.Dictionary;
import org.apache.arrow.vector.dictionary.DictionaryProvider;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;

/**
 * Utility class for validating arrow data structures.
+ */ +public class Validator { + + /** + * Validate two arrow schemas are equal. + * + * @param schema1 the 1st schema to compare + * @param schema2 the 2nd schema to compare + * @throws IllegalArgumentException if they are different. + */ + public static void compareSchemas(Schema schema1, Schema schema2) { + if (!schema2.equals(schema1)) { + throw new IllegalArgumentException("Different schemas:\n" + schema2 + "\n" + schema1); + } + } + + /** + * Validate two Dictionary encodings and dictionaries with id's from the encodings. + */ + public static void compareDictionaries( + List encodings1, + List encodings2, + DictionaryProvider provider1, + DictionaryProvider provider2) { + + if (encodings1.size() != encodings2.size()) { + throw new IllegalArgumentException("Different dictionary encoding count:\n" + + encodings1.size() + "\n" + encodings2.size()); + } + + for (int i = 0; i < encodings1.size(); i++) { + if (!encodings1.get(i).equals(encodings2.get(i))) { + throw new IllegalArgumentException("Different dictionary encodings:\n" + encodings1.get(i) + + "\n" + encodings2.get(i)); + } + + long id = encodings1.get(i).getId(); + Dictionary dict1 = provider1.lookup(id); + Dictionary dict2 = provider2.lookup(id); + + if (dict1 == null || dict2 == null) { + throw new IllegalArgumentException("The DictionaryProvider did not contain the required " + + "dictionary with id: " + id + "\n" + dict1 + "\n" + dict2); + } + + try { + compareFieldVectors(dict1.getVector(), dict2.getVector()); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("Different dictionaries:\n" + dict1 + "\n" + dict2, e); + } + } + } + + /** + * Validate two arrow vectorSchemaRoot are equal. + * + * @param root1 the 1st schema to compare + * @param root2 the 2nd schema to compare + * @throws IllegalArgumentException if they are different. 
+ */ + public static void compareVectorSchemaRoot(VectorSchemaRoot root1, VectorSchemaRoot root2) { + compareSchemas(root2.getSchema(), root1.getSchema()); + if (root1.getRowCount() != root2.getRowCount()) { + throw new IllegalArgumentException("Different row count:\n" + root1.getRowCount() + " != " + root2.getRowCount()); + } + List vectors1 = root1.getFieldVectors(); + List vectors2 = root2.getFieldVectors(); + if (vectors1.size() != vectors2.size()) { + throw new IllegalArgumentException("Different column count:\n" + vectors1.toString() + + "\n!=\n" + vectors2.toString()); + } + for (int i = 0; i < vectors1.size(); i++) { + compareFieldVectors(vectors1.get(i), vectors2.get(i)); + } + } + + /** + * Validate two arrow FieldVectors are equal. + * + * @param vector1 the 1st VectorField to compare + * @param vector2 the 2nd VectorField to compare + * @throws IllegalArgumentException if they are different + */ + public static void compareFieldVectors(FieldVector vector1, FieldVector vector2) { + Field field1 = vector1.getField(); + if (!field1.equals(vector2.getField())) { + throw new IllegalArgumentException("Different Fields:\n" + field1 + "\n!=\n" + + vector2.getField()); + } + int valueCount = vector1.getValueCount(); + if (valueCount != vector2.getValueCount()) { + throw new IllegalArgumentException("Different value count for field " + field1 + " : " + + valueCount + " != " + vector2.getValueCount()); + } + for (int j = 0; j < valueCount; j++) { + Object obj1 = vector1.getObject(j); + Object obj2 = vector2.getObject(j); + if (!equals(field1.getType(), obj1, obj2)) { + throw new IllegalArgumentException( + "Different values in column:\n" + field1 + " at index " + j + ": " + obj1 + " != " + obj2); + } + } + } + + static boolean equals(ArrowType type, final Object o1, final Object o2) { + if (type instanceof ArrowType.FloatingPoint) { + ArrowType.FloatingPoint fpType = (ArrowType.FloatingPoint) type; + switch (fpType.getPrecision()) { + case DOUBLE: + return 
equalEnough((Double) o1, (Double) o2); + case SINGLE: + return equalEnough((Float) o1, (Float) o2); + case HALF: + default: + throw new UnsupportedOperationException("unsupported precision: " + fpType); + } + } else if (type instanceof ArrowType.Binary || type instanceof ArrowType.LargeBinary || + type instanceof ArrowType.FixedSizeBinary) { + return Arrays.equals((byte[]) o1, (byte[]) o2); + } else if (o1 instanceof byte[] && o2 instanceof byte[]) { + return Arrays.equals((byte[]) o1, (byte[]) o2); + } + + return Objects.equals(o1, o2); + } + + static boolean equalEnough(Float f1, Float f2) { + if (f1 == null || f2 == null) { + return f1 == null && f2 == null; + } + if (f1.isNaN()) { + return f2.isNaN(); + } + if (f1.isInfinite()) { + return f2.isInfinite() && Math.signum(f1) == Math.signum(f2); + } + float average = Math.abs((f1 + f2) / 2); + float differenceScaled = Math.abs(f1 - f2) / (average == 0.0f ? 1f : average); + return differenceScaled < 1.0E-6f; + } + + static boolean equalEnough(Double f1, Double f2) { + if (f1 == null || f2 == null) { + return f1 == null && f2 == null; + } + if (f1.isNaN()) { + return f2.isNaN(); + } + if (f1.isInfinite()) { + return f2.isInfinite() && Math.signum(f1) == Math.signum(f2); + } + double average = Math.abs((f1 + f2) / 2); + double differenceScaled = Math.abs(f1 - f2) / (average == 0.0d ? 1d : average); + return differenceScaled < 1.0E-12d; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/ValueVectorUtility.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/ValueVectorUtility.java new file mode 100644 index 000000000..ceb7081e1 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/ValueVectorUtility.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import static org.apache.arrow.vector.validate.ValidateUtil.validateOrThrow; + +import java.util.function.BiFunction; + +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.validate.ValidateVectorBufferVisitor; +import org.apache.arrow.vector.validate.ValidateVectorDataVisitor; +import org.apache.arrow.vector.validate.ValidateVectorTypeVisitor; + +/** + * Utility methods for {@link ValueVector}. + */ +public class ValueVectorUtility { + + private ValueVectorUtility() { + } + + /** + * Get the toString() representation of vector suitable for debugging. + * Note since vectors may have millions of values, this method only shows max 20 values. + * Examples as below (v represents value): + *
  • + * vector with 0 value: + * [] + *
  • + *
  • + * vector with 5 values (no more than 20 values): + * [v0, v1, v2, v3, v4] + *
  • + *
  • + * vector with 100 values (more than 20 values): + * [v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, ..., v90, v91, v92, v93, v94, v95, v96, v97, v98, v99] + *
  • + */ + public static String getToString(V vector, int start, int end) { + return getToString(vector, start, end, (v, i) -> v.getObject(i)); + } + + /** + * Get the toString() representation of vector suitable for debugging. + * Note since vectors may have millions of values, this method only shows at most 20 values. + * @param vector the vector for which to get toString representation. + * @param start the starting index, inclusive. + * @param end the end index, exclusive. + * @param valueToString the function to transform individual elements to strings. + */ + public static String getToString( + V vector, int start, int end, BiFunction valueToString) { + Preconditions.checkNotNull(vector); + final int length = end - start; + Preconditions.checkArgument(length >= 0); + Preconditions.checkArgument(start >= 0); + Preconditions.checkArgument(end <= vector.getValueCount()); + + if (length == 0) { + return "[]"; + } + + final int window = 10; + boolean skipComma = false; + + StringBuilder sb = new StringBuilder(); + sb.append('['); + for (int i = start; i < end; i++) { + if (skipComma) { + skipComma = false; + } + if (i - start >= window && i < end - window) { + sb.append("..."); + i = end - window - 1; + skipComma = true; + } else { + sb.append(valueToString.apply(vector, i)); + } + + if (i == end - 1) { + sb.append(']'); + } else { + if (!skipComma) { + sb.append(','); + } + sb.append(' '); + } + } + + return sb.toString(); + } + + /** + * Utility to validate vector in O(1) time. + */ + public static void validate(ValueVector vector) { + Preconditions.checkNotNull(vector); + + ValidateVectorTypeVisitor typeVisitor = new ValidateVectorTypeVisitor(); + vector.accept(typeVisitor, null); + + ValidateVectorBufferVisitor bufferVisitor = new ValidateVectorBufferVisitor(); + vector.accept(bufferVisitor, null); + } + + /** + * Utility to validate vector in O(n) time, where n is the value count. 
+ */ + public static void validateFull(ValueVector vector) { + validate(vector); + + ValidateVectorDataVisitor dataVisitor = new ValidateVectorDataVisitor(); + vector.accept(dataVisitor, null); + } + + /** + * Utility to validate vector schema root in O(1) time. + */ + public static void validate(VectorSchemaRoot root) { + Preconditions.checkNotNull(root); + int valueCount = root.getRowCount(); + validateOrThrow(valueCount >= 0, "The row count of vector schema root %s is negative.", valueCount); + for (ValueVector childVec : root.getFieldVectors()) { + validateOrThrow(valueCount == childVec.getValueCount(), + "Child vector and vector schema root have different value counts. " + + "Child vector value count %s, vector schema root value count %s", childVec.getValueCount(), valueCount); + validate(childVec); + } + } + + /** + * Utility to validate vector in O(n) time, where n is the value count. + */ + public static void validateFull(VectorSchemaRoot root) { + Preconditions.checkNotNull(root); + int valueCount = root.getRowCount(); + validateOrThrow(valueCount >= 0, "The row count of vector schema root %s is negative.", valueCount); + for (ValueVector childVec : root.getFieldVectors()) { + validateOrThrow(valueCount == childVec.getValueCount(), + "Child vector and vector schema root have different value counts. " + + "Child vector value count %s, vector schema root value count %s", childVec.getValueCount(), valueCount); + validateFull(childVec); + } + } + + /** + * Pre allocate memory for BaseFixedWidthVector. + */ + public static void preAllocate(VectorSchemaRoot root, int targetSize) { + for (ValueVector vector : root.getFieldVectors()) { + if (vector instanceof BaseFixedWidthVector) { + ((BaseFixedWidthVector) vector).allocateNew(targetSize); + } + } + } + + /** + * Ensure capacity for BaseFixedWidthVector. 
+ */ + public static void ensureCapacity(VectorSchemaRoot root, int targetCapacity) { + for (ValueVector vector : root.getFieldVectors()) { + if (vector instanceof BaseFixedWidthVector) { + while (vector.getValueCapacity() < targetCapacity) { + vector.reAlloc(); + } + } + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java new file mode 100644 index 000000000..e5809e93e --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.util; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; + +import java.util.HashSet; + +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.ExtensionTypeVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.TypeEqualsVisitor; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.UnionVector; + +import io.netty.util.internal.PlatformDependent; + +/** + * Utility to append two vectors together. + */ +class VectorAppender implements VectorVisitor { + + /** + * The targetVector to be appended. + */ + private final ValueVector targetVector; + + private final TypeEqualsVisitor typeVisitor; + + /** + * Constructs a new targetVector appender, with the given targetVector. + * @param targetVector the targetVector to be appended. 
+ */ + VectorAppender(ValueVector targetVector) { + this.targetVector = targetVector; + typeVisitor = new TypeEqualsVisitor(targetVector, false, true); + } + + @Override + public ValueVector visit(BaseFixedWidthVector deltaVector, Void value) { + Preconditions.checkArgument(targetVector.getField().getType().equals(deltaVector.getField().getType()), + "The targetVector to append must have the same type as the targetVector being appended"); + + if (deltaVector.getValueCount() == 0) { + return targetVector; // optimization, nothing to append, return + } + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append data buffer + PlatformDependent.copyMemory(deltaVector.getDataBuffer().memoryAddress(), + targetVector.getDataBuffer().memoryAddress() + deltaVector.getTypeWidth() * targetVector.getValueCount(), + deltaVector.getTypeWidth() * deltaVector.getValueCount()); + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) { + Preconditions.checkArgument(targetVector.getField().getType().equals(deltaVector.getField().getType()), + "The targetVector to append must have the same type as the targetVector being appended"); + + if (deltaVector.getValueCount() == 0) { + return targetVector; // nothing to append, return + } + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + int targetDataSize = targetVector.getOffsetBuffer().getInt( + (long) targetVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + int deltaDataSize = 
deltaVector.getOffsetBuffer().getInt( + (long) deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + int newValueCapacity = targetDataSize + deltaDataSize; + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + while (targetVector.getDataBuffer().capacity() < newValueCapacity) { + ((BaseVariableWidthVector) targetVector).reallocDataBuffer(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append data buffer + PlatformDependent.copyMemory(deltaVector.getDataBuffer().memoryAddress(), + targetVector.getDataBuffer().memoryAddress() + targetDataSize, deltaDataSize); + + // copy offset buffer + PlatformDependent.copyMemory( + deltaVector.getOffsetBuffer().memoryAddress() + BaseVariableWidthVector.OFFSET_WIDTH, + targetVector.getOffsetBuffer().memoryAddress() + (targetVector.getValueCount() + 1) * + BaseVariableWidthVector.OFFSET_WIDTH, + deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + + // increase each offset from the second buffer + for (int i = 0; i < deltaVector.getValueCount(); i++) { + int oldOffset = targetVector.getOffsetBuffer().getInt((long) (targetVector.getValueCount() + 1 + i) * + BaseVariableWidthVector.OFFSET_WIDTH); + targetVector.getOffsetBuffer().setInt( + (long) (targetVector.getValueCount() + 1 + i) * + BaseVariableWidthVector.OFFSET_WIDTH, oldOffset + targetDataSize); + } + ((BaseVariableWidthVector) targetVector).setLastSet(newValueCount - 1); + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) { + Preconditions.checkArgument(targetVector.getField().getType().equals(deltaVector.getField().getType()), + "The targetVector to append must 
have the same type as the targetVector being appended"); + + if (deltaVector.getValueCount() == 0) { + return targetVector; // nothing to append, return + } + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + long targetDataSize = targetVector.getOffsetBuffer().getLong( + (long) targetVector.getValueCount() * BaseLargeVariableWidthVector.OFFSET_WIDTH); + long deltaDataSize = deltaVector.getOffsetBuffer().getLong( + (long) deltaVector.getValueCount() * BaseLargeVariableWidthVector.OFFSET_WIDTH); + long newValueCapacity = targetDataSize + deltaDataSize; + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + while (targetVector.getDataBuffer().capacity() < newValueCapacity) { + ((BaseLargeVariableWidthVector) targetVector).reallocDataBuffer(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append data buffer + PlatformDependent.copyMemory(deltaVector.getDataBuffer().memoryAddress(), + targetVector.getDataBuffer().memoryAddress() + targetDataSize, deltaDataSize); + + // copy offset buffer + PlatformDependent.copyMemory( + deltaVector.getOffsetBuffer().memoryAddress() + BaseLargeVariableWidthVector.OFFSET_WIDTH, + targetVector.getOffsetBuffer().memoryAddress() + (targetVector.getValueCount() + 1) * + BaseLargeVariableWidthVector.OFFSET_WIDTH, + deltaVector.getValueCount() * BaseLargeVariableWidthVector.OFFSET_WIDTH); + + // increase each offset from the second buffer + for (int i = 0; i < deltaVector.getValueCount(); i++) { + long oldOffset = targetVector.getOffsetBuffer().getLong((long) (targetVector.getValueCount() + 1 + i) * + BaseLargeVariableWidthVector.OFFSET_WIDTH); + targetVector.getOffsetBuffer().setLong( + (long) (targetVector.getValueCount() + 1 + i) * + 
BaseLargeVariableWidthVector.OFFSET_WIDTH, oldOffset + targetDataSize); + } + ((BaseLargeVariableWidthVector) targetVector).setLastSet(newValueCount - 1); + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(ListVector deltaVector, Void value) { + Preconditions.checkArgument(typeVisitor.equals(deltaVector), + "The targetVector to append must have the same type as the targetVector being appended"); + + if (deltaVector.getValueCount() == 0) { + return targetVector; // nothing to append, return + } + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + int targetListSize = targetVector.getOffsetBuffer().getInt( + (long) targetVector.getValueCount() * ListVector.OFFSET_WIDTH); + int deltaListSize = deltaVector.getOffsetBuffer().getInt( + (long) deltaVector.getValueCount() * ListVector.OFFSET_WIDTH); + + ListVector targetListVector = (ListVector) targetVector; + + // make sure the underlying vector has value count set + targetListVector.getDataVector().setValueCount(targetListSize); + deltaVector.getDataVector().setValueCount(deltaListSize); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append offset buffer + PlatformDependent.copyMemory(deltaVector.getOffsetBuffer().memoryAddress() + ListVector.OFFSET_WIDTH, + targetVector.getOffsetBuffer().memoryAddress() + (targetVector.getValueCount() + 1) * + ListVector.OFFSET_WIDTH, + (long) deltaVector.getValueCount() * ListVector.OFFSET_WIDTH); + + // increase each offset from the second buffer + for (int i = 0; i < deltaVector.getValueCount(); i++) { + int oldOffset = targetVector.getOffsetBuffer().getInt( + (long) 
(targetVector.getValueCount() + 1 + i) * ListVector.OFFSET_WIDTH); + targetVector.getOffsetBuffer().setInt((long) (targetVector.getValueCount() + 1 + i) * ListVector.OFFSET_WIDTH, + oldOffset + targetListSize); + } + targetListVector.setLastSet(newValueCount - 1); + + // append underlying vectors + VectorAppender innerAppender = new VectorAppender(targetListVector.getDataVector()); + deltaVector.getDataVector().accept(innerAppender, null); + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + /** + * Appends the values of a {@link LargeListVector} to the target vector. + * The validity bits and the 64-bit offsets are appended first (each copied offset is shifted by the + * current end offset of the target), then the child data vector is appended recursively. + * + * @param deltaVector the vector whose values are appended to the target. + * @param value unused. + * @return the target vector. + */ + @Override + public ValueVector visit(LargeListVector deltaVector, Void value) { + Preconditions.checkArgument(typeVisitor.equals(deltaVector), + "The targetVector to append must have the same type as the targetVector being appended"); + + if (deltaVector.getValueCount() == 0) { + return targetVector; // nothing to append, return + } + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + long targetListSize = targetVector.getOffsetBuffer().getLong( + (long) targetVector.getValueCount() * LargeListVector.OFFSET_WIDTH); + long deltaListSize = deltaVector.getOffsetBuffer().getLong( + (long) deltaVector.getValueCount() * LargeListVector.OFFSET_WIDTH); + + // the target is expected to have the delta's type (see the check above), + // so it must be cast to LargeListVector; a cast to ListVector would always fail here + LargeListVector targetListVector = (LargeListVector) targetVector; + + // make sure the underlying vector has value count set + // todo recheck these casts when int64 vectors are supported + targetListVector.getDataVector().setValueCount(checkedCastToInt(targetListSize)); + deltaVector.getDataVector().setValueCount(checkedCastToInt(deltaListSize)); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append offset buffer: skip the delta's first offset and copy the rest, + // measuring everything in LargeListVector.OFFSET_WIDTH (not ListVector.OFFSET_WIDTH) + 
PlatformDependent.copyMemory(deltaVector.getOffsetBuffer().memoryAddress() + LargeListVector.OFFSET_WIDTH, + targetVector.getOffsetBuffer().memoryAddress() + (long) (targetVector.getValueCount() + 1) * + LargeListVector.OFFSET_WIDTH, + (long) deltaVector.getValueCount() * LargeListVector.OFFSET_WIDTH); + + // increase each offset from the second buffer + for (int i = 0; i < deltaVector.getValueCount(); i++) { + long oldOffset = targetVector.getOffsetBuffer().getLong( + (long) (targetVector.getValueCount() + 1 + i) * LargeListVector.OFFSET_WIDTH); + targetVector.getOffsetBuffer().setLong((long) (targetVector.getValueCount() + 1 + i) * + LargeListVector.OFFSET_WIDTH, oldOffset + targetListSize); + } + targetListVector.setLastSet(newValueCount - 1); + + // append underlying vectors + VectorAppender innerAppender = new VectorAppender(targetListVector.getDataVector()); + deltaVector.getDataVector().accept(innerAppender, null); + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(FixedSizeListVector deltaVector, Void value) { + Preconditions.checkArgument(typeVisitor.equals(deltaVector), + "The vector to append must have the same type as the targetVector being appended"); + + if (deltaVector.getValueCount() == 0) { + return targetVector; // optimization, nothing to append, return + } + + FixedSizeListVector targetListVector = (FixedSizeListVector) targetVector; + + Preconditions.checkArgument(targetListVector.getListSize() == deltaVector.getListSize(), + "FixedSizeListVector must have the same list size to append"); + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + int targetListSize = targetListVector.getValueCount() * targetListVector.getListSize(); + int deltaListSize = deltaVector.getValueCount() * deltaVector.getListSize(); + + // make sure the underlying vector has value count set + targetListVector.getDataVector().setValueCount(targetListSize); + 
deltaVector.getDataVector().setValueCount(deltaListSize); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append underlying vectors + VectorAppender innerAppender = new VectorAppender(targetListVector.getDataVector()); + deltaVector.getDataVector().accept(innerAppender, null); + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(NonNullableStructVector deltaVector, Void value) { + Preconditions.checkArgument(typeVisitor.equals(deltaVector), + "The vector to append must have the same type as the targetVector being appended"); + + if (deltaVector.getValueCount() == 0) { + return targetVector; // optimization, nothing to append, return + } + + NonNullableStructVector targetStructVector = (NonNullableStructVector) targetVector; + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append child vectors + for (int i = 0; i < targetStructVector.getChildrenFromFields().size(); i++) { + ValueVector targetChild = targetStructVector.getVectorById(i); + ValueVector deltaChild = deltaVector.getVectorById(i); + + targetChild.setValueCount(targetStructVector.getValueCount()); + deltaChild.setValueCount(deltaVector.getValueCount()); + + VectorAppender innerAppender = new VectorAppender(targetChild); + 
deltaChild.accept(innerAppender, null); + } + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(UnionVector deltaVector, Void value) { + // we only make sure that both vectors are union vectors. + Preconditions.checkArgument(targetVector.getMinorType() == deltaVector.getMinorType(), + "The vector to append must have the same type as the targetVector being appended"); + + if (deltaVector.getValueCount() == 0) { + return targetVector; // optimization, nothing to append, return + } + + UnionVector targetUnionVector = (UnionVector) targetVector; + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + // make sure there is enough capacity + while (targetUnionVector.getValueCapacity() < newValueCount) { + targetUnionVector.reAlloc(); + } + + // append type buffers + PlatformDependent.copyMemory(deltaVector.getTypeBufferAddress(), + targetUnionVector.getTypeBufferAddress() + targetVector.getValueCount(), + deltaVector.getValueCount()); + + // build the hash set for all types + HashSet targetTypes = new HashSet<>(); + for (int i = 0; i < targetUnionVector.getValueCount(); i++) { + targetTypes.add(targetUnionVector.getTypeValue(i)); + } + HashSet deltaTypes = new HashSet<>(); + for (int i = 0; i < deltaVector.getValueCount(); i++) { + deltaTypes.add(deltaVector.getTypeValue(i)); + } + + // append child vectors + for (int i = 0; i < Byte.MAX_VALUE; i++) { + if (targetTypes.contains(i) || deltaTypes.contains(i)) { + ValueVector targetChild = targetUnionVector.getVectorByType(i); + if (!targetTypes.contains(i)) { + // if the vector type does not exist in the target, it must be newly created + // and we must make sure it has enough capacity. 
+ while (targetChild.getValueCapacity() < newValueCount) { + targetChild.reAlloc(); + } + } + + if (deltaTypes.contains(i)) { + // append child vectors + ValueVector deltaChild = deltaVector.getVectorByType(i); + + targetChild.setValueCount(targetUnionVector.getValueCount()); + deltaChild.setValueCount(deltaVector.getValueCount()); + + VectorAppender innerAppender = new VectorAppender(targetChild); + deltaChild.accept(innerAppender, null); + } + targetChild.setValueCount(newValueCount); + } + } + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(DenseUnionVector deltaVector, Void value) { + // we only make sure that both vectors are union vectors. + Preconditions.checkArgument(targetVector.getMinorType() == deltaVector.getMinorType(), + "The vector to append must have the same type as the targetVector being appended"); + + if (deltaVector.getValueCount() == 0) { + return targetVector; // optimization, nothing to append, return + } + + DenseUnionVector targetDenseUnionVector = (DenseUnionVector) targetVector; + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + // make sure there is enough capacity + while (targetDenseUnionVector.getValueCapacity() < newValueCount) { + targetDenseUnionVector.reAlloc(); + } + + // append type buffers + PlatformDependent.copyMemory(deltaVector.getTypeBuffer().memoryAddress(), + targetDenseUnionVector.getTypeBuffer() .memoryAddress() + targetVector.getValueCount(), + deltaVector.getValueCount()); + + // append offset buffers + for (int i = 0; i < deltaVector.getValueCount(); i++) { + byte typeId = deltaVector.getTypeId(i); + ValueVector targetChildVector = targetDenseUnionVector.getVectorByType(typeId); + int offsetBase = targetChildVector == null ? 
0 : targetChildVector.getValueCount(); + int deltaOffset = deltaVector.getOffset(i); + long index = (long) (targetVector.getValueCount() + i) * DenseUnionVector.OFFSET_WIDTH; + + targetVector.getOffsetBuffer().setInt(index, offsetBase + deltaOffset); + } + + // append child vectors + for (int i = 0; i <= Byte.MAX_VALUE; i++) { + ValueVector targetChildVector = targetDenseUnionVector.getVectorByType((byte) i); + ValueVector deltaChildVector = deltaVector.getVectorByType((byte) i); + + if (targetChildVector == null && deltaChildVector == null) { + // the type id is not registered in either vector, we are done. + continue; + } else if (targetChildVector == null && deltaChildVector != null) { + // first register a new child in the target vector + targetDenseUnionVector.registerNewTypeId(deltaChildVector.getField()); + targetChildVector = targetDenseUnionVector.addVector( + (byte) i, deltaChildVector.getField().createVector(targetDenseUnionVector.getAllocator())); + + // now we have both child vecors not null, we can append them. 
+ VectorAppender childAppender = new VectorAppender(targetChildVector); + deltaChildVector.accept(childAppender, null); + } else if (targetChildVector != null && deltaChildVector == null) { + // the value only exists in the target vector, so we are done + continue; + } else { + // both child vectors are non-null + + // first check vector types + TypeEqualsVisitor childTypeVisitor = + new TypeEqualsVisitor(targetChildVector, /* check name */ false, /* check meta data*/ false); + if (!childTypeVisitor.equals(deltaChildVector)) { + throw new IllegalArgumentException("dense union vectors have different child vector types with type id " + i); + } + + // append child vectors + VectorAppender childAppender = new VectorAppender(targetChildVector); + deltaChildVector.accept(childAppender, null); + } + } + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(NullVector deltaVector, Void value) { + Preconditions.checkArgument(targetVector.getField().getType().equals(deltaVector.getField().getType()), + "The targetVector to append must have the same type as the targetVector being appended"); + return targetVector; + } + + @Override + public ValueVector visit(ExtensionTypeVector deltaVector, Void value) { + ValueVector targetUnderlying = ((ExtensionTypeVector) targetVector).getUnderlyingVector(); + VectorAppender underlyingAppender = new VectorAppender(targetUnderlying); + deltaVector.getUnderlyingVector().accept(underlyingAppender, null); + return targetVector; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java new file mode 100644 index 000000000..570783d10 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import org.apache.arrow.vector.ValueVector; + +/** + * Utility to add vector values in batch. + */ +public class VectorBatchAppender { + + /** + * Add value vectors in batch. + * The vectors are appended to the target one after another, in argument order, through a single + * {@link VectorAppender}. + * @param targetVector the target vector. + * @param vectorsToAppend the vectors to append. + * @param <V> the vector type. + */ + public static <V extends ValueVector> void batchAppend(V targetVector, V... vectorsToAppend) { + VectorAppender appender = new VectorAppender(targetVector); + for (V delta : vectorsToAppend) { + delta.accept(appender, null); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/VectorSchemaRootAppender.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/VectorSchemaRootAppender.java new file mode 100644 index 000000000..3c6044ec5 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/util/VectorSchemaRootAppender.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.compare.TypeEqualsVisitor; + +/** + * Utility to append {@link org.apache.arrow.vector.VectorSchemaRoot}s with the same schema. + */ +public class VectorSchemaRootAppender { + + /** + * Appends a number of {@link VectorSchemaRoot}s. + * @param checkSchema if we need to check schema for the vector schema roots. + * @param targetRoot the vector schema root to be appended. + * @param rootsToAppend the vector schema roots to append. + * @throws IllegalArgumentException throws if we need to check schema, and checking schema fails. + */ + public static void append(boolean checkSchema, VectorSchemaRoot targetRoot, VectorSchemaRoot... 
rootsToAppend) { + // create appenders + VectorAppender[] appenders = new VectorAppender[targetRoot.getFieldVectors().size()]; + for (int i = 0; i < appenders.length; i++) { + appenders[i] = new VectorAppender(targetRoot.getVector(i)); + } + + // create type checkers, if necessary + TypeEqualsVisitor[] typeCheckers = null; + if (checkSchema) { + typeCheckers = new TypeEqualsVisitor[targetRoot.getFieldVectors().size()]; + for (int i = 0; i < typeCheckers.length; i++) { + typeCheckers[i] = new TypeEqualsVisitor(targetRoot.getVector(i), + /* check name */ false, /* check meta data */ false); + } + } + + for (VectorSchemaRoot delta : rootsToAppend) { + // check schema, if necessary + if (checkSchema) { + if (delta.getFieldVectors().size() != targetRoot.getFieldVectors().size()) { + throw new IllegalArgumentException("Vector schema roots have different numbers of child vectors."); + } + for (int i = 0; i < typeCheckers.length; i++) { + if (!typeCheckers[i].equals(delta.getVector(i))) { + throw new IllegalArgumentException("Vector schema roots have different schemas."); + } + } + } + + // append child vectors. + for (int i = 0; i < appenders.length; i++) { + delta.getVector(i).accept(appenders[i], null); + } + targetRoot.setRowCount(targetRoot.getRowCount() + delta.getRowCount()); + } + } + + /** + * Appends a number of {@link VectorSchemaRoot}s. + * This method performs schema checking before appending data. + * @param targetRoot the vector schema root to be appended. + * @param rootsToAppend the vector schema roots to append. + * @throws IllegalArgumentException throws if we need to check schema, and checking schema fails. + */ + public static void append(VectorSchemaRoot targetRoot, VectorSchemaRoot... 
rootsToAppend) { + append(true, targetRoot, rootsToAppend); + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/MetadataV4UnionChecker.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/MetadataV4UnionChecker.java new file mode 100644 index 000000000..2a7068365 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/MetadataV4UnionChecker.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.validate; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.arrow.vector.types.MetadataVersion; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; + +/** + * Given a field, checks that no Union fields are present. + * + * This is intended to be used to prevent unions from being read/written with V4 metadata. 
+ */ +public final class MetadataV4UnionChecker { + static boolean isUnion(Field field) { + return field.getType().getTypeID() == ArrowType.ArrowTypeID.Union; + } + + static Field check(Field field) { + if (isUnion(field)) { + return field; + } + // Naive recursive DFS + for (final Field child : field.getChildren()) { + final Field result = check(child); + if (result != null) { + return result; + } + } + return null; + } + + /** + * Check the schema, raising an error if an unsupported feature is used (e.g. unions with < V5 metadata). + */ + public static void checkForUnion(Iterator fields, MetadataVersion metadataVersion) { + if (metadataVersion.toFlatbufID() >= MetadataVersion.V5.toFlatbufID()) { + return; + } + while (fields.hasNext()) { + Field union = check(fields.next()); + if (union != null) { + throw new IllegalArgumentException( + "Cannot write union with V4 metadata version, use V5 instead. Found field: " + union); + } + } + } + + /** + * Check the schema, raising an error if an unsupported feature is used (e.g. unions with < V5 metadata). + */ + public static void checkRead(Schema schema, MetadataVersion metadataVersion) throws IOException { + if (metadataVersion.toFlatbufID() >= MetadataVersion.V5.toFlatbufID()) { + return; + } + for (final Field field : schema.getFields()) { + Field union = check(field); + if (union != null) { + throw new IOException("Cannot read union with V4 metadata version. Found field: " + union); + } + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateUtil.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateUtil.java new file mode 100644 index 000000000..e1b60e926 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateUtil.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.validate; + +/** + * Utilities for vector validation. + */ +public class ValidateUtil { + + private ValidateUtil() { + } + + /** + * Validate the expression. + * @param expression the expression to validate. + * @param errorMessage the error message. + * @throws ValidateException if the expression evaluates to false. + */ + public static void validateOrThrow(boolean expression, String errorMessage) { + if (!expression) { + throw new ValidateException(errorMessage); + } + } + + /** + * Validate the expression. + * @param expression the expression to validate. + * @param errorMessage the error message template. + * @param args the error message arguments. + * @throws ValidateException if the expression evaluates to false. + */ + public static void validateOrThrow(boolean expression, String errorMessage, Object... args) { + if (!expression) { + throw new ValidateException(String.format(errorMessage, args)); + } + } + + /** + * A exception that is thrown when the vector validation fails. 
+ */ + public static class ValidateException extends RuntimeException { + public ValidateException(String message) { + super(message); + } + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java new file mode 100644 index 000000000..d4abaa194 --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.validate; + +import static org.apache.arrow.vector.validate.ValidateUtil.validateOrThrow; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.ExtensionTypeVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.TypeLayout; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.types.pojo.ArrowType; + +/** + * Visitor to validate vector buffers. 
+ */ +public class ValidateVectorBufferVisitor implements VectorVisitor<Void, Void> { + + /** + * Checks shared by all vector kinds: the value count must not be negative and, for field vectors, + * the number of field buffers must match the number expected for the vector's type. + */ + private void validateVectorCommon(ValueVector vector) { + ArrowType arrowType = vector.getField().getType(); + // report the offending value count itself (not the capacity) in the message + validateOrThrow(vector.getValueCount() >= 0, + "Vector valueCount %s is negative.", vector.getValueCount()); + + if (vector instanceof FieldVector) { + FieldVector fieldVector = (FieldVector) vector; + int typeBufferCount = TypeLayout.getTypeBufferCount(arrowType); + validateOrThrow(fieldVector.getFieldBuffers().size() == typeBufferCount, + "Expected %s buffers in vector of type %s, got %s.", + typeBufferCount, vector.getField().getType().toString(), fieldVector.getFieldBuffers().size()); + } + } + + /** Checks that the validity buffer exists and holds at least one bit per value. */ + private void validateValidityBuffer(ValueVector vector, int valueCount) { + ArrowBuf validityBuffer = vector.getValidityBuffer(); + validateOrThrow(validityBuffer != null, "The validity buffer is null."); + validateOrThrow(validityBuffer.capacity() * 8 >= valueCount, + "Not enough capacity for the validity buffer. Minimum capacity %s, actual capacity %s.", + (valueCount + 7) / 8, validityBuffer.capacity()); + } + + /** Checks that the offset buffer exists and has at least minCapacity bytes. */ + private void validateOffsetBuffer(ValueVector vector, long minCapacity) { + ArrowBuf offsetBuffer = vector.getOffsetBuffer(); + validateOrThrow(offsetBuffer != null, "The offset buffer is null."); + validateOrThrow(offsetBuffer.capacity() >= minCapacity, + "Not enough capacity for the offset buffer. Minimum capacity %s, actual capacity %s.", + minCapacity, offsetBuffer.capacity()); + } + + private void validateFixedWidthDataBuffer(ValueVector vector, int valueCount, int bitWidth) { + ArrowBuf dataBuffer = vector.getDataBuffer(); + validateOrThrow(dataBuffer != null, "The fixed width data buffer is null."); + validateOrThrow((long) bitWidth * valueCount <= dataBuffer.capacity() * 8L, + "Not enough capacity for fixed width data buffer. 
Minimum capacity %s, actual capacity %s.", + ((long) bitWidth * valueCount + 7L) / 8L, dataBuffer.capacity()); + } + + /** Checks that the data buffer exists and has at least minCapacity bytes. */ + private void validateDataBuffer(ValueVector vector, long minCapacity) { + ArrowBuf dataBuffer = vector.getDataBuffer(); + validateOrThrow(dataBuffer != null, "The data buffer is null."); + validateOrThrow(dataBuffer.capacity() >= minCapacity, + "Not enough capacity for data buffer. Minimum capacity %s, actual capacity %s.", + minCapacity, dataBuffer.capacity()); + } + + /** Checks that the type buffer exists and has at least minCapacity bytes. */ + private void validateTypeBuffer(ArrowBuf typeBuf, long minCapacity) { + validateOrThrow(typeBuf != null, "The type buffer is null."); + validateOrThrow(typeBuf.capacity() >= minCapacity, + "Not enough capacity for type buffer. Minimum capacity %s, actual capacity %s.", + minCapacity, typeBuf.capacity()); + } + + @Override + public Void visit(BaseFixedWidthVector vector, Void value) { + int bitWidth = (vector instanceof BitVector) ? 1 : vector.getTypeWidth() * 8; + int valueCount = vector.getValueCount(); + validateVectorCommon(vector); + validateValidityBuffer(vector, valueCount); + validateFixedWidthDataBuffer(vector, valueCount, bitWidth); + return null; + } + + /** + * Validates the validity, offset and data buffers of a variable width vector. + * The data buffer must be at least as large as the last offset. + */ + @Override + public Void visit(BaseVariableWidthVector vector, Void value) { + int valueCount = vector.getValueCount(); + validateVectorCommon(vector); + validateValidityBuffer(vector, valueCount); + long minOffsetCapacity = valueCount == 0 ? 0L : (long) (valueCount + 1) * BaseVariableWidthVector.OFFSET_WIDTH; + validateOffsetBuffer(vector, minOffsetCapacity); + // widen before multiplying so the byte index cannot overflow int for very large value counts + int lastOffset = valueCount == 0 ? 0 : + vector.getOffsetBuffer().getInt((long) valueCount * BaseVariableWidthVector.OFFSET_WIDTH); + validateDataBuffer(vector, lastOffset); + return null; + } + + @Override + public Void visit(BaseLargeVariableWidthVector vector, Void value) { + int valueCount = vector.getValueCount(); + validateVectorCommon(vector); + validateValidityBuffer(vector, valueCount); + long minOffsetCapacity = valueCount == 0 ? 
0L + : (long) (valueCount + 1) * BaseLargeVariableWidthVector.OFFSET_WIDTH; + validateOffsetBuffer(vector, minOffsetCapacity); + long lastOffset = valueCount == 0 ? 0L : + vector.getOffsetBuffer().getLong((long) valueCount * BaseLargeVariableWidthVector.OFFSET_WIDTH); + validateDataBuffer(vector, lastOffset); + return null; + } + + /** + * Validates the validity and offset buffers of a list vector, checks that the inner vector holds + * at least as many elements as the last offset, and then validates the inner vector. + */ + @Override + public Void visit(ListVector vector, Void value) { + int valueCount = vector.getValueCount(); + validateVectorCommon(vector); + validateValidityBuffer(vector, valueCount); + long minOffsetCapacity = valueCount == 0 ? 0L : (long) (valueCount + 1) * ListVector.OFFSET_WIDTH; + validateOffsetBuffer(vector, minOffsetCapacity); + + FieldVector dataVector = vector.getDataVector(); + // widen before multiplying so the byte index cannot overflow int for very large value counts + int lastOffset = valueCount == 0 ? 0 : + vector.getOffsetBuffer().getInt((long) valueCount * ListVector.OFFSET_WIDTH); + int dataVectorLength = dataVector == null ? 0 : dataVector.getValueCount(); + // the required minimum is exactly lastOffset, matching the condition being checked + validateOrThrow(dataVectorLength >= lastOffset, + "Inner vector does not contain enough elements. Minimum element count %s, actual element count %s", + lastOffset, dataVectorLength); + + if (dataVector != null) { + dataVector.accept(this, null); + } + return null; + } + + @Override + public Void visit(FixedSizeListVector vector, Void value) { + int valueCount = vector.getValueCount(); + validateVectorCommon(vector); + validateValidityBuffer(vector, valueCount); + FieldVector dataVector = vector.getDataVector(); + int dataVectorLength = dataVector == null ? 0 : dataVector.getValueCount(); + validateOrThrow(dataVectorLength >= valueCount * vector.getListSize(), + "Inner vector does not contain enough elements. 
Minimum element count %s, actual element count %s.", + valueCount * vector.getListSize(), dataVectorLength); + if (dataVector != null) { + dataVector.accept(this, null); + } + return null; + } + + @Override + public Void visit(LargeListVector vector, Void value) { + int valueCount = vector.getValueCount(); + validateVectorCommon(vector); + validateValidityBuffer(vector, valueCount); + long minOffsetCapacity = valueCount == 0 ? 0L : (long) (valueCount + 1) * LargeListVector.OFFSET_WIDTH; + validateOffsetBuffer(vector, minOffsetCapacity); + + FieldVector dataVector = vector.getDataVector(); + long lastOffset = valueCount == 0 ? 0 : + vector.getOffsetBuffer().getLong(valueCount * BaseLargeVariableWidthVector.OFFSET_WIDTH); + int dataVectorLength = dataVector == null ? 0 : dataVector.getValueCount(); + validateOrThrow(dataVectorLength >= lastOffset, + "Inner vector does not contain enough elements. Minimum element count %s, actual element count %s", + lastOffset + 1, dataVectorLength); + + if (dataVector != null) { + dataVector.accept(this, null); + } + return null; + } + + @Override + public Void visit(NonNullableStructVector vector, Void value) { + int valueCount = vector.getValueCount(); + validateVectorCommon(vector); + validateValidityBuffer(vector, valueCount); + for (ValueVector subVector : vector.getChildrenFromFields()) { + validateOrThrow(valueCount == subVector.getValueCount(), + "Struct vector length not equal to child vector length. 
Struct vector length %s, child vector length %s", + valueCount, subVector.getValueCount()); + subVector.accept(this, null); + } + return null; + } + + @Override + public Void visit(UnionVector vector, Void value) { + int valueCount = vector.getValueCount(); + validateVectorCommon(vector); + validateTypeBuffer(vector.getTypeBuffer(), valueCount * UnionVector.TYPE_WIDTH); + for (ValueVector subVector : vector.getChildrenFromFields()) { + validateOrThrow(valueCount == subVector.getValueCount(), + "Union vector length not equal to child vector length. Union vector length %s, child vector length %s", + valueCount, subVector.getValueCount()); + subVector.accept(this, null); + } + return null; + } + + @Override + public Void visit(DenseUnionVector vector, Void value) { + int valueCount = vector.getValueCount(); + validateVectorCommon(vector); + validateOffsetBuffer(vector, (long) valueCount * DenseUnionVector.OFFSET_WIDTH); + validateTypeBuffer(vector.getTypeBuffer(), valueCount * DenseUnionVector.TYPE_WIDTH); + for (ValueVector subVector : vector.getChildrenFromFields()) { + subVector.accept(this, null); + } + return null; + } + + @Override + public Void visit(NullVector vector, Void value) { + return null; + } + + @Override + public Void visit(ExtensionTypeVector vector, Void value) { + vector.getUnderlyingVector().accept(this, value); + return null; + } +} diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java new file mode 100644 index 000000000..cdeb4f1ea --- /dev/null +++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.validate; + +import static org.apache.arrow.vector.validate.ValidateUtil.validateOrThrow; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.ExtensionTypeVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.UnionVector; + +/** + * Utility for validating vector data. 
+ */
+public class ValidateVectorDataVisitor implements VectorVisitor<Void, Void> {
+
+  /**
+   * Checks the 32-bit offsets stored in a vector's offset buffer.
+   *
+   * <p>Reads {@code valueCount + 1} offsets and fails through {@code validateOrThrow} as soon as
+   * one of them is negative or smaller than its predecessor. Nothing is read when
+   * {@code valueCount} is 0.
+   *
+   * @param vector the vector whose offset buffer is inspected
+   * @param valueCount the number of values held by {@code vector}
+   */
+  private void validateOffsetBuffer(ValueVector vector, int valueCount) {
+    if (valueCount == 0) {
+      return;
+    }
+    ArrowBuf offsetBuffer = vector.getOffsetBuffer();
+
+    // verify that the values in the offset buffer are non-negative and non-decreasing
+    int prevValue = offsetBuffer.getInt(0);
+    // the loop below only tests positions 1..valueCount, so the first offset is checked here
+    validateOrThrow(prevValue >= 0,
+        "The value at position %s of the offset buffer is negative: %s.", 0, prevValue);
+    for (int i = 1; i <= valueCount; i++) {
+      // widen before multiplying so the byte index cannot overflow an int for large vectors
+      int curValue = offsetBuffer.getInt((long) i * 4);
+      validateOrThrow(curValue >= 0,
+          "The value at position %s of the offset buffer is negative: %s.", i, curValue);
+      validateOrThrow(curValue >= prevValue,
+          "The values in positions %s and %s of the offset buffer are decreasing: %s, %s.",
+          i - 1, i, prevValue, curValue);
+      prevValue = curValue;
+    }
+  }
+
+  /**
+   * Checks the 64-bit offsets stored in a vector's offset buffer.
+   *
+   * <p>Same contract as {@link #validateOffsetBuffer(ValueVector, int)}, but every offset is read
+   * as a {@code long} at a stride of 8 bytes.
+   *
+   * @param vector the vector whose offset buffer is inspected
+   * @param valueCount the number of values held by {@code vector}
+   */
+  private void validateLargeOffsetBuffer(ValueVector vector, int valueCount) {
+    if (valueCount == 0) {
+      return;
+    }
+    ArrowBuf offsetBuffer = vector.getOffsetBuffer();
+
+    // verify that the values in the large offset buffer are non-negative and non-decreasing
+    long prevValue = offsetBuffer.getLong(0);
+    // the loop below only tests positions 1..valueCount, so the first offset is checked here
+    validateOrThrow(prevValue >= 0L,
+        "The value at position %s of the large offset buffer is negative: %s.", 0, prevValue);
+    for (int i = 1; i <= valueCount; i++) {
+      long curValue = offsetBuffer.getLong((long) i * 8);
+      validateOrThrow(curValue >= 0L,
+          "The value at position %s of the large offset buffer is negative: %s.", i, curValue);
+      validateOrThrow(curValue >= prevValue,
+          "The values in positions %s and %s of the large offset buffer are decreasing: %s, %s.",
+          i - 1, i, prevValue, curValue);
+      prevValue = curValue;
+    }
+  }
+
+  /**
+   * Checks that the first {@code valueCount} type ids of a union type buffer are non-negative.
+   *
+   * @param typeBuf the buffer holding one type id byte per value
+   * @param valueCount the number of type ids to check
+   */
+  private void validateTypeBuffer(ArrowBuf typeBuf, int valueCount) {
+    for (int i = 0; i < valueCount; i++) {
+      byte typeId = typeBuf.getByte(i);
+      validateOrThrow(typeId >= 0, "The type id at position %s is negative: %s.", i, typeId);
+    }
+  }
+
+  /** Fixed width vectors carry no offsets, so there is nothing to validate at the data level. */
+  @Override
+  public Void visit(BaseFixedWidthVector vector, Void value) {
+    return null;
+  }
+
+  /** Validates the 32-bit offsets of a variable width vector. */
+  @Override
+  public Void visit(BaseVariableWidthVector vector, Void value) {
+    validateOffsetBuffer(vector, vector.getValueCount());
+    return null;
+  }
+
+  /** Validates the 64-bit offsets of a large variable width vector. */
+  @Override
+  public Void visit(BaseLargeVariableWidthVector vector, Void value) {
+    validateLargeOffsetBuffer(vector, vector.getValueCount());
+    return null;
+  }
+
+  /** Validates the 32-bit offsets of a list vector, then recurses into its data vector. */
+  @Override
+  public Void visit(ListVector vector, Void value) {
+    validateOffsetBuffer(vector, vector.getValueCount());
+    ValueVector innerVector = vector.getDataVector();
+    if (innerVector != null) {
+      innerVector.accept(this, null);
+    }
+    return null;
+  }
+
+  /** Recurses into the data vector of a fixed size list vector. */
+  @Override
+  public Void visit(FixedSizeListVector vector, Void value) {
+    // The fixed-size list layout has no offset buffer (every slot has the same length), so,
+    // unlike ListVector, there are no offsets to check here. The previous call to
+    // validateOffsetBuffer asked the vector for a buffer it does not have.
+    ValueVector innerVector = vector.getDataVector();
+    if (innerVector != null) {
+      innerVector.accept(this, null);
+    }
+    return null;
+  }
+
+  /** Validates the 64-bit offsets of a large list vector, then recurses into its data vector. */
+  @Override
+  public Void visit(LargeListVector vector, Void value) {
+    validateLargeOffsetBuffer(vector, vector.getValueCount());
+    ValueVector innerVector = vector.getDataVector();
+    if (innerVector != null) {
+      innerVector.accept(this, null);
+    }
+    return null;
+  }
+
+  /** Validates every child vector of a struct vector. */
+  @Override
+  public Void visit(NonNullableStructVector vector, Void value) {
+    for (ValueVector subVector : vector.getChildrenFromFields()) {
+      subVector.accept(this, null);
+    }
+    return null;
+  }
+
+  /** Validates the type ids of a sparse union vector, then every child vector. */
+  @Override
+  public Void visit(UnionVector vector, Void value) {
+    validateTypeBuffer(vector.getTypeBuffer(), vector.getValueCount());
+    for (ValueVector subVector : vector.getChildrenFromFields()) {
+      subVector.accept(this, null);
+    }
+    return null;
+  }
+
+  /**
+   * Validates the type ids and offsets of a dense union vector, then every child vector.
+   *
+   * <p>Each offset must address an existing slot of the child vector selected by the type id at
+   * the same position.
+   */
+  @Override
+  public Void visit(DenseUnionVector vector, Void value) {
+    int valueCount = vector.getValueCount();
+    validateTypeBuffer(vector.getTypeBuffer(), valueCount);
+
+    // validate offset buffer
+    for (int i = 0; i < valueCount; i++) {
+      int offset = vector.getOffset(i);
+      byte typeId = vector.getTypeId(i);
+      ValueVector subVector = vector.getVectorByType(typeId);
+      // NOTE(review): guards against a type id with no registered child; confirm that
+      // getVectorByType can return null rather than throw.
+      validateOrThrow(subVector != null,
+          "Dense union vector has no sub-vector for type id %s at position %s.", typeId, i);
+      validateOrThrow(offset >= 0,
+          "Dense union vector offset at position %s is negative: %s.", i, offset);
+      validateOrThrow(offset < subVector.getValueCount(),
+          "Dense union vector offset exceeds sub-vector boundary. Vector offset %s, sub vector size %s",
+          offset, subVector.getValueCount());
+    }
+
+    for (ValueVector subVector : vector.getChildrenFromFields()) {
+      subVector.accept(this, null);
+    }
+    return null;
+  }
+
+  /** A null vector has no buffers to validate. */
+  @Override
+  public Void visit(NullVector vector, Void value) {
+    return null;
+  }
+
+  /** Validates the storage vector that backs an extension type vector. */
+  @Override
+  public Void visit(ExtensionTypeVector<?> vector, Void value) {
+    vector.getUnderlyingVector().accept(this, value);
+    return null;
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java
new file mode 100644
index 000000000..65795b468
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java
@@ -0,0 +1,378 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.arrow.vector.validate; + +import static org.apache.arrow.vector.validate.ValidateUtil.validateOrThrow; + +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.DateDayVector; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.Decimal256Vector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.DurationVector; +import org.apache.arrow.vector.ExtensionTypeVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalMonthDayNanoVector; +import org.apache.arrow.vector.IntervalYearVector; +import org.apache.arrow.vector.LargeVarBinaryVector; +import org.apache.arrow.vector.LargeVarCharVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TimeMicroVector; +import org.apache.arrow.vector.TimeMilliVector; +import org.apache.arrow.vector.TimeNanoVector; +import org.apache.arrow.vector.TimeSecVector; +import org.apache.arrow.vector.TimeStampMicroTZVector; +import org.apache.arrow.vector.TimeStampMicroVector; +import org.apache.arrow.vector.TimeStampMilliTZVector; +import org.apache.arrow.vector.TimeStampMilliVector; +import org.apache.arrow.vector.TimeStampNanoTZVector; +import org.apache.arrow.vector.TimeStampNanoVector; +import org.apache.arrow.vector.TimeStampSecTZVector; +import org.apache.arrow.vector.TimeStampSecVector; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.UInt1Vector; +import 
org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.types.DateUnit; +import org.apache.arrow.vector.types.FloatingPointPrecision; +import org.apache.arrow.vector.types.IntervalUnit; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.UnionMode; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; + +/** + * Utility to validate vector type information. 
+ */
+public class ValidateVectorTypeVisitor implements VectorVisitor<Void, Void> {
+
+  /**
+   * Checks that a vector has a field, a field type and an arrow type, and that the arrow type is
+   * exactly of the expected class.
+   *
+   * @param vector the vector to check
+   * @param expectedArrowType the {@link ArrowType} class the vector's field must carry
+   */
+  private void validateVectorCommon(ValueVector vector, Class<? extends ArrowType> expectedArrowType) {
+    validateOrThrow(vector.getField() != null, "Vector field is empty.");
+    validateOrThrow(vector.getField().getFieldType() != null, "Vector field type is empty.");
+    ArrowType arrowType = vector.getField().getFieldType().getType();
+    validateOrThrow(arrowType != null, "Vector arrow type is empty.");
+    // pass the values as format arguments so the message is only built when the check fails
+    validateOrThrow(expectedArrowType == arrowType.getClass(),
+        "Incorrect arrow type for %s : %s", vector.getClass(), arrowType);
+  }
+
+  /**
+   * Checks that a vector's arrow type is an integer type with the given signedness and bit width.
+   *
+   * @param vector the vector to check
+   * @param expectedWidth the required bit width
+   * @param expectedSigned whether the integer type must be signed
+   */
+  private void validateIntVector(ValueVector vector, int expectedWidth, boolean expectedSigned) {
+    validateOrThrow(vector.getField().getFieldType().getType() instanceof ArrowType.Int,
+        "Vector %s is not an integer vector.", vector.getClass());
+    ArrowType.Int intType = (ArrowType.Int) vector.getField().getFieldType().getType();
+    // report the signedness that failed (the message used to print the bit widths)
+    validateOrThrow(intType.getIsSigned() == expectedSigned,
+        "Expecting signed %s, actual signed %s.", expectedSigned, intType.getIsSigned());
+    validateOrThrow(intType.getBitWidth() == expectedWidth, "Expecting bit width %s, actual bit width %s.",
+        expectedWidth, intType.getBitWidth());
+  }
+
+  /**
+   * Checks that a vector's arrow type is a floating point type with the given precision.
+   *
+   * @param vector the vector to check
+   * @param expectedPrecision the required precision
+   */
+  private void validateFloatingPointVector(ValueVector vector, FloatingPointPrecision expectedPrecision) {
+    validateOrThrow(vector.getField().getFieldType().getType() instanceof ArrowType.FloatingPoint,
+        "Vector %s is not a floating point vector.", vector.getClass());
+    ArrowType.FloatingPoint floatType = (ArrowType.FloatingPoint) vector.getField().getFieldType().getType();
+    validateOrThrow(floatType.getPrecision() == expectedPrecision, "Expecting precision %s, actual precision %s.",
+        expectedPrecision, floatType.getPrecision());
+  }
+
+  /**
+   * Checks that a vector's arrow type is a date type with the given unit.
+   *
+   * @param vector the vector to check
+   * @param expectedDateUnit the required date unit
+   */
+  private void validateDateVector(ValueVector vector, DateUnit expectedDateUnit) {
+    validateOrThrow(vector.getField().getFieldType().getType() instanceof ArrowType.Date,
+        "Vector %s is not a date vector", vector.getClass());
+    ArrowType.Date dateType = (ArrowType.Date) vector.getField().getFieldType().getType();
+    validateOrThrow(dateType.getUnit() == expectedDateUnit,
+        "Expecting date unit %s, actual date unit %s.", expectedDateUnit, dateType.getUnit());
+  }
+
+  /**
+   * Checks that a vector's arrow type is a time type with the given unit and bit width.
+   *
+   * @param vector the vector to check
+   * @param expectedTimeUnit the required time unit
+   * @param expectedBitWidth the required bit width
+   */
+  private void validateTimeVector(ValueVector vector, TimeUnit expectedTimeUnit, int expectedBitWidth) {
+    validateOrThrow(vector.getField().getFieldType().getType() instanceof ArrowType.Time,
+        "Vector %s is not a time vector.", vector.getClass());
+    ArrowType.Time timeType = (ArrowType.Time) vector.getField().getFieldType().getType();
+    validateOrThrow(timeType.getUnit() == expectedTimeUnit,
+        "Expecting time unit %s, actual time unit %s.", expectedTimeUnit, timeType.getUnit());
+    validateOrThrow(timeType.getBitWidth() == expectedBitWidth,
+        "Expecting bit width %s, actual bit width %s.", expectedBitWidth, timeType.getBitWidth());
+  }
+
+  /**
+   * Checks that a vector's arrow type is an interval type with the given unit.
+   *
+   * @param vector the vector to check
+   * @param expectedIntervalUnit the required interval unit
+   */
+  private void validateIntervalVector(ValueVector vector, IntervalUnit expectedIntervalUnit) {
+    validateOrThrow(vector.getField().getFieldType().getType() instanceof ArrowType.Interval,
+        "Vector %s is not an interval vector.", vector.getClass());
+    ArrowType.Interval intervalType = (ArrowType.Interval) vector.getField().getFieldType().getType();
+    validateOrThrow(intervalType.getUnit() == expectedIntervalUnit,
+        "Expecting interval unit %s, actual interval unit %s.", expectedIntervalUnit, intervalType.getUnit());
+  }
+
+  /**
+   * Checks that a vector's arrow type is a timestamp type with the given unit, and that a time
+   * zone is present exactly when one is expected.
+   *
+   * @param vector the vector to check
+   * @param expectedTimeUnit the required time unit
+   * @param expectTZ whether the timestamp type must carry a time zone
+   */
+  private void validateTimeStampVector(ValueVector vector, TimeUnit expectedTimeUnit, boolean expectTZ) {
+    validateOrThrow(vector.getField().getFieldType().getType() instanceof ArrowType.Timestamp,
+        "Vector %s is not a time stamp vector.", vector.getClass());
+    ArrowType.Timestamp timestampType = (ArrowType.Timestamp) vector.getField().getFieldType().getType();
+    validateOrThrow(timestampType.getUnit() == expectedTimeUnit,
+        "Expecting time stamp unit %s, actual time stamp unit %s.", expectedTimeUnit, timestampType.getUnit());
+    if (expectTZ) {
+      validateOrThrow(timestampType.getTimezone() != null, "The time zone should not be null");
+    } else {
+      validateOrThrow(timestampType.getTimezone() == null, "The time zone should be null");
+    }
+  }
+
+  /**
+   * Checks that an extension type vector carries an extension arrow type and the extension
+   * metadata key, then validates its underlying storage vector.
+   *
+   * @param vector the extension type vector to check
+   */
+  private void validateExtensionTypeVector(ExtensionTypeVector<?> vector) {
+    validateOrThrow(vector.getField().getFieldType().getType() instanceof ArrowType.ExtensionType,
+        "Vector %s is not an extension type vector.", vector.getClass());
+    validateOrThrow(vector.getField().getMetadata().containsKey(ArrowType.ExtensionType.EXTENSION_METADATA_KEY_NAME),
+        "Field %s does not have proper extension type metadata: %s",
+        vector.getField().getName(),
+        vector.getField().getMetadata());
+    // Validate the storage vector type
+    vector.getUnderlyingVector().accept(this, null);
+  }
+
+  /**
+   * Validates the arrow type of a fixed width vector against its concrete vector class.
+   *
+   * @throws IllegalArgumentException if the vector class is not one of the known fixed width
+   *     vector classes
+   */
+  @Override
+  public Void visit(BaseFixedWidthVector vector, Void value) {
+    if (vector instanceof TinyIntVector) {
+      validateVectorCommon(vector, ArrowType.Int.class);
+      validateIntVector(vector, 8, true);
+    } else if (vector instanceof SmallIntVector) {
+      validateVectorCommon(vector, ArrowType.Int.class);
+      validateIntVector(vector, 16, true);
+    } else if (vector instanceof IntVector) {
+      validateVectorCommon(vector, ArrowType.Int.class);
+      validateIntVector(vector, 32, true);
+    } else if (vector instanceof BigIntVector) {
+      validateVectorCommon(vector, ArrowType.Int.class);
+      validateIntVector(vector, 64, true);
+    } else if (vector instanceof UInt1Vector) {
+      validateVectorCommon(vector, ArrowType.Int.class);
+      validateIntVector(vector, 8, false);
+    } else if (vector instanceof UInt2Vector) {
+      validateVectorCommon(vector, ArrowType.Int.class);
+      validateIntVector(vector, 16, false);
+    } else if (vector instanceof UInt4Vector) {
+      validateVectorCommon(vector, ArrowType.Int.class);
+      validateIntVector(vector, 32, false);
+    } else if (vector instanceof UInt8Vector) {
+      validateVectorCommon(vector, ArrowType.Int.class);
+      validateIntVector(vector, 64, false);
+    } else if (vector instanceof BitVector) {
+      validateVectorCommon(vector, ArrowType.Bool.class);
+    } else if (vector instanceof DecimalVector || vector instanceof Decimal256Vector) {
+      validateVectorCommon(vector, ArrowType.Decimal.class);
+      ArrowType.Decimal arrowType = (ArrowType.Decimal) vector.getField().getType();
+      // a scale of 0 (an integral decimal) is legitimate; only a negative scale is rejected
+      validateOrThrow(arrowType.getScale() >= 0, "The scale of decimal %s is negative.", arrowType.getScale());
+      validateOrThrow(arrowType.getPrecision() > 0, "The precision of decimal %s is not positive.",
+          arrowType.getPrecision());
+    } else if (vector instanceof DateDayVector) {
+      validateVectorCommon(vector, ArrowType.Date.class);
+      validateDateVector(vector, DateUnit.DAY);
+    } else if (vector instanceof DateMilliVector) {
+      validateVectorCommon(vector, ArrowType.Date.class);
+      validateDateVector(vector, DateUnit.MILLISECOND);
+    } else if (vector instanceof DurationVector) {
+      validateVectorCommon(vector, ArrowType.Duration.class);
+      ArrowType.Duration arrowType = (ArrowType.Duration) vector.getField().getType();
+      validateOrThrow(((DurationVector) vector).getUnit() == arrowType.getUnit(),
+          "Different duration time unit for vector and arrow type. Vector time unit %s, type time unit %s.",
+          ((DurationVector) vector).getUnit(), arrowType.getUnit());
+    } else if (vector instanceof Float4Vector) {
+      validateVectorCommon(vector, ArrowType.FloatingPoint.class);
+      validateFloatingPointVector(vector, FloatingPointPrecision.SINGLE);
+    } else if (vector instanceof Float8Vector) {
+      validateVectorCommon(vector, ArrowType.FloatingPoint.class);
+      validateFloatingPointVector(vector, FloatingPointPrecision.DOUBLE);
+    } else if (vector instanceof IntervalDayVector) {
+      validateVectorCommon(vector, ArrowType.Interval.class);
+      validateIntervalVector(vector, IntervalUnit.DAY_TIME);
+    } else if (vector instanceof IntervalMonthDayNanoVector) {
+      validateVectorCommon(vector, ArrowType.Interval.class);
+      validateIntervalVector(vector, IntervalUnit.MONTH_DAY_NANO);
+    } else if (vector instanceof IntervalYearVector) {
+      validateVectorCommon(vector, ArrowType.Interval.class);
+      validateIntervalVector(vector, IntervalUnit.YEAR_MONTH);
+    } else if (vector instanceof TimeMicroVector) {
+      validateVectorCommon(vector, ArrowType.Time.class);
+      validateTimeVector(vector, TimeUnit.MICROSECOND, 64);
+    } else if (vector instanceof TimeMilliVector) {
+      validateVectorCommon(vector, ArrowType.Time.class);
+      validateTimeVector(vector, TimeUnit.MILLISECOND, 32);
+    } else if (vector instanceof TimeNanoVector) {
+      validateVectorCommon(vector, ArrowType.Time.class);
+      validateTimeVector(vector, TimeUnit.NANOSECOND, 64);
+    } else if (vector instanceof TimeSecVector) {
+      validateVectorCommon(vector, ArrowType.Time.class);
+      validateTimeVector(vector, TimeUnit.SECOND, 32);
+    } else if (vector instanceof TimeStampMicroTZVector) {
+      validateVectorCommon(vector, ArrowType.Timestamp.class);
+      validateTimeStampVector(vector, TimeUnit.MICROSECOND, true);
+    } else if (vector instanceof TimeStampMicroVector) {
+      validateVectorCommon(vector, ArrowType.Timestamp.class);
+      validateTimeStampVector(vector, TimeUnit.MICROSECOND, false);
+    } else if (vector instanceof TimeStampMilliTZVector) {
+      validateVectorCommon(vector, ArrowType.Timestamp.class);
+      validateTimeStampVector(vector, TimeUnit.MILLISECOND, true);
+    } else if (vector instanceof TimeStampMilliVector) {
+      validateVectorCommon(vector, ArrowType.Timestamp.class);
+      validateTimeStampVector(vector, TimeUnit.MILLISECOND, false);
+    } else if (vector instanceof TimeStampNanoTZVector) {
+      validateVectorCommon(vector, ArrowType.Timestamp.class);
+      validateTimeStampVector(vector, TimeUnit.NANOSECOND, true);
+    } else if (vector instanceof TimeStampNanoVector) {
+      validateVectorCommon(vector, ArrowType.Timestamp.class);
+      validateTimeStampVector(vector, TimeUnit.NANOSECOND, false);
+    } else if (vector instanceof TimeStampSecTZVector) {
+      validateVectorCommon(vector, ArrowType.Timestamp.class);
+      validateTimeStampVector(vector, TimeUnit.SECOND, true);
+    } else if (vector instanceof TimeStampSecVector) {
+      validateVectorCommon(vector, ArrowType.Timestamp.class);
+      validateTimeStampVector(vector, TimeUnit.SECOND, false);
+    } else if (vector instanceof FixedSizeBinaryVector) {
+      validateVectorCommon(vector, ArrowType.FixedSizeBinary.class);
+      ArrowType.FixedSizeBinary arrowType = (ArrowType.FixedSizeBinary) vector.getField().getType();
+      validateOrThrow(arrowType.getByteWidth() > 0, "The byte width of a FixedSizeBinaryVector %s is not positive.",
+          arrowType.getByteWidth());
+      validateOrThrow(arrowType.getByteWidth() == vector.getTypeWidth(),
+          "Type width mismatch for FixedSizeBinaryVector. Vector type width %s, arrow type type width %s.",
+          vector.getTypeWidth(), arrowType.getByteWidth());
+    } else {
+      throw new IllegalArgumentException("Unknown type for fixed width vector " + vector.getClass());
+    }
+    return null;
+  }
+
+  /**
+   * Validates the arrow type of a variable width vector. Vector classes other than
+   * {@link VarCharVector} and {@link VarBinaryVector} are accepted without any check.
+   */
+  @Override
+  public Void visit(BaseVariableWidthVector vector, Void value) {
+    if (vector instanceof VarCharVector) {
+      validateVectorCommon(vector, ArrowType.Utf8.class);
+    } else if (vector instanceof VarBinaryVector) {
+      validateVectorCommon(vector, ArrowType.Binary.class);
+    }
+    return null;
+  }
+
+  /**
+   * Validates the arrow type of a large variable width vector. Vector classes other than
+   * {@link LargeVarCharVector} and {@link LargeVarBinaryVector} are accepted without any check.
+   */
+  @Override
+  public Void visit(BaseLargeVariableWidthVector vector, Void value) {
+    if (vector instanceof LargeVarCharVector) {
+      validateVectorCommon(vector, ArrowType.LargeUtf8.class);
+    } else if (vector instanceof LargeVarBinaryVector) {
+      validateVectorCommon(vector, ArrowType.LargeBinary.class);
+    }
+    return null;
+  }
+
+  /** Validates the arrow type of a list vector, then recurses into its data vector. */
+  @Override
+  public Void visit(ListVector vector, Void value) {
+    validateVectorCommon(vector, ArrowType.List.class);
+    ValueVector innerVector = vector.getDataVector();
+    if (innerVector != null) {
+      innerVector.accept(this, null);
+    }
+    return null;
+  }
+
+  /**
+   * Validates the arrow type of a fixed size list vector, checks that the list size of the type
+   * is positive and equal to the list size of the vector, then recurses into its data vector.
+   */
+  @Override
+  public Void visit(FixedSizeListVector vector, Void value) {
+    validateVectorCommon(vector, ArrowType.FixedSizeList.class);
+    ArrowType.FixedSizeList arrowType = (ArrowType.FixedSizeList) vector.getField().getType();
+    validateOrThrow(arrowType.getListSize() == vector.getListSize(),
+        "Inconsistent list size for FixedSizeListVector. Vector list size %s, arrow type list size %s.",
+        vector.getListSize(), arrowType.getListSize());
+    validateOrThrow(arrowType.getListSize() > 0, "The list size %s is not positive.", arrowType.getListSize());
+    ValueVector innerVector = vector.getDataVector();
+    if (innerVector != null) {
+      innerVector.accept(this, null);
+    }
+    return null;
+  }
+
+  /** Validates the arrow type of a large list vector, then recurses into its data vector. */
+  @Override
+  public Void visit(LargeListVector vector, Void value) {
+    validateVectorCommon(vector, ArrowType.LargeList.class);
+    ValueVector innerVector = vector.getDataVector();
+    if (innerVector != null) {
+      innerVector.accept(this, null);
+    }
+    return null;
+  }
+
+  /**
+   * Validates the arrow type of a struct vector, checks that the child fields and the child
+   * vectors agree in number and in field type, then recurses into every child vector.
+   */
+  @Override
+  public Void visit(NonNullableStructVector vector, Void value) {
+    validateVectorCommon(vector, ArrowType.Struct.class);
+    // evaluated once: the child count is used by the size check and as the loop bound
+    int childVectorCount = vector.getChildrenFromFields().size();
+    validateOrThrow(vector.getField().getChildren().size() == childVectorCount,
+        "Child field count and child vector count mismatch. Vector child count %s, field child count %s",
+        childVectorCount, vector.getField().getChildren().size());
+    for (int i = 0; i < childVectorCount; i++) {
+      ValueVector subVector = vector.getChildByOrdinal(i);
+      FieldType subType = vector.getField().getChildren().get(i).getFieldType();
+
+      validateOrThrow(subType.equals(subVector.getField().getFieldType()),
+          "Struct vector's field type not equal to the child vector's field type. " +
+          "Struct field type %s, sub-vector field type %s", subType, subVector.getField().getFieldType());
+      subVector.accept(this, null);
+    }
+    return null;
+  }
+
+  /** Validates that a union vector has a sparse union type, then recurses into its children. */
+  @Override
+  public Void visit(UnionVector vector, Void value) {
+    validateVectorCommon(vector, ArrowType.Union.class);
+    ArrowType.Union arrowType = (ArrowType.Union) vector.getField().getType();
+    validateOrThrow(arrowType.getMode() == UnionMode.Sparse, "The union mode of UnionVector must be sparse");
+    for (ValueVector subVector : vector.getChildrenFromFields()) {
+      subVector.accept(this, null);
+    }
+    return null;
+  }
+
+  /** Validates that a dense union vector has a dense union type, then recurses into its children. */
+  @Override
+  public Void visit(DenseUnionVector vector, Void value) {
+    validateVectorCommon(vector, ArrowType.Union.class);
+    ArrowType.Union arrowType = (ArrowType.Union) vector.getField().getType();
+    validateOrThrow(arrowType.getMode() == UnionMode.Dense, "The union mode of DenseUnionVector must be dense");
+    for (ValueVector subVector : vector.getChildrenFromFields()) {
+      subVector.accept(this, null);
+    }
+    return null;
+  }
+
+  /** Validates that a null vector carries the null arrow type. */
+  @Override
+  public Void visit(NullVector vector, Void value) {
+    validateVectorCommon(vector, ArrowType.Null.class);
+    return null;
+  }
+
+  /** Validates an extension type vector together with its storage vector. */
+  @Override
+  public Void visit(ExtensionTypeVector<?> vector, Void value) {
+    validateExtensionTypeVector(vector);
+    return null;
+  }
+}
diff --git a/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java
new file mode 100644
index 000000000..7e99b1f90
--- /dev/null
+++ b/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java
@@ -0,0 +1,273 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.validate; + +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.ExtensionTypeVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.ValueVectorUtility; + +/** + * Visitor to validate vector (without validating data). + * This visitor could be used for {@link ValueVector#accept(VectorVisitor, Object)} API, + * and also users could simply use {@link ValueVectorUtility#validate(ValueVector)}. 
+ */ +public class ValidateVectorVisitor implements VectorVisitor { + + @Override + public Void visit(BaseFixedWidthVector vector, Void value) { + if (vector.getValueCount() > 0) { + if (vector.getDataBuffer() == null || vector.getDataBuffer().capacity() == 0) { + throw new IllegalArgumentException("valueBuffer is null or capacity is 0"); + } + } + return null; + } + + @Override + public Void visit(BaseVariableWidthVector vector, Void value) { + + if (vector.getValueCount() > 0) { + if (vector.getDataBuffer() == null || vector.getDataBuffer().capacity() == 0) { + throw new IllegalArgumentException("valueBuffer is null or capacity is 0"); + } + + ArrowBuf offsetBuf = vector.getOffsetBuffer(); + int minBufferSize = (vector.getValueCount() + 1) * BaseVariableWidthVector.OFFSET_WIDTH; + + if (offsetBuf.capacity() < minBufferSize) { + throw new IllegalArgumentException(String.format("offsetBuffer too small in vector of type %s" + + " and valueCount %s : expected at least %s byte(s), got %s", + vector.getField().getType().toString(), + vector.getValueCount(), minBufferSize, offsetBuf.capacity())); + } + + int firstOffset = vector.getOffsetBuffer().getInt(0); + int lastOffset = vector.getOffsetBuffer().getInt(vector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + + if (firstOffset < 0 || lastOffset < 0) { + throw new IllegalArgumentException("Negative offsets in vector"); + } + + int dataExtent = lastOffset - firstOffset; + + if (dataExtent > 0 && (vector.getDataBuffer().capacity() == 0)) { + throw new IllegalArgumentException("dataBuffer capacity is 0"); + } + + if (dataExtent > vector.getDataBuffer().capacity()) { + throw new IllegalArgumentException(String.format("Length spanned by offsets %s larger than" + + " dataBuffer capacity %s", dataExtent, vector.getValueCount())); + } + } + return null; + } + + @Override + public Void visit(BaseLargeVariableWidthVector left, Void value) { + return null; + } + + @Override + public Void visit(ListVector vector, Void 
value) { + + FieldVector dataVector = vector.getDataVector(); + + if (vector.getValueCount() > 0) { + + ArrowBuf offsetBuf = vector.getOffsetBuffer(); + long minBufferSize = (vector.getValueCount() + 1L) * BaseVariableWidthVector.OFFSET_WIDTH; + + if (offsetBuf.capacity() < minBufferSize) { + throw new IllegalArgumentException(String.format("offsetBuffer too small in vector of type %s" + + " and valueCount %s : expected at least %s byte(s), got %s", + vector.getField().getType().toString(), + vector.getValueCount(), minBufferSize, offsetBuf.capacity())); + } + + int firstOffset = vector.getOffsetBuffer().getInt(0); + int lastOffset = offsetBuf.getInt((long) vector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + + if (firstOffset < 0 || lastOffset < 0) { + throw new IllegalArgumentException("Negative offsets in list vector"); + } + + int dataExtent = lastOffset - firstOffset; + + if (dataExtent > 0 && (dataVector.getDataBuffer() == null || dataVector.getDataBuffer().capacity() == 0)) { + throw new IllegalArgumentException("valueBuffer is null or capacity is 0"); + } + + if (dataExtent > dataVector.getValueCount()) { + throw new IllegalArgumentException(String.format("Length spanned by list offsets (%s) larger than" + + " data vector valueCount (length %s)", dataExtent, dataVector.getValueCount())); + } + } + + return dataVector.accept(this, null); + } + + @Override + public Void visit(LargeListVector vector, Void value) { + + FieldVector dataVector = vector.getDataVector(); + + if (vector.getValueCount() > 0) { + + ArrowBuf offsetBuf = vector.getOffsetBuffer(); + long minBufferSize = (vector.getValueCount() + 1L) * LargeListVector.OFFSET_WIDTH; + + if (offsetBuf.capacity() < minBufferSize) { + throw new IllegalArgumentException(String.format("offsetBuffer too small in vector of type %s" + + " and valueCount %s : expected at least %s byte(s), got %s", + vector.getField().getType().toString(), + vector.getValueCount(), minBufferSize,
offsetBuf.capacity())); + } + + long firstOffset = vector.getOffsetBuffer().getLong(0); + long lastOffset = vector.getOffsetBuffer().getLong((long) vector.getValueCount() * LargeListVector.OFFSET_WIDTH); + + if (firstOffset < 0 || lastOffset < 0) { + throw new IllegalArgumentException("Negative offsets in list vector"); + } + + long dataExtent = lastOffset - firstOffset; + + if (dataExtent > 0 && (dataVector.getDataBuffer() == null || dataVector.getDataBuffer().capacity() == 0)) { + throw new IllegalArgumentException("valueBuffer is null or capacity is 0"); + } + + if (dataExtent > dataVector.getValueCount()) { + throw new IllegalArgumentException(String.format("Length spanned by list offsets (%s) larger than" + + " data vector valueCount (length %s)", dataExtent, dataVector.getValueCount())); + } + } + + return dataVector.accept(this, null); + } + + @Override + public Void visit(FixedSizeListVector vector, Void value) { + + FieldVector dataVector = vector.getDataVector(); + int valueCount = vector.getValueCount(); + int listSize = vector.getListSize(); + + if (valueCount > 0 && (dataVector.getDataBuffer() == null || dataVector.getDataBuffer().capacity() == 0)) { + throw new IllegalArgumentException("valueBuffer is null or capacity is 0"); + } + + if ((long) valueCount * listSize != dataVector.getValueCount()) { + throw new IllegalArgumentException(String.format("data vector valueCount invalid, expect %s, " + + "actual is: %s", (long) valueCount * listSize, dataVector.getValueCount())); + } + + return null; + } + + @Override + public Void visit(NonNullableStructVector vector, Void value) { + + List childFields = vector.getField().getChildren(); + final int valueCount = vector.getValueCount(); + + for (int i = 0; i < childFields.size(); i++) { + FieldVector child = vector.getChildrenFromFields().get(i); + + if (child.getValueCount() != valueCount) { + throw new IllegalArgumentException(String.format("struct child vector #%s valueCount is not equals with " + + "struct vector, expect %s,
actual %s", i, vector.getValueCount(), child.getValueCount())); + } + + if (!childFields.get(i).getType().equals(child.getField().getType())) { + throw new IllegalArgumentException(String.format("struct child vector #%s does not match type: %s vs %s", + i, childFields.get(i).getType().toString(), child.getField().getType().toString())); + } + + child.accept(this, null); + } + return null; + } + + @Override + public Void visit(UnionVector vector, Void value) { + + List childFields = vector.getField().getChildren(); + final int valueCount = vector.getValueCount(); + + for (int i = 0; i < childFields.size(); i++) { + FieldVector child = vector.getChildrenFromFields().get(i); + + if (child.getValueCount() != valueCount) { + throw new IllegalArgumentException(String.format("union child vector #%s valueCount is not equals with union" + + " vector, expect %s, actual %s", i, vector.getValueCount(), child.getValueCount())); + } + + if (!childFields.get(i).getType().equals(child.getField().getType())) { + throw new IllegalArgumentException(String.format("union child vector #%s does not match type: %s vs %s", + i, childFields.get(i).getType().toString(), child.getField().getType().toString())); + } + + child.accept(this, null); + } + return null; + } + + @Override + public Void visit(DenseUnionVector vector, Void value) { + + List childFields = vector.getField().getChildren(); + for (int i = 0; i < childFields.size(); i++) { + FieldVector child = vector.getChildrenFromFields().get(i); + + if (!childFields.get(i).getType().equals(child.getField().getType())) { + throw new IllegalArgumentException(String.format("union child vector #%s does not match type: %s vs %s", + i, childFields.get(i).getType().toString(), child.getField().getType().toString())); + } + + child.accept(this, null); + } + return null; + } + + @Override + public Void visit(NullVector vector, Void value) { + return null; + } + + @Override + public Void visit(ExtensionTypeVector vector, Void value) { + 
vector.getUnderlyingVector().accept(this, value); + return null; + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/util/TestSchemaUtil.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/util/TestSchemaUtil.java new file mode 100644 index 000000000..cefff8382 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/util/TestSchemaUtil.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.util; + +import static java.util.Arrays.asList; +import static org.junit.Assert.assertEquals; + +import java.io.IOException; + +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.SchemaUtility; +import org.junit.Test; + +public class TestSchemaUtil { + + private static Field field(String name, boolean nullable, ArrowType type, Field... 
children) { + return new Field(name, new FieldType(nullable, type, null, null), asList(children)); + } + + @Test + public void testSerializationAndDeserialization() throws IOException { + Schema schema = new Schema(asList( + field("a", false, new ArrowType.Null()), + field("b", true, new ArrowType.Utf8()), + field("c", true, new ArrowType.Binary())) + ); + + byte[] serialized = SchemaUtility.serialize(schema); + Schema deserialized = SchemaUtility.deserialize(serialized, new RootAllocator(Long.MAX_VALUE)); + assertEquals(schema, deserialized); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java new file mode 100644 index 000000000..27b8ac752 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferManager; +import org.apache.arrow.memory.RootAllocator; + +/** + * Root allocator that returns buffers pre-filled with a given value.
    + * Useful for testing if value vectors are properly zeroing their buffers. + */ +public class DirtyRootAllocator extends RootAllocator { + + private final byte fillValue; + + public DirtyRootAllocator(final long limit, final byte fillValue) { + super(limit); + this.fillValue = fillValue; + } + + @Override + public ArrowBuf buffer(long size) { + return buffer(size, null); + } + + @Override + public ArrowBuf buffer(long size, BufferManager manager) { + ArrowBuf buffer = super.buffer(size, manager); + // contaminate the buffer; long index because capacity() may exceed Integer.MAX_VALUE + for (long i = 0; i < buffer.capacity(); i++) { + buffer.setByte(i, fillValue); + } + + return buffer; + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ITTestLargeVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ITTestLargeVector.java new file mode 100644 index 000000000..19648dc9e --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ITTestLargeVector.java @@ -0,0 +1,280 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.math.BigDecimal; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.holders.NullableDecimalHolder; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Integration test for a vector with a large (more than 2GB) {@link org.apache.arrow.memory.ArrowBuf} as + * the data buffer. + * To run this test, please make sure there is at least 4GB free memory in the system. + */ +public class ITTestLargeVector { + private static final Logger logger = LoggerFactory.getLogger(ITTestLargeVector.class); + + @Test + public void testLargeLongVector() { + logger.trace("Testing large big int vector."); + + final long bufSize = 4 * 1024 * 1024 * 1024L; + final int vecLength = (int) (bufSize / BigIntVector.TYPE_WIDTH); + + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + BigIntVector largeVec = new BigIntVector("vec", allocator)) { + largeVec.allocateNew(vecLength); + + logger.trace("Successfully allocated a vector with capacity {}", vecLength); + + for (int i = 0; i < vecLength; i++) { + largeVec.set(i, i * 10L); + + if ((i + 1) % 10000 == 0) { + logger.trace("Successfully written {} values", i + 1); + } + } + logger.trace("Successfully written {} values", vecLength); + + for (int i = 0; i < vecLength; i++) { + long val = largeVec.get(i); + assertEquals(i * 10L, val); + + if ((i + 1) % 10000 == 0) { + logger.trace("Successfully read {} values", i + 1); + } + } + logger.trace("Successfully read {} values", vecLength); + } + logger.trace("Successfully released the large vector."); + } + + @Test + public void testLargeIntVector() { + logger.trace("Testing large int vector."); + + final long bufSize = 4 * 1024 * 
1024 * 1024L; + final int vecLength = (int) (bufSize / IntVector.TYPE_WIDTH); + + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + IntVector largeVec = new IntVector("vec", allocator)) { + largeVec.allocateNew(vecLength); + + logger.trace("Successfully allocated a vector with capacity {}", vecLength); + + for (int i = 0; i < vecLength; i++) { + largeVec.set(i, i); + + if ((i + 1) % 10000 == 0) { + logger.trace("Successfully written {} values", i + 1); + } + } + logger.trace("Successfully written {} values", vecLength); + + for (int i = 0; i < vecLength; i++) { + long val = largeVec.get(i); + assertEquals(i, val); + + if ((i + 1) % 10000 == 0) { + logger.trace("Successfully read {} values", i + 1); + } + } + logger.trace("Successfully read {} values", vecLength); + } + logger.trace("Successfully released the large vector."); + } + + @Test + public void testLargeDecimalVector() { + logger.trace("Testing large decimal vector."); + + final long bufSize = 4 * 1024 * 1024 * 1024L; + final int vecLength = (int) (bufSize / DecimalVector.TYPE_WIDTH); + + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + DecimalVector largeVec = new DecimalVector("vec", allocator, 38, 0)) { + largeVec.allocateNew(vecLength); + + logger.trace("Successfully allocated a vector with capacity {}", vecLength); + + for (int i = 0; i < vecLength; i++) { + largeVec.set(i, 0); + + if ((i + 1) % 10000 == 0) { + logger.trace("Successfully written {} values", i + 1); + } + } + logger.trace("Successfully written {} values", vecLength); + + for (int i = 0; i < vecLength; i++) { + ArrowBuf buf = largeVec.get(i); + assertEquals(buf.capacity(), DecimalVector.TYPE_WIDTH); + assertEquals(0, buf.getLong(0)); + assertEquals(0, buf.getLong(8)); + + if ((i + 1) % 10000 == 0) { + logger.trace("Successfully read {} values", i + 1); + } + } + logger.trace("Successfully read {} values", vecLength); + + // try setting values with a large offset in the buffer + 
largeVec.set(vecLength - 1, 12345L); + assertEquals(12345L, largeVec.getObject(vecLength - 1).longValue()); + + NullableDecimalHolder holder = new NullableDecimalHolder(); + holder.buffer = largeVec.valueBuffer; + holder.isSet = 1; + holder.start = (long) (vecLength - 1) * largeVec.getTypeWidth(); + assertTrue(holder.start > Integer.MAX_VALUE); + largeVec.set(0, holder); + + BigDecimal decimal = largeVec.getObject(0); + assertEquals(12345L, decimal.longValue()); + + logger.trace("Successfully setting values from large offsets"); + } + logger.trace("Successfully released the large vector."); + } + + @Test + public void testLargeFixedSizeBinaryVector() { + logger.trace("Testing large fixed size binary vector."); + + final long bufSize = 4 * 1024 * 1024 * 1024L; + final int typeWidth = 8; + final int vecLength = (int) (bufSize / typeWidth); + + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + FixedSizeBinaryVector largeVec = new FixedSizeBinaryVector("vec", allocator, typeWidth)) { + largeVec.allocateNew(vecLength); + + logger.trace("Successfully allocated a vector with capacity {}", vecLength); + + byte[] value = new byte[] {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'}; + for (int i = 0; i < vecLength; i++) { + largeVec.set(i, value); + + if ((i + 1) % 10000 == 0) { + logger.trace("Successfully written {} values", i + 1); + } + } + logger.trace("Successfully written {} values", vecLength); + + for (int i = 0; i < vecLength; i++) { + byte[] buf = largeVec.get(i); + assertEquals(typeWidth, buf.length); + assertArrayEquals(buf, value); + + if ((i + 1) % 10000 == 0) { + logger.trace("Successfully read {} values", i + 1); + } + } + logger.trace("Successfully read {} values", vecLength); + } + logger.trace("Successfully released the large vector."); + } + + @Test + public void testLargeVarCharVector() { + logger.trace("Testing large var char vector."); + + final long bufSize = 4 * 1024 * 1024 * 1024L; + final int vecLength = (int) (bufSize / 
BaseVariableWidthVector.OFFSET_WIDTH); + final String strElement = "a"; + + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + VarCharVector largeVec = new VarCharVector("vec", allocator)) { + largeVec.allocateNew(vecLength); + + logger.trace("Successfully allocated a vector with capacity " + vecLength); + + for (int i = 0; i < vecLength; i++) { + largeVec.setSafe(i, strElement.getBytes()); + + if ((i + 1) % 10000 == 0) { + logger.trace("Successfully written " + (i + 1) + " values"); + } + } + largeVec.setValueCount(vecLength); + assertTrue(largeVec.getOffsetBuffer().readableBytes() > Integer.MAX_VALUE); + assertTrue(largeVec.getDataBuffer().readableBytes() < Integer.MAX_VALUE); + logger.trace("Successfully written " + vecLength + " values"); + + for (int i = 0; i < vecLength; i++) { + byte[] val = largeVec.get(i); + assertEquals(strElement, new String(val)); + + if ((i + 1) % 10000 == 0) { + logger.trace("Successfully read " + (i + 1) + " values"); + } + } + logger.trace("Successfully read " + vecLength + " values"); + } + logger.trace("Successfully released the large vector."); + } + + @Test + public void testLargeLargeVarCharVector() { + logger.trace("Testing large large var char vector."); + + final long bufSize = 4 * 1024 * 1024 * 1024L; + final int vecLength = (int) (bufSize / BaseLargeVariableWidthVector.OFFSET_WIDTH); + final String strElement = "9876543210"; + + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + LargeVarCharVector largeVec = new LargeVarCharVector("vec", allocator)) { + largeVec.allocateNew(vecLength); + + logger.trace("Successfully allocated a vector with capacity " + vecLength); + + for (int i = 0; i < vecLength; i++) { + largeVec.setSafe(i, strElement.getBytes()); + + if ((i + 1) % 10000 == 0) { + logger.trace("Successfully written " + (i + 1) + " values"); + } + } + largeVec.setValueCount(vecLength); + assertTrue(largeVec.getOffsetBuffer().readableBytes() > Integer.MAX_VALUE); + 
assertTrue(largeVec.getDataBuffer().readableBytes() > Integer.MAX_VALUE); + logger.trace("Successfully written " + vecLength + " values"); + + for (int i = 0; i < vecLength; i++) { + byte[] val = largeVec.get(i); + assertEquals(strElement, new String(val)); + + if ((i + 1) % 10000 == 0) { + logger.trace("Successfully read " + (i + 1) + " values"); + } + } + logger.trace("Successfully read " + vecLength + " values"); + } + logger.trace("Successfully released the large vector."); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java new file mode 100644 index 000000000..28d56e342 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java @@ -0,0 +1,543 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.util.stream.IntStream; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.memory.util.hash.MurmurHasher; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.apache.arrow.vector.util.TransferPair; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestBitVector { + private static final String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testBitVectorCopyFromSafe() { + final int size = 20; + try (final BitVector src = new BitVector(EMPTY_SCHEMA_PATH, allocator); + final BitVector dst = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { + src.allocateNew(size); + dst.allocateNew(10); + + for (int i = 0; i < size; i++) { + src.set(i, i % 2); + } + src.setValueCount(size); + + for (int i = 0; i < size; i++) { + dst.copyFromSafe(i, i, src); + } + dst.setValueCount(size); + + for (int i = 0; i < size; i++) { + assertEquals(src.getObject(i), dst.getObject(i)); + } + } + } + + @Test + public void testSplitAndTransfer() throws Exception { + + try (final BitVector sourceVector = new BitVector("bitvector", allocator)) { + + sourceVector.allocateNew(40); + + /* populate the bitvector -- 010101010101010101010101..... 
*/ + for (int i = 0; i < 40; i++) { + if ((i & 1) == 1) { + sourceVector.set(i, 1); + } else { + sourceVector.set(i, 0); + } + } + + sourceVector.setValueCount(40); + + /* check the vector output */ + for (int i = 0; i < 40; i++) { + int result = sourceVector.get(i); + if ((i & 1) == 1) { + assertEquals(Integer.toString(1), Integer.toString(result)); + } else { + assertEquals(Integer.toString(0), Integer.toString(result)); + } + } + + try (final BitVector toVector = new BitVector("toVector", allocator)) { + final TransferPair transferPair = sourceVector.makeTransferPair(toVector); + + /* + * form test cases such that we cover: + * + * (1) the start index is exactly where a particular byte starts in the source bit vector + * (2) the start index is randomly positioned within a byte in the source bit vector + * (2.1) the length is a multiple of 8 + * (2.2) the length is not a multiple of 8 + */ + final int[][] transferLengths = {{0, 8}, {8, 10}, {18, 0}, {18, 8}, {26, 0}, {26, 14}}; + + for (final int[] transferLength : transferLengths) { + final int start = transferLength[0]; + final int length = transferLength[1]; + + transferPair.splitAndTransfer(start, length); + + /* check the toVector output after doing splitAndTransfer */ + for (int i = 0; i < length; i++) { + int actual = toVector.get(i); + int expected = sourceVector.get(start + i); + assertEquals("different data values not expected --> sourceVector index: " + (start + i) + + " toVector index: " + i, expected, actual); + } + } + } + } + } + + @Test + public void testSplitAndTransfer1() throws Exception { + + try (final BitVector sourceVector = new BitVector("bitvector", allocator)) { + + sourceVector.allocateNew(8190); + + /* populate the bitvector */ + for (int i = 0; i < 8190; i++) { + sourceVector.set(i, 1); + } + + sourceVector.setValueCount(8190); + + /* check the vector output */ + for (int i = 0; i < 8190; i++) { + int result = sourceVector.get(i); + assertEquals(Integer.toString(1), 
Integer.toString(result)); + } + + try (final BitVector toVector = new BitVector("toVector", allocator)) { + final TransferPair transferPair = sourceVector.makeTransferPair(toVector); + + final int[][] transferLengths = {{0, 4095}, {4095, 4095}}; + + for (final int[] transferLength : transferLengths) { + final int start = transferLength[0]; + final int length = transferLength[1]; + + transferPair.splitAndTransfer(start, length); + + /* check the toVector output after doing splitAndTransfer */ + for (int i = 0; i < length; i++) { + int actual = toVector.get(i); + int expected = sourceVector.get(start + i); + assertEquals("different data values not expected --> sourceVector index: " + (start + i) + + " toVector index: " + i, expected, actual); + } + } + } + } + } + + @Test + public void testSplitAndTransfer2() throws Exception { + + try (final BitVector sourceVector = new BitVector("bitvector", allocator)) { + + sourceVector.allocateNew(32); + + /* populate the bitvector */ + for (int i = 0; i < 32; i++) { + if ((i & 1) == 1) { + sourceVector.set(i, 1); + } else { + sourceVector.set(i, 0); + } + } + + sourceVector.setValueCount(32); + + /* check the vector output */ + for (int i = 0; i < 32; i++) { + int result = sourceVector.get(i); + if ((i & 1) == 1) { + assertEquals(Integer.toString(1), Integer.toString(result)); + } else { + assertEquals(Integer.toString(0), Integer.toString(result)); + } + } + + try (final BitVector toVector = new BitVector("toVector", allocator)) { + final TransferPair transferPair = sourceVector.makeTransferPair(toVector); + + final int[][] transferLengths = {{5, 22}, {5, 24}, {5, 25}, {5, 27}, {0, 31}, {5, 7}, {2, 3}}; + + for (final int[] transferLength : transferLengths) { + final int start = transferLength[0]; + final int length = transferLength[1]; + + transferPair.splitAndTransfer(start, length); + + /* check the toVector output after doing splitAndTransfer */ + for (int i = 0; i < length; i++) { + int actual = toVector.get(i); + int 
expected = sourceVector.get(start + i); + assertEquals("different data values not expected --> sourceVector index: " + (start + i) + + " toVector index: " + i, expected, actual); + } + } + } + } + } + + @Test + public void testReallocAfterVectorTransfer1() { + try (final BitVector vector = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { + vector.allocateNew(4096); + int valueCapacity = vector.getValueCapacity(); + assertEquals(4096, valueCapacity); + + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + vector.setToOne(i); + } + } + + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertEquals("unexpected cleared bit at index: " + i, 1, vector.get(i)); + } else { + assertTrue("unexpected set bit at index: " + i, vector.isNull(i)); + } + } + + /* trigger first realloc */ + vector.setSafeToOne(valueCapacity); + assertEquals(valueCapacity * 2, vector.getValueCapacity()); + + for (int i = valueCapacity; i < valueCapacity * 2; i++) { + if ((i & 1) == 1) { + vector.setToOne(i); + } + } + + for (int i = 0; i < valueCapacity * 2; i++) { + if (((i & 1) == 1) || (i == valueCapacity)) { + assertEquals("unexpected cleared bit at index: " + i, 1, vector.get(i)); + } else { + assertTrue("unexpected set bit at index: " + i, vector.isNull(i)); + } + } + + /* trigger second realloc */ + vector.setSafeToOne(valueCapacity * 2); + assertEquals(valueCapacity * 4, vector.getValueCapacity()); + + for (int i = valueCapacity * 2; i < valueCapacity * 4; i++) { + if ((i & 1) == 1) { + vector.setToOne(i); + } + } + + for (int i = 0; i < valueCapacity * 4; i++) { + if (((i & 1) == 1) || (i == valueCapacity) || (i == valueCapacity * 2)) { + assertEquals("unexpected cleared bit at index: " + i, 1, vector.get(i)); + } else { + assertTrue("unexpected set bit at index: " + i, vector.isNull(i)); + } + } + + /* now transfer the vector */ + TransferPair transferPair = vector.getTransferPair(allocator); + transferPair.transfer(); + final BitVector toVector = (BitVector) 
transferPair.getTo(); + + assertEquals(valueCapacity * 4, toVector.getValueCapacity()); + + /* realloc the toVector */ + toVector.setSafeToOne(valueCapacity * 4); + + for (int i = 0; i < toVector.getValueCapacity(); i++) { + if (i <= valueCapacity * 4) { + if (((i & 1) == 1) || (i == valueCapacity) || + (i == valueCapacity * 2) || (i == valueCapacity * 4)) { + assertEquals("unexpected cleared bit at index: " + i, 1, toVector.get(i)); + } else { + assertTrue("unexpected set bit at index: " + i, toVector.isNull(i)); + } + } else { + assertTrue("unexpected set bit at index: " + i, toVector.isNull(i)); + } + } + + toVector.close(); + } + } + + @Test + public void testReallocAfterVectorTransfer2() { + try (final BitVector vector = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { + vector.allocateNew(4096); + int valueCapacity = vector.getValueCapacity(); + assertEquals(4096, valueCapacity); + + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + vector.set(i, 1); + } + } + + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertFalse("unexpected cleared bit at index: " + i, vector.isNull(i)); + } else { + assertTrue("unexpected set bit at index: " + i, vector.isNull(i)); + } + } + + /* trigger first realloc */ + vector.setSafe(valueCapacity, 1, 1); + assertEquals(valueCapacity * 2, vector.getValueCapacity()); + + for (int i = valueCapacity; i < valueCapacity * 2; i++) { + if ((i & 1) == 1) { + vector.set(i, 1); + } + } + + for (int i = 0; i < valueCapacity * 2; i++) { + if (((i & 1) == 1) || (i == valueCapacity)) { + assertFalse("unexpected cleared bit at index: " + i, vector.isNull(i)); + } else { + assertTrue("unexpected set bit at index: " + i, vector.isNull(i)); + } + } + + /* trigger second realloc */ + vector.setSafe(valueCapacity * 2, 1, 1); + assertEquals(valueCapacity * 4, vector.getValueCapacity()); + + for (int i = valueCapacity * 2; i < valueCapacity * 4; i++) { + if ((i & 1) == 1) { + vector.set(i, 1); + } + } + + for (int i = 
0; i < valueCapacity * 4; i++) { + if (((i & 1) == 1) || (i == valueCapacity) || (i == valueCapacity * 2)) { + assertFalse("unexpected cleared bit at index: " + i, vector.isNull(i)); + } else { + assertTrue("unexpected set bit at index: " + i, vector.isNull(i)); + } + } + + /* now transfer the vector */ + TransferPair transferPair = vector.getTransferPair(allocator); + transferPair.transfer(); + final BitVector toVector = (BitVector) transferPair.getTo(); + + assertEquals(valueCapacity * 4, toVector.getValueCapacity()); + + /* realloc the toVector */ + toVector.setSafe(valueCapacity * 4, 1, 1); + + for (int i = 0; i < toVector.getValueCapacity(); i++) { + if (i <= valueCapacity * 4) { + if (((i & 1) == 1) || (i == valueCapacity) || + (i == valueCapacity * 2) || (i == valueCapacity * 4)) { + assertFalse("unexpected cleared bit at index: " + i, toVector.isNull(i)); + } else { + assertTrue("unexpected set bit at index: " + i, toVector.isNull(i)); + } + } else { + assertTrue("unexpected set bit at index: " + i, toVector.isNull(i)); + } + } + + toVector.close(); + } + } + + @Test + public void testBitVector() { + // Create a new value vector for 1024 integers + try (final BitVector vector = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { + vector.allocateNew(1024); + vector.setValueCount(1024); + + // Put and set a few values + vector.set(0, 1); + vector.set(1, 0); + vector.set(100, 0); + vector.set(1022, 1); + + vector.setValueCount(1024); + + assertEquals(1, vector.get(0)); + assertEquals(0, vector.get(1)); + assertEquals(0, vector.get(100)); + assertEquals(1, vector.get(1022)); + + assertEquals(1020, vector.getNullCount()); + + // test setting the same value twice + vector.set(0, 1); + vector.set(0, 1); + vector.set(1, 0); + vector.set(1, 0); + assertEquals(1, vector.get(0)); + assertEquals(0, vector.get(1)); + + // test toggling the values + vector.set(0, 0); + vector.set(1, 1); + assertEquals(0, vector.get(0)); + assertEquals(1, vector.get(1)); + + // should not 
change + assertEquals(1020, vector.getNullCount()); + + // Ensure null value + assertTrue(vector.isNull(3)); + + // unset the previously set bits + vector.setNull(0); + vector.setNull(1); + vector.setNull(100); + vector.setNull(1022); + // this should set all the array to 0 + assertEquals(1024, vector.getNullCount()); + + // set all the array to 1 + for (int i = 0; i < 1024; ++i) { + assertEquals(1024 - i, vector.getNullCount()); + vector.set(i, 1); + } + + assertEquals(0, vector.getNullCount()); + + vector.allocateNew(1015); + vector.setValueCount(1015); + + // ensure it has been zeroed + assertEquals(1015, vector.getNullCount()); + + vector.set(0, 1); + vector.set(1014, 1); // ensure that the last item of the last byte is allocated + + assertEquals(1013, vector.getNullCount()); + + vector.zeroVector(); + assertEquals(1015, vector.getNullCount()); + + // set all the array to 1 + for (int i = 0; i < 1015; ++i) { + assertEquals(1015 - i, vector.getNullCount()); + vector.set(i, 1); + } + + assertEquals(0, vector.getNullCount()); + } + } + + @Test + public void testBitVectorRangeSetAllOnes() { + validateRange(1000, 0, 1000); + validateRange(1000, 0, 1); + validateRange(1000, 1, 2); + validateRange(1000, 5, 6); + validateRange(1000, 5, 10); + validateRange(1000, 5, 150); + validateRange(1000, 5, 27); + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + validateRange(1000, 10 + i, 27 + j); + validateRange(1000, i, j); + } + } + } + + private void validateRange(int length, int start, int count) { + String desc = "[" + start + ", " + (start + count) + ") "; + try (BitVector bitVector = new BitVector("bits", allocator)) { + bitVector.reset(); + bitVector.allocateNew(length); + bitVector.setRangeToOne(start, count); + for (int i = 0; i < start; i++) { + Assert.assertTrue(desc + i, bitVector.isNull(i)); + } + for (int i = start; i < start + count; i++) { + Assert.assertEquals(desc + i, 1, bitVector.get(i)); + } + for (int i = start + count; i < length; i++) { + 
Assert.assertTrue(desc + i, bitVector.isNull(i)); + } + } + } + + @Test + public void testBitVectorHashCode() { + final int size = 6; + try (final BitVector vector = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { + ValueVectorDataPopulator.setVector(vector, 0, 1, null, 0, 1, null); + + int[] hashCodes = new int[size]; + IntStream.range(0, size).forEach(i -> hashCodes[i] = vector.hashCode(i)); + + assertTrue(hashCodes[0] == hashCodes[3]); + assertTrue(hashCodes[1] == hashCodes[4]); + assertTrue(hashCodes[2] == hashCodes[5]); + + assertFalse(hashCodes[0] == hashCodes[1]); + assertFalse(hashCodes[0] == hashCodes[2]); + assertFalse(hashCodes[1] == hashCodes[2]); + + MurmurHasher hasher = new MurmurHasher(); + + IntStream.range(0, size).forEach(i -> hashCodes[i] = vector.hashCode(i, hasher)); + + assertTrue(hashCodes[0] == hashCodes[3]); + assertTrue(hashCodes[1] == hashCodes[4]); + assertTrue(hashCodes[2] == hashCodes[5]); + + assertFalse(hashCodes[0] == hashCodes[1]); + assertFalse(hashCodes[0] == hashCodes[2]); + assertFalse(hashCodes[1] == hashCodes[2]); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestBitVectorHelper.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestBitVectorHelper.java new file mode 100644 index 000000000..9c7e1979d --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestBitVectorHelper.java @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.junit.Test; + +import io.netty.util.internal.PlatformDependent; + +public class TestBitVectorHelper { + @Test + public void testGetNullCount() throws Exception { + try (BufferAllocator root = new RootAllocator()) { + // test case 1, 1 null value for 0b110 + ArrowBuf validityBuffer = root.buffer(3); + // we set validity buffer to be 0b10110, but only have 3 items with 1st item is null + validityBuffer.setByte(0, 0b10110); + + // we will only consider 0b110 here, since we only 3 items and only one is null + int count = BitVectorHelper.getNullCount(validityBuffer, 3); + assertEquals(count, 1); + validityBuffer.close(); + + // test case 2, no null value for 0xFF + validityBuffer = root.buffer(8); + validityBuffer.setByte(0, 0xFF); + + count = BitVectorHelper.getNullCount(validityBuffer, 8); + assertEquals(count, 0); + validityBuffer.close(); + + // test case 3, 1 null value for 0x7F + validityBuffer = root.buffer(8); + validityBuffer.setByte(0, 0x7F); + + count = BitVectorHelper.getNullCount(validityBuffer, 8); + assertEquals(count, 1); + validityBuffer.close(); + + // test case 4, validity buffer has multiple bytes, 11 items + validityBuffer = root.buffer(11); + validityBuffer.setByte(0, 0b10101010); + 
validityBuffer.setByte(1, 0b01010101); + + count = BitVectorHelper.getNullCount(validityBuffer, 11); + assertEquals(count, 5); + validityBuffer.close(); + } + } + + @Test + public void testAllBitsNull() { + final int bufferLength = 32 * 1024; + try (RootAllocator allocator = new RootAllocator(bufferLength); + ArrowBuf validityBuffer = allocator.buffer(bufferLength)) { + + validityBuffer.setZero(0, bufferLength); + int bitLength = 1024; + assertTrue(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, false)); + + bitLength = 1027; + assertTrue(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, false)); + + validityBuffer.setZero(0, bufferLength); + bitLength = 1025; + BitVectorHelper.setBit(validityBuffer, 12); + assertFalse(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, false)); + + validityBuffer.setZero(0, bufferLength); + bitLength = 1025; + BitVectorHelper.setBit(validityBuffer, 1024); + assertFalse(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, false)); + + validityBuffer.setZero(0, bufferLength); + bitLength = 1026; + BitVectorHelper.setBit(validityBuffer, 1024); + assertFalse(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, false)); + + validityBuffer.setZero(0, bufferLength); + bitLength = 1027; + BitVectorHelper.setBit(validityBuffer, 1025); + assertFalse(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, false)); + + validityBuffer.setZero(0, bufferLength); + bitLength = 1031; + BitVectorHelper.setBit(validityBuffer, 1029); + BitVectorHelper.setBit(validityBuffer, 1030); + assertFalse(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, false)); + } + } + + @Test + public void testAllBitsSet() { + final int bufferLength = 32 * 1024; + try (RootAllocator allocator = new RootAllocator(bufferLength); + ArrowBuf validityBuffer = allocator.buffer(bufferLength)) { + + PlatformDependent.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); + int bitLength = 
1024; + assertTrue(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, true)); + + bitLength = 1028; + assertTrue(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, true)); + + PlatformDependent.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); + bitLength = 1025; + BitVectorHelper.unsetBit(validityBuffer, 12); + assertFalse(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, true)); + + PlatformDependent.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); + bitLength = 1025; + BitVectorHelper.unsetBit(validityBuffer, 1024); + assertFalse(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, true)); + + PlatformDependent.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); + bitLength = 1026; + BitVectorHelper.unsetBit(validityBuffer, 1024); + assertFalse(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, true)); + + PlatformDependent.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); + bitLength = 1027; + BitVectorHelper.unsetBit(validityBuffer, 1025); + assertFalse(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, true)); + + PlatformDependent.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); + bitLength = 1031; + BitVectorHelper.unsetBit(validityBuffer, 1029); + BitVectorHelper.unsetBit(validityBuffer, 1030); + assertFalse(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, true)); + } + } + + @Test + public void testConcatBits() { + try (RootAllocator allocator = new RootAllocator(1024 * 1024)) { + try (ArrowBuf buf1 = allocator.buffer(1024); + ArrowBuf buf2 = allocator.buffer(1024); + ArrowBuf output = allocator.buffer(1024)) { + + buf1.setZero(0, buf1.capacity()); + buf2.setZero(0, buf2.capacity()); + + final int maxCount = 100; + for (int i = 0; i < maxCount; i++) { + if (i % 3 == 0) { + BitVectorHelper.setBit(buf1, i); + BitVectorHelper.setBit(buf2, i); + } + } + + // test the case where the 
number of bits for both sets are multiples of 8. + concatAndVerify(buf1, 40, buf2, 48, output); + + // only the number of bits in the first set is a multiple of 8 + concatAndVerify(buf1, 32, buf2, 47, output); + + // only the number of bits in the second set is a multiple of 8 + concatAndVerify(buf1, 31, buf2, 48, output); + + // neither set has a size that is a multiple of 8 + concatAndVerify(buf1, 27, buf2, 52, output); + + // the remaining bits in the second set is spread in two bytes + concatAndVerify(buf1, 31, buf2, 55, output); + } + } + } + + @Test + public void testConcatBitsInPlace() { + try (RootAllocator allocator = new RootAllocator(1024 * 1024)) { + try (ArrowBuf buf1 = allocator.buffer(1024); + ArrowBuf buf2 = allocator.buffer(1024)) { + + buf1.setZero(0, buf1.capacity()); + buf2.setZero(0, buf2.capacity()); + + final int maxCount = 100; + for (int i = 0; i < maxCount; i++) { + if (i % 3 == 0) { + BitVectorHelper.setBit(buf1, i); + BitVectorHelper.setBit(buf2, i); + } + } + + // test the case where the number of bits for both sets are multiples of 8. 
+ concatAndVerify(buf1, 40, buf2, 48, buf1); + + // only the number of bits in the first set is a multiple of 8 + concatAndVerify(buf1, 32, buf2, 47, buf1); + + // only the number of bits in the second set is a multiple of 8 + concatAndVerify(buf1, 31, buf2, 48, buf1); + + // neither set has a size that is a multiple of 8 + concatAndVerify(buf1, 27, buf2, 52, buf1); + + // the remaining bits in the second set is spread in two bytes + concatAndVerify(buf1, 31, buf2, 55, buf1); + } + } + } + + private void concatAndVerify(ArrowBuf buf1, int count1, ArrowBuf buf2, int count2, ArrowBuf output) { + BitVectorHelper.concatBits(buf1, count1, buf2, count2, output); + int outputIdx = 0; + for (int i = 0; i < count1; i++, outputIdx++) { + assertEquals(BitVectorHelper.get(output, outputIdx), BitVectorHelper.get(buf1, i)); + } + for (int i = 0; i < count2; i++, outputIdx++) { + assertEquals(BitVectorHelper.get(output, outputIdx), BitVectorHelper.get(buf2, i)); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java new file mode 100644 index 000000000..8efadad9b --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.ReferenceManager; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.junit.Test; + +public class TestBufferOwnershipTransfer { + + @Test + public void testTransferFixedWidth() { + BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + BufferAllocator childAllocator1 = allocator.newChildAllocator("child1", 100000, 100000); + BufferAllocator childAllocator2 = allocator.newChildAllocator("child2", 100000, 100000); + + IntVector v1 = new IntVector("v1", childAllocator1); + v1.allocateNew(); + v1.setValueCount(4095); + long totalAllocatedMemory = childAllocator1.getAllocatedMemory(); + + IntVector v2 = new IntVector("v2", childAllocator2); + + v1.makeTransferPair(v2).transfer(); + + assertEquals(0, childAllocator1.getAllocatedMemory()); + assertEquals(totalAllocatedMemory, childAllocator2.getAllocatedMemory()); + + v1.close(); + v2.close(); + childAllocator1.close(); + childAllocator2.close(); + allocator.close(); + } + + @Test + public void testTransferVariableWidth() { + BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + BufferAllocator childAllocator1 = 
allocator.newChildAllocator("child1", 100000, 100000); + BufferAllocator childAllocator2 = allocator.newChildAllocator("child2", 100000, 100000); + + VarCharVector v1 = new VarCharVector("v1", childAllocator1); + v1.allocateNew(); + v1.setSafe(4094, "hello world".getBytes(), 0, 11); + v1.setValueCount(4001); + + VarCharVector v2 = new VarCharVector("v2", childAllocator2); + long memoryBeforeTransfer = childAllocator1.getAllocatedMemory(); + + v1.makeTransferPair(v2).transfer(); + + assertEquals(0, childAllocator1.getAllocatedMemory()); + assertEquals(memoryBeforeTransfer, childAllocator2.getAllocatedMemory()); + + v1.close(); + v2.close(); + childAllocator1.close(); + childAllocator2.close(); + allocator.close(); + } + + private static class Pointer { + T value; + } + + private static CallBack newTriggerCallback(final Pointer trigger) { + trigger.value = false; + return new CallBack() { + @Override + public void doWork() { + trigger.value = true; + } + }; + } + + @Test + public void emptyListTransferShouldNotTriggerSchemaChange() { + final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + + final Pointer trigger1 = new Pointer<>(); + final Pointer trigger2 = new Pointer<>(); + final ListVector v1 = new ListVector("v1", allocator, + FieldType.nullable(ArrowType.Null.INSTANCE), + newTriggerCallback(trigger1)); + final ListVector v2 = new ListVector("v2", allocator, + FieldType.nullable(ArrowType.Null.INSTANCE), + newTriggerCallback(trigger2)); + + try { + // since we are working with empty vectors, their internal + // buffers will be allocator.EMPTY which use + // ReferenceManager.NO_OP instance and transfer() is not + // supported + v1.makeTransferPair(v2).transfer(); + } catch (Exception e) { + assertTrue(e instanceof UnsupportedOperationException); + assertTrue(e.getMessage().contains(ReferenceManager.NO_OP_ERROR_MESSAGE)); + } + + assertFalse(trigger1.value); + assertFalse(trigger2.value); + + v1.close(); + v2.close(); + allocator.close(); + } 
+} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java new file mode 100644 index 000000000..3786f63c3 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java @@ -0,0 +1,1104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.TestUtils.newVector; +import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +import java.math.BigDecimal; +import java.nio.charset.Charset; +import java.time.Duration; +import java.time.Period; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.types.Types.MinorType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/* + * Tested field types: + * + * NullableInt + * NullableBigInt + * NullableFloat4 + * NullableFloat8 + * NullableBit + * NullableDecimal + * NullableIntervalDay + * NullableIntervalYear + * NullableSmallInt + * NullableTinyInt + * NullableVarChar + * NullableTimeMicro + * NullableTimeMilli + * NullableTimeStamp* + */ + +public class TestCopyFrom { + + private static final String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test /* NullableVarChar */ + public void testCopyFromWithNulls() { + try (final VarCharVector vector = + newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); + final VarCharVector vector2 = + newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + vector.allocateNew(); + assertTrue(vector.getValueCapacity() >= 1); + assertEquals(0, vector.getValueCount()); + int initialCapacity = vector.getValueCapacity(); + + for (int i = 0; i < initialCapacity; i++) { + if (i % 3 == 0) { + continue; + } + byte[] b = Integer.toString(i).getBytes(); + vector.setSafe(i, b, 0, b.length); + } + + /* NO reAlloc() should have happened in setSafe() */ + int capacity = vector.getValueCapacity(); + assertEquals(initialCapacity, capacity); + + 
vector.setValueCount(initialCapacity); + + for (int i = 0; i < initialCapacity; i++) { + if (i % 3 == 0) { + assertNull(vector.getObject(i)); + } else { + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector.getObject(i).toString()); + } + } + + vector2.setInitialCapacity(initialCapacity); + vector2.allocateNew(); + capacity = vector2.getValueCapacity(); + assertEquals(initialCapacity, capacity); + + for (int i = 0; i < initialCapacity; i++) { + vector2.copyFromSafe(i, i, vector); + if (i % 3 == 0) { + assertNull(vector2.getObject(i)); + } else { + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector2.getObject(i).toString()); + } + } + + /* NO reAlloc() should have happened in copyFrom */ + capacity = vector2.getValueCapacity(); + assertEquals(initialCapacity, capacity); + + vector2.setValueCount(initialCapacity); + + for (int i = 0; i < initialCapacity; i++) { + if (i % 3 == 0) { + assertNull(vector2.getObject(i)); + } else { + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector2.getObject(i).toString()); + } + } + } + } + + @Test /* NullableVarChar */ + public void testCopyFromWithNulls1() { + try (final VarCharVector vector = + newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); + final VarCharVector vector2 = + newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + vector.allocateNew(); + assertTrue(vector.getValueCapacity() >= 1); + assertEquals(0, vector.getValueCount()); + int initialCapacity = vector.getValueCapacity(); + + for (int i = 0; i < initialCapacity; i++) { + if (i % 3 == 0) { + continue; + } + byte[] b = Integer.toString(i).getBytes(); + vector.setSafe(i, b, 0, b.length); + } + + /* NO reAlloc() should have happened in setSafe() */ + int capacity = vector.getValueCapacity(); + assertEquals(initialCapacity, capacity); + + vector.setValueCount(initialCapacity); + + for (int i = 0; i < 
initialCapacity; i++) { + if (i % 3 == 0) { + assertNull(vector.getObject(i)); + } else { + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector.getObject(i).toString()); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew((initialCapacity / 4) * 10, initialCapacity / 4); + + capacity = vector2.getValueCapacity(); + assertTrue(capacity >= initialCapacity / 4); + assertTrue(capacity < initialCapacity / 2); + + for (int i = 0; i < initialCapacity; i++) { + vector2.copyFromSafe(i, i, vector); + if (i % 3 == 0) { + assertNull(vector2.getObject(i)); + } else { + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector2.getObject(i).toString()); + } + } + + /* 2 reAllocs should have happened in copyFromSafe() */ + capacity = vector2.getValueCapacity(); + assertTrue(capacity >= initialCapacity); + + vector2.setValueCount(initialCapacity); + + for (int i = 0; i < initialCapacity; i++) { + if (i % 3 == 0) { + assertNull(vector2.getObject(i)); + } else { + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector2.getObject(i).toString()); + } + } + } + } + + @Test /* IntVector */ + public void testCopyFromWithNulls2() { + try (final IntVector vector1 = new IntVector(EMPTY_SCHEMA_PATH, allocator); + final IntVector vector2 = new IntVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertTrue(vector1.getValueCapacity() >= vector1.INITIAL_VALUE_ALLOCATION); + assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + continue; + } + vector1.setSafe(i, 1000 + i); + } + + vector1.setValueCount(initialCapacity); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, 
vector1.getValueCount()); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, 1000 + i, vector1.get(i)); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); + + for (int i = 0; i < initialCapacity; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); + /* setValueCount() should have done another realloc */ + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); + + /* check vector data after copy and realloc */ + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, 1000 + i, vector2.get(i)); + } + } + } + } + + @Test /* BigIntVector */ + public void testCopyFromWithNulls3() { + try (final BigIntVector vector1 = new BigIntVector(EMPTY_SCHEMA_PATH, allocator); + final BigIntVector vector2 = new BigIntVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertTrue(vector1.getValueCapacity() >= vector1.INITIAL_VALUE_ALLOCATION); + assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + continue; + } + vector1.setSafe(i, 10000000000L + (long) i); + } + + vector1.setValueCount(initialCapacity); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(initialCapacity, vector1.getValueCapacity()); + 
assertEquals(initialCapacity, vector1.getValueCount()); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, 10000000000L + (long) i, vector1.get(i)); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); + + for (int i = 0; i < initialCapacity; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); + /* setValueCount() should have done another realloc */ + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); + + /* check vector data after copy and realloc */ + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, 10000000000L + (long) i, vector2.get(i)); + } + } + } + } + + @Test /* BitVector */ + public void testCopyFromWithNulls4() { + try (final BitVector vector1 = new BitVector(EMPTY_SCHEMA_PATH, allocator); + final BitVector vector2 = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.setInitialCapacity(4096); + vector1.allocateNew(); + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(0, vector1.getValueCount()); + + int counter = 0; + for (int i = 0; i < 4096; i++) { + if ((i & 1) == 0) { + continue; + } + if ((counter & 1) == 0) { + vector1.setSafe(i, 1); + } else { + vector1.setSafe(i, 0); + } + counter++; + } + + vector1.setValueCount(4096); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + 
assertEquals(4096, vector1.getValueCapacity()); + assertEquals(4096, vector1.getValueCount()); + + counter = 0; + for (int i = 0; i < 4096; i++) { + if ((i & 1) == 0) { + assertNull(vector1.getObject(i)); + } else { + if ((counter & 1) == 0) { + assertTrue(vector1.getObject(i)); + } else { + assertFalse(vector1.getObject(i)); + } + counter++; + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024); + assertEquals(1024, vector2.getValueCapacity()); + + for (int i = 0; i < 4096; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertEquals(4096, vector2.getValueCapacity()); + vector2.setValueCount(8192); + /* setValueCount() should have done another realloc */ + assertEquals(8192, vector2.getValueCount()); + assertEquals(8192, vector2.getValueCapacity()); + + /* check vector data after copy and realloc */ + counter = 0; + for (int i = 0; i < 8192; i++) { + if (((i & 1) == 0) || (i >= 4096)) { + assertNull(vector2.getObject(i)); + } else { + if ((counter & 1) == 0) { + assertTrue(vector2.getObject(i)); + } else { + assertFalse(vector2.getObject(i)); + } + counter++; + } + } + } + } + + @Test /* Float4Vector */ + public void testCopyFromWithNulls5() { + try (final Float4Vector vector1 = new Float4Vector(EMPTY_SCHEMA_PATH, allocator); + final Float4Vector vector2 = new Float4Vector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertTrue(vector1.getValueCapacity() >= vector1.INITIAL_VALUE_ALLOCATION); + assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + continue; + } + vector1.setSafe(i, 100.25f + (float) i); + } + + vector1.setValueCount(initialCapacity); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(initialCapacity, vector1.getValueCapacity()); + 
assertEquals(initialCapacity, vector1.getValueCount()); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, 100.25f + (float) i, vector1.get(i), 0); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); + + for (int i = 0; i < initialCapacity; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); + /* setValueCount() should have done another realloc */ + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); + + /* check vector data after copy and realloc */ + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, 100.25f + i * 1.0f, vector2.get(i), 0); + } + } + } + } + + @Test /* Float8Vector */ + public void testCopyFromWithNulls6() { + try (final Float8Vector vector1 = new Float8Vector(EMPTY_SCHEMA_PATH, allocator); + final Float8Vector vector2 = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertTrue(vector1.getValueCapacity() >= vector1.INITIAL_VALUE_ALLOCATION); + assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + continue; + } + vector1.setSafe(i, 123456.7865 + (double) i); + } + + vector1.setValueCount(initialCapacity); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + 
assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals( + "unexpected value at index: " + i, 123456.7865 + (double) i, vector1.get(i), 0); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); + + for (int i = 0; i < initialCapacity; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); + /* setValueCount() should have done another realloc */ + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); + + /* check vector data after copy and realloc */ + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals( + "unexpected value at index: " + i, 123456.7865 + (double) i, vector2.get(i), 0); + } + } + } + } + + @Test /* IntervalDayVector */ + public void testCopyFromWithNulls7() { + try (final IntervalDayVector vector1 = new IntervalDayVector(EMPTY_SCHEMA_PATH, allocator); + final IntervalDayVector vector2 = new IntervalDayVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertTrue(vector1.getValueCapacity() >= vector1.INITIAL_VALUE_ALLOCATION); + assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); + + final int days = 10; + final int milliseconds = 10000; + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + continue; + } + vector1.setSafe(i, 
days + i, milliseconds + i); + } + + vector1.setValueCount(initialCapacity); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + assertNull(vector1.getObject(i)); + } else { + final Duration d = vector1.getObject(i); + assertEquals(days + i, d.toDays()); + assertEquals(milliseconds + i, d.minusDays(days + i).toMillis()); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); + + for (int i = 0; i < initialCapacity; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); + /* setValueCount() should have done another realloc */ + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); + + /* check vector data after copy and realloc */ + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { + assertNull(vector2.getObject(i)); + } else { + final Duration d = vector2.getObject(i); + assertEquals(days + i, d.toDays()); + assertEquals(milliseconds + i, d.minusDays(days + i).toMillis()); + } + } + } + } + + @Test /* IntervalYearVector */ + public void testCopyFromWithNulls8() { + try (final IntervalYearVector vector1 = new IntervalYearVector(EMPTY_SCHEMA_PATH, allocator); + final IntervalYearVector vector2 = new IntervalYearVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertTrue(vector1.getValueCapacity() >= vector1.INITIAL_VALUE_ALLOCATION); + 
assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); + + final int interval = 30; /* 2 years 6 months */ + final Period[] periods = new Period[4096]; + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + continue; + } + vector1.setSafe(i, interval + i); + final int years = (interval + i) / org.apache.arrow.vector.util.DateUtility.yearsToMonths; + final int months = (interval + i) % org.apache.arrow.vector.util.DateUtility.yearsToMonths; + periods[i] = Period.ofYears(years).plusMonths(months).normalized(); + } + + vector1.setValueCount(initialCapacity); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + assertNull(vector1.getObject(i)); + } else { + final Period p = vector1.getObject(i).normalized(); + assertEquals(interval + i, vector1.get(i)); + assertEquals(periods[i], p); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); + + for (int i = 0; i < initialCapacity; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); + /* setValueCount() should have done another realloc */ + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); + + /* check vector data after copy and realloc */ + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { + assertNull(vector2.getObject(i)); + } else { + final Period p 
= vector2.getObject(i).normalized(); + assertEquals(periods[i], p); + } + } + } + } + + @Test /* SmallIntVector */ + public void testCopyFromWithNulls9() { + try (final SmallIntVector vector1 = new SmallIntVector(EMPTY_SCHEMA_PATH, allocator); + final SmallIntVector vector2 = new SmallIntVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertTrue(vector1.getValueCapacity() >= vector1.INITIAL_VALUE_ALLOCATION); + assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); + + final short val = 1000; + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + continue; + } + vector1.setSafe(i, val + (short) i); + } + + vector1.setValueCount(initialCapacity); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, val + (short) i, vector1.get(i)); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); + + for (int i = 0; i < initialCapacity; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); + /* setValueCount() should have done another realloc */ + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); + + /* check vector data after copy and realloc */ + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) 
{ + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, val + (short) i, vector2.get(i)); + } + } + } + } + + @Test /* TimeMicroVector */ + public void testCopyFromWithNulls10() { + try (final TimeMicroVector vector1 = new TimeMicroVector(EMPTY_SCHEMA_PATH, allocator); + final TimeMicroVector vector2 = new TimeMicroVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertTrue(vector1.getValueCapacity() >= vector1.INITIAL_VALUE_ALLOCATION); + assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); + + final long val = 100485765432L; + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + continue; + } + vector1.setSafe(i, val + (long) i); + } + + vector1.setValueCount(initialCapacity); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, val + (long) i, vector1.get(i)); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); + + for (int i = 0; i < initialCapacity; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); + /* setValueCount() should have done another realloc */ + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); + + /* check vector data after copy and realloc */ + for (int i = 0; i < 
initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, val + (long) i, vector2.get(i)); + } + } + } + } + + @Test /* TimeMilliVector */ + public void testCopyFromWithNulls11() { + try (final TimeMilliVector vector1 = new TimeMilliVector(EMPTY_SCHEMA_PATH, allocator); + final TimeMilliVector vector2 = new TimeMilliVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertTrue(vector1.getValueCapacity() >= vector1.INITIAL_VALUE_ALLOCATION); + assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); + + final int val = 1000; + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + continue; + } + vector1.setSafe(i, val + i); + } + + vector1.setValueCount(initialCapacity); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, val + i, vector1.get(i)); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); + + for (int i = 0; i < initialCapacity; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); + /* setValueCount() should have done another realloc */ + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); + + /* check vector 
data after copy and realloc */ + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, val + i, vector2.get(i)); + } + } + } + } + + @Test /* TinyIntVector */ + public void testCopyFromWithNulls12() { + try (final TinyIntVector vector1 = new TinyIntVector(EMPTY_SCHEMA_PATH, allocator); + final TinyIntVector vector2 = new TinyIntVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertTrue(vector1.getValueCapacity() >= vector1.INITIAL_VALUE_ALLOCATION); + assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); + + byte val = -128; + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + continue; + } + vector1.setSafe(i, val); + val++; + } + + vector1.setValueCount(initialCapacity); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); + + val = -128; + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, val, vector1.get(i)); + val++; + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + + for (int i = 0; i < initialCapacity; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); + /* setValueCount() should have done another realloc */ + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); + + /* check vector data after 
copy and realloc */ + val = -128; + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, val, vector2.get(i)); + val++; + } + } + } + } + + @Test /* DecimalVector */ + public void testCopyFromWithNulls13() { + try (final DecimalVector vector1 = new DecimalVector(EMPTY_SCHEMA_PATH, allocator, 30, 16); + final DecimalVector vector2 = new DecimalVector(EMPTY_SCHEMA_PATH, allocator, 30, 16)) { + + vector1.allocateNew(); + assertTrue(vector1.getValueCapacity() >= vector1.INITIAL_VALUE_ALLOCATION); + assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); + + final double baseValue = 104567897654.876543654; + final BigDecimal[] decimals = new BigDecimal[4096]; + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + continue; + } + BigDecimal decimal = new BigDecimal(baseValue + (double) i); + vector1.setSafe(i, decimal); + decimals[i] = decimal; + } + + vector1.setValueCount(initialCapacity); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + assertNull(vector1.getObject(i)); + } else { + final BigDecimal decimal = vector1.getObject(i); + assertEquals(decimals[i], decimal); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); + + for (int i = 0; i < initialCapacity; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertTrue(vector2.getValueCapacity() >= initialCapacity); + 
vector2.setValueCount(initialCapacity * 2); + /* setValueCount() should have done another realloc */ + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); + + /* check vector data after copy and realloc */ + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { + assertNull(vector2.getObject(i)); + } else { + final BigDecimal decimal = vector2.getObject(i); + assertEquals(decimals[i], decimal); + } + } + } + } + + @Test /* TimeStampVector */ + public void testCopyFromWithNulls14() { + try (final TimeStampVector vector1 = new TimeStampMicroVector(EMPTY_SCHEMA_PATH, allocator); + final TimeStampVector vector2 = new TimeStampMicroVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertTrue(vector1.getValueCapacity() >= vector1.INITIAL_VALUE_ALLOCATION); + assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); + + final long val = 20145678912L; + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + continue; + } + vector1.setSafe(i, val + (long) i); + } + + vector1.setValueCount(initialCapacity); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); + + for (int i = 0; i < initialCapacity; i++) { + if ((i & 1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, val + (long) i, vector1.get(i)); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); + + for (int i = 0; i < initialCapacity; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have 
happened in copyFromSafe() */ + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); + /* setValueCount() should have done another realloc */ + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); + + /* check vector data after copy and realloc */ + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, val + (long) i, vector2.get(i)); + } + } + } + } + + @Test //https://issues.apache.org/jira/browse/ARROW-7837 + public void testCopySafeArrow7837() { + // this test exposes a bug in `handleSafe` where + // it reads a stale index and as a result missed a required resize of the value vector. + try (VarCharVector vc1 = new VarCharVector("vc1", allocator); + VarCharVector vc2 = new VarCharVector("vc2", allocator); + ) { + //initial size is carefully set in order to force the second 'copyFromSafe' operation + // to trigger a reallocation of the vector. 
+ vc2.setInitialCapacity(/*valueCount*/20, /*density*/0.5); + + vc1.setSafe(0, "1234567890".getBytes(Charset.forName("utf-8"))); + assertFalse(vc1.isNull(0)); + assertEquals(vc1.getObject(0).toString(), "1234567890"); + + vc2.copyFromSafe(0, 0, vc1); + assertFalse(vc2.isNull(0)); + assertEquals(vc2.getObject(0).toString(), "1234567890"); + + vc2.copyFromSafe(0, 5, vc1); + assertTrue(vc2.isNull(1)); + assertTrue(vc2.isNull(2)); + assertTrue(vc2.isNull(3)); + assertTrue(vc2.isNull(4)); + assertFalse(vc2.isNull(5)); + assertEquals(vc2.getObject(5).toString(), "1234567890"); + } + } + + +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDecimal256Vector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDecimal256Vector.java new file mode 100644 index 000000000..82c912cef --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDecimal256Vector.java @@ -0,0 +1,357 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.math.BigDecimal; +import java.math.BigInteger; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestDecimal256Vector { + + private static long[] intValues; + + static { + intValues = new long[60]; + for (int i = 0; i < intValues.length / 2; i++) { + intValues[i] = 1 << i + 1; + intValues[2 * i] = -1 * (1 << i + 1); + } + } + + private int scale = 3; + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testValuesWriteRead() { + try (Decimal256Vector decimalVector = TestUtils.newVector(Decimal256Vector.class, "decimal", + new ArrowType.Decimal(10, scale, 256), allocator);) { + + try (Decimal256Vector oldConstructor = new Decimal256Vector("decimal", allocator, 10, scale);) { + assertEquals(decimalVector.getField().getType(), oldConstructor.getField().getType()); + } + + decimalVector.allocateNew(); + BigDecimal[] values = new BigDecimal[intValues.length]; + for (int i = 0; i < intValues.length; i++) { + BigDecimal decimal = new BigDecimal(BigInteger.valueOf(intValues[i]), scale); + values[i] = decimal; + decimalVector.setSafe(i, decimal); + } + + decimalVector.setValueCount(intValues.length); + + for (int i = 0; i < intValues.length; i++) { + BigDecimal value = decimalVector.getObject(i); + assertEquals("unexpected data at index: " + i, values[i], value); + } + } + } + + @Test + public void testDecimal256DifferentScaleAndPrecision() { + try (Decimal256Vector decimalVector = 
TestUtils.newVector(Decimal256Vector.class, "decimal", + new ArrowType.Decimal(4, 2, 256), allocator)) { + decimalVector.allocateNew(); + + // test Decimal256 with different scale + { + BigDecimal decimal = new BigDecimal(BigInteger.valueOf(0), 3); + UnsupportedOperationException ue = + assertThrows(UnsupportedOperationException.class, () -> decimalVector.setSafe(0, decimal)); + assertEquals("BigDecimal scale must equal that in the Arrow vector: 3 != 2", ue.getMessage()); + } + + // test BigDecimal with larger precision than initialized + { + BigDecimal decimal = new BigDecimal(BigInteger.valueOf(12345), 2); + UnsupportedOperationException ue = + assertThrows(UnsupportedOperationException.class, () -> decimalVector.setSafe(0, decimal)); + assertEquals("BigDecimal precision can not be greater than that in the Arrow vector: 5 > 4", ue.getMessage()); + } + } + } + + @Test + public void testWriteBigEndian() { + try (Decimal256Vector decimalVector = TestUtils.newVector(Decimal256Vector.class, "decimal", + new ArrowType.Decimal(38, 18, 256), allocator);) { + decimalVector.allocateNew(); + BigDecimal decimal1 = new BigDecimal("123456789.000000000000000000"); + BigDecimal decimal2 = new BigDecimal("11.123456789123456789"); + BigDecimal decimal3 = new BigDecimal("1.000000000000000000"); + BigDecimal decimal4 = new BigDecimal("0.111111111000000000"); + BigDecimal decimal5 = new BigDecimal("987654321.123456789000000000"); + BigDecimal decimal6 = new BigDecimal("222222222222.222222222000000000"); + BigDecimal decimal7 = new BigDecimal("7777777777777.666666667000000000"); + BigDecimal decimal8 = new BigDecimal("1212121212.343434343000000000"); + + byte[] decimalValue1 = decimal1.unscaledValue().toByteArray(); + byte[] decimalValue2 = decimal2.unscaledValue().toByteArray(); + byte[] decimalValue3 = decimal3.unscaledValue().toByteArray(); + byte[] decimalValue4 = decimal4.unscaledValue().toByteArray(); + byte[] decimalValue5 = decimal5.unscaledValue().toByteArray(); + byte[] 
decimalValue6 = decimal6.unscaledValue().toByteArray(); + byte[] decimalValue7 = decimal7.unscaledValue().toByteArray(); + byte[] decimalValue8 = decimal8.unscaledValue().toByteArray(); + + decimalVector.setBigEndian(0, decimalValue1); + decimalVector.setBigEndian(1, decimalValue2); + decimalVector.setBigEndian(2, decimalValue3); + decimalVector.setBigEndian(3, decimalValue4); + decimalVector.setBigEndian(4, decimalValue5); + decimalVector.setBigEndian(5, decimalValue6); + decimalVector.setBigEndian(6, decimalValue7); + decimalVector.setBigEndian(7, decimalValue8); + + decimalVector.setValueCount(8); + assertEquals(8, decimalVector.getValueCount()); + assertEquals(decimal1, decimalVector.getObject(0)); + assertEquals(decimal2, decimalVector.getObject(1)); + assertEquals(decimal3, decimalVector.getObject(2)); + assertEquals(decimal4, decimalVector.getObject(3)); + assertEquals(decimal5, decimalVector.getObject(4)); + assertEquals(decimal6, decimalVector.getObject(5)); + assertEquals(decimal7, decimalVector.getObject(6)); + assertEquals(decimal8, decimalVector.getObject(7)); + } + } + + @Test + public void testLongReadWrite() { + try (Decimal256Vector decimalVector = TestUtils.newVector(Decimal256Vector.class, "decimal", + new ArrowType.Decimal(38, 0, 256), allocator)) { + decimalVector.allocateNew(); + + long[] longValues = {0L, -2L, Long.MAX_VALUE, Long.MIN_VALUE, 187L}; + + for (int i = 0; i < longValues.length; ++i) { + decimalVector.set(i, longValues[i]); + } + + decimalVector.setValueCount(longValues.length); + + for (int i = 0; i < longValues.length; ++i) { + assertEquals(new BigDecimal(longValues[i]), decimalVector.getObject(i)); + } + } + } + + + @Test + public void testBigDecimalReadWrite() { + try (Decimal256Vector decimalVector = TestUtils.newVector(Decimal256Vector.class, "decimal", + new ArrowType.Decimal(38, 9, 256), allocator);) { + decimalVector.allocateNew(); + BigDecimal decimal1 = new BigDecimal("123456789.000000000"); + BigDecimal decimal2 = new 
BigDecimal("11.123456789"); + BigDecimal decimal3 = new BigDecimal("1.000000000"); + BigDecimal decimal4 = new BigDecimal("-0.111111111"); + BigDecimal decimal5 = new BigDecimal("-987654321.123456789"); + BigDecimal decimal6 = new BigDecimal("-222222222222.222222222"); + BigDecimal decimal7 = new BigDecimal("7777777777777.666666667"); + BigDecimal decimal8 = new BigDecimal("1212121212.343434343"); + + decimalVector.set(0, decimal1); + decimalVector.set(1, decimal2); + decimalVector.set(2, decimal3); + decimalVector.set(3, decimal4); + decimalVector.set(4, decimal5); + decimalVector.set(5, decimal6); + decimalVector.set(6, decimal7); + decimalVector.set(7, decimal8); + + decimalVector.setValueCount(8); + assertEquals(8, decimalVector.getValueCount()); + assertEquals(decimal1, decimalVector.getObject(0)); + assertEquals(decimal2, decimalVector.getObject(1)); + assertEquals(decimal3, decimalVector.getObject(2)); + assertEquals(decimal4, decimalVector.getObject(3)); + assertEquals(decimal5, decimalVector.getObject(4)); + assertEquals(decimal6, decimalVector.getObject(5)); + assertEquals(decimal7, decimalVector.getObject(6)); + assertEquals(decimal8, decimalVector.getObject(7)); + } + } + + /** + * Test {@link Decimal256Vector#setBigEndian(int, byte[])} which takes BE layout input and stores in native-endian + * (NE) layout. + * Cases to cover: input byte array in different lengths in range [1-16] and negative values. 
+ */ + @Test + public void decimalBE2NE() { + try (Decimal256Vector decimalVector = TestUtils.newVector(Decimal256Vector.class, "decimal", + new ArrowType.Decimal(23, 2, 256), allocator)) { + decimalVector.allocateNew(); + + BigInteger[] testBigInts = new BigInteger[] { + new BigInteger("0"), + new BigInteger("-1"), + new BigInteger("23"), + new BigInteger("234234"), + new BigInteger("-234234234"), + new BigInteger("234234234234"), + new BigInteger("-56345345345345"), + new BigInteger("2982346298346289346293467923465345634500"), // converts to 16+ byte array + new BigInteger("-389457298347598237459832459823434653600"), // converts to 16+ byte array + new BigInteger("-345345"), + new BigInteger("754533") + }; + + int insertionIdx = 0; + insertionIdx++; // insert a null + for (BigInteger val : testBigInts) { + decimalVector.setBigEndian(insertionIdx++, val.toByteArray()); + } + insertionIdx++; // insert a null + // insert a zero length buffer + decimalVector.setBigEndian(insertionIdx++, new byte[0]); + + // Try inserting a buffer larger than 33 bytes and expect a failure + final int insertionIdxCapture = insertionIdx; + IllegalArgumentException ex = assertThrows(IllegalArgumentException.class, + () -> decimalVector.setBigEndian(insertionIdxCapture, new byte[33])); + assertTrue(ex.getMessage().equals("Invalid decimal value length. 
Valid length in [1 - 32], got 33")); + decimalVector.setValueCount(insertionIdx); + + // retrieve values and check if they are correct + int outputIdx = 0; + assertTrue(decimalVector.isNull(outputIdx++)); + for (BigInteger expected : testBigInts) { + final BigDecimal actual = decimalVector.getObject(outputIdx++); + assertEquals(expected, actual.unscaledValue()); + } + assertTrue(decimalVector.isNull(outputIdx++)); + assertEquals(BigInteger.valueOf(0), decimalVector.getObject(outputIdx).unscaledValue()); + } + } + + @Test + public void setUsingArrowBufOfLEInts() { + try (Decimal256Vector decimalVector = TestUtils.newVector(Decimal256Vector.class, "decimal", + new ArrowType.Decimal(5, 2, 256), allocator); + ArrowBuf buf = allocator.buffer(8);) { + decimalVector.allocateNew(); + + // add a positive value equivalent to 705.32 + int val = 70532; + buf.setInt(0, val); + decimalVector.setSafe(0, 0, buf, 4); + + // add a -ve value equivalent to -705.32 + val = -70532; + buf.setInt(4, val); + decimalVector.setSafe(1, 4, buf, 4); + + decimalVector.setValueCount(2); + + BigDecimal [] expectedValues = new BigDecimal[] {BigDecimal.valueOf(705.32), BigDecimal + .valueOf(-705.32)}; + for (int i = 0; i < 2; i ++) { + BigDecimal value = decimalVector.getObject(i); + assertEquals(expectedValues[i], value); + } + } + + } + + @Test + public void setUsingArrowLongLEBytes() { + try (Decimal256Vector decimalVector = TestUtils.newVector(Decimal256Vector.class, "decimal", + new ArrowType.Decimal(18, 0, 256), allocator); + ArrowBuf buf = allocator.buffer(16);) { + decimalVector.allocateNew(); + + long val = Long.MAX_VALUE; + buf.setLong(0, val); + decimalVector.setSafe(0, 0, buf, 8); + + val = Long.MIN_VALUE; + buf.setLong(8, val); + decimalVector.setSafe(1, 8, buf, 8); + + decimalVector.setValueCount(2); + + BigDecimal [] expectedValues = new BigDecimal[] {BigDecimal.valueOf(Long.MAX_VALUE), BigDecimal + .valueOf(Long.MIN_VALUE)}; + for (int i = 0; i < 2; i ++) { + BigDecimal value = 
decimalVector.getObject(i); + assertEquals(expectedValues[i], value); + } + } + } + + @Test + public void setUsingArrowBufOfBEBytes() { + try (Decimal256Vector decimalVector = TestUtils.newVector(Decimal256Vector.class, "decimal", + new ArrowType.Decimal(5, 2, 256), allocator); + ArrowBuf buf = allocator.buffer(9);) { + BigDecimal [] expectedValues = new BigDecimal[] {BigDecimal.valueOf(705.32), BigDecimal + .valueOf(-705.32), BigDecimal.valueOf(705.32)}; + verifyWritingArrowBufWithBigEndianBytes(decimalVector, buf, expectedValues, 3); + } + + try (Decimal256Vector decimalVector = TestUtils.newVector(Decimal256Vector.class, "decimal", + new ArrowType.Decimal(43, 2, 256), allocator); + ArrowBuf buf = allocator.buffer(45);) { + BigDecimal[] expectedValues = new BigDecimal[] {new BigDecimal("29823462983462893462934679234653450000000.63"), + new BigDecimal("-2982346298346289346293467923465345.63"), + new BigDecimal("2982346298346289346293467923465345.63")}; + verifyWritingArrowBufWithBigEndianBytes(decimalVector, buf, expectedValues, 15); + } + } + + private void verifyWritingArrowBufWithBigEndianBytes(Decimal256Vector decimalVector, + ArrowBuf buf, BigDecimal[] expectedValues, + int length) { + decimalVector.allocateNew(); + for (int i = 0; i < expectedValues.length; i++) { + byte[] bigEndianBytes = expectedValues[i].unscaledValue().toByteArray(); + buf.setBytes(length * i , bigEndianBytes, 0 , bigEndianBytes.length); + decimalVector.setBigEndianSafe(i, length * i, buf, bigEndianBytes.length); + } + + decimalVector.setValueCount(3); + + for (int i = 0; i < expectedValues.length; i ++) { + BigDecimal value = decimalVector.getObject(i); + assertEquals(expectedValues[i], value); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java new file mode 100644 index 000000000..c7e3e436e --- /dev/null +++ 
b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java @@ -0,0 +1,365 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.arrow.vector;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.math.BigDecimal;
import java.math.BigInteger;

import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * Round-trip tests for {@link DecimalVector}: values are written through the BigDecimal, long,
 * big-endian byte[] and ArrowBuf based setters and read back with {@code getObject}.
 */
public class TestDecimalVector {

  /**
   * Unscaled values used by {@link #testValuesWriteRead()}: slot {@code 2 * i} holds
   * {@code 2^(i + 1)} and slot {@code 2 * i + 1} holds its negation, for i in [0, 30).
   */
  private static final long[] intValues;

  static {
    intValues = new long[60];
    for (int i = 0; i < intValues.length / 2; i++) {
      // Each power of two gets its own positive/negative pair of slots so that no entry is
      // overwritten and no slot is left at zero.
      final long magnitude = 1L << (i + 1);
      intValues[2 * i] = magnitude;
      intValues[2 * i + 1] = -magnitude;
    }
  }

  private int scale = 3;

  private BufferAllocator allocator;

  /** Creates a fresh allocator before every test. */
  @Before
  public void init() {
    allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100);
  }

  /** Closes the allocator after every test. */
  @After
  public void terminate() throws Exception {
    allocator.close();
  }

  /**
   * Writes every entry of {@link #intValues} as a BigDecimal with {@link #scale} and expects to read
   * the same values back. Also checks that the legacy constructor yields the same field type.
   */
  @Test
  public void testValuesWriteRead() {
    try (DecimalVector decimalVector = TestUtils.newVector(DecimalVector.class, "decimal",
        new ArrowType.Decimal(10, scale, 128), allocator)) {

      try (DecimalVector oldConstructor = new DecimalVector("decimal", allocator, 10, scale)) {
        assertEquals(decimalVector.getField().getType(), oldConstructor.getField().getType());
      }

      decimalVector.allocateNew();
      BigDecimal[] values = new BigDecimal[intValues.length];
      for (int i = 0; i < intValues.length; i++) {
        BigDecimal decimal = new BigDecimal(BigInteger.valueOf(intValues[i]), scale);
        values[i] = decimal;
        decimalVector.setSafe(i, decimal);
      }

      decimalVector.setValueCount(intValues.length);

      for (int i = 0; i < intValues.length; i++) {
        BigDecimal value = decimalVector.getObject(i);
        assertEquals("unexpected data at index: " + i, values[i], value);
      }
    }
  }

  /**
   * Expects {@code setSafe} to throw {@link UnsupportedOperationException} for a BigDecimal whose
   * scale differs from the vector's, and for one whose precision exceeds the vector's.
   */
  @Test
  public void testBigDecimalDifferentScaleAndPrecision() {
    try (DecimalVector decimalVector = TestUtils.newVector(DecimalVector.class, "decimal",
        new ArrowType.Decimal(4, 2, 128), allocator)) {
      decimalVector.allocateNew();

      // test BigDecimal with different scale
      try {
        decimalVector.setSafe(0, new BigDecimal(BigInteger.valueOf(0), 3));
        fail("expected UnsupportedOperationException for a BigDecimal with scale 3");
      } catch (UnsupportedOperationException expected) {
        // expected: the vector was created with scale 2
      }

      // test BigDecimal with larger precision than initialized
      try {
        decimalVector.setSafe(0, new BigDecimal(BigInteger.valueOf(12345), 2));
        fail("expected UnsupportedOperationException for a BigDecimal with precision 5");
      } catch (UnsupportedOperationException expected) {
        // expected: the vector was created with precision 4
      }
    }
  }

  /** Writes big-endian unscaled bytes with {@code setBigEndian} and reads the decimals back. */
  @Test
  public void testWriteBigEndian() {
    try (DecimalVector decimalVector = TestUtils.newVector(DecimalVector.class, "decimal",
        new ArrowType.Decimal(38, 9, 128), allocator)) {
      decimalVector.allocateNew();

      final BigDecimal[] decimals = {
          new BigDecimal("123456789.000000000"),
          new BigDecimal("11.123456789"),
          new BigDecimal("1.000000000"),
          new BigDecimal("0.111111111"),
          new BigDecimal("987654321.123456789"),
          new BigDecimal("222222222222.222222222"),
          new BigDecimal("7777777777777.666666667"),
          new BigDecimal("1212121212.343434343")
      };

      for (int i = 0; i < decimals.length; i++) {
        // BigInteger#toByteArray returns the two's-complement value in big-endian byte order.
        decimalVector.setBigEndian(i, decimals[i].unscaledValue().toByteArray());
      }

      decimalVector.setValueCount(decimals.length);
      assertEquals(decimals.length, decimalVector.getValueCount());
      for (int i = 0; i < decimals.length; i++) {
        assertEquals(decimals[i], decimalVector.getObject(i));
      }
    }
  }

  /** Writes {@code long} values, including both extremes, and reads them back as BigDecimal. */
  @Test
  public void testLongReadWrite() {
    try (DecimalVector decimalVector = TestUtils.newVector(DecimalVector.class, "decimal",
        new ArrowType.Decimal(38, 0, 128), allocator)) {
      decimalVector.allocateNew();

      long[] longValues = {0L, -2L, Long.MAX_VALUE, Long.MIN_VALUE, 187L};

      for (int i = 0; i < longValues.length; ++i) {
        decimalVector.set(i, longValues[i]);
      }

      decimalVector.setValueCount(longValues.length);

      for (int i = 0; i < longValues.length; ++i) {
        assertEquals(new BigDecimal(longValues[i]), decimalVector.getObject(i));
      }
    }
  }

  /** Writes positive and negative BigDecimal values with {@code set} and reads them back. */
  @Test
  public void testBigDecimalReadWrite() {
    try (DecimalVector decimalVector = TestUtils.newVector(DecimalVector.class, "decimal",
        new ArrowType.Decimal(38, 9, 128), allocator)) {
      decimalVector.allocateNew();

      final BigDecimal[] decimals = {
          new BigDecimal("123456789.000000000"),
          new BigDecimal("11.123456789"),
          new BigDecimal("1.000000000"),
          new BigDecimal("-0.111111111"),
          new BigDecimal("-987654321.123456789"),
          new BigDecimal("-222222222222.222222222"),
          new BigDecimal("7777777777777.666666667"),
          new BigDecimal("1212121212.343434343")
      };

      for (int i = 0; i < decimals.length; i++) {
        decimalVector.set(i, decimals[i]);
      }

      decimalVector.setValueCount(decimals.length);
      assertEquals(decimals.length, decimalVector.getValueCount());
      for (int i = 0; i < decimals.length; i++) {
        assertEquals(decimals[i], decimalVector.getObject(i));
      }
    }
  }

  /**
   * Test {@link DecimalVector#setBigEndian(int, byte[])} which takes BE layout input and stores in native-endian (NE)
   * layout.
   * Cases to cover: input byte array in different lengths in range [1-16] and negative values.
   */
  @Test
  public void decimalBE2NE() {
    try (DecimalVector decimalVector = TestUtils.newVector(DecimalVector.class, "decimal",
        new ArrowType.Decimal(21, 2, 128), allocator)) {
      decimalVector.allocateNew();

      BigInteger[] testBigInts = new BigInteger[] {
          new BigInteger("0"),
          new BigInteger("-1"),
          new BigInteger("23"),
          new BigInteger("234234"),
          new BigInteger("-234234234"),
          new BigInteger("234234234234"),
          new BigInteger("-56345345345345"),
          new BigInteger("29823462983462893462934679234653456345"), // converts to 16 byte array
          new BigInteger("-3894572983475982374598324598234346536"), // converts to 16 byte array
          new BigInteger("-345345"),
          new BigInteger("754533")
      };

      int insertionIdx = 0;
      insertionIdx++; // insert a null
      for (BigInteger val : testBigInts) {
        decimalVector.setBigEndian(insertionIdx++, val.toByteArray());
      }
      insertionIdx++; // insert a null
      // insert a zero length buffer
      decimalVector.setBigEndian(insertionIdx++, new byte[0]);

      // Try inserting a buffer larger than 16bytes and expect a failure
      try {
        decimalVector.setBigEndian(insertionIdx, new byte[17]);
        fail("above statement should have failed");
      } catch (IllegalArgumentException ex) {
        // assertEquals (rather than assertTrue on equals) reports the actual message on mismatch
        assertEquals("Invalid decimal value length. Valid length in [1 - 16], got 17", ex.getMessage());
      }
      decimalVector.setValueCount(insertionIdx);

      // retrieve values and check if they are correct
      int outputIdx = 0;
      assertTrue(decimalVector.isNull(outputIdx++));
      for (BigInteger expected : testBigInts) {
        final BigDecimal actual = decimalVector.getObject(outputIdx++);
        assertEquals(expected, actual.unscaledValue());
      }
      assertTrue(decimalVector.isNull(outputIdx++));
      assertEquals(BigInteger.valueOf(0), decimalVector.getObject(outputIdx).unscaledValue());
    }
  }

  /**
   * Stores two 4-byte ints in an ArrowBuf and passes each to
   * {@code setSafe(index, start, buffer, length)} with a length of 4.
   */
  @Test
  public void setUsingArrowBufOfInts() {
    try (DecimalVector decimalVector = TestUtils.newVector(DecimalVector.class, "decimal",
        new ArrowType.Decimal(5, 2, 128), allocator);
         ArrowBuf buf = allocator.buffer(8)) {
      decimalVector.allocateNew();

      // add a positive value equivalent to 705.32
      int val = 70532;
      buf.setInt(0, val);
      decimalVector.setSafe(0, 0, buf, 4);

      // add a -ve value equivalent to -705.32
      val = -70532;
      buf.setInt(4, val);
      decimalVector.setSafe(1, 4, buf, 4);

      decimalVector.setValueCount(2);

      BigDecimal[] expectedValues = new BigDecimal[] {BigDecimal.valueOf(705.32), BigDecimal.valueOf(-705.32)};
      for (int i = 0; i < expectedValues.length; i++) {
        BigDecimal value = decimalVector.getObject(i);
        assertEquals(expectedValues[i], value);
      }
    }
  }

  /**
   * Stores {@code Long.MAX_VALUE} and {@code Long.MIN_VALUE} as 8-byte longs in an ArrowBuf and
   * passes each to {@code setSafe(index, start, buffer, length)} with a length of 8.
   */
  @Test
  public void setUsingArrowLongBytes() {
    try (DecimalVector decimalVector = TestUtils.newVector(DecimalVector.class, "decimal",
        new ArrowType.Decimal(18, 0, 128), allocator);
         ArrowBuf buf = allocator.buffer(16)) {
      decimalVector.allocateNew();

      long val = Long.MAX_VALUE;
      buf.setLong(0, val);
      decimalVector.setSafe(0, 0, buf, 8);

      val = Long.MIN_VALUE;
      buf.setLong(8, val);
      decimalVector.setSafe(1, 8, buf, 8);

      decimalVector.setValueCount(2);

      BigDecimal[] expectedValues = new BigDecimal[] {BigDecimal.valueOf(Long.MAX_VALUE),
          BigDecimal.valueOf(Long.MIN_VALUE)};
      for (int i = 0; i < expectedValues.length; i++) {
        BigDecimal value = decimalVector.getObject(i);
        assertEquals(expectedValues[i], value);
      }
    }
  }

  /**
   * Runs {@link #verifyWritingArrowBufWithBigEndianBytes} once with small values (3-byte slots) and
   * once with 36-digit values (15-byte slots).
   */
  @Test
  public void setUsingArrowBufOfBEBytes() {
    try (DecimalVector decimalVector = TestUtils.newVector(DecimalVector.class, "decimal",
        new ArrowType.Decimal(5, 2, 128), allocator);
         ArrowBuf buf = allocator.buffer(9)) {
      BigDecimal[] expectedValues = new BigDecimal[] {BigDecimal.valueOf(705.32),
          BigDecimal.valueOf(-705.32), BigDecimal.valueOf(705.32)};
      verifyWritingArrowBufWithBigEndianBytes(decimalVector, buf, expectedValues, 3);
    }

    try (DecimalVector decimalVector = TestUtils.newVector(DecimalVector.class, "decimal",
        new ArrowType.Decimal(36, 2, 128), allocator);
         ArrowBuf buf = allocator.buffer(45)) {
      BigDecimal[] expectedValues = new BigDecimal[] {new BigDecimal("2982346298346289346293467923465345.63"),
          new BigDecimal("-2982346298346289346293467923465345.63"),
          new BigDecimal("2982346298346289346293467923465345.63")};
      verifyWritingArrowBufWithBigEndianBytes(decimalVector, buf, expectedValues, 15);
    }
  }

  /**
   * Copies the big-endian unscaled bytes of each expected value into {@code buf}, hands them to
   * {@code setBigEndianSafe}, and asserts that {@code getObject} returns the expected values.
   *
   * @param decimalVector vector under test; allocated by this method
   * @param buf scratch buffer; value {@code i} is written at byte offset {@code length * i}
   * @param expectedValues values to write and then expect back
   * @param length spacing in bytes between consecutive values in {@code buf}
   */
  private void verifyWritingArrowBufWithBigEndianBytes(DecimalVector decimalVector,
                                                       ArrowBuf buf, BigDecimal[] expectedValues,
                                                       int length) {
    decimalVector.allocateNew();
    for (int i = 0; i < expectedValues.length; i++) {
      byte[] bigEndianBytes = expectedValues[i].unscaledValue().toByteArray();
      buf.setBytes(length * i, bigEndianBytes, 0, bigEndianBytes.length);
      decimalVector.setBigEndianSafe(i, length * i, buf, bigEndianBytes.length);
    }

    // Derived from the input instead of a literal 3 so the helper works for any number of values.
    decimalVector.setValueCount(expectedValues.length);

    for (int i = 0; i < expectedValues.length; i++) {
      BigDecimal value = decimalVector.getObject(i);
      assertEquals(expectedValues[i], value);
    }
  }
}
diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java new file mode 100644 index 000000000..01becf007 --- /dev/null +++
b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java @@ -0,0 +1,639 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.arrow.vector;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.complex.DenseUnionVector;
import org.apache.arrow.vector.complex.StructVector;
import org.apache.arrow.vector.complex.VectorWithOrdinal;
import org.apache.arrow.vector.holders.NullableBigIntHolder;
import org.apache.arrow.vector.holders.NullableBitHolder;
import org.apache.arrow.vector.holders.NullableFloat4Holder;
import org.apache.arrow.vector.holders.NullableIntHolder;
import org.apache.arrow.vector.holders.NullableUInt4Holder;
import org.apache.arrow.vector.testing.ValueVectorDataPopulator;
import org.apache.arrow.vector.types.Types;
import org.apache.arrow.vector.types.Types.MinorType;
import org.apache.arrow.vector.types.UnionMode;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.JsonStringHashMap;
import org.apache.arrow.vector.util.Text;
import org.apache.arrow.vector.util.TransferPair;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * Tests for {@link DenseUnionVector}: writing typed values, transfer and split-and-transfer,
 * field metadata, buffer addresses, and child vectors attached with {@code addVector}.
 */
public class TestDenseUnionVector {
  private static final String EMPTY_SCHEMA_PATH = "";

  private BufferAllocator allocator;

  /** Creates a fresh allocator before every test. */
  @Before
  public void init() {
    allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100);
  }

  /** Closes the allocator after every test. */
  @After
  public void terminate() throws Exception {
    allocator.close();
  }

  /** Writes UINT4 values at slots 0 and 2 and expects slots 1 and 3 to read back as null. */
  @Test
  public void testDenseUnionVector() throws Exception {

    final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder();
    uInt4Holder.value = 100;
    uInt4Holder.isSet = 1;

    try (DenseUnionVector unionVector = new DenseUnionVector(EMPTY_SCHEMA_PATH, allocator, null, null)) {
      unionVector.allocateNew();

      // write some data
      byte uint4TypeId = unionVector.registerNewTypeId(Field.nullable("", MinorType.UINT4.getType()));
      unionVector.setTypeId(0, uint4TypeId);
      unionVector.setSafe(0, uInt4Holder);
      unionVector.setTypeId(2, uint4TypeId);
      unionVector.setSafe(2, uInt4Holder);
      unionVector.setValueCount(4);

      // check that what we wrote is correct
      assertEquals(4, unionVector.getValueCount());

      assertFalse(unionVector.isNull(0));
      assertEquals(100, unionVector.getObject(0));

      assertNull(unionVector.getObject(1));

      assertFalse(unionVector.isNull(2));
      assertEquals(100, unionVector.getObject(2));

      assertNull(unionVector.getObject(3));
    }
  }

  /** Transfers a vector holding INT and BIT values and checks field and values at the target. */
  @Test
  public void testTransfer() throws Exception {
    try (DenseUnionVector srcVector = new DenseUnionVector(EMPTY_SCHEMA_PATH, allocator, null, null)) {
      srcVector.allocateNew();

      // write some data
      byte intTypeId = srcVector.registerNewTypeId(Field.nullable("", MinorType.INT.getType()));
      srcVector.setTypeId(0, intTypeId);
      srcVector.setSafe(0, newIntHolder(5));
      byte bitTypeId = srcVector.registerNewTypeId(Field.nullable("", MinorType.BIT.getType()));
      srcVector.setTypeId(1, bitTypeId);
      srcVector.setSafe(1, newBitHolder(false));
      srcVector.setTypeId(3, intTypeId);
      srcVector.setSafe(3, newIntHolder(10));
      srcVector.setTypeId(5, bitTypeId);
      srcVector.setSafe(5, newBitHolder(false));
      srcVector.setValueCount(6);

      try (DenseUnionVector destVector = new DenseUnionVector(EMPTY_SCHEMA_PATH, allocator, null, null)) {
        TransferPair pair = srcVector.makeTransferPair(destVector);

        // Creating the transfer should transfer the type of the field at least.
        assertEquals(srcVector.getField(), destVector.getField());

        // transfer
        pair.transfer();

        assertEquals(srcVector.getField(), destVector.getField());

        // now check the values are transferred
        assertEquals(6, destVector.getValueCount());

        assertFalse(destVector.isNull(0));
        assertEquals(5, destVector.getObject(0));

        assertFalse(destVector.isNull(1));
        assertEquals(false, destVector.getObject(1));

        assertNull(destVector.getObject(2));

        assertFalse(destVector.isNull(3));
        assertEquals(10, destVector.getObject(3));

        assertNull(destVector.getObject(4));

        assertFalse(destVector.isNull(5));
        assertEquals(false, destVector.getObject(5));
      }
    }
  }

  /**
   * Fills ten INT slots with 5, 10, ..., 50 and checks that each {@code splitAndTransfer} range
   * yields the same objects in the target vector as in the source range.
   */
  @Test
  public void testSplitAndTransfer() throws Exception {
    try (DenseUnionVector sourceVector = new DenseUnionVector(EMPTY_SCHEMA_PATH, allocator, null, null)) {

      sourceVector.allocateNew();

      /* populate the UnionVector */
      final int valueCount = 10;
      byte intTypeId = sourceVector.registerNewTypeId(Field.nullable("", MinorType.INT.getType()));
      for (int i = 0; i < valueCount; i++) {
        sourceVector.setTypeId(i, intTypeId);
        sourceVector.setSafe(i, newIntHolder(5 * (i + 1)));
      }
      sourceVector.setValueCount(valueCount);

      /* check the vector output */
      assertEquals(valueCount, sourceVector.getValueCount());
      for (int i = 0; i < valueCount; i++) {
        assertFalse(sourceVector.isNull(i));
        assertEquals(5 * (i + 1), sourceVector.getObject(i));
      }

      try (DenseUnionVector toVector = new DenseUnionVector(EMPTY_SCHEMA_PATH, allocator, null, null)) {
        toVector.registerNewTypeId(Field.nullable("", MinorType.INT.getType()));

        final TransferPair transferPair = sourceVector.makeTransferPair(toVector);

        // each entry is {start index, length}
        final int[][] transferLengths = {{0, 3},
            {3, 1},
            {4, 2},
            {6, 1},
            {7, 1},
            {8, 2}
        };

        for (final int[] transferLength : transferLengths) {
          final int start = transferLength[0];
          final int length = transferLength[1];

          transferPair.splitAndTransfer(start, length);

          /* check the toVector output after doing the splitAndTransfer */
          for (int i = 0; i < length; i++) {
            assertEquals("Different data at indexes: " + (start + i) + "and " + i, sourceVector.getObject(start + i),
                toVector.getObject(i));
          }
        }
      }
    }
  }

  /**
   * Same as {@link #testSplitAndTransfer()} but the source alternates INT values (even slots)
   * and FLOAT4 values (odd slots).
   */
  @Test
  public void testSplitAndTransferWithMixedVectors() throws Exception {
    try (DenseUnionVector sourceVector = new DenseUnionVector(EMPTY_SCHEMA_PATH, allocator, null, null)) {

      sourceVector.allocateNew();

      /* populate the UnionVector: slot 2 * i holds ints[i], slot 2 * i + 1 holds floats[i] */
      final int[] ints = {5, 10, 15, 20, 30};
      final float[] floats = {5.5f, 10.5f, 15.5f, 20.5f, 30.5f};

      byte intTypeId = sourceVector.registerNewTypeId(Field.nullable("", MinorType.INT.getType()));
      byte float4TypeId = sourceVector.registerNewTypeId(Field.nullable("", MinorType.FLOAT4.getType()));

      for (int i = 0; i < ints.length; i++) {
        sourceVector.setTypeId(2 * i, intTypeId);
        sourceVector.setSafe(2 * i, newIntHolder(ints[i]));

        sourceVector.setTypeId(2 * i + 1, float4TypeId);
        sourceVector.setSafe(2 * i + 1, newFloat4Holder(floats[i]));
      }
      sourceVector.setValueCount(10);

      /* check the vector output */
      assertEquals(10, sourceVector.getValueCount());
      for (int i = 0; i < ints.length; i++) {
        assertFalse(sourceVector.isNull(2 * i));
        assertEquals(ints[i], sourceVector.getObject(2 * i));
        assertFalse(sourceVector.isNull(2 * i + 1));
        assertEquals(floats[i], sourceVector.getObject(2 * i + 1));
      }

      try (DenseUnionVector toVector = new DenseUnionVector(EMPTY_SCHEMA_PATH, allocator, null, null)) {
        toVector.registerNewTypeId(Field.nullable("", MinorType.INT.getType()));
        toVector.registerNewTypeId(Field.nullable("", MinorType.FLOAT4.getType()));

        final TransferPair transferPair = sourceVector.makeTransferPair(toVector);

        // each entry is {start index, length}
        final int[][] transferLengths = {{0, 2},
            {2, 1},
            {3, 2},
            {5, 3},
            {8, 2}
        };

        for (final int[] transferLength : transferLengths) {
          final int start = transferLength[0];
          final int length = transferLength[1];

          transferPair.splitAndTransfer(start, length);

          /* check the toVector output after doing the splitAndTransfer */
          for (int i = 0; i < length; i++) {
            assertEquals("Different values at index: " + i, sourceVector.getObject(start + i), toVector.getObject(i));
          }
        }
      }
    }
  }

  /**
   * Builds a dense union field with metadata and two children (int, varchar) and checks that the
   * vector created from it reports the same field, size and child ordinals.
   */
  @Test
  public void testGetFieldTypeInfo() throws Exception {
    Map<String, String> metadata = new HashMap<>();
    metadata.put("key1", "value1");

    int[] typeIds = new int[2];
    typeIds[0] = 0;
    typeIds[1] = 1;

    List<Field> children = new ArrayList<>();
    children.add(new Field("int", FieldType.nullable(MinorType.INT.getType()), null));
    children.add(new Field("varchar", FieldType.nullable(MinorType.VARCHAR.getType()), null));

    final FieldType fieldType = new FieldType(false, new ArrowType.Union(UnionMode.Dense, typeIds),
        /*dictionary=*/null, metadata);
    final Field field = new Field("union", fieldType, children);

    MinorType minorType = MinorType.DENSEUNION;
    // try-with-resources so the vector and its children are released even if an assertion fails
    try (DenseUnionVector vector = (DenseUnionVector) minorType.getNewVector(field, allocator, null)) {
      vector.initializeChildrenFromFields(children);

      assertEquals(field, vector.getField());

      // Union has 2 child vectors
      assertEquals(2, vector.size());

      // Check child field 0
      VectorWithOrdinal intChild = vector.getChildVectorWithOrdinal("int");
      assertEquals(0, intChild.ordinal);
      assertEquals(children.get(0), intChild.vector.getField());

      // Check child field 1
      VectorWithOrdinal varcharChild = vector.getChildVectorWithOrdinal("varchar");
      assertEquals(1, varcharChild.ordinal);
      assertEquals(children.get(1), varcharChild.vector.getField());
    }
  }

  /**
   * Checks that {@code getDataBufferAddress} throws {@link UnsupportedOperationException} and that
   * the second field buffer is the one reported by {@code getOffsetBufferAddress}.
   */
  @Test
  public void testGetBufferAddress() throws Exception {
    try (DenseUnionVector vector = new DenseUnionVector(EMPTY_SCHEMA_PATH, allocator, null, null)) {
      vector.allocateNew();

      /* populate the UnionVector */
      byte intTypeId = vector.registerNewTypeId(Field.nullable("", MinorType.INT.getType()));
      vector.setTypeId(0, intTypeId);
      vector.setSafe(0, newIntHolder(5));

      // NOTE(review): this id is registered with an INT field although it is used for FLOAT4
      // values below; looks like a copy-paste slip (compare testSplitAndTransferWithMixedVectors,
      // which registers FLOAT4). Left unchanged; confirm against DenseUnionVector before editing.
      byte float4TypeId = vector.registerNewTypeId(Field.nullable("", MinorType.INT.getType()));
      vector.setTypeId(1, float4TypeId);
      vector.setSafe(1, newFloat4Holder(5.5f));

      vector.setTypeId(2, intTypeId);
      vector.setSafe(2, newIntHolder(10));

      vector.setTypeId(3, float4TypeId);
      vector.setSafe(3, newFloat4Holder(10.5f));

      vector.setValueCount(10);

      /* check the vector output */
      assertEquals(10, vector.getValueCount());
      assertFalse(vector.isNull(0));
      assertEquals(5, vector.getObject(0));
      assertFalse(vector.isNull(1));
      assertEquals(5.5f, vector.getObject(1));
      assertFalse(vector.isNull(2));
      assertEquals(10, vector.getObject(2));
      assertFalse(vector.isNull(3));
      assertEquals(10.5f, vector.getObject(3));

      List<ArrowBuf> buffers = vector.getFieldBuffers();

      long offsetAddress = vector.getOffsetBufferAddress();

      // Assert after the try/catch (not in a finally block) so that an unexpected exception
      // type propagates as-is instead of being replaced by an AssertionError.
      boolean error = false;
      try {
        vector.getDataBufferAddress();
      } catch (UnsupportedOperationException ue) {
        error = true;
      }
      assertTrue(error);

      assertEquals(2, buffers.size());
      assertEquals(offsetAddress, buffers.get(1).memoryAddress());
    }
  }

  /**
   * Test adding two struct vectors to the dense union vector.
   */
  @Test
  public void testMultipleStructs() {
    FieldType type = new FieldType(true, ArrowType.Struct.INSTANCE, null, null);
    try (StructVector structVector1 = new StructVector("struct1", allocator, type, null);
         StructVector structVector2 = new StructVector("struct2", allocator, type, null);
         DenseUnionVector unionVector = DenseUnionVector.empty("union", allocator)) {

      // prepare sub vectors

      // first struct vector: (int, int)
      IntVector subVector11 = structVector1
          .addOrGet("sub11", FieldType.nullable(MinorType.INT.getType()), IntVector.class);
      subVector11.allocateNew();
      ValueVectorDataPopulator.setVector(subVector11, 0, 1);

      IntVector subVector12 = structVector1
          .addOrGet("sub12", FieldType.nullable(MinorType.INT.getType()), IntVector.class);
      subVector12.allocateNew();
      ValueVectorDataPopulator.setVector(subVector12, 0, 10);

      structVector1.setIndexDefined(0);
      structVector1.setIndexDefined(1);
      structVector1.setValueCount(2);

      // second struct vector: (string, string)
      VarCharVector subVector21 = structVector2
          .addOrGet("sub21", FieldType.nullable(MinorType.VARCHAR.getType()), VarCharVector.class);
      subVector21.allocateNew();
      ValueVectorDataPopulator.setVector(subVector21, "a0");

      VarCharVector subVector22 = structVector2
          .addOrGet("sub22", FieldType.nullable(MinorType.VARCHAR.getType()), VarCharVector.class);
      subVector22.allocateNew();
      ValueVectorDataPopulator.setVector(subVector22, "b0");

      structVector2.setIndexDefined(0);
      structVector2.setValueCount(1);

      // register relative types
      byte typeId1 = unionVector.registerNewTypeId(structVector1.getField());
      byte typeId2 = unionVector.registerNewTypeId(structVector2.getField());
      assertEquals(0, typeId1);
      assertEquals(1, typeId2);

      // add two struct vectors to union vector
      unionVector.addVector(typeId1, structVector1);
      unionVector.addVector(typeId2, structVector2);

      while (unionVector.getValueCapacity() < 3) {
        unionVector.reAlloc();
      }

      ArrowBuf offsetBuf = unionVector.getOffsetBuffer();

      // slot 0 -> struct1[0]
      unionVector.setTypeId(0, typeId1);
      offsetBuf.setInt(0, 0);

      // slot 1 -> struct2[0]
      unionVector.setTypeId(1, typeId2);
      offsetBuf.setInt(DenseUnionVector.OFFSET_WIDTH, 0);

      // slot 2 -> struct1[1]
      unionVector.setTypeId(2, typeId1);
      offsetBuf.setInt(DenseUnionVector.OFFSET_WIDTH * 2, 1);

      unionVector.setValueCount(3);

      Map<String, Object> value0 = new JsonStringHashMap<>();
      value0.put("sub11", 0);
      value0.put("sub12", 0);

      assertEquals(value0, unionVector.getObject(0));

      Map<String, Object> value1 = new JsonStringHashMap<>();
      value1.put("sub21", new Text("a0"));
      value1.put("sub22", new Text("b0"));

      assertEquals(value1, unionVector.getObject(1));

      Map<String, Object> value2 = new JsonStringHashMap<>();
      value2.put("sub11", 1);
      value2.put("sub12", 10);

      assertEquals(value2, unionVector.getObject(2));
    }
  }

  /**
   * Test adding two varchar vectors to the dense union vector.
   */
  @Test
  public void testMultipleVarChars() {
    try (VarCharVector childVector1 = new VarCharVector("child1", allocator);
         VarCharVector childVector2 = new VarCharVector("child2", allocator);
         DenseUnionVector unionVector = DenseUnionVector.empty("union", allocator)) {

      // prepare sub vectors
      ValueVectorDataPopulator.setVector(childVector1, "a0", "a4");
      ValueVectorDataPopulator.setVector(childVector2, "b1", "b2");

      // register relative types
      byte typeId1 = unionVector.registerNewTypeId(childVector1.getField());
      byte typeId2 = unionVector.registerNewTypeId(childVector2.getField());

      assertEquals(0, typeId1);
      assertEquals(1, typeId2);

      while (unionVector.getValueCapacity() < 5) {
        unionVector.reAlloc();
      }

      // add two varchar vectors to union vector
      unionVector.addVector(typeId1, childVector1);
      unionVector.addVector(typeId2, childVector2);

      ArrowBuf offsetBuf = unionVector.getOffsetBuffer();

      // slot 0 points to child1
      unionVector.setTypeId(0, typeId1);
      offsetBuf.setInt(0, 0);

      // slot 1 points to child2
      unionVector.setTypeId(1, typeId2);
      offsetBuf.setInt(DenseUnionVector.OFFSET_WIDTH, 0);

      // slot 2 points to child2
      unionVector.setTypeId(2, typeId2);
      offsetBuf.setInt(DenseUnionVector.OFFSET_WIDTH * 2, 1);

      // slot 3 is left unset

      // slot 4 points to child1
      unionVector.setTypeId(4, typeId1);
      offsetBuf.setInt(DenseUnionVector.OFFSET_WIDTH * 4, 1);

      unionVector.setValueCount(5);

      assertEquals(new Text("a0"), unionVector.getObject(0));
      assertEquals(new Text("b1"), unionVector.getObject(1));
      assertEquals(new Text("b2"), unionVector.getObject(2));
      assertNull(unionVector.getObject(3));
      assertEquals(new Text("a4"), unionVector.getObject(4));
    }
  }

  /**
   * Writes INT, BIGINT and FLOAT4 values through the union vector and checks the value count and
   * contents of each child vector obtained with {@code getVectorByType}.
   */
  @Test
  public void testChildVectorValueCounts() {
    final NullableIntHolder intHolder = new NullableIntHolder();
    intHolder.isSet = 1;

    final NullableBigIntHolder longHolder = new NullableBigIntHolder();
    longHolder.isSet = 1;

    final NullableFloat4Holder floatHolder = new NullableFloat4Holder();
    floatHolder.isSet = 1;

    try (DenseUnionVector vector = new DenseUnionVector("vector", allocator, null, null)) {
      vector.allocateNew();

      // populate the union vector with values {7, null, 8L, 9.0f, 10, 12L}
      while (vector.getValueCapacity() < 6) {
        vector.reAlloc();
      }
      byte intTypeId = vector.registerNewTypeId(Field.nullable("", Types.MinorType.INT.getType()));
      vector.setTypeId(0, intTypeId);
      intHolder.value = 7;
      vector.setSafe(0, intHolder);
      byte longTypeId = vector.registerNewTypeId(Field.nullable("", Types.MinorType.BIGINT.getType()));
      vector.setTypeId(2, longTypeId);
      longHolder.value = 8L;
      vector.setSafe(2, longHolder);
      byte floatTypeId = vector.registerNewTypeId(Field.nullable("", Types.MinorType.FLOAT4.getType()));
      vector.setTypeId(3, floatTypeId);
      floatHolder.value = 9.0f;
      vector.setSafe(3, floatHolder);

      vector.setTypeId(4, intTypeId);
      intHolder.value = 10;
      vector.setSafe(4, intHolder);
      vector.setTypeId(5, longTypeId);
      longHolder.value = 12L;
      vector.setSafe(5, longHolder);

      vector.setValueCount(6);

      // verify results
      IntVector intVector = (IntVector) vector.getVectorByType(intTypeId);
      assertEquals(2, intVector.getValueCount());
      assertEquals(7, intVector.get(0));
      assertEquals(10, intVector.get(1));

      BigIntVector longVector = (BigIntVector) vector.getVectorByType(longTypeId);
      assertEquals(2, longVector.getValueCount());
      assertEquals(8L, longVector.get(0));
      assertEquals(12L, longVector.get(1));

      Float4Vector floatVector = (Float4Vector) vector.getVectorByType(floatTypeId);
      assertEquals(1, floatVector.getValueCount());
      assertEquals(9.0f, floatVector.get(0), 0);
    }
  }

  /** Returns a holder marked as set that carries {@code value}. */
  private static NullableIntHolder newIntHolder(int value) {
    final NullableIntHolder holder = new NullableIntHolder();
    holder.isSet = 1;
    holder.value = value;
    return holder;
  }

  /** Returns a holder marked as set whose value is 1 for {@code true} and 0 for {@code false}. */
  private static NullableBitHolder newBitHolder(boolean value) {
    final NullableBitHolder holder = new NullableBitHolder();
    holder.isSet = 1;
    holder.value = value ? 1 : 0;
    return holder;
  }

  /** Returns a holder marked as set that carries {@code value}. */
  private static NullableFloat4Holder newFloat4Holder(float value) {
    final NullableFloat4Holder holder = new NullableFloat4Holder();
    holder.isSet = 1;
    holder.value = value;
    return holder;
  }
}
diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java new file mode 100644 index 000000000..bc6cddf36 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java @@ -0,0 +1,1032 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.TestUtils.newVarBinaryVector; +import static org.apache.arrow.vector.TestUtils.newVarCharVector; +import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.function.ToIntBiFunction; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.complex.impl.NullableStructWriter; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryEncoder; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.dictionary.ListSubfieldEncoder; +import org.apache.arrow.vector.dictionary.StructSubfieldEncoder; +import org.apache.arrow.vector.holders.NullableIntHolder; +import org.apache.arrow.vector.holders.NullableUInt4Holder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.Text; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestDictionaryVector { + + private BufferAllocator allocator; + + byte[] zero = 
"foo".getBytes(StandardCharsets.UTF_8); + byte[] one = "bar".getBytes(StandardCharsets.UTF_8); + byte[] two = "baz".getBytes(StandardCharsets.UTF_8); + + byte[][] data = new byte[][] {zero, one, two}; + + @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testEncodeStrings() { + // Create a new value vector + try (final VarCharVector vector = newVarCharVector("foo", allocator); + final VarCharVector dictionaryVector = newVarCharVector("dict", allocator);) { + + setVector(vector, zero, one, one, two, zero); + setVector(dictionaryVector, zero, one, two); + + Dictionary dictionary = + new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); + + try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) { + // verify indices + assertEquals(IntVector.class, encoded.getClass()); + + IntVector index = ((IntVector) encoded); + assertEquals(5, index.getValueCount()); + assertEquals(0, index.get(0)); + assertEquals(1, index.get(1)); + assertEquals(1, index.get(2)); + assertEquals(2, index.get(3)); + assertEquals(0, index.get(4)); + + // now run through the decoder and verify we get the original back + try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) { + assertEquals(vector.getClass(), decoded.getClass()); + assertEquals(vector.getValueCount(), ((VarCharVector) decoded).getValueCount()); + for (int i = 0; i < 5; i++) { + assertEquals(vector.getObject(i), ((VarCharVector) decoded).getObject(i)); + } + } + } + } + } + + @Test + public void testEncodeLargeVector() { + // Create a new value vector + try (final VarCharVector vector = newVarCharVector("foo", allocator); + final VarCharVector dictionaryVector = newVarCharVector("dict", allocator);) { + vector.allocateNew(); + + int count = 10000; + + for (int i = 0; i < 10000; ++i) { + vector.setSafe(i, data[i % 3], 0, 
data[i % 3].length); + } + vector.setValueCount(count); + + setVector(dictionaryVector, zero, one, two); + + Dictionary dictionary = + new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); + + try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) { + // verify indices + assertEquals(IntVector.class, encoded.getClass()); + + IntVector index = ((IntVector) encoded); + assertEquals(count, index.getValueCount()); + for (int i = 0; i < count; ++i) { + assertEquals(i % 3, index.get(i)); + } + + // now run through the decoder and verify we get the original back + try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) { + assertEquals(vector.getClass(), decoded.getClass()); + assertEquals(vector.getValueCount(), decoded.getValueCount()); + for (int i = 0; i < count; ++i) { + assertEquals(vector.getObject(i), decoded.getObject(i)); + } + } + } + } + } + + @Test + public void testEncodeList() { + // Create a new value vector + try (final ListVector vector = ListVector.empty("vector", allocator); + final ListVector dictionaryVector = ListVector.empty("dict", allocator);) { + + UnionListWriter writer = vector.getWriter(); + writer.allocate(); + + //set some values + writeListVector(writer, new int[]{10, 20}); + writeListVector(writer, new int[]{10, 20}); + writeListVector(writer, new int[]{10, 20}); + writeListVector(writer, new int[]{30, 40, 50}); + writeListVector(writer, new int[]{30, 40, 50}); + writeListVector(writer, new int[]{10, 20}); + + writer.setValueCount(6); + + UnionListWriter dictWriter = dictionaryVector.getWriter(); + dictWriter.allocate(); + + writeListVector(dictWriter, new int[]{10, 20}); + writeListVector(dictWriter, new int[]{30, 40, 50}); + + dictWriter.setValueCount(2); + + Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); + + try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) { + // verify indices + 
assertEquals(IntVector.class, encoded.getClass()); + + IntVector index = ((IntVector) encoded); + assertEquals(6, index.getValueCount()); + assertEquals(0, index.get(0)); + assertEquals(0, index.get(1)); + assertEquals(0, index.get(2)); + assertEquals(1, index.get(3)); + assertEquals(1, index.get(4)); + assertEquals(0, index.get(5)); + + // now run through the decoder and verify we get the original back + try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) { + assertEquals(vector.getClass(), decoded.getClass()); + assertEquals(vector.getValueCount(), decoded.getValueCount()); + // compare every row (6 were written), not only the first 5 + for (int i = 0; i < vector.getValueCount(); i++) { + assertEquals(vector.getObject(i), decoded.getObject(i)); + } + } + } + } + } + + @Test + public void testEncodeStruct() { + // Create a new value vector + try (final StructVector vector = StructVector.empty("vector", allocator); + final StructVector dictionaryVector = StructVector.empty("dict", allocator);) { + vector.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + vector.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + dictionaryVector.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + dictionaryVector.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + + NullableStructWriter writer = vector.getWriter(); + writer.allocate(); + + writeStructVector(writer, 1, 10L); + writeStructVector(writer, 1, 10L); + writeStructVector(writer, 1, 10L); + writeStructVector(writer, 2, 20L); + writeStructVector(writer, 2, 20L); + writeStructVector(writer, 2, 20L); + writeStructVector(writer, 1, 10L); + + writer.setValueCount(7); + + NullableStructWriter dictWriter = dictionaryVector.getWriter(); + dictWriter.allocate(); + + writeStructVector(dictWriter, 1, 10L); + writeStructVector(dictWriter, 2, 20L); + + + dictionaryVector.setValueCount(2); + + Dictionary dictionary = new Dictionary(dictionaryVector, new
DictionaryEncoding(1L, false, null)); + + try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) { + // verify indices + assertEquals(IntVector.class, encoded.getClass()); + + IntVector index = ((IntVector) encoded); + assertEquals(7, index.getValueCount()); + assertEquals(0, index.get(0)); + assertEquals(0, index.get(1)); + assertEquals(0, index.get(2)); + assertEquals(1, index.get(3)); + assertEquals(1, index.get(4)); + assertEquals(1, index.get(5)); + assertEquals(0, index.get(6)); + + // now run through the decoder and verify we get the original back + try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) { + assertEquals(vector.getClass(), decoded.getClass()); + assertEquals(vector.getValueCount(), decoded.getValueCount()); + // compare every row (7 were written), not only the first 5 + for (int i = 0; i < vector.getValueCount(); i++) { + assertEquals(vector.getObject(i), decoded.getObject(i)); + } + } + } + } + } + + @Test + public void testEncodeBinaryVector() { + // Create a new value vector + try (final VarBinaryVector vector = newVarBinaryVector("foo", allocator); + final VarBinaryVector dictionaryVector = newVarBinaryVector("dict", allocator)) { + + setVector(vector, zero, one, one, two, zero); + setVector(dictionaryVector, zero, one, two); + + Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); + + try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) { + // verify indices + assertEquals(IntVector.class, encoded.getClass()); + + IntVector index = ((IntVector) encoded); + assertEquals(5, index.getValueCount()); + assertEquals(0, index.get(0)); + assertEquals(1, index.get(1)); + assertEquals(1, index.get(2)); + assertEquals(2, index.get(3)); + assertEquals(0, index.get(4)); + + // now run through the decoder and verify we get the original back + try (VarBinaryVector decoded = (VarBinaryVector) DictionaryEncoder.decode(encoded, dictionary)) { + assertEquals(vector.getClass(), decoded.getClass()); +
assertEquals(vector.getValueCount(), decoded.getValueCount()); + for (int i = 0; i < 5; i++) { + assertTrue(Arrays.equals(vector.getObject(i), decoded.getObject(i))); + } + } + } + } + } + + @Test + public void testEncodeUnion() { + // Create a new value vector + try (final UnionVector vector = new UnionVector("vector", allocator, /* field type */ null, /* call-back */ null); + final UnionVector dictionaryVector = + new UnionVector("dict", allocator, /* field type */ null, /* call-back */ null);) { + + final NullableUInt4Holder uintHolder1 = new NullableUInt4Holder(); + uintHolder1.value = 10; + uintHolder1.isSet = 1; + + final NullableIntHolder intHolder1 = new NullableIntHolder(); + intHolder1.value = 10; + intHolder1.isSet = 1; + + final NullableIntHolder intHolder2 = new NullableIntHolder(); + intHolder2.value = 20; + intHolder2.isSet = 1; + + //write data + vector.setType(0, Types.MinorType.UINT4); + vector.setSafe(0, uintHolder1); + + vector.setType(1, Types.MinorType.INT); + vector.setSafe(1, intHolder1); + + vector.setType(2, Types.MinorType.INT); + vector.setSafe(2, intHolder1); + + vector.setType(3, Types.MinorType.INT); + vector.setSafe(3, intHolder2); + + vector.setType(4, Types.MinorType.INT); + vector.setSafe(4, intHolder2); + + vector.setValueCount(5); + + //write dictionary + dictionaryVector.setType(0, Types.MinorType.UINT4); + dictionaryVector.setSafe(0, uintHolder1); + + dictionaryVector.setType(1, Types.MinorType.INT); + dictionaryVector.setSafe(1, intHolder1); + + dictionaryVector.setType(2, Types.MinorType.INT); + dictionaryVector.setSafe(2, intHolder2); + + dictionaryVector.setValueCount(3); + + Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); + + try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) { + // verify indices + assertEquals(IntVector.class, encoded.getClass()); + + IntVector index = ((IntVector) encoded); + assertEquals(5, index.getValueCount()); + 
assertEquals(0, index.get(0)); + assertEquals(1, index.get(1)); + assertEquals(1, index.get(2)); + assertEquals(2, index.get(3)); + assertEquals(2, index.get(4)); + + // now run through the decoder and verify we get the original back + try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) { + assertEquals(vector.getClass(), decoded.getClass()); + assertEquals(vector.getValueCount(), decoded.getValueCount()); + for (int i = 0; i < 5; i++) { + assertEquals(vector.getObject(i), decoded.getObject(i)); + } + } + } + } + } + + @Test + public void testIntEquals() { + //test Int + try (final IntVector vector1 = new IntVector("int", allocator); + final IntVector vector2 = new IntVector("int", allocator)) { + + Dictionary dict1 = new Dictionary(vector1, new DictionaryEncoding(1L, false, null)); + Dictionary dict2 = new Dictionary(vector2, new DictionaryEncoding(1L, false, null)); + + setVector(vector1, 1, 2, 3); + setVector(vector2, 1, 2, 0); + + assertFalse(dict1.equals(dict2)); + + vector2.setSafe(2, 3); + assertTrue(dict1.equals(dict2)); + } + } + + @Test + public void testVarcharEquals() { + try (final VarCharVector vector1 = new VarCharVector("varchar", allocator); + final VarCharVector vector2 = new VarCharVector("varchar", allocator)) { + + Dictionary dict1 = new Dictionary(vector1, new DictionaryEncoding(1L, false, null)); + Dictionary dict2 = new Dictionary(vector2, new DictionaryEncoding(1L, false, null)); + + setVector(vector1, zero, one, two); + setVector(vector2, zero, one, one); + + assertFalse(dict1.equals(dict2)); + + vector2.setSafe(2, two, 0, two.length); + assertTrue(dict1.equals(dict2)); + } + } + + @Test + public void testVarBinaryEquals() { + try (final VarBinaryVector vector1 = new VarBinaryVector("binary", allocator); + final VarBinaryVector vector2 = new VarBinaryVector("binary", allocator)) { + + Dictionary dict1 = new Dictionary(vector1, new DictionaryEncoding(1L, false, null)); + Dictionary dict2 = new Dictionary(vector2, new 
DictionaryEncoding(1L, false, null)); + + setVector(vector1, zero, one, two); + setVector(vector2, zero, one, one); + + assertFalse(dict1.equals(dict2)); + + vector2.setSafe(2, two, 0, two.length); + assertTrue(dict1.equals(dict2)); + } + } + + @Test + public void testListEquals() { + try (final ListVector vector1 = ListVector.empty("list", allocator); + final ListVector vector2 = ListVector.empty("list", allocator);) { + + Dictionary dict1 = new Dictionary(vector1, new DictionaryEncoding(1L, false, null)); + Dictionary dict2 = new Dictionary(vector2, new DictionaryEncoding(1L, false, null)); + + UnionListWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + //set some values + writeListVector(writer1, new int[] {1, 2}); + writeListVector(writer1, new int[] {3, 4}); + writeListVector(writer1, new int[] {5, 6}); + writer1.setValueCount(3); + + UnionListWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + //set some values + writeListVector(writer2, new int[] {1, 2}); + writeListVector(writer2, new int[] {3, 4}); + writeListVector(writer2, new int[] {5, 6}); + writer2.setValueCount(3); + + assertTrue(dict1.equals(dict2)); + } + } + + @Test + public void testStructEquals() { + try (final StructVector vector1 = StructVector.empty("struct", allocator); + final StructVector vector2 = StructVector.empty("struct", allocator);) { + vector1.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + vector1.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + vector2.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + vector2.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + + Dictionary dict1 = new Dictionary(vector1, new DictionaryEncoding(1L, false, null)); + Dictionary dict2 = new Dictionary(vector2, new DictionaryEncoding(1L, false, null)); + + NullableStructWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + 
writeStructVector(writer1, 1, 10L); + writeStructVector(writer1, 2, 20L); + writer1.setValueCount(2); + + NullableStructWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + writeStructVector(writer2, 1, 10L); + writeStructVector(writer2, 2, 20L); + writer2.setValueCount(2); + + assertTrue(dict1.equals(dict2)); + } + } + + @Test + public void testUnionEquals() { + try (final UnionVector vector1 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); + final UnionVector vector2 = + new UnionVector("union", allocator, /* field type */ null, /* call-back */ null);) { + + final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder(); + uInt4Holder.value = 10; + uInt4Holder.isSet = 1; + + // populate the INT holder itself; assigning uInt4Holder again left intHolder unset + final NullableIntHolder intHolder = new NullableIntHolder(); + intHolder.value = 20; + intHolder.isSet = 1; + + vector1.setType(0, Types.MinorType.UINT4); + vector1.setSafe(0, uInt4Holder); + + vector1.setType(2, Types.MinorType.INT); + vector1.setSafe(2, intHolder); + vector1.setValueCount(3); + + vector2.setType(0, Types.MinorType.UINT4); + vector2.setSafe(0, uInt4Holder); + + vector2.setType(2, Types.MinorType.INT); + vector2.setSafe(2, intHolder); + vector2.setValueCount(3); + + Dictionary dict1 = new Dictionary(vector1, new DictionaryEncoding(1L, false, null)); + Dictionary dict2 = new Dictionary(vector2, new DictionaryEncoding(1L, false, null)); + + assertTrue(dict1.equals(dict2)); + } + } + + @Test + public void testEncodeWithEncoderInstance() { + // Create a new value vector + try (final VarCharVector vector = newVarCharVector("vector", allocator); + final VarCharVector dictionaryVector = newVarCharVector("dict", allocator);) { + + setVector(vector, zero, one, one, two, zero); + setVector(dictionaryVector, zero, one, two); + + Dictionary dictionary = + new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); + DictionaryEncoder encoder = new DictionaryEncoder(dictionary, allocator); + + try (final ValueVector
encoded = encoder.encode(vector)) { + // verify indices + assertEquals(IntVector.class, encoded.getClass()); + + IntVector index = ((IntVector) encoded); + assertEquals(5, index.getValueCount()); + assertEquals(0, index.get(0)); + assertEquals(1, index.get(1)); + assertEquals(1, index.get(2)); + assertEquals(2, index.get(3)); + assertEquals(0, index.get(4)); + + // now run through the decoder and verify we get the original back + try (ValueVector decoded = encoder.decode(encoded)) { + assertEquals(vector.getClass(), decoded.getClass()); + assertEquals(vector.getValueCount(), (decoded).getValueCount()); + for (int i = 0; i < 5; i++) { + assertEquals(vector.getObject(i), ((VarCharVector) decoded).getObject(i)); + } + } + } + } + } + + @Test + public void testEncodeMultiVectors() { + // Create a new value vector + try (final VarCharVector vector1 = newVarCharVector("vector1", allocator); + final VarCharVector vector2 = newVarCharVector("vector2", allocator); + final VarCharVector dictionaryVector = newVarCharVector("dict", allocator);) { + + setVector(vector1, zero, one, one, two, zero); + setVector(vector2, zero, one, one); + setVector(dictionaryVector, zero, one, two); + + Dictionary dictionary = + new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); + DictionaryEncoder encoder = new DictionaryEncoder(dictionary, allocator); + + try (final ValueVector encoded = encoder.encode(vector1)) { + // verify indices + assertEquals(IntVector.class, encoded.getClass()); + + IntVector index = ((IntVector) encoded); + assertEquals(5, index.getValueCount()); + assertEquals(0, index.get(0)); + assertEquals(1, index.get(1)); + assertEquals(1, index.get(2)); + assertEquals(2, index.get(3)); + assertEquals(0, index.get(4)); + + // now run through the decoder and verify we get the original back + try (ValueVector decoded = encoder.decode(encoded)) { + assertEquals(vector1.getClass(), decoded.getClass()); + assertEquals(vector1.getValueCount(), 
(decoded).getValueCount()); + for (int i = 0; i < 5; i++) { + assertEquals(vector1.getObject(i), ((VarCharVector) decoded).getObject(i)); + } + } + } + + try (final ValueVector encoded = encoder.encode(vector2)) { + // verify indices + assertEquals(IntVector.class, encoded.getClass()); + + IntVector index = ((IntVector) encoded); + assertEquals(3, index.getValueCount()); + assertEquals(0, index.get(0)); + assertEquals(1, index.get(1)); + assertEquals(1, index.get(2)); + + // now run through the decoder and verify we get the original back + try (ValueVector decoded = encoder.decode(encoded)) { + assertEquals(vector2.getClass(), decoded.getClass()); + assertEquals(vector2.getValueCount(), (decoded).getValueCount()); + for (int i = 0; i < 3; i++) { + assertEquals(vector2.getObject(i), ((VarCharVector) decoded).getObject(i)); + } + } + } + } + } + + @Test + public void testEncodeListSubField() { + // Create a new value vector + try (final ListVector vector = ListVector.empty("vector", allocator); + final ListVector dictionaryVector = ListVector.empty("dict", allocator);) { + + UnionListWriter writer = vector.getWriter(); + writer.allocate(); + + //set some values + writeListVector(writer, new int[]{10, 20}); + writeListVector(writer, new int[]{10, 20}); + writeListVector(writer, new int[]{10, 20}); + writeListVector(writer, new int[]{30, 40, 50}); + writeListVector(writer, new int[]{30, 40, 50}); + writeListVector(writer, new int[]{10, 20}); + writer.setValueCount(6); + + UnionListWriter dictWriter = dictionaryVector.getWriter(); + dictWriter.allocate(); + writeListVector(dictWriter, new int[]{10, 20, 30, 40, 50}); + dictionaryVector.setValueCount(1); + + Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); + ListSubfieldEncoder encoder = new ListSubfieldEncoder(dictionary, allocator); + + try (final ListVector encoded = (ListVector) encoder.encodeListSubField(vector)) { + // verify indices + assertEquals(ListVector.class, 
encoded.getClass()); + + assertEquals(6, encoded.getValueCount()); + int[] realValue1 = convertListToIntArray(encoded.getObject(0)); + assertTrue(Arrays.equals(new int[] {0, 1}, realValue1)); + int[] realValue2 = convertListToIntArray(encoded.getObject(1)); + assertTrue(Arrays.equals(new int[] {0, 1}, realValue2)); + int[] realValue3 = convertListToIntArray(encoded.getObject(2)); + assertTrue(Arrays.equals(new int[] {0, 1}, realValue3)); + int[] realValue4 = convertListToIntArray(encoded.getObject(3)); + assertTrue(Arrays.equals(new int[] {2, 3, 4}, realValue4)); + int[] realValue5 = convertListToIntArray(encoded.getObject(4)); + assertTrue(Arrays.equals(new int[] {2, 3, 4}, realValue5)); + int[] realValue6 = convertListToIntArray(encoded.getObject(5)); + assertTrue(Arrays.equals(new int[] {0, 1}, realValue6)); + + // now run through the decoder and verify we get the original back + try (ValueVector decoded = encoder.decodeListSubField(encoded)) { + assertEquals(vector.getClass(), decoded.getClass()); + assertEquals(vector.getValueCount(), decoded.getValueCount()); + // compare every row (6 were written), not only the first 5 + for (int i = 0; i < vector.getValueCount(); i++) { + assertEquals(vector.getObject(i), decoded.getObject(i)); + } + } + } + } + } + + @Test + public void testEncodeFixedSizeListSubField() { + // Create a new value vector + try (final FixedSizeListVector vector = FixedSizeListVector.empty("vector", 2, allocator); + final FixedSizeListVector dictionaryVector = FixedSizeListVector.empty("dict", 2, allocator)) { + + vector.allocateNew(); + vector.setValueCount(4); + + IntVector dataVector = + (IntVector) vector.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())).getVector(); + dataVector.allocateNew(8); + dataVector.setValueCount(8); + // set value at index 0 + vector.setNotNull(0); + dataVector.set(0, 10); + dataVector.set(1, 20); + // set value at index 1 + vector.setNotNull(1); + dataVector.set(2, 10); + dataVector.set(3, 20); + // set value at index 2 + vector.setNotNull(2); + dataVector.set(4, 30); +
dataVector.set(5, 40); + // set value at index 3 + vector.setNotNull(3); + dataVector.set(6, 10); + dataVector.set(7, 20); + + dictionaryVector.allocateNew(); + dictionaryVector.setValueCount(2); + IntVector dictDataVector = + (IntVector) dictionaryVector.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())).getVector(); + dictDataVector.allocateNew(4); + dictDataVector.setValueCount(4); + + dictionaryVector.setNotNull(0); + dictDataVector.set(0, 10); + dictDataVector.set(1, 20); + dictionaryVector.setNotNull(1); + dictDataVector.set(2, 30); + dictDataVector.set(3, 40); + + Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); + ListSubfieldEncoder encoder = new ListSubfieldEncoder(dictionary, allocator); + + try (final FixedSizeListVector encoded = + (FixedSizeListVector) encoder.encodeListSubField(vector)) { + // verify indices + assertEquals(FixedSizeListVector.class, encoded.getClass()); + + assertEquals(4, encoded.getValueCount()); + int[] realValue1 = convertListToIntArray(encoded.getObject(0)); + assertTrue(Arrays.equals(new int[] {0, 1}, realValue1)); + int[] realValue2 = convertListToIntArray(encoded.getObject(1)); + assertTrue(Arrays.equals(new int[] {0, 1}, realValue2)); + int[] realValue3 = convertListToIntArray(encoded.getObject(2)); + assertTrue(Arrays.equals(new int[] {2, 3}, realValue3)); + int[] realValue4 = convertListToIntArray(encoded.getObject(3)); + assertTrue(Arrays.equals(new int[] {0, 1}, realValue4)); + + // now run through the decoder and verify we get the original back + try (ValueVector decoded = encoder.decodeListSubField(encoded)) { + assertEquals(vector.getClass(), decoded.getClass()); + assertEquals(vector.getValueCount(), decoded.getValueCount()); + // the vector holds 4 rows; bound the loop by the value count instead of reading index 4 + for (int i = 0; i < vector.getValueCount(); i++) { + assertEquals(vector.getObject(i), decoded.getObject(i)); + } + } + } + } + } + + @Test + public void testEncodeStructSubField() { + try (final StructVector vector = StructVector.empty("vector",
allocator); + final VarCharVector dictVector1 = new VarCharVector("f0", allocator); + final VarCharVector dictVector2 = new VarCharVector("f1", allocator)) { + + vector.addOrGet("f0", FieldType.nullable(ArrowType.Utf8.INSTANCE), VarCharVector.class); + vector.addOrGet("f1", FieldType.nullable(ArrowType.Utf8.INSTANCE), VarCharVector.class); + + NullableStructWriter writer = vector.getWriter(); + writer.allocate(); + //set some values + writeStructVector(writer, "aa", "baz"); + writeStructVector(writer, "bb", "bar"); + writeStructVector(writer, "cc", "foo"); + writeStructVector(writer, "aa", "foo"); + writeStructVector(writer, "dd", "foo"); + writer.setValueCount(5); + + // initialize dictionaries + DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); + + + setVector(dictVector1, + "aa".getBytes(StandardCharsets.UTF_8), + "bb".getBytes(StandardCharsets.UTF_8), + "cc".getBytes(StandardCharsets.UTF_8), + "dd".getBytes(StandardCharsets.UTF_8)); + setVector(dictVector2, + "foo".getBytes(StandardCharsets.UTF_8), + "baz".getBytes(StandardCharsets.UTF_8), + "bar".getBytes(StandardCharsets.UTF_8)); + + provider.put(new Dictionary(dictVector1, new DictionaryEncoding(1L, false, null))); + provider.put(new Dictionary(dictVector2, new DictionaryEncoding(2L, false, null))); + + StructSubfieldEncoder encoder = new StructSubfieldEncoder(allocator, provider); + // child column index -> id of the dictionary used to encode that column + Map<Integer, Long> columnToDictionaryId = new HashMap<>(); + columnToDictionaryId.put(0, 1L); + columnToDictionaryId.put(1, 2L); + + try (final StructVector encoded = (StructVector) encoder.encode(vector, columnToDictionaryId)) { + // verify indices + assertEquals(StructVector.class, encoded.getClass()); + + assertEquals(5, encoded.getValueCount()); + Object[] realValue1 = convertMapValuesToArray(encoded.getObject(0)); + assertTrue(Arrays.equals(new Object[] {0, 1}, realValue1)); + Object[] realValue2 = convertMapValuesToArray(encoded.getObject(1)); + assertTrue(Arrays.equals(new Object[] {1,
2}, realValue2)); + Object[] realValue3 = convertMapValuesToArray(encoded.getObject(2)); + assertTrue(Arrays.equals(new Object[] {2, 0}, realValue3)); + Object[] realValue4 = convertMapValuesToArray(encoded.getObject(3)); + assertTrue(Arrays.equals(new Object[] {0, 0}, realValue4)); + Object[] realValue5 = convertMapValuesToArray(encoded.getObject(4)); + assertTrue(Arrays.equals(new Object[] {3, 0}, realValue5)); + + // now run through the decoder and verify we get the original back + try (ValueVector decoded = encoder.decode(encoded)) { + assertEquals(vector.getClass(), decoded.getClass()); + assertEquals(vector.getValueCount(), decoded.getValueCount()); + for (int i = 0; i < 5; i++) { + assertEquals(vector.getObject(i), decoded.getObject(i)); + } + } + } + } + } + + @Test + public void testEncodeStructSubFieldWithCertainColumns() { + // in this case, some child vector is encoded and others are not + try (final StructVector vector = StructVector.empty("vector", allocator); + final VarCharVector dictVector1 = new VarCharVector("f0", allocator)) { + + vector.addOrGet("f0", FieldType.nullable(ArrowType.Utf8.INSTANCE), VarCharVector.class); + vector.addOrGet("f1", FieldType.nullable(ArrowType.Utf8.INSTANCE), VarCharVector.class); + + NullableStructWriter writer = vector.getWriter(); + writer.allocate(); + //set some values + writeStructVector(writer, "aa", "baz"); + writeStructVector(writer, "bb", "bar"); + writeStructVector(writer, "cc", "foo"); + writeStructVector(writer, "aa", "foo"); + writeStructVector(writer, "dd", "foo"); + writer.setValueCount(5); + + // initialize dictionaries + DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); + + // explicit charset so the dictionary bytes do not depend on the platform default + setVector(dictVector1, + "aa".getBytes(StandardCharsets.UTF_8), + "bb".getBytes(StandardCharsets.UTF_8), + "cc".getBytes(StandardCharsets.UTF_8), + "dd".getBytes(StandardCharsets.UTF_8)); + + provider.put(new Dictionary(dictVector1, new DictionaryEncoding(1L, false, null))); + StructSubfieldEncoder encoder = new StructSubfieldEncoder(allocator, provider); + Map<Integer, Long>
columnToDictionaryId = new HashMap<>(); + columnToDictionaryId.put(0, 1L); + + try (final StructVector encoded = (StructVector) encoder.encode(vector, columnToDictionaryId)) { + // verify indices + assertEquals(StructVector.class, encoded.getClass()); + + assertEquals(5, encoded.getValueCount()); + Object[] realValue1 = convertMapValuesToArray(encoded.getObject(0)); + assertTrue(Arrays.equals(new Object[] {0, new Text("baz")}, realValue1)); + Object[] realValue2 = convertMapValuesToArray(encoded.getObject(1)); + assertTrue(Arrays.equals(new Object[] {1, new Text("bar")}, realValue2)); + Object[] realValue3 = convertMapValuesToArray(encoded.getObject(2)); + assertTrue(Arrays.equals(new Object[] {2, new Text("foo")}, realValue3)); + Object[] realValue4 = convertMapValuesToArray(encoded.getObject(3)); + assertTrue(Arrays.equals(new Object[] {0, new Text("foo")}, realValue4)); + Object[] realValue5 = convertMapValuesToArray(encoded.getObject(4)); + assertTrue(Arrays.equals(new Object[] {3, new Text("foo")}, realValue5)); + + // now run through the decoder and verify we get the original back + try (ValueVector decoded = encoder.decode(encoded)) { + assertEquals(vector.getClass(), decoded.getClass()); + assertEquals(vector.getValueCount(), decoded.getValueCount()); + for (int i = 0; i < 5; i++) { + assertEquals(vector.getObject(i), decoded.getObject(i)); + } + } + } + + } + } + + // Encodes "1","3","5","7","9" against the given dictionary, checks the indices read + // through valGetter (encoded vector, row) and verifies the decode round-trip. + private void testDictionary(Dictionary dictionary, ToIntBiFunction<ValueVector, Integer> valGetter) { + try (VarCharVector vector = new VarCharVector("vector", allocator)) { + setVector(vector, "1", "3", "5", "7", "9"); + try (ValueVector encodedVector = DictionaryEncoder.encode(vector, dictionary)) { + + // verify encoded result + assertEquals(vector.getValueCount(), encodedVector.getValueCount()); + assertEquals(1, valGetter.applyAsInt(encodedVector, 0)); + assertEquals(3, valGetter.applyAsInt(encodedVector, 1)); + assertEquals(5, valGetter.applyAsInt(encodedVector, 2)); +
assertEquals(valGetter.applyAsInt(encodedVector, 3), 7); + assertEquals(valGetter.applyAsInt(encodedVector, 4), 9); + + try (ValueVector decodedVector = DictionaryEncoder.decode(encodedVector, dictionary)) { + assertTrue(decodedVector instanceof VarCharVector); + assertEquals(vector.getValueCount(), decodedVector.getValueCount()); + assertArrayEquals("1".getBytes(), ((VarCharVector) decodedVector).get(0)); + assertArrayEquals("3".getBytes(), ((VarCharVector) decodedVector).get(1)); + assertArrayEquals("5".getBytes(), ((VarCharVector) decodedVector).get(2)); + assertArrayEquals("7".getBytes(), ((VarCharVector) decodedVector).get(3)); + assertArrayEquals("9".getBytes(), ((VarCharVector) decodedVector).get(4)); + } + } + } + } + + @Test + public void testDictionaryUInt1() { + try (VarCharVector dictionaryVector = new VarCharVector("dict vector", allocator)) { + setVector(dictionaryVector, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"); + Dictionary dictionary1 = new Dictionary(dictionaryVector, + new DictionaryEncoding(/*id=*/10L, /*ordered=*/false, + /*indexType=*/new ArrowType.Int(/*bitWidth*/8, /*isSigned*/false))); + testDictionary(dictionary1, (vector, index) -> ((UInt1Vector) vector).get(index)); + } + } + + @Test + public void testDictionaryUInt2() { + try (VarCharVector dictionaryVector = new VarCharVector("dict vector", allocator)) { + setVector(dictionaryVector, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"); + Dictionary dictionary2 = new Dictionary(dictionaryVector, + new DictionaryEncoding(/*id=*/20L, /*ordered=*/false, + /*indexType=*/new ArrowType.Int(/*indexType=*/16, /*isSigned*/false))); + testDictionary(dictionary2, (vector, index) -> ((UInt2Vector) vector).get(index)); + } + } + + @Test + public void testDictionaryUInt4() { + try (VarCharVector dictionaryVector = new VarCharVector("dict vector", allocator)) { + setVector(dictionaryVector, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"); + Dictionary dictionary4 = new 
Dictionary(dictionaryVector, + new DictionaryEncoding(/*id=*/30L, /*ordered=*/false, + /*indexType=*/new ArrowType.Int(/*bitWidth*/32, /*isSigned*/false))); + testDictionary(dictionary4, (vector, index) -> ((UInt4Vector) vector).get(index)); + } + } + + /** Runs the shared dictionary check with an unsigned 64-bit index type; the long index is narrowed to int for the comparison. */ @Test + public void testDictionaryUInt8() { + try (VarCharVector dictionaryVector = new VarCharVector("dict vector", allocator)) { + setVector(dictionaryVector, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"); + Dictionary dictionary8 = new Dictionary(dictionaryVector, + new DictionaryEncoding(/*id=*/40L, /*ordered=*/false, + /*indexType=*/new ArrowType.Int(/*bitWidth*/64, /*isSigned*/false))); + testDictionary(dictionary8, (vector, index) -> (int) ((UInt8Vector) vector).get(index)); + } + } + + /** Uses a 256-entry dictionary with an unsigned 8-bit index and checks that the last entry encodes to 255 and decodes back to "255". */ @Test + public void testDictionaryUIntOverflow() { + // the size is within the range of UInt1, but outside the range of TinyInt. + final int vecLength = 256; + try (VarCharVector dictionaryVector = new VarCharVector("dict vector", allocator)) { + dictionaryVector.allocateNew(vecLength * 3, vecLength); + for (int i = 0; i < vecLength; i++) { + dictionaryVector.set(i, String.valueOf(i).getBytes(StandardCharsets.UTF_8)); + } + dictionaryVector.setValueCount(vecLength); + + Dictionary dictionary = new Dictionary(dictionaryVector, + new DictionaryEncoding(/*id=*/10L, /*ordered=*/false, + /*indexType=*/new ArrowType.Int(/*bitWidth*/8, /*isSigned*/false))); + + try (VarCharVector vector = new VarCharVector("vector", allocator)) { + setVector(vector, "255"); + try (UInt1Vector encodedVector = (UInt1Vector) DictionaryEncoder.encode(vector, dictionary)) { + + // verify encoded result + assertEquals(1, encodedVector.getValueCount()); + assertEquals(255, encodedVector.getValueAsLong(0)); + + try (VarCharVector decodedVector = (VarCharVector) DictionaryEncoder.decode(encodedVector, dictionary)) { + assertEquals(1, decodedVector.getValueCount()); + assertArrayEquals("255".getBytes(StandardCharsets.UTF_8), decodedVector.get(0)); + } + } + } + } + } + + private int[] 
convertListToIntArray(List list) { + int[] values = new int[list.size()]; + for (int i = 0; i < list.size(); i++) { + values[i] = (int) list.get(i); + } + return values; + } + + /** Copies the map's values, in iteration order, into an Object array. */ private Object[] convertMapValuesToArray(Map map) { + Object[] values = new Object[map.size()]; + Iterator valueIterator = map.values().iterator(); + for (int i = 0; i < map.size(); i++) { + values[i] = valueIterator.next(); + } + return values; + } + + /** Writes one struct row with VarChar children "f0" = value1 and "f1" = value2 (UTF-8), staging both through one scratch buffer sized for the longer value; the buffer is released even if a write throws. */ private void writeStructVector(NullableStructWriter writer, String value1, String value2) { + + byte[] bytes1 = value1.getBytes(StandardCharsets.UTF_8); + byte[] bytes2 = value2.getBytes(StandardCharsets.UTF_8); + + try (ArrowBuf temp = allocator.buffer(Math.max(bytes1.length, bytes2.length))) { + writer.start(); + temp.setBytes(0, bytes1); + writer.varChar("f0").writeVarChar(0, bytes1.length, temp); + temp.setBytes(0, bytes2); + writer.varChar("f1").writeVarChar(0, bytes2.length, temp); + writer.end(); + } + } + + /** Writes one struct row with an int child "f0" and a bigint child "f1". */ private void writeStructVector(NullableStructWriter writer, int value1, long value2) { + writer.start(); + writer.integer("f0").writeInt(value1); + writer.bigInt("f1").writeBigInt(value2); + writer.end(); + } + + /** Writes values as a single list entry of ints. */ private void writeListVector(UnionListWriter writer, int[] values) { + writer.startList(); + for (int v: values) { + writer.integer().writeInt(v); + } + writer.endList(); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDurationVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDurationVector.java new file mode 100644 index 000000000..8ae876f20 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDurationVector.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +import java.time.Duration; + +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.holders.NullableDurationHolder; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** Basic set/get checks for DurationVector, one test per time unit (second, millisecond, microsecond, nanosecond). */ public class TestDurationVector { + RootAllocator allocator; + + /** Creates a fresh allocator before each test. */ @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + /** Closes the allocator after each test. */ @After + public void terminate() { + allocator.close(); + } + + /** SECOND unit: index 0 is null; raw value 1000 at index 1 reads back as Duration.ofSeconds(1000), rendered "PT16M40S". */ @Test + public void testSecBasics() { + try (DurationVector secVector = TestUtils.newVector(DurationVector.class, "second", + new ArrowType.Duration(TimeUnit.SECOND), allocator)) { + + secVector.allocateNew(); + secVector.setNull(0); + secVector.setSafe(1, 1000); + secVector.setValueCount(2); + assertNull(secVector.getObject(0)); + assertEquals(Duration.ofSeconds(1000), secVector.getObject(1)); + assertNull(secVector.getAsStringBuilder(0)); + assertEquals("PT16M40S", secVector.getAsStringBuilder(1).toString()); + // Holder: isSet is 0 for the null slot and 1 for the written slot + NullableDurationHolder holder = new NullableDurationHolder(); + secVector.get(0, holder); + assertEquals(0, holder.isSet); + secVector.get(1, holder); + assertEquals(1 , holder.isSet); + 
assertEquals(1000 , holder.value); + } + } + + /** MILLISECOND unit: raw value 1000 reads back as one second ("PT1S"); the holder still exposes the raw 1000. */ @Test + public void testMilliBasics() { + try (DurationVector milliVector = TestUtils.newVector(DurationVector.class, "milli", + new ArrowType.Duration(TimeUnit.MILLISECOND), allocator)) { + + milliVector.allocateNew(); + milliVector.setNull(0); + milliVector.setSafe(1, 1000); + milliVector.setValueCount(2); + assertNull(milliVector.getObject(0)); + assertEquals(Duration.ofSeconds(1), milliVector.getObject(1)); + assertNull(milliVector.getAsStringBuilder(0)); + assertEquals("PT1S", milliVector.getAsStringBuilder(1).toString()); + // Holder + NullableDurationHolder holder = new NullableDurationHolder(); + milliVector.get(0, holder); + assertEquals(0, holder.isSet); + milliVector.get(1, holder); + assertEquals(1 , holder.isSet); + assertEquals(1000 , holder.value); + } + } + + /** MICROSECOND unit: raw value 1000 reads back as one millisecond ("PT0.001S"); the holder still exposes the raw 1000. */ @Test + public void testMicroBasics() { + try (DurationVector microVector = TestUtils.newVector(DurationVector.class, "micro", + new ArrowType.Duration(TimeUnit.MICROSECOND), allocator)) { + + microVector.allocateNew(); + microVector.setNull(0); + microVector.setSafe(1, 1000); + microVector.setValueCount(2); + assertNull(microVector.getObject(0)); + assertEquals(Duration.ofMillis(1), microVector.getObject(1)); + assertNull(microVector.getAsStringBuilder(0)); + assertEquals("PT0.001S", microVector.getAsStringBuilder(1).toString()); + // Holder + NullableDurationHolder holder = new NullableDurationHolder(); + microVector.get(0, holder); + assertEquals(0, holder.isSet); + microVector.get(1, holder); + assertEquals(1 , holder.isSet); + assertEquals(1000 , holder.value); + } + } + + /** NANOSECOND unit: raw value 1000000 reads back as one millisecond. */ @Test + public void testNanosBasics() { + try (DurationVector nanoVector = TestUtils.newVector(DurationVector.class, "nanos", + new ArrowType.Duration(TimeUnit.NANOSECOND), allocator)) { + + nanoVector.allocateNew(); + nanoVector.setNull(0); + nanoVector.setSafe(1, 1000000); + nanoVector.setValueCount(2); + assertNull(nanoVector.getObject(0)); + 
assertEquals(Duration.ofMillis(1), nanoVector.getObject(1)); + assertNull(nanoVector.getAsStringBuilder(0)); + assertEquals("PT0.001S", nanoVector.getAsStringBuilder(1).toString()); + // Holder: isSet is 0 for the null slot and 1 for the written slot, and value is the raw number given to setSafe + NullableDurationHolder holder = new NullableDurationHolder(); + nanoVector.get(0, holder); + assertEquals(0, holder.isSet); + nanoVector.get(1, holder); + assertEquals(1 , holder.isSet); + assertEquals(1000000 , holder.value); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java new file mode 100644 index 000000000..363821e98 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.holders.FixedSizeBinaryHolder; +import org.apache.arrow.vector.holders.NullableFixedSizeBinaryHolder; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** Tests FixedSizeBinaryVector set/setSafe/get using byte arrays, holders and buffers, including inputs shorter and longer than the vector's width. */ public class TestFixedSizeBinaryVector { + private static final int numValues = 123; + private static final int typeWidth = 9; + private static final int smallDataSize = 6; /* shorter than typeWidth */ + private static final int largeDataSize = 12; /* longer than typeWidth */ + + /** Row i is typeWidth copies of (byte) i. */ private static byte[][] values; + + static { + values = new byte[numValues][typeWidth]; + for (int i = 0; i < numValues; i++) { + for (int j = 0; j < typeWidth; j++) { + values[i][j] = ((byte) i); + } + } + } + + private ArrowBuf[] bufs = new ArrowBuf[numValues]; + private FixedSizeBinaryHolder[] holders = new FixedSizeBinaryHolder[numValues]; + private NullableFixedSizeBinaryHolder[] nullableHolders = new NullableFixedSizeBinaryHolder[numValues]; + + /** Bytes 0..smallDataSize-1. */ private static byte[] smallValue; + + static { + smallValue = new byte[smallDataSize]; + for (int i = 0; i < smallDataSize; i++) { + smallValue[i] = ((byte) i); + } + } + + private ArrowBuf smallBuf; + private FixedSizeBinaryHolder smallHolder; + private NullableFixedSizeBinaryHolder smallNullableHolder; + + /** Bytes 0..largeDataSize-1. */ private static byte[] largeValue; + + static { + largeValue = new byte[largeDataSize]; + for (int i = 0; i < largeDataSize; i++) { + largeValue[i] = ((byte) i); + } + } + + private ArrowBuf largeBuf; + private FixedSizeBinaryHolder largeHolder; + private NullableFixedSizeBinaryHolder largeNullableHolder; + + private BufferAllocator allocator; + private FixedSizeBinaryVector vector; + + /** Always throws a plain Exception carrying message; the invalid-input tests call it right after a set call they expect to raise AssertionError. */ private static void failWithException(String message) throws Exception { + throw new Exception(message); + } + + + @Before + public void init() throws Exception { + allocator 
= new DirtyRootAllocator(Integer.MAX_VALUE, (byte) 100); + vector = new FixedSizeBinaryVector("fixedSizeBinary", allocator, typeWidth); + vector.allocateNew(); + + /* one buffer per value, shared by a plain holder and a nullable holder that both declare typeWidth */ for (int i = 0; i < numValues; i++) { + bufs[i] = allocator.buffer(typeWidth); + bufs[i].setBytes(0, values[i]); + + holders[i] = new FixedSizeBinaryHolder(); + holders[i].byteWidth = typeWidth; + holders[i].buffer = bufs[i]; + + nullableHolders[i] = new NullableFixedSizeBinaryHolder(); + nullableHolders[i].byteWidth = typeWidth; + nullableHolders[i].buffer = bufs[i]; + nullableHolders[i].isSet = 1; + } + + /* the small holders declare byteWidth = smallDataSize; NOTE(review): isSet is never assigned on the small/large nullable holders - confirm the default is what the invalid-input tests intend */ smallBuf = allocator.buffer(smallDataSize); + smallBuf.setBytes(0, smallValue); + + smallHolder = new FixedSizeBinaryHolder(); + smallHolder.byteWidth = smallDataSize; + smallHolder.buffer = smallBuf; + + smallNullableHolder = new NullableFixedSizeBinaryHolder(); + smallNullableHolder.byteWidth = smallDataSize; + smallNullableHolder.buffer = smallBuf; + + /* the large holders declare typeWidth although their buffer holds largeDataSize bytes */ largeBuf = allocator.buffer(largeDataSize); + largeBuf.setBytes(0, largeValue); + + largeHolder = new FixedSizeBinaryHolder(); + largeHolder.byteWidth = typeWidth; + largeHolder.buffer = largeBuf; + + largeNullableHolder = new NullableFixedSizeBinaryHolder(); + largeNullableHolder.byteWidth = typeWidth; + largeNullableHolder.buffer = largeBuf; + } + + /** Closes every buffer created in init, then the vector, then the allocator. */ @After + public void terminate() throws Exception { + for (int i = 0; i < numValues; i++) { + bufs[i].close(); + } + smallBuf.close(); + largeBuf.close(); + + vector.close(); + allocator.close(); + } + + /** Sets every slot from a byte array and reads the same bytes back with getObject. */ @Test + public void testSetUsingByteArray() { + for (int i = 0; i < numValues; i++) { + vector.set(i, values[i]); + } + vector.setValueCount(numValues); + for (int i = 0; i < numValues; i++) { + assertArrayEquals(values[i], vector.getObject(i)); + } + } + + /** Setting a null byte array must throw NullPointerException with the message asserted below. */ @Test + public void testSetUsingNull() { + final byte[] value = null; + for (int i = 0; i < numValues; i++) { + final int index = i; + Exception e = assertThrows(NullPointerException.class, () -> { + vector.set(index, value); + }); + 
assertEquals("expecting a valid byte array", e.getMessage()); + } + } + + /** Sets every slot from a non-nullable holder and reads the same bytes back. */ @Test + public void testSetUsingHolder() { + for (int i = 0; i < numValues; i++) { + vector.set(i, holders[i]); + } + vector.setValueCount(numValues); + for (int i = 0; i < numValues; i++) { + assertArrayEquals(values[i], vector.getObject(i)); + } + } + + /** Sets every slot from a nullable holder (isSet = 1) and reads the same bytes back. */ @Test + public void testSetUsingNullableHolder() { + for (int i = 0; i < numValues; i++) { + vector.set(i, nullableHolders[i]); + } + vector.setValueCount(numValues); + for (int i = 0; i < numValues; i++) { + assertArrayEquals(values[i], vector.getObject(i)); + } + } + + /** Reads every slot into a nullable holder and checks its width, isSet flag and bytes. */ @Test + public void testGetUsingNullableHolder() { + for (int i = 0; i < numValues; i++) { + vector.set(i, holders[i]); + } + vector.setValueCount(numValues); + for (int i = 0; i < numValues; i++) { + vector.get(i, nullableHolders[i]); + assertEquals(typeWidth, nullableHolders[i].byteWidth); + assertTrue(nullableHolders[i].isSet > 0); + byte[] actual = new byte[typeWidth]; + nullableHolders[i].buffer.getBytes(0, actual, 0, typeWidth); + assertArrayEquals(values[i], actual); + } + } + + /** Each undersized input to set must raise AssertionError (otherwise failWithException throws); oversized inputs must be accepted. NOTE(review): presumably this relies on JVM assertions being enabled - confirm. */ @Test + public void testSetWithInvalidInput() throws Exception { + String errorMsg = "input data needs to be at least " + typeWidth + " bytes"; + + // test small inputs: the value or buffer is shorter than typeWidth (the small holders also declare byteWidth = smallDataSize) + try { + vector.set(0, smallValue); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.set(0, smallHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.set(0, smallNullableHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.set(0, smallBuf); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + // test large inputs, byteWidth matches but value or buffer is bigger than byteWidth + vector.set(0, largeValue); + vector.set(0, largeHolder); + vector.set(0, largeNullableHolder); + vector.set(0, largeBuf); + } + + @Test + public 
void setSetSafeWithInvalidInput() throws Exception { /* setSafe counterpart of testSetWithInvalidInput. NOTE(review): the name starts with "setSet" while sibling tests start with "test" - presumably a typo for testSetSafeWithInvalidInput. */ + String errorMsg = "input data needs to be at least " + typeWidth + " bytes"; + + // test small inputs: the value or buffer is shorter than typeWidth (the small holders also declare byteWidth = smallDataSize) + try { + vector.setSafe(0, smallValue); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.setSafe(0, smallHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.setSafe(0, smallNullableHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.setSafe(0, smallBuf); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + // test large inputs, byteWidth matches but value or buffer is bigger than byteWidth + vector.setSafe(0, largeValue); + vector.setSafe(0, largeHolder); + vector.setSafe(0, largeNullableHolder); + vector.setSafe(0, largeBuf); + } + + /** A slot marked null must read back as null from get. */ @Test + public void testGetNull() { + vector.setNull(0); + assertNull(vector.get(0)); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java new file mode 100644 index 000000000..9d7e413a7 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java @@ -0,0 +1,507 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.math.BigDecimal; +import java.util.Arrays; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.impl.UnionFixedSizeListReader; +import org.apache.arrow.vector.complex.impl.UnionFixedSizeListWriter; +import org.apache.arrow.vector.complex.impl.UnionListReader; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** Tests for FixedSizeListVector: direct child-vector writes, UnionFixedSizeListWriter, readers and transfer pairs. */ public class TestFixedSizeListVector { + + private BufferAllocator allocator; + + /** Creates a fresh allocator before each test. */ @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + /** Closes the allocator after each test. */ @After + public void terminate() throws Exception { + allocator.close(); + } + + /** Writes ten lists of two ints (i, i + 10) straight into the child IntVector and reads them back through the list reader. */ @Test + public void testIntType() { + try (FixedSizeListVector vector = FixedSizeListVector.empty("list", 2, allocator)) { + IntVector nested = (IntVector) 
vector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())).getVector(); + vector.allocateNew(); + + for (int i = 0; i < 10; i++) { + vector.setNotNull(i); + nested.set(i * 2, i); + nested.set(i * 2 + 1, i + 10); + } + vector.setValueCount(10); + + UnionFixedSizeListReader reader = vector.getReader(); + for (int i = 0; i < 10; i++) { + reader.setPosition(i); + Assert.assertTrue(reader.isSet()); + Assert.assertTrue(reader.next()); + assertEquals(i, reader.reader().readInteger().intValue()); + Assert.assertTrue(reader.next()); + assertEquals(i + 10, reader.reader().readInteger().intValue()); + Assert.assertFalse(reader.next()); + assertEquals(Arrays.asList(i, i + 10), reader.readObject()); + } + } + } + + /** Float variant in which only even rows are written; odd rows must read as unset with a null object. */ @Test + public void testFloatTypeNullable() { + try (FixedSizeListVector vector = FixedSizeListVector.empty("list", 2, allocator)) { + Float4Vector nested = (Float4Vector) vector.addOrGetVector(FieldType.nullable(MinorType.FLOAT4.getType())) + .getVector(); + vector.allocateNew(); + + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector.setNotNull(i); + nested.set(i * 2, i + 0.1f); + nested.set(i * 2 + 1, i + 10.1f); + } + } + vector.setValueCount(10); + + UnionFixedSizeListReader reader = vector.getReader(); + for (int i = 0; i < 10; i++) { + reader.setPosition(i); + if (i % 2 == 0) { + Assert.assertTrue(reader.isSet()); + Assert.assertTrue(reader.next()); + assertEquals(i + 0.1f, reader.reader().readFloat(), 0.00001); + Assert.assertTrue(reader.next()); + assertEquals(i + 10.1f, reader.reader().readFloat(), 0.00001); + Assert.assertFalse(reader.next()); + assertEquals(Arrays.asList(i + 0.1f, i + 10.1f), reader.readObject()); + } else { + Assert.assertFalse(reader.isSet()); + Assert.assertNull(reader.readObject()); + } + } + } + } + + /** A ListVector whose elements are size-2 fixed lists: even row i holds i % 7 tuples (j, j + 1), odd rows are never started and must read as unset. */ @Test + public void testNestedInList() { + try (ListVector vector = ListVector.empty("list", allocator)) { + FixedSizeListVector tuples = (FixedSizeListVector) vector.addOrGetVector( + FieldType.nullable(new 
ArrowType.FixedSizeList(2))).getVector(); + IntVector innerVector = (IntVector) tuples.addOrGetVector(FieldType.nullable(MinorType.INT.getType())) + .getVector(); + vector.allocateNew(); + + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + int position = vector.startNewValue(i); + for (int j = 0; j < i % 7; j++) { + tuples.setNotNull(position + j); + innerVector.set((position + j) * 2, j); + innerVector.set((position + j) * 2 + 1, j + 1); + } + vector.endValue(i, i % 7); + } + } + vector.setValueCount(10); + + UnionListReader reader = vector.getReader(); + for (int i = 0; i < 10; i++) { + reader.setPosition(i); + if (i % 2 == 0) { + for (int j = 0; j < i % 7; j++) { + Assert.assertTrue(reader.next()); + FieldReader innerListReader = reader.reader(); + for (int k = 0; k < 2; k++) { + Assert.assertTrue(innerListReader.next()); + assertEquals(k + j, innerListReader.reader().readInteger().intValue()); + } + Assert.assertFalse(innerListReader.next()); + } + Assert.assertFalse(reader.next()); + } else { + Assert.assertFalse(reader.isSet()); + Assert.assertNull(reader.readObject()); + } + } + } + } + + /** Copies rows 0, 2 and 4 of "from" into rows 1, 2 and 3 of "to" (two through the transfer pair, one through copyFromSafe) and checks that every other row of "to" reads as null. */ @Test + public void testTransferPair() { + try (FixedSizeListVector from = new FixedSizeListVector( + "from", allocator, new FieldType(true, new ArrowType.FixedSizeList(2), null), null); + FixedSizeListVector to = new FixedSizeListVector( + "to", allocator, new FieldType(true, new ArrowType.FixedSizeList(2), null), null)) { + Float4Vector nested = (Float4Vector) from.addOrGetVector(FieldType.nullable(MinorType.FLOAT4.getType())) + .getVector(); + from.allocateNew(); + + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + from.setNotNull(i); + nested.set(i * 2, i + 0.1f); + nested.set(i * 2 + 1, i + 10.1f); + } + } + from.setValueCount(10); + + TransferPair pair = from.makeTransferPair(to); + + pair.copyValueSafe(0, 1); + pair.copyValueSafe(2, 2); + to.copyFromSafe(4, 3, from); + + to.setValueCount(10); + + UnionFixedSizeListReader reader = to.getReader(); + + 
reader.setPosition(0); /* row 0 of the target was never written */ + Assert.assertFalse(reader.isSet()); + Assert.assertNull(reader.readObject()); + + reader.setPosition(1); /* copied from source row 0 */ + Assert.assertTrue(reader.isSet()); + Assert.assertTrue(reader.next()); + assertEquals(0.1f, reader.reader().readFloat(), 0.00001); + Assert.assertTrue(reader.next()); + assertEquals(10.1f, reader.reader().readFloat(), 0.00001); + Assert.assertFalse(reader.next()); + assertEquals(Arrays.asList(0.1f, 10.1f), reader.readObject()); + + reader.setPosition(2); /* copied from source row 2 */ + Assert.assertTrue(reader.isSet()); + Assert.assertTrue(reader.next()); + assertEquals(2.1f, reader.reader().readFloat(), 0.00001); + Assert.assertTrue(reader.next()); + assertEquals(12.1f, reader.reader().readFloat(), 0.00001); + Assert.assertFalse(reader.next()); + assertEquals(Arrays.asList(2.1f, 12.1f), reader.readObject()); + + reader.setPosition(3); /* copied from source row 4 */ + Assert.assertTrue(reader.isSet()); + Assert.assertTrue(reader.next()); + assertEquals(4.1f, reader.reader().readFloat(), 0.00001); + Assert.assertTrue(reader.next()); + assertEquals(14.1f, reader.reader().readFloat(), 0.00001); + Assert.assertFalse(reader.next()); + assertEquals(Arrays.asList(4.1f, 14.1f), reader.readObject()); + + for (int i = 4; i < 10; i++) { + reader.setPosition(i); + Assert.assertFalse(reader.isSet()); + Assert.assertNull(reader.readObject()); + } + } + } + + /** The field's string form must contain ListVector.DATA_VECTOR_NAME both before and after the child vector is added. */ @Test + public void testConsistentChildName() throws Exception { + try (FixedSizeListVector listVector = FixedSizeListVector.empty("sourceVector", 2, allocator)) { + String emptyListStr = listVector.getField().toString(); + Assert.assertTrue(emptyListStr.contains(ListVector.DATA_VECTOR_NAME)); + + listVector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())); + String emptyVectorStr = listVector.getField().toString(); + Assert.assertTrue(emptyVectorStr.contains(ListVector.DATA_VECTOR_NAME)); + } + } + + @Test + public void testUnionFixedSizeListWriterWithNulls() throws Exception { + /* Write to a decimal list vector + * each list of size 3 and 
having its data values alternating between null and a non-null. + * Read and verify + */ + try (final FixedSizeListVector vector = FixedSizeListVector.empty("vector", /*listSize=*/3, allocator)) { + + UnionFixedSizeListWriter writer = vector.getWriter(); + writer.allocate(); + + final int valueCount = 100; + + for (int i = 0; i < valueCount; i++) { + writer.startList(); + writer.decimal().writeDecimal(new BigDecimal(i)); + writer.writeNull(); + writer.decimal().writeDecimal(new BigDecimal(i * 3)); + writer.endList(); + } + vector.setValueCount(valueCount); + + for (int i = 0; i < valueCount; i++) { + List values = (List) vector.getObject(i); + assertEquals(3, values.size()); + assertEquals(new BigDecimal(i), values.get(0)); + assertEquals(null, values.get(1)); + assertEquals(new BigDecimal(i * 3), values.get(2)); + } + } + } + + /** Writes three int lists of size 3 through the writer and reads them back with getObject. */ @Test + public void testUnionFixedSizeListWriter() throws Exception { + try (final FixedSizeListVector vector1 = FixedSizeListVector.empty("vector", 3, allocator)) { + + UnionFixedSizeListWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + int[] values1 = new int[] {1, 2, 3}; + int[] values2 = new int[] {4, 5, 6}; + int[] values3 = new int[] {7, 8, 9}; + + //set some values + writeListVector(vector1, writer1, values1); + writeListVector(vector1, writer1, values2); + writeListVector(vector1, writer1, values3); + writer1.setValueCount(3); + + assertEquals(3, vector1.getValueCount()); + + int[] realValue1 = convertListToIntArray(vector1.getObject(0)); + assertTrue(Arrays.equals(values1, realValue1)); + int[] realValue2 = convertListToIntArray(vector1.getObject(1)); + assertTrue(Arrays.equals(values2, realValue2)); + int[] realValue3 = convertListToIntArray(vector1.getObject(2)); + assertTrue(Arrays.equals(values3, realValue3)); + } + } + + /** Writes 100 lists of three decimals (i, 2i, 3i) and reads them back. */ @Test + public void testWriteDecimal() throws Exception { + try (final FixedSizeListVector vector = FixedSizeListVector.empty("vector", /*listSize=*/3, allocator)) { + + UnionFixedSizeListWriter 
writer = vector.getWriter(); + writer.allocate(); + + final int valueCount = 100; + + for (int i = 0; i < valueCount; i++) { + writer.startList(); + writer.decimal().writeDecimal(new BigDecimal(i)); + writer.decimal().writeDecimal(new BigDecimal(i * 2)); + writer.decimal().writeDecimal(new BigDecimal(i * 3)); + writer.endList(); + } + vector.setValueCount(valueCount); + + for (int i = 0; i < valueCount; i++) { + List values = (List) vector.getObject(i); + assertEquals(3, values.size()); + assertEquals(new BigDecimal(i), values.get(0)); + assertEquals(new BigDecimal(i * 2), values.get(1)); + assertEquals(new BigDecimal(i * 3), values.get(2)); + } + } + } + + /** Writing a fourth decimal into a size-3 list must raise IllegalStateException with the message asserted below. */ @Test + public void testDecimalIndexCheck() throws Exception { + try (final FixedSizeListVector vector = FixedSizeListVector.empty("vector", /*listSize=*/3, allocator)) { + + UnionFixedSizeListWriter writer = vector.getWriter(); + writer.allocate(); + + IllegalStateException e = assertThrows(IllegalStateException.class, () -> { + writer.startList(); + writer.decimal().writeDecimal(new BigDecimal(1)); + writer.decimal().writeDecimal(new BigDecimal(2)); + writer.decimal().writeDecimal(new BigDecimal(3)); + writer.decimal().writeDecimal(new BigDecimal(4)); + writer.endList(); + }); + assertEquals("values at index 0 is greater than listSize 3", e.getMessage()); + } + } + + + /** Writes a five-element list into a size-3 vector; the annotation expects IllegalStateException, so the assertions after the second write are presumably never reached. */ @Test(expected = IllegalStateException.class) + public void testWriteIllegalData() throws Exception { + try (final FixedSizeListVector vector1 = FixedSizeListVector.empty("vector", 3, allocator)) { + + UnionFixedSizeListWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + int[] values1 = new int[] {1, 2, 3}; + int[] values2 = new int[] {4, 5, 6, 7, 8}; + + //set some values + writeListVector(vector1, writer1, values1); + writeListVector(vector1, writer1, values2); + writer1.setValueCount(3); + + assertEquals(3, vector1.getValueCount()); + int[] realValue1 = convertListToIntArray(vector1.getObject(0)); + 
assertTrue(Arrays.equals(values1, realValue1)); + int[] realValue2 = convertListToIntArray(vector1.getObject(1)); + assertTrue(Arrays.equals(values2, realValue2)); + } + } + + /** Splits the first two of three lists into a new vector and checks their contents. */ @Test + public void testSplitAndTransfer() throws Exception { + try (final FixedSizeListVector vector1 = FixedSizeListVector.empty("vector", 3, allocator)) { + + UnionFixedSizeListWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + int[] values1 = new int[] {1, 2, 3}; + int[] values2 = new int[] {4, 5, 6}; + int[] values3 = new int[] {7, 8, 9}; + + //set some values + writeListVector(vector1, writer1, values1); + writeListVector(vector1, writer1, values2); + writeListVector(vector1, writer1, values3); + writer1.setValueCount(3); + + TransferPair transferPair = vector1.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 2); + + // try-with-resources releases the target vector even when an assertion below fails + try (FixedSizeListVector targetVector = (FixedSizeListVector) transferPair.getTo()) { + assertEquals(2, targetVector.getValueCount()); + int[] realValue1 = convertListToIntArray(targetVector.getObject(0)); + assertTrue(Arrays.equals(values1, realValue1)); + int[] realValue2 = convertListToIntArray(targetVector.getObject(1)); + assertTrue(Arrays.equals(values2, realValue2)); + } + } + } + + /** With listSize 0, empty lists round-trip as empty arrays and a null list reads back as null. */ @Test + public void testZeroWidthVector() { + try (final FixedSizeListVector vector1 = FixedSizeListVector.empty("vector", 0, allocator)) { + + UnionFixedSizeListWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + int[] values1 = new int[] {}; + int[] values2 = new int[] {}; + int[] values3 = null; + int[] values4 = new int[] {}; + + //set some values + writeListVector(vector1, writer1, values1); + writeListVector(vector1, writer1, values2); + writeListVector(vector1, writer1, values3); + writeListVector(vector1, writer1, values4); + writer1.setValueCount(4); + + assertEquals(4, vector1.getValueCount()); + + int[] realValue1 = convertListToIntArray(vector1.getObject(0)); + assertArrayEquals(values1, realValue1); + int[] realValue2 = 
convertListToIntArray(vector1.getObject(1)); + assertArrayEquals(values2, realValue2); + assertNull(vector1.getObject(2)); + int[] realValue4 = convertListToIntArray(vector1.getObject(3)); + assertArrayEquals(values4, realValue4); + } + } + + @Test + public void testVectorWithNulls() { + try (final FixedSizeListVector vector1 = FixedSizeListVector.empty("vector", 4, allocator)) { + + UnionFixedSizeListWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + List values1 = Arrays.asList(null, 1, 2, 3); + List values2 = Arrays.asList(4, null, 5, 6); + List values3 = null; + List values4 = Arrays.asList(7, 8, null, 9); + + //set some values + writeListVector(vector1, writer1, values1); + writeListVector(vector1, writer1, values2); + writeListVector(vector1, writer1, values3); + writeListVector(vector1, writer1, values4); + writer1.setValueCount(4); + + assertEquals(4, vector1.getValueCount()); + + List realValue1 = vector1.getObject(0); + assertEquals(values1, realValue1); + List realValue2 = vector1.getObject(1); + assertEquals(values2, realValue2); + List realValue3 = vector1.getObject(2); + assertEquals(values3, realValue3); + List realValue4 = vector1.getObject(3); + assertEquals(values4, realValue4); + } + } + + private int[] convertListToIntArray(List list) { + int[] values = new int[list.size()]; + for (int i = 0; i < list.size(); i++) { + values[i] = (int) list.get(i); + } + return values; + } + + private void writeListVector(FixedSizeListVector vector, UnionFixedSizeListWriter writer, int[] values) { + writer.startList(); + if (values != null) { + for (int v : values) { + writer.integer().writeInt(v); + } + } else { + vector.setNull(writer.getPosition()); + } + writer.endList(); + } + + private void writeListVector(FixedSizeListVector vector, UnionFixedSizeListWriter writer, List values) { + writer.startList(); + if (values != null) { + for (Integer v : values) { + if (v == null) { + writer.writeNull(); + } else { + writer.integer().writeInt(v); + } + 
} + } else { + vector.setNull(writer.getPosition()); + } + writer.endList(); + } + +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestIntervalMonthDayNanoVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestIntervalMonthDayNanoVector.java new file mode 100644 index 000000000..93d6fab70 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestIntervalMonthDayNanoVector.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; + + +import java.time.Duration; +import java.time.Period; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.holders.IntervalMonthDayNanoHolder; +import org.apache.arrow.vector.holders.NullableIntervalMonthDayNanoHolder; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestIntervalMonthDayNanoVector { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testBasics() { + try (final IntervalMonthDayNanoVector vector = new IntervalMonthDayNanoVector(/*name=*/"", allocator)) { + int valueCount = 100; + vector.setInitialCapacity(valueCount); + vector.allocateNew(); + NullableIntervalMonthDayNanoHolder nullableHolder = new NullableIntervalMonthDayNanoHolder(); + nullableHolder.isSet = 1; + nullableHolder.months = 2; + nullableHolder.days = 20; + nullableHolder.nanoseconds = 123; + IntervalMonthDayNanoHolder holder = new IntervalMonthDayNanoHolder(); + holder.months = Integer.MIN_VALUE; + holder.days = Integer.MIN_VALUE; + holder.nanoseconds = Long.MIN_VALUE; + + + vector.set(0, /*months=*/1, /*days=*/2, /*nanoseconds=*/-2); + vector.setSafe(2, /*months=*/1, /*days=*/2, /*nanoseconds=*/-3); + vector.setSafe(/*index=*/4, nullableHolder); + vector.set(3, holder); + nullableHolder.isSet = 0; + vector.setSafe(/*index=*/5, nullableHolder); + vector.setValueCount(5); + + assertEquals("P1M2D PT-0.000000002S ", vector.getAsStringBuilder(0).toString()); + assertEquals(null, vector.getAsStringBuilder(1)); + assertEquals("P1M2D PT-0.000000003S ", vector.getAsStringBuilder(2).toString()); + assertEquals(new PeriodDuration(Period.of(0, Integer.MIN_VALUE, Integer.MIN_VALUE), + Duration.ofNanos(Long.MIN_VALUE)), vector.getObject(3)); 
+ assertEquals("P2M20D PT0.000000123S ", vector.getAsStringBuilder(4).toString()); + + assertEquals(null, vector.getObject(5)); + + vector.get(1, nullableHolder); + assertEquals(0, nullableHolder.isSet); + + vector.get(2, nullableHolder); + assertEquals(1, nullableHolder.isSet); + assertEquals(1, nullableHolder.months); + assertEquals(2, nullableHolder.days); + assertEquals(-3, nullableHolder.nanoseconds); + + IntervalMonthDayNanoVector.getDays(vector.valueBuffer, 2); + assertEquals(1, IntervalMonthDayNanoVector.getMonths(vector.valueBuffer, 2)); + assertEquals(2, IntervalMonthDayNanoVector.getDays(vector.valueBuffer, 2)); + assertEquals(-3, IntervalMonthDayNanoVector.getNanoseconds(vector.valueBuffer, 2)); + + assertEquals(0, vector.isSet(1)); + assertEquals(1, vector.isSet(2)); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestIntervalYearVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestIntervalYearVector.java new file mode 100644 index 000000000..5ea48b485 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestIntervalYearVector.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; + +import org.apache.arrow.memory.BufferAllocator; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestIntervalYearVector { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testGetAsStringBuilder() { + try (final IntervalYearVector vector = new IntervalYearVector("", allocator)) { + int valueCount = 100; + vector.setInitialCapacity(valueCount); + vector.allocateNew(); + for (int i = 0; i < valueCount; i++) { + vector.set(i, i); + } + + assertEquals("0 years 1 month ", vector.getAsStringBuilder(1).toString()); + assertEquals("0 years 10 months ", vector.getAsStringBuilder(10).toString()); + assertEquals("1 year 8 months ", vector.getAsStringBuilder(20).toString()); + assertEquals("2 years 6 months ", vector.getAsStringBuilder(30).toString()); + + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java new file mode 100644 index 000000000..c1d60da4d --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java @@ -0,0 +1,982 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.BaseRepeatedValueVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.impl.UnionLargeListWriter; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestLargeListVector { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testCopyFrom() throws Exception { + try (LargeListVector inVector = LargeListVector.empty("input", allocator); + LargeListVector outVector = LargeListVector.empty("output", allocator)) { + UnionLargeListWriter writer = 
inVector.getWriter(); + writer.allocate(); + + // populate input vector with the following records + // [1, 2, 3] + // null + // [] + writer.setPosition(0); // optional + writer.startList(); + writer.bigInt().writeBigInt(1); + writer.bigInt().writeBigInt(2); + writer.bigInt().writeBigInt(3); + writer.endList(); + + writer.setPosition(2); + writer.startList(); + writer.endList(); + + writer.setValueCount(3); + + // copy values from input to output + outVector.allocateNew(); + for (int i = 0; i < 3; i++) { + outVector.copyFrom(i, i, inVector); + } + outVector.setValueCount(3); + + // assert the output vector is correct + FieldReader reader = outVector.getReader(); + Assert.assertTrue("shouldn't be null", reader.isSet()); + reader.setPosition(1); + Assert.assertFalse("should be null", reader.isSet()); + reader.setPosition(2); + Assert.assertTrue("shouldn't be null", reader.isSet()); + + + /* index 0 */ + Object result = outVector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(new Long(1), resultSet.get(0)); + assertEquals(new Long(2), resultSet.get(1)); + assertEquals(new Long(3), resultSet.get(2)); + + /* index 1 */ + result = outVector.getObject(1); + assertNull(result); + + /* index 2 */ + result = outVector.getObject(2); + resultSet = (ArrayList) result; + assertEquals(0, resultSet.size()); + + /* 3+0+0/3 */ + assertEquals(1.0D, inVector.getDensity(), 0); + } + } + + @Test + public void testSetLastSetUsage() throws Exception { + try (LargeListVector listVector = LargeListVector.empty("input", allocator)) { + + /* Explicitly add the dataVector */ + MinorType type = MinorType.BIGINT; + listVector.addOrGetVector(FieldType.nullable(type.getType())); + + /* allocate memory */ + listVector.allocateNew(); + + /* get inner buffers; validityBuffer and offsetBuffer */ + + ArrowBuf validityBuffer = listVector.getValidityBuffer(); + ArrowBuf offsetBuffer = listVector.getOffsetBuffer(); + + /* get the underlying 
data vector -- BigIntVector */ + BigIntVector dataVector = (BigIntVector) listVector.getDataVector(); + + /* check current lastSet */ + assertEquals(-1L, listVector.getLastSet()); + + int index = 0; + int offset = 0; + + /* write [10, 11, 12] to the list vector at index 0 */ + BitVectorHelper.setBit(validityBuffer, index); + dataVector.setSafe(0, 1, 10); + dataVector.setSafe(1, 1, 11); + dataVector.setSafe(2, 1, 12); + offsetBuffer.setLong((index + 1) * LargeListVector.OFFSET_WIDTH, 3); + + index += 1; + + /* write [13, 14] to the list vector at index 1 */ + BitVectorHelper.setBit(validityBuffer, index); + dataVector.setSafe(3, 1, 13); + dataVector.setSafe(4, 1, 14); + offsetBuffer.setLong((index + 1) * LargeListVector.OFFSET_WIDTH, 5); + + index += 1; + + /* write [15, 16, 17] to the list vector at index 2 */ + BitVectorHelper.setBit(validityBuffer, index); + dataVector.setSafe(5, 1, 15); + dataVector.setSafe(6, 1, 16); + dataVector.setSafe(7, 1, 17); + offsetBuffer.setLong((index + 1) * LargeListVector.OFFSET_WIDTH, 8); + + /* check current lastSet */ + assertEquals(-1L, listVector.getLastSet()); + + /* set lastset and arbitrary valuecount for list vector. + * + * NOTE: if we don't execute setLastSet() before setLastValueCount(), then + * the latter will corrupt the offsetBuffer and thus the accessor will not + * retrieve the correct values from underlying dataBuffer. Run the test + * by commenting out next line and we should see failures from 5th assert + * onwards. This is why doing setLastSet() is important before setValueCount() + * once the vector has been loaded. + * + * Another important thing to remember is the value of lastSet itself. + * Even though the listVector has elements till index 2 only, the lastSet should + * be set as 3. This is because the offsetBuffer has valid offsets filled till index 3. + * If we do setLastSet(2), the offsetBuffer at index 3 will contain incorrect value + * after execution of setValueCount(). 
+ * + * correct state of the listVector + * bitvector {1, 1, 1, 0, 0.... } + * offsetvector {0, 3, 5, 8, 8, 8.....} + * datavector { [10, 11, 12], + * [13, 14], + * [15, 16, 17] + * } + * + * if we don't do setLastSet() before setValueCount --> incorrect state + * bitvector {1, 1, 1, 0, 0.... } + * offsetvector {0, 0, 0, 0, 0, 0.....} + * datavector { [10, 11, 12], + * [13, 14], + * [15, 16, 17] + * } + * + * if we do setLastSet(2) before setValueCount --> incorrect state + * bitvector {1, 1, 1, 0, 0.... } + * offsetvector {0, 3, 5, 5, 5, 5.....} + * datavector { [10, 11, 12], + * [13, 14], + * [15, 16, 17] + * } + */ + listVector.setLastSet(2); + listVector.setValueCount(10); + + /* (3+2+3)/10 */ + assertEquals(0.8D, listVector.getDensity(), 0); + + index = 0; + offset = (int) offsetBuffer.getLong(index * LargeListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(0), Integer.toString(offset)); + + Long actual = dataVector.getObject(offset); + assertEquals(new Long(10), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(11), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(12), actual); + + index++; + offset = (int) offsetBuffer.getLong(index * LargeListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(3), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(new Long(13), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(14), actual); + + index++; + offset = (int) offsetBuffer.getLong(index * LargeListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(5), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(new Long(15), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(16), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(17), actual); + + index++; + offset = (int) offsetBuffer.getLong(index * 
LargeListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(8), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertNull(actual); + } + } + + @Test + public void testSplitAndTransfer() throws Exception { + try (LargeListVector listVector = LargeListVector.empty("sourceVector", allocator)) { + + /* Explicitly add the dataVector */ + MinorType type = MinorType.BIGINT; + listVector.addOrGetVector(FieldType.nullable(type.getType())); + + UnionLargeListWriter listWriter = listVector.getWriter(); + + /* allocate memory */ + listWriter.allocate(); + + /* populate data */ + listWriter.setPosition(0); + listWriter.startList(); + listWriter.bigInt().writeBigInt(10); + listWriter.bigInt().writeBigInt(11); + listWriter.bigInt().writeBigInt(12); + listWriter.endList(); + + listWriter.setPosition(1); + listWriter.startList(); + listWriter.bigInt().writeBigInt(13); + listWriter.bigInt().writeBigInt(14); + listWriter.endList(); + + listWriter.setPosition(2); + listWriter.startList(); + listWriter.bigInt().writeBigInt(15); + listWriter.bigInt().writeBigInt(16); + listWriter.bigInt().writeBigInt(17); + listWriter.bigInt().writeBigInt(18); + listWriter.endList(); + + listWriter.setPosition(3); + listWriter.startList(); + listWriter.bigInt().writeBigInt(19); + listWriter.endList(); + + listWriter.setPosition(4); + listWriter.startList(); + listWriter.bigInt().writeBigInt(20); + listWriter.bigInt().writeBigInt(21); + listWriter.bigInt().writeBigInt(22); + listWriter.bigInt().writeBigInt(23); + listWriter.endList(); + + listVector.setValueCount(5); + + assertEquals(4, listVector.getLastSet()); + + /* get offset buffer */ + final ArrowBuf offsetBuffer = listVector.getOffsetBuffer(); + + /* get dataVector */ + BigIntVector dataVector = (BigIntVector) listVector.getDataVector(); + + /* check the vector output */ + + int index = 0; + int offset = 0; + Long actual = null; + + /* index 0 */ + assertFalse(listVector.isNull(index)); + offset = (int) 
offsetBuffer.getLong(index * LargeListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(0), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(new Long(10), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(11), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(12), actual); + + /* index 1 */ + index++; + assertFalse(listVector.isNull(index)); + offset = (int) offsetBuffer.getLong(index * LargeListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(3), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(new Long(13), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(14), actual); + + /* index 2 */ + index++; + assertFalse(listVector.isNull(index)); + offset = (int) offsetBuffer.getLong(index * LargeListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(5), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(new Long(15), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(16), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(17), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(18), actual); + + /* index 3 */ + index++; + assertFalse(listVector.isNull(index)); + offset = (int) offsetBuffer.getLong(index * LargeListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(9), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(new Long(19), actual); + + /* index 4 */ + index++; + assertFalse(listVector.isNull(index)); + offset = (int) offsetBuffer.getLong(index * LargeListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(10), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(new Long(20), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(21), 
actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(22), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(23), actual); + + /* index 5 */ + index++; + assertTrue(listVector.isNull(index)); + offset = (int) offsetBuffer.getLong(index * LargeListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(14), Integer.toString(offset)); + + /* do split and transfer */ + try (LargeListVector toVector = LargeListVector.empty("toVector", allocator)) { + + TransferPair transferPair = listVector.makeTransferPair(toVector); + + int[][] transferLengths = {{0, 2}, {3, 1}, {4, 1}}; + + for (final int[] transferLength : transferLengths) { + int start = transferLength[0]; + int splitLength = transferLength[1]; + + int dataLength1 = 0; + int dataLength2 = 0; + + int offset1 = 0; + int offset2 = 0; + + transferPair.splitAndTransfer(start, splitLength); + + /* get offsetBuffer of toVector */ + final ArrowBuf toOffsetBuffer = toVector.getOffsetBuffer(); + + /* get dataVector of toVector */ + BigIntVector dataVector1 = (BigIntVector) toVector.getDataVector(); + + for (int i = 0; i < splitLength; i++) { + dataLength1 = (int) offsetBuffer.getLong((start + i + 1) * LargeListVector.OFFSET_WIDTH) - + (int) offsetBuffer.getLong((start + i) * LargeListVector.OFFSET_WIDTH); + dataLength2 = (int) toOffsetBuffer.getLong((i + 1) * LargeListVector.OFFSET_WIDTH) - + (int) toOffsetBuffer.getLong(i * LargeListVector.OFFSET_WIDTH); + + assertEquals("Different data lengths at index: " + i + " and start: " + start, + dataLength1, dataLength2); + + offset1 = (int) offsetBuffer.getLong((start + i) * LargeListVector.OFFSET_WIDTH); + offset2 = (int) toOffsetBuffer.getLong(i * LargeListVector.OFFSET_WIDTH); + + for (int j = 0; j < dataLength1; j++) { + assertEquals("Different data at indexes: " + offset1 + " and " + offset2, + dataVector.getObject(offset1), dataVector1.getObject(offset2)); + + offset1++; + offset2++; + } + } + } + } + } + } + 
+ @Test + public void testNestedLargeListVector() throws Exception { + try (LargeListVector listVector = LargeListVector.empty("sourceVector", allocator)) { + + UnionLargeListWriter listWriter = listVector.getWriter(); + + /* allocate memory */ + listWriter.allocate(); + + /* the dataVector that backs a listVector will also be a + * listVector for this test. + */ + + /* write one or more inner lists at index 0 */ + listWriter.setPosition(0); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(50); + listWriter.list().bigInt().writeBigInt(100); + listWriter.list().bigInt().writeBigInt(200); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(75); + listWriter.list().bigInt().writeBigInt(125); + listWriter.list().bigInt().writeBigInt(150); + listWriter.list().bigInt().writeBigInt(175); + listWriter.list().endList(); + + listWriter.endList(); + + /* write one or more inner lists at index 1 */ + listWriter.setPosition(1); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(10); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(15); + listWriter.list().bigInt().writeBigInt(20); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(25); + listWriter.list().bigInt().writeBigInt(30); + listWriter.list().bigInt().writeBigInt(35); + listWriter.list().endList(); + + listWriter.endList(); + + assertEquals(1, listVector.getLastSet()); + + listVector.setValueCount(2); + + assertEquals(2, listVector.getValueCount()); + + /* get listVector value at index 0 -- the value itself is a listvector */ + Object result = listVector.getObject(0); + ArrayList> resultSet = (ArrayList>) result; + ArrayList list; + + assertEquals(2, resultSet.size()); /* 2 inner lists at index 0 */ + assertEquals(3, resultSet.get(0).size()); /* size 
of first inner list */ + assertEquals(4, resultSet.get(1).size()); /* size of second inner list */ + + list = resultSet.get(0); + assertEquals(new Long(50), list.get(0)); + assertEquals(new Long(100), list.get(1)); + assertEquals(new Long(200), list.get(2)); + + list = resultSet.get(1); + assertEquals(new Long(75), list.get(0)); + assertEquals(new Long(125), list.get(1)); + assertEquals(new Long(150), list.get(2)); + assertEquals(new Long(175), list.get(3)); + + /* get listVector value at index 1 -- the value itself is a listvector */ + result = listVector.getObject(1); + resultSet = (ArrayList>) result; + + assertEquals(3, resultSet.size()); /* 3 inner lists at index 1 */ + assertEquals(1, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(2, resultSet.get(1).size()); /* size of second inner list */ + assertEquals(3, resultSet.get(2).size()); /* size of third inner list */ + + list = resultSet.get(0); + assertEquals(new Long(10), list.get(0)); + + list = resultSet.get(1); + assertEquals(new Long(15), list.get(0)); + assertEquals(new Long(20), list.get(1)); + + list = resultSet.get(2); + assertEquals(new Long(25), list.get(0)); + assertEquals(new Long(30), list.get(1)); + assertEquals(new Long(35), list.get(2)); + + /* check underlying bitVector */ + assertFalse(listVector.isNull(0)); + assertFalse(listVector.isNull(1)); + + /* check underlying offsets */ + final ArrowBuf offsetBuffer = listVector.getOffsetBuffer(); + + /* listVector has 2 lists at index 0 and 3 lists at index 1 */ + assertEquals(0, offsetBuffer.getLong(0 * LargeListVector.OFFSET_WIDTH)); + assertEquals(2, offsetBuffer.getLong(1 * LargeListVector.OFFSET_WIDTH)); + assertEquals(5, offsetBuffer.getLong(2 * LargeListVector.OFFSET_WIDTH)); + } + } + + @Test + public void testNestedLargeListVector1() throws Exception { + try (LargeListVector listVector = LargeListVector.empty("sourceVector", allocator)) { + + MinorType listType = MinorType.LIST; + MinorType scalarType = 
MinorType.BIGINT; + + listVector.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList1 = (ListVector) listVector.getDataVector(); + innerList1.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList2 = (ListVector) innerList1.getDataVector(); + innerList2.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList3 = (ListVector) innerList2.getDataVector(); + innerList3.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList4 = (ListVector) innerList3.getDataVector(); + innerList4.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList5 = (ListVector) innerList4.getDataVector(); + innerList5.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList6 = (ListVector) innerList5.getDataVector(); + innerList6.addOrGetVector(FieldType.nullable(scalarType.getType())); + + listVector.setInitialCapacity(128); + } + } + + @Test + public void testNestedLargeListVector2() throws Exception { + try (LargeListVector listVector = LargeListVector.empty("sourceVector", allocator)) { + listVector.setInitialCapacity(1); + UnionLargeListWriter listWriter = listVector.getWriter(); + /* allocate memory */ + listWriter.allocate(); + + /* write one or more inner lists at index 0 */ + listWriter.setPosition(0); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(50); + listWriter.list().bigInt().writeBigInt(100); + listWriter.list().bigInt().writeBigInt(200); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(75); + listWriter.list().bigInt().writeBigInt(125); + listWriter.list().endList(); + + listWriter.endList(); + + /* write one or more inner lists at index 1 */ + listWriter.setPosition(1); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(15); + 
listWriter.list().bigInt().writeBigInt(20); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(25); + listWriter.list().bigInt().writeBigInt(30); + listWriter.list().bigInt().writeBigInt(35); + listWriter.list().endList(); + + listWriter.endList(); + + assertEquals(1, listVector.getLastSet()); + + listVector.setValueCount(2); + + assertEquals(2, listVector.getValueCount()); + + /* get listVector value at index 0 -- the value itself is a listvector */ + Object result = listVector.getObject(0); + ArrayList> resultSet = (ArrayList>) result; + ArrayList list; + + assertEquals(2, resultSet.size()); /* 2 inner lists at index 0 */ + assertEquals(3, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(2, resultSet.get(1).size()); /* size of second inner list */ + + list = resultSet.get(0); + assertEquals(new Long(50), list.get(0)); + assertEquals(new Long(100), list.get(1)); + assertEquals(new Long(200), list.get(2)); + + list = resultSet.get(1); + assertEquals(new Long(75), list.get(0)); + assertEquals(new Long(125), list.get(1)); + + /* get listVector value at index 1 -- the value itself is a listvector */ + result = listVector.getObject(1); + resultSet = (ArrayList>) result; + + assertEquals(2, resultSet.size()); /* 3 inner lists at index 1 */ + assertEquals(2, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(3, resultSet.get(1).size()); /* size of second inner list */ + + list = resultSet.get(0); + assertEquals(new Long(15), list.get(0)); + assertEquals(new Long(20), list.get(1)); + + list = resultSet.get(1); + assertEquals(new Long(25), list.get(0)); + assertEquals(new Long(30), list.get(1)); + assertEquals(new Long(35), list.get(2)); + + /* check underlying bitVector */ + assertFalse(listVector.isNull(0)); + assertFalse(listVector.isNull(1)); + + /* check underlying offsets */ + final ArrowBuf offsetBuffer = listVector.getOffsetBuffer(); + + /* listVector has 2 lists at 
index 0 and 3 lists at index 1 */ + assertEquals(0, offsetBuffer.getLong(0 * LargeListVector.OFFSET_WIDTH)); + assertEquals(2, offsetBuffer.getLong(1 * LargeListVector.OFFSET_WIDTH)); + assertEquals(4, offsetBuffer.getLong(2 * LargeListVector.OFFSET_WIDTH)); + } + } + + @Test + public void testGetBufferAddress() throws Exception { + try (LargeListVector listVector = LargeListVector.empty("vector", allocator)) { + + UnionLargeListWriter listWriter = listVector.getWriter(); + boolean error = false; + + listWriter.allocate(); + + listWriter.setPosition(0); + listWriter.startList(); + listWriter.bigInt().writeBigInt(50); + listWriter.bigInt().writeBigInt(100); + listWriter.bigInt().writeBigInt(200); + listWriter.endList(); + + listWriter.setPosition(1); + listWriter.startList(); + listWriter.bigInt().writeBigInt(250); + listWriter.bigInt().writeBigInt(300); + listWriter.endList(); + + listVector.setValueCount(2); + + /* check listVector contents */ + Object result = listVector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(new Long(50), resultSet.get(0)); + assertEquals(new Long(100), resultSet.get(1)); + assertEquals(new Long(200), resultSet.get(2)); + + result = listVector.getObject(1); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(new Long(250), resultSet.get(0)); + assertEquals(new Long(300), resultSet.get(1)); + + List buffers = listVector.getFieldBuffers(); + + long bitAddress = listVector.getValidityBufferAddress(); + long offsetAddress = listVector.getOffsetBufferAddress(); + + try { + long dataAddress = listVector.getDataBufferAddress(); + } catch (UnsupportedOperationException ue) { + error = true; + } finally { + assertTrue(error); + } + + assertEquals(2, buffers.size()); + assertEquals(bitAddress, buffers.get(0).memoryAddress()); + assertEquals(offsetAddress, buffers.get(1).memoryAddress()); + + /* (3+2)/2 */ + assertEquals(2.5, listVector.getDensity(), 
0); + } + } + + @Test + public void testConsistentChildName() throws Exception { + try (LargeListVector listVector = LargeListVector.empty("sourceVector", allocator)) { + String emptyListStr = listVector.getField().toString(); + assertTrue(emptyListStr.contains(LargeListVector.DATA_VECTOR_NAME)); + + listVector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())); + String emptyVectorStr = listVector.getField().toString(); + assertTrue(emptyVectorStr.contains(LargeListVector.DATA_VECTOR_NAME)); + } + } + + @Test + public void testSetInitialCapacity() { + try (final LargeListVector vector = LargeListVector.empty("", allocator)) { + vector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())); + + /** + * use the default multiplier of 5, + * 512 * 5 => 2560 * 4 => 10240 bytes => 16KB => 4096 value capacity. + */ + vector.setInitialCapacity(512); + vector.allocateNew(); + assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 512 * 5); + + /* use density as 4 */ + vector.setInitialCapacity(512, 4); + vector.allocateNew(); + assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 512 * 4); + + /** + * inner value capacity we pass to data vector is 512 * 0.1 => 51 + * For an int vector this is 204 bytes of memory for data buffer + * and 7 bytes for validity buffer. + * and with power of 2 allocation, we allocate 256 bytes and 8 bytes + * for the data buffer and validity buffer of the inner vector. Thus + * value capacity of inner vector is 64 + */ + vector.setInitialCapacity(512, 0.1); + vector.allocateNew(); + assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 51); + + /** + * inner value capacity we pass to data vector is 512 * 0.01 => 5 + * For an int vector this is 20 bytes of memory for data buffer + * and 1 byte for validity buffer. 
+ * and with power of 2 allocation, we allocate 32 bytes and 1 bytes + * for the data buffer and validity buffer of the inner vector. Thus + * value capacity of inner vector is 8 + */ + vector.setInitialCapacity(512, 0.01); + vector.allocateNew(); + assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 5); + + /** + * inner value capacity we pass to data vector is 5 * 0.1 => 0 + * which is then rounded off to 1. So we pass value count as 1 + * to the inner int vector. + * the offset buffer of the list vector is allocated for 6 values + * which is 24 bytes and then rounded off to 32 bytes (8 values) + * the validity buffer of the list vector is allocated for 5 + * values which is 1 byte. This is why value capacity of the list + * vector is 7 as we take the min of validity buffer value capacity + * and offset buffer value capacity. + */ + vector.setInitialCapacity(5, 0.1); + vector.allocateNew(); + assertEquals(7, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 1); + } + } + + @Test + public void testClearAndReuse() { + try (final LargeListVector vector = LargeListVector.empty("list", allocator)) { + BigIntVector bigIntVector = + (BigIntVector) vector.addOrGetVector(FieldType.nullable(MinorType.BIGINT.getType())).getVector(); + vector.setInitialCapacity(10); + vector.allocateNew(); + + vector.startNewValue(0); + bigIntVector.setSafe(0, 7); + vector.endValue(0, 1); + vector.startNewValue(1); + bigIntVector.setSafe(1, 8); + vector.endValue(1, 1); + vector.setValueCount(2); + + Object result = vector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(new Long(7), resultSet.get(0)); + + result = vector.getObject(1); + resultSet = (ArrayList) result; + assertEquals(new Long(8), resultSet.get(0)); + + // Clear and release the buffers to trigger a realloc when adding next value + vector.clear(); + + // The list vector should reuse a buffer when reallocating the 
offset buffer + vector.startNewValue(0); + bigIntVector.setSafe(0, 7); + vector.endValue(0, 1); + vector.startNewValue(1); + bigIntVector.setSafe(1, 8); + vector.endValue(1, 1); + vector.setValueCount(2); + + result = vector.getObject(0); + resultSet = (ArrayList) result; + assertEquals(new Long(7), resultSet.get(0)); + + result = vector.getObject(1); + resultSet = (ArrayList) result; + assertEquals(new Long(8), resultSet.get(0)); + } + } + + @Test + public void testWriterGetField() { + try (final LargeListVector vector = LargeListVector.empty("list", allocator)) { + + UnionLargeListWriter writer = vector.getWriter(); + writer.allocate(); + + //set some values + writer.startList(); + writer.integer().writeInt(1); + writer.integer().writeInt(2); + writer.endList(); + vector.setValueCount(2); + + Field expectedDataField = new Field(BaseRepeatedValueVector.DATA_VECTOR_NAME, + FieldType.nullable(new ArrowType.Int(32, true)), null); + Field expectedField = new Field(vector.getName(), FieldType.nullable(ArrowType.LargeList.INSTANCE), + Arrays.asList(expectedDataField)); + + assertEquals(expectedField, writer.getField()); + } + } + + @Test + public void testClose() throws Exception { + try (final LargeListVector vector = LargeListVector.empty("list", allocator)) { + + UnionLargeListWriter writer = vector.getWriter(); + writer.allocate(); + + //set some values + writer.startList(); + writer.integer().writeInt(1); + writer.integer().writeInt(2); + writer.endList(); + vector.setValueCount(2); + + assertTrue(vector.getBufferSize() > 0); + assertTrue(vector.getDataVector().getBufferSize() > 0); + + writer.close(); + assertEquals(0, vector.getBufferSize()); + assertEquals(0, vector.getDataVector().getBufferSize()); + } + } + + @Test + public void testGetBufferSizeFor() { + try (final LargeListVector vector = LargeListVector.empty("list", allocator)) { + + UnionLargeListWriter writer = vector.getWriter(); + writer.allocate(); + + //set some values + writeIntValues(writer, new 
int[] {1, 2}); + writeIntValues(writer, new int[] {3, 4}); + writeIntValues(writer, new int[] {5, 6}); + writeIntValues(writer, new int[] {7, 8, 9, 10}); + writeIntValues(writer, new int[] {11, 12, 13, 14}); + writer.setValueCount(5); + + IntVector dataVector = (IntVector) vector.getDataVector(); + int[] indices = new int[] {0, 2, 4, 6, 10, 14}; + + for (int valueCount = 1; valueCount <= 5; valueCount++) { + int validityBufferSize = BitVectorHelper.getValidityBufferSize(valueCount); + int offsetBufferSize = (valueCount + 1) * LargeListVector.OFFSET_WIDTH; + + int expectedSize = validityBufferSize + offsetBufferSize + dataVector.getBufferSizeFor(indices[valueCount]); + assertEquals(expectedSize, vector.getBufferSizeFor(valueCount)); + } + } + } + + @Test + public void testIsEmpty() { + try (final LargeListVector vector = LargeListVector.empty("list", allocator)) { + UnionLargeListWriter writer = vector.getWriter(); + writer.allocate(); + + // set values [1,2], null, [], [5,6] + writeIntValues(writer, new int[] {1, 2}); + writer.setPosition(2); + writeIntValues(writer, new int[] {}); + writeIntValues(writer, new int[] {5, 6}); + writer.setValueCount(4); + + assertFalse(vector.isEmpty(0)); + assertTrue(vector.isNull(1)); + assertTrue(vector.isEmpty(1)); + assertFalse(vector.isNull(2)); + assertTrue(vector.isEmpty(2)); + assertFalse(vector.isEmpty(3)); + } + } + + private void writeIntValues(UnionLargeListWriter writer, int[] values) { + writer.startList(); + for (int v: values) { + writer.integer().writeInt(v); + } + writer.endList(); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java new file mode 100644 index 000000000..644827ce9 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under 
one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.holders.NullableLargeVarBinaryHolder; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestLargeVarBinaryVector { + + private BufferAllocator allocator; + + @Before + public void prepare() { + allocator = new RootAllocator(Integer.MAX_VALUE); + } + + @After + public void shutdown() { + allocator.close(); + } + + @Test + public void testSetNullableLargeVarBinaryHolder() { + try (LargeVarBinaryVector vector = new LargeVarBinaryVector("", allocator)) { + vector.allocateNew(100, 10); + + NullableLargeVarBinaryHolder nullHolder = new NullableLargeVarBinaryHolder(); + nullHolder.isSet = 0; + + NullableLargeVarBinaryHolder binHolder = new NullableLargeVarBinaryHolder(); + binHolder.isSet = 1; + + String str = "hello"; + ArrowBuf buf = allocator.buffer(16); + buf.setBytes(0, str.getBytes()); + + binHolder.start = 0; + binHolder.end = str.length(); + binHolder.buffer = buf; + + vector.set(0, nullHolder); 
+ vector.set(1, binHolder); + + // verify results + assertTrue(vector.isNull(0)); + assertEquals(str, new String(vector.get(1))); + + buf.close(); + } + } + + @Test + public void testSetNullableLargeVarBinaryHolderSafe() { + try (LargeVarBinaryVector vector = new LargeVarBinaryVector("", allocator)) { + vector.allocateNew(5, 1); + + NullableLargeVarBinaryHolder nullHolder = new NullableLargeVarBinaryHolder(); + nullHolder.isSet = 0; + + NullableLargeVarBinaryHolder binHolder = new NullableLargeVarBinaryHolder(); + binHolder.isSet = 1; + + String str = "hello world"; + ArrowBuf buf = allocator.buffer(16); + buf.setBytes(0, str.getBytes()); + + binHolder.start = 0; + binHolder.end = str.length(); + binHolder.buffer = buf; + + vector.setSafe(0, binHolder); + vector.setSafe(1, nullHolder); + + // verify results + assertEquals(str, new String(vector.get(0))); + assertTrue(vector.isNull(1)); + + buf.close(); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java new file mode 100644 index 000000000..1b81c6b20 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java @@ -0,0 +1,816 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.holders.NullableLargeVarCharHolder; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.jupiter.api.Assertions; + +public class TestLargeVarCharVector { + + private static final byte[] STR1 = "AAAAA1".getBytes(); + private static final byte[] STR2 = "BBBBBBBBB2".getBytes(); + private static final byte[] STR3 = "CCCC3".getBytes(); + private static final byte[] STR4 = "DDDDDDDD4".getBytes(); + private static final byte[] STR5 = "EEE5".getBytes(); + private static final byte[] STR6 = "FFFFF6".getBytes(); + + private BufferAllocator allocator; + + @Before + public void prepare() { + allocator = new RootAllocator(Integer.MAX_VALUE); + } + + @After + public void shutdown() { + allocator.close(); + } + + @Test + public void testTransfer() { + try (BufferAllocator childAllocator1 = allocator.newChildAllocator("child1", 1000000, 1000000); + BufferAllocator childAllocator2 = 
allocator.newChildAllocator("child2", 1000000, 1000000); + LargeVarCharVector v1 = new LargeVarCharVector("v1", childAllocator1); + LargeVarCharVector v2 = new LargeVarCharVector("v2", childAllocator2);) { + v1.allocateNew(); + v1.setSafe(4094, "hello world".getBytes(), 0, 11); + v1.setValueCount(4001); + + long memoryBeforeTransfer = childAllocator1.getAllocatedMemory(); + + v1.makeTransferPair(v2).transfer(); + + assertEquals(0, childAllocator1.getAllocatedMemory()); + assertEquals(memoryBeforeTransfer, childAllocator2.getAllocatedMemory()); + } + } + + @Test + public void testCopyValueSafe() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("myvector", allocator); + final LargeVarCharVector newLargeVarCharVector = new LargeVarCharVector("newvector", allocator)) { + largeVarCharVector.allocateNew(10000, 1000); + + final int valueCount = 500; + populateLargeVarcharVector(largeVarCharVector, valueCount, null); + + final TransferPair tp = largeVarCharVector.makeTransferPair(newLargeVarCharVector); + + // new vector memory is not pre-allocated, we expect copyValueSafe work fine. 
+ for (int i = 0; i < valueCount; i++) { + tp.copyValueSafe(i, i); + } + newLargeVarCharVector.setValueCount(valueCount); + + for (int i = 0; i < valueCount; i++) { + final boolean expectedSet = (i % 3) == 0; + if (expectedSet) { + assertFalse(largeVarCharVector.isNull(i)); + assertFalse(newLargeVarCharVector.isNull(i)); + assertArrayEquals(largeVarCharVector.get(i), newLargeVarCharVector.get(i)); + } else { + assertTrue(newLargeVarCharVector.isNull(i)); + } + } + } + } + + @Test + public void testSplitAndTransferNon() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("myvector", allocator)) { + + largeVarCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateLargeVarcharVector(largeVarCharVector, valueCount, null); + + final TransferPair tp = largeVarCharVector.getTransferPair(allocator); + try (LargeVarCharVector newLargeVarCharVector = (LargeVarCharVector) tp.getTo()) { + + tp.splitAndTransfer(0, 0); + assertEquals(0, newLargeVarCharVector.getValueCount()); + } + } + } + + @Test + public void testSplitAndTransferAll() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("myvector", allocator)) { + + largeVarCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateLargeVarcharVector(largeVarCharVector, valueCount, null); + + final TransferPair tp = largeVarCharVector.getTransferPair(allocator); + try (LargeVarCharVector newLargeVarCharVector = (LargeVarCharVector) tp.getTo()) { + + tp.splitAndTransfer(0, valueCount); + assertEquals(valueCount, newLargeVarCharVector.getValueCount()); + } + } + } + + @Test + public void testInvalidStartIndex() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("myvector", allocator); + final LargeVarCharVector newLargeVarCharVector = new LargeVarCharVector("newvector", allocator)) { + + largeVarCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateLargeVarcharVector(largeVarCharVector, 
valueCount, null); + + final TransferPair tp = largeVarCharVector.makeTransferPair(newLargeVarCharVector); + + IllegalArgumentException e = Assertions.assertThrows( + IllegalArgumentException.class, + () -> tp.splitAndTransfer(valueCount, 10)); + + assertEquals("Invalid startIndex: 500", e.getMessage()); + } + } + + @Test + public void testInvalidLength() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("myvector", allocator); + final LargeVarCharVector newLargeVarCharVector = new LargeVarCharVector("newvector", allocator)) { + + largeVarCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateLargeVarcharVector(largeVarCharVector, valueCount, null); + + final TransferPair tp = largeVarCharVector.makeTransferPair(newLargeVarCharVector); + + IllegalArgumentException e = Assertions.assertThrows( + IllegalArgumentException.class, + () -> tp.splitAndTransfer(0, valueCount * 2)); + + assertEquals("Invalid length: 1000", e.getMessage()); + } + } + + @Test /* LargeVarCharVector */ + public void testSizeOfValueBuffer() { + try (final LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { + int valueCount = 100; + int currentSize = 0; + vector.setInitialCapacity(valueCount); + vector.allocateNew(); + vector.setValueCount(valueCount); + for (int i = 0; i < valueCount; i++) { + currentSize += i; + vector.setSafe(i, new byte[i]); + } + + assertEquals(currentSize, vector.sizeOfValueBuffer()); + } + } + + @Test + public void testSetLastSetUsage() { + final byte[] STR1 = "AAAAA1".getBytes(); + final byte[] STR2 = "BBBBBBBBB2".getBytes(); + final byte[] STR3 = "CCCC3".getBytes(); + final byte[] STR4 = "DDDDDDDD4".getBytes(); + final byte[] STR5 = "EEE5".getBytes(); + final byte[] STR6 = "FFFFF6".getBytes(); + + try (final LargeVarCharVector vector = new LargeVarCharVector("myvector", allocator)) { + vector.allocateNew(1024 * 10, 1024); + + setBytes(0, STR1, vector); + setBytes(1, STR2, vector); + setBytes(2, STR3, 
vector); + setBytes(3, STR4, vector); + setBytes(4, STR5, vector); + setBytes(5, STR6, vector); + + /* Check current lastSet */ + assertEquals(-1, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + + /* + * If we don't do setLastSe(5) before setValueCount(), then the latter will corrupt + * the value vector by filling in all positions [0,valuecount-1] will empty byte arrays. + * Run the test by commenting out next line and we should see incorrect vector output. + */ + vector.setLastSet(5); + vector.setValueCount(20); + + /* Check current lastSet */ + assertEquals(19, vector.getLastSet()); + + /* Check the vector output again */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + assertEquals(0, vector.getValueLength(10)); + assertEquals(0, vector.getValueLength(11)); + assertEquals(0, vector.getValueLength(12)); + assertEquals(0, vector.getValueLength(13)); + assertEquals(0, vector.getValueLength(14)); + assertEquals(0, vector.getValueLength(15)); + assertEquals(0, vector.getValueLength(16)); + assertEquals(0, vector.getValueLength(17)); + assertEquals(0, vector.getValueLength(18)); + assertEquals(0, vector.getValueLength(19)); + + /* Check offsets */ + assertEquals(0, vector.offsetBuffer.getLong(0 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(6, vector.offsetBuffer.getLong(1 * 
BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(16, vector.offsetBuffer.getLong(2 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(21, vector.offsetBuffer.getLong(3 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(30, vector.offsetBuffer.getLong(4 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(34, vector.offsetBuffer.getLong(5 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(6 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(7 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(8 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(9 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(10 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(11 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(12 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(13 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(14 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(15 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(16 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(17 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(18 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(19 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + + vector.set(19, STR6); + assertArrayEquals(STR6, vector.get(19)); + assertEquals(40, vector.offsetBuffer.getLong(19 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(46, vector.offsetBuffer.getLong(20 
* BaseLargeVariableWidthVector.OFFSET_WIDTH)); + } + } + + @Test(expected = OutOfMemoryException.class) + public void testVectorAllocateNew() { + try (RootAllocator smallAllocator = new RootAllocator(200); + LargeVarCharVector vector = new LargeVarCharVector("vec", smallAllocator)) { + vector.allocateNew(); + } + } + + @Test(expected = OversizedAllocationException.class) + public void testLargeVariableVectorReallocation() { + final LargeVarCharVector vector = new LargeVarCharVector("vector", allocator); + // edge case 1: value count = MAX_VALUE_ALLOCATION + final long expectedAllocationInBytes = BaseValueVector.MAX_ALLOCATION_SIZE; + final int expectedOffsetSize = 10; + try { + vector.allocateNew(expectedAllocationInBytes, 10); + assertTrue(expectedOffsetSize <= vector.getValueCapacity()); + assertTrue(expectedAllocationInBytes <= vector.getDataBuffer().capacity()); + vector.reAlloc(); + assertTrue(expectedOffsetSize * 2 <= vector.getValueCapacity()); + assertTrue(expectedAllocationInBytes * 2 <= vector.getDataBuffer().capacity()); + } finally { + vector.close(); + } + + // common: value count < MAX_VALUE_ALLOCATION + try { + vector.allocateNew(BaseValueVector.MAX_ALLOCATION_SIZE / 2, 0); + vector.reAlloc(); // value allocation reaches to MAX_VALUE_ALLOCATION + vector.reAlloc(); // this tests if it overflows + } finally { + vector.close(); + } + } + + @Test + public void testSplitAndTransfer() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("myvector", allocator)) { + largeVarCharVector.allocateNew(10000, 1000); + + final int valueCount = 500; + final String[] compareArray = new String[valueCount]; + + populateLargeVarcharVector(largeVarCharVector, valueCount, compareArray); + + final TransferPair tp = largeVarCharVector.getTransferPair(allocator); + try (final LargeVarCharVector newLargeVarCharVector = (LargeVarCharVector) tp.getTo()) { + final int[][] startLengths = {{0, 201}, {201, 0}, {201, 200}, {401, 99}}; + + for (final int[] 
startLength : startLengths) { + final int start = startLength[0]; + final int length = startLength[1]; + tp.splitAndTransfer(start, length); + for (int i = 0; i < length; i++) { + final boolean expectedSet = ((start + i) % 3) == 0; + if (expectedSet) { + final byte[] expectedValue = compareArray[start + i].getBytes(); + assertFalse(newLargeVarCharVector.isNull(i)); + assertArrayEquals(expectedValue, newLargeVarCharVector.get(i)); + } else { + assertTrue(newLargeVarCharVector.isNull(i)); + } + } + } + } + } + } + + @Test + public void testReallocAfterVectorTransfer() { + final byte[] STR1 = "AAAAA1".getBytes(); + final byte[] STR2 = "BBBBBBBBB2".getBytes(); + + try (final LargeVarCharVector vector = new LargeVarCharVector("vector", allocator)) { + /* 4096 values with 10 byte per record */ + vector.allocateNew(4096 * 10, 4096); + int valueCapacity = vector.getValueCapacity(); + assertTrue(valueCapacity >= 4096); + + /* populate the vector */ + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + vector.set(i, STR1); + } else { + vector.set(i, STR2); + } + } + + /* Check the vector output */ + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertArrayEquals(STR1, vector.get(i)); + } else { + assertArrayEquals(STR2, vector.get(i)); + } + } + + /* trigger first realloc */ + vector.setSafe(valueCapacity, STR2, 0, STR2.length); + assertTrue(vector.getValueCapacity() >= 2 * valueCapacity); + while (vector.getByteCapacity() < 10 * vector.getValueCapacity()) { + vector.reallocDataBuffer(); + } + + /* populate the remaining vector */ + for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { + if ((i & 1) == 1) { + vector.set(i, STR1); + } else { + vector.set(i, STR2); + } + } + + /* Check the vector output */ + valueCapacity = vector.getValueCapacity(); + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertArrayEquals(STR1, vector.get(i)); + } else { + assertArrayEquals(STR2, vector.get(i)); + } + } + + /* trigger 
second realloc */ + vector.setSafe(valueCapacity + 10, STR2, 0, STR2.length); + assertTrue(vector.getValueCapacity() >= 2 * valueCapacity); + while (vector.getByteCapacity() < 10 * vector.getValueCapacity()) { + vector.reallocDataBuffer(); + } + + /* populate the remaining vector */ + for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { + if ((i & 1) == 1) { + vector.set(i, STR1); + } else { + vector.set(i, STR2); + } + } + + /* Check the vector output */ + valueCapacity = vector.getValueCapacity(); + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertArrayEquals(STR1, vector.get(i)); + } else { + assertArrayEquals(STR2, vector.get(i)); + } + } + + /* we are potentially working with 4x the size of vector buffer + * that we initially started with. Now let's transfer the vector. + */ + + TransferPair transferPair = vector.getTransferPair(allocator); + transferPair.transfer(); + try (LargeVarCharVector toVector = (LargeVarCharVector) transferPair.getTo()) { + valueCapacity = toVector.getValueCapacity(); + + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertArrayEquals(STR1, toVector.get(i)); + } else { + assertArrayEquals(STR2, toVector.get(i)); + } + } + } + } + } + + @Test + public void testVectorLoadUnload() { + try (final LargeVarCharVector vector1 = new LargeVarCharVector("myvector", allocator)) { + + ValueVectorDataPopulator.setVector(vector1, STR1, STR2, STR3, STR4, STR5, STR6); + + assertEquals(5, vector1.getLastSet()); + vector1.setValueCount(15); + assertEquals(14, vector1.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector1.get(0)); + assertArrayEquals(STR2, vector1.get(1)); + assertArrayEquals(STR3, vector1.get(2)); + assertArrayEquals(STR4, vector1.get(3)); + assertArrayEquals(STR5, vector1.get(4)); + assertArrayEquals(STR6, vector1.get(5)); + + Field field = vector1.getField(); + String fieldName = field.getName(); + + List fields = new ArrayList<>(); + List 
fieldVectors = new ArrayList<>(); + + fields.add(field); + fieldVectors.add(vector1); + + Schema schema = new Schema(fields); + + VectorSchemaRoot schemaRoot1 = new VectorSchemaRoot(schema, fieldVectors, vector1.getValueCount()); + VectorUnloader vectorUnloader = new VectorUnloader(schemaRoot1); + + try ( + ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); + VectorSchemaRoot schemaRoot2 = VectorSchemaRoot.create(schema, allocator); + ) { + + VectorLoader vectorLoader = new VectorLoader(schemaRoot2); + vectorLoader.load(recordBatch); + + LargeVarCharVector vector2 = (LargeVarCharVector) schemaRoot2.getVector(fieldName); + /* + * lastSet would have internally been set by VectorLoader.load() when it invokes + * loadFieldBuffers. + */ + assertEquals(14, vector2.getLastSet()); + vector2.setValueCount(25); + assertEquals(24, vector2.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector2.get(0)); + assertArrayEquals(STR2, vector2.get(1)); + assertArrayEquals(STR3, vector2.get(2)); + assertArrayEquals(STR4, vector2.get(3)); + assertArrayEquals(STR5, vector2.get(4)); + assertArrayEquals(STR6, vector2.get(5)); + } + } + } + + @Test + public void testFillEmptiesUsage() { + try (final LargeVarCharVector vector = new LargeVarCharVector("myvector", allocator)) { + + vector.allocateNew(1024 * 10, 1024); + + setBytes(0, STR1, vector); + setBytes(1, STR2, vector); + setBytes(2, STR3, vector); + setBytes(3, STR4, vector); + setBytes(4, STR5, vector); + setBytes(5, STR6, vector); + + /* Check current lastSet */ + assertEquals(-1, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + + vector.setLastSet(5); + /* fill empty byte arrays from index [6, 9] */ + vector.fillEmpties(10); + + /* 
Check current lastSet */ + assertEquals(9, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + + setBytes(10, STR1, vector); + setBytes(11, STR2, vector); + + vector.setLastSet(11); + /* fill empty byte arrays from index [12, 14] */ + vector.setValueCount(15); + + /* Check current lastSet */ + assertEquals(14, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + assertArrayEquals(STR1, vector.get(10)); + assertArrayEquals(STR2, vector.get(11)); + assertEquals(0, vector.getValueLength(12)); + assertEquals(0, vector.getValueLength(13)); + assertEquals(0, vector.getValueLength(14)); + + /* Check offsets */ + assertEquals(0, + vector.offsetBuffer.getLong(0 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(6, + vector.offsetBuffer.getLong(1 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(16, + vector.offsetBuffer.getLong(2 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(21, + vector.offsetBuffer.getLong(3 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(30, + vector.offsetBuffer.getLong(4 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(34, + 
vector.offsetBuffer.getLong(5 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + + assertEquals(40, + vector.offsetBuffer.getLong(6 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getLong(7 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getLong(8 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getLong(9 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getLong(10 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + + assertEquals(46, + vector.offsetBuffer.getLong(11 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(56, + vector.offsetBuffer.getLong(12 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + + assertEquals(56, + vector.offsetBuffer.getLong(13 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(56, + vector.offsetBuffer.getLong(14 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(56, + vector.offsetBuffer.getLong(15 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + } + } + + @Test + public void testGetBufferAddress1() { + try (final LargeVarCharVector vector = new LargeVarCharVector("myvector", allocator)) { + + ValueVectorDataPopulator.setVector(vector, STR1, STR2, STR3, STR4, STR5, STR6); + vector.setValueCount(15); + + /* check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + + List buffers = vector.getFieldBuffers(); + long bitAddress = vector.getValidityBufferAddress(); + long offsetAddress = vector.getOffsetBufferAddress(); + long dataAddress = vector.getDataBufferAddress(); + + assertEquals(3, buffers.size()); + assertEquals(bitAddress, buffers.get(0).memoryAddress()); + assertEquals(offsetAddress, buffers.get(1).memoryAddress()); + 
assertEquals(dataAddress, buffers.get(2).memoryAddress()); + } + } + + @Test + public void testSetNullableLargeVarCharHolder() { + try (LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { + vector.allocateNew(100, 10); + + NullableLargeVarCharHolder nullHolder = new NullableLargeVarCharHolder(); + nullHolder.isSet = 0; + + NullableLargeVarCharHolder stringHolder = new NullableLargeVarCharHolder(); + stringHolder.isSet = 1; + + String str = "hello"; + ArrowBuf buf = allocator.buffer(16); + buf.setBytes(0, str.getBytes()); + + stringHolder.start = 0; + stringHolder.end = str.length(); + stringHolder.buffer = buf; + + vector.set(0, nullHolder); + vector.set(1, stringHolder); + + // verify results + assertTrue(vector.isNull(0)); + assertEquals(str, new String(vector.get(1))); + + buf.close(); + } + } + + @Test + public void testSetNullableLargeVarCharHolderSafe() { + try (LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { + vector.allocateNew(5, 1); + + NullableLargeVarCharHolder nullHolder = new NullableLargeVarCharHolder(); + nullHolder.isSet = 0; + + NullableLargeVarCharHolder stringHolder = new NullableLargeVarCharHolder(); + stringHolder.isSet = 1; + + String str = "hello world"; + ArrowBuf buf = allocator.buffer(16); + buf.setBytes(0, str.getBytes()); + + stringHolder.start = 0; + stringHolder.end = str.length(); + stringHolder.buffer = buf; + + vector.setSafe(0, stringHolder); + vector.setSafe(1, nullHolder); + + // verify results + assertEquals(str, new String(vector.get(0))); + assertTrue(vector.isNull(1)); + + buf.close(); + } + } + + @Test + public void testGetNullFromLargeVariableWidthVector() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("largevarcharvec", allocator); + final LargeVarBinaryVector largeVarBinaryVector = new LargeVarBinaryVector("largevarbinary", allocator)) { + largeVarCharVector.allocateNew(10, 1); + largeVarBinaryVector.allocateNew(10, 1); + + 
largeVarCharVector.setNull(0); + largeVarBinaryVector.setNull(0); + + assertNull(largeVarCharVector.get(0)); + assertNull(largeVarBinaryVector.get(0)); + } + } + + @Test + public void testLargeVariableWidthVectorNullHashCode() { + try (LargeVarCharVector largeVarChVec = new LargeVarCharVector("large var char vector", allocator)) { + largeVarChVec.allocateNew(100, 1); + largeVarChVec.setValueCount(1); + + largeVarChVec.set(0, "abc".getBytes()); + largeVarChVec.setNull(0); + + assertEquals(0, largeVarChVec.hashCode(0)); + } + } + + @Test + public void testUnloadLargeVariableWidthVector() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("var char", allocator)) { + largeVarCharVector.allocateNew(5, 2); + largeVarCharVector.setValueCount(2); + + largeVarCharVector.set(0, "abcd".getBytes()); + + List bufs = largeVarCharVector.getFieldBuffers(); + assertEquals(3, bufs.size()); + + ArrowBuf offsetBuf = bufs.get(1); + ArrowBuf dataBuf = bufs.get(2); + + assertEquals(24, offsetBuf.writerIndex()); + assertEquals(4, offsetBuf.getLong(8)); + assertEquals(4, offsetBuf.getLong(16)); + + assertEquals(4, dataBuf.writerIndex()); + } + } + + @Test + public void testNullableType() { + try (final LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { + vector.setInitialCapacity(512); + vector.allocateNew(); + + assertTrue(vector.getValueCapacity() >= 512); + int initialCapacity = vector.getValueCapacity(); + + try { + vector.set(initialCapacity, "foo".getBytes(StandardCharsets.UTF_8)); + Assert.fail("Expected out of bounds exception"); + } catch (Exception e) { + // ok + } + + vector.reAlloc(); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); + + vector.set(initialCapacity, "foo".getBytes(StandardCharsets.UTF_8)); + assertEquals("foo", new String(vector.get(initialCapacity), StandardCharsets.UTF_8)); + } + } + + private void populateLargeVarcharVector(final LargeVarCharVector vector, int valueCount, String[] values) { + for (int 
i = 0; i < valueCount; i += 3) { + final String s = String.format("%010d", i); + vector.set(i, s.getBytes()); + if (values != null) { + values[i] = s; + } + } + vector.setValueCount(valueCount); + } + + public static void setBytes(int index, byte[] bytes, LargeVarCharVector vector) { + final long currentOffset = vector.offsetBuffer.getLong((long) index * BaseLargeVariableWidthVector.OFFSET_WIDTH); + + BitVectorHelper.setBit(vector.validityBuffer, index); + vector.offsetBuffer.setLong( + (long) (index + 1) * BaseLargeVariableWidthVector.OFFSET_WIDTH, currentOffset + bytes.length); + vector.valueBuffer.setBytes(currentOffset, bytes, 0, bytes.length); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java new file mode 100644 index 000000000..ffeedf04d --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java @@ -0,0 +1,981 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.BaseRepeatedValueVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestListVector { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testCopyFrom() throws Exception { + try (ListVector inVector = ListVector.empty("input", allocator); + ListVector outVector = ListVector.empty("output", allocator)) { + UnionListWriter writer = inVector.getWriter(); + writer.allocate(); + + // populate input vector with the following records + // [1, 2, 3] + // null + // [] + writer.setPosition(0); // optional + writer.startList(); + writer.bigInt().writeBigInt(1); + writer.bigInt().writeBigInt(2); + writer.bigInt().writeBigInt(3); + writer.endList(); + + writer.setPosition(2); + writer.startList(); + writer.endList(); + + writer.setValueCount(3); + + // copy values from input to output + outVector.allocateNew(); + for (int i = 0; i < 3; i++) { + 
outVector.copyFrom(i, i, inVector); + } + outVector.setValueCount(3); + + // assert the output vector is correct + FieldReader reader = outVector.getReader(); + Assert.assertTrue("shouldn't be null", reader.isSet()); + reader.setPosition(1); + Assert.assertFalse("should be null", reader.isSet()); + reader.setPosition(2); + Assert.assertTrue("shouldn't be null", reader.isSet()); + + + /* index 0 */ + Object result = outVector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(new Long(1), (Long) resultSet.get(0)); + assertEquals(new Long(2), (Long) resultSet.get(1)); + assertEquals(new Long(3), (Long) resultSet.get(2)); + + /* index 1 */ + result = outVector.getObject(1); + assertNull(result); + + /* index 2 */ + result = outVector.getObject(2); + resultSet = (ArrayList) result; + assertEquals(0, resultSet.size()); + + /* 3+0+0/3 */ + assertEquals(1.0D, inVector.getDensity(), 0); + } + } + + @Test + public void testSetLastSetUsage() throws Exception { + try (ListVector listVector = ListVector.empty("input", allocator)) { + + /* Explicitly add the dataVector */ + MinorType type = MinorType.BIGINT; + listVector.addOrGetVector(FieldType.nullable(type.getType())); + + /* allocate memory */ + listVector.allocateNew(); + + /* get inner buffers; validityBuffer and offsetBuffer */ + + ArrowBuf validityBuffer = listVector.getValidityBuffer(); + ArrowBuf offsetBuffer = listVector.getOffsetBuffer(); + + /* get the underlying data vector -- BigIntVector */ + BigIntVector dataVector = (BigIntVector) listVector.getDataVector(); + + /* check current lastSet */ + assertEquals(Integer.toString(-1), Integer.toString(listVector.getLastSet())); + + int index = 0; + int offset = 0; + + /* write [10, 11, 12] to the list vector at index 0 */ + BitVectorHelper.setBit(validityBuffer, index); + dataVector.setSafe(0, 1, 10); + dataVector.setSafe(1, 1, 11); + dataVector.setSafe(2, 1, 12); + offsetBuffer.setInt((index + 1) * 
ListVector.OFFSET_WIDTH, 3); + + index += 1; + + /* write [13, 14] to the list vector at index 1 */ + BitVectorHelper.setBit(validityBuffer, index); + dataVector.setSafe(3, 1, 13); + dataVector.setSafe(4, 1, 14); + offsetBuffer.setInt((index + 1) * ListVector.OFFSET_WIDTH, 5); + + index += 1; + + /* write [15, 16, 17] to the list vector at index 2 */ + BitVectorHelper.setBit(validityBuffer, index); + dataVector.setSafe(5, 1, 15); + dataVector.setSafe(6, 1, 16); + dataVector.setSafe(7, 1, 17); + offsetBuffer.setInt((index + 1) * ListVector.OFFSET_WIDTH, 8); + + /* check current lastSet */ + assertEquals(Integer.toString(-1), Integer.toString(listVector.getLastSet())); + + /* set lastset and arbitrary valuecount for list vector. + * + * NOTE: if we don't execute setLastSet() before setLastValueCount(), then + * the latter will corrupt the offsetBuffer and thus the accessor will not + * retrieve the correct values from underlying dataBuffer. Run the test + * by commenting out next line and we should see failures from 5th assert + * onwards. This is why doing setLastSet() is important before setValueCount() + * once the vector has been loaded. + * + * Another important thing to remember is the value of lastSet itself. + * Even though the listVector has elements till index 2 only, the lastSet should + * be set as 3. This is because the offsetBuffer has valid offsets filled till index 3. + * If we do setLastSet(2), the offsetBuffer at index 3 will contain incorrect value + * after execution of setValueCount(). + * + * correct state of the listVector + * bitvector {1, 1, 1, 0, 0.... } + * offsetvector {0, 3, 5, 8, 8, 8.....} + * datavector { [10, 11, 12], + * [13, 14], + * [15, 16, 17] + * } + * + * if we don't do setLastSet() before setValueCount --> incorrect state + * bitvector {1, 1, 1, 0, 0.... 
} + * offsetvector {0, 0, 0, 0, 0, 0.....} + * datavector { [10, 11, 12], + * [13, 14], + * [15, 16, 17] + * } + * + * if we do setLastSet(2) before setValueCount --> incorrect state + * bitvector {1, 1, 1, 0, 0.... } + * offsetvector {0, 3, 5, 5, 5, 5.....} + * datavector { [10, 11, 12], + * [13, 14], + * [15, 16, 17] + * } + */ + listVector.setLastSet(2); + listVector.setValueCount(10); + + /* (3+2+3)/10 */ + assertEquals(0.8D, listVector.getDensity(), 0); + + index = 0; + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(0), Integer.toString(offset)); + + Long actual = dataVector.getObject(offset); + assertEquals(new Long(10), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(11), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(12), actual); + + index++; + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(3), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(new Long(13), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(14), actual); + + index++; + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(5), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(new Long(15), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(16), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(17), actual); + + index++; + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(8), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertNull(actual); + } + } + + @Test + public void testSplitAndTransfer() throws Exception { + try (ListVector listVector = ListVector.empty("sourceVector", allocator)) { + + /* Explicitly add the dataVector */ + 
MinorType type = MinorType.BIGINT; + listVector.addOrGetVector(FieldType.nullable(type.getType())); + + UnionListWriter listWriter = listVector.getWriter(); + + /* allocate memory */ + listWriter.allocate(); + + /* populate data */ + listWriter.setPosition(0); + listWriter.startList(); + listWriter.bigInt().writeBigInt(10); + listWriter.bigInt().writeBigInt(11); + listWriter.bigInt().writeBigInt(12); + listWriter.endList(); + + listWriter.setPosition(1); + listWriter.startList(); + listWriter.bigInt().writeBigInt(13); + listWriter.bigInt().writeBigInt(14); + listWriter.endList(); + + listWriter.setPosition(2); + listWriter.startList(); + listWriter.bigInt().writeBigInt(15); + listWriter.bigInt().writeBigInt(16); + listWriter.bigInt().writeBigInt(17); + listWriter.bigInt().writeBigInt(18); + listWriter.endList(); + + listWriter.setPosition(3); + listWriter.startList(); + listWriter.bigInt().writeBigInt(19); + listWriter.endList(); + + listWriter.setPosition(4); + listWriter.startList(); + listWriter.bigInt().writeBigInt(20); + listWriter.bigInt().writeBigInt(21); + listWriter.bigInt().writeBigInt(22); + listWriter.bigInt().writeBigInt(23); + listWriter.endList(); + + listVector.setValueCount(5); + + assertEquals(4, listVector.getLastSet()); + + /* get offset buffer */ + final ArrowBuf offsetBuffer = listVector.getOffsetBuffer(); + + /* get dataVector */ + BigIntVector dataVector = (BigIntVector) listVector.getDataVector(); + + /* check the vector output */ + + int index = 0; + int offset = 0; + Long actual = null; + + /* index 0 */ + assertFalse(listVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(0), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(new Long(10), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(11), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(12), actual); + + /* 
index 1 */ + index++; + assertFalse(listVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(3), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(new Long(13), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(14), actual); + + /* index 2 */ + index++; + assertFalse(listVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(5), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(new Long(15), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(16), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(17), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(18), actual); + + /* index 3 */ + index++; + assertFalse(listVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(9), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(new Long(19), actual); + + /* index 4 */ + index++; + assertFalse(listVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(10), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(new Long(20), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(21), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(22), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(new Long(23), actual); + + /* index 5 */ + index++; + assertTrue(listVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); + assertEquals(Integer.toString(14), Integer.toString(offset)); + + /* do split and transfer */ + try (ListVector 
toVector = ListVector.empty("toVector", allocator)) { + + TransferPair transferPair = listVector.makeTransferPair(toVector); + + int[][] transferLengths = {{0, 2}, {3, 1}, {4, 1}}; + + for (final int[] transferLength : transferLengths) { + int start = transferLength[0]; + int splitLength = transferLength[1]; + + int dataLength1 = 0; + int dataLength2 = 0; + + int offset1 = 0; + int offset2 = 0; + + transferPair.splitAndTransfer(start, splitLength); + + /* get offsetBuffer of toVector */ + final ArrowBuf toOffsetBuffer = toVector.getOffsetBuffer(); + + /* get dataVector of toVector */ + BigIntVector dataVector1 = (BigIntVector) toVector.getDataVector(); + + for (int i = 0; i < splitLength; i++) { + dataLength1 = offsetBuffer.getInt((start + i + 1) * ListVector.OFFSET_WIDTH) - + offsetBuffer.getInt((start + i) * ListVector.OFFSET_WIDTH); + dataLength2 = toOffsetBuffer.getInt((i + 1) * ListVector.OFFSET_WIDTH) - + toOffsetBuffer.getInt(i * ListVector.OFFSET_WIDTH); + + assertEquals("Different data lengths at index: " + i + " and start: " + start, + dataLength1, dataLength2); + + offset1 = offsetBuffer.getInt((start + i) * ListVector.OFFSET_WIDTH); + offset2 = toOffsetBuffer.getInt(i * ListVector.OFFSET_WIDTH); + + for (int j = 0; j < dataLength1; j++) { + assertEquals("Different data at indexes: " + offset1 + " and " + offset2, + dataVector.getObject(offset1), dataVector1.getObject(offset2)); + + offset1++; + offset2++; + } + } + } + } + } + } + + @Test + public void testNestedListVector() throws Exception { + try (ListVector listVector = ListVector.empty("sourceVector", allocator)) { + + UnionListWriter listWriter = listVector.getWriter(); + + /* allocate memory */ + listWriter.allocate(); + + /* the dataVector that backs a listVector will also be a + * listVector for this test. 
+ */ + + /* write one or more inner lists at index 0 */ + listWriter.setPosition(0); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(50); + listWriter.list().bigInt().writeBigInt(100); + listWriter.list().bigInt().writeBigInt(200); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(75); + listWriter.list().bigInt().writeBigInt(125); + listWriter.list().bigInt().writeBigInt(150); + listWriter.list().bigInt().writeBigInt(175); + listWriter.list().endList(); + + listWriter.endList(); + + /* write one or more inner lists at index 1 */ + listWriter.setPosition(1); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(10); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(15); + listWriter.list().bigInt().writeBigInt(20); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(25); + listWriter.list().bigInt().writeBigInt(30); + listWriter.list().bigInt().writeBigInt(35); + listWriter.list().endList(); + + listWriter.endList(); + + assertEquals(1, listVector.getLastSet()); + + listVector.setValueCount(2); + + assertEquals(2, listVector.getValueCount()); + + /* get listVector value at index 0 -- the value itself is a listvector */ + Object result = listVector.getObject(0); + ArrayList> resultSet = (ArrayList>) result; + ArrayList list; + + assertEquals(2, resultSet.size()); /* 2 inner lists at index 0 */ + assertEquals(3, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(4, resultSet.get(1).size()); /* size of second inner list */ + + list = resultSet.get(0); + assertEquals(new Long(50), list.get(0)); + assertEquals(new Long(100), list.get(1)); + assertEquals(new Long(200), list.get(2)); + + list = resultSet.get(1); + assertEquals(new Long(75), list.get(0)); + assertEquals(new Long(125), 
list.get(1)); + assertEquals(new Long(150), list.get(2)); + assertEquals(new Long(175), list.get(3)); + + /* get listVector value at index 1 -- the value itself is a listvector */ + result = listVector.getObject(1); + resultSet = (ArrayList>) result; + + assertEquals(3, resultSet.size()); /* 3 inner lists at index 1 */ + assertEquals(1, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(2, resultSet.get(1).size()); /* size of second inner list */ + assertEquals(3, resultSet.get(2).size()); /* size of third inner list */ + + list = resultSet.get(0); + assertEquals(new Long(10), list.get(0)); + + list = resultSet.get(1); + assertEquals(new Long(15), list.get(0)); + assertEquals(new Long(20), list.get(1)); + + list = resultSet.get(2); + assertEquals(new Long(25), list.get(0)); + assertEquals(new Long(30), list.get(1)); + assertEquals(new Long(35), list.get(2)); + + /* check underlying bitVector */ + assertFalse(listVector.isNull(0)); + assertFalse(listVector.isNull(1)); + + /* check underlying offsets */ + final ArrowBuf offsetBuffer = listVector.getOffsetBuffer(); + + /* listVector has 2 lists at index 0 and 3 lists at index 1 */ + assertEquals(0, offsetBuffer.getInt(0 * ListVector.OFFSET_WIDTH)); + assertEquals(2, offsetBuffer.getInt(1 * ListVector.OFFSET_WIDTH)); + assertEquals(5, offsetBuffer.getInt(2 * ListVector.OFFSET_WIDTH)); + } + } + + @Test + public void testNestedListVector1() throws Exception { + try (ListVector listVector = ListVector.empty("sourceVector", allocator)) { + + MinorType listType = MinorType.LIST; + MinorType scalarType = MinorType.BIGINT; + + listVector.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList1 = (ListVector) listVector.getDataVector(); + innerList1.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList2 = (ListVector) innerList1.getDataVector(); + innerList2.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList3 = (ListVector) 
innerList2.getDataVector(); + innerList3.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList4 = (ListVector) innerList3.getDataVector(); + innerList4.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList5 = (ListVector) innerList4.getDataVector(); + innerList5.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList6 = (ListVector) innerList5.getDataVector(); + innerList6.addOrGetVector(FieldType.nullable(scalarType.getType())); + + listVector.setInitialCapacity(128); + } + } + + @Test + public void testNestedListVector2() throws Exception { + try (ListVector listVector = ListVector.empty("sourceVector", allocator)) { + listVector.setInitialCapacity(1); + UnionListWriter listWriter = listVector.getWriter(); + /* allocate memory */ + listWriter.allocate(); + + /* write one or more inner lists at index 0 */ + listWriter.setPosition(0); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(50); + listWriter.list().bigInt().writeBigInt(100); + listWriter.list().bigInt().writeBigInt(200); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(75); + listWriter.list().bigInt().writeBigInt(125); + listWriter.list().endList(); + + listWriter.endList(); + + /* write one or more inner lists at index 1 */ + listWriter.setPosition(1); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(15); + listWriter.list().bigInt().writeBigInt(20); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(25); + listWriter.list().bigInt().writeBigInt(30); + listWriter.list().bigInt().writeBigInt(35); + listWriter.list().endList(); + + listWriter.endList(); + + assertEquals(1, listVector.getLastSet()); + + listVector.setValueCount(2); + + assertEquals(2, listVector.getValueCount()); + + /* get listVector value at 
index 0 -- the value itself is a listvector */ + Object result = listVector.getObject(0); + ArrayList> resultSet = (ArrayList>) result; + ArrayList list; + + assertEquals(2, resultSet.size()); /* 2 inner lists at index 0 */ + assertEquals(3, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(2, resultSet.get(1).size()); /* size of second inner list */ + + list = resultSet.get(0); + assertEquals(new Long(50), list.get(0)); + assertEquals(new Long(100), list.get(1)); + assertEquals(new Long(200), list.get(2)); + + list = resultSet.get(1); + assertEquals(new Long(75), list.get(0)); + assertEquals(new Long(125), list.get(1)); + + /* get listVector value at index 1 -- the value itself is a listvector */ + result = listVector.getObject(1); + resultSet = (ArrayList>) result; + + assertEquals(2, resultSet.size()); /* 3 inner lists at index 1 */ + assertEquals(2, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(3, resultSet.get(1).size()); /* size of second inner list */ + + list = resultSet.get(0); + assertEquals(new Long(15), list.get(0)); + assertEquals(new Long(20), list.get(1)); + + list = resultSet.get(1); + assertEquals(new Long(25), list.get(0)); + assertEquals(new Long(30), list.get(1)); + assertEquals(new Long(35), list.get(2)); + + /* check underlying bitVector */ + assertFalse(listVector.isNull(0)); + assertFalse(listVector.isNull(1)); + + /* check underlying offsets */ + final ArrowBuf offsetBuffer = listVector.getOffsetBuffer(); + + /* listVector has 2 lists at index 0 and 3 lists at index 1 */ + assertEquals(0, offsetBuffer.getInt(0 * ListVector.OFFSET_WIDTH)); + assertEquals(2, offsetBuffer.getInt(1 * ListVector.OFFSET_WIDTH)); + assertEquals(4, offsetBuffer.getInt(2 * ListVector.OFFSET_WIDTH)); + } + } + + @Test + public void testGetBufferAddress() throws Exception { + try (ListVector listVector = ListVector.empty("vector", allocator)) { + + UnionListWriter listWriter = listVector.getWriter(); + boolean error = 
false; + + listWriter.allocate(); + + listWriter.setPosition(0); + listWriter.startList(); + listWriter.bigInt().writeBigInt(50); + listWriter.bigInt().writeBigInt(100); + listWriter.bigInt().writeBigInt(200); + listWriter.endList(); + + listWriter.setPosition(1); + listWriter.startList(); + listWriter.bigInt().writeBigInt(250); + listWriter.bigInt().writeBigInt(300); + listWriter.endList(); + + listVector.setValueCount(2); + + /* check listVector contents */ + Object result = listVector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(new Long(50), resultSet.get(0)); + assertEquals(new Long(100), resultSet.get(1)); + assertEquals(new Long(200), resultSet.get(2)); + + result = listVector.getObject(1); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(new Long(250), resultSet.get(0)); + assertEquals(new Long(300), resultSet.get(1)); + + List buffers = listVector.getFieldBuffers(); + + long bitAddress = listVector.getValidityBufferAddress(); + long offsetAddress = listVector.getOffsetBufferAddress(); + + try { + long dataAddress = listVector.getDataBufferAddress(); + } catch (UnsupportedOperationException ue) { + error = true; + } finally { + assertTrue(error); + } + + assertEquals(2, buffers.size()); + assertEquals(bitAddress, buffers.get(0).memoryAddress()); + assertEquals(offsetAddress, buffers.get(1).memoryAddress()); + + /* (3+2)/2 */ + assertEquals(2.5, listVector.getDensity(), 0); + } + } + + @Test + public void testConsistentChildName() throws Exception { + try (ListVector listVector = ListVector.empty("sourceVector", allocator)) { + String emptyListStr = listVector.getField().toString(); + assertTrue(emptyListStr.contains(ListVector.DATA_VECTOR_NAME)); + + listVector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())); + String emptyVectorStr = listVector.getField().toString(); + assertTrue(emptyVectorStr.contains(ListVector.DATA_VECTOR_NAME)); + } + } + 
+  /**
+   * Checks the capacities obtained from {@code setInitialCapacity} followed by
+   * {@code allocateNew} on a list of INT: first with the default density, then
+   * with explicit densities of 4, 0.1 and 0.01, and finally for a tiny request
+   * of 5 values at density 0.1.
+   */
+  @Test
+  public void testSetInitialCapacity() {
+    try (final ListVector vector = ListVector.empty("", allocator)) {
+      vector.addOrGetVector(FieldType.nullable(MinorType.INT.getType()));
+
+      /*
+       * use the default multiplier of 5,
+       * 512 * 5 => 2560 * 4 => 10240 bytes => 16KB => 4096 value capacity.
+       */
+      vector.setInitialCapacity(512);
+      vector.allocateNew();
+      assertEquals(512, vector.getValueCapacity());
+      assertTrue(vector.getDataVector().getValueCapacity() >= 512 * 5);
+
+      /* use density as 4 */
+      vector.setInitialCapacity(512, 4);
+      vector.allocateNew();
+      assertEquals(512, vector.getValueCapacity());
+      assertTrue(vector.getDataVector().getValueCapacity() >= 512 * 4);
+
+      /*
+       * inner value capacity we pass to data vector is 512 * 0.1 => 51.
+       * For an int vector this is 204 bytes of memory for data buffer
+       * and 7 bytes for validity buffer.
+       * With power of 2 allocation, we allocate 256 bytes and 8 bytes
+       * for the data buffer and validity buffer of the inner vector. Thus
+       * value capacity of inner vector is 64.
+       */
+      vector.setInitialCapacity(512, 0.1);
+      vector.allocateNew();
+      assertEquals(512, vector.getValueCapacity());
+      assertTrue(vector.getDataVector().getValueCapacity() >= 51);
+
+      /*
+       * inner value capacity we pass to data vector is 512 * 0.01 => 5.
+       * For an int vector this is 20 bytes of memory for data buffer
+       * and 1 byte for validity buffer.
+       * With power of 2 allocation, we allocate 32 bytes and 1 byte
+       * for the data buffer and validity buffer of the inner vector. Thus
+       * value capacity of inner vector is 8.
+       */
+      vector.setInitialCapacity(512, 0.01);
+      vector.allocateNew();
+      assertEquals(512, vector.getValueCapacity());
+      assertTrue(vector.getDataVector().getValueCapacity() >= 5);
+
+      /*
+       * inner value capacity we pass to data vector is 5 * 0.1 => 0
+       * which is then rounded off to 1. So we pass value count as 1
+       * to the inner int vector.
+       * The offset buffer of the list vector is allocated for 6 values
+       * which is 24 bytes and then rounded off to 32 bytes (8 values).
+       * The validity buffer of the list vector is allocated for 5
+       * values which is 1 byte. This is why value capacity of the list
+       * vector is 7 as we take the min of validity buffer value capacity
+       * and offset buffer value capacity.
+       */
+      vector.setInitialCapacity(5, 0.1);
+      vector.allocateNew();
+      assertEquals(7, vector.getValueCapacity());
+      assertTrue(vector.getDataVector().getValueCapacity() >= 1);
+    }
+  }
+
+  /**
+   * Writes two single-element BIGINT lists, clears the vector, then writes the
+   * same two lists again without calling {@code allocateNew} in between, and
+   * checks that both rounds read back 7 and 8.
+   */
+  @Test
+  public void testClearAndReuse() {
+    try (final ListVector vector = ListVector.empty("list", allocator)) {
+      BigIntVector bigIntVector =
+          (BigIntVector) vector.addOrGetVector(FieldType.nullable(MinorType.BIGINT.getType())).getVector();
+      vector.setInitialCapacity(10);
+      vector.allocateNew();
+
+      vector.startNewValue(0);
+      bigIntVector.setSafe(0, 7);
+      vector.endValue(0, 1);
+      vector.startNewValue(1);
+      bigIntVector.setSafe(1, 8);
+      vector.endValue(1, 1);
+      vector.setValueCount(2);
+
+      // Long.valueOf replaces the deprecated Long(long) constructor; the
+      // wildcard avoids a raw ArrayList while keeping the same runtime cast.
+      ArrayList<?> resultSet = (ArrayList<?>) vector.getObject(0);
+      assertEquals(Long.valueOf(7L), resultSet.get(0));
+
+      resultSet = (ArrayList<?>) vector.getObject(1);
+      assertEquals(Long.valueOf(8L), resultSet.get(0));
+
+      // Clear and release the buffers to trigger a realloc when adding next value
+      vector.clear();
+
+      // The list vector should reuse a buffer when reallocating the offset buffer
+      vector.startNewValue(0);
+      bigIntVector.setSafe(0, 7);
+      vector.endValue(0, 1);
+      vector.startNewValue(1);
+      bigIntVector.setSafe(1, 8);
+      vector.endValue(1, 1);
+      vector.setValueCount(2);
+
+      resultSet = (ArrayList<?>) vector.getObject(0);
+      assertEquals(Long.valueOf(7L), resultSet.get(0));
+
+      resultSet = (ArrayList<?>) vector.getObject(1);
+      assertEquals(Long.valueOf(8L), resultSet.get(0));
+    }
+  }
+
+  /**
+   * Checks that the writer's {@code getField()} equals a nullable List field
+   * named after the vector, with one nullable signed 32-bit Int child named
+   * {@code BaseRepeatedValueVector.DATA_VECTOR_NAME}.
+   */
+  @Test
+  public void testWriterGetField() {
+    try (final ListVector vector = ListVector.empty("list", allocator)) {
+
+      UnionListWriter writer = vector.getWriter();
+      writer.allocate();
+
+      // set some values
+      writeIntValues(writer, new int[] {1, 2});
+      // NOTE(review): only one list is written above, yet the value count is 2 - confirm intended.
+      vector.setValueCount(2);
+
+      Field expectedDataField = new Field(BaseRepeatedValueVector.DATA_VECTOR_NAME,
+          FieldType.nullable(new ArrowType.Int(32, true)), null);
+      Field expectedField = new Field(vector.getName(), FieldType.nullable(ArrowType.List.INSTANCE),
+          Arrays.asList(expectedDataField));
+
+      assertEquals(expectedField, writer.getField());
+    }
+  }
+
+  /**
+   * Checks that the buffer sizes of the list vector and of its data vector are
+   * positive after writing and drop to zero once the writer is closed.
+   */
+  @Test
+  public void testClose() throws Exception {
+    try (final ListVector vector = ListVector.empty("list", allocator)) {
+
+      UnionListWriter writer = vector.getWriter();
+      writer.allocate();
+
+      // set some values
+      writeIntValues(writer, new int[] {1, 2});
+      // NOTE(review): only one list is written above, yet the value count is 2 - confirm intended.
+      vector.setValueCount(2);
+
+      assertTrue(vector.getBufferSize() > 0);
+      assertTrue(vector.getDataVector().getBufferSize() > 0);
+
+      writer.close();
+      assertEquals(0, vector.getBufferSize());
+      assertEquals(0, vector.getDataVector().getBufferSize());
+    }
+  }
+
+  /**
+   * Checks {@code getBufferSizeFor(n)} for n = 1..5 against the sum of the
+   * validity buffer size for n values, (n + 1) offsets, and the data vector's
+   * buffer size for the ints held by the first n lists.
+   */
+  @Test
+  public void testGetBufferSizeFor() {
+    try (final ListVector vector = ListVector.empty("list", allocator)) {
+
+      UnionListWriter writer = vector.getWriter();
+      writer.allocate();
+
+      // set some values
+      writeIntValues(writer, new int[] {1, 2});
+      writeIntValues(writer, new int[] {3, 4});
+      writeIntValues(writer, new int[] {5, 6});
+      writeIntValues(writer, new int[] {7, 8, 9, 10});
+      writeIntValues(writer, new int[] {11, 12, 13, 14});
+      writer.setValueCount(5);
+
+      IntVector dataVector = (IntVector) vector.getDataVector();
+      // indices[n] is the running total of ints written by the first n lists (sizes 2, 2, 2, 4, 4)
+      int[] indices = new int[] {0, 2, 4, 6, 10, 14};
+
+      for (int valueCount = 1; valueCount <= 5; valueCount++) {
+        int validityBufferSize = BitVectorHelper.getValidityBufferSize(valueCount);
+        int offsetBufferSize = (valueCount + 1) * BaseRepeatedValueVector.OFFSET_WIDTH;
+        int dataBufferSize = dataVector.getBufferSizeFor(indices[valueCount]);
+
+        int expectedSize = validityBufferSize + offsetBufferSize + dataBufferSize;
+        assertEquals(expectedSize, vector.getBufferSizeFor(valueCount));
+      }
+    }
+  }
+
+  /**
+   * Checks {@code isEmpty} and {@code isNull} on four slots: a two-element
+   * list, a slot that is skipped by the writer, an empty list, and another
+   * two-element list.
+   */
+  @Test
+  public void testIsEmpty() {
+    try (final ListVector vector = ListVector.empty("list", allocator)) {
+      UnionListWriter writer = vector.getWriter();
+      writer.allocate();
+
+      // set values [1,2], null, [], [5,6]
+      writeIntValues(writer, new int[] {1, 2});
+      // jump straight to slot 2 so that slot 1 is never written
+      writer.setPosition(2);
+      writeIntValues(writer, new int[] {});
+      writeIntValues(writer, new int[] {5, 6});
+      writer.setValueCount(4);
+
+      assertFalse(vector.isEmpty(0));
+      assertTrue(vector.isNull(1));
+      assertTrue(vector.isEmpty(1));
+      assertFalse(vector.isNull(2));
+      assertTrue(vector.isEmpty(2));
+      assertFalse(vector.isEmpty(3));
+    }
+  }
+
+  /**
+   * Writes {@code values} as one list: {@code startList()}, one
+   * {@code integer().writeInt(v)} per value, then {@code endList()}.
+   *
+   * @param writer the list writer to write through
+   * @param values the ints that make up the list; may be empty
+   */
+  private void writeIntValues(UnionListWriter writer, int[] values) {
+    writer.startList();
+    for (int v : values) {
+      writer.integer().writeInt(v);
+    }
+    writer.endList();
+  }
+}
diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java
new file mode 100644
index 000000000..9637021db
--- /dev/null
+++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java
@@ -0,0 +1,1113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.util.ArrayList; +import java.util.Map; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.impl.UnionMapReader; +import org.apache.arrow.vector.complex.impl.UnionMapWriter; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.JsonStringArrayList; +import org.apache.arrow.vector.util.TransferPair; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestMapVector { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + public T getResultKey(Map resultStruct) { + assertTrue(resultStruct.containsKey(MapVector.KEY_NAME)); + return resultStruct.get(MapVector.KEY_NAME); + } + + public T 
getResultValue(Map resultStruct) { + assertTrue(resultStruct.containsKey(MapVector.VALUE_NAME)); + return resultStruct.get(MapVector.VALUE_NAME); + } + + @Test + public void testBasicOperation() { + int count = 5; + try (MapVector mapVector = MapVector.empty("map", allocator, false)) { + mapVector.allocateNew(); + UnionMapWriter mapWriter = mapVector.getWriter(); + for (int i = 0; i < count; i++) { + mapWriter.startMap(); + for (int j = 0; j < i + 1; j++) { + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(j); + mapWriter.value().integer().writeInt(j); + mapWriter.endEntry(); + } + mapWriter.endMap(); + } + mapWriter.setValueCount(count); + UnionMapReader mapReader = mapVector.getReader(); + for (int i = 0; i < count; i++) { + mapReader.setPosition(i); + for (int j = 0; j < i + 1; j++) { + mapReader.next(); + assertEquals("record: " + i, j, mapReader.key().readLong().longValue()); + assertEquals(j, mapReader.value().readInteger().intValue()); + } + } + } + } + + @Test + public void testBasicOperationNulls() { + int count = 6; + try (MapVector mapVector = MapVector.empty("map", allocator, false)) { + mapVector.allocateNew(); + UnionMapWriter mapWriter = mapVector.getWriter(); + for (int i = 0; i < count; i++) { + // i == 1 is a NULL + if (i != 1) { + mapWriter.setPosition(i); + mapWriter.startMap(); + // i == 3 is an empty map + if (i != 3) { + for (int j = 0; j < i + 1; j++) { + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(j); + // i == 5 maps to a NULL value + if (i != 5) { + mapWriter.value().integer().writeInt(j); + } + mapWriter.endEntry(); + } + } + mapWriter.endMap(); + } + } + mapWriter.setValueCount(count); + UnionMapReader mapReader = mapVector.getReader(); + for (int i = 0; i < count; i++) { + mapReader.setPosition(i); + if (i == 1) { + assertFalse(mapReader.isSet()); + } else { + if (i == 3) { + JsonStringArrayList result = (JsonStringArrayList) mapReader.readObject(); + assertTrue(result.isEmpty()); + } else { + for (int 
j = 0; j < i + 1; j++) { + mapReader.next(); + assertEquals("record: " + i, j, mapReader.key().readLong().longValue()); + if (i == 5) { + assertFalse(mapReader.value().isSet()); + } else { + assertEquals(j, mapReader.value().readInteger().intValue()); + } + } + } + } + } + } + } + + @Test + public void testCopyFrom() throws Exception { + try (MapVector inVector = MapVector.empty("input", allocator, false); + MapVector outVector = MapVector.empty("output", allocator, false)) { + UnionMapWriter writer = inVector.getWriter(); + writer.allocate(); + + // populate input vector with the following records + // {1 -> 11, 2 -> 22, 3 -> 33} + // null + // {2 -> null} + writer.setPosition(0); // optional + writer.startMap(); + writer.startEntry(); + writer.key().bigInt().writeBigInt(1); + writer.value().bigInt().writeBigInt(11); + writer.endEntry(); + writer.startEntry(); + writer.key().bigInt().writeBigInt(2); + writer.value().bigInt().writeBigInt(22); + writer.endEntry(); + writer.startEntry(); + writer.key().bigInt().writeBigInt(3); + writer.value().bigInt().writeBigInt(33); + writer.endEntry(); + writer.endMap(); + + writer.setPosition(2); + writer.startMap(); + writer.startEntry(); + writer.key().bigInt().writeBigInt(2); + writer.endEntry(); + writer.endMap(); + + writer.setValueCount(3); + + // copy values from input to output + outVector.allocateNew(); + for (int i = 0; i < 3; i++) { + outVector.copyFrom(i, i, inVector); + } + outVector.setValueCount(3); + + // assert the output vector is correct + FieldReader reader = outVector.getReader(); + assertTrue("shouldn't be null", reader.isSet()); + reader.setPosition(1); + assertFalse("should be null", reader.isSet()); + reader.setPosition(2); + assertTrue("shouldn't be null", reader.isSet()); + + + /* index 0 */ + Object result = outVector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + Map resultStruct = (Map) resultSet.get(0); + assertEquals(1L, 
getResultKey(resultStruct)); + assertEquals(11L, getResultValue(resultStruct)); + resultStruct = (Map) resultSet.get(1); + assertEquals(2L, getResultKey(resultStruct)); + assertEquals(22L, getResultValue(resultStruct)); + resultStruct = (Map) resultSet.get(2); + assertEquals(3L, getResultKey(resultStruct)); + assertEquals(33L, getResultValue(resultStruct)); + + /* index 1 */ + result = outVector.getObject(1); + assertNull(result); + + /* index 2 */ + result = outVector.getObject(2); + resultSet = (ArrayList) result; + assertEquals(1, resultSet.size()); + resultStruct = (Map) resultSet.get(0); + assertEquals(2L, getResultKey(resultStruct)); + assertFalse(resultStruct.containsKey(MapVector.VALUE_NAME)); + } + } + + @Test + public void testSplitAndTransfer() throws Exception { + try (MapVector mapVector = MapVector.empty("sourceVector", allocator, false)) { + + /* Explicitly add the map child vectors */ + FieldType type = new FieldType(false, ArrowType.Struct.INSTANCE, null, null); + AddOrGetResult addResult = mapVector.addOrGetVector(type); + FieldType keyType = new FieldType(false, MinorType.BIGINT.getType(), null, null); + FieldType valueType = FieldType.nullable(MinorType.FLOAT8.getType()); + addResult.getVector().addOrGet(MapVector.KEY_NAME, keyType, BigIntVector.class); + addResult.getVector().addOrGet(MapVector.VALUE_NAME, valueType, Float8Vector.class); + + UnionMapWriter mapWriter = mapVector.getWriter(); + + /* allocate memory */ + mapWriter.allocate(); + + /* populate data */ + mapWriter.setPosition(0); + mapWriter.startMap(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(10); + mapWriter.value().float8().writeFloat8(1.0); + mapWriter.endEntry(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(11); + mapWriter.value().float8().writeFloat8(1.1); + mapWriter.endEntry(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(12); + mapWriter.value().float8().writeFloat8(1.2); + mapWriter.endEntry(); + 
mapWriter.endMap(); + + mapWriter.setPosition(1); + mapWriter.startMap(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(13); + mapWriter.value().float8().writeFloat8(1.3); + mapWriter.endEntry(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(14); + mapWriter.value().float8().writeFloat8(1.4); + mapWriter.endEntry(); + mapWriter.endMap(); + + mapWriter.setPosition(2); + mapWriter.startMap(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(15); + mapWriter.value().float8().writeFloat8(1.5); + mapWriter.endEntry(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(16); + mapWriter.value().float8().writeFloat8(1.6); + mapWriter.endEntry(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(17); + mapWriter.value().float8().writeFloat8(1.7); + mapWriter.endEntry(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(18); + mapWriter.value().float8().writeFloat8(1.8); + mapWriter.endEntry(); + mapWriter.endMap(); + + mapWriter.setPosition(3); + mapWriter.startMap(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(19); + mapWriter.value().float8().writeFloat8(1.9); + mapWriter.endEntry(); + mapWriter.endMap(); + + mapWriter.setPosition(4); + mapWriter.startMap(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(20); + mapWriter.value().float8().writeFloat8(2.0); + mapWriter.endEntry(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(21); + mapWriter.value().float8().writeFloat8(2.1); + mapWriter.endEntry(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(22); + mapWriter.value().float8().writeFloat8(2.2); + mapWriter.endEntry(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(23); + mapWriter.value().float8().writeFloat8(2.3); + mapWriter.endEntry(); + mapWriter.endMap(); + + mapVector.setValueCount(5); + + assertEquals(4, mapVector.getLastSet()); + + /* get offset buffer */ + final ArrowBuf 
offsetBuffer = mapVector.getOffsetBuffer(); + + /* get dataVector */ + StructVector dataVector = (StructVector) mapVector.getDataVector(); + + /* check the vector output */ + int index = 0; + int offset = 0; + Map result = null; + + /* index 0 */ + assertFalse(mapVector.isNull(index)); + offset = offsetBuffer.getInt(index * MapVector.OFFSET_WIDTH); + assertEquals(Integer.toString(0), Integer.toString(offset)); + + result = dataVector.getObject(offset); + assertEquals(10L, getResultKey(result)); + assertEquals(1.0, getResultValue(result)); + offset++; + result = dataVector.getObject(offset); + assertEquals(11L, getResultKey(result)); + assertEquals(1.1, getResultValue(result)); + offset++; + result = dataVector.getObject(offset); + assertEquals(12L, getResultKey(result)); + assertEquals(1.2, getResultValue(result)); + + /* index 1 */ + index++; + assertFalse(mapVector.isNull(index)); + offset = offsetBuffer.getInt(index * MapVector.OFFSET_WIDTH); + assertEquals(Integer.toString(3), Integer.toString(offset)); + + result = dataVector.getObject(offset); + assertEquals(13L, getResultKey(result)); + assertEquals(1.3, getResultValue(result)); + offset++; + result = dataVector.getObject(offset); + assertEquals(14L, getResultKey(result)); + assertEquals(1.4, getResultValue(result)); + + /* index 2 */ + index++; + assertFalse(mapVector.isNull(index)); + offset = offsetBuffer.getInt(index * MapVector.OFFSET_WIDTH); + assertEquals(Integer.toString(5), Integer.toString(offset)); + + result = dataVector.getObject(offset); + assertEquals(15L, getResultKey(result)); + assertEquals(1.5, getResultValue(result)); + offset++; + result = dataVector.getObject(offset); + assertEquals(16L, getResultKey(result)); + assertEquals(1.6, getResultValue(result)); + offset++; + result = dataVector.getObject(offset); + assertEquals(17L, getResultKey(result)); + assertEquals(1.7, getResultValue(result)); + offset++; + result = dataVector.getObject(offset); + assertEquals(18L, getResultKey(result)); 
+ assertEquals(1.8, getResultValue(result)); + + /* index 3 */ + index++; + assertFalse(mapVector.isNull(index)); + offset = offsetBuffer.getInt(index * MapVector.OFFSET_WIDTH); + assertEquals(Integer.toString(9), Integer.toString(offset)); + + result = dataVector.getObject(offset); + assertEquals(19L, getResultKey(result)); + assertEquals(1.9, getResultValue(result)); + + /* index 4 */ + index++; + assertFalse(mapVector.isNull(index)); + offset = offsetBuffer.getInt(index * MapVector.OFFSET_WIDTH); + assertEquals(Integer.toString(10), Integer.toString(offset)); + + result = dataVector.getObject(offset); + assertEquals(20L, getResultKey(result)); + assertEquals(2.0, getResultValue(result)); + offset++; + result = dataVector.getObject(offset); + assertEquals(21L, getResultKey(result)); + assertEquals(2.1, getResultValue(result)); + offset++; + result = dataVector.getObject(offset); + assertEquals(22L, getResultKey(result)); + assertEquals(2.2, getResultValue(result)); + offset++; + result = dataVector.getObject(offset); + assertEquals(23L, getResultKey(result)); + assertEquals(2.3, getResultValue(result)); + + /* index 5 */ + index++; + assertTrue(mapVector.isNull(index)); + offset = offsetBuffer.getInt(index * MapVector.OFFSET_WIDTH); + assertEquals(Integer.toString(14), Integer.toString(offset)); + + /* do split and transfer */ + try (MapVector toVector = MapVector.empty("toVector", allocator, false)) { + + TransferPair transferPair = mapVector.makeTransferPair(toVector); + + int[][] transferLengths = {{0, 2}, {3, 1}, {4, 1}}; + + for (final int[] transferLength : transferLengths) { + int start = transferLength[0]; + int splitLength = transferLength[1]; + + int dataLength1 = 0; + int dataLength2 = 0; + + int offset1 = 0; + int offset2 = 0; + + transferPair.splitAndTransfer(start, splitLength); + + /* get offsetBuffer of toVector */ + final ArrowBuf toOffsetBuffer = toVector.getOffsetBuffer(); + + /* get dataVector of toVector */ + StructVector dataVector1 = 
(StructVector) toVector.getDataVector(); + + for (int i = 0; i < splitLength; i++) { + dataLength1 = offsetBuffer.getInt((start + i + 1) * MapVector.OFFSET_WIDTH) - + offsetBuffer.getInt((start + i) * MapVector.OFFSET_WIDTH); + dataLength2 = toOffsetBuffer.getInt((i + 1) * MapVector.OFFSET_WIDTH) - + toOffsetBuffer.getInt(i * MapVector.OFFSET_WIDTH); + + assertEquals("Different data lengths at index: " + i + " and start: " + start, + dataLength1, dataLength2); + + offset1 = offsetBuffer.getInt((start + i) * MapVector.OFFSET_WIDTH); + offset2 = toOffsetBuffer.getInt(i * MapVector.OFFSET_WIDTH); + + for (int j = 0; j < dataLength1; j++) { + assertEquals("Different data at indexes: " + offset1 + " and " + offset2, + dataVector.getObject(offset1), dataVector1.getObject(offset2)); + + offset1++; + offset2++; + } + } + } + } + } + } + + @Test + public void testMapWithListValue() throws Exception { + try (MapVector mapVector = MapVector.empty("sourceVector", allocator, false)) { + + UnionMapWriter mapWriter = mapVector.getWriter(); + ListWriter valueWriter; + + /* allocate memory */ + mapWriter.allocate(); + + /* the dataVector that backs a listVector will also be a + * listVector for this test. 
+ */ + + /* write one or more maps index 0 */ + mapWriter.setPosition(0); + mapWriter.startMap(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(1); + valueWriter = mapWriter.value().list(); + valueWriter.startList(); + valueWriter.bigInt().writeBigInt(50); + valueWriter.bigInt().writeBigInt(100); + valueWriter.bigInt().writeBigInt(200); + valueWriter.endList(); + mapWriter.endEntry(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(2); + valueWriter = mapWriter.value().list(); + valueWriter.startList(); + valueWriter.bigInt().writeBigInt(75); + valueWriter.bigInt().writeBigInt(125); + valueWriter.bigInt().writeBigInt(150); + valueWriter.bigInt().writeBigInt(175); + valueWriter.endList(); + mapWriter.endEntry(); + + mapWriter.endMap(); + + /* write one or more maps at index 1 */ + mapWriter.setPosition(1); + mapWriter.startMap(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(3); + valueWriter = mapWriter.value().list(); + valueWriter.startList(); + valueWriter.bigInt().writeBigInt(10); + valueWriter.endList(); + mapWriter.endEntry(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(4); + valueWriter = mapWriter.value().list(); + valueWriter.startList(); + valueWriter.bigInt().writeBigInt(15); + valueWriter.bigInt().writeBigInt(20); + valueWriter.endList(); + mapWriter.endEntry(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(5); + valueWriter = mapWriter.value().list(); + valueWriter.startList(); + valueWriter.bigInt().writeBigInt(25); + valueWriter.bigInt().writeBigInt(30); + valueWriter.bigInt().writeBigInt(35); + valueWriter.endList(); + mapWriter.endEntry(); + + mapWriter.endMap(); + + assertEquals(1, mapVector.getLastSet()); + + mapWriter.setValueCount(2); + + assertEquals(2, mapVector.getValueCount()); + + // Get mapVector element at index 0 + Object result = mapVector.getObject(0); + ArrayList resultSet = (ArrayList) result; + + // 2 map entries at index 0 + 
assertEquals(2, resultSet.size()); + + // First Map entry + Map resultStruct = (Map) resultSet.get(0); + assertEquals(1L, getResultKey(resultStruct)); + ArrayList list = (ArrayList) getResultValue(resultStruct); + assertEquals(3, list.size()); // value is a list with 3 elements + assertEquals(new Long(50), list.get(0)); + assertEquals(new Long(100), list.get(1)); + assertEquals(new Long(200), list.get(2)); + + // Second Map entry + resultStruct = (Map) resultSet.get(1); + list = (ArrayList) getResultValue(resultStruct); + assertEquals(4, list.size()); // value is a list with 4 elements + assertEquals(new Long(75), list.get(0)); + assertEquals(new Long(125), list.get(1)); + assertEquals(new Long(150), list.get(2)); + assertEquals(new Long(175), list.get(3)); + + // Get mapVector element at index 1 + result = mapVector.getObject(1); + resultSet = (ArrayList) result; + + // First Map entry + resultStruct = (Map) resultSet.get(0); + assertEquals(3L, getResultKey(resultStruct)); + list = (ArrayList) getResultValue(resultStruct); + assertEquals(1, list.size()); // value is a list with 1 element + assertEquals(new Long(10), list.get(0)); + + // Second Map entry + resultStruct = (Map) resultSet.get(1); + assertEquals(4L, getResultKey(resultStruct)); + list = (ArrayList) getResultValue(resultStruct); + assertEquals(2, list.size()); // value is a list with 2 elements + assertEquals(new Long(15), list.get(0)); + assertEquals(new Long(20), list.get(1)); + + // Third Map entry + resultStruct = (Map) resultSet.get(2); + assertEquals(5L, getResultKey(resultStruct)); + list = (ArrayList) getResultValue(resultStruct); + assertEquals(3, list.size()); // value is a list with 3 elements + assertEquals(new Long(25), list.get(0)); + assertEquals(new Long(30), list.get(1)); + assertEquals(new Long(35), list.get(2)); + + /* check underlying bitVector */ + assertFalse(mapVector.isNull(0)); + assertFalse(mapVector.isNull(1)); + + /* check underlying offsets */ + final ArrowBuf offsetBuffer = 
mapVector.getOffsetBuffer(); + + /* mapVector has 2 entries at index 0 and 3 entries at index 1 */ + assertEquals(0, offsetBuffer.getInt(0 * MapVector.OFFSET_WIDTH)); + assertEquals(2, offsetBuffer.getInt(1 * MapVector.OFFSET_WIDTH)); + assertEquals(5, offsetBuffer.getInt(2 * MapVector.OFFSET_WIDTH)); + } + } + + @Test + public void testMapWithMapValue() throws Exception { + try (MapVector mapVector = MapVector.empty("sourceVector", allocator, false)) { + + UnionMapWriter mapWriter = mapVector.getWriter(); + MapWriter valueWriter; + + // we are essentially writing Map> + // populate map vector with the following three records + // [ + // null, + // [1:[50: 100, 200:400], 2:[75: 175, 150: 250]], + // [3:[10: 20], 4:[15: 20], 5:[25: 30, 35: null]] + // ] + + /* write null at index 0 */ + mapWriter.setPosition(0); + mapWriter.writeNull(); + + /* write one or more maps at index 1 */ + mapWriter.setPosition(1); + mapWriter.startMap(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(1); + valueWriter = mapWriter.value().map(false); + valueWriter.startMap(); + writeEntry(valueWriter, 50, 100L); + writeEntry(valueWriter, 200, 400L); + valueWriter.endMap(); + mapWriter.endEntry(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(2); + valueWriter = mapWriter.value().map(false); + valueWriter.startMap(); + writeEntry(valueWriter, 75, 175L); + writeEntry(valueWriter, 150, 250L); + valueWriter.endMap(); + mapWriter.endEntry(); + + mapWriter.endMap(); + + /* write one or more maps at index 2 */ + mapWriter.setPosition(2); + mapWriter.startMap(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(3); + valueWriter = mapWriter.value().map(true); + valueWriter.startMap(); + writeEntry(valueWriter, 10, 20L); + valueWriter.endMap(); + mapWriter.endEntry(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(4); + valueWriter = mapWriter.value().map(false); + valueWriter.startMap(); + writeEntry(valueWriter, 15, 
20L); + valueWriter.endMap(); + mapWriter.endEntry(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(5); + valueWriter = mapWriter.value().map(false); + valueWriter.startMap(); + writeEntry(valueWriter, 25, 30L); + writeEntry(valueWriter, 35, (Long) null); + valueWriter.endMap(); + mapWriter.endEntry(); + + mapWriter.endMap(); + + assertEquals(2, mapVector.getLastSet()); + + mapWriter.setValueCount(3); + + assertEquals(3, mapVector.getValueCount()); + + // Get mapVector element at index 0 + Object result = mapVector.getObject(0); + assertNull(result); + + // Get mapVector element at index 1 + result = mapVector.getObject(1); + ArrayList resultSet = (ArrayList) result; + + // 2 map entries at index 1 + assertEquals(2, resultSet.size()); + + // First Map entry + Map resultStruct = (Map) resultSet.get(0); + assertEquals(1L, getResultKey(resultStruct)); + ArrayList> list = (ArrayList>) getResultValue(resultStruct); + assertEquals(2, list.size()); // value is a list of two maps + Map innerMap = list.get(0); + assertEquals(50L, getResultKey(innerMap)); + assertEquals(100L, getResultValue(innerMap)); + innerMap = list.get(1); + assertEquals(200L, getResultKey(innerMap)); + assertEquals(400L, getResultValue(innerMap)); + + // Second Map entry + resultStruct = (Map) resultSet.get(1); + assertEquals(2L, getResultKey(resultStruct)); + list = (ArrayList>) getResultValue(resultStruct); + assertEquals(2, list.size()); // value is a list of two maps + innerMap = list.get(0); + assertEquals(75L, getResultKey(innerMap)); + assertEquals(175L, getResultValue(innerMap)); + innerMap = list.get(1); + assertEquals(150L, getResultKey(innerMap)); + assertEquals(250L, getResultValue(innerMap)); + + // Get mapVector element at index 2 + result = mapVector.getObject(2); + resultSet = (ArrayList) result; + + // 3 map entries at index 2 + assertEquals(3, resultSet.size()); + + // First Map entry + resultStruct = (Map) resultSet.get(0); + assertEquals(3L, 
getResultKey(resultStruct)); + list = (ArrayList>) getResultValue(resultStruct); + assertEquals(1, list.size()); // value is a list of maps with 1 element + innerMap = list.get(0); + assertEquals(10L, getResultKey(innerMap)); + assertEquals(20L, getResultValue(innerMap)); + + // Second Map entry + resultStruct = (Map) resultSet.get(1); + assertEquals(4L, getResultKey(resultStruct)); + list = (ArrayList>) getResultValue(resultStruct); + assertEquals(1, list.size()); // value is a list of maps with 1 element + innerMap = list.get(0); + assertEquals(15L, getResultKey(innerMap)); + assertEquals(20L, getResultValue(innerMap)); + + // Third Map entry + resultStruct = (Map) resultSet.get(2); + assertEquals(5L, getResultKey(resultStruct)); + list = (ArrayList>) getResultValue(resultStruct); + assertEquals(2, list.size()); // value is a list of maps with 2 elements + innerMap = list.get(0); + assertEquals(25L, getResultKey(innerMap)); + assertEquals(30L, getResultValue(innerMap)); + innerMap = list.get(1); + assertEquals(35L, getResultKey(innerMap)); + assertNull(innerMap.get(MapVector.VALUE_NAME)); + + /* check underlying bitVector */ + assertTrue(mapVector.isNull(0)); + assertFalse(mapVector.isNull(1)); + assertFalse(mapVector.isNull(2)); + + /* check underlying offsets */ + final ArrowBuf offsetBuffer = mapVector.getOffsetBuffer(); + + /* mapVector has 0 entries at index 0, 2 entries at index 1, and 3 entries at index 2 */ + assertEquals(0, offsetBuffer.getInt(0 * MapVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(1 * MapVector.OFFSET_WIDTH)); + assertEquals(2, offsetBuffer.getInt(2 * MapVector.OFFSET_WIDTH)); + assertEquals(5, offsetBuffer.getInt(3 * MapVector.OFFSET_WIDTH)); + } + } + + @Test + public void testMapWithMapKeyAndMapValue() throws Exception { + try (MapVector mapVector = MapVector.empty("sourceVector", allocator, false)) { + + UnionMapWriter mapWriter = mapVector.getWriter(); + MapWriter keyWriter; + MapWriter valueWriter; + + // we are 
essentially writing Map, Map> + // populate map vector with the following two records + // [ + // [[5: 10, 20: 40]:[50: 100, 200: 400], [50: 100]:[75: 175, 150: 250]], + // [[1: 2]:[10: 20], [30: 40]:[15: 20], [50: 60, 70: null]:[25: 30, 35: null], [5: null]: null] + // ] + + mapWriter.setPosition(0); + mapWriter.startMap(); + + mapWriter.startEntry(); + keyWriter = mapWriter.key().map(false); + keyWriter.startMap(); + writeEntry(keyWriter, 5, 10); + writeEntry(keyWriter, 20, 40); + keyWriter.endMap(); + valueWriter = mapWriter.value().map(false); + valueWriter.startMap(); + writeEntry(valueWriter, 50, 100L); + writeEntry(valueWriter, 200, 400L); + valueWriter.endMap(); + mapWriter.endEntry(); + + mapWriter.startEntry(); + keyWriter = mapWriter.key().map(false); + keyWriter.startMap(); + writeEntry(keyWriter, 50, 100); + keyWriter.endMap(); + valueWriter = mapWriter.value().map(false); + valueWriter.startMap(); + writeEntry(valueWriter, 75, 175L); + writeEntry(valueWriter, 150, 250L); + valueWriter.endMap(); + mapWriter.endEntry(); + + mapWriter.endMap(); + + /* write one or more maps at index 1 */ + mapWriter.setPosition(1); + mapWriter.startMap(); + + mapWriter.startEntry(); + keyWriter = mapWriter.key().map(false); + keyWriter.startMap(); + writeEntry(keyWriter, 1, 2); + keyWriter.endMap(); + valueWriter = mapWriter.value().map(true); + valueWriter.startMap(); + writeEntry(valueWriter, 10, 20L); + valueWriter.endMap(); + mapWriter.endEntry(); + + mapWriter.startEntry(); + keyWriter = mapWriter.key().map(false); + keyWriter.startMap(); + writeEntry(keyWriter, 30, 40); + keyWriter.endMap(); + valueWriter = mapWriter.value().map(false); + valueWriter.startMap(); + writeEntry(valueWriter, 15, 20L); + valueWriter.endMap(); + mapWriter.endEntry(); + + mapWriter.startEntry(); + keyWriter = mapWriter.key().map(false); + keyWriter.startMap(); + writeEntry(keyWriter, 50, 60); + writeEntry(keyWriter, 70, (Integer) null); + keyWriter.endMap(); + valueWriter = 
mapWriter.value().map(false); + valueWriter.startMap(); + writeEntry(valueWriter, 25, 30L); + writeEntry(valueWriter, 35, (Long) null); + valueWriter.endMap(); + mapWriter.endEntry(); + + mapWriter.startEntry(); + keyWriter = mapWriter.key().map(false); + keyWriter.startMap(); + writeEntry(keyWriter, 5, (Integer) null); + keyWriter.endMap(); + valueWriter = mapWriter.value().map(false); + valueWriter.writeNull(); + mapWriter.endEntry(); + + mapWriter.endMap(); + + assertEquals(1, mapVector.getLastSet()); + + mapWriter.setValueCount(2); + + assertEquals(2, mapVector.getValueCount()); + + // Get mapVector element at index 0 + Object result = mapVector.getObject(0); + ArrayList resultSet = (ArrayList) result; + + // 2 map entries at index 0 + assertEquals(2, resultSet.size()); + + // First Map entry + Map>> resultStruct = (Map>>) resultSet.get(0); + ArrayList> list = getResultKey(resultStruct); + assertEquals(2, list.size()); // key is a list of 2 two maps + Map innerMap = list.get(0); + assertEquals(5, getResultKey(innerMap)); + assertEquals(10, getResultValue(innerMap)); + innerMap = list.get(1); + assertEquals(20, getResultKey(innerMap)); + assertEquals(40, getResultValue(innerMap)); + + list = getResultValue(resultStruct); + assertEquals(2, list.size()); // value is a list of 2 two maps + innerMap = list.get(0); + assertEquals(50L, getResultKey(innerMap)); + assertEquals(100L, getResultValue(innerMap)); + innerMap = list.get(1); + assertEquals(200L, getResultKey(innerMap)); + assertEquals(400L, getResultValue(innerMap)); + + // Second Map entry + resultStruct = (Map>>) resultSet.get(1); + list = getResultKey(resultStruct); + assertEquals(1, list.size()); // key is a list of 1 two map + innerMap = list.get(0); + assertEquals(50, getResultKey(innerMap)); + assertEquals(100, getResultValue(innerMap)); + + list = getResultValue(resultStruct); + assertEquals(2, list.size()); // value is a list of two maps + innerMap = list.get(0); + assertEquals(75L, 
getResultKey(innerMap)); + assertEquals(175L, getResultValue(innerMap)); + innerMap = list.get(1); + assertEquals(150L, getResultKey(innerMap)); + assertEquals(250L, getResultValue(innerMap)); + + // Get mapVector element at index 1 + result = mapVector.getObject(1); + resultSet = (ArrayList) result; + + // 4 map entries at index 1 + assertEquals(4, resultSet.size()); + + // First Map entry + resultStruct = (Map>>) resultSet.get(0); + list = getResultKey(resultStruct); + assertEquals(1, list.size()); // key is a list of 1 map + innerMap = list.get(0); + assertEquals(1, getResultKey(innerMap)); + assertEquals(2, getResultValue(innerMap)); + + list = getResultValue(resultStruct); + assertEquals(1, list.size()); // value is a list of maps with 1 element + innerMap = list.get(0); + assertEquals(10L, getResultKey(innerMap)); + assertEquals(20L, getResultValue(innerMap)); + + // Second Map entry + resultStruct = (Map>>) resultSet.get(1); + list = getResultKey(resultStruct); + assertEquals(1, list.size()); // key is a list of 1 map + innerMap = list.get(0); + assertEquals(30, getResultKey(innerMap)); + assertEquals(40, getResultValue(innerMap)); + + list = getResultValue(resultStruct); + assertEquals(1, list.size()); // value is a list of maps with 1 element + innerMap = list.get(0); + assertEquals(15L, getResultKey(innerMap)); + assertEquals(20L, getResultValue(innerMap)); + + // Third Map entry + resultStruct = (Map>>) resultSet.get(2); + list = getResultKey(resultStruct); + assertEquals(2, list.size()); // key is a list of two maps + innerMap = list.get(0); + assertEquals(50, getResultKey(innerMap)); + assertEquals(60, getResultValue(innerMap)); + innerMap = list.get(1); + assertEquals(70, getResultKey(innerMap)); + assertNull(innerMap.get(MapVector.VALUE_NAME)); + + list = getResultValue(resultStruct); + assertEquals(2, list.size()); // value is a list of maps with 2 elements + innerMap = list.get(0); + assertEquals(25L, getResultKey(innerMap)); + assertEquals(30L, 
getResultValue(innerMap)); + innerMap = list.get(1); + assertEquals(35L, getResultKey(innerMap)); + assertNull(innerMap.get(MapVector.VALUE_NAME)); + + // Fourth Map entry + resultStruct = (Map>>) resultSet.get(3); + list = getResultKey(resultStruct); + assertEquals(1, list.size()); // key is a list of two maps + innerMap = list.get(0); + assertEquals(5, getResultKey(innerMap)); + assertNull(innerMap.get(MapVector.VALUE_NAME)); + + assertNull(resultStruct.get(MapVector.VALUE_NAME)); + + /* check underlying bitVector */ + assertFalse(mapVector.isNull(0)); + assertFalse(mapVector.isNull(1)); + + /* check underlying offsets */ + final ArrowBuf offsetBuffer = mapVector.getOffsetBuffer(); + + /* mapVector has 2 entries at index 0 and 4 entries at index 1 */ + assertEquals(0, offsetBuffer.getInt(0 * MapVector.OFFSET_WIDTH)); + assertEquals(2, offsetBuffer.getInt(1 * MapVector.OFFSET_WIDTH)); + assertEquals(6, offsetBuffer.getInt(2 * MapVector.OFFSET_WIDTH)); + } + } + + private void writeEntry(MapWriter writer, long key, Long value) { + writer.startEntry(); + writer.key().bigInt().writeBigInt(key); + if (value != null) { + writer.value().bigInt().writeBigInt(value); + } + writer.endEntry(); + } + + private void writeEntry(MapWriter writer, int key, Integer value) { + writer.startEntry(); + writer.key().integer().writeInt(key); + if (value != null) { + writer.value().integer().writeInt(value); + } + writer.endEntry(); + } + + @Test + public void testClearAndReuse() { + try (final MapVector vector = MapVector.empty("map", allocator, false)) { + vector.allocateNew(); + UnionMapWriter mapWriter = vector.getWriter(); + + mapWriter.startMap(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(1); + mapWriter.value().integer().writeInt(11); + mapWriter.endEntry(); + mapWriter.endMap(); + + mapWriter.startMap(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(2); + mapWriter.value().integer().writeInt(22); + mapWriter.endEntry(); + 
mapWriter.endMap(); + + mapWriter.setValueCount(2); + + Object result = vector.getObject(0); + ArrayList resultSet = (ArrayList) result; + Map resultStruct = (Map) resultSet.get(0); + assertEquals(1L, getResultKey(resultStruct)); + assertEquals(11, getResultValue(resultStruct)); + + result = vector.getObject(1); + resultSet = (ArrayList) result; + resultStruct = (Map) resultSet.get(0); + assertEquals(2L, getResultKey(resultStruct)); + assertEquals(22, getResultValue(resultStruct)); + + // Clear and release the buffers to trigger a realloc when adding next value + vector.clear(); + mapWriter = new UnionMapWriter(vector); + + // The map vector should reuse a buffer when reallocating the offset buffer + mapWriter.startMap(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(3); + mapWriter.value().integer().writeInt(33); + mapWriter.endEntry(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(4); + mapWriter.value().integer().writeInt(44); + mapWriter.endEntry(); + mapWriter.endMap(); + + mapWriter.startMap(); + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(5); + mapWriter.value().integer().writeInt(55); + mapWriter.endEntry(); + mapWriter.endMap(); + + mapWriter.setValueCount(2); + + result = vector.getObject(0); + resultSet = (ArrayList) result; + resultStruct = (Map) resultSet.get(0); + assertEquals(3L, getResultKey(resultStruct)); + assertEquals(33, getResultValue(resultStruct)); + resultStruct = (Map) resultSet.get(1); + assertEquals(4L, getResultKey(resultStruct)); + assertEquals(44, getResultValue(resultStruct)); + + result = vector.getObject(1); + resultSet = (ArrayList) result; + resultStruct = (Map) resultSet.get(0); + assertEquals(5L, getResultKey(resultStruct)); + assertEquals(55, getResultValue(resultStruct)); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestNullCheckingForGet.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestNullCheckingForGet.java 
new file mode 100644 index 000000000..f1345e88a --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestNullCheckingForGet.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import java.lang.reflect.Field; +import java.net.URLClassLoader; + +import org.junit.Assert; +import org.junit.Test; + +/** + * Test cases for {@link NullCheckingForGet}. + */ +public class TestNullCheckingForGet { + + /** + * Get a copy of the current class loader. + * @return the newly created class loader. + */ + private ClassLoader copyClassLoader() { + ClassLoader curClassLoader = this.getClass().getClassLoader(); + if (curClassLoader instanceof URLClassLoader) { + // for Java 1.8 + return new URLClassLoader(((URLClassLoader) curClassLoader).getURLs(), null); + } + + // for Java 1.9 and Java 11. + return null; + } + + /** + * Get the value of flag {@link NullCheckingForGet#NULL_CHECKING_ENABLED}. + * @param classLoader the class loader from which to get the flag value. + * @return value of the flag. 
+ */ + private boolean getFlagValue(ClassLoader classLoader) throws Exception { + Class clazz = classLoader.loadClass("org.apache.arrow.vector.NullCheckingForGet"); + Field field = clazz.getField("NULL_CHECKING_ENABLED"); + return (Boolean) field.get(null); + } + + /** + * Ensure the flag for null checking is enabled by default. + * This will protect users from JVM crashes. + */ + @Test + public void testDefaultValue() throws Exception { + ClassLoader classLoader = copyClassLoader(); + if (classLoader != null) { + boolean nullCheckingEnabled = getFlagValue(classLoader); + Assert.assertTrue(nullCheckingEnabled); + } + } + + /** + * Test setting the null checking flag by the system property. + * @throws Exception if loading class {@link NullCheckingForGet#NULL_CHECKING_ENABLED} fails. + */ + @Test + public void testEnableSysProperty() throws Exception { + String sysProperty = System.getProperty("arrow.enable_null_check_for_get"); + System.setProperty("arrow.enable_null_check_for_get", "false"); + + ClassLoader classLoader = copyClassLoader(); + if (classLoader != null) { + boolean nullCheckingEnabled = getFlagValue(classLoader); + Assert.assertFalse(nullCheckingEnabled); + } + + // restore system property + if (sysProperty != null) { + System.setProperty("arrow.enable_null_check_for_get", sysProperty); + } else { + System.clearProperty("arrow.enable_null_check_for_get"); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestOutOfMemoryForValueVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestOutOfMemoryForValueVector.java new file mode 100644 index 000000000..7f26b5c1b --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestOutOfMemoryForValueVector.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.RootAllocator; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * This class tests cases where we expect to receive {@link OutOfMemoryException}. 
+ */ +public class TestOutOfMemoryForValueVector { + + private static final String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(200); // Start with low memory limit + } + + @Test(expected = OutOfMemoryException.class) + public void variableWidthVectorAllocateNew() { + try (VarCharVector vector = new VarCharVector(EMPTY_SCHEMA_PATH, allocator)) { + vector.allocateNew(); + } + } + + @Test(expected = OutOfMemoryException.class) + public void variableWidthVectorAllocateNewCustom() { + try (VarCharVector vector = new VarCharVector(EMPTY_SCHEMA_PATH, allocator)) { + vector.allocateNew(2342, 234); + } + } + + @Test(expected = OutOfMemoryException.class) + public void fixedWidthVectorAllocateNew() { + try (IntVector vector = new IntVector(EMPTY_SCHEMA_PATH, allocator)) { + vector.allocateNew(); + } + } + + @Test(expected = OutOfMemoryException.class) + public void fixedWidthVectorAllocateNewCustom() { + try (IntVector vector = new IntVector(EMPTY_SCHEMA_PATH, allocator)) { + vector.allocateNew(2342); + } + } + + @After + public void terminate() { + allocator.close(); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java new file mode 100644 index 000000000..23414e9f5 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * This class tests that OversizedAllocationException occurs when a large memory is allocated for a vector. + * Typically, arrow allows the allocation of the size of at most Integer.MAX_VALUE, but this might cause OOM in tests. + * Thus, the max allocation size is limited to 1 KB in this class. Please see the surefire option in pom.xml. 
+ */ +public class TestOversizedAllocationForValueVector { + + private static final String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test(expected = OversizedAllocationException.class) + public void testFixedVectorReallocation() { + final UInt4Vector vector = new UInt4Vector(EMPTY_SCHEMA_PATH, allocator); + // edge case 1: buffer size = max value capacity + final int expectedValueCapacity = checkedCastToInt(BaseValueVector.MAX_ALLOCATION_SIZE / 4); + try { + vector.allocateNew(expectedValueCapacity); + assertEquals(expectedValueCapacity, vector.getValueCapacity()); + vector.reAlloc(); + assertEquals(expectedValueCapacity * 2, vector.getValueCapacity()); + } finally { + vector.close(); + } + + // common case: value count < max value capacity + try { + vector.allocateNew(checkedCastToInt(BaseValueVector.MAX_ALLOCATION_SIZE / 8)); + vector.reAlloc(); // value allocation reaches to MAX_VALUE_ALLOCATION + vector.reAlloc(); // this should throw an IOOB + } finally { + vector.close(); + } + } + + @Test(expected = OversizedAllocationException.class) + public void testBitVectorReallocation() { + final BitVector vector = new BitVector(EMPTY_SCHEMA_PATH, allocator); + // edge case 1: buffer size ~ max value capacity + final int expectedValueCapacity = 1 << 29; + try { + vector.allocateNew(expectedValueCapacity); + assertEquals(expectedValueCapacity, vector.getValueCapacity()); + vector.reAlloc(); + assertEquals(expectedValueCapacity * 2, vector.getValueCapacity()); + } finally { + vector.close(); + } + + // common: value count < MAX_VALUE_ALLOCATION + try { + vector.allocateNew(expectedValueCapacity); + for (int i = 0; i < 3; i++) { + vector.reAlloc(); // expand buffer size + } + assertEquals(Integer.MAX_VALUE, vector.getValueCapacity()); + vector.reAlloc(); // buffer size ~ max 
allocation + assertEquals(Integer.MAX_VALUE, vector.getValueCapacity()); + vector.reAlloc(); // overflow + } finally { + vector.close(); + } + } + + + @Test(expected = OversizedAllocationException.class) + public void testVariableVectorReallocation() { + final VarCharVector vector = new VarCharVector(EMPTY_SCHEMA_PATH, allocator); + // edge case 1: value count = MAX_VALUE_ALLOCATION + final long expectedAllocationInBytes = BaseValueVector.MAX_ALLOCATION_SIZE; + final int expectedOffsetSize = 10; + try { + vector.allocateNew(expectedAllocationInBytes, 10); + assertTrue(expectedOffsetSize <= vector.getValueCapacity()); + assertTrue(expectedAllocationInBytes <= vector.getDataBuffer().capacity()); + vector.reAlloc(); + assertTrue(expectedOffsetSize * 2 <= vector.getValueCapacity()); + assertTrue(expectedAllocationInBytes * 2 <= vector.getDataBuffer().capacity()); + } finally { + vector.close(); + } + + // common: value count < MAX_VALUE_ALLOCATION + try { + vector.allocateNew(BaseValueVector.MAX_ALLOCATION_SIZE / 2, 0); + vector.reAlloc(); // value allocation reaches to MAX_VALUE_ALLOCATION + vector.reAlloc(); // this tests if it overflows + } finally { + vector.close(); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestPeriodDuration.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestPeriodDuration.java new file mode 100644 index 000000000..c8965dec3 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestPeriodDuration.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; + +import java.time.Duration; +import java.time.Period; + +import org.junit.Test; + +public class TestPeriodDuration { + + @Test + public void testBasics() { + PeriodDuration pd1 = new PeriodDuration(Period.of(1, 2, 3), Duration.ofNanos(123)); + PeriodDuration pdEq1 = new PeriodDuration(Period.of(1, 2, 3), Duration.ofNanos(123)); + PeriodDuration pd2 = new PeriodDuration(Period.of(1, 2, 3), Duration.ofNanos(12)); + PeriodDuration pd3 = new PeriodDuration(Period.of(-1, -2, -3), Duration.ofNanos(-123)); + + assertEquals(pd1, pdEq1); + assertEquals(pd1.hashCode(), pdEq1.hashCode()); + + assertNotEquals(pd1, pd2); + assertNotEquals(pd1.hashCode(), pd2.hashCode()); + assertNotEquals(pd1, pd3); + assertNotEquals(pd1.hashCode(), pd3.hashCode()); + } + +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java new file mode 100644 index 000000000..e60b87e60 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java @@ -0,0 +1,410 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.jupiter.api.Assertions; + +public class TestSplitAndTransfer { + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + private void populateVarcharVector(final VarCharVector vector, int valueCount, String[] compareArray) { + for (int i = 0; i < valueCount; i += 3) { + final String s = String.format("%010d", i); + vector.set(i, s.getBytes()); + if (compareArray != null) { + compareArray[i] = s; + } 
+ } + vector.setValueCount(valueCount); + } + + @Test /* VarCharVector */ + public void test() throws Exception { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { + varCharVector.allocateNew(10000, 1000); + + final int valueCount = 500; + final String[] compareArray = new String[valueCount]; + + populateVarcharVector(varCharVector, valueCount, compareArray); + + final TransferPair tp = varCharVector.getTransferPair(allocator); + final VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); + final int[][] startLengths = {{0, 201}, {201, 0}, {201, 200}, {401, 99}}; + + for (final int[] startLength : startLengths) { + final int start = startLength[0]; + final int length = startLength[1]; + tp.splitAndTransfer(start, length); + for (int i = 0; i < length; i++) { + final boolean expectedSet = ((start + i) % 3) == 0; + if (expectedSet) { + final byte[] expectedValue = compareArray[start + i].getBytes(); + assertFalse(newVarCharVector.isNull(i)); + assertArrayEquals(expectedValue, newVarCharVector.get(i)); + } else { + assertTrue(newVarCharVector.isNull(i)); + } + } + newVarCharVector.clear(); + } + } + } + + @Test + public void testMemoryConstrainedTransfer() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { + allocator.setLimit(32768); /* set limit of 32KB */ + + varCharVector.allocateNew(10000, 1000); + + final int valueCount = 1000; + + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.getTransferPair(allocator); + final VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); + final int[][] startLengths = {{0, 700}, {700, 299}}; + + for (final int[] startLength : startLengths) { + final int start = startLength[0]; + final int length = startLength[1]; + tp.splitAndTransfer(start, length); + newVarCharVector.clear(); + } + } + } + + @Test + public void testTransfer() { + try (final VarCharVector varCharVector = new 
VarCharVector("myvector", allocator)) { + varCharVector.allocateNew(10000, 1000); + + final int valueCount = 500; + final String[] compareArray = new String[valueCount]; + populateVarcharVector(varCharVector, valueCount, compareArray); + + final TransferPair tp = varCharVector.getTransferPair(allocator); + final VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); + tp.transfer(); + + assertEquals(0, varCharVector.valueCount); + assertEquals(valueCount, newVarCharVector.valueCount); + + for (int i = 0; i < valueCount; i++) { + final boolean expectedSet = (i % 3) == 0; + if (expectedSet) { + final byte[] expectedValue = compareArray[i].getBytes(); + assertFalse(newVarCharVector.isNull(i)); + assertArrayEquals(expectedValue, newVarCharVector.get(i)); + } else { + assertTrue(newVarCharVector.isNull(i)); + } + } + + newVarCharVector.clear(); + } + } + + @Test + public void testCopyValueSafe() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator); + final VarCharVector newVarCharVector = new VarCharVector("newvector", allocator)) { + varCharVector.allocateNew(10000, 1000); + + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.makeTransferPair(newVarCharVector); + + // new vector memory is not pre-allocated, we expect copyValueSafe work fine. 
+ for (int i = 0; i < valueCount; i++) { + tp.copyValueSafe(i, i); + } + newVarCharVector.setValueCount(valueCount); + + for (int i = 0; i < valueCount; i++) { + final boolean expectedSet = (i % 3) == 0; + if (expectedSet) { + assertFalse(varCharVector.isNull(i)); + assertFalse(newVarCharVector.isNull(i)); + assertArrayEquals(varCharVector.get(i), newVarCharVector.get(i)); + } else { + assertTrue(newVarCharVector.isNull(i)); + } + } + } + } + + @Test + public void testSplitAndTransferNon() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.getTransferPair(allocator); + VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); + + tp.splitAndTransfer(0, 0); + assertEquals(0, newVarCharVector.getValueCount()); + + newVarCharVector.clear(); + } + } + + @Test + public void testSplitAndTransferAll() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.getTransferPair(allocator); + VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); + + tp.splitAndTransfer(0, valueCount); + assertEquals(valueCount, newVarCharVector.getValueCount()); + + newVarCharVector.clear(); + } + } + + @Test + public void testInvalidStartIndex() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator); + final VarCharVector newVarCharVector = new VarCharVector("newvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.makeTransferPair(newVarCharVector); + + IllegalArgumentException e = 
Assertions.assertThrows( + IllegalArgumentException.class, + () -> tp.splitAndTransfer(valueCount, 10)); + + assertEquals("Invalid parameters startIndex: 500, length: 10 for valueCount: 500", e.getMessage()); + + newVarCharVector.clear(); + } + } + + @Test + public void testInvalidLength() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator); + final VarCharVector newVarCharVector = new VarCharVector("newvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.makeTransferPair(newVarCharVector); + + IllegalArgumentException e = Assertions.assertThrows( + IllegalArgumentException.class, + () -> tp.splitAndTransfer(0, valueCount * 2)); + + assertEquals("Invalid parameters startIndex: 0, length: 1000 for valueCount: 500", e.getMessage()); + + newVarCharVector.clear(); + } + } + + @Test + public void testZeroStartIndexAndLength() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator); + final VarCharVector newVarCharVector = new VarCharVector("newvector", allocator)) { + + varCharVector.allocateNew(0, 0); + final int valueCount = 0; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.makeTransferPair(newVarCharVector); + + tp.splitAndTransfer(0, 0); + assertEquals(valueCount, newVarCharVector.getValueCount()); + + newVarCharVector.clear(); + } + } + + @Test + public void testZeroLength() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator); + final VarCharVector newVarCharVector = new VarCharVector("newvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.makeTransferPair(newVarCharVector); + + tp.splitAndTransfer(500, 0); + assertEquals(0, 
newVarCharVector.getValueCount()); + + newVarCharVector.clear(); + } + } + + @Test + public void testUnionVectorZeroStartIndexAndLength() { + try (final UnionVector unionVector = UnionVector.empty("myvector", allocator); + final UnionVector newUnionVector = UnionVector.empty("newvector", allocator)) { + + unionVector.allocateNew(); + final int valueCount = 0; + unionVector.setValueCount(valueCount); + + final TransferPair tp = unionVector.makeTransferPair(newUnionVector); + + tp.splitAndTransfer(0, 0); + assertEquals(valueCount, newUnionVector.getValueCount()); + + newUnionVector.clear(); + } + } + + @Test + public void testFixedWidthVectorZeroStartIndexAndLength() { + try (final IntVector intVector = new IntVector("myvector", allocator); + final IntVector newIntVector = new IntVector("newvector", allocator)) { + + intVector.allocateNew(0); + final int valueCount = 0; + intVector.setValueCount(valueCount); + + final TransferPair tp = intVector.makeTransferPair(newIntVector); + + tp.splitAndTransfer(0, 0); + assertEquals(valueCount, newIntVector.getValueCount()); + + newIntVector.clear(); + } + } + + @Test + public void testBitVectorZeroStartIndexAndLength() { + try (final BitVector bitVector = new BitVector("myvector", allocator); + final BitVector newBitVector = new BitVector("newvector", allocator)) { + + bitVector.allocateNew(0); + final int valueCount = 0; + bitVector.setValueCount(valueCount); + + final TransferPair tp = bitVector.makeTransferPair(newBitVector); + + tp.splitAndTransfer(0, 0); + assertEquals(valueCount, newBitVector.getValueCount()); + + newBitVector.clear(); + } + } + + @Test + public void testFixedSizeListVectorZeroStartIndexAndLength() { + try (final FixedSizeListVector listVector = FixedSizeListVector.empty("list", 4, allocator); + final FixedSizeListVector newListVector = FixedSizeListVector.empty("newList", 4, allocator)) { + + listVector.allocateNew(); + final int valueCount = 0; + listVector.setValueCount(valueCount); + + final 
TransferPair tp = listVector.makeTransferPair(newListVector); + + tp.splitAndTransfer(0, 0); + assertEquals(valueCount, newListVector.getValueCount()); + + newListVector.clear(); + } + } + + @Test + public void testListVectorZeroStartIndexAndLength() { + try (final ListVector listVector = ListVector.empty("list", allocator); + final ListVector newListVector = ListVector.empty("newList", allocator)) { + + listVector.allocateNew(); + final int valueCount = 0; + listVector.setValueCount(valueCount); + + final TransferPair tp = listVector.makeTransferPair(newListVector); + + tp.splitAndTransfer(0, 0); + assertEquals(valueCount, newListVector.getValueCount()); + + newListVector.clear(); + } + } + + @Test + public void testStructVectorZeroStartIndexAndLength() { + Map metadata = new HashMap<>(); + metadata.put("k1", "v1"); + FieldType type = new FieldType(true, Struct.INSTANCE, null, metadata); + try (final StructVector structVector = new StructVector("structvec", allocator, type, null); + final StructVector newStructVector = new StructVector("newStructvec", allocator, type, null)) { + + structVector.allocateNew(); + final int valueCount = 0; + structVector.setValueCount(valueCount); + + final TransferPair tp = structVector.makeTransferPair(newStructVector); + + tp.splitAndTransfer(0, 0); + assertEquals(valueCount, newStructVector.getValueCount()); + + newStructVector.clear(); + } + } + + +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestStructVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestStructVector.java new file mode 100644 index 000000000..734ff4631 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestStructVector.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.*; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.holders.ComplexHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestStructVector { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testFieldMetadata() throws Exception { + Map metadata = new HashMap<>(); + metadata.put("k1", "v1"); + FieldType type = new FieldType(true, Struct.INSTANCE, null, metadata); + try (StructVector vector = new StructVector("struct", allocator, type, null)) { + Assert.assertEquals(vector.getField().getMetadata(), type.getMetadata()); + } + } + + @Test + public void testMakeTransferPair() { + try (final StructVector s1 = 
StructVector.empty("s1", allocator); + final StructVector s2 = StructVector.empty("s2", allocator)) { + s1.addOrGet("struct_child", FieldType.nullable(MinorType.INT.getType()), IntVector.class); + s1.makeTransferPair(s2); + final FieldVector child = s1.getChild("struct_child"); + final FieldVector toChild = s2.addOrGet("struct_child", child.getField().getFieldType(), child.getClass()); + assertEquals(0, toChild.getValueCapacity()); + assertEquals(0, toChild.getDataBuffer().capacity()); + assertEquals(0, toChild.getValidityBuffer().capacity()); + } + } + + @Test + public void testAllocateAfterReAlloc() throws Exception { + Map metadata = new HashMap<>(); + metadata.put("k1", "v1"); + FieldType type = new FieldType(true, Struct.INSTANCE, null, metadata); + try (StructVector vector = new StructVector("struct", allocator, type, null)) { + MinorType childtype = MinorType.INT; + vector.addOrGet("intchild", FieldType.nullable(childtype.getType()), IntVector.class); + + /* + * Allocate the default size, and then, reAlloc. This should double the allocation. + */ + vector.allocateNewSafe(); // Initial allocation + vector.reAlloc(); // Double the allocation size of self, and all children. + long savedValidityBufferCapacity = vector.getValidityBuffer().capacity(); + int savedValueCapacity = vector.getValueCapacity(); + + /* + * Clear and allocate again. + */ + vector.clear(); + vector.allocateNewSafe(); + + /* + * Verify that the buffer sizes haven't changed. 
+ */ + Assert.assertEquals(vector.getValidityBuffer().capacity(), savedValidityBufferCapacity); + Assert.assertEquals(vector.getValueCapacity(), savedValueCapacity); + } + } + + @Test + public void testReadNullValue() { + Map metadata = new HashMap<>(); + metadata.put("k1", "v1"); + FieldType type = new FieldType(true, Struct.INSTANCE, null, metadata); + try (StructVector vector = new StructVector("struct", allocator, type, null)) { + MinorType childtype = MinorType.INT; + vector.addOrGet("intchild", FieldType.nullable(childtype.getType()), IntVector.class); + vector.setValueCount(2); + + IntVector intVector = (IntVector) vector.getChild("intchild"); + intVector.setSafe(0, 100); + vector.setIndexDefined(0); + intVector.setNull(1); + vector.setNull(1); + + ComplexHolder holder = new ComplexHolder(); + vector.get(0, holder); + assertNotEquals(0, holder.isSet); + assertNotNull(holder.reader); + + vector.get(1, holder); + assertEquals(0, holder.isSet); + assertNull(holder.reader); + } + } + + @Test + public void testGetPrimitiveVectors() { + FieldType type = new FieldType(true, Struct.INSTANCE, null, null); + try (StructVector vector = new StructVector("struct", allocator, type, null)) { + + // add list vector + vector.addOrGet("list", FieldType.nullable(MinorType.LIST.getType()), ListVector.class); + ListVector listVector = vector.addOrGetList("list"); + listVector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())); + + // add union vector + vector.addOrGet("union", FieldType.nullable(MinorType.UNION.getType()), UnionVector.class); + UnionVector unionVector = vector.addOrGetUnion("union"); + unionVector.addVector(new BigIntVector("bigInt", allocator)); + unionVector.addVector(new SmallIntVector("smallInt", allocator)); + + // add varchar vector + vector.addOrGet("varchar", FieldType.nullable(MinorType.VARCHAR.getType()), VarCharVector.class); + + List primitiveVectors = vector.getPrimitiveVectors(); + assertEquals(4, primitiveVectors.size()); + 
assertEquals(MinorType.INT, primitiveVectors.get(0).getMinorType()); + assertEquals(MinorType.BIGINT, primitiveVectors.get(1).getMinorType()); + assertEquals(MinorType.SMALLINT, primitiveVectors.get(2).getMinorType()); + assertEquals(MinorType.VARCHAR, primitiveVectors.get(3).getMinorType()); + } + } + + @Test + public void testAddOrGetComplexChildVectors() { + FieldType type = new FieldType(true, Struct.INSTANCE, null, null); + try (StructVector vector = new StructVector("struct", allocator, type, null)) { + + vector.addOrGetList("list"); + vector.addOrGetFixedSizeList("fixedList", 2); + vector.addOrGetUnion("union"); + vector.addOrGetStruct("struct"); + vector.addOrGetMap("map", true); + + List childrens = vector.getChildrenFromFields(); + assertEquals(5, childrens.size()); + assertEquals(MinorType.LIST, childrens.get(0).getMinorType()); + assertEquals(MinorType.FIXED_SIZE_LIST, childrens.get(1).getMinorType()); + assertEquals(MinorType.UNION, childrens.get(2).getMinorType()); + assertEquals(MinorType.STRUCT, childrens.get(3).getMinorType()); + assertEquals(MinorType.MAP, childrens.get(4).getMinorType()); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestTypeLayout.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestTypeLayout.java new file mode 100644 index 000000000..97930f433 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestTypeLayout.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; + +import org.apache.arrow.vector.types.DateUnit; +import org.apache.arrow.vector.types.FloatingPointPrecision; +import org.apache.arrow.vector.types.IntervalUnit; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.UnionMode; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.junit.Test; + +public class TestTypeLayout { + + @Test + public void testTypeBufferCount() { + ArrowType type = new ArrowType.Int(8, true); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.Union(UnionMode.Sparse, new int[2]); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.Union(UnionMode.Dense, new int[1]); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.Struct(); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.Timestamp(TimeUnit.MILLISECOND, null); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.List(); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.FixedSizeList(5); + assertEquals(TypeLayout.getTypeBufferCount(type), 
TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.Map(false); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.Decimal(10, 10, 128); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.Decimal(10, 10, 256); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + + type = new ArrowType.FixedSizeBinary(5); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.Bool(); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.Binary(); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.Utf8(); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.Null(); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.Date(DateUnit.DAY); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.Time(TimeUnit.MILLISECOND, 32); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new 
ArrowType.Interval(IntervalUnit.DAY_TIME); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + + type = new ArrowType.Duration(TimeUnit.MILLISECOND); + assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java new file mode 100644 index 000000000..f04998915 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java @@ -0,0 +1,520 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.complex.VectorWithOrdinal; +import org.apache.arrow.vector.complex.impl.UnionWriter; +import org.apache.arrow.vector.holders.NullableBitHolder; +import org.apache.arrow.vector.holders.NullableFloat4Holder; +import org.apache.arrow.vector.holders.NullableIntHolder; +import org.apache.arrow.vector.holders.NullableUInt4Holder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.UnionMode; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestUnionVector { + private static final String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testUnionVector() throws Exception { + + final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder(); + uInt4Holder.value = 100; + uInt4Holder.isSet = 1; + + try (UnionVector unionVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { + unionVector.allocateNew(); + + // write some data + unionVector.setType(0, MinorType.UINT4); + 
unionVector.setSafe(0, uInt4Holder); + unionVector.setType(2, MinorType.UINT4); + unionVector.setSafe(2, uInt4Holder); + unionVector.setValueCount(4); + + // check that what we wrote is correct + assertEquals(4, unionVector.getValueCount()); + + assertEquals(false, unionVector.isNull(0)); + assertEquals(100, unionVector.getObject(0)); + + assertNull(unionVector.getObject(1)); + + assertEquals(false, unionVector.isNull(2)); + assertEquals(100, unionVector.getObject(2)); + + assertNull(unionVector.getObject(3)); + } + } + + @Test + public void testUnionVectorMapValue() throws Exception { + try (UnionVector unionVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { + unionVector.allocateNew(); + + UnionWriter writer = (UnionWriter) unionVector.getWriter(); + + // populate map vector with the following two records + // [ + // null, + // [[1: 2], [3: 4], [5: null]] + // ] + + writer.setPosition(0); + writer.writeNull(); + + writer.setPosition(1); + writer.startMap(); + + writer.startEntry(); + writer.key().integer().writeInt(1); + writer.value().integer().writeInt(2); + writer.endEntry(); + + writer.startEntry(); + writer.key().integer().writeInt(3); + writer.value().integer().writeInt(4); + writer.endEntry(); + + writer.startEntry(); + writer.key().integer().writeInt(5); + writer.endEntry(); + + writer.endMap(); + + unionVector.setValueCount(2); + + // check that what we wrote is correct + assertEquals(2, unionVector.getValueCount()); + + // first entry + assertNull(unionVector.getObject(0)); + + // second entry + List> resultList = (List>) unionVector.getObject(1); + assertEquals(3, resultList.size()); + + Map resultMap = resultList.get(0); + assertEquals(1, (int) resultMap.get(MapVector.KEY_NAME)); + assertEquals(2, (int) resultMap.get(MapVector.VALUE_NAME)); + + resultMap = resultList.get(1); + assertEquals(3, (int) resultMap.get(MapVector.KEY_NAME)); + assertEquals(4, (int) resultMap.get(MapVector.VALUE_NAME)); + + 
resultMap = resultList.get(2); + assertEquals(5, (int) resultMap.get(MapVector.KEY_NAME)); + assertNull(resultMap.get(MapVector.VALUE_NAME)); + } + } + + @Test + public void testTransfer() throws Exception { + try (UnionVector srcVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { + srcVector.allocateNew(); + + // write some data + srcVector.setType(0, MinorType.INT); + srcVector.setSafe(0, newIntHolder(5)); + srcVector.setType(1, MinorType.BIT); + srcVector.setSafe(1, newBitHolder(false)); + srcVector.setType(3, MinorType.INT); + srcVector.setSafe(3, newIntHolder(10)); + srcVector.setType(5, MinorType.BIT); + srcVector.setSafe(5, newBitHolder(false)); + srcVector.setValueCount(6); + + try (UnionVector destVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { + TransferPair pair = srcVector.makeTransferPair(destVector); + + // Creating the transfer should transfer the type of the field at least. 
+ assertEquals(srcVector.getField(), destVector.getField()); + + // transfer + pair.transfer(); + + assertEquals(srcVector.getField(), destVector.getField()); + + // now check the values are transferred + assertEquals(6, destVector.getValueCount()); + + assertFalse(destVector.isNull(0)); + assertEquals(5, destVector.getObject(0)); + + assertFalse(destVector.isNull(1)); + assertEquals(false, destVector.getObject(1)); + + assertNull(destVector.getObject(2)); + + assertFalse(destVector.isNull(3)); + assertEquals(10, destVector.getObject(3)); + + assertNull(destVector.getObject(4)); + + assertFalse(destVector.isNull(5)); + assertEquals(false, destVector.getObject(5)); + } + } + } + + @Test + public void testSplitAndTransfer() throws Exception { + try (UnionVector sourceVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { + + sourceVector.allocateNew(); + + /* populate the UnionVector */ + sourceVector.setType(0, MinorType.INT); + sourceVector.setSafe(0, newIntHolder(5)); + sourceVector.setType(1, MinorType.INT); + sourceVector.setSafe(1, newIntHolder(10)); + sourceVector.setType(2, MinorType.INT); + sourceVector.setSafe(2, newIntHolder(15)); + sourceVector.setType(3, MinorType.INT); + sourceVector.setSafe(3, newIntHolder(20)); + sourceVector.setType(4, MinorType.INT); + sourceVector.setSafe(4, newIntHolder(25)); + sourceVector.setType(5, MinorType.INT); + sourceVector.setSafe(5, newIntHolder(30)); + sourceVector.setType(6, MinorType.INT); + sourceVector.setSafe(6, newIntHolder(35)); + sourceVector.setType(7, MinorType.INT); + sourceVector.setSafe(7, newIntHolder(40)); + sourceVector.setType(8, MinorType.INT); + sourceVector.setSafe(8, newIntHolder(45)); + sourceVector.setType(9, MinorType.INT); + sourceVector.setSafe(9, newIntHolder(50)); + sourceVector.setValueCount(10); + + /* check the vector output */ + assertEquals(10, sourceVector.getValueCount()); + assertEquals(false, sourceVector.isNull(0)); + assertEquals(5, 
sourceVector.getObject(0)); + assertEquals(false, sourceVector.isNull(1)); + assertEquals(10, sourceVector.getObject(1)); + assertEquals(false, sourceVector.isNull(2)); + assertEquals(15, sourceVector.getObject(2)); + assertEquals(false, sourceVector.isNull(3)); + assertEquals(20, sourceVector.getObject(3)); + assertEquals(false, sourceVector.isNull(4)); + assertEquals(25, sourceVector.getObject(4)); + assertEquals(false, sourceVector.isNull(5)); + assertEquals(30, sourceVector.getObject(5)); + assertEquals(false, sourceVector.isNull(6)); + assertEquals(35, sourceVector.getObject(6)); + assertEquals(false, sourceVector.isNull(7)); + assertEquals(40, sourceVector.getObject(7)); + assertEquals(false, sourceVector.isNull(8)); + assertEquals(45, sourceVector.getObject(8)); + assertEquals(false, sourceVector.isNull(9)); + assertEquals(50, sourceVector.getObject(9)); + + try (UnionVector toVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { + + final TransferPair transferPair = sourceVector.makeTransferPair(toVector); + + final int[][] transferLengths = {{0, 3}, + {3, 1}, + {4, 2}, + {6, 1}, + {7, 1}, + {8, 2} + }; + + for (final int[] transferLength : transferLengths) { + final int start = transferLength[0]; + final int length = transferLength[1]; + + transferPair.splitAndTransfer(start, length); + + /* check the toVector output after doing the splitAndTransfer */ + for (int i = 0; i < length; i++) { + assertEquals("Different data at indexes: " + (start + i) + "and " + i, sourceVector.getObject(start + i), + toVector.getObject(i)); + } + } + } + } + } + + @Test + public void testSplitAndTransferWithMixedVectors() throws Exception { + try (UnionVector sourceVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { + + sourceVector.allocateNew(); + + /* populate the UnionVector */ + sourceVector.setType(0, MinorType.INT); + sourceVector.setSafe(0, newIntHolder(5)); + + 
sourceVector.setType(1, MinorType.FLOAT4); + sourceVector.setSafe(1, newFloat4Holder(5.5f)); + + sourceVector.setType(2, MinorType.INT); + sourceVector.setSafe(2, newIntHolder(10)); + + sourceVector.setType(3, MinorType.FLOAT4); + sourceVector.setSafe(3, newFloat4Holder(10.5f)); + + sourceVector.setType(4, MinorType.INT); + sourceVector.setSafe(4, newIntHolder(15)); + + sourceVector.setType(5, MinorType.FLOAT4); + sourceVector.setSafe(5, newFloat4Holder(15.5f)); + + sourceVector.setType(6, MinorType.INT); + sourceVector.setSafe(6, newIntHolder(20)); + + sourceVector.setType(7, MinorType.FLOAT4); + sourceVector.setSafe(7, newFloat4Holder(20.5f)); + + sourceVector.setType(8, MinorType.INT); + sourceVector.setSafe(8, newIntHolder(30)); + + sourceVector.setType(9, MinorType.FLOAT4); + sourceVector.setSafe(9, newFloat4Holder(30.5f)); + sourceVector.setValueCount(10); + + /* check the vector output */ + assertEquals(10, sourceVector.getValueCount()); + assertEquals(false, sourceVector.isNull(0)); + assertEquals(5, sourceVector.getObject(0)); + assertEquals(false, sourceVector.isNull(1)); + assertEquals(5.5f, sourceVector.getObject(1)); + assertEquals(false, sourceVector.isNull(2)); + assertEquals(10, sourceVector.getObject(2)); + assertEquals(false, sourceVector.isNull(3)); + assertEquals(10.5f, sourceVector.getObject(3)); + assertEquals(false, sourceVector.isNull(4)); + assertEquals(15, sourceVector.getObject(4)); + assertEquals(false, sourceVector.isNull(5)); + assertEquals(15.5f, sourceVector.getObject(5)); + assertEquals(false, sourceVector.isNull(6)); + assertEquals(20, sourceVector.getObject(6)); + assertEquals(false, sourceVector.isNull(7)); + assertEquals(20.5f, sourceVector.getObject(7)); + assertEquals(false, sourceVector.isNull(8)); + assertEquals(30, sourceVector.getObject(8)); + assertEquals(false, sourceVector.isNull(9)); + assertEquals(30.5f, sourceVector.getObject(9)); + + try (UnionVector toVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* 
field type */ null, /* call-back */ null)) { + + final TransferPair transferPair = sourceVector.makeTransferPair(toVector); + + final int[][] transferLengths = {{0, 2}, + {2, 1}, + {3, 2}, + {5, 3}, + {8, 2} + }; + + for (final int[] transferLength : transferLengths) { + final int start = transferLength[0]; + final int length = transferLength[1]; + + transferPair.splitAndTransfer(start, length); + + /* check the toVector output after doing the splitAndTransfer */ + for (int i = 0; i < length; i++) { + assertEquals("Different values at index: " + i, sourceVector.getObject(start + i), toVector.getObject(i)); + } + } + } + } + } + + @Test + public void testGetFieldTypeInfo() throws Exception { + Map metadata = new HashMap<>(); + metadata.put("key1", "value1"); + + int[] typeIds = new int[2]; + typeIds[0] = MinorType.INT.ordinal(); + typeIds[1] = MinorType.VARCHAR.ordinal(); + + List children = new ArrayList<>(); + children.add(new Field("int", FieldType.nullable(MinorType.INT.getType()), null)); + children.add(new Field("varchar", FieldType.nullable(MinorType.VARCHAR.getType()), null)); + + final FieldType fieldType = new FieldType(false, new ArrowType.Union(UnionMode.Sparse, typeIds), + /*dictionary=*/null, metadata); + final Field field = new Field("union", fieldType, children); + + MinorType minorType = MinorType.UNION; + UnionVector vector = (UnionVector) minorType.getNewVector(field, allocator, null); + vector.initializeChildrenFromFields(children); + + assertTrue(vector.getField().equals(field)); + + // Union has 2 child vectors + assertEquals(vector.size(), 2); + + // Check child field 0 + VectorWithOrdinal intChild = vector.getChildVectorWithOrdinal("int"); + assertEquals(intChild.ordinal, 0); + assertEquals(intChild.vector.getField(), children.get(0)); + + // Check child field 1 + VectorWithOrdinal varcharChild = vector.getChildVectorWithOrdinal("varchar"); + assertEquals(varcharChild.ordinal, 1); + assertEquals(varcharChild.vector.getField(), 
children.get(1)); + } + + @Test + public void testGetBufferAddress() throws Exception { + try (UnionVector vector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { + boolean error = false; + + vector.allocateNew(); + + /* populate the UnionVector */ + vector.setType(0, MinorType.INT); + vector.setSafe(0, newIntHolder(5)); + + vector.setType(1, MinorType.FLOAT4); + vector.setSafe(1, newFloat4Holder(5.5f)); + + vector.setType(2, MinorType.INT); + vector.setSafe(2, newIntHolder(10)); + + vector.setType(3, MinorType.FLOAT4); + vector.setSafe(3, newFloat4Holder(10.5f)); + + vector.setValueCount(10); + + /* check the vector output */ + assertEquals(10, vector.getValueCount()); + assertEquals(false, vector.isNull(0)); + assertEquals(5, vector.getObject(0)); + assertEquals(false, vector.isNull(1)); + assertEquals(5.5f, vector.getObject(1)); + assertEquals(false, vector.isNull(2)); + assertEquals(10, vector.getObject(2)); + assertEquals(false, vector.isNull(3)); + assertEquals(10.5f, vector.getObject(3)); + + List buffers = vector.getFieldBuffers(); + + + try { + long offsetAddress = vector.getOffsetBufferAddress(); + } catch (UnsupportedOperationException ue) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + try { + long dataAddress = vector.getDataBufferAddress(); + } catch (UnsupportedOperationException ue) { + error = true; + } finally { + assertTrue(error); + } + + assertEquals(1, buffers.size()); + } + } + + @Test + public void testSetGetNull() { + try (UnionVector srcVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { + srcVector.allocateNew(); + + final NullableIntHolder holder = new NullableIntHolder(); + holder.isSet = 1; + holder.value = 5; + + // write some data + srcVector.setType(0, MinorType.INT); + srcVector.setSafe(0, holder); + + assertFalse(srcVector.isNull(0)); + + holder.isSet = 0; + srcVector.setSafe(0, holder); + + 
assertNull(srcVector.getObject(0)); + } + } + + private static NullableIntHolder newIntHolder(int value) { + final NullableIntHolder holder = new NullableIntHolder(); + holder.isSet = 1; + holder.value = value; + return holder; + } + + private static NullableBitHolder newBitHolder(boolean value) { + final NullableBitHolder holder = new NullableBitHolder(); + holder.isSet = 1; + holder.value = value ? 1 : 0; + return holder; + } + + private static NullableFloat4Holder newFloat4Holder(float value) { + final NullableFloat4Holder holder = new NullableFloat4Holder(); + holder.isSet = 1; + holder.value = value; + return holder; + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java new file mode 100644 index 000000000..7e64dd386 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; + +public class TestUtils { + + public static VarCharVector newVarCharVector(String name, BufferAllocator allocator) { + return (VarCharVector) + FieldType.nullable(new ArrowType.Utf8()).createNewSingleVector(name, allocator, null); + } + + public static VarBinaryVector newVarBinaryVector(String name, BufferAllocator allocator) { + return (VarBinaryVector) + FieldType.nullable(new ArrowType.Binary()).createNewSingleVector(name, allocator, null); + } + + public static T newVector(Class c, String name, ArrowType type, BufferAllocator allocator) { + return c.cast(FieldType.nullable(type).createNewSingleVector(name, allocator, null)); + } + + public static T newVector(Class c, String name, MinorType type, BufferAllocator allocator) { + return c.cast(FieldType.nullable(type.getType()).createNewSingleVector(name, allocator, null)); + } + +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java new file mode 100644 index 000000000..572c3d594 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -0,0 +1,3061 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.TestUtils.newVarBinaryVector; +import static org.apache.arrow.vector.TestUtils.newVarCharVector; +import static org.apache.arrow.vector.TestUtils.newVector; +import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.memory.rounding.DefaultRoundingPolicy; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.vector.compare.Range; +import org.apache.arrow.vector.compare.RangeEqualsVisitor; +import org.apache.arrow.vector.compare.VectorEqualsVisitor; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import 
org.apache.arrow.vector.complex.impl.NullableStructWriter; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.holders.NullableIntHolder; +import org.apache.arrow.vector.holders.NullableUInt4Holder; +import org.apache.arrow.vector.holders.NullableVarBinaryHolder; +import org.apache.arrow.vector.holders.NullableVarCharHolder; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.Text; +import org.apache.arrow.vector.util.TransferPair; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestValueVector { + + private static final String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + // Sample payloads are encoded as UTF-8; StandardCharsets avoids the by-name charset lookup. + private static final Charset utf8Charset = StandardCharsets.UTF_8; + private static final byte[] STR1 = "AAAAA1".getBytes(utf8Charset); + private static final byte[] STR2 = "BBBBBBBBB2".getBytes(utf8Charset); + private static final byte[] STR3 = "CCCC3".getBytes(utf8Charset); + private static final byte[] STR4 = "DDDDDDDD4".getBytes(utf8Charset); + private static final byte[] STR5 = "EEE5".getBytes(utf8Charset); + private static final byte[] STR6 = "FFFFF6".getBytes(utf8Charset); + private static final int MAX_VALUE_COUNT = + (int) (Integer.getInteger("arrow.vector.max_allocation_bytes", Integer.MAX_VALUE) / 7); + private static final int MAX_VALUE_COUNT_8BYTE = (int) (MAX_VALUE_COUNT / 2); + + @After + public void terminate() throws Exception { + allocator.close(); + } + + /* + * Tests 
for Fixed-Width vectors + * + * Covered types as of now + * + * -- UInt4Vector + * -- IntVector + * -- Float4Vector + * -- Float8Vector + * + * -- UInt4Vector + * -- IntVector + * -- Float4Vector + * + * TODO: + * + * -- SmallIntVector + * -- BigIntVector + * -- TinyIntVector + */ + + @Test /* UInt4Vector */ + public void testFixedType1() { + + // Create a new value vector for 1024 integers. + try (final UInt4Vector vector = new UInt4Vector(EMPTY_SCHEMA_PATH, allocator)) { + + boolean error = false; + int initialCapacity = 0; + + vector.allocateNew(1024); + initialCapacity = vector.getValueCapacity(); + assertTrue(initialCapacity >= 1024); + + // Put and set a few values + vector.setSafe(0, 100); + vector.setSafe(1, 101); + vector.setSafe(100, 102); + vector.setSafe(1022, 103); + vector.setSafe(1023, 104); + + assertEquals(100, vector.get(0)); + assertEquals(101, vector.get(1)); + assertEquals(102, vector.get(100)); + assertEquals(103, vector.get(1022)); + assertEquals(104, vector.get(1023)); + + try { + vector.set(initialCapacity, 10000); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + try { + vector.get(initialCapacity); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + /* this should trigger a realloc() */ + vector.setSafe(initialCapacity, 10000); + + /* underlying buffer should now be able to store double the number of values */ + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); + + /* check vector data after realloc */ + assertEquals(100, vector.get(0)); + assertEquals(101, vector.get(1)); + assertEquals(102, vector.get(100)); + assertEquals(103, vector.get(1022)); + assertEquals(104, vector.get(1023)); + assertEquals(10000, vector.get(initialCapacity)); + + /* reset the vector */ + int capacityBeforeReset = vector.getValueCapacity(); + vector.reset(); + + /* capacity shouldn't change after reset */ + 
assertEquals(capacityBeforeReset, vector.getValueCapacity()); + + /* vector data should have been zeroed out */ + for (int i = 0; i < capacityBeforeReset; i++) { + // TODO: test vector.get(i) is 0 after unsafe get added + assertEquals("non-zero data not expected at index: " + i, true, vector.isNull(i)); + } + } + } + + @Test /* IntVector */ + public void testFixedType2() { + try (final IntVector intVector = new IntVector(EMPTY_SCHEMA_PATH, allocator)) { + boolean error = false; + int initialCapacity = 16; + + /* we should not throw exception for these values of capacity */ + intVector.setInitialCapacity(MAX_VALUE_COUNT - 1); + intVector.setInitialCapacity(MAX_VALUE_COUNT); + + try { + intVector.setInitialCapacity(MAX_VALUE_COUNT * 2); + } catch (OversizedAllocationException oe) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + intVector.setInitialCapacity(initialCapacity); + /* no memory allocation has happened yet so capacity of underlying buffer should be 0 */ + assertEquals(0, intVector.getValueCapacity()); + + /* allocate 64 bytes (16 * 4) */ + intVector.allocateNew(); + /* underlying buffer should be able to store 16 values */ + assertTrue(intVector.getValueCapacity() >= initialCapacity); + initialCapacity = intVector.getValueCapacity(); + + /* populate the vector */ + int j = 1; + for (int i = 0; i < initialCapacity; i += 2) { + intVector.set(i, j); + j++; + } + + try { + intVector.set(initialCapacity, j); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + /* check vector contents */ + j = 1; + for (int i = 0; i < initialCapacity; i += 2) { + assertEquals("unexpected value at index: " + i, j, intVector.get(i)); + j++; + } + + try { + intVector.get(initialCapacity); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + /* this should trigger a realloc() */ + intVector.setSafe(initialCapacity, j); + + 
/* underlying buffer should now be able to store double the number of values */ + assertTrue(intVector.getValueCapacity() >= initialCapacity * 2); + + /* vector data should still be intact after realloc */ + j = 1; + for (int i = 0; i <= initialCapacity; i += 2) { + assertEquals("unexpected value at index: " + i, j, intVector.get(i)); + j++; + } + + /* reset the vector */ + int capacityBeforeRealloc = intVector.getValueCapacity(); + intVector.reset(); + + /* capacity shouldn't change after reset */ + assertEquals(capacityBeforeRealloc, intVector.getValueCapacity()); + + /* vector data should have been zeroed out */ + for (int i = 0; i < capacityBeforeRealloc; i++) { + assertEquals("non-zero data not expected at index: " + i, true, intVector.isNull(i)); + } + } + } + + @Test /* VarCharVector */ + public void testSizeOfValueBuffer() { + try (final VarCharVector vector = new VarCharVector(EMPTY_SCHEMA_PATH, allocator)) { + int valueCount = 100; + int currentSize = 0; + vector.setInitialCapacity(valueCount); + vector.allocateNew(); + vector.setValueCount(valueCount); + for (int i = 0; i < valueCount; i++) { + currentSize += i; + vector.setSafe(i, new byte[i]); + } + + assertEquals(currentSize, vector.sizeOfValueBuffer()); + } + } + + @Test /* Float4Vector */ + public void testFixedType3() { + try (final Float4Vector floatVector = new Float4Vector(EMPTY_SCHEMA_PATH, allocator)) { + boolean error = false; + int initialCapacity = 16; + + /* we should not throw exception for these values of capacity */ + floatVector.setInitialCapacity(MAX_VALUE_COUNT - 1); + floatVector.setInitialCapacity(MAX_VALUE_COUNT); + + try { + floatVector.setInitialCapacity(MAX_VALUE_COUNT * 2); + } catch (OversizedAllocationException oe) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + floatVector.setInitialCapacity(initialCapacity); + /* no memory allocation has happened yet so capacity of underlying buffer should be 0 */ + assertEquals(0, 
floatVector.getValueCapacity()); + + /* allocate 64 bytes (16 * 4) */ + floatVector.allocateNew(); + /* underlying buffer should be able to store 16 values */ + assertTrue(floatVector.getValueCapacity() >= initialCapacity); + initialCapacity = floatVector.getValueCapacity(); + + floatVector.zeroVector(); + + /* populate the floatVector */ + floatVector.set(0, 1.5f); + floatVector.set(2, 2.5f); + floatVector.set(4, 3.3f); + floatVector.set(6, 4.8f); + floatVector.set(8, 5.6f); + floatVector.set(10, 6.6f); + floatVector.set(12, 7.8f); + floatVector.set(14, 8.5f); + + try { + floatVector.set(initialCapacity, 9.5f); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + /* check vector contents */ + assertEquals(1.5f, floatVector.get(0), 0); + assertEquals(2.5f, floatVector.get(2), 0); + assertEquals(3.3f, floatVector.get(4), 0); + assertEquals(4.8f, floatVector.get(6), 0); + assertEquals(5.6f, floatVector.get(8), 0); + assertEquals(6.6f, floatVector.get(10), 0); + assertEquals(7.8f, floatVector.get(12), 0); + assertEquals(8.5f, floatVector.get(14), 0); + + try { + floatVector.get(initialCapacity); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + /* this should trigger a realloc() */ + floatVector.setSafe(initialCapacity, 9.5f); + + /* underlying buffer should now be able to store double the number of values */ + assertTrue(floatVector.getValueCapacity() >= initialCapacity * 2); + + /* vector data should still be intact after realloc */ + assertEquals(1.5f, floatVector.get(0), 0); + assertEquals(2.5f, floatVector.get(2), 0); + assertEquals(3.3f, floatVector.get(4), 0); + assertEquals(4.8f, floatVector.get(6), 0); + assertEquals(5.6f, floatVector.get(8), 0); + assertEquals(6.6f, floatVector.get(10), 0); + assertEquals(7.8f, floatVector.get(12), 0); + assertEquals(8.5f, floatVector.get(14), 0); + assertEquals(9.5f, 
floatVector.get(initialCapacity), 0); + + /* reset the vector */ + int capacityBeforeReset = floatVector.getValueCapacity(); + floatVector.reset(); + + /* capacity shouldn't change after reset */ + assertEquals(capacityBeforeReset, floatVector.getValueCapacity()); + + /* vector data should be zeroed out */ + for (int i = 0; i < capacityBeforeReset; i++) { + assertEquals("non-zero data not expected at index: " + i, true, floatVector.isNull(i)); + } + } + } + + @Test /* Float8Vector */ + public void testFixedType4() { + try (final Float8Vector floatVector = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { + boolean error = false; + int initialCapacity = 16; + + /* we should not throw exception for these values of capacity */ + floatVector.setInitialCapacity(MAX_VALUE_COUNT_8BYTE - 1); + floatVector.setInitialCapacity(MAX_VALUE_COUNT_8BYTE); + + try { + floatVector.setInitialCapacity(MAX_VALUE_COUNT_8BYTE * 2); + } catch (OversizedAllocationException oe) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + floatVector.setInitialCapacity(initialCapacity); + /* no memory allocation has happened yet so capacity of underlying buffer should be 0 */ + assertEquals(0, floatVector.getValueCapacity()); + + /* allocate 128 bytes (16 * 8) */ + floatVector.allocateNew(); + /* underlying buffer should be able to store 16 values */ + assertTrue(floatVector.getValueCapacity() >= initialCapacity); + initialCapacity = floatVector.getValueCapacity(); + + /* populate the vector */ + floatVector.set(0, 1.55); + floatVector.set(2, 2.53); + floatVector.set(4, 3.36); + floatVector.set(6, 4.82); + floatVector.set(8, 5.67); + floatVector.set(10, 6.67); + floatVector.set(12, 7.87); + floatVector.set(14, 8.56); + + try { + floatVector.set(initialCapacity, 9.53); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + /* check floatVector contents */ + assertEquals(1.55, floatVector.get(0), 0); + 
assertEquals(2.53, floatVector.get(2), 0); + assertEquals(3.36, floatVector.get(4), 0); + assertEquals(4.82, floatVector.get(6), 0); + assertEquals(5.67, floatVector.get(8), 0); + assertEquals(6.67, floatVector.get(10), 0); + assertEquals(7.87, floatVector.get(12), 0); + assertEquals(8.56, floatVector.get(14), 0); + + try { + floatVector.get(initialCapacity); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + /* this should trigger a realloc() */ + floatVector.setSafe(initialCapacity, 9.53); + + /* underlying buffer should now be able to store double the number of values */ + assertTrue(floatVector.getValueCapacity() >= initialCapacity * 2); + + /* vector data should still be intact after realloc */ + assertEquals(1.55, floatVector.get(0), 0); + assertEquals(2.53, floatVector.get(2), 0); + assertEquals(3.36, floatVector.get(4), 0); + assertEquals(4.82, floatVector.get(6), 0); + assertEquals(5.67, floatVector.get(8), 0); + assertEquals(6.67, floatVector.get(10), 0); + assertEquals(7.87, floatVector.get(12), 0); + assertEquals(8.56, floatVector.get(14), 0); + assertEquals(9.53, floatVector.get(initialCapacity), 0); + + /* reset the vector */ + int capacityBeforeReset = floatVector.getValueCapacity(); + floatVector.reset(); + + /* capacity shouldn't change after reset */ + assertEquals(capacityBeforeReset, floatVector.getValueCapacity()); + + /* vector data should be zeroed out */ + for (int i = 0; i < capacityBeforeReset; i++) { + assertEquals("non-zero data not expected at index: " + i, true, floatVector.isNull(i)); + } + } + } + + @Test /* UInt4Vector */ + public void testNullableFixedType1() { + + // Create a new value vector for 1024 integers. 
+ try (final UInt4Vector vector = newVector(UInt4Vector.class, EMPTY_SCHEMA_PATH, new ArrowType.Int(32, false), + allocator);) { + boolean error = false; + int initialCapacity = 1024; + + vector.setInitialCapacity(initialCapacity); + /* no memory allocation has happened yet */ + assertEquals(0, vector.getValueCapacity()); + + vector.allocateNew(); + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); + + // Put and set a few values + vector.set(0, 100); + vector.set(1, 101); + vector.set(100, 102); + vector.set(initialCapacity - 2, 103); + vector.set(initialCapacity - 1, 104); + + /* check vector contents */ + assertEquals(100, vector.get(0)); + assertEquals(101, vector.get(1)); + assertEquals(102, vector.get(100)); + assertEquals(103, vector.get(initialCapacity - 2)); + assertEquals(104, vector.get(initialCapacity - 1)); + + int val = 0; + + /* check unset bits/null values */ + for (int i = 2, j = 101; i <= 99 || j <= initialCapacity - 3; i++, j++) { + if (i <= 99) { + assertTrue(vector.isNull(i)); + } + if (j <= initialCapacity - 3) { + assertTrue(vector.isNull(j)); + } + } + + try { + vector.set(initialCapacity, 10000); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + try { + vector.get(initialCapacity); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + /* should trigger a realloc of the underlying bitvector and valuevector */ + vector.setSafe(initialCapacity, 10000); + + /* check new capacity */ + assertTrue(vector.getValueCapacity() >= initialCapacity * 2); + + /* vector contents should still be intact after realloc */ + assertEquals(100, vector.get(0)); + assertEquals(101, vector.get(1)); + assertEquals(102, vector.get(100)); + assertEquals(103, vector.get(initialCapacity - 2)); + assertEquals(104, vector.get(initialCapacity - 1)); + assertEquals(10000, 
vector.get(initialCapacity)); + + val = 0; + + /* re-check unset bits/null values after realloc: indices 2..99 and 101..(initialCapacity - 3) */ + for (int i = 2, j = 101; i <= 99 || j <= initialCapacity - 3; i++, j++) { + if (i <= 99) { + assertTrue(vector.isNull(i)); + } + if (j <= initialCapacity - 3) { + assertTrue(vector.isNull(j)); + } + } + + /* reset the vector */ + int capacityBeforeReset = vector.getValueCapacity(); + vector.reset(); + + /* capacity shouldn't change after reset */ + assertEquals(capacityBeforeReset, vector.getValueCapacity()); + + /* vector data should be zeroed out */ + for (int i = 0; i < capacityBeforeReset; i++) { + assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); + } + } + } + + @Test /* Float4Vector */ + public void testNullableFixedType2() { + // Create a new nullable Float4 vector with a requested initial capacity of 16 values + try (final Float4Vector vector = newVector(Float4Vector.class, EMPTY_SCHEMA_PATH, MinorType.FLOAT4, allocator);) { + boolean error = false; + int initialCapacity = 16; + + vector.setInitialCapacity(initialCapacity); + /* no memory allocation has happened yet */ + assertEquals(0, vector.getValueCapacity()); + + vector.allocateNew(); + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); + + /* populate the vector */ + vector.set(0, 100.5f); + vector.set(2, 201.5f); + vector.set(4, 300.3f); + vector.set(6, 423.8f); + vector.set(8, 555.6f); + vector.set(10, 66.6f); + vector.set(12, 78.8f); + vector.set(14, 89.5f); + + try { + vector.set(initialCapacity, 90.5f); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + /* check vector contents */ + assertEquals(100.5f, vector.get(0), 0); + assertTrue(vector.isNull(1)); + assertEquals(201.5f, vector.get(2), 0); + assertTrue(vector.isNull(3)); + assertEquals(300.3f, vector.get(4), 0); + assertTrue(vector.isNull(5)); + assertEquals(423.8f, vector.get(6), 0); + assertTrue(vector.isNull(7)); + assertEquals(555.6f, vector.get(8), 
0); + assertTrue(vector.isNull(9)); + assertEquals(66.6f, vector.get(10), 0); + assertTrue(vector.isNull(11)); + assertEquals(78.8f, vector.get(12), 0); + assertTrue(vector.isNull(13)); + assertEquals(89.5f, vector.get(14), 0); + assertTrue(vector.isNull(15)); + + try { + vector.get(initialCapacity); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + /* this should trigger a realloc() */ + vector.setSafe(initialCapacity, 90.5f); + + /* underlying buffer should now be able to store double the number of values */ + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); + + /* vector data should still be intact after realloc */ + assertEquals(100.5f, vector.get(0), 0); + assertTrue(vector.isNull(1)); + assertEquals(201.5f, vector.get(2), 0); + assertTrue(vector.isNull(3)); + assertEquals(300.3f, vector.get(4), 0); + assertTrue(vector.isNull(5)); + assertEquals(423.8f, vector.get(6), 0); + assertTrue(vector.isNull(7)); + assertEquals(555.6f, vector.get(8), 0); + assertTrue(vector.isNull(9)); + assertEquals(66.6f, vector.get(10), 0); + assertTrue(vector.isNull(11)); + assertEquals(78.8f, vector.get(12), 0); + assertTrue(vector.isNull(13)); + assertEquals(89.5f, vector.get(14), 0); + assertTrue(vector.isNull(15)); + + /* reset the vector */ + int capacityBeforeReset = vector.getValueCapacity(); + vector.reset(); + + /* capacity shouldn't change after reset */ + assertEquals(capacityBeforeReset, vector.getValueCapacity()); + + /* vector data should be zeroed out */ + for (int i = 0; i < capacityBeforeReset; i++) { + assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); + } + } + } + + @Test /* IntVector */ + public void testNullableFixedType3() { + // Create a new value vector for 1024 integers + try (final IntVector vector = newVector(IntVector.class, EMPTY_SCHEMA_PATH, MinorType.INT, allocator)) { + boolean error = false; + int initialCapacity = 1024; + + /* no memory 
allocation has happened yet so capacity of underlying buffer should be 0 */ + assertEquals(0, vector.getValueCapacity()); + /* allocate space for 4KB data (1024 * 4) */ + vector.allocateNew(initialCapacity); + /* underlying buffer should be able to store 1024 values */ + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); + + vector.set(0, 1); + vector.set(1, 2); + vector.set(100, 3); + vector.set(1022, 4); + vector.set(1023, 5); + + /* check vector contents */ + int j = 1; + for (int i = 0; i <= 1023; i++) { + if ((i >= 2 && i <= 99) || (i >= 101 && i <= 1021)) { + assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); + } else { + assertFalse("null data not expected at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, j, vector.get(i)); + j++; + } + } + + vector.setValueCount(1024); + Field field = vector.getField(); + + List buffers = vector.getFieldBuffers(); + + assertEquals(2, buffers.size()); + + ArrowBuf validityVectorBuf = buffers.get(0); + + /* bitvector tracks 1024 integers --> 1024 bits --> 128 bytes */ + assertTrue(validityVectorBuf.readableBytes() >= 128); + assertEquals(3, validityVectorBuf.getByte(0)); // 1st and second bit defined + for (int i = 1; i < 12; i++) { + assertEquals(0, validityVectorBuf.getByte(i)); // nothing defined until 100 + } + assertEquals(16, validityVectorBuf.getByte(12)); // 100th bit is defined (12 * 8 + 4) + for (int i = 13; i < 127; i++) { + assertEquals(0, validityVectorBuf.getByte(i)); // nothing defined between 100th and 1022nd + } + assertEquals(-64, validityVectorBuf.getByte(127)); // 1022nd and 1023rd bit defined + + /* this should trigger a realloc() */ + vector.setSafe(initialCapacity, 6); + + /* underlying buffer should now be able to store double the number of values */ + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); + + /* vector data should still be intact after realloc */ + j = 1; + for (int 
i = 0; i < (initialCapacity * 2); i++) { + if ((i > 1023 && i != initialCapacity) || (i >= 2 && i <= 99) || (i >= 101 && i <= 1021)) { + assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); + } else { + assertFalse("null data not expected at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, j, vector.get(i)); + j++; + } + } + + /* reset the vector */ + int capacityBeforeReset = vector.getValueCapacity(); + vector.reset(); + + /* capacity shouldn't change after reset */ + assertEquals(capacityBeforeReset, vector.getValueCapacity()); + + /* vector data should have been zeroed out */ + for (int i = 0; i < capacityBeforeReset; i++) { + assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); + } + + vector.allocateNew(initialCapacity * 4); + // vector has been erased + for (int i = 0; i < initialCapacity * 4; i++) { + assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); + } + } + } + + @Test /* IntVector */ + public void testNullableFixedType4() { + try (final IntVector vector = newVector(IntVector.class, EMPTY_SCHEMA_PATH, MinorType.INT, allocator)) { + + /* no memory allocation has happened yet */ + assertEquals(0, vector.getValueCapacity()); + + vector.allocateNew(); + int valueCapacity = vector.getValueCapacity(); + assertEquals(vector.INITIAL_VALUE_ALLOCATION, valueCapacity); + + int baseValue = 20000; + + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + vector.set(i, baseValue + i); + } + } + + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertFalse("unexpected null value at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, (baseValue + i), vector.get(i)); + } else { + assertTrue("unexpected non-null value at index: " + i, vector.isNull(i)); + } + } + + vector.setSafe(valueCapacity, 20000000); + assertTrue(vector.getValueCapacity() >= valueCapacity * 2); + + for (int i = 0; i < 
vector.getValueCapacity(); i++) { + if (i == valueCapacity) { + assertFalse("unexpected null value at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, 20000000, vector.get(i)); + } else if (i < valueCapacity) { + if ((i & 1) == 1) { + assertFalse("unexpected null value at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, (baseValue + i), vector.get(i)); + } + } else { + assertTrue("unexpected non-null value at index: " + i, vector.isNull(i)); + } + } + + vector.zeroVector(); + + for (int i = 0; i < vector.getValueCapacity(); i += 2) { + vector.set(i, baseValue + i); + } + + for (int i = 0; i < vector.getValueCapacity(); i++) { + if (i % 2 == 0) { + assertFalse("unexpected null value at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, (baseValue + i), vector.get(i)); + } else { + assertTrue("unexpected non-null value at index: " + i, vector.isNull(i)); + } + } + + int valueCapacityBeforeRealloc = vector.getValueCapacity(); + vector.setSafe(valueCapacityBeforeRealloc + 1000, 400000000); + assertTrue(vector.getValueCapacity() >= valueCapacity * 4); + + for (int i = 0; i < vector.getValueCapacity(); i++) { + if (i == (valueCapacityBeforeRealloc + 1000)) { + assertFalse("unexpected null value at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, 400000000, vector.get(i)); + } else if (i < valueCapacityBeforeRealloc && (i % 2) == 0) { + assertFalse("unexpected null value at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, baseValue + i, vector.get(i)); + } else { + assertTrue("unexpected non-null value at index: " + i, vector.isNull(i)); + } + } + + /* reset the vector */ + int valueCapacityBeforeReset = vector.getValueCapacity(); + vector.reset(); + + /* capacity shouldn't change after reset */ + assertEquals(valueCapacityBeforeReset, vector.getValueCapacity()); + + /* vector data should be zeroed out */ + 
for (int i = 0; i < valueCapacityBeforeReset; i++) { + assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); + } + } + } + + /* + * Tests for Variable Width Vectors + * + * Covered types as of now + * + * -- VarCharVector + * -- VarBinaryVector + * + * TODO: + * + * -- VarCharVector + * -- VarBinaryVector + */ + + /** + * ARROW-7831: this checks that a slice taken off a buffer is still readable after that buffer's allocator is closed. + */ + @Test /* VarCharVector */ + public void testSplitAndTransfer1() { + try (final VarCharVector targetVector = newVarCharVector("split-target", allocator)) { + try (final VarCharVector sourceVector = newVarCharVector(EMPTY_SCHEMA_PATH, allocator)) { + sourceVector.allocateNew(1024 * 10, 1024); + + sourceVector.set(0, STR1); + sourceVector.set(1, STR2); + sourceVector.set(2, STR3); + sourceVector.setValueCount(3); + + final long allocatedMem = allocator.getAllocatedMemory(); + final int validityRefCnt = sourceVector.getValidityBuffer().refCnt(); + final int offsetRefCnt = sourceVector.getOffsetBuffer().refCnt(); + final int dataRefCnt = sourceVector.getDataBuffer().refCnt(); + + // split and transfer with slice starting at the beginning: this should not allocate anything new + sourceVector.splitAndTransferTo(0, 2, targetVector); + assertEquals(allocatedMem, allocator.getAllocatedMemory()); + // The validity and offset buffers are sliced from a same buffer.See BaseFixedWidthVector#allocateBytes. + // Therefore, the refcnt of the validity buffer is increased once since the startIndex is 0. The refcnt of the + // offset buffer is increased as well for the same reason. This amounts to a total of 2. 
+ assertEquals(validityRefCnt + 2, sourceVector.getValidityBuffer().refCnt()); + assertEquals(offsetRefCnt + 2, sourceVector.getOffsetBuffer().refCnt()); + assertEquals(dataRefCnt + 1, sourceVector.getDataBuffer().refCnt()); + } + assertArrayEquals(STR1, targetVector.get(0)); + assertArrayEquals(STR2, targetVector.get(1)); + } + } + + /** + * ARROW-7831: this checks that a vector that got sliced is still readable after the slice's allocator got closed. + */ + @Test /* VarCharVector */ + public void testSplitAndTransfer2() { + try (final VarCharVector sourceVector = newVarCharVector(EMPTY_SCHEMA_PATH, allocator)) { + try (final VarCharVector targetVector = newVarCharVector("split-target", allocator)) { + sourceVector.allocateNew(1024 * 10, 1024); + + sourceVector.set(0, STR1); + sourceVector.set(1, STR2); + sourceVector.set(2, STR3); + sourceVector.setValueCount(3); + + final long allocatedMem = allocator.getAllocatedMemory(); + final int validityRefCnt = sourceVector.getValidityBuffer().refCnt(); + final int offsetRefCnt = sourceVector.getOffsetBuffer().refCnt(); + final int dataRefCnt = sourceVector.getDataBuffer().refCnt(); + + // split and transfer with slice starting at the beginning: this should not allocate anything new + sourceVector.splitAndTransferTo(0, 2, targetVector); + assertEquals(allocatedMem, allocator.getAllocatedMemory()); + // The validity and offset buffers are sliced from a same buffer.See BaseFixedWidthVector#allocateBytes. + // Therefore, the refcnt of the validity buffer is increased once since the startIndex is 0. The refcnt of the + // offset buffer is increased as well for the same reason. This amounts to a total of 2. 
+ assertEquals(validityRefCnt + 2, sourceVector.getValidityBuffer().refCnt()); + assertEquals(offsetRefCnt + 2, sourceVector.getOffsetBuffer().refCnt()); + assertEquals(dataRefCnt + 1, sourceVector.getDataBuffer().refCnt()); + } + assertArrayEquals(STR1, sourceVector.get(0)); + assertArrayEquals(STR2, sourceVector.get(1)); + assertArrayEquals(STR3, sourceVector.get(2)); + } + } + + /** + * ARROW-7831: this checks an offset splitting optimization, in the case where all the values up to the start of the + * slice are null/empty, which avoids allocation for the offset buffer. + */ + @Test /* VarCharVector */ + public void testSplitAndTransfer3() { + try (final VarCharVector targetVector = newVarCharVector("split-target", allocator); + final VarCharVector sourceVector = newVarCharVector(EMPTY_SCHEMA_PATH, allocator)) { + sourceVector.allocateNew(1024 * 10, 1024); + + sourceVector.set(0, new byte[0]); + sourceVector.setNull(1); + sourceVector.set(2, STR1); + sourceVector.set(3, STR2); + sourceVector.set(4, STR3); + sourceVector.setValueCount(5); + + final long allocatedMem = allocator.getAllocatedMemory(); + final int validityRefCnt = sourceVector.getValidityBuffer().refCnt(); + final int offsetRefCnt = sourceVector.getOffsetBuffer().refCnt(); + final int dataRefCnt = sourceVector.getDataBuffer().refCnt(); + + sourceVector.splitAndTransferTo(2, 2, targetVector); + // because the offset starts at 0 since the first 2 values are empty/null, the allocation only consists in + // the size needed for the validity buffer + final long validitySize = + DefaultRoundingPolicy.DEFAULT_ROUNDING_POLICY.getRoundedSize( + BaseValueVector.getValidityBufferSizeFromCount(2)); + assertEquals(allocatedMem + validitySize, allocator.getAllocatedMemory()); + // The validity and offset buffers are sliced from a same buffer.See BaseFixedWidthVector#allocateBytes. 
+ // Since values up to the startIndex are empty/null, the offset buffer doesn't need to be reallocated and + // therefore its refcnt is increased by 1. + assertEquals(validityRefCnt + 1, sourceVector.getValidityBuffer().refCnt()); + assertEquals(offsetRefCnt + 1, sourceVector.getOffsetBuffer().refCnt()); + assertEquals(dataRefCnt + 1, sourceVector.getDataBuffer().refCnt()); + + assertArrayEquals(STR1, targetVector.get(0)); + assertArrayEquals(STR2, targetVector.get(1)); + } + } + + /** + * ARROW-7831: ensures that data is transferred from one allocator to another in case of 0-index start special cases. + */ + @Test /* VarCharVector */ + public void testSplitAndTransfer4() { + try (final BufferAllocator targetAllocator = allocator.newChildAllocator("target-alloc", 256, 256); + final VarCharVector targetVector = newVarCharVector("split-target", targetAllocator)) { + try (final BufferAllocator sourceAllocator = allocator.newChildAllocator("source-alloc", 256, 256); + final VarCharVector sourceVector = newVarCharVector(EMPTY_SCHEMA_PATH, sourceAllocator)) { + sourceVector.allocateNew(50, 3); + + sourceVector.set(0, STR1); + sourceVector.set(1, STR2); + sourceVector.set(2, STR3); + sourceVector.setValueCount(3); + + final long allocatedMem = allocator.getAllocatedMemory(); + final int validityRefCnt = sourceVector.getValidityBuffer().refCnt(); + final int offsetRefCnt = sourceVector.getOffsetBuffer().refCnt(); + final int dataRefCnt = sourceVector.getDataBuffer().refCnt(); + + // split and transfer with slice starting at the beginning: this should not allocate anything new + sourceVector.splitAndTransferTo(0, 2, targetVector); + assertEquals(allocatedMem, allocator.getAllocatedMemory()); + // Unlike testSplitAndTransfer1 where the buffers originated from the same allocator, the refcnts of each + // buffers for this test should be the same as what the source allocator ended up with. 
+ assertEquals(validityRefCnt, sourceVector.getValidityBuffer().refCnt()); + assertEquals(offsetRefCnt, sourceVector.getOffsetBuffer().refCnt()); + assertEquals(dataRefCnt, sourceVector.getDataBuffer().refCnt()); + } + assertArrayEquals(STR1, targetVector.get(0)); + assertArrayEquals(STR2, targetVector.get(1)); + } + } + + @Test /* VarCharVector */ + public void testNullableVarType1() { + + // Create a new value vector for 1024 integers. + try (final VarCharVector vector = newVarCharVector(EMPTY_SCHEMA_PATH, allocator)) { + vector.allocateNew(1024 * 10, 1024); + + vector.set(0, STR1); + vector.set(1, STR2); + vector.set(2, STR3); + vector.setSafe(3, STR3, 1, STR3.length - 1); + vector.setSafe(4, STR3, 2, STR3.length - 2); + ByteBuffer str3ByteBuffer = ByteBuffer.wrap(STR3); + vector.setSafe(5, str3ByteBuffer, 1, STR3.length - 1); + vector.setSafe(6, str3ByteBuffer, 2, STR3.length - 2); + + // Set with convenience function + Text txt = new Text("foo"); + vector.setSafe(7, txt); + + // Check the sample strings. + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(3)); + assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(4)); + assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(5)); + assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(6)); + + // Check returning a Text object + assertEquals(txt, vector.getObject(7)); + + // Ensure null value throws. + boolean b = false; + assertNull(vector.get(8)); + } + } + + @Test /* VarBinaryVector */ + public void testNullableVarType2() { + + // Create a new value vector for 1024 integers. 
+ try (final VarBinaryVector vector = newVarBinaryVector(EMPTY_SCHEMA_PATH, allocator)) { + vector.allocateNew(1024 * 10, 1024); + + vector.set(0, STR1); + vector.set(1, STR2); + vector.set(2, STR3); + vector.setSafe(3, STR3, 1, STR3.length - 1); + vector.setSafe(4, STR3, 2, STR3.length - 2); + ByteBuffer str3ByteBuffer = ByteBuffer.wrap(STR3); + vector.setSafe(5, str3ByteBuffer, 1, STR3.length - 1); + vector.setSafe(6, str3ByteBuffer, 2, STR3.length - 2); + + // Check the sample strings. + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(3)); + assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(4)); + assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(5)); + assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(6)); + + // Ensure null value throws. + assertNull(vector.get(7)); + } + } + + + /* + * generic tests + * + * -- lastSet() and setValueCount() + * -- fillEmpties() + * -- VectorLoader and VectorUnloader + * -- some realloc tests + * + * TODO: + * + * The realloc() related tests below should be moved up and we need to + * add realloc related tests (edge cases) for more vector types. + */ + + @Test /* Float8Vector */ + public void testReallocAfterVectorTransfer1() { + try (final Float8Vector vector = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { + int initialCapacity = 4096; + boolean error = false; + + /* use the default capacity; 4096*8 => 32KB */ + vector.setInitialCapacity(initialCapacity); + vector.allocateNew(); + + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); + + double baseValue = 100.375; + + for (int i = 0; i < initialCapacity; i++) { + vector.setSafe(i, baseValue + (double) i); + } + + /* the above setSafe calls should not have triggered a realloc as + * we are within the capacity. 
check the vector contents + */ + assertEquals(initialCapacity, vector.getValueCapacity()); + + for (int i = 0; i < initialCapacity; i++) { + double value = vector.get(i); + assertEquals(baseValue + (double) i, value, 0); + } + + /* this should trigger a realloc */ + vector.setSafe(initialCapacity, baseValue + (double) initialCapacity); + assertTrue(vector.getValueCapacity() >= initialCapacity * 2); + int capacityAfterRealloc1 = vector.getValueCapacity(); + + for (int i = initialCapacity + 1; i < capacityAfterRealloc1; i++) { + vector.setSafe(i, baseValue + (double) i); + } + + for (int i = 0; i < capacityAfterRealloc1; i++) { + double value = vector.get(i); + assertEquals(baseValue + (double) i, value, 0); + } + + /* this should trigger a realloc */ + vector.setSafe(capacityAfterRealloc1, baseValue + (double) (capacityAfterRealloc1)); + assertTrue(vector.getValueCapacity() >= initialCapacity * 4); + int capacityAfterRealloc2 = vector.getValueCapacity(); + + for (int i = capacityAfterRealloc1 + 1; i < capacityAfterRealloc2; i++) { + vector.setSafe(i, baseValue + (double) i); + } + + for (int i = 0; i < capacityAfterRealloc2; i++) { + double value = vector.get(i); + assertEquals(baseValue + (double) i, value, 0); + } + + /* at this point we are working with a 128KB buffer data for this + * vector. 
now let's transfer this vector + */ + + TransferPair transferPair = vector.getTransferPair(allocator); + transferPair.transfer(); + + Float8Vector toVector = (Float8Vector) transferPair.getTo(); + + /* now let's realloc the toVector */ + toVector.reAlloc(); + assertTrue(toVector.getValueCapacity() >= initialCapacity * 8); + + for (int i = 0; i < toVector.getValueCapacity(); i++) { + if (i < capacityAfterRealloc2) { + assertEquals(baseValue + (double) i, toVector.get(i), 0); + } else { + assertTrue(toVector.isNull(i)); + } + } + + toVector.close(); + } + } + + @Test /* Float8Vector */ + public void testReallocAfterVectorTransfer2() { + try (final Float8Vector vector = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { + int initialCapacity = 4096; + boolean error = false; + + vector.allocateNew(initialCapacity); + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); + + double baseValue = 100.375; + + for (int i = 0; i < initialCapacity; i++) { + vector.setSafe(i, baseValue + (double) i); + } + + /* the above setSafe calls should not have triggered a realloc as + * we are within the capacity. 
check the vector contents + */ + assertEquals(initialCapacity, vector.getValueCapacity()); + + for (int i = 0; i < initialCapacity; i++) { + double value = vector.get(i); + assertEquals(baseValue + (double) i, value, 0); + } + + /* this should trigger a realloc */ + vector.setSafe(initialCapacity, baseValue + (double) initialCapacity); + assertTrue(vector.getValueCapacity() >= initialCapacity * 2); + int capacityAfterRealloc1 = vector.getValueCapacity(); + + for (int i = initialCapacity + 1; i < capacityAfterRealloc1; i++) { + vector.setSafe(i, baseValue + (double) i); + } + + for (int i = 0; i < capacityAfterRealloc1; i++) { + double value = vector.get(i); + assertEquals(baseValue + (double) i, value, 0); + } + + /* this should trigger a realloc */ + vector.setSafe(capacityAfterRealloc1, baseValue + (double) (capacityAfterRealloc1)); + assertTrue(vector.getValueCapacity() >= initialCapacity * 4); + int capacityAfterRealloc2 = vector.getValueCapacity(); + + for (int i = capacityAfterRealloc1 + 1; i < capacityAfterRealloc2; i++) { + vector.setSafe(i, baseValue + (double) i); + } + + for (int i = 0; i < capacityAfterRealloc2; i++) { + double value = vector.get(i); + assertEquals(baseValue + (double) i, value, 0); + } + + /* at this point we are working with a 128KB buffer data for this + * vector. 
now let's transfer this vector + */ + + TransferPair transferPair = vector.getTransferPair(allocator); + transferPair.transfer(); + + Float8Vector toVector = (Float8Vector) transferPair.getTo(); + + /* check toVector contents before realloc */ + for (int i = 0; i < toVector.getValueCapacity(); i++) { + assertFalse("unexpected null value at index: " + i, toVector.isNull(i)); + double value = toVector.get(i); + assertEquals("unexpected value at index: " + i, baseValue + (double) i, value, 0); + } + + /* now let's realloc the toVector and check contents again */ + toVector.reAlloc(); + assertTrue(toVector.getValueCapacity() >= initialCapacity * 8); + + for (int i = 0; i < toVector.getValueCapacity(); i++) { + if (i < capacityAfterRealloc2) { + assertFalse("unexpected null value at index: " + i, toVector.isNull(i)); + double value = toVector.get(i); + assertEquals("unexpected value at index: " + i, baseValue + (double) i, value, 0); + } else { + assertTrue("unexpected non-null value at index: " + i, toVector.isNull(i)); + } + } + + toVector.close(); + } + } + + @Test /* VarCharVector */ + public void testReallocAfterVectorTransfer3() { + try (final VarCharVector vector = new VarCharVector(EMPTY_SCHEMA_PATH, allocator)) { + /* 4096 values with 10 byte per record */ + vector.allocateNew(4096 * 10, 4096); + int valueCapacity = vector.getValueCapacity(); + assertTrue(valueCapacity >= 4096); + + /* populate the vector */ + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + vector.set(i, STR1); + } else { + vector.set(i, STR2); + } + } + + /* Check the vector output */ + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertArrayEquals(STR1, vector.get(i)); + } else { + assertArrayEquals(STR2, vector.get(i)); + } + } + + /* trigger first realloc */ + vector.setSafe(valueCapacity, STR2, 0, STR2.length); + assertTrue(vector.getValueCapacity() >= 2 * valueCapacity); + while (vector.getByteCapacity() < 10 * vector.getValueCapacity()) { + 
vector.reallocDataBuffer(); + } + + /* populate the remaining vector */ + for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { + if ((i & 1) == 1) { + vector.set(i, STR1); + } else { + vector.set(i, STR2); + } + } + + /* Check the vector output */ + valueCapacity = vector.getValueCapacity(); + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertArrayEquals(STR1, vector.get(i)); + } else { + assertArrayEquals(STR2, vector.get(i)); + } + } + + /* trigger second realloc */ + vector.setSafe(valueCapacity + 10, STR2, 0, STR2.length); + assertTrue(vector.getValueCapacity() >= 2 * valueCapacity); + while (vector.getByteCapacity() < 10 * vector.getValueCapacity()) { + vector.reallocDataBuffer(); + } + + /* populate the remaining vector */ + for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { + if ((i & 1) == 1) { + vector.set(i, STR1); + } else { + vector.set(i, STR2); + } + } + + /* Check the vector output */ + valueCapacity = vector.getValueCapacity(); + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertArrayEquals(STR1, vector.get(i)); + } else { + assertArrayEquals(STR2, vector.get(i)); + } + } + + /* we are potentially working with 4x the size of vector buffer + * that we initially started with. Now let's transfer the vector. 
+ */ + + TransferPair transferPair = vector.getTransferPair(allocator); + transferPair.transfer(); + VarCharVector toVector = (VarCharVector) transferPair.getTo(); + valueCapacity = toVector.getValueCapacity(); + + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertArrayEquals(STR1, toVector.get(i)); + } else { + assertArrayEquals(STR2, toVector.get(i)); + } + } + + toVector.close(); + } + } + + @Test /* IntVector */ + public void testReallocAfterVectorTransfer4() { + try (final IntVector vector = new IntVector(EMPTY_SCHEMA_PATH, allocator)) { + + /* 4096 values */ + vector.allocateNew(4096); + int valueCapacity = vector.getValueCapacity(); + assertTrue(valueCapacity >= 4096); + + /* populate the vector */ + int baseValue = 1000; + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 0) { + vector.set(i, 1000 + i); + } + } + + /* Check the vector output */ + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 0) { + assertEquals(1000 + i, vector.get(i)); + } else { + assertTrue(vector.isNull(i)); + } + } + + /* trigger first realloc */ + vector.setSafe(valueCapacity, 10000000); + assertTrue(vector.getValueCapacity() >= valueCapacity * 2); + + /* populate the remaining vector */ + for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { + if ((i & 1) == 0) { + vector.set(i, 1000 + i); + } + } + + /* Check the vector output */ + valueCapacity = vector.getValueCapacity(); + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 0) { + assertEquals(1000 + i, vector.get(i)); + } else { + assertTrue(vector.isNull(i)); + } + } + + /* trigger second realloc */ + vector.setSafe(valueCapacity, 10000000); + assertTrue(vector.getValueCapacity() >= valueCapacity * 2); + + /* populate the remaining vector */ + for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { + if ((i & 1) == 0) { + vector.set(i, 1000 + i); + } + } + + /* Check the vector output */ + valueCapacity = vector.getValueCapacity(); + for (int i = 0; i < 
valueCapacity; i++) { + if ((i & 1) == 0) { + assertEquals(1000 + i, vector.get(i)); + } else { + assertTrue(vector.isNull(i)); + } + } + + /* we are potentially working with 4x the size of vector buffer + * that we initially started with. Now let's transfer the vector. + */ + + TransferPair transferPair = vector.getTransferPair(allocator); + transferPair.transfer(); + IntVector toVector = (IntVector) transferPair.getTo(); + /* value capacity of source and target vectors should be same after + * the transfer. + */ + assertEquals(valueCapacity, toVector.getValueCapacity()); + + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 0) { + assertEquals(1000 + i, toVector.get(i)); + } else { + assertTrue(toVector.isNull(i)); + } + } + + toVector.close(); + } + } + + @Test + public void testReAllocFixedWidthVector() { + // Create a new value vector for 1024 integers + try (final Float4Vector vector = newVector(Float4Vector.class, EMPTY_SCHEMA_PATH, MinorType.FLOAT4, allocator)) { + vector.allocateNew(1024); + + assertTrue(vector.getValueCapacity() >= 1024); + int initialCapacity = vector.getValueCapacity(); + + // Put values in indexes that fall within the initial allocation + vector.setSafe(0, 100.1f); + vector.setSafe(100, 102.3f); + vector.setSafe(1023, 104.5f); + + // Now try to put values in space that falls beyond the initial allocation + vector.setSafe(2000, 105.5f); + + // Check valueCapacity is more than initial allocation + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); + + assertEquals(100.1f, vector.get(0), 0); + assertEquals(102.3f, vector.get(100), 0); + assertEquals(104.5f, vector.get(1023), 0); + assertEquals(105.5f, vector.get(2000), 0); + + // Set the valueCount to be more than valueCapacity of current allocation. 
This is possible for ValueVectors + // as we don't call setSafe for null values, but we do call setValueCount when all values are inserted into the + // vector + vector.setValueCount(vector.getValueCapacity() + 200); + } + } + + @Test + public void testReAllocVariableWidthVector() { + try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + vector.setInitialCapacity(4095); + vector.allocateNew(); + + int initialCapacity = vector.getValueCapacity(); + assertTrue(initialCapacity >= 4095); + + /* Put values in indexes that fall within the initial allocation */ + vector.setSafe(0, STR1, 0, STR1.length); + vector.setSafe(initialCapacity - 1, STR2, 0, STR2.length); + + /* the above set calls should NOT have triggered a realloc */ + assertEquals(initialCapacity, vector.getValueCapacity()); + + /* Now try to put values in space that falls beyond the initial allocation */ + vector.setSafe(initialCapacity + 200, STR3, 0, STR3.length); + + /* Check valueCapacity is more than initial allocation */ + assertTrue(initialCapacity * 2 <= vector.getValueCapacity()); + + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(initialCapacity - 1)); + assertArrayEquals(STR3, vector.get(initialCapacity + 200)); + + // Set the valueCount to be more than valueCapacity of current allocation. This is possible for ValueVectors + // as we don't call setSafe for null values, but we do call setValueCount when the current batch is processed. 
+ vector.setValueCount(vector.getValueCapacity() + 200); + } + } + + @Test + public void testFillEmptiesNotOverfill() { + try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + vector.setInitialCapacity(4095); + vector.allocateNew(); + + int initialCapacity = vector.getValueCapacity(); + assertTrue(initialCapacity >= 4095); + + vector.setSafe(4094, "hello".getBytes(), 0, 5); + /* the above set method should NOT have triggered a realloc */ + assertEquals(initialCapacity, vector.getValueCapacity()); + + long bufSizeBefore = vector.getFieldBuffers().get(1).capacity(); + vector.setValueCount(initialCapacity); + assertEquals(bufSizeBefore, vector.getFieldBuffers().get(1).capacity()); + assertEquals(initialCapacity, vector.getValueCapacity()); + } + } + + @Test + public void testSetSafeWithArrowBufNoExcessAllocs() { + final int numValues = BaseFixedWidthVector.INITIAL_VALUE_ALLOCATION * 2; + final byte[] valueBytes = "hello world".getBytes(); + final int valueBytesLength = valueBytes.length; + final int isSet = 1; + + try ( + final VarCharVector fromVector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, + MinorType.VARCHAR, allocator); + final VarCharVector toVector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, + MinorType.VARCHAR, allocator)) { + /* + * Populate the from vector with 'numValues' with byte-arrays, each of size 'valueBytesLength'. + */ + fromVector.setInitialCapacity(numValues); + fromVector.allocateNew(); + for (int i = 0; i < numValues; ++i) { + fromVector.setSafe(i, valueBytes, 0 /*start*/, valueBytesLength); + } + fromVector.setValueCount(numValues); + ArrowBuf fromDataBuffer = fromVector.getDataBuffer(); + assertTrue(numValues * valueBytesLength <= fromDataBuffer.capacity()); + + /* + * Copy the entries one-by-one from 'fromVector' to 'toVector', but use the setSafe with + * ArrowBuf API (instead of setSafe with byte-array). 
+ */ + toVector.setInitialCapacity(numValues); + toVector.allocateNew(); + for (int i = 0; i < numValues; i++) { + int start = fromVector.getStartOffset(i); + int end = fromVector.getStartOffset(i + 1); + toVector.setSafe(i, isSet, start, end, fromDataBuffer); + } + + /* + * Since the 'fromVector' and 'toVector' have the same initial capacity, and were populated + * with the same varchar elements, the allocations and hence, the final capacity should be + * the same. + */ + assertEquals(fromDataBuffer.capacity(), toVector.getDataBuffer().capacity()); + } + } + + @Test + public void testCopyFromWithNulls() { + try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); + final VarCharVector vector2 = + newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + + vector.setInitialCapacity(4095); + vector.allocateNew(); + int capacity = vector.getValueCapacity(); + assertTrue(capacity >= 4095); + + for (int i = 0; i < capacity; i++) { + if (i % 3 == 0) { + continue; + } + byte[] b = Integer.toString(i).getBytes(); + vector.setSafe(i, b, 0, b.length); + } + + /* NO reAlloc() should have happened in setSafe() */ + assertEquals(capacity, vector.getValueCapacity()); + + vector.setValueCount(capacity); + + for (int i = 0; i < capacity; i++) { + if (i % 3 == 0) { + assertNull(vector.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector.getObject(i).toString()); + } + } + + vector2.setInitialCapacity(4095); + vector2.allocateNew(); + int capacity2 = vector2.getValueCapacity(); + assertEquals(capacity2, capacity); + + for (int i = 0; i < capacity; i++) { + vector2.copyFromSafe(i, i, vector); + if (i % 3 == 0) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + } + } + + /* NO reAlloc() should have happened in copyFrom */ + assertEquals(capacity, 
vector2.getValueCapacity()); + + vector2.setValueCount(capacity); + + for (int i = 0; i < capacity; i++) { + if (i % 3 == 0) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + } + } + } + } + + @Test + public void testCopyFromWithNulls1() { + try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); + final VarCharVector vector2 = + newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + + vector.setInitialCapacity(4095); + vector.allocateNew(); + int capacity = vector.getValueCapacity(); + assertTrue(capacity >= 4095); + + for (int i = 0; i < capacity; i++) { + if (i % 3 == 0) { + continue; + } + byte[] b = Integer.toString(i).getBytes(); + vector.setSafe(i, b, 0, b.length); + } + + /* NO reAlloc() should have happened in setSafe() */ + assertEquals(capacity, vector.getValueCapacity()); + + vector.setValueCount(capacity); + + for (int i = 0; i < capacity; i++) { + if (i % 3 == 0) { + assertNull(vector.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector.getObject(i).toString()); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024 * 10, 1024); + + int capacity2 = vector2.getValueCapacity(); + assertTrue(capacity2 >= 1024); + assertTrue(capacity2 <= capacity); + + for (int i = 0; i < capacity; i++) { + vector2.copyFromSafe(i, i, vector); + if (i % 3 == 0) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + } + } + + /* 2 reAllocs should have happened in copyFromSafe() */ + assertEquals(capacity, vector2.getValueCapacity()); + + vector2.setValueCount(capacity); + + for (int i = 0; i < capacity; i++) { + if (i % 3 == 0) { + 
assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + } + } + } + } + + @Test + public void testSetLastSetUsage() { + try (final VarCharVector vector = new VarCharVector("myvector", allocator)) { + vector.allocateNew(1024 * 10, 1024); + + setBytes(0, STR1, vector); + setBytes(1, STR2, vector); + setBytes(2, STR3, vector); + setBytes(3, STR4, vector); + setBytes(4, STR5, vector); + setBytes(5, STR6, vector); + + /* Check current lastSet */ + assertEquals(-1, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + + /* + * If we don't do setLastSe(5) before setValueCount(), then the latter will corrupt + * the value vector by filling in all positions [0,valuecount-1] will empty byte arrays. + * Run the test by commenting out next line and we should see incorrect vector output. 
+ */ + vector.setLastSet(5); + vector.setValueCount(20); + + /* Check current lastSet */ + assertEquals(19, vector.getLastSet()); + + /* Check the vector output again */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + assertEquals(0, vector.getValueLength(10)); + assertEquals(0, vector.getValueLength(11)); + assertEquals(0, vector.getValueLength(12)); + assertEquals(0, vector.getValueLength(13)); + assertEquals(0, vector.getValueLength(14)); + assertEquals(0, vector.getValueLength(15)); + assertEquals(0, vector.getValueLength(16)); + assertEquals(0, vector.getValueLength(17)); + assertEquals(0, vector.getValueLength(18)); + assertEquals(0, vector.getValueLength(19)); + + /* Check offsets */ + assertEquals(0, vector.offsetBuffer.getInt(0 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(6, vector.offsetBuffer.getInt(1 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(16, vector.offsetBuffer.getInt(2 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(21, vector.offsetBuffer.getInt(3 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(30, vector.offsetBuffer.getInt(4 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(34, vector.offsetBuffer.getInt(5 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(6 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(7 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(8 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(9 * BaseVariableWidthVector.OFFSET_WIDTH)); + 
assertEquals(40, vector.offsetBuffer.getInt(10 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(11 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(12 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(13 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(14 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(15 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(16 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(17 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(18 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(19 * BaseVariableWidthVector.OFFSET_WIDTH)); + + vector.set(19, STR6); + assertArrayEquals(STR6, vector.get(19)); + assertEquals(40, vector.offsetBuffer.getInt(19 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(46, vector.offsetBuffer.getInt(20 * BaseVariableWidthVector.OFFSET_WIDTH)); + } + } + + @Test + public void testVectorLoadUnload() { + + try (final VarCharVector vector1 = new VarCharVector("myvector", allocator)) { + + setVector(vector1, STR1, STR2, STR3, STR4, STR5, STR6); + + assertEquals(5, vector1.getLastSet()); + vector1.setValueCount(15); + assertEquals(14, vector1.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector1.get(0)); + assertArrayEquals(STR2, vector1.get(1)); + assertArrayEquals(STR3, vector1.get(2)); + assertArrayEquals(STR4, vector1.get(3)); + assertArrayEquals(STR5, vector1.get(4)); + assertArrayEquals(STR6, vector1.get(5)); + + Field field = vector1.getField(); + String fieldName = field.getName(); + + List fields = new ArrayList<>(); + List fieldVectors = new ArrayList<>(); + + fields.add(field); + fieldVectors.add(vector1); + + Schema schema = new 
Schema(fields); + + VectorSchemaRoot schemaRoot1 = new VectorSchemaRoot(schema, fieldVectors, vector1.getValueCount()); + VectorUnloader vectorUnloader = new VectorUnloader(schemaRoot1); + + try ( + ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); + BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("new vector", 0, Long.MAX_VALUE); + VectorSchemaRoot schemaRoot2 = VectorSchemaRoot.create(schema, finalVectorsAllocator); + ) { + + VectorLoader vectorLoader = new VectorLoader(schemaRoot2); + vectorLoader.load(recordBatch); + + VarCharVector vector2 = (VarCharVector) schemaRoot2.getVector(fieldName); + /* + * lastSet would have internally been set by VectorLoader.load() when it invokes + * loadFieldBuffers. + */ + assertEquals(14, vector2.getLastSet()); + vector2.setValueCount(25); + assertEquals(24, vector2.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector2.get(0)); + assertArrayEquals(STR2, vector2.get(1)); + assertArrayEquals(STR3, vector2.get(2)); + assertArrayEquals(STR4, vector2.get(3)); + assertArrayEquals(STR5, vector2.get(4)); + assertArrayEquals(STR6, vector2.get(5)); + } + } + } + + @Test + public void testFillEmptiesUsage() { + try (final VarCharVector vector = new VarCharVector("myvector", allocator)) { + + vector.allocateNew(1024 * 10, 1024); + + setBytes(0, STR1, vector); + setBytes(1, STR2, vector); + setBytes(2, STR3, vector); + setBytes(3, STR4, vector); + setBytes(4, STR5, vector); + setBytes(5, STR6, vector); + + /* Check current lastSet */ + assertEquals(-1, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + + vector.setLastSet(5); + /* fill empty byte arrays from index [6, 9] */ + vector.fillEmpties(10); + + /* Check current 
lastSet */ + assertEquals(9, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + + setBytes(10, STR1, vector); + setBytes(11, STR2, vector); + + vector.setLastSet(11); + /* fill empty byte arrays from index [12, 14] */ + vector.setValueCount(15); + + /* Check current lastSet */ + assertEquals(14, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + assertArrayEquals(STR1, vector.get(10)); + assertArrayEquals(STR2, vector.get(11)); + assertEquals(0, vector.getValueLength(12)); + assertEquals(0, vector.getValueLength(13)); + assertEquals(0, vector.getValueLength(14)); + + /* Check offsets */ + assertEquals(0, + vector.offsetBuffer.getInt(0 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(6, + vector.offsetBuffer.getInt(1 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(16, + vector.offsetBuffer.getInt(2 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(21, + vector.offsetBuffer.getInt(3 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(30, + vector.offsetBuffer.getInt(4 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(34, + vector.offsetBuffer.getInt(5 * 
BaseVariableWidthVector.OFFSET_WIDTH)); + + assertEquals(40, + vector.offsetBuffer.getInt(6 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getInt(7 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getInt(8 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getInt(9 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getInt(10 * BaseVariableWidthVector.OFFSET_WIDTH)); + + assertEquals(46, + vector.offsetBuffer.getInt(11 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(56, + vector.offsetBuffer.getInt(12 * BaseVariableWidthVector.OFFSET_WIDTH)); + + assertEquals(56, + vector.offsetBuffer.getInt(13 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(56, + vector.offsetBuffer.getInt(14 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(56, + vector.offsetBuffer.getInt(15 * BaseVariableWidthVector.OFFSET_WIDTH)); + } + } + + @Test /* VarCharVector */ + public void testGetBufferAddress1() { + + try (final VarCharVector vector = new VarCharVector("myvector", allocator)) { + + setVector(vector, STR1, STR2, STR3, STR4, STR5, STR6); + vector.setValueCount(15); + + /* check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + + List<ArrowBuf> buffers = vector.getFieldBuffers(); /* typed list so memoryAddress() resolves; asserted order below is validity, offset, data */ + long bitAddress = vector.getValidityBufferAddress(); + long offsetAddress = vector.getOffsetBufferAddress(); + long dataAddress = vector.getDataBufferAddress(); + + assertEquals(3, buffers.size()); + assertEquals(bitAddress, buffers.get(0).memoryAddress()); + assertEquals(offsetAddress, buffers.get(1).memoryAddress()); + assertEquals(dataAddress, buffers.get(2).memoryAddress()); + } + } + + @Test /* IntVector */ + public void 
testGetBufferAddress2() { + try (final IntVector vector = new IntVector("myvector", allocator)) { + boolean error = false; + vector.allocateNew(16); + + /* populate the vector */ + for (int i = 0; i < 16; i += 2) { + vector.set(i, i + 10); + } + + /* check the vector output */ + for (int i = 0; i < 16; i += 2) { + assertEquals(i + 10, vector.get(i)); + } + + List buffers = vector.getFieldBuffers(); + long bitAddress = vector.getValidityBufferAddress(); + long dataAddress = vector.getDataBufferAddress(); + + try { + long offsetAddress = vector.getOffsetBufferAddress(); + } catch (UnsupportedOperationException ue) { + error = true; + } finally { + assertTrue(error); + } + + assertEquals(2, buffers.size()); + assertEquals(bitAddress, buffers.get(0).memoryAddress()); + assertEquals(dataAddress, buffers.get(1).memoryAddress()); + } + } + + @Test + public void testMultipleClose() { + BufferAllocator vectorAllocator = allocator.newChildAllocator("vector_allocator", 0, Long.MAX_VALUE); + IntVector vector = newVector(IntVector.class, EMPTY_SCHEMA_PATH, MinorType.INT, vectorAllocator); + vector.close(); + vectorAllocator.close(); + vector.close(); + vectorAllocator.close(); + } + + /* this method is used by the tests to bypass the vector set methods that manipulate + * lastSet. The method is to test the lastSet property and that's why we load the vector + * in a way that lastSet is not set automatically. 
+ */ + public static void setBytes(int index, byte[] bytes, VarCharVector vector) { + final int currentOffset = vector.offsetBuffer.getInt(index * BaseVariableWidthVector.OFFSET_WIDTH); + + BitVectorHelper.setBit(vector.validityBuffer, index); + vector.offsetBuffer.setInt((index + 1) * BaseVariableWidthVector.OFFSET_WIDTH, currentOffset + bytes.length); + vector.valueBuffer.setBytes(currentOffset, bytes, 0, bytes.length); + } + + @Test /* VarCharVector */ + public void testSetInitialCapacity() { + try (final VarCharVector vector = new VarCharVector(EMPTY_SCHEMA_PATH, allocator)) { + + /* use the default 8 data bytes on average per element */ + int defaultCapacity = BaseValueVector.INITIAL_VALUE_ALLOCATION - 1; + vector.setInitialCapacity(defaultCapacity); + vector.allocateNew(); + assertEquals(defaultCapacity, vector.getValueCapacity()); + assertEquals(CommonUtil.nextPowerOfTwo(defaultCapacity * 8), vector.getDataBuffer().capacity()); + + vector.setInitialCapacity(defaultCapacity, 1); + vector.allocateNew(); + assertEquals(defaultCapacity, vector.getValueCapacity()); + assertEquals(CommonUtil.nextPowerOfTwo(defaultCapacity), vector.getDataBuffer().capacity()); + + vector.setInitialCapacity(defaultCapacity, 0.1); + vector.allocateNew(); + assertEquals(defaultCapacity, vector.getValueCapacity()); + assertEquals(CommonUtil.nextPowerOfTwo((int) (defaultCapacity * 0.1)), vector.getDataBuffer().capacity()); + + vector.setInitialCapacity(defaultCapacity, 0.01); + vector.allocateNew(); + assertEquals(defaultCapacity, vector.getValueCapacity()); + assertEquals(CommonUtil.nextPowerOfTwo((int) (defaultCapacity * 0.01)), vector.getDataBuffer().capacity()); + + vector.setInitialCapacity(5, 0.01); + vector.allocateNew(); + assertEquals(5, vector.getValueCapacity()); + assertEquals(2, vector.getDataBuffer().capacity()); + } + } + + @Test + public void testDefaultAllocNewAll() { + int defaultCapacity = BaseValueVector.INITIAL_VALUE_ALLOCATION; + int expectedSize; + long 
beforeSize; + try (BufferAllocator childAllocator = allocator.newChildAllocator("defaultAllocs", 0, Long.MAX_VALUE); + final IntVector intVector = new IntVector(EMPTY_SCHEMA_PATH, childAllocator); + final BigIntVector bigIntVector = new BigIntVector(EMPTY_SCHEMA_PATH, childAllocator); + final BitVector bitVector = new BitVector(EMPTY_SCHEMA_PATH, childAllocator); + final DecimalVector decimalVector = new DecimalVector(EMPTY_SCHEMA_PATH, childAllocator, 38, 6); + final VarCharVector varCharVector = new VarCharVector(EMPTY_SCHEMA_PATH, childAllocator)) { + + // verify that the wastage is within bounds for IntVector. + beforeSize = childAllocator.getAllocatedMemory(); + intVector.allocateNew(); + assertTrue(intVector.getValueCapacity() >= defaultCapacity); + expectedSize = (defaultCapacity * IntVector.TYPE_WIDTH) + + BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity); + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + // verify that the wastage is within bounds for BigIntVector. + beforeSize = childAllocator.getAllocatedMemory(); + bigIntVector.allocateNew(); + assertTrue(bigIntVector.getValueCapacity() >= defaultCapacity); + expectedSize = (defaultCapacity * bigIntVector.TYPE_WIDTH) + + BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity); + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + // verify that the wastage is within bounds for DecimalVector. + beforeSize = childAllocator.getAllocatedMemory(); + decimalVector.allocateNew(); + assertTrue(decimalVector.getValueCapacity() >= defaultCapacity); + expectedSize = (defaultCapacity * decimalVector.TYPE_WIDTH) + + BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity); + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + // verify that the wastage is within bounds for VarCharVector. 
+ // var char vector have an offsets array that is 1 less than defaultCapacity + beforeSize = childAllocator.getAllocatedMemory(); + varCharVector.allocateNew(); + assertTrue(varCharVector.getValueCapacity() >= defaultCapacity - 1); + expectedSize = (defaultCapacity * VarCharVector.OFFSET_WIDTH) + + BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity) + + defaultCapacity * 8; + // wastage should be less than 5%. + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + // verify that the wastage is within bounds for BitVector. + beforeSize = childAllocator.getAllocatedMemory(); + bitVector.allocateNew(); + assertTrue(bitVector.getValueCapacity() >= defaultCapacity); + expectedSize = BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity) * 2; + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + } + } + + @Test + public void testSetNullableVarCharHolder() { + try (VarCharVector vector = new VarCharVector("", allocator)) { + vector.allocateNew(100, 10); + + NullableVarCharHolder nullHolder = new NullableVarCharHolder(); + nullHolder.isSet = 0; + + NullableVarCharHolder stringHolder = new NullableVarCharHolder(); + stringHolder.isSet = 1; + + String str = "hello"; + ArrowBuf buf = allocator.buffer(16); + buf.setBytes(0, str.getBytes()); + + stringHolder.start = 0; + stringHolder.end = str.length(); + stringHolder.buffer = buf; + + vector.set(0, nullHolder); + vector.set(1, stringHolder); + + // verify results + assertTrue(vector.isNull(0)); + assertEquals(str, new String(vector.get(1))); + + buf.close(); + } + } + + @Test + public void testSetNullableVarCharHolderSafe() { + try (VarCharVector vector = new VarCharVector("", allocator)) { + vector.allocateNew(5, 1); + + NullableVarCharHolder nullHolder = new NullableVarCharHolder(); + nullHolder.isSet = 0; + + NullableVarCharHolder stringHolder = new NullableVarCharHolder(); + stringHolder.isSet = 1; + + String str = 
"hello world"; + ArrowBuf buf = allocator.buffer(16); + buf.setBytes(0, str.getBytes()); + + stringHolder.start = 0; + stringHolder.end = str.length(); + stringHolder.buffer = buf; + + vector.setSafe(0, stringHolder); + vector.setSafe(1, nullHolder); + + // verify results + assertEquals(str, new String(vector.get(0))); + assertTrue(vector.isNull(1)); + + buf.close(); + } + } + + @Test + public void testSetNullableVarBinaryHolder() { + try (VarBinaryVector vector = new VarBinaryVector("", allocator)) { + vector.allocateNew(100, 10); + + NullableVarBinaryHolder nullHolder = new NullableVarBinaryHolder(); + nullHolder.isSet = 0; + + NullableVarBinaryHolder binHolder = new NullableVarBinaryHolder(); + binHolder.isSet = 1; + + String str = "hello"; + ArrowBuf buf = allocator.buffer(16); + buf.setBytes(0, str.getBytes()); + + binHolder.start = 0; + binHolder.end = str.length(); + binHolder.buffer = buf; + + vector.set(0, nullHolder); + vector.set(1, binHolder); + + // verify results + assertTrue(vector.isNull(0)); + assertEquals(str, new String(vector.get(1))); + + buf.close(); + } + } + + @Test + public void testSetNullableVarBinaryHolderSafe() { + try (VarBinaryVector vector = new VarBinaryVector("", allocator)) { + vector.allocateNew(5, 1); + + NullableVarBinaryHolder nullHolder = new NullableVarBinaryHolder(); + nullHolder.isSet = 0; + + NullableVarBinaryHolder binHolder = new NullableVarBinaryHolder(); + binHolder.isSet = 1; + + String str = "hello world"; + ArrowBuf buf = allocator.buffer(16); + buf.setBytes(0, str.getBytes()); + + binHolder.start = 0; + binHolder.end = str.length(); + binHolder.buffer = buf; + + vector.setSafe(0, binHolder); + vector.setSafe(1, nullHolder); + + // verify results + assertEquals(str, new String(vector.get(0))); + assertTrue(vector.isNull(1)); + + buf.close(); + } + } + + @Test + public void testGetPointerFixedWidth() { + final int vectorLength = 100; + try (IntVector vec1 = new IntVector("vec1", allocator); + IntVector vec2 = new 
IntVector("vec2", allocator)) { + vec1.allocateNew(vectorLength); + vec2.allocateNew(vectorLength); + + for (int i = 0; i < vectorLength; i++) { + if (i % 10 == 0) { + vec1.setNull(i); + vec2.setNull(i); + } else { + vec1.set(i, i * 1234); + vec2.set(i, i * 1234); + } + } + + ArrowBufPointer ptr1 = new ArrowBufPointer(); + ArrowBufPointer ptr2 = new ArrowBufPointer(); + + for (int i = 0; i < vectorLength; i++) { + vec1.getDataPointer(i, ptr1); + vec2.getDataPointer(i, ptr2); + + if (i % 10 == 0) { + assertNull(ptr1.getBuf()); + assertNull(ptr2.getBuf()); + } + + assertTrue(ptr1.equals(ptr2)); + assertTrue(ptr2.equals(ptr1)); /* check symmetry against the other pointer, not self-equality */ + } + } + } + + @Test + public void testGetPointerVariableWidth() { + final String[] sampleData = new String[]{ + "abc", "123", "def", null, "hello", "aaaaa", "world", "2019", null, "0717"}; + + try (VarCharVector vec1 = new VarCharVector("vec1", allocator); + VarCharVector vec2 = new VarCharVector("vec2", allocator)) { + vec1.allocateNew(sampleData.length * 10, sampleData.length); + vec2.allocateNew(sampleData.length * 10, sampleData.length); + + for (int i = 0; i < sampleData.length; i++) { + String str = sampleData[i]; + if (str != null) { + vec1.set(i, str.getBytes(StandardCharsets.UTF_8)); + vec2.set(i, str.getBytes(StandardCharsets.UTF_8)); + } else { + vec1.setNull(i); + vec2.setNull(i); + } + } + + ArrowBufPointer ptr1 = new ArrowBufPointer(); + ArrowBufPointer ptr2 = new ArrowBufPointer(); + + for (int i = 0; i < sampleData.length; i++) { + vec1.getDataPointer(i, ptr1); + vec2.getDataPointer(i, ptr2); + + assertTrue(ptr1.equals(ptr2)); + assertTrue(ptr2.equals(ptr1)); /* check symmetry against the other pointer, not self-equality */ + } + } + } + + @Test + public void testGetNullFromVariableWidthVector() { + try (final VarCharVector varCharVector = new VarCharVector("varcharvec", allocator); + final VarBinaryVector varBinaryVector = new VarBinaryVector("varbinary", allocator)) { + varCharVector.allocateNew(10, 1); + varBinaryVector.allocateNew(10, 1); + + varCharVector.setNull(0); + varBinaryVector.setNull(0); + + 
assertNull(varCharVector.get(0)); + assertNull(varBinaryVector.get(0)); + } + } + + @Test + public void testZeroVectorEquals() { + try (final ZeroVector vector1 = new ZeroVector("vector"); + final ZeroVector vector2 = new ZeroVector("vector")) { + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + assertTrue(visitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testZeroVectorNotEquals() { + try (final IntVector intVector = new IntVector("int", allocator); + final ZeroVector zeroVector = new ZeroVector("zero"); + final ZeroVector zeroVector1 = new ZeroVector("zero1")) { + + VectorEqualsVisitor zeroVisitor = new VectorEqualsVisitor(); + assertFalse(zeroVisitor.vectorEquals(intVector, zeroVector)); + + VectorEqualsVisitor intVisitor = new VectorEqualsVisitor(); + assertFalse(intVisitor.vectorEquals(zeroVector, intVector)); + + VectorEqualsVisitor twoZeroVisitor = new VectorEqualsVisitor(); + // they are not equal because of distinct names + assertFalse(twoZeroVisitor.vectorEquals(zeroVector, zeroVector1)); + } + } + + @Test + public void testIntVectorEqualsWithNull() { + try (final IntVector vector1 = new IntVector("int", allocator); + final IntVector vector2 = new IntVector("int", allocator)) { + + setVector(vector1, 1, 2); + setVector(vector2, 1, null); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + + assertFalse(visitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testIntVectorEquals() { + try (final IntVector vector1 = new IntVector("int", allocator); + final IntVector vector2 = new IntVector("int", allocator)) { + + setVector(vector1, 1, 2, 3); + setVector(vector2, 1, 2, null); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + + assertFalse(visitor.vectorEquals(vector1, vector2)); + + vector2.setValueCount(3); + vector2.setSafe(2, 2); + assertFalse(visitor.vectorEquals(vector1, vector2)); /* index 2 holds 3 vs 2; compare contents via the visitor like the other assertions */ + + vector2.setSafe(2, 3); + assertTrue(visitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void 
testDecimalVectorEquals() { + try (final DecimalVector vector1 = new DecimalVector("decimal", allocator, 3, 3); + final DecimalVector vector2 = new DecimalVector("decimal", allocator, 3, 3); + final DecimalVector vector3 = new DecimalVector("decimal", allocator, 3, 2)) { + + setVector(vector1, 100L, 200L); + setVector(vector2, 100L, 200L); + setVector(vector3, 100L, 200L); + + VectorEqualsVisitor visitor1 = new VectorEqualsVisitor(); + VectorEqualsVisitor visitor2 = new VectorEqualsVisitor(); + + assertTrue(visitor1.vectorEquals(vector1, vector2)); + assertFalse(visitor2.vectorEquals(vector1, vector3)); + } + } + + @Test + public void testVarcharVectorEqualsWithNull() { + try (final VarCharVector vector1 = new VarCharVector("varchar", allocator); + final VarCharVector vector2 = new VarCharVector("varchar", allocator)) { + + setVector(vector1, STR1, STR2); + setVector(vector2, STR1, null); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + assertFalse(visitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testVarcharVectorEquals() { + try (final VarCharVector vector1 = new VarCharVector("varchar", allocator); + final VarCharVector vector2 = new VarCharVector("varchar", allocator)) { + + setVector(vector1, STR1, STR2, STR3); + setVector(vector2, STR1, STR2); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + assertFalse(visitor.vectorEquals(vector1, vector2)); + + vector2.setSafe(2, STR3, 0, STR3.length); + vector2.setValueCount(3); + assertTrue(visitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testVarBinaryVectorEquals() { + try (final VarBinaryVector vector1 = new VarBinaryVector("binary", allocator); + final VarBinaryVector vector2 = new VarBinaryVector("binary", allocator)) { + + setVector(vector1, STR1, STR2, STR3); + setVector(vector2, STR1, STR2); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + assertFalse(visitor.vectorEquals(vector1, vector2)); + + vector2.setSafe(2, STR3, 0, 
STR3.length); + vector2.setValueCount(3); + assertTrue(visitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testListVectorEqualsWithNull() { + try (final ListVector vector1 = ListVector.empty("list", allocator); + final ListVector vector2 = ListVector.empty("list", allocator);) { + + UnionListWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + //set some values + writeListVector(writer1, new int[] {1, 2}); + writeListVector(writer1, new int[] {3, 4}); + writeListVector(writer1, new int[] {}); + writer1.setValueCount(3); + + UnionListWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + //set some values + writeListVector(writer2, new int[] {1, 2}); + writeListVector(writer2, new int[] {3, 4}); + writer2.setValueCount(3); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + + assertFalse(visitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testListVectorEquals() { + try (final ListVector vector1 = ListVector.empty("list", allocator); + final ListVector vector2 = ListVector.empty("list", allocator);) { + + UnionListWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + //set some values + writeListVector(writer1, new int[] {1, 2}); + writeListVector(writer1, new int[] {3, 4}); + writeListVector(writer1, new int[] {5, 6}); + writer1.setValueCount(3); + + UnionListWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + //set some values + writeListVector(writer2, new int[] {1, 2}); + writeListVector(writer2, new int[] {3, 4}); + writer2.setValueCount(2); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + assertFalse(visitor.vectorEquals(vector1, vector2)); + + writeListVector(writer2, new int[] {5, 6}); + writer2.setValueCount(3); + + assertTrue(visitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testStructVectorEqualsWithNull() { + + try (final StructVector vector1 = StructVector.empty("struct", allocator); + final StructVector vector2 = 
StructVector.empty("struct", allocator);) { + vector1.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + vector1.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + vector2.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + vector2.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + + NullableStructWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + writeStructVector(writer1, 1, 10L); + writeStructVector(writer1, 2, 20L); + writeStructVector(writer1, 3, 30L); + writer1.setValueCount(3); + + NullableStructWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + writeStructVector(writer2, 1, 10L); + writeStructVector(writer2, 3, 30L); + writer2.setValueCount(3); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + assertFalse(visitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testStructVectorEquals() { + try (final StructVector vector1 = StructVector.empty("struct", allocator); + final StructVector vector2 = StructVector.empty("struct", allocator);) { + vector1.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + vector1.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + vector2.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + vector2.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + + NullableStructWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + writeStructVector(writer1, 1, 10L); + writeStructVector(writer1, 2, 20L); + writeStructVector(writer1, 3, 30L); + writer1.setValueCount(3); + + NullableStructWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + writeStructVector(writer2, 1, 10L); + writeStructVector(writer2, 2, 20L); + writer2.setValueCount(2); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + 
assertFalse(visitor.vectorEquals(vector1, vector2)); + + writeStructVector(writer2, 3, 30L); + writer2.setValueCount(3); + + assertTrue(visitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testStructVectorEqualsWithDiffChild() { + try (final StructVector vector1 = StructVector.empty("struct", allocator); + final StructVector vector2 = StructVector.empty("struct", allocator);) { + vector1.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + vector1.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + vector2.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + vector2.addOrGet("f10", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + + NullableStructWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + writeStructVector(writer1, 1, 10L); + writeStructVector(writer1, 2, 20L); + writer1.setValueCount(2); + + NullableStructWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + writeStructVector(writer2, 1, 10L); + writeStructVector(writer2, 2, 20L); + writer2.setValueCount(2); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + assertFalse(visitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testUnionVectorEquals() { + try (final UnionVector vector1 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); + final UnionVector vector2 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null);) { + + final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder(); + uInt4Holder.value = 10; + uInt4Holder.isSet = 1; + + final NullableIntHolder intHolder = new NullableIntHolder(); /* second slot: a set INT value, distinct from the UINT4 one */ + intHolder.value = 20; + intHolder.isSet = 1; + + vector1.setType(0, Types.MinorType.UINT4); + vector1.setSafe(0, uInt4Holder); + + vector1.setType(1, Types.MinorType.INT); + vector1.setSafe(1, intHolder); + vector1.setValueCount(2); + + vector2.setType(0, 
Types.MinorType.UINT4); + vector2.setSafe(0, uInt4Holder); + + vector2.setType(1, Types.MinorType.INT); + vector2.setSafe(1, intHolder); + vector2.setValueCount(2); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + assertTrue(visitor.vectorEquals(vector1, vector2)); + } + } + + @Test(expected = IllegalArgumentException.class) + public void testEqualsWithIndexOutOfRange() { + try (final IntVector vector1 = new IntVector("int", allocator); + final IntVector vector2 = new IntVector("int", allocator)) { + + setVector(vector1, 1, 2); + setVector(vector2, 1, 2); + + assertTrue(new RangeEqualsVisitor(vector1, vector2).rangeEquals(new Range(2, 3, 1))); + } + } + + @Test + public void testFixedWidthVectorNullHashCode() { + try (IntVector intVec = new IntVector("int vector", allocator)) { + intVec.allocateNew(1); + intVec.setValueCount(1); + + intVec.set(0, 100); + intVec.setNull(0); + + assertEquals(0, intVec.hashCode(0)); + } + } + + @Test + public void testVariableWidthVectorNullHashCode() { + try (VarCharVector varChVec = new VarCharVector("var char vector", allocator)) { + varChVec.allocateNew(100, 1); + varChVec.setValueCount(1); + + varChVec.set(0, "abc".getBytes()); + varChVec.setNull(0); + + assertEquals(0, varChVec.hashCode(0)); + } + } + + @Test + public void testUnionNullHashCode() { + try (UnionVector srcVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { + srcVector.allocateNew(); + + final NullableIntHolder holder = new NullableIntHolder(); + holder.isSet = 0; + + // write some data + srcVector.setType(0, MinorType.INT); + srcVector.setSafe(0, holder); + + assertEquals(0, srcVector.hashCode(0)); + } + } + + @Test + public void testToString() { + try (final IntVector intVector = new IntVector("intVector", allocator); + final ListVector listVector = ListVector.empty("listVector", allocator); + final StructVector structVector = StructVector.empty("structVector", allocator)) { + + // validate 
intVector toString + assertEquals("[]", intVector.toString()); + intVector.setValueCount(3); + intVector.setSafe(0, 1); + intVector.setSafe(1, 2); + intVector.setSafe(2, 3); + assertEquals("[1, 2, 3]", intVector.toString()); + + // validate intVector with plenty values + intVector.setValueCount(100); + for (int i = 0; i < 100; i++) { + intVector.setSafe(i, i); + } + assertEquals("[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]", + intVector.toString()); + + // validate listVector toString + listVector.allocateNewSafe(); + listVector.initializeChildrenFromFields( + Collections.singletonList(Field.nullable("child", ArrowType.Utf8.INSTANCE))); + VarCharVector dataVector = (VarCharVector) listVector.getDataVector(); + + listVector.startNewValue(0); + dataVector.setSafe(0, "aaa".getBytes(StandardCharsets.UTF_8)); + dataVector.setSafe(1, "bbb".getBytes(StandardCharsets.UTF_8)); + listVector.endValue(0, 2); + + listVector.startNewValue(1); + dataVector.setSafe(2, "ccc".getBytes(StandardCharsets.UTF_8)); + dataVector.setSafe(3, "ddd".getBytes(StandardCharsets.UTF_8)); + listVector.endValue(1, 2); + listVector.setValueCount(2); + + assertEquals("[[\"aaa\",\"bbb\"], [\"ccc\",\"ddd\"]]", listVector.toString()); + + // validate structVector toString + structVector.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + structVector.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + + NullableStructWriter structWriter = structVector.getWriter(); + structWriter.allocate(); + + writeStructVector(structWriter, 1, 10L); + writeStructVector(structWriter, 2, 20L); + structWriter.setValueCount(2); + + assertEquals("[{\"f0\":1,\"f1\":10}, {\"f0\":2,\"f1\":20}]", structVector.toString()); + } + } + + @Test + public void testUInt1VectorToString() { + try (final UInt1Vector uInt1Vector = new UInt1Vector("uInt1Vector", allocator)) { + setVector(uInt1Vector, (byte) 0xff); + assertEquals("[255]", 
uInt1Vector.toString()); + } + } + + @Test + public void testUInt2VectorToString() { + try (final UInt2Vector uInt2Vector = new UInt2Vector("uInt2Vector", allocator)) { + setVector(uInt2Vector, (char) 0xffff); + assertEquals("[65535]", uInt2Vector.toString()); + } + } + + @Test + public void testUInt4VectorToString() { + try (final UInt4Vector uInt4Vector = new UInt4Vector("uInt4Vector", allocator)) { + setVector(uInt4Vector, 0xffffffff); + assertEquals("[4294967295]", uInt4Vector.toString()); + } + } + + @Test + public void testUInt8VectorToString() { + try (final UInt8Vector uInt8Vector = new UInt8Vector("uInt8Vector", allocator)) { + setVector(uInt8Vector, 0xffffffffffffffffL); + assertEquals("[18446744073709551615]", uInt8Vector.toString()); + } + } + + @Test + public void testUnloadVariableWidthVector() { + try (final VarCharVector varCharVector = new VarCharVector("var char", allocator)) { + varCharVector.allocateNew(5, 2); + varCharVector.setValueCount(2); + + varCharVector.set(0, "abcd".getBytes(StandardCharsets.UTF_8)); + + List<ArrowBuf> bufs = varCharVector.getFieldBuffers(); /* typed list so get(i) yields ArrowBuf; index 1 is read as offsets, index 2 as data */ + assertEquals(3, bufs.size()); + + ArrowBuf offsetBuf = bufs.get(1); + ArrowBuf dataBuf = bufs.get(2); + + assertEquals(12, offsetBuf.writerIndex()); + assertEquals(4, offsetBuf.getInt(4)); + assertEquals(4, offsetBuf.getInt(8)); + + assertEquals(4, dataBuf.writerIndex()); + } + } + + private void writeStructVector(NullableStructWriter writer, int value1, long value2) { + writer.start(); + writer.integer("f0").writeInt(value1); + writer.bigInt("f1").writeBigInt(value2); + writer.end(); + } + + private void writeListVector(UnionListWriter writer, int[] values) { + writer.startList(); + for (int v: values) { + writer.integer().writeInt(v); + } + writer.endList(); + } + + @Test + public void testVariableVectorGetEndOffset() { + try (final VarCharVector vector1 = new VarCharVector("v1", allocator); + final VarBinaryVector vector2 = new VarBinaryVector("v2", allocator)) { + + setVector(vector1, STR1, null, STR2); + 
setVector(vector2, STR1, STR2, STR3); + + assertEquals(0, vector1.getStartOffset(0)); + assertEquals(STR1.length, vector1.getEndOffset(0)); + assertEquals(STR1.length, vector1.getStartOffset(1)); + assertEquals(STR1.length, vector1.getEndOffset(1)); + assertEquals(STR1.length, vector1.getStartOffset(2)); + assertEquals(STR1.length + STR2.length, vector1.getEndOffset(2)); + + assertEquals(0, vector2.getStartOffset(0)); + assertEquals(STR1.length, vector2.getEndOffset(0)); + assertEquals(STR1.length, vector2.getStartOffset(1)); + assertEquals(STR1.length + STR2.length, vector2.getEndOffset(1)); + assertEquals(STR1.length + STR2.length, vector2.getStartOffset(2)); + assertEquals(STR1.length + STR2.length + STR3.length, vector2.getEndOffset(2)); + } + } + + @Test + public void testEmptyBufBehavior() { + final int valueCount = 10; + + try (final IntVector vector = new IntVector("v", allocator)) { + assertEquals(1, vector.getDataBuffer().refCnt()); + assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(0, vector.getDataBuffer().capacity()); + assertEquals(0, vector.getValidityBuffer().capacity()); + + vector.allocateNew(valueCount); + assertEquals(2, vector.getDataBuffer().refCnt()); + assertEquals(2, vector.getValidityBuffer().refCnt()); + assertEquals(56, vector.getDataBuffer().capacity()); + assertEquals(8, vector.getValidityBuffer().capacity()); + + vector.close(); + assertEquals(1, vector.getDataBuffer().refCnt()); + assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(0, vector.getDataBuffer().capacity()); + assertEquals(0, vector.getValidityBuffer().capacity()); + } + + try (final VarCharVector vector = new VarCharVector("v", allocator)) { + assertEquals(1, vector.getDataBuffer().refCnt()); + assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(1, vector.getOffsetBuffer().refCnt()); + assertEquals(0, vector.getDataBuffer().capacity()); + assertEquals(0, vector.getValidityBuffer().capacity()); + assertEquals(0, 
vector.getOffsetBuffer().capacity()); + + vector.allocateNew(valueCount); + assertEquals(1, vector.getDataBuffer().refCnt()); + assertEquals(2, vector.getValidityBuffer().refCnt()); + assertEquals(2, vector.getOffsetBuffer().refCnt()); + assertEquals(32768, vector.getDataBuffer().capacity()); + assertEquals(8, vector.getValidityBuffer().capacity()); + assertEquals(56, vector.getOffsetBuffer().capacity()); + + vector.close(); + assertEquals(1, vector.getDataBuffer().refCnt()); + assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(1, vector.getOffsetBuffer().refCnt()); + assertEquals(0, vector.getDataBuffer().capacity()); + assertEquals(0, vector.getValidityBuffer().capacity()); + assertEquals(0, vector.getOffsetBuffer().capacity()); + } + + try (final ListVector vector = ListVector.empty("v", allocator)) { + assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(1, vector.getOffsetBuffer().refCnt()); + assertEquals(0, vector.getValidityBuffer().capacity()); + assertEquals(0, vector.getOffsetBuffer().capacity()); + + vector.setValueCount(valueCount); + vector.allocateNewSafe(); + assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(1, vector.getOffsetBuffer().refCnt()); + assertEquals(512, vector.getValidityBuffer().capacity()); + assertEquals(16384, vector.getOffsetBuffer().capacity()); + + vector.close(); + assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(1, vector.getOffsetBuffer().refCnt()); + assertEquals(0, vector.getValidityBuffer().capacity()); + assertEquals(0, vector.getOffsetBuffer().capacity()); + } + + try (final FixedSizeListVector vector = FixedSizeListVector.empty("v", 2, allocator)) { + assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(0, vector.getValidityBuffer().capacity()); + + vector.setValueCount(10); + vector.allocateNewSafe(); + assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(512, vector.getValidityBuffer().capacity()); + + 
vector.close(); + assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(0, vector.getValidityBuffer().capacity()); + } + + try (final StructVector vector = StructVector.empty("v", allocator)) { + assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(0, vector.getValidityBuffer().capacity()); + + vector.setValueCount(valueCount); + vector.allocateNewSafe(); + assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(512, vector.getValidityBuffer().capacity()); + + vector.close(); + assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(0, vector.getValidityBuffer().capacity()); + } + + try (final UnionVector vector = UnionVector.empty("v", allocator)) { + assertEquals(1, vector.getTypeBuffer().refCnt()); + assertEquals(0, vector.getTypeBuffer().capacity()); + + vector.setValueCount(10); + vector.allocateNewSafe(); + assertEquals(1, vector.getTypeBuffer().refCnt()); + assertEquals(4096, vector.getTypeBuffer().capacity()); + + vector.close(); + assertEquals(1, vector.getTypeBuffer().refCnt()); + assertEquals(0, vector.getTypeBuffer().capacity()); + } + + try (final DenseUnionVector vector = DenseUnionVector.empty("v", allocator)) { + assertEquals(1, vector.getTypeBuffer().refCnt()); + assertEquals(1, vector.getOffsetBuffer().refCnt()); + assertEquals(0, vector.getTypeBuffer().capacity()); + assertEquals(0, vector.getOffsetBuffer().capacity()); + + vector.setValueCount(valueCount); + vector.allocateNew(); + assertEquals(1, vector.getTypeBuffer().refCnt()); + assertEquals(1, vector.getOffsetBuffer().refCnt()); + assertEquals(4096, vector.getTypeBuffer().capacity()); + assertEquals(16384, vector.getOffsetBuffer().capacity()); + + vector.close(); + assertEquals(1, vector.getTypeBuffer().refCnt()); + assertEquals(1, vector.getOffsetBuffer().refCnt()); + assertEquals(0, vector.getTypeBuffer().capacity()); + assertEquals(0, vector.getOffsetBuffer().capacity()); + } + } + + @Test + public void testSetGetUInt1() { + try 
(UInt1Vector vector = new UInt1Vector("vector", allocator)) { + vector.allocateNew(2); + + vector.setWithPossibleTruncate(0, UInt1Vector.MAX_UINT1); + vector.setUnsafeWithPossibleTruncate(1, UInt1Vector.MAX_UINT1); + vector.setValueCount(2); + + assertEquals(UInt1Vector.MAX_UINT1 & UInt1Vector.PROMOTION_MASK, vector.getValueAsLong(0)); + assertEquals(UInt1Vector.MAX_UINT1 & UInt1Vector.PROMOTION_MASK, vector.getValueAsLong(1)); + } + } + + @Test + public void testSetGetUInt2() { + try (UInt2Vector vector = new UInt2Vector("vector", allocator)) { + vector.allocateNew(2); + + vector.setWithPossibleTruncate(0, UInt2Vector.MAX_UINT2); + vector.setUnsafeWithPossibleTruncate(1, UInt2Vector.MAX_UINT2); + vector.setValueCount(2); + + assertEquals(UInt2Vector.MAX_UINT2, vector.getValueAsLong(0)); + assertEquals(UInt2Vector.MAX_UINT2, vector.getValueAsLong(1)); + } + } + + @Test + public void testSetGetUInt4() { + try (UInt4Vector vector = new UInt4Vector("vector", allocator)) { + vector.allocateNew(2); + + vector.setWithPossibleTruncate(0, UInt4Vector.MAX_UINT4); + vector.setUnsafeWithPossibleTruncate(1, UInt4Vector.MAX_UINT4); + vector.setValueCount(2); + + long expected = UInt4Vector.MAX_UINT4 & UInt4Vector.PROMOTION_MASK; + assertEquals(expected, vector.getValueAsLong(0)); + assertEquals(expected, vector.getValueAsLong(1)); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVarCharListVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVarCharListVector.java new file mode 100644 index 000000000..a9b155499 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVarCharListVector.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestVarCharListVector { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testVarCharListWithNulls() { + byte[] bytes = "a".getBytes(); + try (ListVector vector = new ListVector("VarList", allocator, FieldType.nullable(Types + .MinorType.VARCHAR.getType()), null); + ArrowBuf tempBuf = allocator.buffer(bytes.length)) { + UnionListWriter writer = vector.getWriter(); + writer.allocate(); + + // populate input vector with the following records + // ["a"] + // null + // ["b"] + writer.setPosition(0); // optional + writer.startList(); + tempBuf.setBytes(0, bytes); + writer.writeVarChar(0, bytes.length, tempBuf); + writer.endList(); + + writer.setPosition(2); + writer.startList(); + bytes = "b".getBytes(); + tempBuf.setBytes(0, bytes); + 
writer.writeVarChar(0, bytes.length, tempBuf); + writer.endList(); + + writer.setValueCount(2); + + Assert.assertTrue(vector.getValueCount() == 2); + Assert.assertTrue(vector.getDataVector().getValueCount() == 2); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorAlloc.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorAlloc.java new file mode 100644 index 000000000..dfc75ec8e --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorAlloc.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */

package org.apache.arrow.vector;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.util.Arrays;
import java.util.Collections;

import org.apache.arrow.memory.AllocationListener;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.memory.rounding.DefaultRoundingPolicy;
import org.apache.arrow.memory.rounding.RoundingPolicy;
import org.apache.arrow.vector.types.TimeUnit;
import org.apache.arrow.vector.types.Types;
import org.apache.arrow.vector.types.Types.MinorType;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.ArrowType.Decimal;
import org.apache.arrow.vector.types.pojo.ArrowType.Duration;
import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary;
import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * Tests vector allocation from a {@link Field} and the buffer capacities obtained under the
 * default rounding policy and under a custom rounding policy.
 */
public class TestVectorAlloc {
  private BufferAllocator rootAllocator;

  private BufferAllocator policyAllocator;

  /** Creates one allocator with the default policy and one that uses {@link CustomPolicy}. */
  @Before
  public void init() {
    rootAllocator = new RootAllocator(Long.MAX_VALUE);
    policyAllocator =
        new RootAllocator(AllocationListener.NOOP, Integer.MAX_VALUE, new CustomPolicy());
  }

  /** Closes both allocators; the second one is closed even if closing the first one throws. */
  @After
  public void terminate() throws Exception {
    try {
      rootAllocator.close();
    } finally {
      policyAllocator.close();
    }
  }

  /** Builds a nullable, childless field with the given name and type. */
  private static Field field(String name, ArrowType type) {
    return new Field(name, new FieldType(true, type, null), Collections.emptyList());
  }

  /**
   * Creates a vector for every field of a schema covering many types, checks that the vector's
   * minor type matches the one derived from the field's Arrow type, and allocates it.
   */
  @Test
  public void testVectorAllocWithField() {
    Schema schema = new Schema(Arrays.asList(
        field("TINYINT", MinorType.TINYINT.getType()),
        field("SMALLINT", MinorType.SMALLINT.getType()),
        field("INT", MinorType.INT.getType()),
        field("BIGINT", MinorType.BIGINT.getType()),
        field("UINT1", MinorType.UINT1.getType()),
        field("UINT2", MinorType.UINT2.getType()),
        field("UINT4", MinorType.UINT4.getType()),
        field("UINT8", MinorType.UINT8.getType()),
        field("FLOAT4", MinorType.FLOAT4.getType()),
        field("FLOAT8", MinorType.FLOAT8.getType()),
        field("UTF8", MinorType.VARCHAR.getType()),
        field("VARBINARY", MinorType.VARBINARY.getType()),
        field("BIT", MinorType.BIT.getType()),
        field("DECIMAL", new Decimal(38, 5, 128)),
        field("FIXEDSIZEBINARY", new FixedSizeBinary(50)),
        field("DATEDAY", MinorType.DATEDAY.getType()),
        field("DATEMILLI", MinorType.DATEMILLI.getType()),
        field("TIMESEC", MinorType.TIMESEC.getType()),
        field("TIMEMILLI", MinorType.TIMEMILLI.getType()),
        field("TIMEMICRO", MinorType.TIMEMICRO.getType()),
        field("TIMENANO", MinorType.TIMENANO.getType()),
        field("TIMESTAMPSEC", MinorType.TIMESTAMPSEC.getType()),
        field("TIMESTAMPMILLI", MinorType.TIMESTAMPMILLI.getType()),
        field("TIMESTAMPMICRO", MinorType.TIMESTAMPMICRO.getType()),
        field("TIMESTAMPNANO", MinorType.TIMESTAMPNANO.getType()),
        field("TIMESTAMPSECTZ", new Timestamp(TimeUnit.SECOND, "PST")),
        field("TIMESTAMPMILLITZ", new Timestamp(TimeUnit.MILLISECOND, "PST")),
        field("TIMESTAMPMICROTZ", new Timestamp(TimeUnit.MICROSECOND, "PST")),
        field("TIMESTAMPNANOTZ", new Timestamp(TimeUnit.NANOSECOND, "PST")),
        field("INTERVALDAY", MinorType.INTERVALDAY.getType()),
        field("INTERVALYEAR", MinorType.INTERVALYEAR.getType()),
        field("DURATION", new Duration(TimeUnit.MILLISECOND))
    ));

    try (BufferAllocator allocator = rootAllocator.newChildAllocator("child", 0, Long.MAX_VALUE)) {
      for (Field field : schema.getFields()) {
        try (FieldVector vector = field.createVector(allocator)) {
          // JUnit's assertEquals takes (expected, actual); the vector is the object under test.
          assertEquals(
              Types.getMinorTypeForArrowType(field.getFieldType().getType()),
              vector.getMinorType());
          vector.allocateNew();
        }
      }
    }
  }

  private static final int CUSTOM_SEGMENT_SIZE = 200;

  /**
   * A custom rounding policy that rounds the size to
   * the next multiple of 200.
   */
  private static class CustomPolicy implements RoundingPolicy {

    @Override
    public long getRoundedSize(long requestSize) {
      return (requestSize + CUSTOM_SEGMENT_SIZE - 1) / CUSTOM_SEGMENT_SIZE * CUSTOM_SEGMENT_SIZE;
    }
  }

  /**
   * Checks the combined validity and data buffer capacity of an {@link IntVector} under the
   * custom policy (multiple of the segment size) and the default policy (power of two).
   */
  @Test
  public void testFixedWidthVectorAllocation() {
    try (IntVector vec1 = new IntVector("vec", policyAllocator);
         IntVector vec2 = new IntVector("vec", rootAllocator)) {
      assertTrue(vec1.getAllocator().getRoundingPolicy() instanceof CustomPolicy);
      vec1.allocateNew(50);
      long totalCapacity = vec1.getValidityBuffer().capacity() + vec1.getDataBuffer().capacity();

      // the total capacity must be a multiple of the segment size
      assertEquals(0, totalCapacity % CUSTOM_SEGMENT_SIZE);

      assertTrue(vec2.getAllocator().getRoundingPolicy() instanceof DefaultRoundingPolicy);
      vec2.allocateNew(50);
      totalCapacity = vec2.getValidityBuffer().capacity() + vec2.getDataBuffer().capacity();

      // the total capacity must be a power of two
      assertEquals(0, totalCapacity & (totalCapacity - 1));
    }
  }

  /**
   * Checks the combined validity and offset buffer capacity of a {@link VarCharVector} under the
   * custom policy (multiple of the segment size) and the default policy (power of two).
   */
  @Test
  public void testVariableWidthVectorAllocation() {
    try (VarCharVector vec1 = new VarCharVector("vec", policyAllocator);
         VarCharVector vec2 = new VarCharVector("vec", rootAllocator)) {
      assertTrue(vec1.getAllocator().getRoundingPolicy() instanceof CustomPolicy);
      vec1.allocateNew(50);
      long totalCapacity = vec1.getValidityBuffer().capacity() + vec1.getOffsetBuffer().capacity();

      // the total capacity must be a multiple of the segment size
      assertEquals(0, totalCapacity % CUSTOM_SEGMENT_SIZE);

      assertTrue(vec2.getAllocator().getRoundingPolicy() instanceof DefaultRoundingPolicy);
      vec2.allocateNew(50);
      totalCapacity = vec2.getValidityBuffer().capacity() + vec2.getOffsetBuffer().capacity();

      // the total capacity must be a power of two
      assertEquals(0, totalCapacity & (totalCapacity - 1));
    }
  }
}
diff --git
a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java new file mode 100644 index 000000000..18bb2c957 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java @@ -0,0 +1,474 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.*; + +import java.nio.charset.StandardCharsets; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.complex.impl.NullableStructWriter; +import org.apache.arrow.vector.complex.impl.UnionFixedSizeListWriter; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.holders.NullableIntHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.DataSizeRoundingUtil; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + + +public class TestVectorReAlloc { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testFixedType() { + try (final UInt4Vector vector = new UInt4Vector("", allocator)) { + vector.setInitialCapacity(512); + vector.allocateNew(); + + assertTrue(vector.getValueCapacity() >= 512); + int initialCapacity = vector.getValueCapacity(); + + try { + vector.set(initialCapacity, 0); + Assert.fail("Expected out of bounds exception"); + } catch (Exception e) { + // ok + } + + vector.reAlloc(); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); + + vector.set(initialCapacity, 100); + assertEquals(100, vector.get(initialCapacity)); + } + } + + @Test + public void 
testNullableType() { + try (final VarCharVector vector = new VarCharVector("", allocator)) { + vector.setInitialCapacity(512); + vector.allocateNew(); + + assertTrue(vector.getValueCapacity() >= 512); + int initialCapacity = vector.getValueCapacity(); + + try { + vector.set(initialCapacity, "foo".getBytes(StandardCharsets.UTF_8)); + Assert.fail("Expected out of bounds exception"); + } catch (Exception e) { + // ok + } + + vector.reAlloc(); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); + + vector.set(initialCapacity, "foo".getBytes(StandardCharsets.UTF_8)); + assertEquals("foo", new String(vector.get(initialCapacity), StandardCharsets.UTF_8)); + } + } + + @Test + public void testListType() { + try (final ListVector vector = ListVector.empty("", allocator)) { + vector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())); + + vector.setInitialCapacity(512); + vector.allocateNew(); + + assertEquals(512, vector.getValueCapacity()); + + try { + vector.getInnerValueCountAt(2014); + Assert.fail("Expected out of bounds exception"); + } catch (Exception e) { + // ok + } + + vector.reAlloc(); + assertEquals(1024, vector.getValueCapacity()); + assertEquals(0, vector.getOffsetBuffer().getInt(2014 * ListVector.OFFSET_WIDTH)); + } + } + + @Test + public void testStructType() { + try (final StructVector vector = StructVector.empty("", allocator)) { + vector.addOrGet("", FieldType.nullable(MinorType.INT.getType()), IntVector.class); + + vector.setInitialCapacity(512); + vector.allocateNew(); + + assertEquals(512, vector.getValueCapacity()); + + try { + vector.getObject(513); + Assert.fail("Expected out of bounds exception"); + } catch (Exception e) { + // ok + } + + vector.reAlloc(); + assertEquals(1024, vector.getValueCapacity()); + assertNull(vector.getObject(513)); + } + } + + @Test + public void testVariableWidthTypeSetNullValues() { + // Test ARROW-11223 bug is fixed + try (final BaseVariableWidthVector v1 = new VarCharVector("var1", allocator)) { + 
v1.setInitialCapacity(512); + v1.allocateNew(); + int numNullValues1 = v1.getValueCapacity() + 1; + for (int i = 0; i < numNullValues1; i++) { + v1.setNull(i); + } + Assert.assertTrue(v1.getBufferSizeFor(numNullValues1) > 0); + } + + try (final BaseLargeVariableWidthVector v2 = new LargeVarCharVector("var2", allocator)) { + v2.setInitialCapacity(512); + v2.allocateNew(); + int numNullValues2 = v2.getValueCapacity() + 1; + for (int i = 0; i < numNullValues2; i++) { + v2.setNull(i); + } + Assert.assertTrue(v2.getBufferSizeFor(numNullValues2) > 0); + } + } + + @Test + public void testFixedAllocateAfterReAlloc() throws Exception { + try (final IntVector vector = new IntVector("", allocator)) { + /* + * Allocate the default size, and then, reAlloc. This should double the allocation. + */ + vector.allocateNewSafe(); // Initial allocation + vector.reAlloc(); // Double the allocation size. + int savedValueCapacity = vector.getValueCapacity(); + + /* + * Clear and allocate again. + */ + vector.clear(); + vector.allocateNewSafe(); + + /* + * Verify that the buffer sizes haven't changed. + */ + Assert.assertEquals(vector.getValueCapacity(), savedValueCapacity); + } + } + + @Test + public void testVariableAllocateAfterReAlloc() throws Exception { + try (final VarCharVector vector = new VarCharVector("", allocator)) { + /* + * Allocate the default size, and then, reAlloc. This should double the allocation. + */ + vector.allocateNewSafe(); // Initial allocation + vector.reAlloc(); // Double the allocation size. + int savedValueCapacity = vector.getValueCapacity(); + long savedValueBufferSize = vector.valueBuffer.capacity(); + + /* + * Clear and allocate again. + */ + vector.clear(); + vector.allocateNewSafe(); + + /* + * Verify that the buffer sizes haven't changed. 
+ */ + Assert.assertEquals(vector.getValueCapacity(), savedValueCapacity); + Assert.assertEquals(vector.valueBuffer.capacity(), savedValueBufferSize); + } + } + + @Test + public void testLargeVariableAllocateAfterReAlloc() throws Exception { + try (final LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { + /* + * Allocate the default size, and then, reAlloc. This should double the allocation. + */ + vector.allocateNewSafe(); // Initial allocation + vector.reAlloc(); // Double the allocation size. + int savedValueCapacity = vector.getValueCapacity(); + long savedValueBufferSize = vector.valueBuffer.capacity(); + + /* + * Clear and allocate again. + */ + vector.clear(); + vector.allocateNewSafe(); + + /* + * Verify that the buffer sizes haven't changed. + */ + Assert.assertEquals(vector.getValueCapacity(), savedValueCapacity); + Assert.assertEquals(vector.valueBuffer.capacity(), savedValueBufferSize); + } + } + + @Test + public void testVarCharAllocateNew() throws Exception { + final int count = 6000; + + try (final VarCharVector vector = new VarCharVector("", allocator)) { + vector.allocateNew(count); + + // verify that the validity buffer and value buffer have capacity for atleast 'count' elements. + Assert.assertTrue(vector.getValidityBuffer().capacity() >= DataSizeRoundingUtil.divideBy8Ceil(count)); + Assert.assertTrue(vector.getOffsetBuffer().capacity() >= (count + 1) * BaseVariableWidthVector.OFFSET_WIDTH); + } + } + + @Test + public void testLargeVarCharAllocateNew() throws Exception { + final int count = 6000; + + try (final LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { + vector.allocateNew(count); + + // verify that the validity buffer and value buffer have capacity for atleast 'count' elements. 
+ Assert.assertTrue(vector.getValidityBuffer().capacity() >= DataSizeRoundingUtil.divideBy8Ceil(count)); + Assert.assertTrue(vector.getOffsetBuffer().capacity() >= (count + 1) * BaseLargeVariableWidthVector.OFFSET_WIDTH); + } + } + + @Test + public void testVarCharAllocateNewUsingHelper() throws Exception { + final int count = 6000; + + try (final VarCharVector vector = new VarCharVector("", allocator)) { + AllocationHelper.allocateNew(vector, count); + + // verify that the validity buffer and value buffer have capacity for atleast 'count' elements. + Assert.assertTrue(vector.getValidityBuffer().capacity() >= DataSizeRoundingUtil.divideBy8Ceil(count)); + Assert.assertTrue(vector.getOffsetBuffer().capacity() >= (count + 1) * BaseVariableWidthVector.OFFSET_WIDTH); + } + } + + @Test + public void testLargeVarCharAllocateNewUsingHelper() throws Exception { + final int count = 6000; + + try (final LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { + AllocationHelper.allocateNew(vector, count); + + // verify that the validity buffer and value buffer have capacity for atleast 'count' elements. + Assert.assertTrue(vector.getValidityBuffer().capacity() >= DataSizeRoundingUtil.divideBy8Ceil(count)); + Assert.assertTrue(vector.getOffsetBuffer().capacity() >= (count + 1) * BaseLargeVariableWidthVector.OFFSET_WIDTH); + } + } + + @Test + public void testFixedRepeatedClearAndSet() throws Exception { + try (final IntVector vector = new IntVector("", allocator)) { + vector.allocateNewSafe(); // Initial allocation + vector.clear(); // clear vector. + vector.setSafe(0, 10); + int savedValueCapacity = vector.getValueCapacity(); + + for (int i = 0; i < 1024; ++i) { + vector.clear(); // clear vector. + vector.setSafe(0, 10); + } + + // should be deterministic, and not cause a run-away increase in capacity. 
+ Assert.assertEquals(vector.getValueCapacity(), savedValueCapacity); + } + } + + @Test + public void testVariableRepeatedClearAndSet() throws Exception { + try (final VarCharVector vector = new VarCharVector("", allocator)) { + vector.allocateNewSafe(); // Initial allocation + + vector.clear(); // clear vector. + vector.setSafe(0, "hello world".getBytes()); + int savedValueCapacity = vector.getValueCapacity(); + + for (int i = 0; i < 1024; ++i) { + vector.clear(); // clear vector. + vector.setSafe(0, "hello world".getBytes()); + } + + // should be deterministic, and not cause a run-away increase in capacity. + Assert.assertEquals(vector.getValueCapacity(), savedValueCapacity); + } + } + + @Test + public void testRepeatedValueVectorClearAndSet() throws Exception { + try (final ListVector vector = new ListVector("", allocator, FieldType.nullable(MinorType.INT.getType()), null)) { + vector.allocateNewSafe(); // Initial allocation + UnionListWriter writer = vector.getWriter(); + + vector.clear(); // clear vector. + writer.setPosition(0); // optional + writer.startList(); + writer.writeInt(0); + writer.endList(); + int savedValueCapacity = vector.getValueCapacity(); + + for (int i = 0; i < 1024; ++i) { + vector.clear(); // clear vector. + writer.setPosition(0); // optional + writer.startList(); + writer.writeInt(i); + writer.endList(); + } + + // should be deterministic, and not cause a run-away increase in capacity. + Assert.assertEquals(vector.getValueCapacity(), savedValueCapacity); + } + } + + @Test + public void testStructVectorClearAndSet() throws Exception { + try (final StructVector vector = StructVector.empty("v", allocator)) { + vector.allocateNewSafe(); // Initial allocation + + NullableStructWriter writer = vector.getWriter(); + + vector.clear(); // clear vector. 
+ writer.setPosition(0); // optional + writer.start(); + writer.integer("int").writeInt(0); + writer.end(); + int savedValueCapacity = vector.getValueCapacity(); + + for (int i = 0; i < 1024; ++i) { + vector.clear(); // clear vector. + writer.setPosition(0); // optional + writer.start(); + writer.integer("int").writeInt(i); + writer.end(); + } + + // should be deterministic, and not cause a run-away increase in capacity. + Assert.assertEquals(vector.getValueCapacity(), savedValueCapacity); + } + } + + @Test + public void testFixedSizeListVectorClearAndSet() { + try (final FixedSizeListVector vector = new FixedSizeListVector("", allocator, + FieldType.nullable(new ArrowType.FixedSizeList(2)), null)) { + vector.allocateNewSafe(); // Initial allocation + UnionFixedSizeListWriter writer = vector.getWriter(); + + vector.clear(); // clear vector. + writer.setPosition(0); // optional + writer.startList(); + writer.writeInt(0); + writer.writeInt(1); + writer.endList(); + int savedValueCapacity = vector.getValueCapacity(); + + for (int i = 0; i < 1024; ++i) { + vector.clear(); // clear vector. + writer.setPosition(0); // optional + writer.startList(); + writer.writeInt(i); + writer.writeInt(i + 1); + writer.endList(); + } + + // should be deterministic, and not cause a run-away increase in capacity. + Assert.assertEquals(vector.getValueCapacity(), savedValueCapacity); + } + } + + @Test + public void testUnionVectorClearAndSet() { + try (final UnionVector vector = new UnionVector("", allocator, /* field type */ null, /* call-back */ null)) { + vector.allocateNewSafe(); // Initial allocation + + NullableIntHolder holder = new NullableIntHolder(); + holder.isSet = 1; + holder.value = 1; + + vector.clear(); // clear vector. + vector.setType(0, MinorType.INT); + vector.setSafe(0, holder); + int savedValueCapacity = vector.getValueCapacity(); + + for (int i = 0; i < 1024; ++i) { + vector.clear(); // clear vector. 
+ vector.setType(0, MinorType.INT); + vector.setSafe(0, holder); + } + + // should be deterministic, and not cause a run-away increase in capacity. + Assert.assertEquals(vector.getValueCapacity(), savedValueCapacity); + } + } + + @Test + public void testDenseUnionVectorClearAndSet() { + try (final DenseUnionVector vector = new DenseUnionVector("", allocator, null, null)) { + vector.allocateNewSafe(); // Initial allocation + + NullableIntHolder holder = new NullableIntHolder(); + holder.isSet = 1; + holder.value = 1; + + byte intTypeId = vector.registerNewTypeId(Field.nullable("", MinorType.INT.getType())); + + vector.clear(); + vector.setTypeId(0, intTypeId); + vector.setSafe(0, holder); + + int savedValueCapacity = vector.getValueCapacity(); + + for (int i = 0; i < 1024; ++i) { + vector.clear(); + vector.setTypeId(0, intTypeId); + vector.setSafe(0, holder); + } + + // should be deterministic, and not cause a run-away increase in capacity. + Assert.assertEquals(vector.getValueCapacity(), savedValueCapacity); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java new file mode 100644 index 000000000..71009a333 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.nio.charset.StandardCharsets; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList; +import org.apache.arrow.vector.types.pojo.ArrowType.Int; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestVectorReset { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + private void resetVectorAndVerify(ValueVector vector, ArrowBuf[] bufs) { + long[] sizeBefore = new long[bufs.length]; + for (int i = 0; i < bufs.length; i++) { + sizeBefore[i] = bufs[i].capacity(); + } + vector.reset(); + for (int i = 0; i < bufs.length; i++) { + assertEquals(sizeBefore[i], bufs[i].capacity()); + verifyBufferZeroed(bufs[i]); + } + assertEquals(0, vector.getValueCount()); + } + + private void 
verifyBufferZeroed(ArrowBuf buf) { + for (int i = 0; i < buf.capacity(); i++) { + assertTrue((byte) 0 == buf.getByte(i)); + } + } + + @Test + public void testFixedTypeReset() { + try (final UInt4Vector vector = new UInt4Vector("UInt4", allocator)) { + vector.allocateNewSafe(); + vector.setNull(0); + vector.setValueCount(1); + resetVectorAndVerify(vector, vector.getBuffers(false)); + } + } + + @Test + public void testVariableTypeReset() { + try (final VarCharVector vector = new VarCharVector("VarChar", allocator)) { + vector.allocateNewSafe(); + vector.set(0, "a".getBytes(StandardCharsets.UTF_8)); + vector.setLastSet(0); + vector.setValueCount(1); + resetVectorAndVerify(vector, vector.getBuffers(false)); + assertEquals(-1, vector.getLastSet()); + } + } + + @Test + public void testLargeVariableTypeReset() { + try (final LargeVarCharVector vector = new LargeVarCharVector("LargeVarChar", allocator)) { + vector.allocateNewSafe(); + vector.set(0, "a".getBytes(StandardCharsets.UTF_8)); + vector.setLastSet(0); + vector.setValueCount(1); + resetVectorAndVerify(vector, vector.getBuffers(false)); + assertEquals(-1, vector.getLastSet()); + } + } + + @Test + public void testListTypeReset() { + try (final ListVector variableList = + new ListVector("VarList", allocator, FieldType.nullable(MinorType.INT.getType()), null); + final FixedSizeListVector fixedList = + new FixedSizeListVector("FixedList", allocator, FieldType.nullable(new FixedSizeList(2)), null) + ) { + // ListVector + variableList.allocateNewSafe(); + variableList.startNewValue(0); + variableList.endValue(0, 0); + variableList.setValueCount(1); + resetVectorAndVerify(variableList, variableList.getBuffers(false)); + assertEquals(-1, variableList.getLastSet()); + + // FixedSizeListVector + fixedList.allocateNewSafe(); + fixedList.setNull(0); + fixedList.setValueCount(1); + resetVectorAndVerify(fixedList, fixedList.getBuffers(false)); + } + } + + @Test + public void testStructTypeReset() { + try (final 
NonNullableStructVector nonNullableStructVector = + new NonNullableStructVector("Struct", allocator, FieldType.nullable(MinorType.INT.getType()), null); + final StructVector structVector = + new StructVector("NullableStruct", allocator, FieldType.nullable(MinorType.INT.getType()), null) + ) { + // NonNullableStructVector + nonNullableStructVector.allocateNewSafe(); + IntVector structChild = nonNullableStructVector + .addOrGet("child", FieldType.nullable(new Int(32, true)), IntVector.class); + structChild.setNull(0); + nonNullableStructVector.setValueCount(1); + resetVectorAndVerify(nonNullableStructVector, nonNullableStructVector.getBuffers(false)); + + // StructVector + structVector.allocateNewSafe(); + structVector.setNull(0); + structVector.setValueCount(1); + resetVectorAndVerify(structVector, structVector.getBuffers(false)); + } + } + + @Test + public void testUnionTypeReset() { + try (final UnionVector vector = new UnionVector("Union", allocator, /* field type */ null, /* call-back */ null); + final IntVector dataVector = new IntVector("Int", allocator) + ) { + vector.getBufferSize(); + vector.allocateNewSafe(); + dataVector.allocateNewSafe(); + vector.addVector(dataVector); + dataVector.setNull(0); + vector.setValueCount(1); + resetVectorAndVerify(vector, vector.getBuffers(false)); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorSchemaRoot.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorSchemaRoot.java new file mode 100644 index 000000000..4c5b6540f --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorSchemaRoot.java @@ -0,0 +1,318 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static junit.framework.TestCase.assertTrue; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestVectorSchemaRoot { + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() { + allocator.close(); + } + + @Test + public void testResetRowCount() { + final int size = 20; + try (final BitVector vec1 = new BitVector("bit", allocator); + final IntVector vec2 = new IntVector("int", allocator)) { + VectorSchemaRoot vsr = VectorSchemaRoot.of(vec1, vec2); + + vsr.allocateNew(); + assertEquals(vsr.getRowCount(), 0); + + for 
(int i = 0; i < size; i++) { + vec1.setSafe(i, i % 2); + vec2.setSafe(i, i); + } + vsr.setRowCount(size); + checkCount(vec1, vec2, vsr, size); + + vsr.allocateNew(); + checkCount(vec1, vec2, vsr, 0); + + for (int i = 0; i < size; i++) { + vec1.setSafe(i, i % 2); + vec2.setSafe(i, i); + } + vsr.setRowCount(size); + checkCount(vec1, vec2, vsr, size); + + vsr.clear(); + checkCount(vec1, vec2, vsr, 0); + } + } + + private void checkCount(BitVector vec1, IntVector vec2, VectorSchemaRoot vsr, int count) { + assertEquals(count, vec1.getValueCount()); // expected value first, so failure messages read correctly + assertEquals(count, vec2.getValueCount()); + assertEquals(count, vsr.getRowCount()); + } + + private VectorSchemaRoot createBatch() { + FieldType varCharType = new FieldType(true, new ArrowType.Utf8(), /*dictionary=*/null); + FieldType listType = new FieldType(true, new ArrowType.List(), /*dictionary=*/null); + + // create the schema: a single list column whose declared child is a varchar + List<Field> schemaFields = new ArrayList<>(); + Field childField = new Field("varCharCol", varCharType, null); + List<Field> childFields = new ArrayList<>(); + childFields.add(childField); + schemaFields.add(new Field("listCol", listType, childFields)); + Schema schema = new Schema(schemaFields); + + VectorSchemaRoot schemaRoot = VectorSchemaRoot.create(schema, allocator); + // get and allocate the vector + ListVector vector = (ListVector) schemaRoot.getVector("listCol"); + vector.allocateNew(); + + // write data to the vector + UnionListWriter writer = vector.getWriter(); + + writer.setPosition(0); + + // write data vector(0) + writer.startList(); + + // write data vector(0)(0) + writer.list().startList(); + + // According to the schema above, the list element should have varchar type. + // When we write a big int, the original writer cannot handle this, so the writer will + // be promoted, and the vector structure will be different from the schema.
+ writer.list().bigInt().writeBigInt(0); + writer.list().bigInt().writeBigInt(1); + writer.list().endList(); + + // write data vector(0)(1) + writer.list().startList(); + writer.list().float8().writeFloat8(3.0D); + writer.list().float8().writeFloat8(7.0D); + writer.list().endList(); + + // finish data vector(0) + writer.endList(); + + writer.setPosition(1); + + // write data vector(1) + writer.startList(); + + // write data vector(1)(0) + writer.list().startList(); + writer.list().integer().writeInt(3); + writer.list().integer().writeInt(2); + writer.list().endList(); + + // finish data vector(1) + writer.endList(); + + vector.setValueCount(2); + + return schemaRoot; + } + + @Test + public void testAddVector() { + try (final IntVector intVector1 = new IntVector("intVector1", allocator); + final IntVector intVector2 = new IntVector("intVector2", allocator); + final IntVector intVector3 = new IntVector("intVector3", allocator);) { + + VectorSchemaRoot original = new VectorSchemaRoot(Arrays.asList(intVector1, intVector2)); + assertEquals(2, original.getFieldVectors().size()); + + VectorSchemaRoot newRecordBatch = original.addVector(1, intVector3); + assertEquals(3, newRecordBatch.getFieldVectors().size()); + assertEquals(intVector3, newRecordBatch.getFieldVectors().get(1)); + + original.close(); + newRecordBatch.close(); + } + } + + @Test + public void testRemoveVector() { + try (final IntVector intVector1 = new IntVector("intVector1", allocator); + final IntVector intVector2 = new IntVector("intVector2", allocator); + final IntVector intVector3 = new IntVector("intVector3", allocator);) { + + VectorSchemaRoot original = + new VectorSchemaRoot(Arrays.asList(intVector1, intVector2, intVector3)); + assertEquals(3, original.getFieldVectors().size()); + + VectorSchemaRoot newRecordBatch = original.removeVector(0); + assertEquals(2, newRecordBatch.getFieldVectors().size()); + assertEquals(intVector2, newRecordBatch.getFieldVectors().get(0)); + assertEquals(intVector3, 
newRecordBatch.getFieldVectors().get(1)); + + original.close(); + newRecordBatch.close(); + } + } + + @Test + public void testSlice() { + try (final IntVector intVector = new IntVector("intVector", allocator); + final Float4Vector float4Vector = new Float4Vector("float4Vector", allocator)) { + intVector.setValueCount(10); + float4Vector.setValueCount(10); + for (int i = 0; i < 10; i++) { + intVector.setSafe(i, i); + float4Vector.setSafe(i, i + 0.1f); + } + final VectorSchemaRoot original = new VectorSchemaRoot(Arrays.asList(intVector, float4Vector)); + + VectorSchemaRoot slice1 = original.slice(0, original.getRowCount()); + assertEquals(original, slice1); + + VectorSchemaRoot slice2 = original.slice(0, 5); + assertEquals(5, slice2.getRowCount()); + // validate data + IntVector childVector1 = (IntVector) slice2.getFieldVectors().get(0); + Float4Vector childVector2 = (Float4Vector) slice2.getFieldVectors().get(1); + for (int i = 0; i < 5; i++) { + assertEquals(i, childVector1.get(i)); + assertEquals(i + 0.1f, childVector2.get(i), 0); + } + + original.close(); + slice2.close(); + } + } + + @Test(expected = IllegalArgumentException.class) + public void testSliceWithInvalidParam() { + try (final IntVector intVector = new IntVector("intVector", allocator); + final Float4Vector float4Vector = new Float4Vector("float4Vector", allocator)) { + intVector.setValueCount(10); + float4Vector.setValueCount(10); + for (int i = 0; i < 10; i++) { + intVector.setSafe(i, i); + float4Vector.setSafe(i, i + 0.1f); + } + final VectorSchemaRoot original = new VectorSchemaRoot(Arrays.asList(intVector, float4Vector)); + + original.slice(0, 20); + } + } + + @Test + public void testEquals() { + try (final IntVector intVector1 = new IntVector("intVector1", allocator); + final IntVector intVector2 = new IntVector("intVector2", allocator); + final IntVector intVector3 = new IntVector("intVector3", allocator);) { + + intVector1.setValueCount(5); + for (int i = 0; i < 5; i++) { + intVector1.set(i, 
i); + } + + VectorSchemaRoot root1 = + new VectorSchemaRoot(Arrays.asList(intVector1, intVector2, intVector3)); + + VectorSchemaRoot root2 = + new VectorSchemaRoot(Arrays.asList(intVector1, intVector2)); + + VectorSchemaRoot root3 = + new VectorSchemaRoot(Arrays.asList(intVector1, intVector2, intVector3)); + + assertFalse(root1.equals(root2)); + assertTrue(root1.equals(root3)); + + root1.close(); + root2.close(); + root3.close(); + } + } + + @Test + public void testApproxEquals() { + try (final Float4Vector float4Vector1 = new Float4Vector("floatVector", allocator); + final Float4Vector float4Vector2 = new Float4Vector("floatVector", allocator); + final Float4Vector float4Vector3 = new Float4Vector("floatVector", allocator);) { + + float4Vector1.setValueCount(5); + float4Vector2.setValueCount(5); + float4Vector3.setValueCount(5); + final float epsilon = 1.0E-6f; + for (int i = 0; i < 5; i++) { + float4Vector1.set(i, i); + float4Vector2.set(i, i + epsilon * 2); + float4Vector3.set(i, i + epsilon / 2); + } + + VectorSchemaRoot root1 = + new VectorSchemaRoot(Arrays.asList(float4Vector1)); + + VectorSchemaRoot root2 = + new VectorSchemaRoot(Arrays.asList(float4Vector2)); + + VectorSchemaRoot root3 = + new VectorSchemaRoot(Arrays.asList(float4Vector3)); + + assertFalse(root1.approxEquals(root2)); + assertTrue(root1.approxEquals(root3)); + + root1.close(); + root2.close(); + root3.close(); + } + } + + @Test + public void testSchemaSync() { + //create vector schema root + try (VectorSchemaRoot schemaRoot = createBatch()) { + Schema newSchema = new Schema( + schemaRoot.getFieldVectors().stream().map(vec -> vec.getField()).collect(Collectors.toList())); + + assertNotEquals(newSchema, schemaRoot.getSchema()); + assertTrue(schemaRoot.syncSchema()); + assertEquals(newSchema, schemaRoot.getSchema()); + + // no schema update this time. 
+ assertFalse(schemaRoot.syncSchema()); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java new file mode 100644 index 000000000..8e1941a8c --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java @@ -0,0 +1,332 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static java.util.Arrays.asList; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter; +import org.apache.arrow.vector.complex.writer.BigIntWriter; +import org.apache.arrow.vector.complex.writer.IntWriter; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestVectorUnloadLoad { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testUnloadLoad() throws IOException { + int count = 10000; + Schema schema; + + try ( + BufferAllocator originalVectorsAllocator = + allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + NonNullableStructVector parent = NonNullableStructVector.empty("parent", 
originalVectorsAllocator)) { + + // write some data + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + IntWriter intWriter = rootWriter.integer("int"); + BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); + for (int i = 0; i < count; i++) { + intWriter.setPosition(i); + intWriter.writeInt(i); + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + } + writer.setValueCount(count); + + // unload it + FieldVector root = parent.getChild("root"); + schema = new Schema(root.getField().getChildren()); + VectorUnloader vectorUnloader = newVectorUnloader(root); + try ( + ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); + BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + VectorSchemaRoot newRoot = VectorSchemaRoot.create(schema, finalVectorsAllocator); + ) { + + // load it + VectorLoader vectorLoader = new VectorLoader(newRoot); + + vectorLoader.load(recordBatch); + + FieldReader intReader = newRoot.getVector("int").getReader(); + FieldReader bigIntReader = newRoot.getVector("bigInt").getReader(); + for (int i = 0; i < count; i++) { + intReader.setPosition(i); + Assert.assertEquals(i, intReader.readInteger().intValue()); + bigIntReader.setPosition(i); + Assert.assertEquals(i, bigIntReader.readLong().longValue()); + } + } + } + } + + @Test + public void testUnloadLoadAddPadding() throws IOException { + int count = 10000; + Schema schema; + try ( + BufferAllocator originalVectorsAllocator = + allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + NonNullableStructVector parent = NonNullableStructVector.empty("parent", originalVectorsAllocator)) { + + // write some data + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + ListWriter list = rootWriter.list("list"); + IntWriter intWriter = list.integer(); + for (int i = 0; i < count; i++) { + 
list.setPosition(i); + list.startList(); + for (int j = 0; j < i % 4 + 1; j++) { + intWriter.writeInt(i); + } + list.endList(); + } + writer.setValueCount(count); + + // unload it + FieldVector root = parent.getChild("root"); + schema = new Schema(root.getField().getChildren()); + VectorUnloader vectorUnloader = newVectorUnloader(root); + try ( + ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); + BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + VectorSchemaRoot newRoot = VectorSchemaRoot.create(schema, finalVectorsAllocator); + ) { + List<ArrowBuf> oldBuffers = recordBatch.getBuffers(); + List<ArrowBuf> newBuffers = new ArrayList<>(); // padded copies; released at the end of this block + for (ArrowBuf oldBuffer : oldBuffers) { + long l = oldBuffer.readableBytes(); + if (l % 64 != 0) { + // pad: round the length up to the next multiple of 64 bytes + l = l + 64 - l % 64; + } + ArrowBuf newBuffer = allocator.buffer(l); + for (long i = oldBuffer.readerIndex(); i < oldBuffer.writerIndex(); i++) { + newBuffer.setByte(i - oldBuffer.readerIndex(), oldBuffer.getByte(i)); + } + newBuffer.readerIndex(0); + newBuffer.writerIndex(l); + newBuffers.add(newBuffer); + } + + try (ArrowRecordBatch newBatch = + new ArrowRecordBatch(recordBatch.getLength(), recordBatch.getNodes(), newBuffers);) { + // load it + VectorLoader vectorLoader = new VectorLoader(newRoot); + + vectorLoader.load(newBatch); + + FieldReader reader = newRoot.getVector("list").getReader(); + for (int i = 0; i < count; i++) { + reader.setPosition(i); + List<Integer> expected = new ArrayList<>(); // row i holds (i % 4 + 1) copies of i + for (int j = 0; j < i % 4 + 1; j++) { + expected.add(i); + } + Assert.assertEquals(expected, reader.readObject()); + } + } + + for (ArrowBuf newBuf : newBuffers) { + newBuf.getReferenceManager().release(); + } + } + } + } + + /** + * The validity buffer can be empty if: + * - all values are defined. + * - all values are null.
+ * + * @throws IOException on error + */ + @Test + public void testLoadValidityBuffer() throws IOException { + Schema schema = new Schema(asList( + new Field("intDefined", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()), + new Field("intNull", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()) + )); + int count = 10; + ArrowBuf[] values = new ArrowBuf[4]; + for (int i = 0; i < 4; i += 2) { + ArrowBuf buf1 = allocator.buffer(BitVectorHelper.getValidityBufferSize(count)); + ArrowBuf buf2 = allocator.buffer(count * 4); // integers + buf1.setZero(0, buf1.capacity()); + buf2.setZero(0, buf2.capacity()); + values[i] = buf1; + values[i + 1] = buf2; + for (int j = 0; j < count; j++) { + if (i == 2) { + BitVectorHelper.unsetBit(buf1, j); + } else { + BitVectorHelper.setBit(buf1, j); + } + + buf2.setInt(j * 4, j); + } + buf1.writerIndex((int) Math.ceil(count / 8)); + buf2.writerIndex(count * 4); + } + + /* + * values[0] - validity buffer for first vector + * values[1] - data buffer for first vector + * values[2] - validity buffer for second vector + * values[3] - data buffer for second vector + */ + + try ( + ArrowRecordBatch recordBatch = new ArrowRecordBatch(count, asList(new ArrowFieldNode(count, 0), + new ArrowFieldNode(count, count)), asList(values[0], values[1], values[2], values[3])); + BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + VectorSchemaRoot newRoot = VectorSchemaRoot.create(schema, finalVectorsAllocator); + ) { + + // load it + VectorLoader vectorLoader = new VectorLoader(newRoot); + + vectorLoader.load(recordBatch); + + IntVector intDefinedVector = (IntVector) newRoot.getVector("intDefined"); + IntVector intNullVector = (IntVector) newRoot.getVector("intNull"); + for (int i = 0; i < count; i++) { + assertFalse("#" + i, intDefinedVector.isNull(i)); + assertEquals("#" + i, i, intDefinedVector.get(i)); + assertTrue("#" + i, intNullVector.isNull(i)); 
+ } + intDefinedVector.setSafe(count + 10, 1234); + assertTrue(intDefinedVector.isNull(count + 1)); + // empty slots should still default to unset + intDefinedVector.setSafe(count + 1, 789); + assertFalse(intDefinedVector.isNull(count + 1)); + assertEquals(789, intDefinedVector.get(count + 1)); + assertTrue(intDefinedVector.isNull(count)); + assertTrue(intDefinedVector.isNull(count + 2)); + assertTrue(intDefinedVector.isNull(count + 3)); + assertTrue(intDefinedVector.isNull(count + 4)); + assertTrue(intDefinedVector.isNull(count + 5)); + assertTrue(intDefinedVector.isNull(count + 6)); + assertTrue(intDefinedVector.isNull(count + 7)); + assertTrue(intDefinedVector.isNull(count + 8)); + assertTrue(intDefinedVector.isNull(count + 9)); + assertFalse(intDefinedVector.isNull(count + 10)); + assertEquals(1234, intDefinedVector.get(count + 10)); + } finally { + for (ArrowBuf arrowBuf : values) { + arrowBuf.getReferenceManager().release(); + } + } + } + + @Test + public void testUnloadLoadDuplicates() throws IOException { + int count = 10; + Schema schema = new Schema(asList( + new Field("duplicate", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()), + new Field("duplicate", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()) + )); + + try ( + BufferAllocator originalVectorsAllocator = + allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + ) { + List<FieldVector> sources = new ArrayList<>(); // one populated vector per schema field, in schema order + for (Field field : schema.getFields()) { + FieldVector vector = field.createVector(originalVectorsAllocator); + vector.allocateNew(); + sources.add(vector); + IntVector intVector = (IntVector) vector; + for (int i = 0; i < count; i++) { + intVector.set(i, i); + } + intVector.setValueCount(count); + } + + try (VectorSchemaRoot root = new VectorSchemaRoot(schema.getFields(), sources, count)) { + VectorUnloader vectorUnloader = new VectorUnloader(root); + try (ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); +
BufferAllocator finalVectorsAllocator = + allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + VectorSchemaRoot newRoot = VectorSchemaRoot.create(schema, finalVectorsAllocator);) { + // load it + VectorLoader vectorLoader = new VectorLoader(newRoot); + vectorLoader.load(recordBatch); + + List<FieldVector> targets = newRoot.getFieldVectors(); // compared position by position with sources + Assert.assertEquals(sources.size(), targets.size()); + for (int k = 0; k < sources.size(); k++) { + IntVector src = (IntVector) sources.get(k); + IntVector tgt = (IntVector) targets.get(k); + Assert.assertEquals(src.getValueCount(), tgt.getValueCount()); + for (int i = 0; i < count; i++) { + Assert.assertEquals(src.get(i), tgt.get(i)); + } + } + } + } + } + } + + public static VectorUnloader newVectorUnloader(FieldVector root) { + Schema schema = new Schema(root.getField().getChildren()); + int valueCount = root.getValueCount(); + List<FieldVector> fields = root.getChildrenFromFields(); // children of root become the columns to unload + VectorSchemaRoot vsr = new VectorSchemaRoot(schema.getFields(), fields, valueCount); + return new VectorUnloader(vsr); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java new file mode 100644 index 000000000..4495881ad --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java @@ -0,0 +1,740 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.compare; + +import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.nio.charset.Charset; +import java.util.Arrays; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.LargeVarCharVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.compare.util.ValueEpsilonEqualizers; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.complex.impl.NullableStructWriter; +import org.apache.arrow.vector.complex.impl.UnionFixedSizeListWriter; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.holders.NullableBigIntHolder; +import org.apache.arrow.vector.holders.NullableFloat4Holder; +import org.apache.arrow.vector.holders.NullableFloat8Holder; +import org.apache.arrow.vector.holders.NullableIntHolder; +import org.apache.arrow.vector.holders.NullableUInt4Holder; +import 
org.apache.arrow.vector.types.FloatingPointPrecision; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; + +public class TestRangeEqualsVisitor { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + private static final Charset utf8Charset = Charset.forName("UTF-8"); + private static final byte[] STR1 = "AAAAA1".getBytes(utf8Charset); + private static final byte[] STR2 = "BBBBBBBBB2".getBytes(utf8Charset); + private static final byte[] STR3 = "CCCC3".getBytes(utf8Charset); + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testIntVectorEqualsWithNull() { + try (final IntVector vector1 = new IntVector("int", allocator); + final IntVector vector2 = new IntVector("int", allocator)) { + + setVector(vector1, 1, 2); + setVector(vector2, 1, null); + + assertFalse(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testEqualsWithTypeChange() { + try (final IntVector vector1 = new IntVector("vector", allocator); + final IntVector vector2 = new IntVector("vector", allocator); + final BigIntVector vector3 = new BigIntVector("vector", allocator)) { + + setVector(vector1, 1, 2); + setVector(vector2, 1, 2); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + Range range = new Range(0, 0, 2); + assertTrue(vector1.accept(visitor, range)); + // visitor left vector changed, will reset and check type again + assertFalse(vector3.accept(visitor, range)); + } + } + + @Test + public void testBaseFixedWidthVectorRangeEqual() { + try (final IntVector vector1 = new IntVector("int", allocator); + final IntVector vector2 = new IntVector("int", allocator)) { 
+ + setVector(vector1, 1, 2, 3, 4, 5); + setVector(vector2, 11, 2, 3, 4, 55); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + assertTrue(visitor.rangeEquals(new Range(1, 1, 3))); + } + } + + @Test + public void testBaseVariableVectorRangeEquals() { + try (final VarCharVector vector1 = new VarCharVector("varchar", allocator); + final VarCharVector vector2 = new VarCharVector("varchar", allocator)) { + + setVector(vector1, STR1, STR2, STR3, STR2, STR1); + setVector(vector2, STR1, STR2, STR3, STR2, STR1); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + assertTrue(visitor.rangeEquals(new Range(1, 1, 3))); + } + } + + @Test + public void testListVectorWithDifferentChild() { + try (final ListVector vector1 = ListVector.empty("list", allocator); + final ListVector vector2 = ListVector.empty("list", allocator);) { + + vector1.allocateNew(); + vector1.initializeChildrenFromFields( + Arrays.asList(Field.nullable("child", new ArrowType.Int(32, true)))); + + vector2.allocateNew(); + vector2.initializeChildrenFromFields( + Arrays.asList(Field.nullable("child", new ArrowType.Int(64, true)))); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + assertFalse(visitor.rangeEquals(new Range(0, 0, 0))); + } + } + + @Test + public void testListVectorRangeEquals() { + try (final ListVector vector1 = ListVector.empty("list", allocator); + final ListVector vector2 = ListVector.empty("list", allocator);) { + + UnionListWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + //set some values + writeListVector(writer1, new int[] {1, 2}); + writeListVector(writer1, new int[] {3, 4}); + writeListVector(writer1, new int[] {5, 6}); + writeListVector(writer1, new int[] {7, 8}); + writeListVector(writer1, new int[] {9, 10}); + writer1.setValueCount(5); + + UnionListWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + //set some values + writeListVector(writer2, new int[] {0, 0}); + 
writeListVector(writer2, new int[] {3, 4}); + writeListVector(writer2, new int[] {5, 6}); + writeListVector(writer2, new int[] {7, 8}); + writeListVector(writer2, new int[] {0, 0}); + writer2.setValueCount(5); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + assertTrue(visitor.rangeEquals(new Range(1, 1, 3))); + } + } + + @Test + public void testFixedSizeListVectorRangeEquals() { + try (final FixedSizeListVector vector1 = FixedSizeListVector.empty("list", 2, allocator); + final FixedSizeListVector vector2 = FixedSizeListVector.empty("list", 2, allocator);) { + + UnionFixedSizeListWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + //set some values + writeFixedSizeListVector(writer1, new int[] {1, 2}); + writeFixedSizeListVector(writer1, new int[] {3, 4}); + writeFixedSizeListVector(writer1, new int[] {5, 6}); + writeFixedSizeListVector(writer1, new int[] {7, 8}); + writeFixedSizeListVector(writer1, new int[] {9, 10}); + writer1.setValueCount(5); + + UnionFixedSizeListWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + //set some values + writeFixedSizeListVector(writer2, new int[] {0, 0}); + writeFixedSizeListVector(writer2, new int[] {3, 4}); + writeFixedSizeListVector(writer2, new int[] {5, 6}); + writeFixedSizeListVector(writer2, new int[] {7, 8}); + writeFixedSizeListVector(writer2, new int[] {0, 0}); + writer2.setValueCount(5); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + assertTrue(visitor.rangeEquals(new Range(1, 1, 3))); + assertFalse(visitor.rangeEquals(new Range(0, 0, 5))); + } + } + + @Test + public void testLargeVariableWidthVectorRangeEquals() { + try (final LargeVarCharVector vector1 = new LargeVarCharVector("vector1", allocator); + final LargeVarCharVector vector2 = new LargeVarCharVector("vector2", allocator)) { + setVector(vector1, "aaa", "bbb", "ccc", null, "ddd"); + setVector(vector2, "ccc", "aaa", "bbb", null, "ddd"); + + RangeEqualsVisitor visitor = new 
RangeEqualsVisitor(vector1, vector2, + (v1, v2) -> new TypeEqualsVisitor(v2, /*check name*/ false, /*check metadata*/ false).equals(v1)); + + assertFalse(visitor.rangeEquals(new Range(/*left start*/ 0, /*right start*/ 0, /*length*/ 1))); + assertTrue(visitor.rangeEquals(new Range(/*left start*/ 0, /*right start*/ 1, /*length*/ 1))); + assertFalse(visitor.rangeEquals(new Range(/*left start*/ 0, /*right start*/ 0, /*length*/ 3))); + assertTrue(visitor.rangeEquals(new Range(/*left start*/ 0, /*right start*/ 1, /*length*/ 2))); + assertTrue(visitor.rangeEquals(new Range(/*left start*/ 3, /*right start*/ 3, /*length*/ 1))); + assertTrue(visitor.rangeEquals(new Range(/*left start*/ 3, /*right start*/ 3, /*length*/ 2))); + assertFalse(visitor.rangeEquals(new Range(/*left start*/ 2, /*right start*/ 2, /*length*/ 2))); + } + } + + @Test + public void testStructVectorRangeEquals() { + try (final StructVector vector1 = StructVector.empty("struct", allocator); + final StructVector vector2 = StructVector.empty("struct", allocator);) { + vector1.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + vector1.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + vector2.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + vector2.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + + NullableStructWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + writeStructVector(writer1, 1, 10L); + writeStructVector(writer1, 2, 20L); + writeStructVector(writer1, 3, 30L); + writeStructVector(writer1, 4, 40L); + writeStructVector(writer1, 5, 50L); + writer1.setValueCount(5); + + NullableStructWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + writeStructVector(writer2, 0, 00L); + writeStructVector(writer2, 2, 20L); + writeStructVector(writer2, 3, 30L); + writeStructVector(writer2, 4, 40L); + writeStructVector(writer2, 0, 0L); + 
writer2.setValueCount(5); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + assertTrue(visitor.rangeEquals(new Range(1, 1, 3))); + } + } + + @Test + public void testUnionVectorRangeEquals() { + try (final UnionVector vector1 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); + final UnionVector vector2 = + new UnionVector("union", allocator, /* field type */ null, /* call-back */ null);) { + + final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder(); + uInt4Holder.value = 10; + uInt4Holder.isSet = 1; + + final NullableIntHolder intHolder = new NullableIntHolder(); + intHolder.value = 20; // non-null INT value written to slots 1 and 2 of both vectors + intHolder.isSet = 1; + + vector1.setType(0, Types.MinorType.UINT4); + vector1.setSafe(0, uInt4Holder); + + vector1.setType(1, Types.MinorType.INT); + vector1.setSafe(1, intHolder); + + vector1.setType(2, Types.MinorType.INT); + vector1.setSafe(2, intHolder); + vector1.setValueCount(3); + + vector2.setType(0, Types.MinorType.UINT4); + vector2.setSafe(0, uInt4Holder); + + vector2.setType(1, Types.MinorType.INT); + vector2.setSafe(1, intHolder); + + vector2.setType(2, Types.MinorType.INT); + vector2.setSafe(2, intHolder); + vector2.setValueCount(3); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + assertTrue(visitor.rangeEquals(new Range(1, 1, 2))); // compares the two INT slots (1 and 2) + } + } + + /** + * Test comparing two union vectors. + * The two vectors are different in total, but have a range with equal values.
+ */ + @Test + public void testUnionVectorSubRangeEquals() { + try (final UnionVector vector1 = new UnionVector("union", allocator, null, null); + final UnionVector vector2 = new UnionVector("union", allocator, null, null);) { + + final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder(); + uInt4Holder.value = 10; + uInt4Holder.isSet = 1; + + final NullableIntHolder intHolder = new NullableIntHolder(); + intHolder.value = 20; + intHolder.isSet = 1; + + vector1.setType(0, Types.MinorType.UINT4); + vector1.setSafe(0, uInt4Holder); + + vector1.setType(1, Types.MinorType.INT); + vector1.setSafe(1, intHolder); + + vector1.setType(2, Types.MinorType.INT); + vector1.setSafe(2, intHolder); + + vector1.setType(3, Types.MinorType.INT); + vector1.setSafe(3, intHolder); + + vector1.setValueCount(4); + + vector2.setType(0, Types.MinorType.UINT4); + vector2.setSafe(0, uInt4Holder); + + vector2.setType(1, Types.MinorType.INT); + vector2.setSafe(1, intHolder); + + vector2.setType(2, Types.MinorType.INT); + vector2.setSafe(2, intHolder); + + vector2.setType(3, Types.MinorType.UINT4); + vector2.setSafe(3, uInt4Holder); + + vector2.setValueCount(4); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + assertFalse(visitor.rangeEquals(new Range(0, 0, 4))); + assertTrue(visitor.rangeEquals(new Range(1, 1, 2))); + } + } + + @Test + public void testDenseUnionVectorEquals() { + final NullableIntHolder intHolder = new NullableIntHolder(); + intHolder.isSet = 1; + intHolder.value = 100; + + final NullableBigIntHolder bigIntHolder = new NullableBigIntHolder(); + bigIntHolder.isSet = 1; + bigIntHolder.value = 200L; + + final NullableFloat4Holder float4Holder = new NullableFloat4Holder(); + float4Holder.isSet = 1; + float4Holder.value = 400F; + + final NullableFloat8Holder float8Holder = new NullableFloat8Holder(); + float8Holder.isSet = 1; + float8Holder.value = 800D; + + try (DenseUnionVector vector1 = new DenseUnionVector("vector1", allocator, null, null); + 
DenseUnionVector vector2 = new DenseUnionVector("vector2", allocator, null, null)) { + vector1.allocateNew(); + vector2.allocateNew(); + + // populate vector1: {100, 200L, null, 400F, 800D} + byte intTypeId = vector1.registerNewTypeId(Field.nullable("int", Types.MinorType.INT.getType())); + byte longTypeId = vector1.registerNewTypeId(Field.nullable("long", Types.MinorType.BIGINT.getType())); + byte floatTypeId = vector1.registerNewTypeId(Field.nullable("float", Types.MinorType.FLOAT4.getType())); + byte doubleTypeId = vector1.registerNewTypeId(Field.nullable("double", Types.MinorType.FLOAT8.getType())); + + vector1.setTypeId(0, intTypeId); + vector1.setSafe(0, intHolder); + + vector1.setTypeId(1, longTypeId); + vector1.setSafe(1, bigIntHolder); + + vector1.setTypeId(3, floatTypeId); + vector1.setSafe(3, float4Holder); + + vector1.setTypeId(4, doubleTypeId); + vector1.setSafe(4, float8Holder); + + vector1.setValueCount(5); + + // populate vector2: {400F, null, 200L, null, 400F, 800D, 100} + intTypeId = vector2.registerNewTypeId(Field.nullable("int", Types.MinorType.INT.getType())); + longTypeId = vector2.registerNewTypeId(Field.nullable("long", Types.MinorType.BIGINT.getType())); + floatTypeId = vector2.registerNewTypeId(Field.nullable("float", Types.MinorType.FLOAT4.getType())); + doubleTypeId = vector2.registerNewTypeId(Field.nullable("double", Types.MinorType.FLOAT8.getType())); + + vector2.setTypeId(0, floatTypeId); + vector2.setSafe(0, float4Holder); + + vector2.setTypeId(2, longTypeId); + vector2.setSafe(2, bigIntHolder); + + vector2.setTypeId(4, floatTypeId); + vector2.setSafe(4, float4Holder); + + vector2.setTypeId(5, doubleTypeId); + vector2.setSafe(5, float8Holder); + + vector2.setTypeId(6, intTypeId); + vector2.setSafe(6, intHolder); + + vector2.setValueCount(7); + + // compare ranges + TypeEqualsVisitor typeVisitor = + new TypeEqualsVisitor(vector2, /* check name */ false, /* check meta data */ true); + RangeEqualsVisitor equalsVisitor = + new 
RangeEqualsVisitor(vector1, vector2, (left, right) -> typeVisitor.equals(left)); + + // different ranges {100, 200L} != {400F, null} + assertFalse(equalsVisitor.rangeEquals(new Range(0, 0, 2))); + + // different ranges without null {400F, 800D} != {800D, 100} + assertFalse(equalsVisitor.rangeEquals(new Range(3, 5, 2))); + + // equal ranges {200L, null, 400F, 800D} + assertTrue(equalsVisitor.rangeEquals(new Range(1, 2, 4))); + + // equal ranges without null {400F, 800D} + assertTrue(equalsVisitor.rangeEquals(new Range(3, 4, 2))); + + // equal ranges with only null {null} + assertTrue(equalsVisitor.rangeEquals(new Range(2, 3, 1))); + + // equal ranges with single element {100} + assertTrue(equalsVisitor.rangeEquals(new Range(0, 6, 1))); + + // different ranges with single element {100} != {200L} + assertFalse(equalsVisitor.rangeEquals(new Range(0, 2, 1))); + } + } + + @Ignore + @Test + public void testEqualsWithOutTypeCheck() { + try (final IntVector intVector = new IntVector("int", allocator); + final ZeroVector zeroVector = new ZeroVector("zero")) { + + assertTrue(VectorEqualsVisitor.vectorEquals(intVector, zeroVector, null)); + assertTrue(VectorEqualsVisitor.vectorEquals(zeroVector, intVector, null)); + } + } + + @Test + public void testFloat4ApproxEquals() { + try (final Float4Vector vector1 = new Float4Vector("float", allocator); + final Float4Vector vector2 = new Float4Vector("float", allocator); + final Float4Vector vector3 = new Float4Vector("float", allocator)) { + + final float epsilon = 1.0E-6f; + setVector(vector1, 1.1f, 2.2f); + setVector(vector2, 1.1f + epsilon / 2, 2.2f + epsilon / 2); + setVector(vector3, 1.1f + epsilon * 2, 2.2f + epsilon * 2); + + Range range = new Range(0, 0, vector1.getValueCount()); + + ApproxEqualsVisitor visitor12 = new ApproxEqualsVisitor(vector1, vector2, epsilon, epsilon); + assertTrue(visitor12.rangeEquals(range)); + + ApproxEqualsVisitor visitor13 = new ApproxEqualsVisitor(vector1, vector3, epsilon, epsilon); +
assertFalse(visitor13.rangeEquals(range)); + } + } + + @Test + public void testFloat8ApproxEquals() { + try (final Float8Vector vector1 = new Float8Vector("float", allocator); + final Float8Vector vector2 = new Float8Vector("float", allocator); + final Float8Vector vector3 = new Float8Vector("float", allocator)) { + + final float epsilon = 1.0E-6f; + setVector(vector1, 1.1, 2.2); + setVector(vector2, 1.1 + epsilon / 2, 2.2 + epsilon / 2); + setVector(vector3, 1.1 + epsilon * 2, 2.2 + epsilon * 2); + + Range range = new Range(0, 0, vector1.getValueCount()); + assertTrue(new ApproxEqualsVisitor(vector1, vector2, epsilon, epsilon).rangeEquals(range)); + assertFalse(new ApproxEqualsVisitor(vector1, vector3, epsilon, epsilon).rangeEquals(range)); + } + } + + @Test + public void testStructVectorApproxEquals() { + try (final StructVector right = StructVector.empty("struct", allocator); + final StructVector left1 = StructVector.empty("struct", allocator); + final StructVector left2 = StructVector.empty("struct", allocator)) { + right.addOrGet("f0", + FieldType.nullable(new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), Float4Vector.class); + right.addOrGet("f1", + FieldType.nullable(new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), Float8Vector.class); + left1.addOrGet("f0", + FieldType.nullable(new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), Float4Vector.class); + left1.addOrGet("f1", + FieldType.nullable(new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), Float8Vector.class); + left2.addOrGet("f0", + FieldType.nullable(new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), Float4Vector.class); + left2.addOrGet("f1", + FieldType.nullable(new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), Float8Vector.class); + + final float epsilon = 1.0E-6f; + + NullableStructWriter rightWriter = right.getWriter(); + rightWriter.allocate(); + writeStructVector(rightWriter, 1.1f, 2.2); + writeStructVector(rightWriter, 2.02f, 
4.04); + rightWriter.setValueCount(2); + + NullableStructWriter leftWriter1 = left1.getWriter(); + leftWriter1.allocate(); + writeStructVector(leftWriter1, 1.1f + epsilon / 2, 2.2 + epsilon / 2); + writeStructVector(leftWriter1, 2.02f - epsilon / 2, 4.04 - epsilon / 2); + leftWriter1.setValueCount(2); + + NullableStructWriter leftWriter2 = left2.getWriter(); + leftWriter2.allocate(); + writeStructVector(leftWriter2, 1.1f + epsilon * 2, 2.2 + epsilon * 2); + writeStructVector(leftWriter2, 2.02f - epsilon * 2, 4.04 - epsilon * 2); + leftWriter2.setValueCount(2); + + Range range = new Range(0, 0, right.getValueCount()); + assertTrue(new ApproxEqualsVisitor(left1, right, epsilon, epsilon).rangeEquals(range)); + assertFalse(new ApproxEqualsVisitor(left2, right, epsilon, epsilon).rangeEquals(range)); + } + } + + @Test + public void testUnionVectorApproxEquals() { + try (final UnionVector right = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); + final UnionVector left1 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); + final UnionVector left2 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null);) { + + final NullableFloat4Holder float4Holder = new NullableFloat4Holder(); + float4Holder.value = 1.01f; + float4Holder.isSet = 1; + + final NullableFloat8Holder float8Holder = new NullableFloat8Holder(); + float8Holder.value = 2.02f; + float8Holder.isSet = 1; + + final float epsilon = 1.0E-6f; + + right.setType(0, Types.MinorType.FLOAT4); + right.setSafe(0, float4Holder); + right.setType(1, Types.MinorType.FLOAT8); + right.setSafe(1, float8Holder); + right.setValueCount(2); + + float4Holder.value += epsilon / 2; + float8Holder.value += epsilon / 2; + + left1.setType(0, Types.MinorType.FLOAT4); + left1.setSafe(0, float4Holder); + left1.setType(1, Types.MinorType.FLOAT8); + left1.setSafe(1, float8Holder); + left1.setValueCount(2); + + float4Holder.value += epsilon * 2; + 
float8Holder.value += epsilon * 2; + + left2.setType(0, Types.MinorType.FLOAT4); + left2.setSafe(0, float4Holder); + left2.setType(1, Types.MinorType.FLOAT8); + left2.setSafe(1, float8Holder); + left2.setValueCount(2); + + Range range = new Range(0, 0, right.getValueCount()); + assertTrue(new ApproxEqualsVisitor(left1, right, epsilon, epsilon).rangeEquals(range)); + assertFalse(new ApproxEqualsVisitor(left2, right, epsilon, epsilon).rangeEquals(range)); + } + } + + @Test + public void testDenseUnionVectorApproxEquals() { + final NullableFloat4Holder float4Holder = new NullableFloat4Holder(); + float4Holder.isSet = 1; + + final NullableFloat8Holder float8Holder = new NullableFloat8Holder(); + float8Holder.isSet = 1; + + final float floatEpsilon = 0.02F; + final double doubleEpsilon = 0.02; + + try (final DenseUnionVector vector1 = new DenseUnionVector("vector1", allocator, null, null); + final DenseUnionVector vector2 = new DenseUnionVector("vector2", allocator, null, null); + final DenseUnionVector vector3 = new DenseUnionVector("vector2", allocator, null, null)) { + + vector1.allocateNew(); + vector2.allocateNew(); + vector3.allocateNew(); + + // populate vector1: {1.0f, 2.0D} + byte floatTypeId = vector1.registerNewTypeId(Field.nullable("float", Types.MinorType.FLOAT4.getType())); + byte doubleTypeId = vector1.registerNewTypeId(Field.nullable("double", Types.MinorType.FLOAT8.getType())); + + float4Holder.value = 1.0f; + vector1.setTypeId(0, floatTypeId); + vector1.setSafe(0, float4Holder); + float8Holder.value = 2.0D; + vector1.setTypeId(1, doubleTypeId); + vector1.setSafe(1, float8Holder); + vector1.setValueCount(2); + + // populate vector2: {1.01f, 2.01D} + floatTypeId = vector2.registerNewTypeId(Field.nullable("float", Types.MinorType.FLOAT4.getType())); + doubleTypeId = vector2.registerNewTypeId(Field.nullable("double", Types.MinorType.FLOAT8.getType())); + + float4Holder.value = 1.01f; + vector2.setTypeId(0, floatTypeId); + vector2.setSafe(0, float4Holder); 
+ float8Holder.value = 2.01D; + vector2.setTypeId(1, doubleTypeId); + vector2.setSafe(1, float8Holder); + vector2.setValueCount(2); + + // populate vector3: {1.05f, 2.05D} + floatTypeId = vector3.registerNewTypeId(Field.nullable("float", Types.MinorType.FLOAT4.getType())); + doubleTypeId = vector3.registerNewTypeId(Field.nullable("double", Types.MinorType.FLOAT8.getType())); + + float4Holder.value = 1.05f; + vector3.setTypeId(0, floatTypeId); + vector3.setSafe(0, float4Holder); + float8Holder.value = 2.05D; + vector3.setTypeId(1, doubleTypeId); + vector3.setSafe(1, float8Holder); + vector3.setValueCount(2); + + // verify comparison results + Range range = new Range(0, 0, 2); + + // compare vector1 and vector2 + ApproxEqualsVisitor approxEqualsVisitor = new ApproxEqualsVisitor( + vector1, vector2, + new ValueEpsilonEqualizers.Float4EpsilonEqualizer(floatEpsilon), + new ValueEpsilonEqualizers.Float8EpsilonEqualizer(doubleEpsilon), + (v1, v2) -> new TypeEqualsVisitor(v2, /* check name */ false, /* check meta */ true).equals(v1)); + assertTrue(approxEqualsVisitor.rangeEquals(range)); + + // compare vector1 and vector3 + approxEqualsVisitor = new ApproxEqualsVisitor( + vector1, vector3, + new ValueEpsilonEqualizers.Float4EpsilonEqualizer(floatEpsilon), + new ValueEpsilonEqualizers.Float8EpsilonEqualizer(doubleEpsilon), + (v1, v2) -> new TypeEqualsVisitor(v2, /* check name */ false, /* check meta */ true).equals(v1)); + assertFalse(approxEqualsVisitor.rangeEquals(range)); + } + } + + @Test + public void testListVectorApproxEquals() { + try (final ListVector right = ListVector.empty("list", allocator); + final ListVector left1 = ListVector.empty("list", allocator); + final ListVector left2 = ListVector.empty("list", allocator);) { + + final float epsilon = 1.0E-6f; + + UnionListWriter rightWriter = right.getWriter(); + rightWriter.allocate(); + writeListVector(rightWriter, new double[] {1, 2}); + writeListVector(rightWriter, new double[] {1.01, 2.02}); + 
rightWriter.setValueCount(2); + + UnionListWriter leftWriter1 = left1.getWriter(); + leftWriter1.allocate(); + writeListVector(leftWriter1, new double[] {1, 2}); + writeListVector(leftWriter1, new double[] {1.01 + epsilon / 2, 2.02 - epsilon / 2}); + leftWriter1.setValueCount(2); + + UnionListWriter leftWriter2 = left2.getWriter(); + leftWriter2.allocate(); + writeListVector(leftWriter2, new double[] {1, 2}); + writeListVector(leftWriter2, new double[] {1.01 + epsilon * 2, 2.02 - epsilon * 2}); + leftWriter2.setValueCount(2); + + Range range = new Range(0, 0, right.getValueCount()); + assertTrue(new ApproxEqualsVisitor(left1, right, epsilon, epsilon).rangeEquals(range)); + assertFalse(new ApproxEqualsVisitor(left2, right, epsilon, epsilon).rangeEquals(range)); + } + } + + private void writeStructVector(NullableStructWriter writer, int value1, long value2) { + writer.start(); + writer.integer("f0").writeInt(value1); + writer.bigInt("f1").writeBigInt(value2); + writer.end(); + } + + private void writeStructVector(NullableStructWriter writer, float value1, double value2) { + writer.start(); + writer.float4("f0").writeFloat4(value1); + writer.float8("f1").writeFloat8(value2); + writer.end(); + } + + private void writeListVector(UnionListWriter writer, int[] values) { + writer.startList(); + for (int v: values) { + writer.integer().writeInt(v); + } + writer.endList(); + } + + private void writeFixedSizeListVector(UnionFixedSizeListWriter writer, int[] values) { + writer.startList(); + for (int v: values) { + writer.integer().writeInt(v); + } + writer.endList(); + } + + private void writeListVector(UnionListWriter writer, double[] values) { + writer.startList(); + for (double v: values) { + writer.float8().writeFloat8(v); + } + writer.endList(); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java new file mode 
100644 index 000000000..c0a3bd89d --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.compare; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.nio.charset.Charset; +import java.util.HashMap; +import java.util.Map; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Before; 
+import org.junit.Test; + +public class TestTypeEqualsVisitor { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + private static final Charset utf8Charset = Charset.forName("UTF-8"); + private static final byte[] STR1 = "AAAAA1".getBytes(utf8Charset); + private static final byte[] STR2 = "BBBBBBBBB2".getBytes(utf8Charset); + private static final byte[] STR3 = "CCCC3".getBytes(utf8Charset); + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testTypeEqualsWithName() { + try (final IntVector right = new IntVector("int", allocator); + final IntVector left1 = new IntVector("int", allocator); + final IntVector left2 = new IntVector("int2", allocator)) { + + TypeEqualsVisitor visitor = new TypeEqualsVisitor(right); + assertTrue(visitor.equals(left1)); + assertFalse(visitor.equals(left2)); + } + } + + @Test + public void testTypeEqualsWithMetadata() { + Map<String, String> metadata = new HashMap<>(); // custom metadata, attached only to typeWithMeta + metadata.put("key1", "value1"); + FieldType typeWithoutMeta = new FieldType(true, new ArrowType.Int(32, true), + null, null); + FieldType typeWithMeta = new FieldType(true, new ArrowType.Int(32, true), + null, metadata); + + try (IntVector right = (IntVector) typeWithoutMeta.createNewSingleVector("int", allocator, null); + IntVector left1 = (IntVector) typeWithoutMeta.createNewSingleVector("int", allocator, null); + IntVector left2 = (IntVector) typeWithMeta.createNewSingleVector("int", allocator, null)) { + + TypeEqualsVisitor visitor = new TypeEqualsVisitor(right); + assertTrue(visitor.equals(left1)); // same type, no metadata on either side + assertFalse(visitor.equals(left2)); // differs from right only by the metadata map + } + } + + @Test + public void testListTypeEquals() { + try (final ListVector right = ListVector.empty("list", allocator); + final ListVector left1 = ListVector.empty("list", allocator); + final ListVector left2 = ListVector.empty("list", allocator)) { + + right.addOrGetVector(FieldType.nullable(new ArrowType.Utf8())); +
left1.addOrGetVector(FieldType.nullable(new ArrowType.Utf8())); + left2.addOrGetVector(FieldType.nullable(new ArrowType.FixedSizeBinary(2))); + + TypeEqualsVisitor visitor = new TypeEqualsVisitor(right); + assertTrue(visitor.equals(left1)); + assertFalse(visitor.equals(left2)); + } + } + + @Test + public void testStructTypeEquals() { + try (final StructVector right = StructVector.empty("struct", allocator); + final StructVector left1 = StructVector.empty("struct", allocator); + final StructVector left2 = StructVector.empty("struct", allocator)) { + + right.addOrGet("child", FieldType.nullable(new ArrowType.Utf8()), VarCharVector.class); + left1.addOrGet("child", FieldType.nullable(new ArrowType.Utf8()), VarCharVector.class); + left2.addOrGet("child2", FieldType.nullable(new ArrowType.Utf8()), VarCharVector.class); + + TypeEqualsVisitor visitor = new TypeEqualsVisitor(right); + assertTrue(visitor.equals(left1)); + assertFalse(visitor.equals(left2)); + } + } + + @Test + public void testUnionTypeEquals() { + try (final UnionVector right = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); + final UnionVector left1 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); + final UnionVector left2 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null)) { + + right.addVector(new IntVector("int", allocator)); + left1.addVector(new IntVector("int", allocator)); + left2.addVector(new BigIntVector("bigint", allocator)); + + TypeEqualsVisitor visitor = new TypeEqualsVisitor(right); + assertTrue(visitor.equals(left1)); + assertFalse(visitor.equals(left2)); + } + } + + @Test + public void testDenseUnionTypeEquals() { + try (DenseUnionVector vector1 = new DenseUnionVector("vector1", allocator, null, null); + DenseUnionVector vector2 = new DenseUnionVector("vector2", allocator, null, null)) { + vector1.allocateNew(); + vector2.allocateNew(); + + // set children for vector1 + byte intTypeId = 
vector1.registerNewTypeId(Field.nullable("int", Types.MinorType.INT.getType())); + byte longTypeId = vector1.registerNewTypeId(Field.nullable("long", Types.MinorType.BIGINT.getType())); + byte floatTypeId = vector1.registerNewTypeId(Field.nullable("float", Types.MinorType.FLOAT4.getType())); + byte doubleTypeId = vector1.registerNewTypeId(Field.nullable("double", Types.MinorType.FLOAT8.getType())); + + vector1.addVector(floatTypeId, new Float4Vector("", allocator)); + vector1.addVector(longTypeId, new BigIntVector("", allocator)); + vector1.addVector(intTypeId, new IntVector("", allocator)); + vector1.addVector(doubleTypeId, new Float8Vector("", allocator)); + + // set children for vector2 + intTypeId = vector2.registerNewTypeId(Field.nullable("int", Types.MinorType.INT.getType())); + longTypeId = vector2.registerNewTypeId(Field.nullable("long", Types.MinorType.BIGINT.getType())); + floatTypeId = vector2.registerNewTypeId(Field.nullable("float", Types.MinorType.FLOAT4.getType())); + doubleTypeId = vector2.registerNewTypeId(Field.nullable("double", Types.MinorType.FLOAT8.getType())); + + // add vectors in a different order + vector2.addVector(intTypeId, new IntVector("", allocator)); + vector2.addVector(floatTypeId, new Float4Vector("", allocator)); + vector2.addVector(doubleTypeId, new Float8Vector("", allocator)); + vector2.addVector(longTypeId, new BigIntVector("", allocator)); + + // compare ranges + TypeEqualsVisitor typeVisitor = + new TypeEqualsVisitor(vector2, /* check name */ false, /* check meta data */ true); + assertTrue(typeVisitor.equals(vector1)); + + // if we check names, the types should be different + typeVisitor = + new TypeEqualsVisitor(vector2, /* check name */ true, /* check meta data */ true); + assertFalse(typeVisitor.equals(vector1)); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestComplexCopier.java 
b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestComplexCopier.java new file mode 100644 index 000000000..f314a98ee --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestComplexCopier.java @@ -0,0 +1,763 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.complex.impl; + +import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.math.BigDecimal; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.compare.VectorEqualsVisitor; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.holders.DecimalHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.DecimalUtility; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestComplexCopier { + + private BufferAllocator allocator; + + private static final int COUNT = 100; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testCopyFixedSizeListVector() { + try (FixedSizeListVector from = FixedSizeListVector.empty("v", 3, allocator); + FixedSizeListVector to = FixedSizeListVector.empty("v", 3, allocator)) { + + from.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())); + to.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())); + + // populate from vector + UnionFixedSizeListWriter writer = from.getWriter(); + for (int i = 0; i < COUNT; i++) { + writer.startList(); + 
writer.integer().writeInt(i); + writer.integer().writeInt(i * 2); + writer.integer().writeInt(i * 3); + writer.endList(); + } + from.setValueCount(COUNT); + to.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + for (int i = 0; i < COUNT; i++) { + in.setPosition(i); + out.setPosition(i); + ComplexCopier.copy(in, out); + } + + // validate equals + assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + + } + } + + @Test + public void testInvalidCopyFixedSizeListVector() { + try (FixedSizeListVector from = FixedSizeListVector.empty("v", 3, allocator); + FixedSizeListVector to = FixedSizeListVector.empty("v", 2, allocator)) { + + from.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())); + to.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())); + + // populate from vector + UnionFixedSizeListWriter writer = from.getWriter(); + for (int i = 0; i < COUNT; i++) { + writer.startList(); + writer.integer().writeInt(i); + writer.integer().writeInt(i * 2); + writer.integer().writeInt(i * 3); + writer.endList(); + } + from.setValueCount(COUNT); + to.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + IllegalStateException e = assertThrows(IllegalStateException.class, + () -> ComplexCopier.copy(in, out)); + assertTrue(e.getMessage().contains("greater than listSize")); + } + } + + @Test + public void testCopyMapVector() { + try (final MapVector from = MapVector.empty("v", allocator, false); + final MapVector to = MapVector.empty("v", allocator, false)) { + + from.allocateNew(); + + UnionMapWriter mapWriter = from.getWriter(); + for (int i = 0; i < COUNT; i++) { + mapWriter.setPosition(i); + mapWriter.startMap(); + mapWriter.startEntry(); + mapWriter.key().integer().writeInt(i); + mapWriter.value().integer().writeInt(i); + mapWriter.endEntry(); + mapWriter.startEntry(); + 
mapWriter.key().decimal().writeDecimal(BigDecimal.valueOf(i * 2)); + mapWriter.value().decimal().writeDecimal(BigDecimal.valueOf(i * 2)); + mapWriter.endEntry(); + mapWriter.endMap(); + } + + from.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + for (int i = 0; i < COUNT; i++) { + in.setPosition(i); + out.setPosition(i); + ComplexCopier.copy(in, out); + } + to.setValueCount(COUNT); + + // validate equals + assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + } + } + + @Test + public void testCopyListVector() { + try (ListVector from = ListVector.empty("v", allocator); + ListVector to = ListVector.empty("v", allocator)) { + + UnionListWriter listWriter = from.getWriter(); + listWriter.allocate(); + + for (int i = 0; i < COUNT; i++) { + listWriter.setPosition(i); + listWriter.startList(); + + listWriter.integer().writeInt(i); + listWriter.integer().writeInt(i * 2); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(i); + listWriter.list().bigInt().writeBigInt(i * 2); + listWriter.list().bigInt().writeBigInt(i * 3); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().decimal().writeDecimal(BigDecimal.valueOf(i * 4)); + listWriter.list().decimal().writeDecimal(BigDecimal.valueOf(i * 5)); + listWriter.list().endList(); + listWriter.endList(); + } + from.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + for (int i = 0; i < COUNT; i++) { + in.setPosition(i); + out.setPosition(i); + ComplexCopier.copy(in, out); + } + + to.setValueCount(COUNT); + + // validate equals + assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + + } + } + + @Test + public void testCopyListVectorToANonEmptyList() { + try (ListVector from = ListVector.empty("v", allocator); + ListVector to = ListVector.empty("v", allocator)) { + + UnionListWriter listWriter = from.getWriter(); + listWriter.allocate(); 
+ + for (int i = 0; i < COUNT; i++) { + listWriter.setPosition(i); + listWriter.startList(); + listWriter.integer().writeInt(i); + listWriter.integer().writeInt(i * 2); + listWriter.endList(); + } + from.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + for (int i = 0; i < COUNT; i++) { + in.setPosition(i); + out.setPosition(i); + ComplexCopier.copy(in, out); + } + to.setValueCount(COUNT); + // validate equals + assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + + // Copy again to the target vector which is non-empty + for (int i = 0; i < COUNT; i++) { + in.setPosition(i); + out.setPosition(i); + ComplexCopier.copy(in, out); + } + to.setValueCount(COUNT); + + // validate equals + assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + + // copy using copyFromSafe method + for (int i = 0; i < COUNT; i++) { + to.copyFromSafe(i, i, from); + } + to.setValueCount(COUNT); + + // validate equals + assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + } + } + + @Test + public void testCopyListVectorWithNulls() { + try (ListVector from = ListVector.empty("v", allocator); + ListVector to = ListVector.empty("v", allocator)) { + + UnionListWriter listWriter = from.getWriter(); + listWriter.allocate(); + + // write null, [null,i,null,i*2,null] alternatively + for (int i = 0; i < COUNT; i++) { + listWriter.setPosition(i); + if (i % 2 == 0) { + listWriter.writeNull(); + continue; + } + listWriter.startList(); + listWriter.integer().writeNull(); + listWriter.integer().writeInt(i); + listWriter.integer().writeNull(); + listWriter.integer().writeInt(i * 2); + listWriter.integer().writeNull(); + listWriter.endList(); + } + from.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + for (int i = 0; i < COUNT; i++) { + in.setPosition(i); + out.setPosition(i); + ComplexCopier.copy(in, out); + } + + to.setValueCount(COUNT); + + // validate equals + 
assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + } + } + + @Test + public void testCopyListOfListVectorWithNulls() { + try (ListVector from = ListVector.empty("v", allocator); + ListVector to = ListVector.empty("v", allocator);) { + + UnionListWriter listWriter = from.getWriter(); + listWriter.allocate(); + + // write null, [null,[50,100,null,200],null, + // [null,50,null,100,null,200,null],null] alternatively + for (int i = 0; i < COUNT; i++) { + listWriter.setPosition(i); + if (i % 2 == 0) { + listWriter.writeNull(); + continue; + } + listWriter.startList(); + listWriter.list().writeNull(); + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(50); + listWriter.list().bigInt().writeBigInt(100); + listWriter.list().bigInt().writeNull(); + listWriter.list().bigInt().writeBigInt(200); + listWriter.list().endList(); + listWriter.list().writeNull(); + listWriter.list().startList(); + listWriter.list().bigInt().writeNull(); + listWriter.list().bigInt().writeBigInt(50); + listWriter.list().bigInt().writeNull(); + listWriter.list().bigInt().writeBigInt(100); + listWriter.list().bigInt().writeNull(); + listWriter.list().bigInt().writeBigInt(200); + listWriter.list().bigInt().writeNull(); + listWriter.list().endList(); + listWriter.list().writeNull(); + listWriter.endList(); + } + from.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + for (int i = 0; i < COUNT; i++) { + in.setPosition(i); + out.setPosition(i); + ComplexCopier.copy(in, out); + } + + to.setValueCount(COUNT); + + // validate equals + assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + } + } + + @Test + public void testCopyListOStructVectorWithNulls() { + try (ListVector from = ListVector.empty("v", allocator); + ListVector to = ListVector.empty("v", allocator);) { + + UnionListWriter listWriter = from.getWriter(); + listWriter.allocate(); + + // write null, [null,{"f1":1,"f2":2},null, + // 
{"f1":1,"f2":2},null] alternatively + for (int i = 0; i < COUNT; i++) { + listWriter.setPosition(i); + if (i % 2 == 0) { + listWriter.writeNull(); + continue; + } + listWriter.startList(); + listWriter.struct().writeNull(); + listWriter.struct().start(); + listWriter.struct().integer("f1").writeInt(1); + listWriter.struct().integer("f2").writeInt(2); + listWriter.struct().integer("f3").writeNull(); + listWriter.struct().end(); + listWriter.struct().writeNull(); + listWriter.struct().start(); + listWriter.struct().integer("f1").writeInt(1); + listWriter.struct().integer("f2").writeInt(2); + listWriter.struct().integer("f3").writeNull(); + listWriter.struct().end(); + listWriter.struct().writeNull(); + listWriter.endList(); + } + from.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + for (int i = 0; i < COUNT; i++) { + in.setPosition(i); + out.setPosition(i); + ComplexCopier.copy(in, out); + } + + to.setValueCount(COUNT); + + // validate equals + assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + } + } + + @Test + public void testCopyListOfListOfStructVectorWithNulls() { + try (ListVector from = ListVector.empty("v", allocator); + ListVector to = ListVector.empty("v", allocator);) { + + UnionListWriter listWriter = from.getWriter(); + listWriter.allocate(); + + // write null, + // [null,[{"f1":50},null,{"f1":100},null,{"f1":200}],null, + // [null,{"f1":50},null,{"f1":100},null,{"f1":200},null],null] + // alternatively + for (int i = 0; i < COUNT; i++) { + listWriter.setPosition(i); + if (i % 2 == 0) { + listWriter.writeNull(); + continue; + } + listWriter.startList(); + listWriter.list().writeNull(); + listWriter.list().startList(); + listWriter.list().struct().start(); + listWriter.list().struct().bigInt("f1").writeBigInt(50); + listWriter.list().struct().end(); + listWriter.list().struct().writeNull(); + listWriter.list().struct().start(); + 
listWriter.list().struct().bigInt("f1").writeBigInt(100); + listWriter.list().struct().end(); + listWriter.list().struct().writeNull(); + listWriter.list().struct().start(); + listWriter.list().struct().bigInt("f1").writeBigInt(200); + listWriter.list().struct().end(); + listWriter.list().endList(); + + listWriter.list().writeNull(); + + listWriter.list().startList(); + listWriter.list().struct().writeNull(); + listWriter.list().struct().start(); + listWriter.list().struct().bigInt("f1").writeBigInt(50); + listWriter.list().struct().end(); + + listWriter.list().struct().writeNull(); + listWriter.list().struct().start(); + listWriter.list().struct().bigInt("f1").writeBigInt(100); + listWriter.list().struct().end(); + + listWriter.list().struct().writeNull(); + listWriter.list().struct().start(); + listWriter.list().struct().bigInt("f1").writeBigInt(200); + listWriter.list().struct().end(); + + listWriter.list().struct().writeNull(); + listWriter.list().endList(); + + listWriter.list().writeNull(); + + listWriter.endList(); + } + from.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + for (int i = 0; i < COUNT; i++) { + in.setPosition(i); + out.setPosition(i); + ComplexCopier.copy(in, out); + } + + to.setValueCount(COUNT); + + // validate equals + assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + } + } + + @Test + public void testMapWithListValue() throws Exception { + try (MapVector from = MapVector.empty("map", allocator, false); + MapVector to = MapVector.empty("map", allocator, false)) { + + UnionMapWriter mapWriter = from.getWriter(); + BaseWriter.ListWriter valueWriter; + + /* allocate memory */ + mapWriter.allocate(); + + // write null, [{}, + // {"value":[]},{"key":1,"value":[null,50,null,100,null,200,null]}, + // null,{"key":2,"value":[null,75,null,125,null,150,null,175,null]}] + // alternatively + for (int i = 0; i < COUNT; i++) { + mapWriter.setPosition(i); + if (i % 2 == 0) { + 
mapWriter.writeNull(); + continue; + } + + mapWriter.startMap(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeNull(); + mapWriter.value().list().writeNull(); + mapWriter.endEntry(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeNull(); + valueWriter = mapWriter.value().list(); + valueWriter.startList(); + valueWriter.endList(); + mapWriter.endEntry(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(1); + valueWriter = mapWriter.value().list(); + valueWriter.startList(); + valueWriter.bigInt().writeNull(); + valueWriter.bigInt().writeBigInt(50); + valueWriter.bigInt().writeNull(); + valueWriter.bigInt().writeBigInt(100); + valueWriter.bigInt().writeNull(); + valueWriter.bigInt().writeBigInt(200); + valueWriter.bigInt().writeNull(); + valueWriter.endList(); + mapWriter.endEntry(); + + mapWriter.writeNull(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(2); + valueWriter = mapWriter.value().list(); + valueWriter.startList(); + valueWriter.bigInt().writeNull(); + valueWriter.bigInt().writeBigInt(75); + valueWriter.bigInt().writeNull(); + valueWriter.bigInt().writeBigInt(125); + valueWriter.bigInt().writeNull(); + valueWriter.bigInt().writeBigInt(150); + valueWriter.bigInt().writeNull(); + valueWriter.bigInt().writeBigInt(175); + valueWriter.bigInt().writeNull(); + valueWriter.endList(); + mapWriter.endEntry(); + + mapWriter.endMap(); + } + mapWriter.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + for (int i = 0; i < COUNT; i++) { + in.setPosition(i); + out.setPosition(i); + ComplexCopier.copy(in, out); + } + + to.setValueCount(COUNT); + + // validate equals + assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + } + } + + @Test + public void testCopyFixedSizedListOfDecimalsVector() { + try (FixedSizeListVector from = FixedSizeListVector.empty("v", 4, allocator); + FixedSizeListVector to = FixedSizeListVector.empty("v", 4, 
allocator)) { + from.addOrGetVector(FieldType.nullable(new ArrowType.Decimal(3, 0, 128))); + to.addOrGetVector(FieldType.nullable(new ArrowType.Decimal(3, 0, 128))); + + DecimalHolder holder = new DecimalHolder(); + holder.buffer = allocator.buffer(DecimalVector.TYPE_WIDTH); + ArrowType arrowType = new ArrowType.Decimal(3, 0, 128); + + // populate from vector + UnionFixedSizeListWriter writer = from.getWriter(); + for (int i = 0; i < COUNT; i++) { + writer.startList(); + writer.decimal().writeDecimal(BigDecimal.valueOf(i)); + + DecimalUtility.writeBigDecimalToArrowBuf(new BigDecimal(i * 2), holder.buffer, 0, DecimalVector.TYPE_WIDTH); + holder.start = 0; + holder.scale = 0; + holder.precision = 3; + writer.decimal().write(holder); + + DecimalUtility.writeBigDecimalToArrowBuf(new BigDecimal(i * 3), holder.buffer, 0, DecimalVector.TYPE_WIDTH); + writer.decimal().writeDecimal(0, holder.buffer, arrowType); + + writer.decimal().writeBigEndianBytesToDecimal(BigDecimal.valueOf(i * 4).unscaledValue().toByteArray(), + arrowType); + + writer.endList(); + } + from.setValueCount(COUNT); + to.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + for (int i = 0; i < COUNT; i++) { + in.setPosition(i); + out.setPosition(i); + ComplexCopier.copy(in, out); + } + + // validate equals + assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + holder.buffer.close(); + } + } + + @Test + public void testCopyUnionListWithDecimal() { + try (ListVector from = ListVector.empty("v", allocator); + ListVector to = ListVector.empty("v", allocator)) { + + UnionListWriter listWriter = from.getWriter(); + listWriter.allocate(); + + for (int i = 0; i < COUNT; i++) { + listWriter.setPosition(i); + listWriter.startList(); + + listWriter.decimal().writeDecimal(BigDecimal.valueOf(i * 2)); + listWriter.integer().writeInt(i); + listWriter.decimal().writeBigEndianBytesToDecimal(BigDecimal.valueOf(i * 3).unscaledValue().toByteArray(), + new 
ArrowType.Decimal(3, 0, 128)); + + listWriter.endList(); + } + from.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + for (int i = 0; i < COUNT; i++) { + in.setPosition(i); + out.setPosition(i); + ComplexCopier.copy(in, out); + } + + to.setValueCount(COUNT); + + // validate equals + assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + + } + } + + @Test + public void testCopyStructVector() { + try (final StructVector from = StructVector.empty("v", allocator); + final StructVector to = StructVector.empty("v", allocator)) { + + from.allocateNewSafe(); + + NullableStructWriter structWriter = from.getWriter(); + for (int i = 0; i < COUNT; i++) { + structWriter.setPosition(i); + structWriter.start(); + structWriter.integer("int").writeInt(i); + structWriter.decimal("dec", 0, 38).writeDecimal(BigDecimal.valueOf(i * 2)); + StructWriter innerStructWriter = structWriter.struct("struc"); + innerStructWriter.start(); + innerStructWriter.integer("innerint").writeInt(i * 3); + innerStructWriter.decimal("innerdec", 0, 38).writeDecimal(BigDecimal.valueOf(i * 4)); + innerStructWriter.decimal("innerdec", 0, 38).writeBigEndianBytesToDecimal(BigDecimal.valueOf(i * 4) + .unscaledValue().toByteArray(), new ArrowType.Decimal(3, 0, 128)); + innerStructWriter.end(); + structWriter.end(); + } + + from.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + for (int i = 0; i < COUNT; i++) { + in.setPosition(i); + out.setPosition(i); + ComplexCopier.copy(in, out); + } + to.setValueCount(COUNT); + + // validate equals + assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + } + } + + @Test + public void testCopyDecimalVectorWrongScale() { + try (FixedSizeListVector from = FixedSizeListVector.empty("v", 3, allocator); + FixedSizeListVector to = FixedSizeListVector.empty("v", 3, allocator)) { + from.addOrGetVector(FieldType.nullable(new ArrowType.Decimal(3, 2, 
128))); + to.addOrGetVector(FieldType.nullable(new ArrowType.Decimal(3, 1, 128))); + + // populate from vector + UnionFixedSizeListWriter writer = from.getWriter(); + for (int i = 0; i < COUNT; i++) { + writer.startList(); + writer.decimal().writeDecimal(BigDecimal.valueOf(1.23)); + writer.decimal().writeDecimal(BigDecimal.valueOf(2.45)); + writer.endList(); + } + from.setValueCount(COUNT); + to.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + UnsupportedOperationException e = assertThrows(UnsupportedOperationException.class, + () -> ComplexCopier.copy(in, out)); + assertTrue(e.getMessage().contains("BigDecimal scale must equal that in the Arrow vector: 2 != 1")); + } + } + + @Test + public void testCopyStructVectorWithNulls() { + try (StructVector from = StructVector.empty("v", allocator); + StructVector to = StructVector.empty("v", allocator)) { + + NullableStructWriter writer = from.getWriter(); + + for (int i = 0; i < COUNT; ++i) { + writer.setPosition(i); + writer.start(); + writer.integer("int").writeInt(i); + if (i % 3 == 0) { + writer.float4("child").writeFloat4(12.3f); + } else if (i % 3 == 1) { + writer.integer("child").writeInt(123); + } else { + writer.integer("child").writeNull(); + } + writer.end(); + } + from.setValueCount(COUNT); + + // copy values + FieldReader in = from.getReader(); + FieldWriter out = to.getWriter(); + for (int i = 0; i < COUNT; i++) { + in.setPosition(i); + out.setPosition(i); + ComplexCopier.copy(in, out); + } + to.setValueCount(COUNT); + + // validate equals + assertTrue(VectorEqualsVisitor.vectorEquals(from, to)); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java new file mode 100644 index 000000000..9dce33122 --- /dev/null +++ 
b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex.impl; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.DirtyRootAllocator; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestPromotableWriter { + private static final String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new 
DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testPromoteToUnion() throws Exception { + + try (final NonNullableStructVector container = NonNullableStructVector.empty(EMPTY_SCHEMA_PATH, allocator); + final StructVector v = container.addOrGetStruct("test"); + final PromotableWriter writer = new PromotableWriter(v, container)) { + + container.allocateNew(); + + writer.start(); + + writer.setPosition(0); + writer.bit("A").writeBit(0); + + writer.setPosition(1); + writer.bit("A").writeBit(1); + + writer.decimal("dec", 10, 10); + + writer.setPosition(2); + writer.integer("A").writeInt(10); + + // we don't write anything in 3 + + writer.setPosition(4); + writer.integer("A").writeInt(100); + + writer.end(); + + container.setValueCount(5); + + final UnionVector uv = v.getChild("A", UnionVector.class); + + assertFalse("0 shouldn't be null", uv.isNull(0)); + assertEquals(false, uv.getObject(0)); + + assertFalse("1 shouldn't be null", uv.isNull(1)); + assertEquals(true, uv.getObject(1)); + + assertFalse("2 shouldn't be null", uv.isNull(2)); + assertEquals(10, uv.getObject(2)); + + assertNull("3 should be null", uv.getObject(3)); + + assertFalse("4 shouldn't be null", uv.isNull(4)); + assertEquals(100, uv.getObject(4)); + + container.clear(); + container.allocateNew(); + + ComplexWriterImpl newWriter = new ComplexWriterImpl(EMPTY_SCHEMA_PATH, container); + + StructWriter newStructWriter = newWriter.rootAsStruct(); + + newStructWriter.start(); + + newStructWriter.setPosition(2); + newStructWriter.integer("A").writeInt(10); + + Field childField1 = container.getField().getChildren().get(0).getChildren().get(0); + Field childField2 = container.getField().getChildren().get(0).getChildren().get(1); + assertEquals("Child field should be union type: " + + childField1.getName(), ArrowTypeID.Union, childField1.getType().getTypeID()); + assertEquals("Child field should be 
decimal type: " + + childField2.getName(), ArrowTypeID.Decimal, childField2.getType().getTypeID()); + } + } + + @Test + public void testNoPromoteToUnionWithNull() throws Exception { + + try (final NonNullableStructVector container = NonNullableStructVector.empty(EMPTY_SCHEMA_PATH, allocator); + final StructVector v = container.addOrGetStruct("test"); + final PromotableWriter writer = new PromotableWriter(v, container)) { + + container.allocateNew(); + + writer.start(); + writer.list("list").startList(); + writer.list("list").endList(); + writer.end(); + + FieldType childTypeOfListInContainer = container.getField().getChildren().get(0).getChildren().get(0) + .getChildren().get(0).getFieldType(); + + + // create a listvector with same type as list in container to, say, hold a copy + // this will be a nullvector + ListVector lv = ListVector.empty("name", allocator); + lv.addOrGetVector(childTypeOfListInContainer); + assertEquals(childTypeOfListInContainer.getType(), Types.MinorType.NULL.getType()); + assertEquals(lv.getChildrenFromFields().get(0).getMinorType().getType(), Types.MinorType.NULL.getType()); + + writer.start(); + writer.list("list").startList(); + writer.list("list").float4().writeFloat4(1.36f); + writer.list("list").endList(); + writer.end(); + + container.setValueCount(2); + + childTypeOfListInContainer = container.getField().getChildren().get(0).getChildren().get(0) + .getChildren().get(0).getFieldType(); + + // repeat but now the type in container has been changed from null to float + // we expect same behaviour from listvector + lv.addOrGetVector(childTypeOfListInContainer); + assertEquals(childTypeOfListInContainer.getType(), Types.MinorType.FLOAT4.getType()); + assertEquals(lv.getChildrenFromFields().get(0).getMinorType().getType(), Types.MinorType.FLOAT4.getType()); + + lv.close(); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java 
b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java new file mode 100644 index 000000000..d4cf6ea89 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -0,0 +1,1335 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.complex.writer; + +import static org.junit.Assert.*; + +import java.math.BigDecimal; +import java.time.LocalDateTime; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.util.AutoCloseables; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.SchemaChangeCallBack; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; +import org.apache.arrow.vector.complex.impl.SingleStructReaderImpl; +import org.apache.arrow.vector.complex.impl.SingleStructWriter; +import org.apache.arrow.vector.complex.impl.UnionListReader; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.complex.impl.UnionMapReader; +import org.apache.arrow.vector.complex.impl.UnionReader; +import org.apache.arrow.vector.complex.impl.UnionWriter; +import org.apache.arrow.vector.complex.reader.BaseReader.StructReader; +import org.apache.arrow.vector.complex.reader.BigIntReader; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.reader.Float4Reader; +import org.apache.arrow.vector.complex.reader.Float8Reader; +import org.apache.arrow.vector.complex.reader.IntReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import 
org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter; +import org.apache.arrow.vector.holders.DecimalHolder; +import org.apache.arrow.vector.holders.IntHolder; +import org.apache.arrow.vector.holders.NullableTimeStampNanoTZHolder; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID; +import org.apache.arrow.vector.types.pojo.ArrowType.Int; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct; +import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; +import org.apache.arrow.vector.types.pojo.ArrowType.Union; +import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.DecimalUtility; +import org.apache.arrow.vector.util.JsonStringArrayList; +import org.apache.arrow.vector.util.JsonStringHashMap; +import org.apache.arrow.vector.util.Text; +import org.apache.arrow.vector.util.TransferPair; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestComplexWriter { + + private BufferAllocator allocator; + + private static final int COUNT = 100; + + @Before + public void init() { + allocator = new RootAllocator(Integer.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void simpleNestedTypes() { + NonNullableStructVector parent = populateStructVector(null); + StructReader rootReader = new SingleStructReaderImpl(parent).reader("root"); + for (int i = 0; i < COUNT; i++) { + rootReader.setPosition(i); + Assert.assertEquals(i, rootReader.reader("int").readInteger().intValue()); + Assert.assertEquals(i, rootReader.reader("bigInt").readLong().longValue()); + } + + parent.close(); + } + + @Test + public void 
transferPairSchemaChange() { + SchemaChangeCallBack callBack1 = new SchemaChangeCallBack(); + SchemaChangeCallBack callBack2 = new SchemaChangeCallBack(); + try (NonNullableStructVector parent = populateStructVector(callBack1)) { + TransferPair tp = parent.getTransferPair("newVector", allocator, callBack2); + + ComplexWriter writer = new ComplexWriterImpl("newWriter", parent); + StructWriter rootWriter = writer.rootAsStruct(); + IntWriter intWriter = rootWriter.integer("newInt"); + intWriter.writeInt(1); + writer.setValueCount(1); + + assertTrue(callBack1.getSchemaChangedAndReset()); + // The second vector should not have registered a schema change + assertFalse(callBack1.getSchemaChangedAndReset()); + } + } + + private NonNullableStructVector populateStructVector(CallBack callBack) { + NonNullableStructVector parent = + new NonNullableStructVector("parent", allocator, new FieldType(false, Struct.INSTANCE, null, null), callBack); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + IntWriter intWriter = rootWriter.integer("int"); + BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); + for (int i = 0; i < COUNT; i++) { + rootWriter.start(); + intWriter.writeInt(i); + bigIntWriter.writeBigInt(i); + rootWriter.end(); + } + writer.setValueCount(COUNT); + return parent; + } + + @Test + public void nullableStruct() { + try (NonNullableStructVector structVector = NonNullableStructVector.empty("parent", allocator)) { + ComplexWriter writer = new ComplexWriterImpl("root", structVector); + StructWriter rootWriter = writer.rootAsStruct(); + for (int i = 0; i < COUNT; i++) { + rootWriter.start(); + if (i % 2 == 0) { + StructWriter structWriter = rootWriter.struct("struct"); + structWriter.setPosition(i); + structWriter.start(); + structWriter.bigInt("nested").writeBigInt(i); + structWriter.end(); + } + rootWriter.end(); + } + writer.setValueCount(COUNT); + checkNullableStruct(structVector); + } + } + + /** + 
* This test is similar to {@link #nullableStruct()} but we get the inner struct writer once at the beginning. + */ + @Test + public void nullableStruct2() { + try (NonNullableStructVector structVector = NonNullableStructVector.empty("parent", allocator)) { + ComplexWriter writer = new ComplexWriterImpl("root", structVector); + StructWriter rootWriter = writer.rootAsStruct(); + StructWriter structWriter = rootWriter.struct("struct"); + + for (int i = 0; i < COUNT; i++) { + rootWriter.start(); + if (i % 2 == 0) { + structWriter.setPosition(i); + structWriter.start(); + structWriter.bigInt("nested").writeBigInt(i); + structWriter.end(); + } + rootWriter.end(); + } + writer.setValueCount(COUNT); + checkNullableStruct(structVector); + } + } + + private void checkNullableStruct(NonNullableStructVector structVector) { + StructReader rootReader = new SingleStructReaderImpl(structVector).reader("root"); + for (int i = 0; i < COUNT; i++) { + rootReader.setPosition(i); + assertTrue("index is set: " + i, rootReader.isSet()); + FieldReader struct = rootReader.reader("struct"); + if (i % 2 == 0) { + assertTrue("index is set: " + i, struct.isSet()); + assertNotNull("index is set: " + i, struct.readObject()); + assertEquals(i, struct.reader("nested").readLong().longValue()); + } else { + assertFalse("index is not set: " + i, struct.isSet()); + assertNull("index is not set: " + i, struct.readObject()); + } + } + } + + @Test + public void testList() { + try (NonNullableStructVector parent = NonNullableStructVector.empty("parent", allocator)) { + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + + rootWriter.start(); + rootWriter.bigInt("int").writeBigInt(0); + rootWriter.list("list").startList(); + rootWriter.list("list").bigInt().writeBigInt(0); + rootWriter.list("list").endList(); + rootWriter.end(); + + rootWriter.start(); + rootWriter.bigInt("int").writeBigInt(1); + rootWriter.end(); + 
writer.setValueCount(2); + + StructReader rootReader = new SingleStructReaderImpl(parent).reader("root"); + + rootReader.setPosition(0); + assertTrue("row 0 list is not set", rootReader.reader("list").isSet()); + assertEquals(Long.valueOf(0), rootReader.reader("list").reader().readLong()); + + rootReader.setPosition(1); + assertFalse("row 1 list is set", rootReader.reader("list").isSet()); + } + } + + @Test + public void listScalarType() { + try (ListVector listVector = ListVector.empty("list", allocator)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + for (int i = 0; i < COUNT; i++) { + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + if (j % 2 == 0) { + listWriter.writeInt(j); + } else { + IntHolder holder = new IntHolder(); + holder.value = j; + listWriter.write(holder); + } + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + UnionListReader listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + listReader.next(); + assertEquals(j, listReader.reader().readInteger().intValue()); + } + } + } + } + + @Test + public void testListScalarNull() { + /* Write to an integer list vector + * each list of size i % 7 and having its data values alternating between null and non-null.
+ * Read and verify + */ + try (ListVector listVector = ListVector.empty("list", allocator)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + for (int i = 0; i < COUNT; i++) { + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + if (j % 2 == 0) { + listWriter.writeNull(); + } else { + IntHolder holder = new IntHolder(); + holder.value = j; + listWriter.write(holder); + } + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + UnionListReader listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + listReader.next(); + if (j % 2 == 0) { + assertFalse("index is set: " + j, listReader.reader().isSet()); + } else { + assertTrue("index is not set: " + j, listReader.reader().isSet()); + assertEquals(j, listReader.reader().readInteger().intValue()); + } + } + } + } + } + + @Test + public void listDecimalType() { + try (ListVector listVector = ListVector.empty("list", allocator)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + DecimalHolder holder = new DecimalHolder(); + holder.buffer = allocator.buffer(DecimalVector.TYPE_WIDTH); + ArrowType arrowType = new ArrowType.Decimal(10, 0, 128); + for (int i = 0; i < COUNT; i++) { + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + if (j % 4 == 0) { + listWriter.writeDecimal(new BigDecimal(j)); + } else if (j % 4 == 1) { + DecimalUtility.writeBigDecimalToArrowBuf(new BigDecimal(j), holder.buffer, 0, DecimalVector.TYPE_WIDTH); + holder.start = 0; + holder.scale = 0; + holder.precision = 10; + listWriter.write(holder); + } else if (j % 4 == 2) { + DecimalUtility.writeBigDecimalToArrowBuf(new BigDecimal(j), holder.buffer, 0, DecimalVector.TYPE_WIDTH); + listWriter.writeDecimal(0, holder.buffer, arrowType); + } else { + byte[] value = BigDecimal.valueOf(j).unscaledValue().toByteArray(); + 
listWriter.writeBigEndianBytesToDecimal(value, arrowType); + } + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + UnionListReader listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + listReader.next(); + Object expected = new BigDecimal(j); + Object actual = listReader.reader().readBigDecimal(); + assertEquals(expected, actual); + } + } + holder.buffer.close(); + } + } + + @Test + public void listScalarTypeNullable() { + try (ListVector listVector = ListVector.empty("list", allocator)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + for (int i = 0; i < COUNT; i++) { + if (i % 2 == 0) { + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + listWriter.writeInt(j); + } + listWriter.endList(); + } + } + listWriter.setValueCount(COUNT); + UnionListReader listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + if (i % 2 == 0) { + assertTrue("index is set: " + i, listReader.isSet()); + assertEquals("correct length at: " + i, i % 7, ((List) listReader.readObject()).size()); + } else { + assertFalse("index is not set: " + i, listReader.isSet()); + assertNull("index is not set: " + i, listReader.readObject()); + } + } + } + } + + @Test + public void listStructType() { + try (ListVector listVector = ListVector.empty("list", allocator)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + StructWriter structWriter = listWriter.struct(); + for (int i = 0; i < COUNT; i++) { + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + structWriter.start(); + structWriter.integer("int").writeInt(j); + structWriter.bigInt("bigInt").writeBigInt(j); + structWriter.end(); + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + UnionListReader listReader = new 
UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + listReader.next(); + Assert.assertEquals("record: " + i, j, listReader.reader().reader("int").readInteger().intValue()); + Assert.assertEquals(j, listReader.reader().reader("bigInt").readLong().longValue()); + } + } + } + } + + @Test + public void listListType() { + try (ListVector listVector = ListVector.empty("list", allocator)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + for (int i = 0; i < COUNT; i++) { + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + ListWriter innerListWriter = listWriter.list(); + innerListWriter.startList(); + for (int k = 0; k < i % 13; k++) { + innerListWriter.integer().writeInt(k); + } + innerListWriter.endList(); + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + checkListOfLists(listVector); + } + } + + /** + * This test is similar to {@link #listListType()} but we get the inner list writer once at the beginning. 
+ */ + @Test + public void listListType2() { + try (ListVector listVector = ListVector.empty("list", allocator)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + ListWriter innerListWriter = listWriter.list(); + + for (int i = 0; i < COUNT; i++) { + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + innerListWriter.startList(); + for (int k = 0; k < i % 13; k++) { + innerListWriter.integer().writeInt(k); + } + innerListWriter.endList(); + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + checkListOfLists(listVector); + } + } + + private void checkListOfLists(final ListVector listVector) { + UnionListReader listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + listReader.next(); + FieldReader innerListReader = listReader.reader(); + for (int k = 0; k < i % 13; k++) { + innerListReader.next(); + Assert.assertEquals("record: " + i, k, innerListReader.reader().readInteger().intValue()); + } + } + } + } + + @Test + public void unionListListType() { + try (ListVector listVector = ListVector.empty("list", allocator)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + for (int i = 0; i < COUNT; i++) { + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + ListWriter innerListWriter = listWriter.list(); + innerListWriter.startList(); + for (int k = 0; k < i % 13; k++) { + if (k % 2 == 0) { + innerListWriter.integer().writeInt(k); + } else { + innerListWriter.bigInt().writeBigInt(k); + } + } + innerListWriter.endList(); + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + checkUnionList(listVector); + } + } + + /** + * This test is similar to {@link #unionListListType()} but we get the inner list writer once at the beginning. 
+ */ + @Test + public void unionListListType2() { + try (ListVector listVector = ListVector.empty("list", allocator)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + ListWriter innerListWriter = listWriter.list(); + + for (int i = 0; i < COUNT; i++) { + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + innerListWriter.startList(); + for (int k = 0; k < i % 13; k++) { + if (k % 2 == 0) { + innerListWriter.integer().writeInt(k); + } else { + innerListWriter.bigInt().writeBigInt(k); + } + } + innerListWriter.endList(); + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + checkUnionList(listVector); + } + } + + private void checkUnionList(ListVector listVector) { + UnionListReader listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + listReader.next(); + FieldReader innerListReader = listReader.reader(); + for (int k = 0; k < i % 13; k++) { + innerListReader.next(); + if (k % 2 == 0) { + Assert.assertEquals("record: " + i, k, innerListReader.reader().readInteger().intValue()); + } else { + Assert.assertEquals("record: " + i, k, innerListReader.reader().readLong().longValue()); + } + } + } + } + } + + @Test + public void testListMapType() { + try (ListVector listVector = ListVector.empty("list", allocator)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + MapWriter innerMapWriter = listWriter.map(true); + + for (int i = 0; i < COUNT; i++) { + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + innerMapWriter.startMap(); + for (int k = 0; k < i % 13; k++) { + innerMapWriter.startEntry(); + innerMapWriter.key().integer().writeInt(k); + if (k % 2 == 0) { + innerMapWriter.value().bigInt().writeBigInt(k); + } + innerMapWriter.endEntry(); + } + innerMapWriter.endMap(); + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + 
checkListMap(listVector); + + // Verify that the map vector has keysSorted = true + MapVector mapVector = (MapVector) listVector.getDataVector(); + ArrowType arrowType = mapVector.getField().getFieldType().getType(); + assertTrue(((ArrowType.Map) arrowType).getKeysSorted()); + } + } + + private void checkListMap(ListVector listVector) { + UnionListReader listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + listReader.next(); + UnionMapReader mapReader = (UnionMapReader) listReader.reader(); + for (int k = 0; k < i % 13; k++) { + mapReader.next(); + Assert.assertEquals("record key: " + i, k, mapReader.key().readInteger().intValue()); + if (k % 2 == 0) { + Assert.assertEquals("record value: " + i, k, mapReader.value().readLong().longValue()); + } else { + Assert.assertNull("record value: " + i, mapReader.value().readLong()); + } + } + } + } + } + + @Test + public void simpleUnion() { + try (UnionVector vector = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null)) { + UnionWriter unionWriter = new UnionWriter(vector); + unionWriter.allocate(); + for (int i = 0; i < COUNT; i++) { + unionWriter.setPosition(i); + if (i % 2 == 0) { + unionWriter.writeInt(i); + } else { + unionWriter.writeFloat4((float) i); + } + } + vector.setValueCount(COUNT); + UnionReader unionReader = new UnionReader(vector); + for (int i = 0; i < COUNT; i++) { + unionReader.setPosition(i); + if (i % 2 == 0) { + Assert.assertEquals(i, unionReader.readInteger().intValue()); // value read back, not i vs. i + } else { + Assert.assertEquals((float) i, unionReader.readFloat(), 1e-12); + } + } + } + } + + @Test + public void promotableWriter() { + try (NonNullableStructVector parent = NonNullableStructVector.empty("parent", allocator)) { + + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + for (int i = 0; i < 100; i++) { + BigIntWriter bigIntWriter =
rootWriter.bigInt("a"); + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + } + Field field = parent.getField().getChildren().get(0).getChildren().get(0); + Assert.assertEquals("a", field.getName()); + Assert.assertEquals(Int.TYPE_TYPE, field.getType().getTypeID()); + Int intType = (Int) field.getType(); + + Assert.assertEquals(64, intType.getBitWidth()); + Assert.assertTrue(intType.getIsSigned()); + for (int i = 100; i < 200; i++) { + VarCharWriter varCharWriter = rootWriter.varChar("a"); + varCharWriter.setPosition(i); + byte[] bytes = Integer.toString(i).getBytes(); + ArrowBuf tempBuf = allocator.buffer(bytes.length); + tempBuf.setBytes(0, bytes); + varCharWriter.writeVarChar(0, bytes.length, tempBuf); + tempBuf.close(); + } + field = parent.getField().getChildren().get(0).getChildren().get(0); + Assert.assertEquals("a", field.getName()); + Assert.assertEquals(Union.TYPE_TYPE, field.getType().getTypeID()); + Assert.assertEquals(Int.TYPE_TYPE, field.getChildren().get(0).getType().getTypeID()); + Assert.assertEquals(Utf8.TYPE_TYPE, field.getChildren().get(1).getType().getTypeID()); + StructReader rootReader = new SingleStructReaderImpl(parent).reader("root"); + for (int i = 0; i < 100; i++) { + rootReader.setPosition(i); + FieldReader reader = rootReader.reader("a"); + Long value = reader.readLong(); + Assert.assertNotNull("index: " + i, value); + Assert.assertEquals(i, value.intValue()); + } + for (int i = 100; i < 200; i++) { + rootReader.setPosition(i); + FieldReader reader = rootReader.reader("a"); + Text value = reader.readText(); + Assert.assertEquals(Integer.toString(i), value.toString()); + } + } + } + + /** + * Even without writing to the writer, the union schema is created correctly. 
+ */ + @Test + public void promotableWriterSchema() { + try (NonNullableStructVector parent = NonNullableStructVector.empty("parent", allocator)) { + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + rootWriter.bigInt("a"); + rootWriter.varChar("a"); + + Field field = parent.getField().getChildren().get(0).getChildren().get(0); + Assert.assertEquals("a", field.getName()); + Assert.assertEquals(ArrowTypeID.Union, field.getType().getTypeID()); + + Assert.assertEquals(ArrowTypeID.Int, field.getChildren().get(0).getType().getTypeID()); + Int intType = (Int) field.getChildren().get(0).getType(); + Assert.assertEquals(64, intType.getBitWidth()); + Assert.assertTrue(intType.getIsSigned()); + Assert.assertEquals(ArrowTypeID.Utf8, field.getChildren().get(1).getType().getTypeID()); + } + } + + private Set getFieldNames(List fields) { + Set fieldNames = new HashSet<>(); + for (Field field : fields) { + fieldNames.add(field.getName()); + if (!field.getChildren().isEmpty()) { + for (String name : getFieldNames(field.getChildren())) { + fieldNames.add(field.getName() + "::" + name); + } + } + } + return fieldNames; + } + + @Test + public void structWriterMixedCaseFieldNames() { + try (NonNullableStructVector parent = NonNullableStructVector.empty("parent", allocator)) { + // test case-sensitive StructWriter + ComplexWriter writer = new ComplexWriterImpl("rootCaseSensitive", parent, false, true); + StructWriter rootWriterCaseSensitive = writer.rootAsStruct(); + rootWriterCaseSensitive.bigInt("int_field"); + rootWriterCaseSensitive.bigInt("Int_Field"); + rootWriterCaseSensitive.float4("float_field"); + rootWriterCaseSensitive.float4("Float_Field"); + StructWriter structFieldWriterCaseSensitive = rootWriterCaseSensitive.struct("struct_field"); + structFieldWriterCaseSensitive.varChar("char_field"); + structFieldWriterCaseSensitive.varChar("Char_Field"); + ListWriter listFieldWriterCaseSensitive = 
rootWriterCaseSensitive.list("list_field"); + StructWriter listStructFieldWriterCaseSensitive = listFieldWriterCaseSensitive.struct(); + listStructFieldWriterCaseSensitive.bit("bit_field"); + listStructFieldWriterCaseSensitive.bit("Bit_Field"); + + List fieldsCaseSensitive = parent.getField().getChildren().get(0).getChildren(); + Set fieldNamesCaseSensitive = getFieldNames(fieldsCaseSensitive); + Assert.assertEquals(11, fieldNamesCaseSensitive.size()); + Assert.assertTrue(fieldNamesCaseSensitive.contains("int_field")); + Assert.assertTrue(fieldNamesCaseSensitive.contains("Int_Field")); + Assert.assertTrue(fieldNamesCaseSensitive.contains("float_field")); + Assert.assertTrue(fieldNamesCaseSensitive.contains("Float_Field")); + Assert.assertTrue(fieldNamesCaseSensitive.contains("struct_field")); + Assert.assertTrue(fieldNamesCaseSensitive.contains("struct_field::char_field")); + Assert.assertTrue(fieldNamesCaseSensitive.contains("struct_field::Char_Field")); + Assert.assertTrue(fieldNamesCaseSensitive.contains("list_field")); + Assert.assertTrue(fieldNamesCaseSensitive.contains("list_field::$data$")); + Assert.assertTrue(fieldNamesCaseSensitive.contains("list_field::$data$::bit_field")); + Assert.assertTrue(fieldNamesCaseSensitive.contains("list_field::$data$::Bit_Field")); + + // test case-insensitive StructWriter + ComplexWriter writerCaseInsensitive = new ComplexWriterImpl("rootCaseInsensitive", parent, false, false); + StructWriter rootWriterCaseInsensitive = writerCaseInsensitive.rootAsStruct(); + + rootWriterCaseInsensitive.bigInt("int_field"); + rootWriterCaseInsensitive.bigInt("Int_Field"); + rootWriterCaseInsensitive.float4("float_field"); + rootWriterCaseInsensitive.float4("Float_Field"); + StructWriter structFieldWriterCaseInsensitive = rootWriterCaseInsensitive.struct("struct_field"); + structFieldWriterCaseInsensitive.varChar("char_field"); + structFieldWriterCaseInsensitive.varChar("Char_Field"); + ListWriter listFieldWriterCaseInsensitive = 
rootWriterCaseInsensitive.list("list_field"); + StructWriter listStructFieldWriterCaseInsensitive = listFieldWriterCaseInsensitive.struct(); + listStructFieldWriterCaseInsensitive.bit("bit_field"); + listStructFieldWriterCaseInsensitive.bit("Bit_Field"); + + List fieldsCaseInsensitive = parent.getField().getChildren().get(1).getChildren(); + Set fieldNamesCaseInsensitive = getFieldNames(fieldsCaseInsensitive); + Assert.assertEquals(7, fieldNamesCaseInsensitive.size()); + Assert.assertTrue(fieldNamesCaseInsensitive.contains("int_field")); + Assert.assertTrue(fieldNamesCaseInsensitive.contains("float_field")); + Assert.assertTrue(fieldNamesCaseInsensitive.contains("struct_field")); + Assert.assertTrue(fieldNamesCaseInsensitive.contains("struct_field::char_field")); + Assert.assertTrue(fieldNamesCaseInsensitive.contains("list_field")); + Assert.assertTrue(fieldNamesCaseInsensitive.contains("list_field::$data$")); + Assert.assertTrue(fieldNamesCaseInsensitive.contains("list_field::$data$::bit_field")); + } + } + + @Test + public void timeStampSecWriter() throws Exception { + // test values + final long expectedSecs = 981173106L; + final LocalDateTime expectedSecDateTime = LocalDateTime.of(2001, 2, 3, 4, 5, 6, 0); + + try (NonNullableStructVector parent = NonNullableStructVector.empty("parent", allocator)) { + // write + + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + + { + TimeStampSecWriter timeStampSecWriter = rootWriter.timeStampSec("sec"); + timeStampSecWriter.setPosition(0); + timeStampSecWriter.writeTimeStampSec(expectedSecs); + } + { + TimeStampSecTZWriter timeStampSecTZWriter = rootWriter.timeStampSecTZ("secTZ", "UTC"); + timeStampSecTZWriter.setPosition(1); + timeStampSecTZWriter.writeTimeStampSecTZ(expectedSecs); + } + // schema + List children = parent.getField().getChildren().get(0).getChildren(); + checkTimestampField(children.get(0), "sec"); + checkTimestampTZField(children.get(1), "secTZ",
"UTC"); + + // read + StructReader rootReader = new SingleStructReaderImpl(parent).reader("root"); + { + FieldReader secReader = rootReader.reader("sec"); + secReader.setPosition(0); + LocalDateTime secDateTime = secReader.readLocalDateTime(); + Assert.assertEquals(expectedSecDateTime, secDateTime); + long secLong = secReader.readLong(); + Assert.assertEquals(expectedSecs, secLong); + } + { + FieldReader secTZReader = rootReader.reader("secTZ"); + secTZReader.setPosition(1); + long secTZLong = secTZReader.readLong(); + Assert.assertEquals(expectedSecs, secTZLong); + } + } + } + + @Test + public void timeStampMilliWriters() throws Exception { + // test values + final long expectedMillis = 981173106123L; + final LocalDateTime expectedMilliDateTime = LocalDateTime.of(2001, 2, 3, 4, 5, 6, 123 * 1_000_000); + + try (NonNullableStructVector parent = NonNullableStructVector.empty("parent", allocator);) { + // write + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + { + TimeStampMilliWriter timeStampWriter = rootWriter.timeStampMilli("milli"); + timeStampWriter.setPosition(0); + timeStampWriter.writeTimeStampMilli(expectedMillis); + } + String tz = "UTC"; + { + TimeStampMilliTZWriter timeStampTZWriter = rootWriter.timeStampMilliTZ("milliTZ", tz); + timeStampTZWriter.setPosition(0); + timeStampTZWriter.writeTimeStampMilliTZ(expectedMillis); + } + // schema + List children = parent.getField().getChildren().get(0).getChildren(); + checkTimestampField(children.get(0), "milli"); + checkTimestampTZField(children.get(1), "milliTZ", tz); + + // read + StructReader rootReader = new SingleStructReaderImpl(parent).reader("root"); + + { + FieldReader milliReader = rootReader.reader("milli"); + milliReader.setPosition(0); + LocalDateTime milliDateTime = milliReader.readLocalDateTime(); + Assert.assertEquals(expectedMilliDateTime, milliDateTime); + long milliLong = milliReader.readLong(); + 
Assert.assertEquals(expectedMillis, milliLong); + } + { + FieldReader milliTZReader = rootReader.reader("milliTZ"); + milliTZReader.setPosition(0); + long milliTZLong = milliTZReader.readLong(); + Assert.assertEquals(expectedMillis, milliTZLong); + } + } + } + + private void checkTimestampField(Field field, String name) { + Assert.assertEquals(name, field.getName()); + Assert.assertEquals(ArrowType.Timestamp.TYPE_TYPE, field.getType().getTypeID()); + } + + private void checkTimestampTZField(Field field, String name, String tz) { + checkTimestampField(field, name); + Assert.assertEquals(tz, ((Timestamp) field.getType()).getTimezone()); + } + + @Test + public void timeStampMicroWriters() throws Exception { + // test values + final long expectedMicros = 981173106123456L; + final LocalDateTime expectedMicroDateTime = LocalDateTime.of(2001, 2, 3, 4, 5, 6, 123456 * 1000); + + try (NonNullableStructVector parent = NonNullableStructVector.empty("parent", allocator)) { + // write + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + + { + TimeStampMicroWriter timeStampMicroWriter = rootWriter.timeStampMicro("micro"); + timeStampMicroWriter.setPosition(0); + timeStampMicroWriter.writeTimeStampMicro(expectedMicros); + } + String tz = "UTC"; + { + TimeStampMicroTZWriter timeStampMicroWriter = rootWriter.timeStampMicroTZ("microTZ", tz); + timeStampMicroWriter.setPosition(1); + timeStampMicroWriter.writeTimeStampMicroTZ(expectedMicros); + } + + // schema + List children = parent.getField().getChildren().get(0).getChildren(); + checkTimestampField(children.get(0), "micro"); + checkTimestampTZField(children.get(1), "microTZ", tz); + + // read + StructReader rootReader = new SingleStructReaderImpl(parent).reader("root"); + { + FieldReader microReader = rootReader.reader("micro"); + microReader.setPosition(0); + LocalDateTime microDateTime = microReader.readLocalDateTime(); + Assert.assertEquals(expectedMicroDateTime, 
microDateTime); + long microLong = microReader.readLong(); + Assert.assertEquals(expectedMicros, microLong); + } + { + FieldReader microReader = rootReader.reader("microTZ"); + microReader.setPosition(1); + long microLong = microReader.readLong(); + Assert.assertEquals(expectedMicros, microLong); + } + } + } + + @Test + public void timeStampNanoWriters() throws Exception { + // test values + final long expectedNanos = 981173106123456789L; + final LocalDateTime expectedNanoDateTime = LocalDateTime.of(2001, 2, 3, 4, 5, 6, 123456789); + + try (NonNullableStructVector parent = NonNullableStructVector.empty("parent", allocator)) { + // write + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + + { + TimeStampNanoWriter timeStampNanoWriter = rootWriter.timeStampNano("nano"); + timeStampNanoWriter.setPosition(0); + timeStampNanoWriter.writeTimeStampNano(expectedNanos); + } + String tz = "UTC"; + { + TimeStampNanoTZWriter timeStampNanoWriter = rootWriter.timeStampNanoTZ("nanoTZ", tz); + timeStampNanoWriter.setPosition(0); + timeStampNanoWriter.writeTimeStampNanoTZ(expectedNanos); + } + // schema + List children = parent.getField().getChildren().get(0).getChildren(); + checkTimestampField(children.get(0), "nano"); + checkTimestampTZField(children.get(1), "nanoTZ", tz); + // read + StructReader rootReader = new SingleStructReaderImpl(parent).reader("root"); + + { + FieldReader nanoReader = rootReader.reader("nano"); + nanoReader.setPosition(0); + LocalDateTime nanoDateTime = nanoReader.readLocalDateTime(); + Assert.assertEquals(expectedNanoDateTime, nanoDateTime); + long nanoLong = nanoReader.readLong(); + Assert.assertEquals(expectedNanos, nanoLong); + } + { + FieldReader nanoReader = rootReader.reader("nanoTZ"); + nanoReader.setPosition(0); + long nanoLong = nanoReader.readLong(); + Assert.assertEquals(expectedNanos, nanoLong); + NullableTimeStampNanoTZHolder h = new NullableTimeStampNanoTZHolder(); + 
nanoReader.read(h); + Assert.assertEquals(expectedNanos, h.value); + } + } + + } + + @Test + public void fixedSizeBinaryWriters() throws Exception { + // test values + int numValues = 10; + int byteWidth = 9; + byte[][] values = new byte[numValues][byteWidth]; + for (int i = 0; i < numValues; i++) { + for (int j = 0; j < byteWidth; j++) { + values[i][j] = ((byte) i); + } + } + ArrowBuf[] bufs = new ArrowBuf[numValues]; + for (int i = 0; i < numValues; i++) { + bufs[i] = allocator.buffer(byteWidth); + bufs[i].setBytes(0, values[i]); + } + + try (NonNullableStructVector parent = NonNullableStructVector.empty("parent", allocator)) { + // write + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + + String fieldName = "fixedSizeBinary"; + FixedSizeBinaryWriter fixedSizeBinaryWriter = rootWriter.fixedSizeBinary(fieldName, byteWidth); + for (int i = 0; i < numValues; i++) { + fixedSizeBinaryWriter.setPosition(i); + fixedSizeBinaryWriter.writeFixedSizeBinary(bufs[i]); + } + + // schema + List children = parent.getField().getChildren().get(0).getChildren(); + Assert.assertEquals(fieldName, children.get(0).getName()); + Assert.assertEquals(ArrowType.FixedSizeBinary.TYPE_TYPE, children.get(0).getType().getTypeID()); + + // read + StructReader rootReader = new SingleStructReaderImpl(parent).reader("root"); + + FieldReader fixedSizeBinaryReader = rootReader.reader(fieldName); + for (int i = 0; i < numValues; i++) { + fixedSizeBinaryReader.setPosition(i); + byte[] readValues = fixedSizeBinaryReader.readByteArray(); + Assert.assertArrayEquals(values[i], readValues); + } + } + + AutoCloseables.close(bufs); + } + + @Test + public void complexCopierWithList() { + try (NonNullableStructVector parent = NonNullableStructVector.empty("parent", allocator)) { + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + ListWriter listWriter = rootWriter.list("list"); + 
StructWriter innerStructWriter = listWriter.struct(); + IntWriter outerIntWriter = listWriter.integer(); + rootWriter.start(); + listWriter.startList(); + outerIntWriter.writeInt(1); + outerIntWriter.writeInt(2); + innerStructWriter.start(); + IntWriter intWriter = innerStructWriter.integer("a"); + intWriter.writeInt(1); + innerStructWriter.end(); + innerStructWriter.start(); + intWriter = innerStructWriter.integer("a"); + intWriter.writeInt(2); + innerStructWriter.end(); + listWriter.endList(); + rootWriter.end(); + writer.setValueCount(1); + + StructVector structVector = (StructVector) parent.getChild("root"); + TransferPair tp = structVector.getTransferPair(allocator); + tp.splitAndTransfer(0, 1); + NonNullableStructVector toStructVector = (NonNullableStructVector) tp.getTo(); + JsonStringHashMap toMapValue = (JsonStringHashMap) toStructVector.getObject(0); + JsonStringArrayList object = (JsonStringArrayList) toMapValue.get("list"); + assertEquals(1, object.get(0)); + assertEquals(2, object.get(1)); + JsonStringHashMap innerStruct = (JsonStringHashMap) object.get(2); + assertEquals(1, innerStruct.get("a")); + innerStruct = (JsonStringHashMap) object.get(3); + assertEquals(2, innerStruct.get("a")); + toStructVector.close(); + } + } + + @Test + public void testSingleStructWriter1() { + /* initialize a SingleStructWriter with empty StructVector and then lazily + * create all vectors with expected initialCapacity. 
+ */ + try (NonNullableStructVector parent = NonNullableStructVector.empty("parent", allocator)) { + SingleStructWriter singleStructWriter = new SingleStructWriter(parent); + + int initialCapacity = 1024; + singleStructWriter.setInitialCapacity(initialCapacity); + + IntWriter intWriter = singleStructWriter.integer("intField"); + BigIntWriter bigIntWriter = singleStructWriter.bigInt("bigIntField"); + Float4Writer float4Writer = singleStructWriter.float4("float4Field"); + Float8Writer float8Writer = singleStructWriter.float8("float8Field"); + ListWriter listWriter = singleStructWriter.list("listField"); + MapWriter mapWriter = singleStructWriter.map("mapField", false); + + int intValue = 100; + long bigIntValue = 10000; + float float4Value = 100.5f; + double float8Value = 100.375; + + for (int i = 0; i < initialCapacity; i++) { + singleStructWriter.start(); + + intWriter.writeInt(intValue + i); + bigIntWriter.writeBigInt(bigIntValue + (long) i); + float4Writer.writeFloat4(float4Value + (float) i); + float8Writer.writeFloat8(float8Value + (double) i); + + listWriter.setPosition(i); + listWriter.startList(); + listWriter.integer().writeInt(intValue + i); + listWriter.integer().writeInt(intValue + i + 1); + listWriter.integer().writeInt(intValue + i + 2); + listWriter.integer().writeInt(intValue + i + 3); + listWriter.endList(); + + mapWriter.setPosition(i); + mapWriter.startMap(); + mapWriter.startEntry(); + mapWriter.key().integer().writeInt(intValue + i); + mapWriter.value().integer().writeInt(intValue + i + 1); + mapWriter.endEntry(); + mapWriter.startEntry(); + mapWriter.key().integer().writeInt(intValue + i + 2); + mapWriter.value().integer().writeInt(intValue + i + 3); + mapWriter.endEntry(); + mapWriter.endMap(); + + singleStructWriter.end(); + } + + IntVector intVector = (IntVector) parent.getChild("intField"); + BigIntVector bigIntVector = (BigIntVector) parent.getChild("bigIntField"); + Float4Vector float4Vector = (Float4Vector) 
parent.getChild("float4Field"); + Float8Vector float8Vector = (Float8Vector) parent.getChild("float8Field"); + + int capacity = singleStructWriter.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 2); + capacity = intVector.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 2); + capacity = bigIntVector.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 2); + capacity = float4Vector.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 2); + capacity = float8Vector.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 2); + + StructReader singleStructReader = new SingleStructReaderImpl(parent); + + IntReader intReader = singleStructReader.reader("intField"); + BigIntReader bigIntReader = singleStructReader.reader("bigIntField"); + Float4Reader float4Reader = singleStructReader.reader("float4Field"); + Float8Reader float8Reader = singleStructReader.reader("float8Field"); + UnionListReader listReader = (UnionListReader) singleStructReader.reader("listField"); + UnionMapReader mapReader = (UnionMapReader) singleStructReader.reader("mapField"); + + for (int i = 0; i < initialCapacity; i++) { + intReader.setPosition(i); + bigIntReader.setPosition(i); + float4Reader.setPosition(i); + float8Reader.setPosition(i); + listReader.setPosition(i); + mapReader.setPosition(i); + + assertEquals(intValue + i, intReader.readInteger().intValue()); + assertEquals(bigIntValue + (long) i, bigIntReader.readLong().longValue()); + assertEquals(float4Value + (float) i, float4Reader.readFloat().floatValue(), 0); + assertEquals(float8Value + (double) i, float8Reader.readDouble().doubleValue(), 0); + + for (int j = 0; j < 4; j++) { + listReader.next(); + assertEquals(intValue + i + j, listReader.reader().readInteger().intValue()); + } + + for (int k = 0; k < 4; k += 2) { + mapReader.next(); + 
assertEquals(intValue + k + i, mapReader.key().readInteger().intValue()); + assertEquals(intValue + k + i + 1, mapReader.value().readInteger().intValue()); + } + } + } + + + } + + @Test + public void testListWriterWithNulls() { + try (ListVector listVector = ListVector.empty("list", allocator)) { + listVector.setInitialCapacity(COUNT); + listVector.allocateNew(); + listVector.getValidityBuffer().setOne(0, (int) listVector.getValidityBuffer().capacity()); + + UnionListWriter listWriter = listVector.getWriter(); + + // expected listVector : [[null], null, [2, 4], null, [null], null, [6, 12], ...] + for (int i = 0; i < COUNT; i++) { + listWriter.setPosition(i); + if (i % 2 == 0) { + listWriter.startList(); + if (i % 4 == 0) { + listWriter.integer().writeNull(); + } else { + listWriter.integer().writeInt(i); + listWriter.integer().writeInt(i * 2); + } + listWriter.endList(); + } else { + listWriter.writeNull(); + } + } + listVector.setValueCount(COUNT); + + UnionListReader listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + if (i % 2 == 0) { + Assert.assertTrue(listReader.isSet()); + listReader.next(); + if (i % 4 == 0) { + Assert.assertNull(listReader.reader().readInteger()); + } else { + Assert.assertEquals(i, listReader.reader().readInteger().intValue()); + listReader.next(); + Assert.assertEquals(i * 2, listReader.reader().readInteger().intValue()); + } + } else { + Assert.assertFalse(listReader.isSet()); + } + } + } + } + + @Test + public void testListOfListWriterWithNulls() { + try (ListVector listVector = ListVector.empty("listoflist", allocator)) { + listVector.setInitialCapacity(COUNT); + listVector.allocateNew(); + listVector.getValidityBuffer().setOne(0, (int) listVector.getValidityBuffer().capacity()); + + UnionListWriter listWriter = listVector.getWriter(); + + // create list : [ [null], null, [[null, 2, 4]], null, [null], null, [[null, 6, 12]], ... 
] + for (int i = 0; i < COUNT; i++) { + listWriter.setPosition(i); + if (i % 2 == 0) { + listWriter.startList(); + if (i % 4 == 0) { + listWriter.list().writeNull(); + } else { + listWriter.list().startList(); + listWriter.list().integer().writeNull(); + listWriter.list().integer().writeInt(i); + listWriter.list().integer().writeInt(i * 2); + listWriter.list().endList(); + } + listWriter.endList(); + } else { + listWriter.writeNull(); + } + } + listVector.setValueCount(COUNT); + + UnionListReader listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + if (i % 2 == 0) { + Assert.assertTrue(listReader.isSet()); + listReader.next(); + if (i % 4 == 0) { + Assert.assertFalse(listReader.reader().isSet()); + } else { + listReader.reader().next(); + Assert.assertFalse(listReader.reader().reader().isSet()); + listReader.reader().next(); + Assert.assertEquals(i, listReader.reader().reader().readInteger().intValue()); + listReader.reader().next(); + Assert.assertEquals(i * 2, listReader.reader().reader().readInteger().intValue()); + } + } else { + Assert.assertFalse(listReader.isSet()); + } + } + } + } + + @Test + public void testListOfListOfListWriterWithNulls() { + try (ListVector listVector = ListVector.empty("listoflistoflist", allocator)) { + listVector.setInitialCapacity(COUNT); + listVector.allocateNew(); + listVector.getValidityBuffer().setOne(0, (int) listVector.getValidityBuffer().capacity()); + + UnionListWriter listWriter = listVector.getWriter(); + + // create list : [ null, [null], [[null]], [[[null, 1, 2]]], null, [null], ... 
+ for (int i = 0; i < COUNT; i++) { + listWriter.setPosition(i); + if (i % 4 == 0) { + listWriter.writeNull(); + } else { + listWriter.startList(); + if (i % 4 == 1) { + listWriter.list().writeNull(); + } else if (i % 4 == 2) { + listWriter.list().startList(); + listWriter.list().list().writeNull(); + listWriter.list().endList(); + } else { + listWriter.list().startList(); + listWriter.list().list().startList(); + listWriter.list().list().integer().writeNull(); + listWriter.list().list().integer().writeInt(i); + listWriter.list().list().integer().writeInt(i * 2); + listWriter.list().list().endList(); + listWriter.list().endList(); + } + listWriter.endList(); + } + } + listVector.setValueCount(COUNT); + + UnionListReader listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + if (i % 4 == 0) { + Assert.assertFalse(listReader.isSet()); + } else { + Assert.assertTrue(listReader.isSet()); + listReader.next(); + if (i % 4 == 1) { + Assert.assertFalse(listReader.reader().isSet()); + } else if (i % 4 == 2) { + listReader.reader().next(); + Assert.assertFalse(listReader.reader().reader().isSet()); + } else { + listReader.reader().next(); + listReader.reader().reader().next(); + Assert.assertFalse(listReader.reader().reader().reader().isSet()); + listReader.reader().reader().next(); + Assert.assertEquals(i, listReader.reader().reader().reader().readInteger().intValue()); + listReader.reader().reader().next(); + Assert.assertEquals(i * 2, listReader.reader().reader().reader().readInteger().intValue()); + } + } + } + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/BaseFileTest.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/BaseFileTest.java new file mode 100644 index 000000000..8663c0c49 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/BaseFileTest.java @@ -0,0 +1,849 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc; + +import static org.apache.arrow.vector.TestUtils.newVarCharVector; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.util.Arrays; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.util.Collections2; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.TimeMilliVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; 
+import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.complex.impl.UnionMapReader; +import org.apache.arrow.vector.complex.impl.UnionMapWriter; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter; +import org.apache.arrow.vector.complex.writer.BigIntWriter; +import org.apache.arrow.vector.complex.writer.DateMilliWriter; +import org.apache.arrow.vector.complex.writer.Float4Writer; +import org.apache.arrow.vector.complex.writer.IntWriter; +import org.apache.arrow.vector.complex.writer.TimeMilliWriter; +import org.apache.arrow.vector.complex.writer.TimeStampMilliTZWriter; +import org.apache.arrow.vector.complex.writer.TimeStampMilliWriter; +import org.apache.arrow.vector.complex.writer.TimeStampNanoWriter; +import org.apache.arrow.vector.complex.writer.UInt1Writer; +import org.apache.arrow.vector.complex.writer.UInt2Writer; +import org.apache.arrow.vector.complex.writer.UInt4Writer; +import org.apache.arrow.vector.complex.writer.UInt8Writer; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryEncoder; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.holders.NullableTimeStampMilliHolder; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; +import org.apache.arrow.vector.types.pojo.Field; 
+import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.JsonStringArrayList; +import org.apache.arrow.vector.util.Text; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Helps testing the file formats. + */ +public class BaseFileTest { + private static final Logger LOGGER = LoggerFactory.getLogger(BaseFileTest.class); + protected static final int COUNT = 10; + protected BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Integer.MAX_VALUE); + } + + @After + public void tearDown() { + allocator.close(); + } + + + private static short [] uint1Values = new short[]{0, 255, 1, 128, 2}; + private static char [] uint2Values = new char[]{0, Character.MAX_VALUE, 1, Short.MAX_VALUE * 2, 2}; + private static long [] uint4Values = new long[]{0, Integer.MAX_VALUE + 1L, 1, Integer.MAX_VALUE * 2L, 2}; + private static BigInteger[] uint8Values = new BigInteger[]{BigInteger.valueOf(0), + BigInteger.valueOf(Long.MAX_VALUE).multiply(BigInteger.valueOf(2)), BigInteger.valueOf(2), + BigInteger.valueOf(Long.MAX_VALUE).add(BigInteger.valueOf(1)), BigInteger.valueOf(2)}; + + protected void writeData(int count, StructVector parent) { + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + IntWriter intWriter = rootWriter.integer("int"); + UInt1Writer uint1Writer = rootWriter.uInt1("uint1"); + UInt2Writer uint2Writer = rootWriter.uInt2("uint2"); + UInt4Writer uint4Writer = rootWriter.uInt4("uint4"); + UInt8Writer uint8Writer = rootWriter.uInt8("uint8"); + BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); + Float4Writer float4Writer = rootWriter.float4("float"); + for (int i = 0; i < count; i++) { + intWriter.setPosition(i); + intWriter.writeInt(i); + uint1Writer.setPosition(i); + // TODO: Fix add safe write methods on uint methods. 
+ uint1Writer.setPosition(i); + uint1Writer.writeUInt1((byte) uint1Values[i % uint1Values.length] ); + uint2Writer.setPosition(i); + uint2Writer.writeUInt2((char) uint2Values[i % uint2Values.length] ); + uint4Writer.setPosition(i); + uint4Writer.writeUInt4((int) uint4Values[i % uint4Values.length] ); + uint8Writer.setPosition(i); + uint8Writer.writeUInt8(uint8Values[i % uint8Values.length].longValue()); + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + float4Writer.setPosition(i); + float4Writer.writeFloat4(i == 0 ? Float.NaN : i); + } + writer.setValueCount(count); + } + + + protected void validateContent(int count, VectorSchemaRoot root) { + for (int i = 0; i < count; i++) { + Assert.assertEquals(i, root.getVector("int").getObject(i)); + Assert.assertEquals((Short) uint1Values[i % uint1Values.length], + ((UInt1Vector) root.getVector("uint1")).getObjectNoOverflow(i)); + Assert.assertEquals("Failed for index: " + i, (Character) uint2Values[i % uint2Values.length], + (Character) ((UInt2Vector) root.getVector("uint2")).get(i)); + Assert.assertEquals("Failed for index: " + i, (Long) uint4Values[i % uint4Values.length], + ((UInt4Vector) root.getVector("uint4")).getObjectNoOverflow(i)); + Assert.assertEquals("Failed for index: " + i, uint8Values[i % uint8Values.length], + ((UInt8Vector) root.getVector("uint8")).getObjectNoOverflow(i)); + Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getObject(i)); + Assert.assertEquals(i == 0 ? 
Float.NaN : i, root.getVector("float").getObject(i)); + } + } + + protected void writeComplexData(int count, StructVector parent) { + ArrowBuf varchar = allocator.buffer(3); + varchar.readerIndex(0); + varchar.setByte(0, 'a'); + varchar.setByte(1, 'b'); + varchar.setByte(2, 'c'); + varchar.writerIndex(3); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + IntWriter intWriter = rootWriter.integer("int"); + BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); + ListWriter listWriter = rootWriter.list("list"); + StructWriter structWriter = rootWriter.struct("struct"); + for (int i = 0; i < count; i++) { + if (i % 5 != 3) { + intWriter.setPosition(i); + intWriter.writeInt(i); + } + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 3; j++) { + listWriter.varChar().writeVarChar(0, 3, varchar); + } + listWriter.endList(); + structWriter.setPosition(i); + structWriter.start(); + structWriter.timeStampMilli("timestamp").writeTimeStampMilli(i); + structWriter.end(); + } + writer.setValueCount(count); + varchar.getReferenceManager().release(); + } + + public void printVectors(List vectors) { + for (FieldVector vector : vectors) { + LOGGER.debug(vector.getField().getName()); + int valueCount = vector.getValueCount(); + for (int i = 0; i < valueCount; i++) { + LOGGER.debug(String.valueOf(vector.getObject(i))); + } + } + } + + protected void validateComplexContent(int count, VectorSchemaRoot root) { + Assert.assertEquals(count, root.getRowCount()); + printVectors(root.getFieldVectors()); + for (int i = 0; i < count; i++) { + + Object intVal = root.getVector("int").getObject(i); + if (i % 5 != 3) { + Assert.assertEquals(i, intVal); + } else { + Assert.assertNull(intVal); + } + Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getObject(i)); + Assert.assertEquals(i % 3, ((List) 
root.getVector("list").getObject(i)).size()); + NullableTimeStampMilliHolder h = new NullableTimeStampMilliHolder(); + FieldReader structReader = root.getVector("struct").getReader(); + structReader.setPosition(i); + structReader.reader("timestamp").read(h); + Assert.assertEquals(i, h.value); + } + } + + private LocalDateTime makeDateTimeFromCount(int i) { + return LocalDateTime.of(2000 + i, 1 + i, 1 + i, i, i, i, i * 100_000_000 + i); + } + + protected void writeDateTimeData(int count, StructVector parent) { + Assert.assertTrue(count < 100); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + DateMilliWriter dateWriter = rootWriter.dateMilli("date"); + TimeMilliWriter timeWriter = rootWriter.timeMilli("time"); + TimeStampMilliWriter timeStampMilliWriter = rootWriter.timeStampMilli("timestamp-milli"); + TimeStampMilliTZWriter timeStampMilliTZWriter = rootWriter.timeStampMilliTZ("timestamp-milliTZ", "Europe/Paris"); + TimeStampNanoWriter timeStampNanoWriter = rootWriter.timeStampNano("timestamp-nano"); + for (int i = 0; i < count; i++) { + LocalDateTime dt = makeDateTimeFromCount(i); + // Number of days in milliseconds since epoch, stored as 64-bit integer, only date part is used + dateWriter.setPosition(i); + long dateLong = dt.toLocalDate().atStartOfDay().toInstant(ZoneOffset.UTC).toEpochMilli(); + dateWriter.writeDateMilli(dateLong); + // Time is a value in milliseconds since midnight, stored as 32-bit integer + timeWriter.setPosition(i); + int milliOfDay = (int) java.util.concurrent.TimeUnit.NANOSECONDS.toMillis(dt.toLocalTime().toNanoOfDay()); + timeWriter.writeTimeMilli(milliOfDay); + // Timestamp as milliseconds since the epoch, stored as 64-bit integer + timeStampMilliWriter.setPosition(i); + timeStampMilliWriter.writeTimeStampMilli(dt.toInstant(ZoneOffset.UTC).toEpochMilli()); + // Timestamp as milliseconds since epoch with timezone + timeStampMilliTZWriter.setPosition(i); + 
timeStampMilliTZWriter.writeTimeStampMilliTZ(dt.atZone(ZoneId.of("Europe/Paris")).toInstant().toEpochMilli()); + // Timestamp as nanoseconds since epoch + timeStampNanoWriter.setPosition(i); + long tsNanos = dt.toInstant(ZoneOffset.UTC).toEpochMilli() * 1_000_000 + i; // need to add back in nano val + timeStampNanoWriter.writeTimeStampNano(tsNanos); + } + writer.setValueCount(count); + } + + protected void validateDateTimeContent(int count, VectorSchemaRoot root) { + Assert.assertEquals(count, root.getRowCount()); + printVectors(root.getFieldVectors()); + for (int i = 0; i < count; i++) { + LocalDateTime dt = makeDateTimeFromCount(i); + LocalDateTime dtMilli = dt.minusNanos(i); + LocalDateTime dateVal = ((DateMilliVector) root.getVector("date")).getObject(i); + LocalDateTime dateExpected = dt.toLocalDate().atStartOfDay(); + Assert.assertEquals(dateExpected, dateVal); + LocalTime timeVal = ((TimeMilliVector) root.getVector("time")).getObject(i).toLocalTime(); + Assert.assertEquals(dtMilli.toLocalTime(), timeVal); + Object timestampMilliVal = root.getVector("timestamp-milli").getObject(i); + Assert.assertEquals(dtMilli, timestampMilliVal); + Object timestampMilliTZVal = root.getVector("timestamp-milliTZ").getObject(i); + Assert.assertEquals(dt.atZone(ZoneId.of("Europe/Paris")).toInstant().toEpochMilli(), timestampMilliTZVal); + Object timestampNanoVal = root.getVector("timestamp-nano").getObject(i); + Assert.assertEquals(dt, timestampNanoVal); + } + } + + protected VectorSchemaRoot writeFlatDictionaryData( + BufferAllocator bufferAllocator, + DictionaryProvider.MapDictionaryProvider provider) { + + // Define dictionaries and add to provider + VarCharVector dictionary1Vector = newVarCharVector("D1", bufferAllocator); + dictionary1Vector.allocateNewSafe(); + dictionary1Vector.set(0, "foo".getBytes(StandardCharsets.UTF_8)); + dictionary1Vector.set(1, "bar".getBytes(StandardCharsets.UTF_8)); + dictionary1Vector.set(2, "baz".getBytes(StandardCharsets.UTF_8)); + 
dictionary1Vector.setValueCount(3); + + Dictionary dictionary1 = new Dictionary(dictionary1Vector, new DictionaryEncoding(1L, false, null)); + provider.put(dictionary1); + + VarCharVector dictionary2Vector = newVarCharVector("D2", bufferAllocator); + dictionary2Vector.allocateNewSafe(); + dictionary2Vector.set(0, "micro".getBytes(StandardCharsets.UTF_8)); + dictionary2Vector.set(1, "small".getBytes(StandardCharsets.UTF_8)); + dictionary2Vector.set(2, "large".getBytes(StandardCharsets.UTF_8)); + dictionary2Vector.setValueCount(3); + + Dictionary dictionary2 = new Dictionary(dictionary2Vector, new DictionaryEncoding(2L, false, null)); + provider.put(dictionary2); + + // Populate the vectors + VarCharVector vector1A = newVarCharVector("varcharA", bufferAllocator); + vector1A.allocateNewSafe(); + vector1A.set(0, "foo".getBytes(StandardCharsets.UTF_8)); + vector1A.set(1, "bar".getBytes(StandardCharsets.UTF_8)); + vector1A.set(3, "baz".getBytes(StandardCharsets.UTF_8)); + vector1A.set(4, "bar".getBytes(StandardCharsets.UTF_8)); + vector1A.set(5, "baz".getBytes(StandardCharsets.UTF_8)); + vector1A.setValueCount(6); + + FieldVector encodedVector1A = (FieldVector) DictionaryEncoder.encode(vector1A, dictionary1); + vector1A.close(); // Done with this vector after encoding + + // Write this vector using indices instead of encoding + IntVector encodedVector1B = new IntVector("varcharB", bufferAllocator); + encodedVector1B.allocateNewSafe(); + encodedVector1B.set(0, 2); // "baz" + encodedVector1B.set(1, 1); // "bar" + encodedVector1B.set(2, 2); // "baz" + encodedVector1B.set(4, 1); // "bar" + encodedVector1B.set(5, 0); // "foo" + encodedVector1B.setValueCount(6); + + VarCharVector vector2 = newVarCharVector("sizes", bufferAllocator); + vector2.allocateNewSafe(); + vector2.set(1, "large".getBytes(StandardCharsets.UTF_8)); + vector2.set(2, "small".getBytes(StandardCharsets.UTF_8)); + vector2.set(3, "small".getBytes(StandardCharsets.UTF_8)); + vector2.set(4, 
"large".getBytes(StandardCharsets.UTF_8)); + vector2.setValueCount(6); + + FieldVector encodedVector2 = (FieldVector) DictionaryEncoder.encode(vector2, dictionary2); + vector2.close(); // Done with this vector after encoding + + List fields = Arrays.asList(encodedVector1A.getField(), encodedVector1B.getField(), + encodedVector2.getField()); + List vectors = Collections2.asImmutableList(encodedVector1A, encodedVector1B, encodedVector2); + + return new VectorSchemaRoot(fields, vectors, encodedVector1A.getValueCount()); + } + + protected void validateFlatDictionary(VectorSchemaRoot root, DictionaryProvider provider) { + FieldVector vector1A = root.getVector("varcharA"); + Assert.assertNotNull(vector1A); + + DictionaryEncoding encoding1A = vector1A.getField().getDictionary(); + Assert.assertNotNull(encoding1A); + Assert.assertEquals(1L, encoding1A.getId()); + + Assert.assertEquals(6, vector1A.getValueCount()); + Assert.assertEquals(0, vector1A.getObject(0)); + Assert.assertEquals(1, vector1A.getObject(1)); + Assert.assertEquals(null, vector1A.getObject(2)); + Assert.assertEquals(2, vector1A.getObject(3)); + Assert.assertEquals(1, vector1A.getObject(4)); + Assert.assertEquals(2, vector1A.getObject(5)); + + FieldVector vector1B = root.getVector("varcharB"); + Assert.assertNotNull(vector1B); + + DictionaryEncoding encoding1B = vector1A.getField().getDictionary(); + Assert.assertNotNull(encoding1B); + Assert.assertTrue(encoding1A.equals(encoding1B)); + Assert.assertEquals(1L, encoding1B.getId()); + + Assert.assertEquals(6, vector1B.getValueCount()); + Assert.assertEquals(2, vector1B.getObject(0)); + Assert.assertEquals(1, vector1B.getObject(1)); + Assert.assertEquals(2, vector1B.getObject(2)); + Assert.assertEquals(null, vector1B.getObject(3)); + Assert.assertEquals(1, vector1B.getObject(4)); + Assert.assertEquals(0, vector1B.getObject(5)); + + FieldVector vector2 = root.getVector("sizes"); + Assert.assertNotNull(vector2); + + DictionaryEncoding encoding2 = 
vector2.getField().getDictionary(); + Assert.assertNotNull(encoding2); + Assert.assertEquals(2L, encoding2.getId()); + + Assert.assertEquals(6, vector2.getValueCount()); + Assert.assertEquals(null, vector2.getObject(0)); + Assert.assertEquals(2, vector2.getObject(1)); + Assert.assertEquals(1, vector2.getObject(2)); + Assert.assertEquals(1, vector2.getObject(3)); + Assert.assertEquals(2, vector2.getObject(4)); + Assert.assertEquals(null, vector2.getObject(5)); + + Dictionary dictionary1 = provider.lookup(1L); + Assert.assertNotNull(dictionary1); + VarCharVector dictionaryVector = ((VarCharVector) dictionary1.getVector()); + Assert.assertEquals(3, dictionaryVector.getValueCount()); + Assert.assertEquals(new Text("foo"), dictionaryVector.getObject(0)); + Assert.assertEquals(new Text("bar"), dictionaryVector.getObject(1)); + Assert.assertEquals(new Text("baz"), dictionaryVector.getObject(2)); + + Dictionary dictionary2 = provider.lookup(2L); + Assert.assertNotNull(dictionary2); + dictionaryVector = ((VarCharVector) dictionary2.getVector()); + Assert.assertEquals(3, dictionaryVector.getValueCount()); + Assert.assertEquals(new Text("micro"), dictionaryVector.getObject(0)); + Assert.assertEquals(new Text("small"), dictionaryVector.getObject(1)); + Assert.assertEquals(new Text("large"), dictionaryVector.getObject(2)); + } + + protected VectorSchemaRoot writeNestedDictionaryData( + BufferAllocator bufferAllocator, + DictionaryProvider.MapDictionaryProvider provider) { + + // Define the dictionary and add to the provider + VarCharVector dictionaryVector = newVarCharVector("D2", bufferAllocator); + dictionaryVector.allocateNewSafe(); + dictionaryVector.set(0, "foo".getBytes(StandardCharsets.UTF_8)); + dictionaryVector.set(1, "bar".getBytes(StandardCharsets.UTF_8)); + dictionaryVector.setValueCount(2); + + Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(2L, false, null)); + provider.put(dictionary); + + // Write the vector data using dictionary 
indices + ListVector listVector = ListVector.empty("list", bufferAllocator); + DictionaryEncoding encoding = dictionary.getEncoding(); + listVector.addOrGetVector(new FieldType(true, encoding.getIndexType(), encoding)); + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + listWriter.startList(); + listWriter.writeInt(0); + listWriter.writeInt(1); + listWriter.endList(); + listWriter.startList(); + listWriter.writeInt(0); + listWriter.endList(); + listWriter.startList(); + listWriter.writeInt(1); + listWriter.endList(); + listWriter.setValueCount(3); + + List fields = Collections2.asImmutableList(listVector.getField()); + List vectors = Collections2.asImmutableList(listVector); + return new VectorSchemaRoot(fields, vectors, 3); + } + + protected void validateNestedDictionary(VectorSchemaRoot root, DictionaryProvider provider) { + FieldVector vector = root.getFieldVectors().get(0); + Assert.assertNotNull(vector); + Assert.assertNull(vector.getField().getDictionary()); + Field nestedField = vector.getField().getChildren().get(0); + + DictionaryEncoding encoding = nestedField.getDictionary(); + Assert.assertNotNull(encoding); + Assert.assertEquals(2L, encoding.getId()); + Assert.assertEquals(new ArrowType.Int(32, true), encoding.getIndexType()); + + Assert.assertEquals(3, vector.getValueCount()); + Assert.assertEquals(Arrays.asList(0, 1), vector.getObject(0)); + Assert.assertEquals(Arrays.asList(0), vector.getObject(1)); + Assert.assertEquals(Arrays.asList(1), vector.getObject(2)); + + Dictionary dictionary = provider.lookup(2L); + Assert.assertNotNull(dictionary); + VarCharVector dictionaryVector = ((VarCharVector) dictionary.getVector()); + Assert.assertEquals(2, dictionaryVector.getValueCount()); + Assert.assertEquals(new Text("foo"), dictionaryVector.getObject(0)); + Assert.assertEquals(new Text("bar"), dictionaryVector.getObject(1)); + } + + protected VectorSchemaRoot writeDecimalData(BufferAllocator bufferAllocator) { + 
DecimalVector decimalVector1 = new DecimalVector("decimal1", bufferAllocator, 10, 3); + DecimalVector decimalVector2 = new DecimalVector("decimal2", bufferAllocator, 4, 2); + DecimalVector decimalVector3 = new DecimalVector("decimal3", bufferAllocator, 16, 8); + + int count = 10; + decimalVector1.allocateNew(count); + decimalVector2.allocateNew(count); + decimalVector3.allocateNew(count); + + for (int i = 0; i < count; i++) { + decimalVector1.setSafe(i, new BigDecimal(BigInteger.valueOf(i), 3)); + decimalVector2.setSafe(i, new BigDecimal(BigInteger.valueOf(i * (1 << 10)), 2)); + decimalVector3.setSafe(i, new BigDecimal(BigInteger.valueOf(i * 1111111111111111L), 8)); + } + + decimalVector1.setValueCount(count); + decimalVector2.setValueCount(count); + decimalVector3.setValueCount(count); + + List fields = Collections2.asImmutableList(decimalVector1.getField(), decimalVector2.getField(), + decimalVector3.getField()); + List vectors = Collections2.asImmutableList(decimalVector1, decimalVector2, decimalVector3); + return new VectorSchemaRoot(fields, vectors, count); + } + + protected void validateDecimalData(VectorSchemaRoot root) { + DecimalVector decimalVector1 = (DecimalVector) root.getVector("decimal1"); + DecimalVector decimalVector2 = (DecimalVector) root.getVector("decimal2"); + DecimalVector decimalVector3 = (DecimalVector) root.getVector("decimal3"); + int count = 10; + Assert.assertEquals(count, root.getRowCount()); + + for (int i = 0; i < count; i++) { + // Verify decimal 1 vector + BigDecimal readValue = decimalVector1.getObject(i); + ArrowType.Decimal type = (ArrowType.Decimal) decimalVector1.getField().getType(); + BigDecimal genValue = new BigDecimal(BigInteger.valueOf(i), type.getScale()); + Assert.assertEquals(genValue, readValue); + + // Verify decimal 2 vector + readValue = decimalVector2.getObject(i); + type = (ArrowType.Decimal) decimalVector2.getField().getType(); + genValue = new BigDecimal(BigInteger.valueOf(i * (1 << 10)), type.getScale()); + 
Assert.assertEquals(genValue, readValue); + + // Verify decimal 3 vector + readValue = decimalVector3.getObject(i); + type = (ArrowType.Decimal) decimalVector3.getField().getType(); + genValue = new BigDecimal(BigInteger.valueOf(i * 1111111111111111L), type.getScale()); + Assert.assertEquals(genValue, readValue); + } + } + + protected VectorSchemaRoot writeNullData(int valueCount) { + NullVector nullVector1 = new NullVector("vector1"); + NullVector nullVector2 = new NullVector("vector2"); + nullVector1.setValueCount(valueCount); + nullVector2.setValueCount(valueCount); + + List fields = Collections2.asImmutableList(nullVector1.getField(), nullVector2.getField()); + List vectors = Collections2.asImmutableList(nullVector1, nullVector2); + return new VectorSchemaRoot(fields, vectors, valueCount); + } + + protected void validateNullData(VectorSchemaRoot root, int valueCount) { + + NullVector vector1 = (NullVector) root.getFieldVectors().get(0); + NullVector vector2 = (NullVector) root.getFieldVectors().get(1); + + assertEquals(valueCount, vector1.getValueCount()); + assertEquals(valueCount, vector2.getValueCount()); + } + + public void validateUnionData(int count, VectorSchemaRoot root) { + FieldReader unionReader = root.getVector("union").getReader(); + for (int i = 0; i < count; i++) { + unionReader.setPosition(i); + switch (i % 4) { + case 0: + Assert.assertEquals(i, unionReader.readInteger().intValue()); + break; + case 1: + Assert.assertEquals(i, unionReader.readLong().longValue()); + break; + case 2: + Assert.assertEquals(i % 3, unionReader.size()); + break; + case 3: + NullableTimeStampMilliHolder h = new NullableTimeStampMilliHolder(); + unionReader.reader("timestamp").read(h); + Assert.assertEquals(i, h.value); + break; + default: + assert false : "Unexpected value in switch statement: " + i; + } + } + } + + public void writeUnionData(int count, StructVector parent) { + ArrowBuf varchar = allocator.buffer(3); + varchar.readerIndex(0); + varchar.setByte(0, 
'a'); + varchar.setByte(1, 'b'); + varchar.setByte(2, 'c'); + varchar.writerIndex(3); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + IntWriter intWriter = rootWriter.integer("union"); + BigIntWriter bigIntWriter = rootWriter.bigInt("union"); + ListWriter listWriter = rootWriter.list("union"); + StructWriter structWriter = rootWriter.struct("union"); + for (int i = 0; i < count; i++) { + switch (i % 4) { + case 0: + intWriter.setPosition(i); + intWriter.writeInt(i); + break; + case 1: + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + break; + case 2: + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 3; j++) { + listWriter.varChar().writeVarChar(0, 3, varchar); + } + listWriter.endList(); + break; + case 3: + structWriter.setPosition(i); + structWriter.start(); + structWriter.timeStampMilli("timestamp").writeTimeStampMilli(i); + structWriter.end(); + break; + default: + assert false : "Unexpected value in switch statement: " + i; + } + } + writer.setValueCount(count); + varchar.getReferenceManager().release(); + } + + protected void writeVarBinaryData(int count, StructVector parent) { + Assert.assertTrue(count < 100); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + ListWriter listWriter = rootWriter.list("list"); + ArrowBuf varbin = allocator.buffer(count); + for (int i = 0; i < count; i++) { + varbin.setByte(i, i); + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 3; j++) { + listWriter.varBinary().writeVarBinary(0, i + 1, varbin); + } + listWriter.endList(); + } + writer.setValueCount(count); + varbin.getReferenceManager().release(); + } + + protected void validateVarBinary(int count, VectorSchemaRoot root) { + Assert.assertEquals(count, root.getRowCount()); + ListVector listVector = (ListVector) root.getVector("list"); + byte[] expectedArray = new 
byte[count]; + int numVarBinaryValues = 0; + for (int i = 0; i < count; i++) { + expectedArray[i] = (byte) i; + List objList = listVector.getObject(i); + if (i % 3 == 0) { + Assert.assertTrue(objList.isEmpty()); + } else { + byte[] expected = Arrays.copyOfRange(expectedArray, 0, i + 1); + for (int j = 0; j < i % 3; j++) { + byte[] result = (byte[]) objList.get(j); + Assert.assertArrayEquals(result, expected); + numVarBinaryValues++; + } + } + } + + // ListVector lastSet should be the index of last value + 1 + Assert.assertEquals(listVector.getLastSet(), count - 1); + + // VarBinaryVector lastSet should be the index of last value + VarBinaryVector binaryVector = (VarBinaryVector) listVector.getChildrenFromFields().get(0); + Assert.assertEquals(binaryVector.getLastSet(), numVarBinaryValues - 1); + } + + protected void writeBatchData(ArrowWriter writer, IntVector vector, VectorSchemaRoot root) throws IOException { + writer.start(); + + vector.setNull(0); + vector.setSafe(1, 1); + vector.setSafe(2, 2); + vector.setNull(3); + vector.setSafe(4, 1); + vector.setValueCount(5); + root.setRowCount(5); + writer.writeBatch(); + + vector.setNull(0); + vector.setSafe(1, 1); + vector.setSafe(2, 2); + vector.setValueCount(3); + root.setRowCount(3); + writer.writeBatch(); + + writer.end(); + } + + protected void validateBatchData(ArrowReader reader, IntVector vector) throws IOException { + reader.loadNextBatch(); + + assertEquals(vector.getValueCount(), 5); + assertTrue(vector.isNull(0)); + assertEquals(vector.get(1), 1); + assertEquals(vector.get(2), 2); + assertTrue(vector.isNull(3)); + assertEquals(vector.get(4), 1); + + reader.loadNextBatch(); + + assertEquals(vector.getValueCount(), 3); + assertTrue(vector.isNull(0)); + assertEquals(vector.get(1), 1); + assertEquals(vector.get(2), 2); + } + + protected VectorSchemaRoot writeMapData(BufferAllocator bufferAllocator) { + MapVector mapVector = MapVector.empty("map", bufferAllocator, false); + MapVector sortedMapVector = 
MapVector.empty("mapSorted", bufferAllocator, true); + mapVector.allocateNew(); + sortedMapVector.allocateNew(); + UnionMapWriter mapWriter = mapVector.getWriter(); + UnionMapWriter sortedMapWriter = sortedMapVector.getWriter(); + + final int count = 10; + for (int i = 0; i < count; i++) { + // Write mapVector with NULL values + // i == 1 is a NULL + if (i != 1) { + mapWriter.setPosition(i); + mapWriter.startMap(); + // i == 3 is an empty map + if (i != 3) { + for (int j = 0; j < i + 1; j++) { + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(j); + // i == 5 maps to a NULL value + if (i != 5) { + mapWriter.value().integer().writeInt(j); + } + mapWriter.endEntry(); + } + } + mapWriter.endMap(); + } + // Write sortedMapVector + sortedMapWriter.setPosition(i); + sortedMapWriter.startMap(); + for (int j = 0; j < i + 1; j++) { + sortedMapWriter.startEntry(); + sortedMapWriter.key().bigInt().writeBigInt(j); + sortedMapWriter.value().integer().writeInt(j); + sortedMapWriter.endEntry(); + } + sortedMapWriter.endMap(); + } + mapWriter.setValueCount(COUNT); + sortedMapWriter.setValueCount(COUNT); + + List fields = Collections2.asImmutableList(mapVector.getField(), sortedMapVector.getField()); + List vectors = Collections2.asImmutableList(mapVector, sortedMapVector); + return new VectorSchemaRoot(fields, vectors, count); + } + + protected void validateMapData(VectorSchemaRoot root) { + MapVector mapVector = (MapVector) root.getVector("map"); + MapVector sortedMapVector = (MapVector) root.getVector("mapSorted"); + + final int count = 10; + Assert.assertEquals(count, root.getRowCount()); + + UnionMapReader mapReader = new UnionMapReader(mapVector); + UnionMapReader sortedMapReader = new UnionMapReader(sortedMapVector); + for (int i = 0; i < count; i++) { + // Read mapVector with NULL values + mapReader.setPosition(i); + if (i == 1) { + assertFalse(mapReader.isSet()); + } else { + if (i == 3) { + JsonStringArrayList result = (JsonStringArrayList) 
mapReader.readObject(); + assertTrue(result.isEmpty()); + } else { + for (int j = 0; j < i + 1; j++) { + mapReader.next(); + assertEquals(j, mapReader.key().readLong().longValue()); + if (i == 5) { + assertFalse(mapReader.value().isSet()); + } else { + assertEquals(j, mapReader.value().readInteger().intValue()); + } + } + } + } + // Read sortedMapVector + sortedMapReader.setPosition(i); + for (int j = 0; j < i + 1; j++) { + sortedMapReader.next(); + assertEquals(j, sortedMapReader.key().readLong().longValue()); + assertEquals(j, sortedMapReader.value().readInteger().intValue()); + } + } + } + + protected VectorSchemaRoot writeListAsMapData(BufferAllocator bufferAllocator) { + ListVector mapEntryList = ListVector.empty("entryList", bufferAllocator); + FieldType mapEntryType = new FieldType(false, ArrowType.Struct.INSTANCE, null, null); + StructVector mapEntryData = new StructVector("entryData", bufferAllocator, mapEntryType, null); + mapEntryData.addOrGet("myKey", new FieldType(false, new ArrowType.Int(64, true), null), BigIntVector.class); + mapEntryData.addOrGet("myValue", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + mapEntryList.initializeChildrenFromFields(Collections2.asImmutableList(mapEntryData.getField())); + UnionListWriter entryWriter = mapEntryList.getWriter(); + entryWriter.allocate(); + + final int count = 10; + for (int i = 0; i < count; i++) { + entryWriter.setPosition(i); + entryWriter.startList(); + for (int j = 0; j < i + 1; j++) { + entryWriter.struct().start(); + entryWriter.struct().bigInt("myKey").writeBigInt(j); + entryWriter.struct().integer("myValue").writeInt(j); + entryWriter.struct().end(); + } + entryWriter.endList(); + } + entryWriter.setValueCount(COUNT); + + MapVector mapVector = MapVector.empty("map", bufferAllocator, false); + mapEntryList.makeTransferPair(mapVector).transfer(); + + List fields = Collections2.asImmutableList(mapVector.getField()); + List vectors = Collections2.asImmutableList(mapVector); + 
return new VectorSchemaRoot(fields, vectors, count); + } + + protected void validateListAsMapData(VectorSchemaRoot root) { + MapVector sortedMapVector = (MapVector) root.getVector("map"); + + final int count = 10; + Assert.assertEquals(count, root.getRowCount()); + + UnionMapReader sortedMapReader = new UnionMapReader(sortedMapVector); + sortedMapReader.setKeyValueNames("myKey", "myValue"); + for (int i = 0; i < count; i++) { + sortedMapReader.setPosition(i); + for (int j = 0; j < i + 1; j++) { + sortedMapReader.next(); + assertEquals(j, sortedMapReader.key().readLong().longValue()); + assertEquals(j, sortedMapReader.value().readInteger().intValue()); + } + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/ITTestIPCWithLargeArrowBuffers.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/ITTestIPCWithLargeArrowBuffers.java new file mode 100644 index 000000000..d3c91fd14 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/ITTestIPCWithLargeArrowBuffers.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.Map; + +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Integration test for reading/writing {@link org.apache.arrow.vector.VectorSchemaRoot} with + * large (more than 2GB) buffers by {@link ArrowReader} and {@link ArrowWriter}. + * To run this test, please make sure there is at least 8GB free memory, and 8GB + * free disk space in the system. 
+ */ +public class ITTestIPCWithLargeArrowBuffers { + + private static final Logger logger = LoggerFactory.getLogger(ITTestIPCWithLargeArrowBuffers.class); + + // 4GB buffer size + static final long BUFFER_SIZE = 4 * 1024 * 1024 * 1024L; + + static final int DICTIONARY_VECTOR_SIZE = (int) (BUFFER_SIZE / BigIntVector.TYPE_WIDTH); + + static final int ENCODED_VECTOR_SIZE = (int) (BUFFER_SIZE / IntVector.TYPE_WIDTH); + + static final String FILE_NAME = "largeArrowData.data"; + + static final long DICTIONARY_ID = 123L; + + static final ArrowType.Int ENCODED_VECTOR_TYPE = new ArrowType.Int(32, true); + + static final DictionaryEncoding DICTIONARY_ENCODING = + new DictionaryEncoding(DICTIONARY_ID, false, ENCODED_VECTOR_TYPE); + + static final FieldType ENCODED_FIELD_TYPE = + new FieldType(true, ENCODED_VECTOR_TYPE, DICTIONARY_ENCODING, null); + + static final Field ENCODED_VECTOR_FIELD = new Field("encoded vector", ENCODED_FIELD_TYPE, null); + + private void testWriteLargeArrowData(boolean streamMode) throws IOException { + // simulate encoding big int as int + try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE); + BigIntVector dictVector = new BigIntVector("dic vector", allocator); + FileOutputStream out = new FileOutputStream(FILE_NAME); + IntVector encodedVector = (IntVector) ENCODED_VECTOR_FIELD.createVector(allocator)) { + + // prepare dictionary provider. 
+ DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); + Dictionary dictionary = new Dictionary(dictVector, DICTIONARY_ENCODING); + provider.put(dictionary); + + // populate the dictionary vector + dictVector.allocateNew(DICTIONARY_VECTOR_SIZE); + for (int i = 0; i < DICTIONARY_VECTOR_SIZE; i++) { + dictVector.set(i, i); + } + dictVector.setValueCount(DICTIONARY_VECTOR_SIZE); + assertTrue(dictVector.getDataBuffer().capacity() > Integer.MAX_VALUE); + logger.trace("Populating dictionary vector finished"); + + // populate the encoded vector + encodedVector.allocateNew(ENCODED_VECTOR_SIZE); + for (int i = 0; i < ENCODED_VECTOR_SIZE; i++) { + encodedVector.set(i, i % DICTIONARY_VECTOR_SIZE); + } + encodedVector.setValueCount(ENCODED_VECTOR_SIZE); + assertTrue(encodedVector.getDataBuffer().capacity() > Integer.MAX_VALUE); + logger.trace("Populating encoded vector finished"); + + // build vector schema root and write data. + try (VectorSchemaRoot root = + new VectorSchemaRoot( + Arrays.asList(ENCODED_VECTOR_FIELD), Arrays.asList(encodedVector), ENCODED_VECTOR_SIZE); + ArrowWriter writer = streamMode ? + new ArrowStreamWriter(root, provider, out) : + new ArrowFileWriter(root, provider, out.getChannel())) { + writer.start(); + writer.writeBatch(); + writer.end(); + logger.trace("Writing data finished"); + } + } + + assertTrue(new File(FILE_NAME).exists()); + } + + private void testReadLargeArrowData(boolean streamMode) throws IOException { + try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE); + FileInputStream in = new FileInputStream(FILE_NAME); + ArrowReader reader = streamMode ? 
+ new ArrowStreamReader(in, allocator) : + new ArrowFileReader(in.getChannel(), allocator)) { + + // verify schema + Schema readSchema = reader.getVectorSchemaRoot().getSchema(); + assertEquals(1, readSchema.getFields().size()); + assertEquals(ENCODED_VECTOR_FIELD, readSchema.getFields().get(0)); + logger.trace("Verifying schema finished"); + + // verify vector schema root + assertTrue(reader.loadNextBatch()); + VectorSchemaRoot root = reader.getVectorSchemaRoot(); + + assertEquals(ENCODED_VECTOR_SIZE, root.getRowCount()); + assertEquals(1, root.getFieldVectors().size()); + assertTrue(root.getFieldVectors().get(0) instanceof IntVector); + + IntVector encodedVector = (IntVector) root.getVector(0); + for (int i = 0; i < ENCODED_VECTOR_SIZE; i++) { + assertEquals(i % DICTIONARY_VECTOR_SIZE, encodedVector.get(i)); + } + logger.trace("Verifying encoded vector finished"); + + // verify dictionary + Map dictVectors = reader.getDictionaryVectors(); + assertEquals(1, dictVectors.size()); + Dictionary dictionary = dictVectors.get(DICTIONARY_ID); + assertNotNull(dictionary); + + assertTrue(dictionary.getVector() instanceof BigIntVector); + BigIntVector dictVector = (BigIntVector) dictionary.getVector(); + assertEquals(DICTIONARY_VECTOR_SIZE, dictVector.getValueCount()); + for (int i = 0; i < DICTIONARY_VECTOR_SIZE; i++) { + assertEquals(i, dictVector.get(i)); + } + logger.trace("Verifying dictionary vector finished"); + + // ensure no more data available + assertFalse(reader.loadNextBatch()); + } finally { + File dataFile = new File(FILE_NAME); + dataFile.delete(); + assertFalse(dataFile.exists()); + } + } + + @Test + public void testIPC() throws IOException { + logger.trace("Start testing reading/writing large arrow stream data"); + testWriteLargeArrowData(true); + testReadLargeArrowData(true); + logger.trace("Finish testing reading/writing large arrow stream data"); + + logger.trace("Start testing reading/writing large arrow file data"); + testWriteLargeArrowData(false); + 
testReadLargeArrowData(false); + logger.trace("Finish testing reading/writing large arrow file data"); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/MessageSerializerTest.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/MessageSerializerTest.java new file mode 100644 index 000000000..11b8d4fad --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/MessageSerializerTest.java @@ -0,0 +1,247 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc; + +import static java.util.Arrays.asList; +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.Channels; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowMessage; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.ipc.message.IpcOption; +import org.apache.arrow.vector.ipc.message.MessageSerializer; +import org.apache.arrow.vector.types.MetadataVersion; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +public class MessageSerializerTest { + + public static ArrowBuf buf(BufferAllocator alloc, byte[] bytes) { + ArrowBuf buffer = alloc.buffer(bytes.length); + buffer.writeBytes(bytes); + return buffer; + } + + public static byte[] array(ArrowBuf buf) { + byte[] bytes = new byte[checkedCastToInt(buf.readableBytes())]; + buf.readBytes(bytes); + return bytes; + } + + private int intToByteRoundtrip(int v, byte[] bytes) { + MessageSerializer.intToBytes(v, bytes); + return 
MessageSerializer.bytesToInt(bytes); + } + + @Test + public void testIntToBytes() { + byte[] bytes = new byte[4]; + int[] values = new int[]{1, 15, 1 << 8, 1 << 16, Integer.MAX_VALUE}; + for (int v : values) { + assertEquals(intToByteRoundtrip(v, bytes), v); + } + } + + @Test + public void testWriteMessageBufferAligned() throws IOException { + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + WriteChannel out = new WriteChannel(Channels.newChannel(outputStream)); + + // This is not a valid Arrow Message, only to test writing and alignment + ByteBuffer buffer = ByteBuffer.allocate(8).order(ByteOrder.nativeOrder()); + buffer.putInt(1); + buffer.putInt(2); + buffer.flip(); + + int bytesWritten = MessageSerializer.writeMessageBuffer(out, 8, buffer); + assertEquals(16, bytesWritten); + + buffer.rewind(); + buffer.putInt(3); + buffer.flip(); + bytesWritten = MessageSerializer.writeMessageBuffer(out, 4, buffer); + assertEquals(16, bytesWritten); + + ByteArrayInputStream inputStream = new ByteArrayInputStream(outputStream.toByteArray()); + ReadChannel in = new ReadChannel(Channels.newChannel(inputStream)); + ByteBuffer result = ByteBuffer.allocate(32).order(ByteOrder.nativeOrder()); + in.readFully(result); + result.rewind(); + + // First message continuation, size, and 2 int values + assertEquals(MessageSerializer.IPC_CONTINUATION_TOKEN, result.getInt()); + // mesage length is represented in little endian + result.order(ByteOrder.LITTLE_ENDIAN); + assertEquals(8, result.getInt()); + result.order(ByteOrder.nativeOrder()); + assertEquals(1, result.getInt()); + assertEquals(2, result.getInt()); + + // Second message continuation, size, 1 int value and 4 bytes padding + assertEquals(MessageSerializer.IPC_CONTINUATION_TOKEN, result.getInt()); + // mesage length is represented in little endian + result.order(ByteOrder.LITTLE_ENDIAN); + assertEquals(8, result.getInt()); + result.order(ByteOrder.nativeOrder()); + assertEquals(3, result.getInt()); + 
assertEquals(0, result.getInt()); + } + + @Test + public void testSchemaMessageSerialization() throws IOException { + Schema schema = testSchema(); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + long size = MessageSerializer.serialize( + new WriteChannel(Channels.newChannel(out)), schema); + assertEquals(size, out.toByteArray().length); + + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); + Schema deserialized = MessageSerializer.deserializeSchema( + new ReadChannel(Channels.newChannel(in))); + assertEquals(schema, deserialized); + assertEquals(1, deserialized.getFields().size()); + } + + @Test + public void testSchemaDictionaryMessageSerialization() throws IOException { + DictionaryEncoding dictionary = new DictionaryEncoding(9L, false, new ArrowType.Int(8, true)); + Field field = new Field("test", new FieldType(true, ArrowType.Utf8.INSTANCE, dictionary, null), null); + Schema schema = new Schema(Collections.singletonList(field)); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + long size = MessageSerializer.serialize(new WriteChannel(Channels.newChannel(out)), schema); + assertEquals(size, out.toByteArray().length); + + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); + Schema deserialized = MessageSerializer.deserializeSchema(new ReadChannel(Channels.newChannel(in))); + assertEquals(schema, deserialized); + } + + @Rule + public ExpectedException expectedEx = ExpectedException.none(); + + @Test + public void testSerializeRecordBatchV4() throws IOException { + byte[] validity = new byte[]{(byte) 255, 0}; + // second half is "undefined" + byte[] values = new byte[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + + BufferAllocator alloc = new RootAllocator(Long.MAX_VALUE); + ArrowBuf validityb = buf(alloc, validity); + ArrowBuf valuesb = buf(alloc, values); + + ArrowRecordBatch batch = new ArrowRecordBatch( + 16, asList(new ArrowFieldNode(16, 8)), asList(validityb, valuesb)); + + // 
avoid writing legacy ipc format by default + IpcOption option = new IpcOption(false, MetadataVersion.V4); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + MessageSerializer.serialize(new WriteChannel(Channels.newChannel(out)), batch, option); + + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); + ReadChannel channel = new ReadChannel(Channels.newChannel(in)); + ArrowMessage deserialized = MessageSerializer.deserializeMessageBatch(channel, alloc); + assertEquals(ArrowRecordBatch.class, deserialized.getClass()); + verifyBatch((ArrowRecordBatch) deserialized, validity, values); + } + + @Test + public void testSerializeRecordBatchV5() throws Exception { + byte[] validity = new byte[]{(byte) 255, 0}; + // second half is "undefined" + byte[] values = new byte[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + + BufferAllocator alloc = new RootAllocator(Long.MAX_VALUE); + ArrowBuf validityb = buf(alloc, validity); + ArrowBuf valuesb = buf(alloc, values); + + ArrowRecordBatch batch = new ArrowRecordBatch( + 16, asList(new ArrowFieldNode(16, 8)), asList(validityb, valuesb)); + + // avoid writing legacy ipc format by default + IpcOption option = new IpcOption(false, MetadataVersion.V5); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + MessageSerializer.serialize(new WriteChannel(Channels.newChannel(out)), batch, option); + validityb.close(); + valuesb.close(); + batch.close(); + + { + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); + ReadChannel channel = new ReadChannel(Channels.newChannel(in)); + ArrowMessage deserialized = MessageSerializer.deserializeMessageBatch(channel, alloc); + assertEquals(ArrowRecordBatch.class, deserialized.getClass()); + verifyBatch((ArrowRecordBatch) deserialized, validity, values); + deserialized.close(); + } + + { + byte[] validBytes = out.toByteArray(); + byte[] missingBytes = Arrays.copyOfRange(validBytes, /*from=*/0, validBytes.length - 1); + + 
ByteArrayInputStream in = new ByteArrayInputStream(missingBytes); + ReadChannel channel = new ReadChannel(Channels.newChannel(in)); + + assertThrows(IOException.class, () -> MessageSerializer.deserializeMessageBatch(channel, alloc)); + } + + alloc.close(); + } + + public static Schema testSchema() { + return new Schema(asList(new Field( + "testField", FieldType.nullable(new ArrowType.Int(8, true)), Collections.emptyList()))); + } + + // Verifies batch contents matching test schema. + public static void verifyBatch(ArrowRecordBatch batch, byte[] validity, byte[] values) { + assertTrue(batch != null); + List nodes = batch.getNodes(); + assertEquals(1, nodes.size()); + ArrowFieldNode node = nodes.get(0); + assertEquals(16, node.getLength()); + assertEquals(8, node.getNullCount()); + List buffers = batch.getBuffers(); + assertEquals(2, buffers.size()); + assertArrayEquals(validity, MessageSerializerTest.array(buffers.get(0))); + assertArrayEquals(values, MessageSerializerTest.array(buffers.get(1))); + } + +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java new file mode 100644 index 000000000..4fb582278 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc; + +import static java.nio.channels.Channels.newChannel; +import static org.apache.arrow.vector.TestUtils.newVarCharVector; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.Collections2; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.types.pojo.Field; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class TestArrowFile extends BaseFileTest { + private static final Logger LOGGER = LoggerFactory.getLogger(TestArrowFile.class); + + @Test + public void testWrite() throws IOException { + File file = new File("target/mytest_write.arrow"); + int count = COUNT; + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + StructVector parent = StructVector.empty("parent", vectorAllocator)) { + writeData(count, parent); + write(parent.getChild("root"), file, new ByteArrayOutputStream()); + } + } + + @Test + public void testWriteComplex() 
throws IOException { + File file = new File("target/mytest_write_complex.arrow"); + int count = COUNT; + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + StructVector parent = StructVector.empty("parent", vectorAllocator)) { + writeComplexData(count, parent); + FieldVector root = parent.getChild("root"); + validateComplexContent(count, new VectorSchemaRoot(root)); + write(root, file, new ByteArrayOutputStream()); + } + } + + /** + * Writes the contents of parents to file. If outStream is non-null, also writes it + * to outStream in the streaming serialized format. + */ + private void write(FieldVector parent, File file, OutputStream outStream) throws IOException { + VectorSchemaRoot root = new VectorSchemaRoot(parent); + + try (FileOutputStream fileOutputStream = new FileOutputStream(file); + ArrowFileWriter arrowWriter = new ArrowFileWriter(root, null, fileOutputStream.getChannel());) { + LOGGER.debug("writing schema: " + root.getSchema()); + arrowWriter.start(); + arrowWriter.writeBatch(); + arrowWriter.end(); + } + + // Also try serializing to the stream writer. 
+ if (outStream != null) { + try (ArrowStreamWriter arrowWriter = new ArrowStreamWriter(root, null, outStream)) { + arrowWriter.start(); + arrowWriter.writeBatch(); + arrowWriter.end(); + } + } + } + + @Test + public void testFileStreamHasEos() throws IOException { + + try (VarCharVector vector1 = newVarCharVector("varchar1", allocator)) { + vector1.allocateNewSafe(); + vector1.set(0, "foo".getBytes(StandardCharsets.UTF_8)); + vector1.set(1, "bar".getBytes(StandardCharsets.UTF_8)); + vector1.set(3, "baz".getBytes(StandardCharsets.UTF_8)); + vector1.set(4, "bar".getBytes(StandardCharsets.UTF_8)); + vector1.set(5, "baz".getBytes(StandardCharsets.UTF_8)); + vector1.setValueCount(6); + + List fields = Arrays.asList(vector1.getField()); + List vectors = Collections2.asImmutableList(vector1); + VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors, vector1.getValueCount()); + + // write data + ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowFileWriter writer = new ArrowFileWriter(root, null, newChannel(out)); + writer.start(); + writer.writeBatch(); + writer.end(); + + byte[] bytes = out.toByteArray(); + byte[] bytesWithoutMagic = new byte[bytes.length - 8]; + System.arraycopy(bytes, 8, bytesWithoutMagic, 0, bytesWithoutMagic.length); + + try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(bytesWithoutMagic), allocator)) { + assertTrue(reader.loadNextBatch()); + // here will throw exception if read footer instead of eos. 
+ assertFalse(reader.loadNextBatch()); + } + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFooter.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFooter.java new file mode 100644 index 000000000..38c65bdde --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFooter.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc; + +import static java.util.Arrays.asList; +import static org.junit.Assert.assertEquals; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.flatbuf.Footer; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.message.ArrowFooter; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.Test; + +import com.google.flatbuffers.FlatBufferBuilder; + +public class TestArrowFooter { + + @Test + public void test() { + Schema schema = new Schema(asList( + new Field("a", FieldType.nullable(new ArrowType.Int(8, true)), Collections.emptyList()) + )); + ArrowFooter footer = + new ArrowFooter(schema, Collections.emptyList(), Collections.emptyList()); + ArrowFooter newFooter = roundTrip(footer); + assertEquals(footer, newFooter); + + List ids = new ArrayList<>(); + ids.add(new ArrowBlock(0, 1, 2)); + ids.add(new ArrowBlock(4, 5, 6)); + footer = new ArrowFooter(schema, ids, ids); + assertEquals(footer, roundTrip(footer)); + } + + + private ArrowFooter roundTrip(ArrowFooter footer) { + FlatBufferBuilder builder = new FlatBufferBuilder(); + int i = footer.writeTo(builder); + builder.finish(i); + ByteBuffer dataBuffer = builder.dataBuffer(); + ArrowFooter newFooter = new ArrowFooter(Footer.getRootAsFooter(dataBuffer)); + return newFooter; + } + +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowReaderWriter.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowReaderWriter.java new file mode 100644 index 000000000..1167819de --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowReaderWriter.java @@ -0,0 +1,882 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc; + +import static java.nio.channels.Channels.newChannel; +import static java.util.Arrays.asList; +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; +import static org.apache.arrow.vector.TestUtils.newVarCharVector; +import static org.apache.arrow.vector.TestUtils.newVector; +import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.Channels; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.function.BiFunction; +import java.util.stream.Collectors; + +import org.apache.arrow.flatbuf.FieldNode; +import 
org.apache.arrow.flatbuf.Message; +import org.apache.arrow.flatbuf.RecordBatch; +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.util.AutoCloseables; +import org.apache.arrow.util.Collections2; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.TestUtils; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorLoader; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.VectorUnloader; +import org.apache.arrow.vector.compare.Range; +import org.apache.arrow.vector.compare.RangeEqualsVisitor; +import org.apache.arrow.vector.compare.TypeEqualsVisitor; +import org.apache.arrow.vector.compare.VectorEqualsVisitor; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryEncoder; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.message.ArrowDictionaryBatch; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.ipc.message.IpcOption; +import org.apache.arrow.vector.ipc.message.MessageSerializer; +import org.apache.arrow.vector.types.MetadataVersion; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel; 
+import org.apache.arrow.vector.util.DictionaryUtility; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestArrowReaderWriter { + + private BufferAllocator allocator; + + private VarCharVector dictionaryVector1; + private VarCharVector dictionaryVector2; + private VarCharVector dictionaryVector3; + private StructVector dictionaryVector4; + + private Dictionary dictionary1; + private Dictionary dictionary2; + private Dictionary dictionary3; + private Dictionary dictionary4; + + private Schema schema; + private Schema encodedSchema; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + + dictionaryVector1 = newVarCharVector("D1", allocator); + setVector(dictionaryVector1, + "foo".getBytes(StandardCharsets.UTF_8), + "bar".getBytes(StandardCharsets.UTF_8), + "baz".getBytes(StandardCharsets.UTF_8)); + + dictionaryVector2 = newVarCharVector("D2", allocator); + setVector(dictionaryVector2, + "aa".getBytes(StandardCharsets.UTF_8), + "bb".getBytes(StandardCharsets.UTF_8), + "cc".getBytes(StandardCharsets.UTF_8)); + + dictionaryVector3 = newVarCharVector("D3", allocator); + setVector(dictionaryVector3, + "foo".getBytes(StandardCharsets.UTF_8), + "bar".getBytes(StandardCharsets.UTF_8), + "baz".getBytes(StandardCharsets.UTF_8), + "aa".getBytes(StandardCharsets.UTF_8), + "bb".getBytes(StandardCharsets.UTF_8), + "cc".getBytes(StandardCharsets.UTF_8)); + + dictionaryVector4 = newVector(StructVector.class, "D4", MinorType.STRUCT, allocator); + final Map> dictionaryValues4 = new HashMap<>(); + dictionaryValues4.put("a", Arrays.asList(1, 2, 3)); + dictionaryValues4.put("b", Arrays.asList(4, 5, 6)); + setVector(dictionaryVector4, dictionaryValues4); + + dictionary1 = new Dictionary(dictionaryVector1, + new DictionaryEncoding(/*id=*/1L, /*ordered=*/false, /*indexType=*/null)); + dictionary2 = new Dictionary(dictionaryVector2, + new DictionaryEncoding(/*id=*/2L, /*ordered=*/false, /*indexType=*/null)); + 
dictionary3 = new Dictionary(dictionaryVector3, + new DictionaryEncoding(/*id=*/1L, /*ordered=*/false, /*indexType=*/null)); + dictionary4 = new Dictionary(dictionaryVector4, + new DictionaryEncoding(/*id=*/3L, /*ordered=*/false, /*indexType=*/null)); + } + + @After + public void terminate() throws Exception { + dictionaryVector1.close(); + dictionaryVector2.close(); + dictionaryVector3.close(); + dictionaryVector4.close(); + allocator.close(); + } + + ArrowBuf buf(byte[] bytes) { + ArrowBuf buffer = allocator.buffer(bytes.length); + buffer.writeBytes(bytes); + return buffer; + } + + byte[] array(ArrowBuf buf) { + byte[] bytes = new byte[checkedCastToInt(buf.readableBytes())]; + buf.readBytes(bytes); + return bytes; + } + + @Test + public void test() throws IOException { + Schema schema = new Schema(asList(new Field("testField", FieldType.nullable(new ArrowType.Int(8, true)), + Collections.emptyList()))); + ArrowType type = schema.getFields().get(0).getType(); + FieldVector vector = TestUtils.newVector(FieldVector.class, "testField", type, allocator); + vector.initializeChildrenFromFields(schema.getFields().get(0).getChildren()); + + byte[] validity = new byte[] {(byte) 255, 0}; + // second half is "undefined" + byte[] values = new byte[] {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + try (VectorSchemaRoot root = new VectorSchemaRoot(schema.getFields(), asList(vector), 16); + ArrowFileWriter writer = new ArrowFileWriter(root, null, newChannel(out))) { + ArrowBuf validityb = buf(validity); + ArrowBuf valuesb = buf(values); + ArrowRecordBatch batch = new ArrowRecordBatch(16, asList(new ArrowFieldNode(16, 8)), asList(validityb, valuesb)); + VectorLoader loader = new VectorLoader(root); + loader.load(batch); + writer.writeBatch(); + + validityb.close(); + valuesb.close(); + batch.close(); + } + + byte[] byteArray = out.toByteArray(); + + try (SeekableReadChannel channel = new 
SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(byteArray)); + ArrowFileReader reader = new ArrowFileReader(channel, allocator)) { + Schema readSchema = reader.getVectorSchemaRoot().getSchema(); + assertEquals(schema, readSchema); + // TODO: dictionaries + List recordBatches = reader.getRecordBlocks(); + assertEquals(1, recordBatches.size()); + reader.loadNextBatch(); + VectorUnloader unloader = new VectorUnloader(reader.getVectorSchemaRoot()); + ArrowRecordBatch recordBatch = unloader.getRecordBatch(); + List nodes = recordBatch.getNodes(); + assertEquals(1, nodes.size()); + ArrowFieldNode node = nodes.get(0); + assertEquals(16, node.getLength()); + assertEquals(8, node.getNullCount()); + List buffers = recordBatch.getBuffers(); + assertEquals(2, buffers.size()); + assertArrayEquals(validity, array(buffers.get(0))); + assertArrayEquals(values, array(buffers.get(1))); + + // Read just the header. This demonstrates being able to read without need to + // deserialize the buffer. 
+ ByteBuffer headerBuffer = ByteBuffer.allocate(recordBatches.get(0).getMetadataLength()); + headerBuffer.put(byteArray, (int) recordBatches.get(0).getOffset(), headerBuffer.capacity()); + // new format prefix_size ==8 + headerBuffer.position(8); + Message messageFB = Message.getRootAsMessage(headerBuffer); + RecordBatch recordBatchFB = (RecordBatch) messageFB.header(new RecordBatch()); + assertEquals(2, recordBatchFB.buffersLength()); + assertEquals(1, recordBatchFB.nodesLength()); + FieldNode nodeFB = recordBatchFB.nodes(0); + assertEquals(16, nodeFB.length()); + assertEquals(8, nodeFB.nullCount()); + + recordBatch.close(); + } + } + + @Test + public void testWriteReadNullVector() throws IOException { + + int valueCount = 3; + + NullVector nullVector = new NullVector("vector"); + nullVector.setValueCount(valueCount); + + Schema schema = new Schema(asList(nullVector.getField())); + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + try (VectorSchemaRoot root = new VectorSchemaRoot(schema.getFields(), asList(nullVector), valueCount); + ArrowFileWriter writer = new ArrowFileWriter(root, null, newChannel(out))) { + ArrowRecordBatch batch = new ArrowRecordBatch(valueCount, + asList(new ArrowFieldNode(valueCount, 0)), + Collections.emptyList()); + VectorLoader loader = new VectorLoader(root); + loader.load(batch); + writer.writeBatch(); + } + + byte[] byteArray = out.toByteArray(); + + try (SeekableReadChannel channel = new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(byteArray)); + ArrowFileReader reader = new ArrowFileReader(channel, allocator)) { + Schema readSchema = reader.getVectorSchemaRoot().getSchema(); + assertEquals(schema, readSchema); + List recordBatches = reader.getRecordBlocks(); + assertEquals(1, recordBatches.size()); + + assertTrue(reader.loadNextBatch()); + assertEquals(1, reader.getVectorSchemaRoot().getFieldVectors().size()); + + NullVector readNullVector = (NullVector) 
reader.getVectorSchemaRoot().getFieldVectors().get(0); + assertEquals(valueCount, readNullVector.getValueCount()); + } + } + + @Test + public void testWriteReadWithDictionaries() throws IOException { + DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); + provider.put(dictionary1); + + VarCharVector vector1 = newVarCharVector("varchar1", allocator); + vector1.allocateNewSafe(); + vector1.set(0, "foo".getBytes(StandardCharsets.UTF_8)); + vector1.set(1, "bar".getBytes(StandardCharsets.UTF_8)); + vector1.set(3, "baz".getBytes(StandardCharsets.UTF_8)); + vector1.set(4, "bar".getBytes(StandardCharsets.UTF_8)); + vector1.set(5, "baz".getBytes(StandardCharsets.UTF_8)); + vector1.setValueCount(6); + FieldVector encodedVector1 = (FieldVector) DictionaryEncoder.encode(vector1, dictionary1); + vector1.close(); + + VarCharVector vector2 = newVarCharVector("varchar2", allocator); + vector2.allocateNewSafe(); + vector2.set(0, "bar".getBytes(StandardCharsets.UTF_8)); + vector2.set(1, "baz".getBytes(StandardCharsets.UTF_8)); + vector2.set(2, "foo".getBytes(StandardCharsets.UTF_8)); + vector2.set(3, "foo".getBytes(StandardCharsets.UTF_8)); + vector2.set(4, "foo".getBytes(StandardCharsets.UTF_8)); + vector2.set(5, "bar".getBytes(StandardCharsets.UTF_8)); + vector2.setValueCount(6); + FieldVector encodedVector2 = (FieldVector) DictionaryEncoder.encode(vector2, dictionary1); + vector2.close(); + + List fields = Arrays.asList(encodedVector1.getField(), encodedVector2.getField()); + List vectors = Collections2.asImmutableList(encodedVector1, encodedVector2); + try (VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors, encodedVector1.getValueCount()); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowFileWriter writer = new ArrowFileWriter(root, provider, newChannel(out));) { + + writer.start(); + writer.writeBatch(); + writer.end(); + + try (SeekableReadChannel channel = new SeekableReadChannel( + new 
ByteArrayReadableSeekableByteChannel(out.toByteArray())); + ArrowFileReader reader = new ArrowFileReader(channel, allocator)) { + Schema readSchema = reader.getVectorSchemaRoot().getSchema(); + assertEquals(root.getSchema(), readSchema); + assertEquals(1, reader.getDictionaryBlocks().size()); + assertEquals(1, reader.getRecordBlocks().size()); + + reader.loadNextBatch(); + assertEquals(2, reader.getVectorSchemaRoot().getFieldVectors().size()); + } + } + } + + @Test + public void testWriteReadWithStructDictionaries() throws IOException { + DictionaryProvider.MapDictionaryProvider provider = + new DictionaryProvider.MapDictionaryProvider(); + provider.put(dictionary4); + + try (final StructVector vector = + newVector(StructVector.class, "D4", MinorType.STRUCT, allocator)) { + final Map> values = new HashMap<>(); + // Index: 0, 2, 1, 2, 1, 0, 0 + values.put("a", Arrays.asList(1, 3, 2, 3, 2, 1, 1)); + values.put("b", Arrays.asList(4, 6, 5, 6, 5, 4, 4)); + setVector(vector, values); + FieldVector encodedVector = (FieldVector) DictionaryEncoder.encode(vector, dictionary4); + + List fields = Arrays.asList(encodedVector.getField()); + List vectors = Collections2.asImmutableList(encodedVector); + try ( + VectorSchemaRoot root = + new VectorSchemaRoot(fields, vectors, encodedVector.getValueCount()); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowFileWriter writer = new ArrowFileWriter(root, provider, newChannel(out));) { + + writer.start(); + writer.writeBatch(); + writer.end(); + + try ( + SeekableReadChannel channel = new SeekableReadChannel( + new ByteArrayReadableSeekableByteChannel(out.toByteArray())); + ArrowFileReader reader = new ArrowFileReader(channel, allocator)) { + final VectorSchemaRoot readRoot = reader.getVectorSchemaRoot(); + final Schema readSchema = readRoot.getSchema(); + assertEquals(root.getSchema(), readSchema); + assertEquals(1, reader.getDictionaryBlocks().size()); + assertEquals(1, reader.getRecordBlocks().size()); + + 
reader.loadNextBatch(); + assertEquals(1, readRoot.getFieldVectors().size()); + assertEquals(1, reader.getDictionaryVectors().size()); + + // Read the encoded vector and check it + final FieldVector readEncoded = readRoot.getVector(0); + assertEquals(encodedVector.getValueCount(), readEncoded.getValueCount()); + assertTrue(new RangeEqualsVisitor(encodedVector, readEncoded) + .rangeEquals(new Range(0, 0, encodedVector.getValueCount()))); + + // Read the dictionary + final Map readDictionaryMap = reader.getDictionaryVectors(); + final Dictionary readDictionary = + readDictionaryMap.get(readEncoded.getField().getDictionary().getId()); + assertNotNull(readDictionary); + + // Assert the dictionary vector is correct + final FieldVector readDictionaryVector = readDictionary.getVector(); + assertEquals(dictionaryVector4.getValueCount(), readDictionaryVector.getValueCount()); + final BiFunction typeComparatorIgnoreName = + (v1, v2) -> new TypeEqualsVisitor(v1, false, true).equals(v2); + assertTrue("Dictionary vectors are not equal", + new RangeEqualsVisitor(dictionaryVector4, readDictionaryVector, + typeComparatorIgnoreName) + .rangeEquals(new Range(0, 0, dictionaryVector4.getValueCount()))); + + // Assert the decoded vector is correct + try (final ValueVector readVector = + DictionaryEncoder.decode(readEncoded, readDictionary)) { + assertEquals(vector.getValueCount(), readVector.getValueCount()); + assertTrue("Decoded vectors are not equal", + new RangeEqualsVisitor(vector, readVector, typeComparatorIgnoreName) + .rangeEquals(new Range(0, 0, vector.getValueCount()))); + } + } + } + } + } + + @Test + public void testEmptyStreamInFileIPC() throws IOException { + + DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); + provider.put(dictionary1); + + VarCharVector vector = newVarCharVector("varchar", allocator); + vector.allocateNewSafe(); + vector.set(0, "foo".getBytes(StandardCharsets.UTF_8)); + vector.set(1, 
"bar".getBytes(StandardCharsets.UTF_8)); + vector.set(3, "baz".getBytes(StandardCharsets.UTF_8)); + vector.set(4, "bar".getBytes(StandardCharsets.UTF_8)); + vector.set(5, "baz".getBytes(StandardCharsets.UTF_8)); + vector.setValueCount(6); + + FieldVector encodedVector1A = (FieldVector) DictionaryEncoder.encode(vector, dictionary1); + vector.close(); + + List fields = Arrays.asList(encodedVector1A.getField()); + List vectors = Collections2.asImmutableList(encodedVector1A); + + try (VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors, encodedVector1A.getValueCount()); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowFileWriter writer = new ArrowFileWriter(root, provider, newChannel(out))) { + + writer.start(); + writer.end(); + + try (SeekableReadChannel channel = new SeekableReadChannel( + new ByteArrayReadableSeekableByteChannel(out.toByteArray())); + ArrowFileReader reader = new ArrowFileReader(channel, allocator)) { + Schema readSchema = reader.getVectorSchemaRoot().getSchema(); + assertEquals(root.getSchema(), readSchema); + assertEquals(1, reader.getDictionaryVectors().size()); + assertEquals(0, reader.getDictionaryBlocks().size()); + assertEquals(0, reader.getRecordBlocks().size()); + } + } + + } + + @Test + public void testEmptyStreamInStreamingIPC() throws IOException { + + DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); + provider.put(dictionary1); + + VarCharVector vector = newVarCharVector("varchar", allocator); + vector.allocateNewSafe(); + vector.set(0, "foo".getBytes(StandardCharsets.UTF_8)); + vector.set(1, "bar".getBytes(StandardCharsets.UTF_8)); + vector.set(3, "baz".getBytes(StandardCharsets.UTF_8)); + vector.set(4, "bar".getBytes(StandardCharsets.UTF_8)); + vector.set(5, "baz".getBytes(StandardCharsets.UTF_8)); + vector.setValueCount(6); + + FieldVector encodedVector = (FieldVector) DictionaryEncoder.encode(vector, dictionary1); + vector.close(); + + List fields = 
Arrays.asList(encodedVector.getField()); + try (VectorSchemaRoot root = + new VectorSchemaRoot(fields, Arrays.asList(encodedVector), encodedVector.getValueCount()); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowStreamWriter writer = new ArrowStreamWriter(root, provider, newChannel(out))) { + + writer.start(); + writer.end(); + + + try (ArrowStreamReader reader = new ArrowStreamReader( + new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator)) { + Schema readSchema = reader.getVectorSchemaRoot().getSchema(); + assertEquals(root.getSchema(), readSchema); + assertEquals(1, reader.getDictionaryVectors().size()); + assertFalse(reader.loadNextBatch()); + } + } + + } + + @Test + public void testDictionaryReplacement() throws Exception { + VarCharVector vector1 = newVarCharVector("varchar1", allocator); + setVector(vector1, + "foo".getBytes(StandardCharsets.UTF_8), + "bar".getBytes(StandardCharsets.UTF_8), + "baz".getBytes(StandardCharsets.UTF_8), + "bar".getBytes(StandardCharsets.UTF_8)); + + FieldVector encodedVector1 = (FieldVector) DictionaryEncoder.encode(vector1, dictionary1); + + VarCharVector vector2 = newVarCharVector("varchar2", allocator); + setVector(vector2, + "foo".getBytes(StandardCharsets.UTF_8), + "foo".getBytes(StandardCharsets.UTF_8), + "foo".getBytes(StandardCharsets.UTF_8), + "foo".getBytes(StandardCharsets.UTF_8)); + + FieldVector encodedVector2 = (FieldVector) DictionaryEncoder.encode(vector2, dictionary1); + + DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); + provider.put(dictionary1); + List schemaFields = new ArrayList<>(); + schemaFields.add(DictionaryUtility.toMessageFormat(encodedVector1.getField(), provider, new HashSet<>())); + schemaFields.add(DictionaryUtility.toMessageFormat(encodedVector2.getField(), provider, new HashSet<>())); + Schema schema = new Schema(schemaFields); + + ByteArrayOutputStream outStream = new ByteArrayOutputStream(); + WriteChannel 
out = new WriteChannel(newChannel(outStream)); + + // write schema + MessageSerializer.serialize(out, schema); + + List closeableList = new ArrayList<>(); + + // write non-delta dictionary with id=1 + serializeDictionaryBatch(out, dictionary3, false, closeableList); + + // write non-delta dictionary with id=1 + serializeDictionaryBatch(out, dictionary1, false, closeableList); + + // write recordBatch2 + serializeRecordBatch(out, Arrays.asList(encodedVector1, encodedVector2), closeableList); + + // write eos + out.writeIntLittleEndian(0); + + try (ArrowStreamReader reader = new ArrowStreamReader( + new ByteArrayReadableSeekableByteChannel(outStream.toByteArray()), allocator)) { + assertEquals(1, reader.getDictionaryVectors().size()); + assertTrue(reader.loadNextBatch()); + FieldVector dictionaryVector = reader.getDictionaryVectors().get(1L).getVector(); + // make sure the delta dictionary is concatenated. + assertTrue(VectorEqualsVisitor.vectorEquals(dictionaryVector, dictionaryVector1, null)); + assertFalse(reader.loadNextBatch()); + } + + vector1.close(); + vector2.close(); + AutoCloseables.close(closeableList); + } + + @Test + public void testDeltaDictionary() throws Exception { + VarCharVector vector1 = newVarCharVector("varchar1", allocator); + setVector(vector1, + "foo".getBytes(StandardCharsets.UTF_8), + "bar".getBytes(StandardCharsets.UTF_8), + "baz".getBytes(StandardCharsets.UTF_8), + "bar".getBytes(StandardCharsets.UTF_8)); + + FieldVector encodedVector1 = (FieldVector) DictionaryEncoder.encode(vector1, dictionary1); + + VarCharVector vector2 = newVarCharVector("varchar2", allocator); + setVector(vector2, + "foo".getBytes(StandardCharsets.UTF_8), + "aa".getBytes(StandardCharsets.UTF_8), + "bb".getBytes(StandardCharsets.UTF_8), + "cc".getBytes(StandardCharsets.UTF_8)); + + FieldVector encodedVector2 = (FieldVector) DictionaryEncoder.encode(vector2, dictionary3); + + DictionaryProvider.MapDictionaryProvider provider = new 
DictionaryProvider.MapDictionaryProvider(); + provider.put(dictionary1); + provider.put(dictionary3); + List schemaFields = new ArrayList<>(); + schemaFields.add(DictionaryUtility.toMessageFormat(encodedVector1.getField(), provider, new HashSet<>())); + schemaFields.add(DictionaryUtility.toMessageFormat(encodedVector2.getField(), provider, new HashSet<>())); + Schema schema = new Schema(schemaFields); + + ByteArrayOutputStream outStream = new ByteArrayOutputStream(); + WriteChannel out = new WriteChannel(newChannel(outStream)); + + // write schema + MessageSerializer.serialize(out, schema); + + List closeableList = new ArrayList<>(); + + // write non-delta dictionary with id=1 + serializeDictionaryBatch(out, dictionary1, false, closeableList); + + // write delta dictionary with id=1 + Dictionary deltaDictionary = + new Dictionary(dictionaryVector2, new DictionaryEncoding(1L, false, null)); + serializeDictionaryBatch(out, deltaDictionary, true, closeableList); + deltaDictionary.getVector().close(); + + // write recordBatch2 + serializeRecordBatch(out, Arrays.asList(encodedVector1, encodedVector2), closeableList); + + // write eos + out.writeIntLittleEndian(0); + + try (ArrowStreamReader reader = new ArrowStreamReader( + new ByteArrayReadableSeekableByteChannel(outStream.toByteArray()), allocator)) { + assertEquals(1, reader.getDictionaryVectors().size()); + assertTrue(reader.loadNextBatch()); + FieldVector dictionaryVector = reader.getDictionaryVectors().get(1L).getVector(); + // make sure the delta dictionary is concatenated. 
+ assertTrue(VectorEqualsVisitor.vectorEquals(dictionaryVector, dictionaryVector3, null)); + assertFalse(reader.loadNextBatch()); + } + + vector1.close(); + vector2.close(); + AutoCloseables.close(closeableList); + + } + + private void serializeDictionaryBatch( + WriteChannel out, + Dictionary dictionary, + boolean isDelta, + List closeables) throws IOException { + + FieldVector dictVector = dictionary.getVector(); + VectorSchemaRoot root = new VectorSchemaRoot( + Collections.singletonList(dictVector.getField()), + Collections.singletonList(dictVector), + dictVector.getValueCount()); + ArrowDictionaryBatch batch = + new ArrowDictionaryBatch(dictionary.getEncoding().getId(), new VectorUnloader(root).getRecordBatch(), isDelta); + MessageSerializer.serialize(out, batch); + closeables.add(batch); + closeables.add(root); + } + + private void serializeRecordBatch( + WriteChannel out, + List vectors, + List closeables) throws IOException { + + List fields = vectors.stream().map(v -> v.getField()).collect(Collectors.toList()); + VectorSchemaRoot root = new VectorSchemaRoot( + fields, + vectors, + vectors.get(0).getValueCount()); + VectorUnloader unloader = new VectorUnloader(root); + ArrowRecordBatch batch = unloader.getRecordBatch(); + MessageSerializer.serialize(out, batch); + closeables.add(batch); + closeables.add(root); + } + + @Test + public void testReadInterleavedData() throws IOException { + List batches = createRecordBatches(); + + ByteArrayOutputStream outStream = new ByteArrayOutputStream(); + WriteChannel out = new WriteChannel(newChannel(outStream)); + + // write schema + MessageSerializer.serialize(out, schema); + + // write dictionary1 + FieldVector dictVector1 = dictionary1.getVector(); + VectorSchemaRoot dictRoot1 = new VectorSchemaRoot( + Collections.singletonList(dictVector1.getField()), + Collections.singletonList(dictVector1), + dictVector1.getValueCount()); + ArrowDictionaryBatch dictionaryBatch1 = + new ArrowDictionaryBatch(1, new 
VectorUnloader(dictRoot1).getRecordBatch()); + MessageSerializer.serialize(out, dictionaryBatch1); + dictionaryBatch1.close(); + dictRoot1.close(); + + // write recordBatch1 + MessageSerializer.serialize(out, batches.get(0)); + + // write dictionary2 + FieldVector dictVector2 = dictionary2.getVector(); + VectorSchemaRoot dictRoot2 = new VectorSchemaRoot( + Collections.singletonList(dictVector2.getField()), + Collections.singletonList(dictVector2), + dictVector2.getValueCount()); + ArrowDictionaryBatch dictionaryBatch2 = + new ArrowDictionaryBatch(2, new VectorUnloader(dictRoot2).getRecordBatch()); + MessageSerializer.serialize(out, dictionaryBatch2); + dictionaryBatch2.close(); + dictRoot2.close(); + + // write recordBatch1 + MessageSerializer.serialize(out, batches.get(1)); + + // write eos + out.writeIntLittleEndian(0); + + try (ArrowStreamReader reader = new ArrowStreamReader( + new ByteArrayReadableSeekableByteChannel(outStream.toByteArray()), allocator)) { + Schema readSchema = reader.getVectorSchemaRoot().getSchema(); + assertEquals(encodedSchema, readSchema); + assertEquals(2, reader.getDictionaryVectors().size()); + assertTrue(reader.loadNextBatch()); + assertTrue(reader.loadNextBatch()); + assertFalse(reader.loadNextBatch()); + } + + batches.forEach(batch -> batch.close()); + } + + private List createRecordBatches() { + List batches = new ArrayList<>(); + + DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); + provider.put(dictionary1); + provider.put(dictionary2); + + VarCharVector vectorA1 = newVarCharVector("varcharA1", allocator); + vectorA1.allocateNewSafe(); + vectorA1.set(0, "foo".getBytes(StandardCharsets.UTF_8)); + vectorA1.set(1, "bar".getBytes(StandardCharsets.UTF_8)); + vectorA1.set(3, "baz".getBytes(StandardCharsets.UTF_8)); + vectorA1.set(4, "bar".getBytes(StandardCharsets.UTF_8)); + vectorA1.set(5, "baz".getBytes(StandardCharsets.UTF_8)); + vectorA1.setValueCount(6); + + VarCharVector vectorA2 
= newVarCharVector("varcharA2", allocator); + vectorA2.setValueCount(6); + FieldVector encodedVectorA1 = (FieldVector) DictionaryEncoder.encode(vectorA1, dictionary1); + vectorA1.close(); + FieldVector encodedVectorA2 = (FieldVector) DictionaryEncoder.encode(vectorA1, dictionary2); + vectorA2.close(); + + List fields = Arrays.asList(encodedVectorA1.getField(), encodedVectorA2.getField()); + List vectors = Collections2.asImmutableList(encodedVectorA1, encodedVectorA2); + VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors, encodedVectorA1.getValueCount()); + VectorUnloader unloader = new VectorUnloader(root); + batches.add(unloader.getRecordBatch()); + root.close(); + + VarCharVector vectorB1 = newVarCharVector("varcharB1", allocator); + vectorB1.setValueCount(6); + + VarCharVector vectorB2 = newVarCharVector("varcharB2", allocator); + vectorB2.allocateNew(); + vectorB2.setValueCount(6); + vectorB2.set(0, "aa".getBytes(StandardCharsets.UTF_8)); + vectorB2.set(1, "aa".getBytes(StandardCharsets.UTF_8)); + vectorB2.set(3, "bb".getBytes(StandardCharsets.UTF_8)); + vectorB2.set(4, "bb".getBytes(StandardCharsets.UTF_8)); + vectorB2.set(5, "cc".getBytes(StandardCharsets.UTF_8)); + vectorB2.setValueCount(6); + FieldVector encodedVectorB1 = (FieldVector) DictionaryEncoder.encode(vectorB1, dictionary1); + vectorB1.close(); + FieldVector encodedVectorB2 = (FieldVector) DictionaryEncoder.encode(vectorB2, dictionary2); + vectorB2.close(); + + List fieldsB = Arrays.asList(encodedVectorB1.getField(), encodedVectorB2.getField()); + List vectorsB = Collections2.asImmutableList(encodedVectorB1, encodedVectorB2); + VectorSchemaRoot rootB = new VectorSchemaRoot(fieldsB, vectorsB, 6); + VectorUnloader unloaderB = new VectorUnloader(rootB); + batches.add(unloaderB.getRecordBatch()); + rootB.close(); + + List schemaFields = new ArrayList<>(); + schemaFields.add(DictionaryUtility.toMessageFormat(encodedVectorA1.getField(), provider, new HashSet<>())); + 
schemaFields.add(DictionaryUtility.toMessageFormat(encodedVectorA2.getField(), provider, new HashSet<>())); + schema = new Schema(schemaFields); + + encodedSchema = new Schema(Arrays.asList(encodedVectorA1.getField(), encodedVectorA2.getField())); + + return batches; + } + + @Test + public void testLegacyIpcBackwardsCompatibility() throws Exception { + Schema schema = new Schema(asList(Field.nullable("field", new ArrowType.Int(32, true)))); + IntVector vector = new IntVector("vector", allocator); + final int valueCount = 2; + vector.setValueCount(valueCount); + vector.setSafe(0, 1); + vector.setSafe(1, 2); + ArrowRecordBatch batch = new ArrowRecordBatch(valueCount, asList(new ArrowFieldNode(valueCount, 0)), + asList(vector.getValidityBuffer(), vector.getDataBuffer())); + + ByteArrayOutputStream outStream = new ByteArrayOutputStream(); + WriteChannel out = new WriteChannel(newChannel(outStream)); + + // write legacy ipc format + IpcOption option = new IpcOption(true, MetadataVersion.DEFAULT); + MessageSerializer.serialize(out, schema, option); + MessageSerializer.serialize(out, batch); + + ReadChannel in = new ReadChannel(newChannel(new ByteArrayInputStream(outStream.toByteArray()))); + Schema readSchema = MessageSerializer.deserializeSchema(in); + assertEquals(schema, readSchema); + ArrowRecordBatch readBatch = MessageSerializer.deserializeRecordBatch(in, allocator); + assertEquals(batch.getLength(), readBatch.getLength()); + assertEquals(batch.computeBodyLength(), readBatch.computeBodyLength()); + readBatch.close(); + + // write ipc format with continuation + option = IpcOption.DEFAULT; + MessageSerializer.serialize(out, schema, option); + MessageSerializer.serialize(out, batch); + + ReadChannel in2 = new ReadChannel(newChannel(new ByteArrayInputStream(outStream.toByteArray()))); + Schema readSchema2 = MessageSerializer.deserializeSchema(in2); + assertEquals(schema, readSchema2); + ArrowRecordBatch readBatch2 = MessageSerializer.deserializeRecordBatch(in2, 
allocator); + assertEquals(batch.getLength(), readBatch2.getLength()); + assertEquals(batch.computeBodyLength(), readBatch2.computeBodyLength()); + readBatch2.close(); + + batch.close(); + vector.close(); + } + + @Test + public void testChannelReadFully() throws IOException { + final ByteBuffer buf = ByteBuffer.allocate(4).order(ByteOrder.nativeOrder()); + buf.putInt(200); + buf.rewind(); + + try (ReadChannel channel = new ReadChannel(Channels.newChannel(new ByteArrayInputStream(buf.array()))); + ArrowBuf arrBuf = allocator.buffer(8)) { + arrBuf.setInt(0, 100); + arrBuf.writerIndex(4); + assertEquals(4, arrBuf.writerIndex()); + + long n = channel.readFully(arrBuf, 4); + assertEquals(4, n); + assertEquals(8, arrBuf.writerIndex()); + + assertEquals(100, arrBuf.getInt(0)); + assertEquals(200, arrBuf.getInt(4)); + } + } + + @Test + public void testChannelReadFullyEos() throws IOException { + final ByteBuffer buf = ByteBuffer.allocate(4).order(ByteOrder.nativeOrder()); + buf.putInt(10); + buf.rewind(); + + try (ReadChannel channel = new ReadChannel(Channels.newChannel(new ByteArrayInputStream(buf.array()))); + ArrowBuf arrBuf = allocator.buffer(8)) { + int n = channel.readFully(arrBuf.nioBuffer(0, 8)); + assertEquals(4, n); + + // the input has only 4 bytes, so the number of bytes read should be 4 + assertEquals(4, channel.bytesRead()); + + // the first 4 bytes have been read successfully. 
+ assertEquals(10, arrBuf.getInt(0)); + } + } + + @Test + public void testCustomMetaData() throws IOException { + + VarCharVector vector = newVarCharVector("varchar1", allocator); + + List fields = Arrays.asList(vector.getField()); + List vectors = Collections2.asImmutableList(vector); + Map metadata = new HashMap<>(); + metadata.put("key1", "value1"); + metadata.put("key2", "value2"); + try (VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors, vector.getValueCount()); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowFileWriter writer = new ArrowFileWriter(root, null, newChannel(out), metadata);) { + + writer.start(); + writer.end(); + + try (SeekableReadChannel channel = new SeekableReadChannel( + new ByteArrayReadableSeekableByteChannel(out.toByteArray())); + ArrowFileReader reader = new ArrowFileReader(channel, allocator)) { + reader.getVectorSchemaRoot(); + + Map readMeta = reader.getMetaData(); + assertEquals(2, readMeta.size()); + assertEquals("value1", readMeta.get("key1")); + assertEquals("value2", readMeta.get("key2")); + } + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowStream.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowStream.java new file mode 100644 index 000000000..9348cd3a6 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowStream.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */

package org.apache.arrow.vector.ipc;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.channels.Channels;
import java.util.Collections;

import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.TinyIntVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.types.pojo.Schema;
import org.junit.Assert;
import org.junit.Test;

/**
 * Round-trip tests for {@link ArrowStreamWriter} and {@link ArrowStreamReader} over in-memory
 * byte streams.
 */
public class TestArrowStream extends BaseFileTest {

  /**
   * A stream that is closed without any batch being written must still be non-empty, expose
   * the schema to a reader, and report no batches.
   *
   * @throws IOException if writing or reading the stream fails
   */
  @Test
  public void testEmptyStream() throws IOException {
    Schema schema = MessageSerializerTest.testSchema();
    ByteArrayOutputStream out = new ByteArrayOutputStream();

    // Write the stream. The root is now released once the writer is done with it; previously
    // it was never closed.
    try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) {
      ArrowStreamWriter writer = new ArrowStreamWriter(root, null, out);
      writer.close();
    }
    Assert.assertTrue(out.size() > 0);

    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
    try (ArrowStreamReader reader = new ArrowStreamReader(in, allocator)) {
      assertEquals(schema, reader.getVectorSchemaRoot().getSchema());
      // Empty should return false
      Assert.assertFalse(reader.loadNextBatch());
      assertEquals(0, reader.getVectorSchemaRoot().getRowCount());
      // Asking again after the end of the stream must keep returning false.
      Assert.assertFalse(reader.loadNextBatch());
      assertEquals(0, reader.getVectorSchemaRoot().getRowCount());
    }
  }

  /**
   * Writes a single batch with zero rows and checks that the reader loads it with a value
   * count and row count of zero.
   *
   * @throws IOException if writing or reading the stream fails
   */
  @Test
  public void testStreamZeroLengthBatch() throws IOException {
    ByteArrayOutputStream os = new ByteArrayOutputStream();

    try (IntVector vector = new IntVector("foo", allocator)) {
      Schema schema = new Schema(Collections.singletonList(vector.getField()));
      try (VectorSchemaRoot root =
               new VectorSchemaRoot(schema, Collections.singletonList(vector), vector.getValueCount());
           ArrowStreamWriter writer = new ArrowStreamWriter(root, null, Channels.newChannel(os))) {
        vector.setValueCount(0);
        root.setRowCount(0);
        writer.writeBatch();
        writer.end();
      }
    }

    ByteArrayInputStream in = new ByteArrayInputStream(os.toByteArray());

    try (ArrowStreamReader reader = new ArrowStreamReader(in, allocator)) {
      VectorSchemaRoot root = reader.getVectorSchemaRoot();
      IntVector vector = (IntVector) root.getFieldVectors().get(0);
      reader.loadNextBatch();
      // JUnit's contract is assertEquals(expected, actual); the original had them swapped,
      // which produces misleading failure messages.
      assertEquals(0, vector.getValueCount());
      assertEquals(0, root.getRowCount());
    }
  }

  /**
   * Writes one 16-row batch, the first 8 rows set and the rest null, then reads it back and
   * compares the schema and the byte counts reported by writer and reader.
   *
   * @throws IOException if writing or reading the stream fails
   */
  @Test
  public void testReadWrite() throws IOException {
    Schema schema = MessageSerializerTest.testSchema();
    try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) {
      int numBatches = 1;

      root.getFieldVectors().get(0).allocateNew();
      TinyIntVector vector = (TinyIntVector) root.getFieldVectors().get(0);
      for (int i = 0; i < 16; i++) {
        // The second argument is the isSet flag: rows 0-7 hold a value, rows 8-15 are null.
        vector.set(i, i < 8 ? 1 : 0, (byte) (i + 1));
      }
      vector.setValueCount(16);
      root.setRowCount(16);

      ByteArrayOutputStream out = new ByteArrayOutputStream();
      long bytesWritten = 0;
      try (ArrowStreamWriter writer = new ArrowStreamWriter(root, null, out)) {
        writer.start();
        for (int i = 0; i < numBatches; i++) {
          writer.writeBatch();
        }
        writer.end();
        bytesWritten = writer.bytesWritten();
      }

      ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
      try (ArrowStreamReader reader = new ArrowStreamReader(in, allocator)) {
        Schema readSchema = reader.getVectorSchemaRoot().getSchema();
        assertEquals(schema, readSchema);
        for (int i = 0; i < numBatches; i++) {
          assertTrue(reader.loadNextBatch());
        }
        // TODO figure out why reader isn't getting padding bytes
        assertEquals(bytesWritten, reader.bytesRead() + 8);
        assertFalse(reader.loadNextBatch());
        assertEquals(0, reader.getVectorSchemaRoot().getRowCount());
      }
    }
  }

  /**
   * Writes the batches produced by {@code writeBatchData} and checks them with
   * {@code validateBatchData}; both helpers are defined outside this class.
   *
   * @throws IOException if writing or reading the stream fails
   */
  @Test
  public void testReadWriteMultipleBatches() throws IOException {
    ByteArrayOutputStream os = new ByteArrayOutputStream();

    try (IntVector vector = new IntVector("foo", allocator)) {
      Schema schema = new Schema(Collections.singletonList(vector.getField()));
      try (VectorSchemaRoot root =
               new VectorSchemaRoot(schema, Collections.singletonList(vector), vector.getValueCount());
           ArrowStreamWriter writer = new ArrowStreamWriter(root, null, Channels.newChannel(os))) {
        writeBatchData(writer, vector, root);
      }
    }

    ByteArrayInputStream in = new ByteArrayInputStream(os.toByteArray());

    try (ArrowStreamReader reader = new ArrowStreamReader(in, allocator)) {
      IntVector vector = (IntVector) reader.getVectorSchemaRoot().getFieldVectors().get(0);
      validateBatchData(reader, vector);
    }
  }
}
diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowStreamPipe.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowStreamPipe.java new file mode
100644 index 000000000..422a63f57 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowStreamPipe.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */

package org.apache.arrow.vector.ipc;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.nio.channels.Pipe;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.TinyIntVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowStreamReader;
import org.apache.arrow.vector.ipc.ArrowStreamWriter;
import org.apache.arrow.vector.ipc.MessageSerializerTest;
import org.apache.arrow.vector.types.pojo.Schema;
import org.junit.Assert;
import org.junit.Test;

/**
 * Streams record batches from a writer thread to a reader thread through a
 * {@link java.nio.channels.Pipe}.
 *
 * <p>An exception or assertion failure thrown on a worker thread does not reach the JUnit
 * thread by itself, so each worker records its failure and {@link #pipeTest()} rethrows it
 * after joining.
 */
public class TestArrowStreamPipe {
  Schema schema = MessageSerializerTest.testSchema();
  BufferAllocator alloc = new RootAllocator(Long.MAX_VALUE);

  /** Producer: writes {@code numBatches} batches of 16 rows to the sink channel. */
  private final class WriterThread extends Thread {

    private final int numBatches;
    private final ArrowStreamWriter writer;
    private final VectorSchemaRoot root;
    private final WritableByteChannel sinkChannel;
    // Written on this thread, read by the test thread after join().
    private volatile Throwable failure;

    public WriterThread(int numBatches, WritableByteChannel sinkChannel)
        throws IOException {
      this.numBatches = numBatches;
      this.sinkChannel = sinkChannel;
      BufferAllocator allocator = alloc.newChildAllocator("writer thread", 0, Integer.MAX_VALUE);
      root = VectorSchemaRoot.create(schema, allocator);
      writer = new ArrowStreamWriter(root, null, sinkChannel);
    }

    @Override
    public void run() {
      try {
        writer.start();
        for (int j = 0; j < numBatches; j++) {
          root.getFieldVectors().get(0).allocateNew();
          TinyIntVector vector = (TinyIntVector) root.getFieldVectors().get(0);
          // Send a changing batch id first
          vector.set(0, j);
          for (int i = 1; i < 16; i++) {
            // isSet flag: rows 1-7 carry a value, rows 8-15 are null.
            vector.set(i, i < 8 ? 1 : 0, (byte) (i + 1));
          }
          vector.setValueCount(16);
          root.setRowCount(16);

          writer.writeBatch();
        }
        writer.close();
        root.close();
      } catch (IOException | RuntimeException | AssertionError e) {
        // Keep the failure for the test thread; failing here would only end this thread.
        failure = e;
        // Close the sink so a reader blocked on the pipe sees end-of-stream instead of hanging.
        try {
          sinkChannel.close();
        } catch (IOException closeFailure) {
          e.addSuppressed(closeFailure);
        }
      }
    }

    public long bytesWritten() {
      return writer.bytesWritten();
    }

    /** Returns what made {@link #run()} stop early, or {@code null} if it completed. */
    Throwable getFailure() {
      return failure;
    }
  }

  /** Consumer: reads batches from the source channel and validates each one as it loads. */
  private final class ReaderThread extends Thread {
    private int batchesRead = 0;
    private final ArrowStreamReader reader;
    private final BufferAllocator alloc = new RootAllocator(Long.MAX_VALUE);
    private boolean done = false;
    // Written on this thread, read by the test thread after join().
    private volatile Throwable failure;

    public ReaderThread(ReadableByteChannel sourceChannel)
        throws IOException {
      reader = new ArrowStreamReader(sourceChannel, alloc) {

        @Override
        public boolean loadNextBatch() throws IOException {
          if (super.loadNextBatch()) {
            batchesRead++;
          } else {
            done = true;
            return false;
          }
          VectorSchemaRoot root = getVectorSchemaRoot();
          Assert.assertEquals(16, root.getRowCount());
          TinyIntVector vector = (TinyIntVector) root.getFieldVectors().get(0);
          // Row 0 carries the zero-based batch id set by the writer.
          Assert.assertEquals((byte) (batchesRead - 1), vector.get(0));
          for (int i = 1; i < 16; i++) {
            if (i < 8) {
              Assert.assertEquals((byte) (i + 1), vector.get(i));
            } else {
              Assert.assertTrue(vector.isNull(i));
            }
          }

          return true;
        }
      };
    }

    @Override
    public void run() {
      try {
        assertEquals(schema, reader.getVectorSchemaRoot().getSchema());
        while (!done) {
          // loadNextBatch() returns true exactly when it did not set done.
          assertTrue(reader.loadNextBatch() != done);
        }
        reader.close();
      } catch (IOException | RuntimeException | AssertionError e) {
        // Keep the failure for the test thread; failing here would only end this thread.
        failure = e;
      }
    }

    public int getBatchesRead() {
      return batchesRead;
    }

    public long bytesRead() {
      return reader.bytesRead();
    }

    /** Returns what made {@link #run()} stop early, or {@code null} if it completed. */
    Throwable getFailure() {
      return failure;
    }
  }

  /**
   * Rethrows a worker-thread failure on the calling thread so JUnit reports it.
   *
   * @param threadName label used in the failure message
   * @param failure the recorded failure, or {@code null} if the worker completed normally
   */
  private static void assertNoFailure(String threadName, Throwable failure) {
    if (failure != null) {
      throw new AssertionError(threadName + " thread failed: " + failure, failure);
    }
  }

  // Starts up a producer and consumer thread to read/write batches.
  @Test
  public void pipeTest() throws IOException, InterruptedException {
    final int NUM_BATCHES = 10;
    Pipe pipe = Pipe.open();
    WriterThread writer = new WriterThread(NUM_BATCHES, pipe.sink());
    ReaderThread reader = new ReaderThread(pipe.source());

    writer.start();
    reader.start();
    reader.join();
    writer.join();

    // Report worker failures before comparing counters, which would otherwise hide the cause.
    assertNoFailure("writer", writer.getFailure());
    assertNoFailure("reader", reader.getFailure());

    assertEquals(NUM_BATCHES, reader.getBatchesRead());
    assertEquals(writer.bytesWritten(), reader.bytesRead());
  }
}
diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java new file mode 100644 index 000000000..f0aa226e2 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java @@ -0,0 +1,458 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.IOException; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; +import org.apache.arrow.vector.complex.writer.BaseWriter; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.dictionary.DictionaryProvider.MapDictionaryProvider; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.Validator; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class TestJSONFile extends BaseFileTest { + private static final Logger LOGGER = LoggerFactory.getLogger(TestJSONFile.class); + + @Test + public void testNoBatches() throws IOException { + File file = new File("target/no_batches.json"); + + try (BufferAllocator originalVectorAllocator = + allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + StructVector parent = StructVector.empty("parent", originalVectorAllocator)) { + BaseWriter.ComplexWriter writer = new ComplexWriterImpl("root", parent); + BaseWriter.StructWriter rootWriter = writer.rootAsStruct(); + rootWriter.integer("int"); 
+ rootWriter.uInt1("uint1"); + rootWriter.bigInt("bigInt"); + rootWriter.float4("float"); + JsonFileWriter jsonWriter = new JsonFileWriter(file, JsonFileWriter.config().pretty(true)); + jsonWriter.start(new VectorSchemaRoot(parent.getChild("root")).getSchema(), null); + jsonWriter.close(); + } + + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + JsonFileReader reader = new JsonFileReader(file, readerAllocator) + ) { + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + } + } + + @Test + public void testWriteRead() throws IOException { + File file = new File("target/mytest.json"); + int count = COUNT; + + // write + try (BufferAllocator originalVectorAllocator = + allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + StructVector parent = StructVector.empty("parent", originalVectorAllocator)) { + writeData(count, parent); + writeJSON(file, new VectorSchemaRoot(parent.getChild("root")), null); + } + + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + JsonFileReader reader = new JsonFileReader(file, readerAllocator) + ) { + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + try (VectorSchemaRoot root = reader.read();) { + validateContent(count, root); + } + } + } + + @Test + public void testWriteReadComplexJSON() throws IOException { + File file = new File("target/mytest_complex.json"); + int count = COUNT; + + // write + try ( + BufferAllocator originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + StructVector parent = StructVector.empty("parent", originalVectorAllocator)) { + writeComplexData(count, parent); + writeJSON(file, new VectorSchemaRoot(parent.getChild("root")), null); + } + + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, 
Integer.MAX_VALUE); + JsonFileReader reader = new JsonFileReader(file, readerAllocator); + ) { + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + try (VectorSchemaRoot root = reader.read();) { + validateComplexContent(count, root); + } + } + } + + @Test + public void testWriteComplexJSON() throws IOException { + File file = new File("target/mytest_write_complex.json"); + int count = COUNT; + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + StructVector parent = StructVector.empty("parent", vectorAllocator)) { + writeComplexData(count, parent); + VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root")); + validateComplexContent(root.getRowCount(), root); + writeJSON(file, root, null); + } + } + + public void writeJSON(File file, VectorSchemaRoot root, DictionaryProvider provider) throws IOException { + JsonFileWriter writer = new JsonFileWriter(file, JsonFileWriter.config().pretty(true)); + writer.start(root.getSchema(), provider); + writer.write(root); + writer.close(); + } + + + @Test + public void testWriteReadUnionJSON() throws IOException { + File file = new File("target/mytest_write_union.json"); + int count = COUNT; + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + StructVector parent = StructVector.empty("parent", vectorAllocator)) { + writeUnionData(count, parent); + printVectors(parent.getChildrenFromFields()); + + try (VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root"))) { + validateUnionData(count, root); + writeJSON(file, root, null); + + // read + try (BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE)) { + JsonFileReader reader = new JsonFileReader(file, readerAllocator); + + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + try (VectorSchemaRoot rootFromJson = 
reader.read();) { + validateUnionData(count, rootFromJson); + Validator.compareVectorSchemaRoot(root, rootFromJson); + } + } + } + } + } + + @Test + public void testWriteReadDateTimeJSON() throws IOException { + File file = new File("target/mytest_datetime.json"); + int count = COUNT; + + // write + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + StructVector parent = StructVector.empty("parent", vectorAllocator)) { + + writeDateTimeData(count, parent); + + printVectors(parent.getChildrenFromFields()); + + VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root")); + validateDateTimeContent(count, root); + + writeJSON(file, new VectorSchemaRoot(parent.getChild("root")), null); + } + + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + JsonFileReader reader = new JsonFileReader(file, readerAllocator) + ) { + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + try (VectorSchemaRoot root = reader.read();) { + validateDateTimeContent(count, root); + } + } + } + + @Test + public void testWriteReadDictionaryJSON() throws IOException { + File file = new File("target/mytest_dictionary.json"); + + // write + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE) + ) { + MapDictionaryProvider provider = new MapDictionaryProvider(); + + try (VectorSchemaRoot root = writeFlatDictionaryData(vectorAllocator, provider)) { + printVectors(root.getFieldVectors()); + validateFlatDictionary(root, provider); + writeJSON(file, root, provider); + } + + // Need to close dictionary vectors + for (long id : provider.getDictionaryIds()) { + provider.lookup(id).getVector().close(); + } + } + + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + JsonFileReader reader = new 
JsonFileReader(file, readerAllocator) + ) { + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + try (VectorSchemaRoot root = reader.read();) { + validateFlatDictionary(root, reader); + } + } + } + + @Test + public void testWriteReadNestedDictionaryJSON() throws IOException { + File file = new File("target/mytest_dict_nested.json"); + + // data being written: + // [['foo', 'bar'], ['foo'], ['bar']] -> [[0, 1], [0], [1]] + + // write + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE) + ) { + MapDictionaryProvider provider = new MapDictionaryProvider(); + + try (VectorSchemaRoot root = writeNestedDictionaryData(vectorAllocator, provider)) { + printVectors(root.getFieldVectors()); + validateNestedDictionary(root, provider); + writeJSON(file, root, provider); + } + + // Need to close dictionary vectors + for (long id : provider.getDictionaryIds()) { + provider.lookup(id).getVector().close(); + } + } + + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + JsonFileReader reader = new JsonFileReader(file, readerAllocator) + ) { + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + try (VectorSchemaRoot root = reader.read();) { + validateNestedDictionary(root, reader); + } + } + } + + @Test + public void testWriteReadDecimalJSON() throws IOException { + File file = new File("target/mytest_decimal.json"); + + // write + try (BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + VectorSchemaRoot root = writeDecimalData(vectorAllocator)) { + printVectors(root.getFieldVectors()); + validateDecimalData(root); + writeJSON(file, root, null); + } + + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + JsonFileReader reader = new 
JsonFileReader(file, readerAllocator) + ) { + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + try (VectorSchemaRoot root = reader.read();) { + validateDecimalData(root); + } + } + } + + @Test + public void testSetStructLength() throws IOException { + File file = new File("../../docs/source/format/integration_json_examples/struct.json"); + if (!file.exists()) { + file = new File("../docs/source/format/integration_json_examples/struct.json"); + } + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + JsonFileReader reader = new JsonFileReader(file, readerAllocator) + ) { + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + try (VectorSchemaRoot root = reader.read();) { + FieldVector vector = root.getVector("struct_nullable"); + Assert.assertEquals(7, vector.getValueCount()); + } + } + } + + @Test + public void testWriteReadVarBinJSON() throws IOException { + File file = new File("target/mytest_varbin.json"); + int count = COUNT; + + // write + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + StructVector parent = StructVector.empty("parent", vectorAllocator)) { + writeVarBinaryData(count, parent); + VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root")); + validateVarBinary(count, root); + writeJSON(file, new VectorSchemaRoot(parent.getChild("root")), null); + } + + // read + try (BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + JsonFileReader reader = new JsonFileReader(file, readerAllocator)) { + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + try (VectorSchemaRoot root = reader.read();) { + validateVarBinary(count, root); + } + } + } + + @Test + public void testWriteReadMapJSON() throws IOException { + File file = new 
File("target/mytest_map.json"); + + // write + try (BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + VectorSchemaRoot root = writeMapData(vectorAllocator)) { + printVectors(root.getFieldVectors()); + validateMapData(root); + writeJSON(file, root, null); + } + + // read + try (BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + JsonFileReader reader = new JsonFileReader(file, readerAllocator)) { + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + try (VectorSchemaRoot root = reader.read();) { + validateMapData(root); + } + } + } + + @Test + public void testWriteReadNullJSON() throws IOException { + File file = new File("target/mytest_null.json"); + int valueCount = 10; + + // write + try (BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + VectorSchemaRoot root = writeNullData(valueCount)) { + printVectors(root.getFieldVectors()); + validateNullData(root, valueCount); + writeJSON(file, root, null); + } + + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + JsonFileReader reader = new JsonFileReader(file, readerAllocator) + ) { + + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + try (VectorSchemaRoot root = reader.read();) { + validateNullData(root, valueCount); + } + } + } + + @Test + public void testNoOverFlowWithUINT() { + try (final UInt8Vector uInt8Vector = new UInt8Vector("uint8", allocator); + final UInt4Vector uInt4Vector = new UInt4Vector("uint4", allocator); + final UInt1Vector uInt1Vector = new UInt1Vector("uint1", allocator)) { + + long[] longValues = new long[]{Long.MIN_VALUE, Long.MAX_VALUE, -1L}; + uInt8Vector.allocateNew(3); + uInt8Vector.setValueCount(3); + for (int i = 0; i < longValues.length; i++) { + uInt8Vector.set(i, 
longValues[i]); + long readValue = uInt8Vector.getObjectNoOverflow(i).longValue(); + assertEquals(readValue, longValues[i]); + } + + int[] intValues = new int[]{Integer.MIN_VALUE, Integer.MAX_VALUE, -1}; + uInt4Vector.allocateNew(3); + uInt4Vector.setValueCount(3); + for (int i = 0; i < intValues.length; i++) { + uInt4Vector.set(i, intValues[i]); + int actualValue = (int) UInt4Vector.getNoOverflow(uInt4Vector.getDataBuffer(), i); + assertEquals(intValues[i], actualValue); + } + + byte[] byteValues = new byte[]{Byte.MIN_VALUE, Byte.MAX_VALUE, -1}; + uInt1Vector.allocateNew(3); + uInt1Vector.setValueCount(3); + for (int i = 0; i < byteValues.length; i++) { + uInt1Vector.set(i, byteValues[i]); + byte actualValue = (byte) UInt1Vector.getNoOverflow(uInt1Vector.getDataBuffer(), i); + assertEquals(byteValues[i], actualValue); + } + } + } +}
diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestRoundTrip.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestRoundTrip.java
new file mode 100644
index 000000000..5f57e90f6
--- /dev/null
+++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestRoundTrip.java
@@ -0,0 +1,647 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.ipc;
+
+import static org.apache.arrow.vector.dictionary.DictionaryProvider.MapDictionaryProvider;
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.channels.Channels;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.BiConsumer;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.util.Collections2;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.FixedSizeBinaryVector;
+import org.apache.arrow.vector.Float4Vector;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.TinyIntVector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.VectorUnloader;
+import org.apache.arrow.vector.complex.FixedSizeListVector;
+import org.apache.arrow.vector.complex.StructVector;
+import org.apache.arrow.vector.dictionary.DictionaryProvider;
+import org.apache.arrow.vector.ipc.message.ArrowBlock;
+import org.apache.arrow.vector.ipc.message.ArrowBuffer;
+import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
+import org.apache.arrow.vector.ipc.message.IpcOption;
+import org.apache.arrow.vector.ipc.message.MessageMetadataResult;
+import org.apache.arrow.vector.ipc.message.MessageSerializer;
+import org.apache.arrow.vector.types.FloatingPointPrecision;
+import org.apache.arrow.vector.types.MetadataVersion;
+import org.apache.arrow.vector.types.Types;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.junit.AfterClass;
+import org.junit.Assume;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Round-trips vectors through the Arrow file and stream IPC formats, once per
+ * parameterized {@link IpcOption}, and validates what is read back.
+ */
+@RunWith(Parameterized.class)
+public class TestRoundTrip extends BaseFileTest {
+  private static final Logger LOGGER = LoggerFactory.getLogger(TestRoundTrip.class);
+  private static BufferAllocator allocator;
+  private final String name;
+  private final IpcOption writeOption;
+
+  public TestRoundTrip(String name, IpcOption writeOption) {
+    this.name = name;
+    this.writeOption = writeOption;
+  }
+
+  @Parameterized.Parameters(name = "options = {0}")
+  public static Collection<Object[]> getWriteOption() {
+    final IpcOption legacy = new IpcOption(true, MetadataVersion.V4);
+    final IpcOption version4 = new IpcOption(false, MetadataVersion.V4);
+    return Arrays.asList(
+        new Object[] {"V4Legacy", legacy},
+        new Object[] {"V4", version4},
+        new Object[] {"V5", IpcOption.DEFAULT}
+    );
+  }
+
+  @BeforeClass
+  public static void setUpClass() {
+    allocator = new RootAllocator(Integer.MAX_VALUE);
+  }
+
+  @AfterClass
+  public static void tearDownClass() {
+    allocator.close();
+  }
+
+  @Test
+  public void testStruct() throws Exception {
+    try (final BufferAllocator originalVectorAllocator =
+             allocator.newChildAllocator("original vectors", 0, allocator.getLimit());
+         final StructVector parent = StructVector.empty("parent", originalVectorAllocator)) {
+      writeData(COUNT, parent);
+      roundTrip(
+          new VectorSchemaRoot(parent.getChild("root")),
+          /* dictionaryProvider */null,
+          TestRoundTrip::writeSingleBatch,
+          validateFileBatches(new int[] {COUNT}, this::validateContent),
+          validateStreamBatches(new int[] {COUNT}, this::validateContent));
+    }
+  }
+
+  @Test
+  public void testComplex() throws Exception {
+    try (final BufferAllocator originalVectorAllocator =
+             allocator.newChildAllocator("original vectors", 0, allocator.getLimit());
+         final StructVector parent = StructVector.empty("parent", originalVectorAllocator)) {
+      writeComplexData(COUNT, parent);
+      roundTrip(
+          new VectorSchemaRoot(parent.getChild("root")),
+          /* dictionaryProvider */null,
+          TestRoundTrip::writeSingleBatch,
+          validateFileBatches(new int[] {COUNT}, this::validateComplexContent),
+          validateStreamBatches(new int[] {COUNT}, this::validateComplexContent));
+    }
+  }
+
+  @Test
+  public void testMultipleRecordBatches() throws Exception {
+    int[] counts = {10, 5};
+    try (final BufferAllocator originalVectorAllocator =
+             allocator.newChildAllocator("original vectors", 0, allocator.getLimit());
+         final StructVector parent = StructVector.empty("parent", originalVectorAllocator)) {
+      writeData(counts[0], parent);
+      roundTrip(
+          new VectorSchemaRoot(parent.getChild("root")),
+          /* dictionaryProvider */null,
+          (root, writer) -> {
+            writer.start();
+            parent.allocateNew();
+            writeData(counts[0], parent);
+            root.setRowCount(counts[0]);
+            writer.writeBatch();
+
+            parent.allocateNew();
+            // if we write the same data we don't catch that the metadata is stored in the wrong order.
+            writeData(counts[1], parent);
+            root.setRowCount(counts[1]);
+            writer.writeBatch();
+
+            writer.end();
+          },
+          validateFileBatches(counts, this::validateContent),
+          validateStreamBatches(counts, this::validateContent));
+    }
+  }
+
+  @Test
+  public void testUnionV4() throws Exception {
+    Assume.assumeTrue(writeOption.metadataVersion == MetadataVersion.V4);
+    final File temp = File.createTempFile("arrow-test-" + name + "-", ".arrow");
+    temp.deleteOnExit();
+    final ByteArrayOutputStream memoryStream = new ByteArrayOutputStream();
+
+    try (final BufferAllocator originalVectorAllocator =
+             allocator.newChildAllocator("original vectors", 0, allocator.getLimit());
+         final StructVector parent = StructVector.empty("parent", originalVectorAllocator)) {
+      writeUnionData(COUNT, parent);
+      final VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root"));
+      IllegalArgumentException e = assertThrows(IllegalArgumentException.class, () -> {
+        try (final FileOutputStream fileStream = new FileOutputStream(temp)) {
+          new ArrowFileWriter(root, null, fileStream.getChannel(), writeOption);
+        }
+      });
+      assertTrue(e.getMessage(), e.getMessage().contains("Cannot write union with V4 metadata"));
+      e = assertThrows(IllegalArgumentException.class, () -> {
+        new ArrowStreamWriter(root, null, Channels.newChannel(memoryStream), writeOption);
+      });
+      assertTrue(e.getMessage(), e.getMessage().contains("Cannot write union with V4 metadata"));
+    }
+  }
+
+  @Test
+  public void testUnionV5() throws Exception {
+    Assume.assumeTrue(writeOption.metadataVersion == MetadataVersion.V5);
+    try (final BufferAllocator originalVectorAllocator =
+             allocator.newChildAllocator("original vectors", 0, allocator.getLimit());
+         final StructVector parent = StructVector.empty("parent", originalVectorAllocator)) {
+      writeUnionData(COUNT, parent);
+      VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root"));
+      validateUnionData(COUNT, root);
+      roundTrip(
+          root,
+          /* dictionaryProvider */null,
+          TestRoundTrip::writeSingleBatch,
+          validateFileBatches(new int[] {COUNT}, this::validateUnionData),
+          validateStreamBatches(new int[] {COUNT}, this::validateUnionData));
+    }
+  }
+
+  @Test
+  public void testTiny() throws Exception {
+    try (final VectorSchemaRoot root = VectorSchemaRoot.create(MessageSerializerTest.testSchema(), allocator)) {
+      root.getFieldVectors().get(0).allocateNew();
+      int count = 16;
+      TinyIntVector vector = (TinyIntVector) root.getFieldVectors().get(0);
+      for (int i = 0; i < count; i++) {
+        vector.set(i, i < 8 ? 1 : 0, (byte) (i + 1));
+      }
+      vector.setValueCount(count);
+      root.setRowCount(count);
+
+      roundTrip(
+          root,
+          /* dictionaryProvider */null,
+          TestRoundTrip::writeSingleBatch,
+          validateFileBatches(new int[] {count}, this::validateTinyData),
+          validateStreamBatches(new int[] {count}, this::validateTinyData));
+    }
+  }
+
+  /** Checks that the first eight values hold {@code i + 1} and that all remaining values are null. */
+  private void validateTinyData(int count, VectorSchemaRoot root) {
+    assertEquals(count, root.getRowCount());
+    TinyIntVector vector = (TinyIntVector) root.getFieldVectors().get(0);
+    for (int i = 0; i < count; i++) {
+      if (i < 8) {
+        assertEquals((byte) (i + 1), vector.get(i));
+      } else {
+        assertTrue(vector.isNull(i));
+      }
+    }
+  }
+
+  @Test
+  public void testMetadata() throws Exception {
+    List<Field> childFields = new ArrayList<>();
+    childFields.add(new Field("varchar-child", new FieldType(true, ArrowType.Utf8.INSTANCE, null, metadata(1)), null));
+    childFields.add(new Field("float-child",
+        new FieldType(true, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null, metadata(2)), null));
+    childFields.add(new Field("int-child", new FieldType(false, new ArrowType.Int(32, true), null, metadata(3)), null));
+    childFields.add(new Field("list-child", new FieldType(true, ArrowType.List.INSTANCE, null, metadata(4)),
+        Collections2.asImmutableList(new Field("l1", FieldType.nullable(new ArrowType.Int(16, true)), null))));
+    Field field = new Field("meta", new FieldType(true, ArrowType.Struct.INSTANCE, null, metadata(0)), childFields);
+    Map<String, String> metadata = new HashMap<>();
+    metadata.put("s1", "v1");
+    metadata.put("s2", "v2");
+    Schema originalSchema = new Schema(Collections2.asImmutableList(field), metadata);
+    assertEquals(metadata, originalSchema.getCustomMetadata());
+
+    try (final BufferAllocator originalVectorAllocator =
+             allocator.newChildAllocator("original vectors", 0, allocator.getLimit());
+         final StructVector vector = (StructVector) field.createVector(originalVectorAllocator)) {
+      vector.allocateNewSafe();
+      vector.setValueCount(0);
+
+      List<FieldVector> vectors = Collections2.asImmutableList(vector);
+      VectorSchemaRoot root = new VectorSchemaRoot(originalSchema, vectors, 0);
+
+      BiConsumer<Integer, VectorSchemaRoot> validate = (count, readRoot) -> {
+        Schema schema = readRoot.getSchema();
+        assertEquals(originalSchema, schema);
+        assertEquals(originalSchema.getCustomMetadata(), schema.getCustomMetadata());
+        Field top = schema.getFields().get(0);
+        assertEquals(metadata(0), top.getMetadata());
+        for (int i = 0; i < 4; i++) {
+          assertEquals(metadata(i + 1), top.getChildren().get(i).getMetadata());
+        }
+      };
+      roundTrip(
+          root,
+          /* dictionaryProvider */null,
+          TestRoundTrip::writeSingleBatch,
+          validateFileBatches(new int[] {0}, validate),
+          validateStreamBatches(new int[] {0}, validate));
+    }
+  }
+
+  /** Builds an unmodifiable two-entry metadata map whose keys and values are derived from {@code i}. */
+  private Map<String, String> metadata(int i) {
+    Map<String, String> map = new HashMap<>();
+    map.put("k_" + i, "v_" + i);
+    map.put("k2_" + i, "v2_" + i);
+    return Collections.unmodifiableMap(map);
+  }
+
+  @Test
+  public void testFlatDictionary() throws Exception {
+    AtomicInteger numDictionaryBlocksWritten = new AtomicInteger();
+    MapDictionaryProvider provider = new MapDictionaryProvider();
+    try (final BufferAllocator originalVectorAllocator =
+             allocator.newChildAllocator("original vectors", 0, allocator.getLimit());
+         final VectorSchemaRoot root = writeFlatDictionaryData(originalVectorAllocator, provider)) {
+      roundTrip(
+          root,
+          provider,
+          (ignored, writer) -> {
+            writer.start();
+            writer.writeBatch();
+            writer.end();
+            if (writer instanceof ArrowFileWriter) {
+              numDictionaryBlocksWritten.set(((ArrowFileWriter) writer).getDictionaryBlocks().size());
+            }
+          },
+          (fileReader) -> {
+            VectorSchemaRoot readRoot = fileReader.getVectorSchemaRoot();
+            Schema schema = readRoot.getSchema();
+            LOGGER.debug("reading schema: {}", schema);
+            assertTrue(fileReader.loadNextBatch());
+            validateFlatDictionary(readRoot, fileReader);
+            assertEquals(numDictionaryBlocksWritten.get(), fileReader.getDictionaryBlocks().size());
+          },
+          (streamReader) -> {
+            VectorSchemaRoot readRoot = streamReader.getVectorSchemaRoot();
+            Schema schema = readRoot.getSchema();
+            LOGGER.debug("reading schema: {}", schema);
+            assertTrue(streamReader.loadNextBatch());
+            validateFlatDictionary(readRoot, streamReader);
+          });
+
+      // Need to close dictionary vectors
+      for (long id : provider.getDictionaryIds()) {
+        provider.lookup(id).getVector().close();
+      }
+    }
+  }
+
+  @Test
+  public void testNestedDictionary() throws Exception {
+    AtomicInteger numDictionaryBlocksWritten = new AtomicInteger();
+    MapDictionaryProvider provider = new MapDictionaryProvider();
+    // data being written:
+    // [['foo', 'bar'], ['foo'], ['bar']] -> [[0, 1], [0], [1]]
+    try (final BufferAllocator originalVectorAllocator =
+             allocator.newChildAllocator("original vectors", 0, allocator.getLimit());
+         final VectorSchemaRoot root = writeNestedDictionaryData(originalVectorAllocator, provider)) {
+      CheckedConsumer<ArrowReader> validateDictionary = (streamReader) -> {
+        VectorSchemaRoot readRoot = streamReader.getVectorSchemaRoot();
+        Schema schema = readRoot.getSchema();
+        LOGGER.debug("reading schema: {}", schema);
+        assertTrue(streamReader.loadNextBatch());
+        validateNestedDictionary(readRoot, streamReader);
+      };
+      roundTrip(
+          root,
+          provider,
+          (ignored, writer) -> {
+            writer.start();
+            writer.writeBatch();
+            writer.end();
+            if (writer instanceof ArrowFileWriter) {
+              numDictionaryBlocksWritten.set(((ArrowFileWriter) writer).getDictionaryBlocks().size());
+            }
+          },
+          validateDictionary,
+          validateDictionary);
+
+      // Need to close dictionary vectors
+      for (long id : provider.getDictionaryIds()) {
+        provider.lookup(id).getVector().close();
+      }
+    }
+  }
+
+  @Test
+  public void testFixedSizeBinary() throws Exception {
+    final int count = 10;
+    final int typeWidth = 11;
+    byte[][] byteValues = new byte[count][typeWidth];
+    for (int i = 0; i < count; i++) {
+      for (int j = 0; j < typeWidth; j++) {
+        byteValues[i][j] = ((byte) i);
+      }
+    }
+
+    BiConsumer<Integer, VectorSchemaRoot> validator = (expectedCount, root) -> {
+      for (int i = 0; i < expectedCount; i++) {
+        assertArrayEquals(byteValues[i], ((byte[]) root.getVector("fixed-binary").getObject(i)));
+      }
+    };
+
+    try (final BufferAllocator originalVectorAllocator =
+             allocator.newChildAllocator("original vectors", 0, allocator.getLimit());
+         final StructVector parent = StructVector.empty("parent", originalVectorAllocator)) {
+      FixedSizeBinaryVector fixedSizeBinaryVector = parent.addOrGet("fixed-binary",
+          FieldType.nullable(new ArrowType.FixedSizeBinary(typeWidth)), FixedSizeBinaryVector.class);
+      parent.allocateNew();
+      for (int i = 0; i < count; i++) {
+        fixedSizeBinaryVector.set(i, byteValues[i]);
+      }
+      parent.setValueCount(count);
+
+      roundTrip(
+          new VectorSchemaRoot(parent),
+          /* dictionaryProvider */null,
+          TestRoundTrip::writeSingleBatch,
+          validateFileBatches(new int[] {count}, validator),
+          validateStreamBatches(new int[] {count}, validator));
+    }
+  }
+
+  @Test
+  public void testFixedSizeList() throws Exception {
+    BiConsumer<Integer, VectorSchemaRoot> validator = (expectedCount, root) -> {
+      for (int i = 0; i < expectedCount; i++) {
+        assertEquals(Collections2.asImmutableList(i + 0.1f, i + 10.1f), root.getVector("float-pairs")
+            .getObject(i));
+        assertEquals(i, root.getVector("ints").getObject(i));
+      }
+    };
+
+    try (final BufferAllocator originalVectorAllocator =
+             allocator.newChildAllocator("original vectors", 0, allocator.getLimit());
+         final StructVector parent = StructVector.empty("parent", originalVectorAllocator)) {
+      FixedSizeListVector tuples = parent.addOrGet("float-pairs",
+          FieldType.nullable(new ArrowType.FixedSizeList(2)), FixedSizeListVector.class);
+      Float4Vector floats = (Float4Vector) tuples.addOrGetVector(FieldType.nullable(Types.MinorType.FLOAT4.getType()))
+          .getVector();
+      IntVector ints = parent.addOrGet("ints", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class);
+      parent.allocateNew();
+      for (int i = 0; i < COUNT; i++) {
+        tuples.setNotNull(i);
+        floats.set(i * 2, i + 0.1f);
+        floats.set(i * 2 + 1, i + 10.1f);
+        ints.set(i, i);
+      }
+      parent.setValueCount(COUNT);
+
+      roundTrip(
+          new VectorSchemaRoot(parent),
+          /* dictionaryProvider */null,
+          TestRoundTrip::writeSingleBatch,
+          validateFileBatches(new int[] {COUNT}, validator),
+          validateStreamBatches(new int[] {COUNT}, validator));
+    }
+  }
+
+  @Test
+  public void testVarBinary() throws Exception {
+    try (final BufferAllocator originalVectorAllocator =
+             allocator.newChildAllocator("original vectors", 0, allocator.getLimit());
+         final StructVector parent = StructVector.empty("parent", originalVectorAllocator)) {
+      writeVarBinaryData(COUNT, parent);
+      VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root"));
+      validateVarBinary(COUNT, root);
+
+      roundTrip(
+          root,
+          /* dictionaryProvider */null,
+          TestRoundTrip::writeSingleBatch,
+          validateFileBatches(new int[]{COUNT}, this::validateVarBinary),
+          validateStreamBatches(new int[]{COUNT}, this::validateVarBinary));
+    }
+  }
+
+  @Test
+  public void testReadWriteMultipleBatches() throws IOException {
+    File file = new File("target/mytest_nulls_multibatch.arrow");
+    int numBlocksWritten = 0;
+
+    try (IntVector vector = new IntVector("foo", allocator);) {
+      Schema schema = new Schema(Collections.singletonList(vector.getField()));
+      try (FileOutputStream fileOutputStream = new FileOutputStream(file);
+           VectorSchemaRoot root =
+               new VectorSchemaRoot(schema, Collections.singletonList((FieldVector) vector), vector.getValueCount());
+           ArrowFileWriter writer = new ArrowFileWriter(root, null, fileOutputStream.getChannel(), writeOption)) {
+        writeBatchData(writer, vector, root);
+        numBlocksWritten = writer.getRecordBlocks().size();
+      }
+    }
+
+    try (FileInputStream fileInputStream = new FileInputStream(file);
+         ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), allocator);) {
+      IntVector vector = (IntVector) reader.getVectorSchemaRoot().getFieldVectors().get(0);
+      validateBatchData(reader, vector);
+      assertEquals(numBlocksWritten, reader.getRecordBlocks().size());
+    }
+  }
+
+  @Test
+  public void testMap() throws Exception {
+    try (final BufferAllocator originalVectorAllocator =
+             allocator.newChildAllocator("original vectors", 0, allocator.getLimit());
+         final VectorSchemaRoot root = writeMapData(originalVectorAllocator)) {
+      roundTrip(
+          root,
+          /* dictionaryProvider */null,
+          TestRoundTrip::writeSingleBatch,
+          validateFileBatches(new int[]{root.getRowCount()}, (count, readRoot) -> validateMapData(readRoot)),
+          validateStreamBatches(new int[]{root.getRowCount()}, (count, readRoot) -> validateMapData(readRoot)));
+    }
+  }
+
+  @Test
+  public void testListAsMap() throws Exception {
+    try (final BufferAllocator originalVectorAllocator =
+             allocator.newChildAllocator("original vectors", 0, allocator.getLimit());
+         final VectorSchemaRoot root = writeListAsMapData(originalVectorAllocator)) {
+      roundTrip(
+          root,
+          /* dictionaryProvider */null,
+          TestRoundTrip::writeSingleBatch,
+          validateFileBatches(new int[]{root.getRowCount()}, (count, readRoot) -> validateListAsMapData(readRoot)),
+          validateStreamBatches(new int[]{root.getRowCount()}, (count, readRoot) -> validateListAsMapData(readRoot)));
+    }
+  }
+
+  // Generic test helpers
+
+  /** Starts {@code writer}, writes exactly one batch and ends it; {@code root} itself is not touched. */
+  private static void writeSingleBatch(VectorSchemaRoot root, ArrowWriter writer) throws IOException {
+    writer.start();
+    writer.writeBatch();
+    writer.end();
+  }
+
+  /**
+   * Builds a file-reader check that loads every record block in order, compares each row count with
+   * {@code counts}, runs {@code validator} on it and asserts that every buffer offset is a multiple of 8.
+   */
+  private CheckedConsumer<ArrowFileReader> validateFileBatches(
+      int[] counts, BiConsumer<Integer, VectorSchemaRoot> validator) {
+    return (arrowReader) -> {
+      VectorSchemaRoot root = arrowReader.getVectorSchemaRoot();
+      VectorUnloader unloader = new VectorUnloader(root);
+      Schema schema = root.getSchema();
+      LOGGER.debug("reading schema: {}", schema);
+      int i = 0;
+      List<ArrowBlock> recordBatches = arrowReader.getRecordBlocks();
+      assertEquals(counts.length, recordBatches.size());
+      long previousOffset = 0;
+      for (ArrowBlock rbBlock : recordBatches) {
+        assertTrue(rbBlock.getOffset() + " > " + previousOffset, rbBlock.getOffset() > previousOffset);
+        previousOffset = rbBlock.getOffset();
+        arrowReader.loadRecordBatch(rbBlock);
+        assertEquals("RB #" + i, counts[i], root.getRowCount());
+        validator.accept(counts[i], root);
+        try (final ArrowRecordBatch batch = unloader.getRecordBatch()) {
+          List<ArrowBuffer> buffersLayout = batch.getBuffersLayout();
+          for (ArrowBuffer arrowBuffer : buffersLayout) {
+            assertEquals(0, arrowBuffer.getOffset() % 8);
+          }
+        }
+        ++i;
+      }
+    };
+  }
+
+  /**
+   * Builds a stream-reader check that loads {@code counts.length} batches, applies the same per-batch
+   * assertions as the file variant and finally asserts that no further batch is available.
+   */
+  private CheckedConsumer<ArrowStreamReader> validateStreamBatches(
+      int[] counts, BiConsumer<Integer, VectorSchemaRoot> validator) {
+    return (arrowReader) -> {
+      VectorSchemaRoot root = arrowReader.getVectorSchemaRoot();
+      VectorUnloader unloader = new VectorUnloader(root);
+      Schema schema = root.getSchema();
+      LOGGER.debug("reading schema: {}", schema);
+
+      for (int i = 0; i < counts.length; i++) {
+        assertTrue(arrowReader.loadNextBatch());
+        assertEquals("RB #" + i, counts[i], root.getRowCount());
+        validator.accept(counts[i], root);
+        try (final ArrowRecordBatch batch = unloader.getRecordBatch()) {
+          final List<ArrowBuffer> buffersLayout = batch.getBuffersLayout();
+          for (ArrowBuffer arrowBuffer : buffersLayout) {
+            assertEquals(0, arrowBuffer.getOffset() % 8);
+          }
+        }
+      }
+      assertFalse(arrowReader.loadNextBatch());
+    };
+  }
+
+  /** Single-argument callback that, unlike {@code java.util.function.Consumer}, may throw checked exceptions. */
+  @FunctionalInterface
+  interface CheckedConsumer<T> {
+    void accept(T t) throws Exception;
+  }
+
+  /** Two-argument callback that, unlike {@link BiConsumer}, may throw checked exceptions. */
+  @FunctionalInterface
+  interface CheckedBiConsumer<T, U> {
+    void accept(T t, U u) throws Exception;
+  }
+
+  /**
+   * Writes {@code root} to a temporary Arrow file and to an in-memory stream by invoking {@code writer}
+   * once per format, checks the metadata version of the first stream message, then opens both outputs
+   * again and hands the readers to {@code fileValidator} and {@code streamValidator}.
+   */
+  private void roundTrip(VectorSchemaRoot root, DictionaryProvider provider,
+                         CheckedBiConsumer<VectorSchemaRoot, ArrowWriter> writer,
+                         CheckedConsumer<? super ArrowFileReader> fileValidator,
+                         CheckedConsumer<? super ArrowStreamReader> streamValidator) throws Exception {
+    final File temp = File.createTempFile("arrow-test-" + name + "-", ".arrow");
+    temp.deleteOnExit();
+    final ByteArrayOutputStream memoryStream = new ByteArrayOutputStream();
+    final Map<String, String> metadata = new HashMap<>();
+    metadata.put("foo", "bar");
+    try (final FileOutputStream fileStream = new FileOutputStream(temp);
+         final ArrowFileWriter fileWriter =
+             new ArrowFileWriter(root, provider, fileStream.getChannel(), metadata, writeOption);
+         final ArrowStreamWriter streamWriter =
+             new ArrowStreamWriter(root, provider, Channels.newChannel(memoryStream), writeOption)) {
+      writer.accept(root, fileWriter);
+      writer.accept(root, streamWriter);
+    }
+
+    MessageMetadataResult metadataResult = MessageSerializer.readMessage(
+        new ReadChannel(Channels.newChannel(new ByteArrayInputStream(memoryStream.toByteArray()))));
+    assertNotNull(metadataResult);
+    assertEquals(writeOption.metadataVersion.toFlatbufID(), metadataResult.getMessage().version());
+
+    try (
+        BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, allocator.getLimit());
+        FileInputStream fileInputStream = new FileInputStream(temp);
+        ByteArrayInputStream inputStream = new ByteArrayInputStream(memoryStream.toByteArray());
+        ArrowFileReader fileReader = new ArrowFileReader(fileInputStream.getChannel(), readerAllocator);
+        ArrowStreamReader streamReader = new ArrowStreamReader(inputStream, readerAllocator)) {
+      fileValidator.accept(fileReader);
+      streamValidator.accept(streamReader);
+      assertEquals(writeOption.metadataVersion, fileReader.getFooter().getMetadataVersion());
+      assertEquals(metadata, fileReader.getMetaData());
+    }
+  }
+}
diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestUIntDictionaryRoundTrip.java
b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestUIntDictionaryRoundTrip.java new file mode 100644 index 000000000..6aa7a0c6d --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestUIntDictionaryRoundTrip.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc; + +import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.channels.Channels; +import java.util.Arrays; +import java.util.Collection; +import java.util.Map; +import java.util.function.ToIntBiFunction; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +/** + * Test the round-trip of dictionary encoding, + * with unsigned integer as indices. 
+ */ +@RunWith(Parameterized.class) +public class TestUIntDictionaryRoundTrip { + + private final boolean streamMode; + + public TestUIntDictionaryRoundTrip(boolean streamMode) { + this.streamMode = streamMode; + } + + private BufferAllocator allocator; + + private DictionaryProvider.MapDictionaryProvider dictionaryProvider; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + dictionaryProvider = new DictionaryProvider.MapDictionaryProvider(); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + private byte[] writeData(FieldVector encodedVector) throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + VectorSchemaRoot root = + new VectorSchemaRoot( + Arrays.asList(encodedVector.getField()), Arrays.asList(encodedVector), encodedVector.getValueCount()); + try (ArrowWriter writer = streamMode ? + new ArrowStreamWriter(root, dictionaryProvider, out) : + new ArrowFileWriter(root, dictionaryProvider, Channels.newChannel(out))) { + writer.start(); + writer.writeBatch(); + writer.end(); + + return out.toByteArray(); + } + } + + private void readData( + byte[] data, + Field expectedField, + ToIntBiFunction valGetter, + long dictionaryID, + int[] expectedIndices, + String[] expectedDictItems) throws IOException { + try (ArrowReader reader = streamMode ? 
+ new ArrowStreamReader(new ByteArrayInputStream(data), allocator) : + new ArrowFileReader(new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(data)), allocator)) { + + // verify schema + Schema readSchema = reader.getVectorSchemaRoot().getSchema(); + assertEquals(1, readSchema.getFields().size()); + assertEquals(expectedField, readSchema.getFields().get(0)); + + // verify vector schema root + assertTrue(reader.loadNextBatch()); + VectorSchemaRoot root = reader.getVectorSchemaRoot(); + + assertEquals(1, root.getFieldVectors().size()); + ValueVector encodedVector = root.getVector(0); + assertEquals(expectedIndices.length, encodedVector.getValueCount()); + + for (int i = 0; i < expectedIndices.length; i++) { + assertEquals(expectedIndices[i], valGetter.applyAsInt(encodedVector, i)); + } + + // verify dictionary + Map dictVectors = reader.getDictionaryVectors(); + assertEquals(1, dictVectors.size()); + Dictionary dictionary = dictVectors.get(dictionaryID); + assertNotNull(dictionary); + + assertTrue(dictionary.getVector() instanceof VarCharVector); + VarCharVector dictVector = (VarCharVector) dictionary.getVector(); + assertEquals(expectedDictItems.length, dictVector.getValueCount()); + for (int i = 0; i < dictVector.getValueCount(); i++) { + assertArrayEquals(expectedDictItems[i].getBytes(), dictVector.get(i)); + } + } + } + + private ValueVector createEncodedVector(int bitWidth, VarCharVector dictionaryVector) { + final DictionaryEncoding dictionaryEncoding = + new DictionaryEncoding(bitWidth, false, new ArrowType.Int(bitWidth, false)); + Dictionary dictionary = new Dictionary(dictionaryVector, dictionaryEncoding); + dictionaryProvider.put(dictionary); + + final FieldType type = + new FieldType(true, dictionaryEncoding.getIndexType(), dictionaryEncoding, null); + final Field field = new Field("encoded", type, null); + return field.createVector(allocator); + } + + @Test + public void testUInt1RoundTrip() throws IOException { + final int vectorLength = 
UInt1Vector.MAX_UINT1 & UInt1Vector.PROMOTION_MASK; + try (VarCharVector dictionaryVector = new VarCharVector("dictionary", allocator); + UInt1Vector encodedVector1 = (UInt1Vector) createEncodedVector(8, dictionaryVector)) { + int[] indices = new int[vectorLength]; + String[] dictionaryItems = new String[vectorLength]; + for (int i = 0; i < vectorLength; i++) { + encodedVector1.setSafe(i, (byte) i); + indices[i] = i; + dictionaryItems[i] = String.valueOf(i); + } + encodedVector1.setValueCount(vectorLength); + setVector(dictionaryVector, dictionaryItems); + byte[] data = writeData(encodedVector1); + readData( + data, encodedVector1.getField(), (vector, index) -> (int) ((UInt1Vector) vector).getValueAsLong(index), + 8L, indices, dictionaryItems); + } + } + + @Test + public void testUInt2RoundTrip() throws IOException { + try (VarCharVector dictionaryVector = new VarCharVector("dictionary", allocator); + UInt2Vector encodedVector2 = (UInt2Vector) createEncodedVector(16, dictionaryVector)) { + int[] indices = new int[]{1, 3, 5, 7, 9, UInt2Vector.MAX_UINT2}; + String[] dictItems = new String[UInt2Vector.MAX_UINT2]; + for (int i = 0; i < UInt2Vector.MAX_UINT2; i++) { + dictItems[i] = String.valueOf(i); + } + + setVector(encodedVector2, (char) 1, (char) 3, (char) 5, (char) 7, (char) 9, UInt2Vector.MAX_UINT2); + setVector(dictionaryVector, dictItems); + + byte[] data = writeData(encodedVector2); + readData(data, encodedVector2.getField(), (vector, index) -> (int) ((UInt2Vector) vector).getValueAsLong(index), + 16L, indices, dictItems); + } + } + + @Test + public void testUInt4RoundTrip() throws IOException { + final int dictLength = 10; + try (VarCharVector dictionaryVector = new VarCharVector("dictionary", allocator); + UInt4Vector encodedVector4 = (UInt4Vector) createEncodedVector(32, dictionaryVector)) { + int[] indices = new int[]{1, 3, 5, 7, 9}; + String[] dictItems = new String[dictLength]; + for (int i = 0; i < dictLength; i++) { + dictItems[i] = String.valueOf(i); 
+ } + + setVector(encodedVector4, 1, 3, 5, 7, 9); + setVector(dictionaryVector, dictItems); + + setVector(encodedVector4, 1, 3, 5, 7, 9); + byte[] data = writeData(encodedVector4); + readData(data, encodedVector4.getField(), (vector, index) -> (int) ((UInt4Vector) vector).getValueAsLong(index), + 32L, indices, dictItems); + } + } + + @Test + public void testUInt8RoundTrip() throws IOException { + final int dictLength = 10; + try (VarCharVector dictionaryVector = new VarCharVector("dictionary", allocator); + UInt8Vector encodedVector8 = (UInt8Vector) createEncodedVector(64, dictionaryVector)) { + int[] indices = new int[]{1, 3, 5, 7, 9}; + String[] dictItems = new String[dictLength]; + for (int i = 0; i < dictLength; i++) { + dictItems[i] = String.valueOf(i); + } + + setVector(encodedVector8, 1L, 3L, 5L, 7L, 9L); + setVector(dictionaryVector, dictItems); + + byte[] data = writeData(encodedVector8); + readData(data, encodedVector8.getField(), (vector, index) -> (int) ((UInt8Vector) vector).getValueAsLong(index), + 64L, indices, dictItems); + } + } + + @Parameterized.Parameters(name = "stream mode = {0}") + public static Collection getRepeat() { + return Arrays.asList( + new Object[]{true}, + new Object[]{false} + ); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/message/TestMessageMetadataResult.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/message/TestMessageMetadataResult.java new file mode 100644 index 000000000..ee5361547 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/ipc/message/TestMessageMetadataResult.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc.message; + +import static org.junit.Assert.assertEquals; + +import java.nio.ByteBuffer; + +import org.junit.Test; + +public class TestMessageMetadataResult { + + @Test + public void getMessageLength_returnsConstructValue() { + // This API is used by spark. + MessageMetadataResult result = new MessageMetadataResult(1, ByteBuffer.allocate(0), + new org.apache.arrow.flatbuf.Message()); + assertEquals(result.getMessageLength(), 1); + } + +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java new file mode 100644 index 000000000..5cc0d0800 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.pojo; + +import static org.apache.arrow.vector.types.FloatingPointPrecision.DOUBLE; +import static org.apache.arrow.vector.types.FloatingPointPrecision.SINGLE; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.util.Collections2; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.UnionMode; +import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; +import org.apache.arrow.vector.types.pojo.ArrowType.Int; +import org.apache.arrow.vector.types.pojo.ArrowType.List; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct; +import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; +import org.apache.arrow.vector.types.pojo.ArrowType.Union; +import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.Test; + +import com.google.flatbuffers.FlatBufferBuilder; + +/** + * Test conversion between Flatbuf and Pojo field representations. 
+ */ +public class TestConvert { + + @Test + public void simple() { + Field initialField = new Field("a", FieldType.nullable(new Int(32, true)), null); + run(initialField); + } + + @Test + public void complex() { + java.util.List children = new ArrayList<>(); + children.add(new Field("child1", FieldType.nullable(Utf8.INSTANCE), null)); + children.add(new Field("child2", FieldType.nullable(new FloatingPoint(SINGLE)), Collections.emptyList())); + + Field initialField = new Field("a", FieldType.nullable(Struct.INSTANCE), children); + run(initialField); + } + + @Test + public void list() throws Exception { + java.util.List children = new ArrayList<>(); + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + ListVector writeVector = ListVector.empty("list", allocator); + FixedSizeListVector writeFixedVector = FixedSizeListVector.empty("fixedlist", 5, allocator)) { + Field listVectorField = writeVector.getField(); + children.add(listVectorField); + Field listFixedVectorField = writeFixedVector.getField(); + children.add(listFixedVectorField); + } + + Field initialField = new Field("a", FieldType.nullable(Struct.INSTANCE), children); + java.util.List parent = new ArrayList<>(); + parent.add(initialField); + FlatBufferBuilder builder = new FlatBufferBuilder(); + builder.finish(initialField.getField(builder)); + org.apache.arrow.flatbuf.Field flatBufField = org.apache.arrow.flatbuf.Field.getRootAsField(builder.dataBuffer()); + Field finalField = Field.convertField(flatBufField); + assertEquals(initialField, finalField); + assertFalse(finalField.toString().contains("[DEFAULT]")); + + Schema initialSchema = new Schema(parent); + String jsonSchema = initialSchema.toJson(); + String modifiedSchema = jsonSchema.replace("$data$", "[DEFAULT]"); + + Schema tempSchema = Schema.fromJSON(modifiedSchema); + FlatBufferBuilder schemaBuilder = new FlatBufferBuilder(); + org.apache.arrow.vector.types.pojo.Schema schema = + new 
org.apache.arrow.vector.types.pojo.Schema(tempSchema.getFields()); + schemaBuilder.finish(schema.getSchema(schemaBuilder)); + Schema finalSchema = Schema.deserialize(ByteBuffer.wrap(schemaBuilder.sizedByteArray())); + assertFalse(finalSchema.toString().contains("[DEFAULT]")); + } + + @Test + public void schema() { + java.util.List children = new ArrayList<>(); + children.add(new Field("child1", FieldType.nullable(Utf8.INSTANCE), null)); + children.add(new Field("child2", FieldType.nullable(new FloatingPoint(SINGLE)), Collections.emptyList())); + Schema initialSchema = new Schema(children); + run(initialSchema); + } + + @Test + public void schemaMetadata() { + java.util.List children = new ArrayList<>(); + children.add(new Field("child1", FieldType.nullable(Utf8.INSTANCE), null)); + children.add(new Field("child2", FieldType.nullable(new FloatingPoint(SINGLE)), Collections.emptyList())); + Map metadata = new HashMap<>(); + metadata.put("key1", "value1"); + metadata.put("key2", "value2"); + Schema initialSchema = new Schema(children, metadata); + run(initialSchema); + } + + @Test + public void nestedSchema() { + java.util.List children = new ArrayList<>(); + children.add(new Field("child1", FieldType.nullable(Utf8.INSTANCE), null)); + children.add(new Field("child2", FieldType.nullable(new FloatingPoint(SINGLE)), Collections.emptyList())); + children.add(new Field("child3", FieldType.nullable(new Struct()), Collections2.asImmutableList( + new Field("child3.1", FieldType.nullable(Utf8.INSTANCE), null), + new Field("child3.2", FieldType.nullable(new FloatingPoint(DOUBLE)), Collections.emptyList()) + ))); + children.add(new Field("child4", FieldType.nullable(new List()), Collections2.asImmutableList( + new Field("child4.1", FieldType.nullable(Utf8.INSTANCE), null) + ))); + children.add(new Field("child5", FieldType.nullable( + new Union(UnionMode.Sparse, new int[] {MinorType.TIMESTAMPMILLI.ordinal(), MinorType.FLOAT8.ordinal()})), + Collections2.asImmutableList( + new 
Field("child5.1", FieldType.nullable(new Timestamp(TimeUnit.MILLISECOND, null)), null), + new Field("child5.2", FieldType.nullable(new FloatingPoint(DOUBLE)), Collections.emptyList()), + new Field("child5.3", FieldType.nullable(new Timestamp(TimeUnit.MILLISECOND, "UTC")), null) + ))); + Schema initialSchema = new Schema(children); + run(initialSchema); + } + + private void run(Field initialField) { + FlatBufferBuilder builder = new FlatBufferBuilder(); + builder.finish(initialField.getField(builder)); + org.apache.arrow.flatbuf.Field flatBufField = org.apache.arrow.flatbuf.Field.getRootAsField(builder.dataBuffer()); + Field finalField = Field.convertField(flatBufField); + assertEquals(initialField, finalField); + } + + private void run(Schema initialSchema) { + FlatBufferBuilder builder = new FlatBufferBuilder(); + builder.finish(initialSchema.getSchema(builder)); + org.apache.arrow.flatbuf.Schema flatBufSchema = + org.apache.arrow.flatbuf.Schema.getRootAsSchema(builder.dataBuffer()); + Schema finalSchema = Schema.convertSchema(flatBufSchema); + assertEquals(initialSchema, finalSchema); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/testing/RandomDataGenerator.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/testing/RandomDataGenerator.java new file mode 100644 index 000000000..4b1094d28 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/testing/RandomDataGenerator.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.testing; + +import java.util.Random; +import java.util.function.Supplier; + +/** + * Utility for generating random data. + */ +public class RandomDataGenerator { + + static final Random random = new Random(0); + + public static final Supplier TINY_INT_GENERATOR = () -> (byte) random.nextInt(); + + public static final Supplier SMALL_INT_GENERATOR = () -> (short) random.nextInt(); + + public static final Supplier INT_GENERATOR = () -> random.nextInt(); + + public static final Supplier LONG_GENERATOR = () -> random.nextLong(); + + public static final Supplier FLOAT_GENERATOR = () -> random.nextFloat(); + + public static final Supplier DOUBLE_GENERATOR = () -> random.nextDouble(); + + private RandomDataGenerator() { + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/testing/TestValueVectorPopulator.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/testing/TestValueVectorPopulator.java new file mode 100644 index 000000000..f5d15e2c6 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/testing/TestValueVectorPopulator.java @@ -0,0 +1,604 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.testing; + +import static junit.framework.TestCase.assertTrue; +import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.DateDayVector; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.DurationVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.IntervalYearVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TimeMicroVector; +import org.apache.arrow.vector.TimeMilliVector; +import org.apache.arrow.vector.TimeNanoVector; +import org.apache.arrow.vector.TimeSecVector; +import org.apache.arrow.vector.TimeStampMicroVector; +import org.apache.arrow.vector.TimeStampMilliVector; +import org.apache.arrow.vector.TimeStampNanoVector; +import org.apache.arrow.vector.TimeStampSecVector; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.VarBinaryVector; +import 
org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.compare.VectorEqualsVisitor; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestValueVectorPopulator { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testPopulateBigIntVector() { + try (final BigIntVector vector1 = new BigIntVector("vector", allocator); + final BigIntVector vector2 = new BigIntVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i); + } + } + vector1.setValueCount(10); + + setVector(vector2, null, 1L, null, 3L, null, 5L, null, 7L, null, 9L); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateBitVector() { + try (final BitVector vector1 = new BitVector("vector", allocator); + final BitVector vector2 = new BitVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i > 5 ? 
0 : 1); + } + } + vector1.setValueCount(10); + + setVector(vector2, null, 1, null, 1, null, 0, null, 0, null, 0); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateDateDayVector() { + try (final DateDayVector vector1 = new DateDayVector("vector", allocator); + final DateDayVector vector2 = new DateDayVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i * 10); + } + } + vector1.setValueCount(10); + + setVector(vector2, null, 10, null, 30, null, 50, null, 70, null, 90); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateDateMilliVector() { + try (final DateMilliVector vector1 = new DateMilliVector("vector", allocator); + final DateMilliVector vector2 = new DateMilliVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i * 1000); + } + } + vector1.setValueCount(10); + + setVector(vector2, null, 1000L, null, 3000L, null, 5000L, null, 7000L, null, 9000L); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateDecimalVector() { + try (final DecimalVector vector1 = new DecimalVector("vector", allocator, 10, 3); + final DecimalVector vector2 = new DecimalVector("vector", allocator, 10, 3)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i); + } + } + vector1.setValueCount(10); + + setVector(vector2, null, 1L, null, 3L, null, 5L, null, 7L, null, 9L); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateDurationVector() { + final FieldType fieldType = FieldType.nullable(new ArrowType.Duration(TimeUnit.SECOND)); + try (final DurationVector vector1 
= new DurationVector("vector", fieldType, allocator); + final DurationVector vector2 = new DurationVector("vector", fieldType, allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i); + } + } + vector1.setValueCount(10); + + setVector(vector2, null, 1L, null, 3L, null, 5L, null, 7L, null, 9L); + + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateFixedSizeBinaryVector() { + try (final FixedSizeBinaryVector vector1 = new FixedSizeBinaryVector("vector", allocator, 5); + final FixedSizeBinaryVector vector2 = new FixedSizeBinaryVector("vector", allocator, 5)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, ("test" + i).getBytes()); + } + } + vector1.setValueCount(10); + + setVector(vector2, null, "test1".getBytes(), null, "test3".getBytes(), null, "test5".getBytes(), null, + "test7".getBytes(), null, "test9".getBytes()); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateFloat4Vector() { + try (final Float4Vector vector1 = new Float4Vector("vector", allocator); + final Float4Vector vector2 = new Float4Vector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i); + } + } + vector1.setValueCount(10); + setVector(vector2, null, 1f, null, 3f, null, 5f, null, 7f, null, 9f); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateFloat8Vector() { + try (final Float8Vector vector1 = new Float8Vector("vector", allocator); + final Float8Vector vector2 = new Float8Vector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i); + } 
+ } + vector1.setValueCount(10); + setVector(vector2, null, 1d, null, 3d, null, 5d, null, 7d, null, 9d); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateIntVector() { + try (final IntVector vector1 = new IntVector("vector", allocator); + final IntVector vector2 = new IntVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i); + } + } + vector1.setValueCount(10); + + ValueVectorDataPopulator.setVector(vector2, null, 1, null, 3, null, 5, null, 7, null, 9); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateSmallIntVector() { + try (final SmallIntVector vector1 = new SmallIntVector("vector", allocator); + final SmallIntVector vector2 = new SmallIntVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i); + } + } + vector1.setValueCount(10); + + ValueVectorDataPopulator.setVector(vector2, null, (short) 1, null, (short) 3, null, (short) 5, + null, (short) 7, null, (short) 9); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateIntervalDayVector() { + try (final IntervalYearVector vector1 = new IntervalYearVector("vector", allocator); + final IntervalYearVector vector2 = new IntervalYearVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i); + } + } + vector1.setValueCount(10); + + ValueVectorDataPopulator.setVector(vector2, null, 1, null, 3, null, 5, null, 7, null, 9); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateTimeMicroVector() { + try (final TimeMicroVector vector1 = new TimeMicroVector("vector", 
allocator); + final TimeMicroVector vector2 = new TimeMicroVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i * 10000); + } + } + vector1.setValueCount(10); + setVector(vector2, null, 10000L, null, 30000L, null, 50000L, null, 70000L, null, 90000L); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateTimeMilliVector() { + try (final TimeMilliVector vector1 = new TimeMilliVector("vector", allocator); + final TimeMilliVector vector2 = new TimeMilliVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i * 100); + } + } + vector1.setValueCount(10); + setVector(vector2, null, 100, null, 300, null, 500, null, 700, null, 900); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateTimeNanoVector() { + try (final TimeNanoVector vector1 = new TimeNanoVector("vector", allocator); + final TimeNanoVector vector2 = new TimeNanoVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i * 10000); + } + } + vector1.setValueCount(10); + setVector(vector2, null, 10000L, null, 30000L, null, 50000L, null, 70000L, null, 90000L); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateTimeSecVector() { + try (final TimeSecVector vector1 = new TimeSecVector("vector", allocator); + final TimeSecVector vector2 = new TimeSecVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i * 100); + } + } + vector1.setValueCount(10); + setVector(vector2, null, 100, null, 300, null, 500, null, 700, null, 900); 
+ assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateTimeStampMicroVector() { + try (final TimeStampMicroVector vector1 = new TimeStampMicroVector("vector", allocator); + final TimeStampMicroVector vector2 = new TimeStampMicroVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i * 10000); + } + } + vector1.setValueCount(10); + setVector(vector2, null, 10000L, null, 30000L, null, 50000L, null, 70000L, null, 90000L); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateTimeStampMilliVector() { + try (final TimeStampMilliVector vector1 = new TimeStampMilliVector("vector", allocator); + final TimeStampMilliVector vector2 = new TimeStampMilliVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i * 10000); + } + } + vector1.setValueCount(10); + setVector(vector2, null, 10000L, null, 30000L, null, 50000L, null, 70000L, null, 90000L); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateTimeStampNanoVector() { + try (final TimeStampNanoVector vector1 = new TimeStampNanoVector("vector", allocator); + final TimeStampNanoVector vector2 = new TimeStampNanoVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i * 10000); + } + } + vector1.setValueCount(10); + setVector(vector2, null, 10000L, null, 30000L, null, 50000L, null, 70000L, null, 90000L); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateTimeStampSecVector() { + try (final TimeStampSecVector vector1 = new TimeStampSecVector("vector", allocator); + final 
TimeStampSecVector vector2 = new TimeStampSecVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i * 100); + } + } + vector1.setValueCount(10); + setVector(vector2, null, 100L, null, 300L, null, 500L, null, 700L, null, 900L); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateTinyIntVector() { + try (final TinyIntVector vector1 = new TinyIntVector("vector", allocator); + final TinyIntVector vector2 = new TinyIntVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i); + } + } + vector1.setValueCount(10); + setVector(vector2, null, (byte) 1, null, (byte) 3, null, (byte) 5, null, (byte) 7, null, (byte) 9); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateUInt1Vector() { + try (final UInt1Vector vector1 = new UInt1Vector("vector", allocator); + final UInt1Vector vector2 = new UInt1Vector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i); + } + } + vector1.setValueCount(10); + setVector(vector2, null, (byte) 1, null, (byte) 3, null, (byte) 5, null, (byte) 7, null, (byte) 9); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateUInt2Vector() { + try (final UInt2Vector vector1 = new UInt2Vector("vector", allocator); + final UInt2Vector vector2 = new UInt2Vector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i); + } + } + vector1.setValueCount(10); + setVector(vector2, null, (char) 1, null, (char) 3, null, (char) 5, null, (char) 7, null, (char) 9); + 
assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateUInt4Vector() { + try (final UInt4Vector vector1 = new UInt4Vector("vector", allocator); + final UInt4Vector vector2 = new UInt4Vector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i); + } + } + vector1.setValueCount(10); + setVector(vector2, null, 1, null, 3, null, 5, null, 7, null, 9); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateUInt8Vector() { + try (final UInt8Vector vector1 = new UInt8Vector("vector", allocator); + final UInt8Vector vector2 = new UInt8Vector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, i); + } + } + vector1.setValueCount(10); + setVector(vector2, null, 1L, null, 3L, null, 5L, null, 7L, null, 9L); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateVarBinaryVector() { + try (final VarBinaryVector vector1 = new VarBinaryVector("vector", allocator); + final VarBinaryVector vector2 = new VarBinaryVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) { + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, ("test" + i).getBytes()); + } + } + vector1.setValueCount(10); + + setVector(vector2, null, "test1".getBytes(), null, "test3".getBytes(), null, "test5".getBytes(), null, + "test7".getBytes(), null, "test9".getBytes()); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } + + @Test + public void testPopulateVarCharVector() { + try (final VarCharVector vector1 = new VarCharVector("vector", allocator); + final VarCharVector vector2 = new VarCharVector("vector", allocator)) { + + vector1.allocateNew(10); + for (int i = 0; i < 10; i++) 
{ + if (i % 2 == 0) { + vector1.setNull(i); + } else { + vector1.set(i, ("test" + i).getBytes()); + } + } + vector1.setValueCount(10); + + setVector(vector2, null, "test1", null, "test3", null, "test5", null, "test7", null, "test9"); + assertTrue(VectorEqualsVisitor.vectorEquals(vector1, vector2)); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java new file mode 100644 index 000000000..15d6a5cf9 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -0,0 +1,708 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.testing; + +import static org.junit.Assert.assertEquals; + +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.DateDayVector; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.DurationVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalYearVector; +import org.apache.arrow.vector.LargeVarCharVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TimeMicroVector; +import org.apache.arrow.vector.TimeMilliVector; +import org.apache.arrow.vector.TimeNanoVector; +import org.apache.arrow.vector.TimeSecVector; +import org.apache.arrow.vector.TimeStampMicroTZVector; +import org.apache.arrow.vector.TimeStampMicroVector; +import org.apache.arrow.vector.TimeStampMilliTZVector; +import org.apache.arrow.vector.TimeStampMilliVector; +import org.apache.arrow.vector.TimeStampNanoTZVector; +import org.apache.arrow.vector.TimeStampNanoVector; +import org.apache.arrow.vector.TimeStampSecTZVector; +import org.apache.arrow.vector.TimeStampSecVector; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.BaseRepeatedValueVector; +import 
org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.holders.IntervalDayHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.FieldType; + +/** + * Utility for populating {@link org.apache.arrow.vector.ValueVector}. + */ +public class ValueVectorDataPopulator { + + private ValueVectorDataPopulator(){} + + /** + * Populate values for BigIntVector. + */ + public static void setVector(BigIntVector vector, Long... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for BitVector. + */ + public static void setVector(BitVector vector, Integer... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for DateDayVector. + * @param values numbers of days since UNIX epoch + */ + public static void setVector(DateDayVector vector, Integer... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for DateMilliVector. + * @param values numbers of milliseconds since UNIX epoch + */ + public static void setVector(DateMilliVector vector, Long... 
values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for DecimalVector. + */ + public static void setVector(DecimalVector vector, Long... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for DurationVector. + * @param values values of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. + */ + public static void setVector(DurationVector vector, Long... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for FixedSizeBinaryVector. + */ + public static void setVector(FixedSizeBinaryVector vector, byte[]... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for Float4Vector. + */ + public static void setVector(Float4Vector vector, Float... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for Float8Vector. + */ + public static void setVector(Float8Vector vector, Double... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for IntVector. 
+ */ + public static void setVector(IntVector vector, Integer... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for IntervalDayVector. + * @param values holders which hold the days and milliseconds values that represent an interval in SQL style. + */ + public static void setVector(IntervalDayVector vector, IntervalDayHolder... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i].days, values[i].milliseconds); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for IntervalYearVector. + * @param values total month intervals in SQL style. + */ + public static void setVector(IntervalYearVector vector, Integer... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for SmallIntVector. + */ + public static void setVector(SmallIntVector vector, Short... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for TimeMicroVector. + * @param values numbers of microseconds since UNIX epoch + */ + public static void setVector(TimeMicroVector vector, Long... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for TimeMilliVector. 
+ * @param values numbers of milliseconds since UNIX epoch + */ + public static void setVector(TimeMilliVector vector, Integer... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for TimeNanoVector. + * @param values numbers of nanoseconds since UNIX epoch + */ + public static void setVector(TimeNanoVector vector, Long... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for TimeSecVector. + * @param values numbers of seconds since UNIX epoch + */ + public static void setVector(TimeSecVector vector, Integer... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for TimeStampMicroTZVector. + * @param values numbers of microseconds since UNIX epoch + */ + public static void setVector(TimeStampMicroTZVector vector, Long... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for TimeStampMicroVector. + * @param values numbers of microseconds since UNIX epoch + */ + public static void setVector(TimeStampMicroVector vector, Long... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for TimeStampMilliTZVector. 
+ * @param values numbers of milliseconds since UNIX epoch + */ + public static void setVector(TimeStampMilliTZVector vector, Long... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for TimeStampMilliVector. + * @param values numbers of milliseconds since UNIX epoch + */ + public static void setVector(TimeStampMilliVector vector, Long... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for TimeStampNanoTZVector. + * @param values numbers of nanoseconds since UNIX epoch + */ + public static void setVector(TimeStampNanoTZVector vector, Long... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for TimeStampNanoVector. + * @param values numbers of nanoseconds since UNIX epoch + */ + public static void setVector(TimeStampNanoVector vector, Long... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for TimeStampSecTZVector. + * @param values numbers of seconds since UNIX epoch + */ + public static void setVector(TimeStampSecTZVector vector, Long... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for TimeStampSecVector. 
+ * @param values numbers of seconds since UNIX epoch + */ + public static void setVector(TimeStampSecVector vector, Long... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for TinyIntVector. + */ + public static void setVector(TinyIntVector vector, Byte... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for UInt1Vector. + */ + public static void setVector(UInt1Vector vector, Byte... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for UInt2Vector. + */ + public static void setVector(UInt2Vector vector, Character... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for UInt4Vector. + */ + public static void setVector(UInt4Vector vector, Integer... values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for UInt8Vector. + */ + public static void setVector(UInt8Vector vector, Long... 
values) { + final int length = values.length; + vector.allocateNew(length); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for VarBinaryVector. + */ + public static void setVector(VarBinaryVector vector, byte[]... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for VarCharVector. + */ + public static void setVector(VarCharVector vector, byte[]... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for LargeVarCharVector. + */ + public static void setVector(LargeVarCharVector vector, byte[]... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for VarCharVector. + */ + public static void setVector(VarCharVector vector, String... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.setSafe(i, values[i].getBytes(StandardCharsets.UTF_8)); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for LargeVarCharVector. + */ + public static void setVector(LargeVarCharVector vector, String... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.setSafe(i, values[i].getBytes(StandardCharsets.UTF_8)); + } + } + vector.setValueCount(length); + } + + /** + * Populate values for {@link ListVector}. 
+ */ + public static void setVector(ListVector vector, List... values) { + vector.allocateNewSafe(); + Types.MinorType type = Types.MinorType.INT; + vector.addOrGetVector(FieldType.nullable(type.getType())); + + IntVector dataVector = (IntVector) vector.getDataVector(); + dataVector.allocateNew(); + + // set underlying vectors + int curPos = 0; + vector.getOffsetBuffer().setInt(0, curPos); + for (int i = 0; i < values.length; i++) { + if (values[i] == null) { + BitVectorHelper.unsetBit(vector.getValidityBuffer(), i); + } else { + BitVectorHelper.setBit(vector.getValidityBuffer(), i); + for (int value : values[i]) { + dataVector.setSafe(curPos, value); + curPos += 1; + } + } + vector.getOffsetBuffer().setInt((i + 1) * BaseRepeatedValueVector.OFFSET_WIDTH, curPos); + } + dataVector.setValueCount(curPos); + vector.setLastSet(values.length - 1); + vector.setValueCount(values.length); + } + + /** + * Populate values for {@link LargeListVector}. + */ + public static void setVector(LargeListVector vector, List... values) { + vector.allocateNewSafe(); + Types.MinorType type = Types.MinorType.INT; + vector.addOrGetVector(FieldType.nullable(type.getType())); + + IntVector dataVector = (IntVector) vector.getDataVector(); + dataVector.allocateNew(); + + // set underlying vectors + int curPos = 0; + vector.getOffsetBuffer().setLong(0, curPos); + for (int i = 0; i < values.length; i++) { + if (values[i] == null) { + BitVectorHelper.unsetBit(vector.getValidityBuffer(), i); + } else { + BitVectorHelper.setBit(vector.getValidityBuffer(), i); + for (int value : values[i]) { + dataVector.setSafe(curPos, value); + curPos += 1; + } + } + vector.getOffsetBuffer().setLong((long) (i + 1) * LargeListVector.OFFSET_WIDTH, curPos); + } + dataVector.setValueCount(curPos); + vector.setLastSet(values.length - 1); + vector.setValueCount(values.length); + } + + /** + * Populate values for {@link FixedSizeListVector}. + */ + public static void setVector(FixedSizeListVector vector, List... 
values) { + vector.allocateNewSafe(); + for (int i = 0; i < values.length; i++) { + if (values[i] != null) { + assertEquals(vector.getListSize(), values[i].size()); + } + } + + Types.MinorType type = Types.MinorType.INT; + vector.addOrGetVector(FieldType.nullable(type.getType())); + + IntVector dataVector = (IntVector) vector.getDataVector(); + dataVector.allocateNew(); + + // set underlying vectors + int curPos = 0; + for (int i = 0; i < values.length; i++) { + if (values[i] == null) { + BitVectorHelper.unsetBit(vector.getValidityBuffer(), i); + } else { + BitVectorHelper.setBit(vector.getValidityBuffer(), i); + for (int value : values[i]) { + dataVector.setSafe(curPos, value); + curPos += 1; + } + } + } + dataVector.setValueCount(curPos); + vector.setValueCount(values.length); + } + + /** + * Populate values for {@link StructVector}. + */ + public static void setVector(StructVector vector, Map> values) { + vector.allocateNewSafe(); + + int valueCount = 0; + for (final Entry> entry : values.entrySet()) { + // Add the child + final IntVector child = vector.addOrGet(entry.getKey(), + FieldType.nullable(MinorType.INT.getType()), IntVector.class); + + // Write the values to the child + child.allocateNew(); + final List v = entry.getValue(); + for (int i = 0; i < v.size(); i++) { + if (v.get(i) != null) { + child.set(i, v.get(i)); + vector.setIndexDefined(i); + } else { + child.setNull(i); + } + } + valueCount = Math.max(valueCount, v.size()); + } + vector.setValueCount(valueCount); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java new file mode 100644 index 000000000..8b2743210 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java @@ -0,0 +1,420 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.types.pojo; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.SeekableByteChannel; +import java.nio.channels.WritableByteChannel; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; +import java.util.Collections; +import java.util.UUID; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.vector.ExtensionTypeVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.compare.Range; +import org.apache.arrow.vector.compare.RangeEqualsVisitor; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.ipc.ArrowFileReader; +import 
org.apache.arrow.vector.ipc.ArrowFileWriter; +import org.apache.arrow.vector.types.FloatingPointPrecision; +import org.apache.arrow.vector.types.pojo.ArrowType.ExtensionType; +import org.apache.arrow.vector.util.VectorBatchAppender; +import org.apache.arrow.vector.validate.ValidateVectorVisitor; +import org.junit.Assert; +import org.junit.Test; + +public class TestExtensionType { + /** + * Test that a custom UUID type can be round-tripped through a temporary file. + */ + @Test + public void roundtripUuid() throws IOException { + ExtensionTypeRegistry.register(new UuidType()); + final Schema schema = new Schema(Collections.singletonList(Field.nullable("a", new UuidType()))); + try (final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + final VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + UUID u1 = UUID.randomUUID(); + UUID u2 = UUID.randomUUID(); + UuidVector vector = (UuidVector) root.getVector("a"); + vector.setValueCount(2); + vector.set(0, u1); + vector.set(1, u2); + root.setRowCount(2); + + final File file = File.createTempFile("uuidtest", ".arrow"); + try (final WritableByteChannel channel = FileChannel + .open(Paths.get(file.getAbsolutePath()), StandardOpenOption.WRITE); + final ArrowFileWriter writer = new ArrowFileWriter(root, null, channel)) { + writer.start(); + writer.writeBatch(); + writer.end(); + } + + try (final SeekableByteChannel channel = Files.newByteChannel(Paths.get(file.getAbsolutePath())); + final ArrowFileReader reader = new ArrowFileReader(channel, allocator)) { + reader.loadNextBatch(); + final VectorSchemaRoot readerRoot = reader.getVectorSchemaRoot(); + Assert.assertEquals(root.getSchema(), readerRoot.getSchema()); + + final Field field = readerRoot.getSchema().getFields().get(0); + final UuidType expectedType = new UuidType(); + Assert.assertEquals(field.getMetadata().get(ExtensionType.EXTENSION_METADATA_KEY_NAME), + expectedType.extensionName()); + 
Assert.assertEquals(field.getMetadata().get(ExtensionType.EXTENSION_METADATA_KEY_METADATA), + expectedType.serialize()); + + final ExtensionTypeVector deserialized = (ExtensionTypeVector) readerRoot.getFieldVectors().get(0); + Assert.assertEquals(vector.getValueCount(), deserialized.getValueCount()); + for (int i = 0; i < vector.getValueCount(); i++) { + Assert.assertEquals(vector.isNull(i), deserialized.isNull(i)); + if (!vector.isNull(i)) { + Assert.assertEquals(vector.getObject(i), deserialized.getObject(i)); + } + } + } + } + } + + /** + * Test that a custom UUID type can be read as its underlying type. + */ + @Test + public void readUnderlyingType() throws IOException { + ExtensionTypeRegistry.register(new UuidType()); + final Schema schema = new Schema(Collections.singletonList(Field.nullable("a", new UuidType()))); + try (final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + final VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + UUID u1 = UUID.randomUUID(); + UUID u2 = UUID.randomUUID(); + UuidVector vector = (UuidVector) root.getVector("a"); + vector.setValueCount(2); + vector.set(0, u1); + vector.set(1, u2); + root.setRowCount(2); + + final File file = File.createTempFile("uuidtest", ".arrow"); + try (final WritableByteChannel channel = FileChannel + .open(Paths.get(file.getAbsolutePath()), StandardOpenOption.WRITE); + final ArrowFileWriter writer = new ArrowFileWriter(root, null, channel)) { + writer.start(); + writer.writeBatch(); + writer.end(); + } + + ExtensionTypeRegistry.unregister(new UuidType()); + + try (final SeekableByteChannel channel = Files.newByteChannel(Paths.get(file.getAbsolutePath())); + final ArrowFileReader reader = new ArrowFileReader(channel, allocator)) { + reader.loadNextBatch(); + final VectorSchemaRoot readerRoot = reader.getVectorSchemaRoot(); + Assert.assertEquals(1, readerRoot.getSchema().getFields().size()); + Assert.assertEquals("a", 
readerRoot.getSchema().getFields().get(0).getName()); + Assert.assertTrue(readerRoot.getSchema().getFields().get(0).getType() instanceof ArrowType.FixedSizeBinary); + Assert.assertEquals(16, + ((ArrowType.FixedSizeBinary) readerRoot.getSchema().getFields().get(0).getType()).getByteWidth()); + + final Field field = readerRoot.getSchema().getFields().get(0); + final UuidType expectedType = new UuidType(); + Assert.assertEquals(field.getMetadata().get(ExtensionType.EXTENSION_METADATA_KEY_NAME), + expectedType.extensionName()); + Assert.assertEquals(field.getMetadata().get(ExtensionType.EXTENSION_METADATA_KEY_METADATA), + expectedType.serialize()); + + final FixedSizeBinaryVector deserialized = (FixedSizeBinaryVector) readerRoot.getFieldVectors().get(0); + Assert.assertEquals(vector.getValueCount(), deserialized.getValueCount()); + for (int i = 0; i < vector.getValueCount(); i++) { + Assert.assertEquals(vector.isNull(i), deserialized.isNull(i)); + if (!vector.isNull(i)) { + final UUID uuid = vector.getObject(i); + final ByteBuffer bb = ByteBuffer.allocate(16); + bb.putLong(uuid.getMostSignificantBits()); + bb.putLong(uuid.getLeastSignificantBits()); + Assert.assertArrayEquals(bb.array(), deserialized.get(i)); + } + } + } + } + } + + @Test + public void testNullCheck() { + NullPointerException e = assertThrows(NullPointerException.class, + () -> { + try (final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + final ExtensionTypeVector vector = new UuidVector("uuid", allocator, null)) { + vector.getField(); + vector.allocateNewSafe(); + } + }); + assertTrue(e.getMessage().contains("underlyingVector can not be null.")); + } + + /** + * Test that a custom Location type can be round-tripped through a temporary file. 
+ */ + @Test + public void roundtripLocation() throws IOException { + ExtensionTypeRegistry.register(new LocationType()); + final Schema schema = new Schema(Collections.singletonList(Field.nullable("location", new LocationType()))); + try (final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + final VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + LocationVector vector = (LocationVector) root.getVector("location"); + vector.allocateNew(); + vector.set(0, 34.073814f, -118.240784f); + vector.set(2, 37.768056f, -122.3875f); + vector.set(3, 40.739716f, -73.840782f); + vector.setValueCount(4); + root.setRowCount(4); + + final File file = File.createTempFile("locationtest", ".arrow"); + try (final WritableByteChannel channel = FileChannel + .open(Paths.get(file.getAbsolutePath()), StandardOpenOption.WRITE); + final ArrowFileWriter writer = new ArrowFileWriter(root, null, channel)) { + writer.start(); + writer.writeBatch(); + writer.end(); + } + + try (final SeekableByteChannel channel = Files.newByteChannel(Paths.get(file.getAbsolutePath())); + final ArrowFileReader reader = new ArrowFileReader(channel, allocator)) { + reader.loadNextBatch(); + final VectorSchemaRoot readerRoot = reader.getVectorSchemaRoot(); + Assert.assertEquals(root.getSchema(), readerRoot.getSchema()); + + final Field field = readerRoot.getSchema().getFields().get(0); + final LocationType expectedType = new LocationType(); + Assert.assertEquals(field.getMetadata().get(ExtensionType.EXTENSION_METADATA_KEY_NAME), + expectedType.extensionName()); + Assert.assertEquals(field.getMetadata().get(ExtensionType.EXTENSION_METADATA_KEY_METADATA), + expectedType.serialize()); + + final ExtensionTypeVector deserialized = (ExtensionTypeVector) readerRoot.getFieldVectors().get(0); + Assert.assertTrue(deserialized instanceof LocationVector); + Assert.assertEquals(deserialized.getName(), "location"); + StructVector deserStruct = (StructVector) 
deserialized.getUnderlyingVector(); + Assert.assertNotNull(deserStruct.getChild("Latitude")); + Assert.assertNotNull(deserStruct.getChild("Longitude")); + Assert.assertEquals(vector.getValueCount(), deserialized.getValueCount()); + for (int i = 0; i < vector.getValueCount(); i++) { + Assert.assertEquals(vector.isNull(i), deserialized.isNull(i)); + if (!vector.isNull(i)) { + Assert.assertEquals(vector.getObject(i), deserialized.getObject(i)); + } + } + } + } + } + + @Test + public void testVectorCompare() { + UuidType uuidType = new UuidType(); + ExtensionTypeRegistry.register(uuidType); + try (final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + UuidVector a1 = (UuidVector) uuidType.getNewVector("a", FieldType.nullable(uuidType), allocator); + UuidVector a2 = (UuidVector) uuidType.getNewVector("a", FieldType.nullable(uuidType), allocator); + UuidVector bb = (UuidVector) uuidType.getNewVector("a", FieldType.nullable(uuidType), allocator) + ) { + UUID u1 = UUID.randomUUID(); + UUID u2 = UUID.randomUUID(); + + // Test out type and vector validation visitors for an ExtensionTypeVector + ValidateVectorVisitor validateVisitor = new ValidateVectorVisitor(); + validateVisitor.visit(a1, null); + + a1.setValueCount(2); + a1.set(0, u1); + a1.set(1, u2); + + a2.setValueCount(2); + a2.set(0, u1); + a2.set(1, u2); + + bb.setValueCount(2); + bb.set(0, u2); + bb.set(1, u1); + + Range range = new Range(0, 0, a1.getValueCount()); + RangeEqualsVisitor visitor = new RangeEqualsVisitor(a1, a2); + assertTrue(visitor.rangeEquals(range)); + + visitor = new RangeEqualsVisitor(a1, bb); + assertFalse(visitor.rangeEquals(range)); + + // Test out vector appender + VectorBatchAppender.batchAppend(a1, a2, bb); + assertEquals(a1.getValueCount(), 6); + validateVisitor.visit(a1, null); + } + } + + static class UuidType extends ExtensionType { + + @Override + public ArrowType storageType() { + return new ArrowType.FixedSizeBinary(16); + } + + @Override + public String 
extensionName() { + return "uuid"; + } + + @Override + public boolean extensionEquals(ExtensionType other) { + return other instanceof UuidType; + } + + @Override + public ArrowType deserialize(ArrowType storageType, String serializedData) { + if (!storageType.equals(storageType())) { + throw new UnsupportedOperationException("Cannot construct UuidType from underlying type " + storageType); + } + return new UuidType(); + } + + @Override + public String serialize() { + return ""; + } + + @Override + public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator) { + return new UuidVector(name, allocator, new FixedSizeBinaryVector(name, allocator, 16)); + } + } + + static class UuidVector extends ExtensionTypeVector { + + public UuidVector(String name, BufferAllocator allocator, FixedSizeBinaryVector underlyingVector) { + super(name, allocator, underlyingVector); + } + + @Override + public UUID getObject(int index) { + final ByteBuffer bb = ByteBuffer.wrap(getUnderlyingVector().getObject(index)); + return new UUID(bb.getLong(), bb.getLong()); + } + + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + return getUnderlyingVector().hashCode(index, hasher); + } + + public void set(int index, UUID uuid) { + ByteBuffer bb = ByteBuffer.allocate(16); + bb.putLong(uuid.getMostSignificantBits()); + bb.putLong(uuid.getLeastSignificantBits()); + getUnderlyingVector().set(index, bb.array()); + } + } + + static class LocationType extends ExtensionType { + + @Override + public ArrowType storageType() { + return Struct.INSTANCE; + } + + @Override + public String extensionName() { + return "location"; + } + + @Override + public boolean extensionEquals(ExtensionType other) { + return other instanceof LocationType; + } + + @Override + public ArrowType deserialize(ArrowType storageType, String serializedData) { + if (!storageType.equals(storageType())) { + 
throw new UnsupportedOperationException("Cannot construct LocationType from underlying type " + storageType); + } + return new LocationType(); + } + + @Override + public String serialize() { + return ""; + } + + @Override + public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator) { + return new LocationVector(name, allocator); + } + } + + static class LocationVector extends ExtensionTypeVector { + + private static StructVector buildUnderlyingVector(String name, BufferAllocator allocator) { + final StructVector underlyingVector = + new StructVector(name, allocator, FieldType.nullable(ArrowType.Struct.INSTANCE), null); + underlyingVector.addOrGet("Latitude", + FieldType.nullable(new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), Float4Vector.class); + underlyingVector.addOrGet("Longitude", + FieldType.nullable(new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), Float4Vector.class); + return underlyingVector; + } + + public LocationVector(String name, BufferAllocator allocator) { + super(name, allocator, buildUnderlyingVector(name, allocator)); + } + + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + return getUnderlyingVector().hashCode(index, hasher); + } + + @Override + public java.util.Map getObject(int index) { + return getUnderlyingVector().getObject(index); + } + + public void set(int index, float latitude, float longitude) { + getUnderlyingVector().getChild("Latitude", Float4Vector.class).set(index, latitude); + getUnderlyingVector().getChild("Longitude", Float4Vector.class).set(index, longitude); + getUnderlyingVector().setIndexDefined(index); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestField.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestField.java new file mode 100644 index 000000000..bc984fa64 --- /dev/null 
+++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestField.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.types.pojo; + +import static org.apache.arrow.vector.types.pojo.Schema.METADATA_KEY; +import static org.apache.arrow.vector.types.pojo.Schema.METADATA_VALUE; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.arrow.vector.types.pojo.ArrowType.Int; +import org.junit.Test; + +/** Checks that field-level metadata survives a Schema JSON round trip. */ +public class TestField { + + /** Builds a childless field of the given type carrying {@code metadata}. */ + private static Field field(String name, boolean nullable, ArrowType type, Map<String, String> metadata) { + return new Field(name, new FieldType(nullable, type, null, metadata), Collections.emptyList()); + } + + @Test + public void testMetadata() throws IOException { + Map<String, String> metadata = new HashMap<>(1); + metadata.put("testKey", "testValue"); + + Schema schema = new Schema(Collections.singletonList( + field("a", false, new Int(8, true), metadata) + )); + + String json = schema.toJson(); + Schema actual = Schema.fromJSON(json); + + jsonContains(json, "\"" + METADATA_KEY + "\" : 
\"testKey\"", "\"" + METADATA_VALUE + "\" : \"testValue\""); + + Map<String, String> actualMetadata = actual.getFields().get(0).getMetadata(); + assertEquals(1, actualMetadata.size()); + assertEquals("testValue", actualMetadata.get("testKey")); + } + + /** Asserts that every expected fragment occurs in {@code json}. */ + private void jsonContains(String json, String... strings) { + for (String string : strings) { + assertTrue(json + " contains " + string, json.contains(string)); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java new file mode 100644 index 000000000..0e5375865 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.types.pojo; + +import static java.util.Arrays.asList; +import static org.apache.arrow.vector.types.pojo.Schema.METADATA_KEY; +import static org.apache.arrow.vector.types.pojo.Schema.METADATA_VALUE; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.arrow.vector.types.DateUnit; +import org.apache.arrow.vector.types.FloatingPointPrecision; +import org.apache.arrow.vector.types.IntervalUnit; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.UnionMode; +import org.apache.arrow.vector.types.pojo.ArrowType.Binary; +import org.apache.arrow.vector.types.pojo.ArrowType.Bool; +import org.apache.arrow.vector.types.pojo.ArrowType.Date; +import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; +import org.apache.arrow.vector.types.pojo.ArrowType.Duration; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; +import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; +import org.apache.arrow.vector.types.pojo.ArrowType.Int; +import org.apache.arrow.vector.types.pojo.ArrowType.Interval; +import org.apache.arrow.vector.types.pojo.ArrowType.List; +import org.apache.arrow.vector.types.pojo.ArrowType.Null; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct; +import org.apache.arrow.vector.types.pojo.ArrowType.Time; +import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; +import org.apache.arrow.vector.types.pojo.ArrowType.Union; +import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; +import org.junit.Test; + +/** Round-trips schemas through JSON and checks equality, hash codes and string forms. */ +public class TestSchema { + + private static Field field(String name, boolean nullable, ArrowType type, Field... children) { + return new Field(name, new FieldType(nullable, type, null, null), asList(children)); + } + + /** Shorthand for a nullable field. */ + private static Field field(String name, ArrowType type, Field... 
children) { + return field(name, true, type, children); + } + + @Test + public void testComplex() throws IOException { + Schema schema = new Schema(asList( + field("a", false, new Int(8, true)), + field("b", new Struct(), + field("c", new Int(16, true)), + field("d", new Utf8())), + field("e", new List(), field(null, new Date(DateUnit.MILLISECOND))), + field("f", new FloatingPoint(FloatingPointPrecision.SINGLE)), + field("g", new Timestamp(TimeUnit.MILLISECOND, "UTC")), + field("h", new Timestamp(TimeUnit.MICROSECOND, null)), + field("i", new Interval(IntervalUnit.DAY_TIME)), + field("j", new ArrowType.Duration(TimeUnit.SECOND)) + )); + roundTrip(schema); + assertEquals( + "Schema<a: Int(8, true) not null, b: Struct<c: Int(16, true), d: Utf8>, e: List<Date(MILLISECOND)>, " + + "f: FloatingPoint(SINGLE), g: Timestamp(MILLISECOND, UTC), h: Timestamp(MICROSECOND, null), " + + "i: Interval(DAY_TIME), j: Duration(SECOND)>", + schema.toString()); + } + + @Test + public void testAll() throws IOException { + Schema schema = new Schema(asList( + field("a", false, new Null()), + field("b", new Struct(), field("ba", new Null())), + field("c", new List(), field("ca", new Null())), + field("d", new Union(UnionMode.Sparse, new int[] {1, 2, 3}), field("da", new Null())), + field("e", new Int(8, true)), + field("f", new FloatingPoint(FloatingPointPrecision.SINGLE)), + field("g", new Utf8()), + field("h", new Binary()), + field("i", new Bool()), + field("j", new Decimal(5, 5, 128)), + field("k", new Date(DateUnit.DAY)), + field("l", new Date(DateUnit.MILLISECOND)), + field("m", new Time(TimeUnit.SECOND, 32)), + field("n", new Time(TimeUnit.MILLISECOND, 32)), + field("o", new Time(TimeUnit.MICROSECOND, 64)), + field("p", new Time(TimeUnit.NANOSECOND, 64)), + field("q", new Timestamp(TimeUnit.MILLISECOND, "UTC")), + field("r", new Timestamp(TimeUnit.MICROSECOND, null)), + field("s", new Interval(IntervalUnit.DAY_TIME)), + field("t", new FixedSizeBinary(100)), + field("u", new Duration(TimeUnit.SECOND)), + field("v", new Duration(TimeUnit.MICROSECOND)) + )); + 
roundTrip(schema); + } + + @Test + public void testUnion() throws IOException { + Schema schema = new Schema(asList( + field("d", new Union(UnionMode.Sparse, new int[] {1, 2, 3}), field("da", new Null())) + )); + roundTrip(schema); + contains(schema, "Sparse"); + } + + @Test + public void testDate() throws IOException { + Schema schema = new Schema(asList( + field("a", new Date(DateUnit.DAY)), + field("b", new Date(DateUnit.MILLISECOND)) + )); + roundTrip(schema); + assertEquals( + "Schema<a: Date(DAY), b: Date(MILLISECOND)>", + schema.toString()); + } + + @Test + public void testTime() throws IOException { + Schema schema = new Schema(asList( + field("a", new Time(TimeUnit.SECOND, 32)), + field("b", new Time(TimeUnit.MILLISECOND, 32)), + field("c", new Time(TimeUnit.MICROSECOND, 64)), + field("d", new Time(TimeUnit.NANOSECOND, 64)) + )); + roundTrip(schema); + assertEquals( + "Schema<a: Time(SECOND, 32), b: Time(MILLISECOND, 32), c: Time(MICROSECOND, 64), d: Time(NANOSECOND, 64)>", + schema.toString()); + } + + @Test + public void testTS() throws IOException { + Schema schema = new Schema(asList( + field("a", new Timestamp(TimeUnit.SECOND, "UTC")), + field("b", new Timestamp(TimeUnit.MILLISECOND, "UTC")), + field("c", new Timestamp(TimeUnit.MICROSECOND, "UTC")), + field("d", new Timestamp(TimeUnit.NANOSECOND, "UTC")), + field("e", new Timestamp(TimeUnit.SECOND, null)), + field("f", new Timestamp(TimeUnit.MILLISECOND, null)), + field("g", new Timestamp(TimeUnit.MICROSECOND, null)), + field("h", new Timestamp(TimeUnit.NANOSECOND, null)) + )); + roundTrip(schema); + assertEquals( + "Schema<a: Timestamp(SECOND, UTC), b: Timestamp(MILLISECOND, UTC), c: Timestamp(MICROSECOND, UTC), " + + "d: Timestamp(NANOSECOND, UTC), e: Timestamp(SECOND, null), f: Timestamp(MILLISECOND, null), " + + "g: Timestamp(MICROSECOND, null), h: Timestamp(NANOSECOND, null)>", + schema.toString()); + } + + @Test + public void testInterval() throws IOException { + Schema schema = new Schema(asList( + field("a", new Interval(IntervalUnit.YEAR_MONTH)), + field("b", new Interval(IntervalUnit.DAY_TIME)) + )); + roundTrip(schema); + contains(schema, "YEAR_MONTH", "DAY_TIME"); + } + + @Test + public void testRoundTripDurationInterval() throws IOException { + Schema schema = new Schema(asList( + field("a", new Duration(TimeUnit.SECOND)), + field("b", new Duration(TimeUnit.MILLISECOND)), + 
field("c", new Duration(TimeUnit.MICROSECOND)), + field("d", new Duration(TimeUnit.NANOSECOND)) + )); + roundTrip(schema); + contains(schema, "SECOND", "MILLI", "MICRO", "NANO"); + } + + @Test + public void testFP() throws IOException { + Schema schema = new Schema(asList( + field("a", new FloatingPoint(FloatingPointPrecision.HALF)), + field("b", new FloatingPoint(FloatingPointPrecision.SINGLE)), + field("c", new FloatingPoint(FloatingPointPrecision.DOUBLE)) + )); + roundTrip(schema); + contains(schema, "HALF", "SINGLE", "DOUBLE"); + } + + @Test + public void testMetadata() throws IOException { + Map<String, String> metadata = new HashMap<>(1); + metadata.put("testKey", "testValue"); + + java.util.List<Field> fields = asList( + field("a", false, new Int(8, true)), + field("b", new Struct(), + field("c", new Int(16, true)), + field("d", new Utf8())), + field("e", new List(), field(null, new Date(DateUnit.MILLISECOND))) + ); + Schema schema = new Schema(fields, metadata); + roundTrip(schema); + contains(schema, "\"" + METADATA_KEY + "\" : \"testKey\"", "\"" + METADATA_VALUE + "\" : \"testValue\""); + } + + /** Serializes to JSON, parses it back, and requires equal JSON, equal schemas and equal hash codes. */ + private void roundTrip(Schema schema) throws IOException { + String json = schema.toJson(); + Schema actual = Schema.fromJSON(json); + assertEquals(schema.toJson(), actual.toJson()); + assertEquals(schema, actual); + validateFieldsHashcode(schema.getFields(), actual.getFields()); + assertEquals(schema.hashCode(), actual.hashCode()); + } + + /** Recursively compares fields pairwise, children first, for equality and matching hash codes. */ + private void validateFieldsHashcode(java.util.List<Field> schemaFields, java.util.List<Field> actualFields) { + assertEquals(schemaFields.size(), actualFields.size()); + if (schemaFields.size() == 0) { + return; + } + for (int i = 0; i < schemaFields.size(); i++) { + Field schemaField = schemaFields.get(i); + Field actualField = actualFields.get(i); + validateFieldsHashcode(schemaField.getChildren(), actualField.getChildren()); + validateHashCode(schemaField.getType(), actualField.getType()); + validateHashCode(schemaField, actualField); + } + } + + private void 
validateHashCode(Object o1, Object o2) { + assertEquals(o1, o2); + assertEquals(o1 + " == " + o2, o1.hashCode(), o2.hashCode()); + } + + /** Asserts that the schema's JSON form contains every given fragment. */ + private void contains(Schema schema, String... s) { + String json = schema.toJson(); + for (String string : s) { + assertTrue(json + " contains " + string, json.contains(string)); + } + } + +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/DecimalUtilityTest.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/DecimalUtilityTest.java new file mode 100644 index 000000000..804092ed9 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/DecimalUtilityTest.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.util; + +import java.math.BigDecimal; +import java.math.BigInteger; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.junit.Assert; +import org.junit.Test; + +/** Writes values into 16- and 32-byte decimal buffers through DecimalUtility and reads them back. */ +public class DecimalUtilityTest { + /** Largest unscaled values exercised per width: 10^38 - 1 for 16 bytes, 10^76 - 1 for 32 bytes. */ + private static final BigInteger[] MAX_BIG_INT = new BigInteger[]{BigInteger.valueOf(10).pow(38) + .subtract(BigInteger.ONE), BigInteger.valueOf(10).pow(76).subtract(BigInteger.ONE)}; + /** Negations of the corresponding {@link #MAX_BIG_INT} entries. */ + private static final BigInteger[] MIN_BIG_INT = new BigInteger[]{MAX_BIG_INT[0].multiply(BigInteger.valueOf(-1)), + MAX_BIG_INT[1].multiply(BigInteger.valueOf(-1))}; + + @Test + public void testSetLongInDecimalArrowBuf() { + int[] byteLengths = new int[]{16, 32}; + for (int x = 0; x < 2; x++) { + try (BufferAllocator allocator = new RootAllocator(128); + ArrowBuf buf = allocator.buffer(byteLengths[x]); + ) { + int [] intValues = new int [] {Integer.MAX_VALUE, Integer.MIN_VALUE, 0}; + for (int val : intValues) { + buf.clear(); + DecimalUtility.writeLongToArrowBuf((long) val, buf, 0, byteLengths[x]); + BigDecimal actual = DecimalUtility.getBigDecimalFromArrowBuf(buf, 0, 0, byteLengths[x]); + BigDecimal expected = BigDecimal.valueOf(val); + Assert.assertEquals(expected, actual); + } + } + } + } + + @Test + public void testSetByteArrayInDecimalArrowBuf() { + int[] byteLengths = new int[]{16, 32}; + for (int x = 0; x < 2; x++) { + try (BufferAllocator allocator = new RootAllocator(128); + ArrowBuf buf = allocator.buffer(byteLengths[x]); + ) { + int [] intValues = new int [] {Integer.MAX_VALUE, Integer.MIN_VALUE, 0}; + for (int val : intValues) { + buf.clear(); + DecimalUtility.writeByteArrayToArrowBuf(BigInteger.valueOf(val).toByteArray(), buf, 0, byteLengths[x]); + BigDecimal actual = DecimalUtility.getBigDecimalFromArrowBuf(buf, 0, 0, byteLengths[x]); + BigDecimal expected = BigDecimal.valueOf(val); + Assert.assertEquals(expected, actual); + } + + long [] 
longValues = new long[] {Long.MIN_VALUE, 0 , Long.MAX_VALUE}; + for (long val : longValues) { + buf.clear(); + DecimalUtility.writeByteArrayToArrowBuf(BigInteger.valueOf(val).toByteArray(), buf, 0, byteLengths[x]); + BigDecimal actual = DecimalUtility.getBigDecimalFromArrowBuf(buf, 0, 0, byteLengths[x]); + BigDecimal expected = BigDecimal.valueOf(val); + Assert.assertEquals(expected, actual); + } + + BigInteger [] decimals = new BigInteger[] {MAX_BIG_INT[x], new BigInteger("0"), MIN_BIG_INT[x]}; + for (BigInteger val : decimals) { + buf.clear(); + DecimalUtility.writeByteArrayToArrowBuf(val.toByteArray(), buf, 0, byteLengths[x]); + BigDecimal actual = DecimalUtility.getBigDecimalFromArrowBuf(buf, 0, 0, byteLengths[x]); + BigDecimal expected = new BigDecimal(val); + Assert.assertEquals(expected, actual); + } + } + } + } + + @Test + public void testSetBigDecimalInDecimalArrowBuf() { + int[] byteLengths = new int[]{16, 32}; + for (int x = 0; x < 2; x++) { + try (BufferAllocator allocator = new RootAllocator(128); + ArrowBuf buf = allocator.buffer(byteLengths[x]); + ) { + int [] intValues = new int [] {Integer.MAX_VALUE, Integer.MIN_VALUE, 0}; + for (int val : intValues) { + buf.clear(); + DecimalUtility.writeBigDecimalToArrowBuf(BigDecimal.valueOf(val), buf, 0, byteLengths[x]); + BigDecimal actual = DecimalUtility.getBigDecimalFromArrowBuf(buf, 0, 0, byteLengths[x]); + BigDecimal expected = BigDecimal.valueOf(val); + Assert.assertEquals(expected, actual); + } + + long [] longValues = new long[] {Long.MIN_VALUE, 0 , Long.MAX_VALUE}; + for (long val : longValues) { + buf.clear(); + DecimalUtility.writeBigDecimalToArrowBuf(BigDecimal.valueOf(val), buf, 0, byteLengths[x]); + BigDecimal actual = DecimalUtility.getBigDecimalFromArrowBuf(buf, 0, 0, byteLengths[x]); + BigDecimal expected = BigDecimal.valueOf(val); + Assert.assertEquals(expected, actual); + } + + BigInteger [] decimals = new BigInteger[] {MAX_BIG_INT[x], new BigInteger("0"), MIN_BIG_INT[x]}; + for (BigInteger 
val : decimals) { + buf.clear(); + DecimalUtility.writeBigDecimalToArrowBuf(new BigDecimal(val), buf, 0, byteLengths[x]); + BigDecimal actual = DecimalUtility.getBigDecimalFromArrowBuf(buf, 0, 0, byteLengths[x]); + BigDecimal expected = new BigDecimal(val); + Assert.assertEquals(expected, actual); + } + } + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestDataSizeRoundingUtil.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestDataSizeRoundingUtil.java new file mode 100644 index 000000000..4138ea9d7 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestDataSizeRoundingUtil.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; + +/** + * Test cases for {@link DataSizeRoundingUtil}. 
+ */ +public class TestDataSizeRoundingUtil { + + /** Rounding up with int arguments: zero, values between multiples of 8, and an exact multiple. */ + @Test + public void testRoundUpTo8MultipleInt() { + assertEquals(0, DataSizeRoundingUtil.roundUpTo8Multiple(0)); + assertEquals(16, DataSizeRoundingUtil.roundUpTo8Multiple(9)); + assertEquals(24, DataSizeRoundingUtil.roundUpTo8Multiple(20)); + assertEquals(128, DataSizeRoundingUtil.roundUpTo8Multiple(128)); + } + + /** Same rounding-up checks with long arguments. */ + @Test + public void testRoundUpTo8MultipleLong() { + assertEquals(0L, DataSizeRoundingUtil.roundUpTo8Multiple(0L)); + assertEquals(40L, DataSizeRoundingUtil.roundUpTo8Multiple(37L)); + assertEquals(32L, DataSizeRoundingUtil.roundUpTo8Multiple(29L)); + assertEquals(512L, DataSizeRoundingUtil.roundUpTo8Multiple(512L)); + } + + /** Rounding down with int arguments: zero, values between multiples of 8, and an exact multiple. */ + @Test + public void testRoundDownTo8MultipleInt() { + assertEquals(0, DataSizeRoundingUtil.roundDownTo8Multiple(0)); + assertEquals(16, DataSizeRoundingUtil.roundDownTo8Multiple(23)); + assertEquals(24, DataSizeRoundingUtil.roundDownTo8Multiple(27)); + assertEquals(128, DataSizeRoundingUtil.roundDownTo8Multiple(128)); + } + + /** Same rounding-down checks with long arguments. */ + @Test + public void testRoundDownTo8MultipleLong() { + assertEquals(0L, DataSizeRoundingUtil.roundDownTo8Multiple(0L)); + assertEquals(40L, DataSizeRoundingUtil.roundDownTo8Multiple(45L)); + assertEquals(32L, DataSizeRoundingUtil.roundDownTo8Multiple(39L)); + assertEquals(512L, DataSizeRoundingUtil.roundDownTo8Multiple(512L)); + } + + /** Ceiling division by 8 with int arguments, including an exact multiple (192 gives 24). */ + @Test + public void testDivideBy8CeilInt() { + assertEquals(0, DataSizeRoundingUtil.divideBy8Ceil(0)); + assertEquals(3, DataSizeRoundingUtil.divideBy8Ceil(23)); + assertEquals(5, DataSizeRoundingUtil.divideBy8Ceil(35)); + assertEquals(24, DataSizeRoundingUtil.divideBy8Ceil(192)); + } + + /** Ceiling division by 8 with long arguments, including an exact multiple (200 gives 25). */ + @Test + public void testDivideBy8CeilLong() { + assertEquals(0L, DataSizeRoundingUtil.divideBy8Ceil(0L)); + assertEquals(5L, DataSizeRoundingUtil.divideBy8Ceil(37L)); + assertEquals(10L, DataSizeRoundingUtil.divideBy8Ceil(73L)); + assertEquals(25L, DataSizeRoundingUtil.divideBy8Ceil(200L)); + } +} diff --git 
a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestElementAddressableVectorIterator.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestElementAddressableVectorIterator.java new file mode 100644 index 000000000..419872225 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestElementAddressableVectorIterator.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import static junit.framework.TestCase.assertNull; +import static org.junit.Assert.assertEquals; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VarCharVector; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Test cases for {@link ElementAddressableVectorIterator}. 
+ */ +public class TestElementAddressableVectorIterator { + + /** Number of slots written to each vector under test; slot 0 is always null. */ + private static final int VECTOR_LENGTH = 100; + + private BufferAllocator allocator; + + @Before + public void prepare() { + allocator = new RootAllocator(1024 * 1024); + } + + @After + public void shutdown() { + allocator.close(); + } + + /** Iterates an int vector, alternating between a caller-supplied pointer and the iterator's own. */ + @Test + public void testIterateIntVector() { + try (IntVector intVector = new IntVector("", allocator)) { + intVector.allocateNew(VECTOR_LENGTH); + intVector.setValueCount(VECTOR_LENGTH); + + // prepare data in sorted order + for (int i = 0; i < VECTOR_LENGTH; i++) { + if (i == 0) { + intVector.setNull(i); + } else { + intVector.set(i, i); + } + } + + // iterate + ElementAddressableVectorIterator<IntVector> it = new ElementAddressableVectorIterator<>(intVector); + int index = 0; + while (it.hasNext()) { + ArrowBufPointer pt; + + if (index % 2 == 0) { + // use populated pointer. + pt = new ArrowBufPointer(); + it.next(pt); + } else { + // use iterator inner pointer + pt = it.next(); + } + if (index == 0) { + assertNull(pt.getBuf()); + } else { + assertEquals(index, pt.getBuf().getInt(pt.getOffset())); + } + index += 1; + } + } + } + + /** Same iteration check for a variable-width vector, comparing each element's bytes and length. */ + @Test + public void testIterateVarCharVector() { + try (VarCharVector strVector = new VarCharVector("", allocator)) { + strVector.allocateNew(VECTOR_LENGTH * 10, VECTOR_LENGTH); + strVector.setValueCount(VECTOR_LENGTH); + + // prepare data in sorted order + for (int i = 0; i < VECTOR_LENGTH; i++) { + if (i == 0) { + strVector.setNull(i); + } else { + strVector.set(i, String.valueOf(i).getBytes()); + } + } + + // iterate + ElementAddressableVectorIterator<VarCharVector> it = new ElementAddressableVectorIterator<>(strVector); + int index = 0; + while (it.hasNext()) { + ArrowBufPointer pt; + + if (index % 2 == 0) { + // use populated pointer. 
+ pt = new ArrowBufPointer(); + it.next(pt); + } else { + // use iterator inner pointer + pt = it.next(); + } + + if (index == 0) { + assertNull(pt.getBuf()); + } else { + String expected = String.valueOf(index); + byte[] actual = new byte[expected.length()]; + assertEquals(expected.length(), pt.getLength()); + + pt.getBuf().getBytes(pt.getOffset(), actual); + assertEquals(expected, new String(actual)); + } + index += 1; + } + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestMultiMapWithOrdinal.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestMultiMapWithOrdinal.java new file mode 100644 index 000000000..ea829060d --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestMultiMapWithOrdinal.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.util; + +import org.junit.Assert; +import org.junit.Test; + +/** Exercises put, remove, removeAll, getAll and ordinal lookup on MultiMapWithOrdinal. */ +public class TestMultiMapWithOrdinal { + + @Test + public void test() { + MultiMapWithOrdinal<String, String> map = new MultiMapWithOrdinal<>(); + + map.put("x", "1", false); + Assert.assertEquals(1, map.size()); + map.remove("x", "1"); + Assert.assertTrue(map.isEmpty()); + map.put("x", "1", false); + map.put("x", "2", false); + map.put("y", "0", false); + Assert.assertEquals(3, map.size()); + Assert.assertEquals(2, map.getAll("x").size()); + Assert.assertEquals("1", map.getAll("x").stream().findFirst().get()); + Assert.assertEquals("1", map.getByOrdinal(0)); + Assert.assertEquals("2", map.getByOrdinal(1)); + Assert.assertEquals("0", map.getByOrdinal(2)); + Assert.assertTrue(map.remove("x", "1")); + Assert.assertFalse(map.remove("x", "1")); + Assert.assertEquals("0", map.getByOrdinal(0)); + Assert.assertEquals(2, map.size()); + map.put("x", "3", true); + Assert.assertEquals(1, map.getAll("x").size()); + Assert.assertEquals("3", map.getAll("x").stream().findFirst().get()); + map.put("z", "4", false); + Assert.assertEquals(3, map.size()); + map.put("z", "5", false); + map.put("z", "6", false); + Assert.assertEquals(5, map.size()); + map.removeAll("z"); + Assert.assertEquals(2, map.size()); + Assert.assertFalse(map.containsKey("z")); + + + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestValidator.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestValidator.java new file mode 100644 index 000000000..2db70ca5d --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestValidator.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import static org.apache.arrow.vector.util.Validator.equalEnough; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.junit.Test; + +/** Tests for {@code Validator.equalEnough} with float and double arguments. */ +public class TestValidator { + + /** + * Covers near-equal values in both argument orders, a difference that must be rejected, + * null on either or both sides, MAX_VALUE versus MIN_VALUE, infinities and NaN. + */ + @Test + public void testFloatComp() { + assertTrue(equalEnough(912.4140000000002F, 912.414F)); + assertTrue(equalEnough(912.4140000000002D, 912.414D)); + assertTrue(equalEnough(912.414F, 912.4140000000002F)); + assertTrue(equalEnough(912.414D, 912.4140000000002D)); + assertFalse(equalEnough(912.414D, 912.4140001D)); + assertFalse(equalEnough(null, 912.414D)); + assertTrue(equalEnough((Float) null, null)); + assertTrue(equalEnough((Double) null, null)); + assertFalse(equalEnough(912.414D, null)); + assertFalse(equalEnough(Double.MAX_VALUE, Double.MIN_VALUE)); + assertFalse(equalEnough(Double.MIN_VALUE, Double.MAX_VALUE)); + assertTrue(equalEnough(Double.MAX_VALUE, Double.MAX_VALUE)); + assertTrue(equalEnough(Double.MIN_VALUE, Double.MIN_VALUE)); + assertTrue(equalEnough(Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY)); + assertFalse(equalEnough(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY)); + assertTrue(equalEnough(Double.NaN, Double.NaN)); + assertFalse(equalEnough(1.0, Double.NaN)); + assertFalse(equalEnough(Float.MAX_VALUE, Float.MIN_VALUE)); + assertFalse(equalEnough(Float.MIN_VALUE, Float.MAX_VALUE)); + 
assertTrue(equalEnough(Float.MAX_VALUE, Float.MAX_VALUE)); + assertTrue(equalEnough(Float.MIN_VALUE, Float.MIN_VALUE)); + assertTrue(equalEnough(Float.NEGATIVE_INFINITY, Float.NEGATIVE_INFINITY)); + assertFalse(equalEnough(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY)); + assertTrue(equalEnough(Float.NaN, Float.NaN)); + assertFalse(equalEnough(1.0F, Float.NaN)); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java new file mode 100644 index 000000000..1cd263120 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java @@ -0,0 +1,794 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.util; + +import static junit.framework.TestCase.assertEquals; +import static junit.framework.TestCase.assertTrue; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.util.Arrays; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.LargeVarCharVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.compare.Range; +import org.apache.arrow.vector.compare.RangeEqualsVisitor; +import org.apache.arrow.vector.compare.TypeEqualsVisitor; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.holders.NullableBigIntHolder; +import org.apache.arrow.vector.holders.NullableFloat4Holder; +import org.apache.arrow.vector.holders.NullableIntHolder; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Test cases for {@link VectorAppender}. 
+ */ +public class TestVectorAppender { + + private BufferAllocator allocator; + + @Before + public void prepare() { + allocator = new RootAllocator(1024 * 1024); + } + + @After + public void shutdown() { + allocator.close(); + } + + @Test + public void testAppendFixedWidthVector() { + final int length1 = 10; + final int length2 = 5; + try (IntVector target = new IntVector("", allocator); + IntVector delta = new IntVector("", allocator)) { + + target.allocateNew(length1); + delta.allocateNew(length2); + + ValueVectorDataPopulator.setVector(target, 0, 1, 2, 3, 4, 5, 6, null, 8, 9); + ValueVectorDataPopulator.setVector(delta, null, 11, 12, 13, 14); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(length1 + length2, target.getValueCount()); + + try (IntVector expected = new IntVector("expected", allocator)) { + expected.allocateNew(); + ValueVectorDataPopulator.setVector(expected, 0, 1, 2, 3, 4, 5, 6, null, 8, 9, null, 11, 12, 13, 14); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendEmptyFixedWidthVector() { + try (IntVector target = new IntVector("", allocator); + IntVector delta = new IntVector("", allocator)) { + + ValueVectorDataPopulator.setVector(target, 0, 1, 2, 3, 4, 5, 6, null, 8, 9); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(10, target.getValueCount()); + + try (IntVector expected = new IntVector("expected", allocator)) { + ValueVectorDataPopulator.setVector(expected, 0, 1, 2, 3, 4, 5, 6, null, 8, 9); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendVariableWidthVector() { + final int length1 = 10; + final int length2 = 5; + try (VarCharVector target = new VarCharVector("", allocator); + VarCharVector delta = new VarCharVector("", allocator)) { + + target.allocateNew(5, length1); + delta.allocateNew(5, length2); + + ValueVectorDataPopulator.setVector(target, "a0", 
"a1", "a2", "a3", null, "a5", "a6", "a7", "a8", "a9"); + ValueVectorDataPopulator.setVector(delta, "a10", "a11", "a12", "a13", null); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + try (VarCharVector expected = new VarCharVector("expected", allocator)) { + expected.allocateNew(); + ValueVectorDataPopulator.setVector(expected, + "a0", "a1", "a2", "a3", null, "a5", "a6", "a7", "a8", "a9", "a10", "a11", "a12", "a13", null); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendEmptyVariableWidthVector() { + try (VarCharVector target = new VarCharVector("", allocator); + VarCharVector delta = new VarCharVector("", allocator)) { + + ValueVectorDataPopulator.setVector(target, "a0", "a1", "a2", "a3", null, "a5", "a6", "a7", "a8", "a9"); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + try (VarCharVector expected = new VarCharVector("expected", allocator)) { + ValueVectorDataPopulator.setVector(expected, + "a0", "a1", "a2", "a3", null, "a5", "a6", "a7", "a8", "a9"); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendLargeVariableWidthVector() { + final int length1 = 5; + final int length2 = 10; + try (LargeVarCharVector target = new LargeVarCharVector("", allocator); + LargeVarCharVector delta = new LargeVarCharVector("", allocator)) { + + target.allocateNew(5, length1); + delta.allocateNew(5, length2); + + ValueVectorDataPopulator.setVector(target, "a0", null, "a2", "a3", null); + ValueVectorDataPopulator.setVector(delta, "a5", "a6", "a7", null, null, "a10", "a11", "a12", "a13", null); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + try (LargeVarCharVector expected = new LargeVarCharVector("expected", allocator)) { + expected.allocateNew(); + ValueVectorDataPopulator.setVector(expected, + "a0", null, "a2", "a3", null, "a5", "a6", "a7", null, null, "a10", "a11", "a12", 
"a13", null); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendEmptyLargeVariableWidthVector() { + try (LargeVarCharVector target = new LargeVarCharVector("", allocator); + LargeVarCharVector delta = new LargeVarCharVector("", allocator)) { + + ValueVectorDataPopulator.setVector(target, "a0", null, "a2", "a3", null); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + try (LargeVarCharVector expected = new LargeVarCharVector("expected", allocator)) { + ValueVectorDataPopulator.setVector(expected, "a0", null, "a2", "a3", null); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendListVector() { + final int length1 = 5; + final int length2 = 2; + try (ListVector target = ListVector.empty("target", allocator); + ListVector delta = ListVector.empty("delta", allocator)) { + + target.allocateNew(); + ValueVectorDataPopulator.setVector(target, + Arrays.asList(0, 1), + Arrays.asList(2, 3), + null, + Arrays.asList(6, 7), + Arrays.asList(8, 9)); + assertEquals(length1, target.getValueCount()); + + delta.allocateNew(); + ValueVectorDataPopulator.setVector(delta, + Arrays.asList(10, 11, 12, 13, 14), + Arrays.asList(15, 16, 17, 18, 19)); + assertEquals(length2, delta.getValueCount()); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(7, target.getValueCount()); + + List expected = Arrays.asList(0, 1); + assertEquals(expected, target.getObject(0)); + + expected = Arrays.asList(2, 3); + assertEquals(expected, target.getObject(1)); + + assertTrue(target.isNull(2)); + + expected = Arrays.asList(6, 7); + assertEquals(expected, target.getObject(3)); + + expected = Arrays.asList(8, 9); + assertEquals(expected, target.getObject(4)); + + expected = Arrays.asList(10, 11, 12, 13, 14); + assertEquals(expected, target.getObject(5)); + + expected = Arrays.asList(15, 16, 17, 18, 19); + assertEquals(expected, 
target.getObject(6)); + } + } + + @Test + public void testAppendEmptyListVector() { + try (ListVector target = ListVector.empty("target", allocator); + ListVector delta = ListVector.empty("delta", allocator)) { + // populate target with data + ValueVectorDataPopulator.setVector(target, + Arrays.asList(0, 1), + Arrays.asList(2, 3), + null, + Arrays.asList(6, 7)); + assertEquals(4, target.getValueCount()); + + // leave delta vector empty and unallocated + delta.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + // verify delta vector has original data + assertEquals(4, target.getValueCount()); + + List expected = Arrays.asList(0, 1); + assertEquals(expected, target.getObject(0)); + + expected = Arrays.asList(2, 3); + assertEquals(expected, target.getObject(1)); + + assertTrue(target.isNull(2)); + + expected = Arrays.asList(6, 7); + assertEquals(expected, target.getObject(3)); + } + } + + @Test + public void testAppendFixedSizeListVector() { + try (FixedSizeListVector target = FixedSizeListVector.empty("target", 5, allocator); + FixedSizeListVector delta = FixedSizeListVector.empty("delta", 5, allocator)) { + + target.allocateNew(); + ValueVectorDataPopulator.setVector(target, + Arrays.asList(0, 1, 2, 3, 4), + null); + assertEquals(2, target.getValueCount()); + + delta.allocateNew(); + ValueVectorDataPopulator.setVector(delta, + Arrays.asList(10, 11, 12, 13, 14), + Arrays.asList(15, 16, 17, 18, 19)); + assertEquals(2, delta.getValueCount()); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(4, target.getValueCount()); + + assertEquals(Arrays.asList(0, 1, 2, 3, 4), target.getObject(0)); + assertTrue(target.isNull(1)); + assertEquals(Arrays.asList(10, 11, 12, 13, 14), target.getObject(2)); + assertEquals(Arrays.asList(15, 16, 17, 18, 19), target.getObject(3)); + } + } + + @Test + public void 
testAppendEmptyFixedSizeListVector() { + try (FixedSizeListVector target = FixedSizeListVector.empty("target", 5, allocator); + FixedSizeListVector delta = FixedSizeListVector.empty("delta", 5, allocator)) { + + ValueVectorDataPopulator.setVector(target, + Arrays.asList(0, 1, 2, 3, 4), + null); + assertEquals(2, target.getValueCount()); + + // leave delta vector empty and unallocated + delta.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(2, target.getValueCount()); + + assertEquals(Arrays.asList(0, 1, 2, 3, 4), target.getObject(0)); + assertTrue(target.isNull(1)); + } + } + + @Test + public void testAppendEmptyLargeListVector() { + try (LargeListVector target = LargeListVector.empty("target", allocator); + LargeListVector delta = LargeListVector.empty("delta", allocator)) { + + ValueVectorDataPopulator.setVector(target, + Arrays.asList(0, 1, 2, 3, 4), + null); + assertEquals(2, target.getValueCount()); + + // leave delta vector empty and unallocated + delta.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(2, target.getValueCount()); + + assertEquals(Arrays.asList(0, 1, 2, 3, 4), target.getObject(0)); + assertTrue(target.isNull(1)); + } + } + + @Test + public void testAppendStructVector() { + final int length1 = 10; + final int length2 = 5; + try (final StructVector target = StructVector.empty("target", allocator); + final StructVector delta = StructVector.empty("delta", allocator)) { + + IntVector targetChild1 = target.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + VarCharVector targetChild2 = target.addOrGet("f1", FieldType.nullable(new ArrowType.Utf8()), VarCharVector.class); + targetChild1.allocateNew(); + targetChild2.allocateNew(); + 
ValueVectorDataPopulator.setVector(targetChild1, 0, 1, 2, 3, 4, null, 6, 7, 8, 9); + ValueVectorDataPopulator.setVector(targetChild2, "a0", "a1", "a2", "a3", "a4", "a5", "a6", null, "a8", "a9"); + target.setValueCount(length1); + + IntVector deltaChild1 = delta.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + VarCharVector deltaChild2 = delta.addOrGet("f1", FieldType.nullable(new ArrowType.Utf8()), VarCharVector.class); + deltaChild1.allocateNew(); + deltaChild2.allocateNew(); + ValueVectorDataPopulator.setVector(deltaChild1, 10, 11, 12, null, 14); + ValueVectorDataPopulator.setVector(deltaChild2, "a10", "a11", "a12", "a13", "a14"); + delta.setValueCount(length2); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(length1 + length2, target.getValueCount()); + IntVector child1 = (IntVector) target.getVectorById(0); + VarCharVector child2 = (VarCharVector) target.getVectorById(1); + + try (IntVector expected1 = new IntVector("expected1", allocator); + VarCharVector expected2 = new VarCharVector("expected2", allocator)) { + expected1.allocateNew(); + expected2.allocateNew(); + + ValueVectorDataPopulator.setVector(expected1, 0, 1, 2, 3, 4, null, 6, 7, 8, 9, 10, 11, 12, null, 14); + ValueVectorDataPopulator.setVector(expected2, + "a0", "a1", "a2", "a3", "a4", "a5", "a6", null, "a8", "a9", "a10", "a11", "a12", "a13", "a14"); + + assertVectorsEqual(expected1, target.getChild("f0")); + assertVectorsEqual(expected2, target.getChild("f1")); + } + } + } + + @Test + public void testAppendEmptyStructVector() { + try (final StructVector target = StructVector.empty("target", allocator); + final StructVector delta = StructVector.empty("delta", allocator)) { + + IntVector targetChild1 = target.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + VarCharVector targetChild2 = target.addOrGet("f1", FieldType.nullable(new ArrowType.Utf8()), VarCharVector.class); + 
ValueVectorDataPopulator.setVector(targetChild1, 0, 1, 2, 3, 4, null, 6, 7, 8, 9); + ValueVectorDataPopulator.setVector(targetChild2, "a0", "a1", "a2", "a3", "a4", "a5", "a6", null, "a8", "a9"); + target.setValueCount(10); + + // leave delta vector fields empty and unallocated + delta.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + delta.addOrGet("f1", FieldType.nullable(new ArrowType.Utf8()), VarCharVector.class); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(10, target.getValueCount()); + + try (IntVector expected1 = new IntVector("expected1", allocator); + VarCharVector expected2 = new VarCharVector("expected2", allocator)) { + ValueVectorDataPopulator.setVector(expected1, 0, 1, 2, 3, 4, null, 6, 7, 8, 9); + ValueVectorDataPopulator.setVector(expected2, + "a0", "a1", "a2", "a3", "a4", "a5", "a6", null, "a8", "a9"); + + assertVectorsEqual(expected1, target.getChild("f0")); + assertVectorsEqual(expected2, target.getChild("f1")); + } + } + } + + @Test + public void testAppendUnionVector() { + final int length1 = 10; + final int length2 = 5; + + try (final UnionVector target = UnionVector.empty("target", allocator); + final UnionVector delta = UnionVector.empty("delta", allocator)) { + + // alternating ints and big ints + target.setType(0, Types.MinorType.INT); + target.setType(1, Types.MinorType.BIGINT); + target.setType(2, Types.MinorType.INT); + target.setType(3, Types.MinorType.BIGINT); + target.setType(4, Types.MinorType.INT); + target.setType(5, Types.MinorType.BIGINT); + target.setType(6, Types.MinorType.INT); + target.setType(7, Types.MinorType.BIGINT); + target.setType(8, Types.MinorType.INT); + target.setType(9, Types.MinorType.BIGINT); + target.setType(10, Types.MinorType.INT); + target.setType(11, Types.MinorType.BIGINT); + target.setType(12, Types.MinorType.INT); + target.setType(13, Types.MinorType.BIGINT); + target.setType(14, Types.MinorType.INT); + 
target.setType(15, Types.MinorType.BIGINT); + target.setType(16, Types.MinorType.INT); + target.setType(17, Types.MinorType.BIGINT); + target.setType(18, Types.MinorType.INT); + target.setType(19, Types.MinorType.BIGINT); + + IntVector targetIntVec = target.getIntVector(); + targetIntVec.allocateNew(); + ValueVectorDataPopulator.setVector( + targetIntVec, + 0, null, 1, null, 2, null, 3, null, 4, null, 5, null, 6, null, 7, null, 8, null, 9, null); + assertEquals(length1 * 2, targetIntVec.getValueCount()); + + BigIntVector targetBigIntVec = target.getBigIntVector(); + targetBigIntVec.allocateNew(); + ValueVectorDataPopulator.setVector( + targetBigIntVec, + null, 0L, null, 1L, null, 2L, null, 3L, null, 4L, null, 5L, null, 6L, null, 7L, null, 8L, null, 9L); + assertEquals(length1 * 2, targetBigIntVec.getValueCount()); + + target.setValueCount(length1 * 2); + + // populate the delta vector + delta.setType(0, Types.MinorType.FLOAT4); + delta.setType(1, Types.MinorType.FLOAT4); + delta.setType(2, Types.MinorType.FLOAT4); + delta.setType(3, Types.MinorType.FLOAT4); + delta.setType(4, Types.MinorType.FLOAT4); + + Float4Vector deltaFloatVector = delta.getFloat4Vector(); + deltaFloatVector.allocateNew(); + ValueVectorDataPopulator.setVector(deltaFloatVector, 10f, 11f, 12f, 13f, 14f); + assertEquals(length2, deltaFloatVector.getValueCount()); + delta.setValueCount(length2); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(length1 * 2 + length2, target.getValueCount()); + + for (int i = 0; i < length1; i++) { + Object intObj = target.getObject(i * 2); + assertTrue(intObj instanceof Integer); + assertEquals(i, ((Integer) intObj).intValue()); + + Object longObj = target.getObject(i * 2 + 1); + assertTrue(longObj instanceof Long); + assertEquals(i, ((Long) longObj).longValue()); + } + + for (int i = 0; i < length2; i++) { + Object floatObj = target.getObject(length1 * 2 + i); + assertTrue(floatObj instanceof Float); + 
assertEquals(i + length1, ((Float) floatObj).intValue()); + } + } + } + + @Test + public void testAppendEmptyUnionVector() { + final int length1 = 10; + + try (final UnionVector target = UnionVector.empty("target", allocator); + final UnionVector delta = UnionVector.empty("delta", allocator)) { + + // alternating ints and big ints + target.setType(0, Types.MinorType.INT); + target.setType(1, Types.MinorType.BIGINT); + target.setType(2, Types.MinorType.INT); + target.setType(3, Types.MinorType.BIGINT); + target.setType(4, Types.MinorType.INT); + target.setType(5, Types.MinorType.BIGINT); + target.setType(6, Types.MinorType.INT); + target.setType(7, Types.MinorType.BIGINT); + target.setType(8, Types.MinorType.INT); + target.setType(9, Types.MinorType.BIGINT); + target.setType(10, Types.MinorType.INT); + target.setType(11, Types.MinorType.BIGINT); + target.setType(12, Types.MinorType.INT); + target.setType(13, Types.MinorType.BIGINT); + target.setType(14, Types.MinorType.INT); + target.setType(15, Types.MinorType.BIGINT); + target.setType(16, Types.MinorType.INT); + target.setType(17, Types.MinorType.BIGINT); + target.setType(18, Types.MinorType.INT); + target.setType(19, Types.MinorType.BIGINT); + + IntVector targetIntVec = target.getIntVector(); + ValueVectorDataPopulator.setVector( + targetIntVec, + 0, null, 1, null, 2, null, 3, null, 4, null, 5, null, 6, null, 7, null, 8, null, 9, null); + assertEquals(length1 * 2, targetIntVec.getValueCount()); + + BigIntVector targetBigIntVec = target.getBigIntVector(); + ValueVectorDataPopulator.setVector( + targetBigIntVec, + null, 0L, null, 1L, null, 2L, null, 3L, null, 4L, null, 5L, null, 6L, null, 7L, null, 8L, null, 9L); + assertEquals(length1 * 2, targetBigIntVec.getValueCount()); + + target.setValueCount(length1 * 2); + + // initialize the delta vector but leave it empty and unallocated + delta.setType(0, Types.MinorType.FLOAT4); + delta.setType(1, Types.MinorType.FLOAT4); + delta.setType(2, Types.MinorType.FLOAT4); + 
delta.setType(3, Types.MinorType.FLOAT4); + delta.setType(4, Types.MinorType.FLOAT4); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(length1 * 2, target.getValueCount()); + + for (int i = 0; i < length1; i++) { + Object intObj = target.getObject(i * 2); + assertTrue(intObj instanceof Integer); + assertEquals(i, ((Integer) intObj).intValue()); + + Object longObj = target.getObject(i * 2 + 1); + assertTrue(longObj instanceof Long); + assertEquals(i, ((Long) longObj).longValue()); + } + } + } + + private DenseUnionVector getTargetVector() { + // create a vector, and populate it with values {1, 2, null, 10L} + + final NullableIntHolder intHolder = new NullableIntHolder(); + intHolder.isSet = 1; + final NullableBigIntHolder longHolder = new NullableBigIntHolder(); + longHolder.isSet = 1; + final NullableFloat4Holder floatHolder = new NullableFloat4Holder(); + floatHolder.isSet = 1; + DenseUnionVector targetVector = new DenseUnionVector("target vector", allocator, null, null); + + targetVector.allocateNew(); + + while (targetVector.getValueCapacity() < 4) { + targetVector.reAlloc(); + } + + byte intTypeId = targetVector.registerNewTypeId(Field.nullable("", Types.MinorType.INT.getType())); + targetVector.setTypeId(0, intTypeId); + intHolder.value = 1; + targetVector.setSafe(0, intHolder); + targetVector.setTypeId(1, intTypeId); + intHolder.value = 2; + targetVector.setSafe(1, intHolder); + byte longTypeId = targetVector.registerNewTypeId(Field.nullable("", Types.MinorType.BIGINT.getType())); + targetVector.setTypeId(3, longTypeId); + longHolder.value = 10L; + targetVector.setSafe(3, longHolder); + targetVector.setValueCount(4); + + assertVectorValuesEqual(targetVector, new Object[]{1, 2, null, 10L}); + return targetVector; + } + + private DenseUnionVector getDeltaVector() { + // create a vector, and populate it with values {7, null, 8L, 9.0f} + + final NullableIntHolder intHolder = new NullableIntHolder(); + 
intHolder.isSet = 1; + final NullableBigIntHolder longHolder = new NullableBigIntHolder(); + longHolder.isSet = 1; + final NullableFloat4Holder floatHolder = new NullableFloat4Holder(); + floatHolder.isSet = 1; + + DenseUnionVector deltaVector = new DenseUnionVector("target vector", allocator, null, null); + + while (deltaVector.getValueCapacity() < 4) { + deltaVector.reAlloc(); + } + byte intTypeId = deltaVector.registerNewTypeId(Field.nullable("", Types.MinorType.INT.getType())); + deltaVector.setTypeId(0, intTypeId); + intHolder.value = 7; + deltaVector.setSafe(0, intHolder); + byte longTypeId = deltaVector.registerNewTypeId(Field.nullable("", Types.MinorType.BIGINT.getType())); + deltaVector.setTypeId(2, longTypeId); + longHolder.value = 8L; + deltaVector.setSafe(2, longHolder); + byte floatTypeId = deltaVector.registerNewTypeId(Field.nullable("", Types.MinorType.FLOAT4.getType())); + deltaVector.setTypeId(3, floatTypeId); + floatHolder.value = 9.0f; + deltaVector.setSafe(3, floatHolder); + + deltaVector.setValueCount(4); + + assertVectorValuesEqual(deltaVector, new Object[]{7, null, 8L, 9.0f}); + return deltaVector; + } + + @Test + public void testAppendDenseUnionVector() { + try (DenseUnionVector targetVector = getTargetVector(); + DenseUnionVector deltaVector = getDeltaVector()) { + + // append + VectorAppender appender = new VectorAppender(targetVector); + deltaVector.accept(appender, null); + assertVectorValuesEqual(targetVector, new Object[] {1, 2, null, 10L, 7, null, 8L, 9.0f}); + } + + // test reverse append + try (DenseUnionVector targetVector = getTargetVector(); + DenseUnionVector deltaVector = getDeltaVector()) { + + // append + VectorAppender appender = new VectorAppender(deltaVector); + targetVector.accept(appender, null); + assertVectorValuesEqual(deltaVector, new Object[] {7, null, 8L, 9.0f, 1, 2, null, 10L}); + } + } + + private DenseUnionVector getEmptyDeltaVector() { + // create a vector, but leave it empty and uninitialized + 
DenseUnionVector deltaVector = new DenseUnionVector("target vector", allocator, null, null); + + byte intTypeId = deltaVector.registerNewTypeId(Field.nullable("", Types.MinorType.INT.getType())); + deltaVector.setTypeId(0, intTypeId); + + byte longTypeId = deltaVector.registerNewTypeId(Field.nullable("", Types.MinorType.BIGINT.getType())); + deltaVector.setTypeId(2, longTypeId); + + byte floatTypeId = deltaVector.registerNewTypeId(Field.nullable("", Types.MinorType.FLOAT4.getType())); + deltaVector.setTypeId(3, floatTypeId); + + return deltaVector; + } + + @Test + public void testAppendEmptyDenseUnionVector() { + try (DenseUnionVector targetVector = getTargetVector(); + DenseUnionVector deltaVector = getEmptyDeltaVector()) { + + // append + VectorAppender appender = new VectorAppender(targetVector); + deltaVector.accept(appender, null); + assertVectorValuesEqual(targetVector, new Object[] {1, 2, null, 10L}); + } + } + + /** + * Test appending dense union vectors where the child vectors do not match. 
+ */ + @Test + public void testAppendDenseUnionVectorMismatch() { + final NullableIntHolder intHolder = new NullableIntHolder(); + intHolder.isSet = 1; + + final NullableBigIntHolder longHolder = new NullableBigIntHolder(); + longHolder.isSet = 1; + + final NullableFloat4Holder floatHolder = new NullableFloat4Holder(); + floatHolder.isSet = 1; + + try (DenseUnionVector targetVector = new DenseUnionVector("target vector" , allocator, null, null); + DenseUnionVector deltaVector = new DenseUnionVector("target vector" , allocator, null, null)) { + targetVector.allocateNew(); + deltaVector.allocateNew(); + + // populate the target vector with values {1, 2L} + while (targetVector.getValueCapacity() < 2) { + targetVector.reAlloc(); + } + byte intTypeId = targetVector.registerNewTypeId(Field.nullable("", Types.MinorType.INT.getType())); + targetVector.setTypeId(0, intTypeId); + intHolder.value = 1; + targetVector.setSafe(0, intHolder); + byte longTypeId = targetVector.registerNewTypeId(Field.nullable("", Types.MinorType.BIGINT.getType())); + targetVector.setTypeId(1, longTypeId); + longHolder.value = 2L; + targetVector.setSafe(1, longHolder); + targetVector.setValueCount(2); + + assertVectorValuesEqual(targetVector, new Object[] {1, 2L}); + + // populate the delta vector with values {3, 5.0f} + while (deltaVector.getValueCapacity() < 2) { + deltaVector.reAlloc(); + } + intTypeId = deltaVector.registerNewTypeId(Field.nullable("", Types.MinorType.INT.getType())); + deltaVector.setTypeId(0, intTypeId); + intHolder.value = 3; + deltaVector.setSafe(0, intHolder); + byte floatTypeId = deltaVector.registerNewTypeId(Field.nullable("", Types.MinorType.FLOAT4.getType())); + deltaVector.setTypeId(1, floatTypeId); + floatHolder.value = 5.0f; + deltaVector.setSafe(1, floatHolder); + deltaVector.setValueCount(2); + + assertVectorValuesEqual(deltaVector, new Object[] {3, 5.0f}); + + // append + VectorAppender appender = new VectorAppender(targetVector); + 
assertThrows(IllegalArgumentException.class, + () -> deltaVector.accept(appender, null)); + } + } + + @Test + public void testAppendVectorNegative() { + final int vectorLength = 10; + try (IntVector target = new IntVector("", allocator); + VarCharVector delta = new VarCharVector("", allocator)) { + + target.allocateNew(vectorLength); + delta.allocateNew(vectorLength); + + VectorAppender appender = new VectorAppender(target); + + assertThrows(IllegalArgumentException.class, + () -> delta.accept(appender, null)); + } + } + + private void assertVectorValuesEqual(ValueVector vector, Object[] values) { + assertEquals(vector.getValueCount(), values.length); + for (int i = 0; i < values.length; i++) { + assertEquals(vector.getObject(i), values[i]); + } + } + + public static void assertVectorsEqual(ValueVector vector1, ValueVector vector2) { + assertEquals(vector1.getValueCount(), vector2.getValueCount()); + + TypeEqualsVisitor typeEqualsVisitor = new TypeEqualsVisitor(vector1, false, false); + RangeEqualsVisitor equalsVisitor = + new RangeEqualsVisitor(vector1, vector2, (v1, v2) -> typeEqualsVisitor.equals(vector2)); + assertTrue(equalsVisitor.rangeEquals(new Range(0, 0, vector1.getValueCount()))); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java new file mode 100644 index 000000000..799c25c0a --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import static junit.framework.TestCase.assertEquals; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Test cases for {@link VectorBatchAppender}. + */ +public class TestVectorBatchAppender { + + private BufferAllocator allocator; + + @Before + public void prepare() { + allocator = new RootAllocator(1024 * 1024); + } + + @After + public void shutdown() { + allocator.close(); + } + + @Test + public void testBatchAppendIntVector() { + final int length1 = 10; + final int length2 = 5; + final int length3 = 7; + try (IntVector target = new IntVector("", allocator); + IntVector delta1 = new IntVector("", allocator); + IntVector delta2 = new IntVector("", allocator)) { + + target.allocateNew(length1); + delta1.allocateNew(length2); + delta2.allocateNew(length3); + + ValueVectorDataPopulator.setVector(target, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9); + ValueVectorDataPopulator.setVector(delta1, 10, 11, 12, 13, 14); + ValueVectorDataPopulator.setVector(delta2, 15, 16, 17, 18, 19, 20, 21); + + VectorBatchAppender.batchAppend(target, delta1, delta2); + + assertEquals(length1 + length2 + length3, target.getValueCount()); + for (int i = 0; i < target.getValueCount(); i++) { + assertEquals(i, target.get(i)); + } + } + } +} diff --git 
a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorSchemaRootAppender.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorSchemaRootAppender.java new file mode 100644 index 000000000..ab0ee3a20 --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorSchemaRootAppender.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import static junit.framework.TestCase.assertEquals; +import static org.apache.arrow.vector.util.TestVectorAppender.assertVectorsEqual; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Test cases for {@link VectorSchemaRootAppender}. 
+ */ +public class TestVectorSchemaRootAppender { + + private BufferAllocator allocator; + + @Before + public void prepare() { + allocator = new RootAllocator(1024 * 1024); + } + + @After + public void shutdown() { + allocator.close(); + } + + @Test + public void testVectorScehmaRootAppend() { + final int length1 = 5; + final int length2 = 3; + final int length3 = 2; + + try (IntVector targetChild1 = new IntVector("t1", allocator); + VarCharVector targetChild2 = new VarCharVector("t2", allocator); + BigIntVector targetChild3 = new BigIntVector("t3", allocator); + + IntVector deltaChildOne1 = new IntVector("do1", allocator); + VarCharVector deltaChildOne2 = new VarCharVector("do2", allocator); + BigIntVector deltaChildOne3 = new BigIntVector("do3", allocator); + + IntVector deltaChildTwo1 = new IntVector("dt1", allocator); + VarCharVector deltaChildTwo2 = new VarCharVector("dt2", allocator); + BigIntVector deltaChildTwo3 = new BigIntVector("dt3", allocator)) { + + ValueVectorDataPopulator.setVector(targetChild1, 0, 1, null, 3, 4); + ValueVectorDataPopulator.setVector(targetChild2, "zero", "one", null, "three", "four"); + ValueVectorDataPopulator.setVector(targetChild3, 0L, 10L, null, 30L, 40L); + VectorSchemaRoot root1 = VectorSchemaRoot.of(targetChild1, targetChild2, targetChild3); + root1.setRowCount(length1); + + ValueVectorDataPopulator.setVector(deltaChildOne1, 5, 6, 7); + ValueVectorDataPopulator.setVector(deltaChildOne2, "five", "six", "seven"); + ValueVectorDataPopulator.setVector(deltaChildOne3, 50L, 60L, 70L); + VectorSchemaRoot root2 = VectorSchemaRoot.of(deltaChildOne1, deltaChildOne2, deltaChildOne3); + root2.setRowCount(length2); + + ValueVectorDataPopulator.setVector(deltaChildTwo1, null, 9); + ValueVectorDataPopulator.setVector(deltaChildTwo2, null, "nine"); + ValueVectorDataPopulator.setVector(deltaChildTwo3, null, 90L); + VectorSchemaRoot root3 = VectorSchemaRoot.of(deltaChildTwo1, deltaChildTwo2, deltaChildTwo3); + root3.setRowCount(length3); + + 
VectorSchemaRootAppender.append(root1, root2, root3); + assertEquals(length1 + length2 + length3, root1.getRowCount()); + assertEquals(3, root1.getFieldVectors().size()); + + try (IntVector expected1 = new IntVector("", allocator); + VarCharVector expected2 = new VarCharVector("", allocator); + BigIntVector expected3 = new BigIntVector("", allocator)) { + + ValueVectorDataPopulator.setVector(expected1, 0, 1, null, 3, 4, 5, 6, 7, null, 9); + ValueVectorDataPopulator.setVector( + expected2, "zero", "one", null, "three", "four", "five", "six", "seven", null, "nine"); + ValueVectorDataPopulator.setVector(expected3, 0L, 10L, null, 30L, 40L, 50L, 60L, 70L, null, 90L); + + assertVectorsEqual(expected1, root1.getVector(0)); + assertVectorsEqual(expected2, root1.getVector(1)); + assertVectorsEqual(expected3, root1.getVector(2)); + } + } + } + + @Test + public void testRootWithDifferentChildCounts() { + try (IntVector targetChild1 = new IntVector("t1", allocator); + VarCharVector targetChild2 = new VarCharVector("t2", allocator); + BigIntVector targetChild3 = new BigIntVector("t3", allocator); + + IntVector deltaChild1 = new IntVector("d1", allocator); + VarCharVector deltaChild2 = new VarCharVector("d2", allocator)) { + + ValueVectorDataPopulator.setVector(targetChild1, 0, 1, null, 3, 4); + ValueVectorDataPopulator.setVector(targetChild2, "zero", "one", null, "three", "four"); + ValueVectorDataPopulator.setVector(targetChild3, 0L, 10L, null, 30L, 40L); + VectorSchemaRoot root1 = VectorSchemaRoot.of(targetChild1, targetChild2, targetChild3); + root1.setRowCount(5); + + ValueVectorDataPopulator.setVector(deltaChild1, 5, 6, 7); + ValueVectorDataPopulator.setVector(deltaChild2, "five", "six", "seven"); + VectorSchemaRoot root2 = VectorSchemaRoot.of(deltaChild1, deltaChild2); + root2.setRowCount(3); + + IllegalArgumentException exp = assertThrows(IllegalArgumentException.class, + () -> VectorSchemaRootAppender.append(root1, root2)); + + assertEquals("Vector schema roots have 
different numbers of child vectors.", exp.getMessage()); + } + } + + @Test + public void testRootWithDifferentChildTypes() { + try (IntVector targetChild1 = new IntVector("t1", allocator); + VarCharVector targetChild2 = new VarCharVector("t2", allocator); + + IntVector deltaChild1 = new IntVector("d1", allocator); + VarCharVector deltaChild2 = new VarCharVector("d2", allocator)) { + + ValueVectorDataPopulator.setVector(targetChild1, 0, 1, null, 3, 4); + ValueVectorDataPopulator.setVector(targetChild2, "zero", "one", null, "three", "four"); + VectorSchemaRoot root1 = VectorSchemaRoot.of(targetChild1, targetChild2); + root1.setRowCount(5); + + ValueVectorDataPopulator.setVector(deltaChild1, 5, 6, 7); + ValueVectorDataPopulator.setVector(deltaChild2, "five", "six", "seven"); + + // note that the child vectors are in reverse order + VectorSchemaRoot root2 = VectorSchemaRoot.of(deltaChild2, deltaChild1); + root2.setRowCount(3); + + IllegalArgumentException exp = assertThrows(IllegalArgumentException.class, + () -> VectorSchemaRootAppender.append(root1, root2)); + + assertEquals("Vector schema roots have different schemas.", exp.getMessage()); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVector.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVector.java new file mode 100644 index 000000000..2354b281e --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVector.java @@ -0,0 +1,260 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.validate; + +import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; +import static org.apache.arrow.vector.util.ValueVectorUtility.validate; +import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.nio.charset.Charset; +import java.util.Arrays; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.LargeVarCharVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.complex.impl.NullableStructWriter; +import org.apache.arrow.vector.holders.NullableFloat4Holder; +import org.apache.arrow.vector.holders.NullableFloat8Holder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestValidateVector { + + private BufferAllocator allocator; + + @Before + public void init() { + 
allocator = new RootAllocator(Long.MAX_VALUE); + } + + private static final Charset utf8Charset = Charset.forName("UTF-8"); + private static final byte[] STR1 = "AAAAA1".getBytes(utf8Charset); + private static final byte[] STR2 = "BBBBBBBBB2".getBytes(utf8Charset); + private static final byte[] STR3 = "CCCC3".getBytes(utf8Charset); + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testBaseFixedWidthVector() { + try (final IntVector vector = new IntVector("v", allocator)) { + validate(vector); + setVector(vector, 1, 2, 3); + validate(vector); + + vector.getDataBuffer().capacity(0); + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validate(vector)); + assertTrue(e.getMessage().contains("Not enough capacity for fixed width data buffer")); + } + } + + @Test + public void testBaseVariableWidthVector() { + try (final VarCharVector vector = new VarCharVector("v", allocator)) { + validate(vector); + setVector(vector, STR1, STR2, STR3); + validate(vector); + + vector.getDataBuffer().capacity(0); + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validate(vector)); + assertTrue(e.getMessage().contains("Not enough capacity for data buffer")); + } + } + + @Test + public void testBaseLargeVariableWidthVector() { + try (final LargeVarCharVector vector = new LargeVarCharVector("v", allocator)) { + validate(vector); + setVector(vector, STR1, STR2, null, STR3); + validate(vector); + + vector.getDataBuffer().capacity(0); + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validate(vector)); + assertTrue(e.getMessage().contains("Not enough capacity for data buffer")); + } + } + + @Test + public void testListVector() { + try (final ListVector vector = ListVector.empty("v", allocator)) { + validate(vector); + setVector(vector, Arrays.asList(1, 2, 3), Arrays.asList(4, 5)); + validate(vector); + + 
vector.getDataVector().setValueCount(3); + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validate(vector)); + assertTrue(e.getMessage().contains("Inner vector does not contain enough elements.")); + } + } + + @Test + public void testLargeListVector() { + try (final LargeListVector vector = LargeListVector.empty("v", allocator)) { + validate(vector); + setVector(vector, Arrays.asList(1, 2, 3, 4), Arrays.asList(5, 6)); + validate(vector); + + vector.getDataVector().setValueCount(4); + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validate(vector)); + assertTrue(e.getMessage().contains("Inner vector does not contain enough elements.")); + } + } + + @Test + public void testFixedSizeListVector() { + try (final FixedSizeListVector vector = FixedSizeListVector.empty("v", 3, allocator)) { + validate(vector); + setVector(vector, Arrays.asList(1, 2, 3), Arrays.asList(4, 5, 6)); + validate(vector); + + vector.getDataVector().setValueCount(3); + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validate(vector)); + assertTrue(e.getMessage().contains("Inner vector does not contain enough elements.")); + } + } + + @Test + public void testStructVectorRangeEquals() { + try (final StructVector vector = StructVector.empty("struct", allocator)) { + vector.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + vector.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + + validate(vector); + + NullableStructWriter writer = vector.getWriter(); + writer.allocate(); + + writeStructVector(writer, 1, 10L); + writeStructVector(writer, 2, 20L); + writeStructVector(writer, 3, 30L); + writeStructVector(writer, 4, 40L); + writeStructVector(writer, 5, 50L); + writer.setValueCount(5); + + vector.getChild("f0").setValueCount(2); + ValidateUtil.ValidateException e = 
assertThrows(ValidateUtil.ValidateException.class, + () -> validate(vector)); + assertTrue(e.getMessage().contains("Struct vector length not equal to child vector length")); + + vector.getChild("f0").setValueCount(5); + validate(vector); + + vector.getChild("f0").getDataBuffer().capacity(0); + ValidateUtil.ValidateException e2 = assertThrows(ValidateUtil.ValidateException.class, + () -> validate(vector)); + assertTrue(e2.getMessage().contains("Not enough capacity for fixed width data buffer")); + } + } + + @Test + public void testUnionVector() { + try (final UnionVector vector = UnionVector.empty("union", allocator)) { + validate(vector); + + final NullableFloat4Holder float4Holder = new NullableFloat4Holder(); + float4Holder.value = 1.01f; + float4Holder.isSet = 1; + + final NullableFloat8Holder float8Holder = new NullableFloat8Holder(); + float8Holder.value = 2.02f; + float8Holder.isSet = 1; + + vector.setType(0, Types.MinorType.FLOAT4); + vector.setSafe(0, float4Holder); + vector.setType(1, Types.MinorType.FLOAT8); + vector.setSafe(1, float8Holder); + vector.setValueCount(2); + + validate(vector); + + vector.getChildrenFromFields().get(0).setValueCount(1); + ValidateUtil.ValidateException e1 = assertThrows(ValidateUtil.ValidateException.class, + () -> validate(vector)); + assertTrue(e1.getMessage().contains("Union vector length not equal to child vector length")); + + vector.getChildrenFromFields().get(0).setValueCount(2); + validate(vector); + + vector.getChildrenFromFields().get(0).getDataBuffer().capacity(0); + ValidateUtil.ValidateException e2 = assertThrows(ValidateUtil.ValidateException.class, + () -> validate(vector)); + assertTrue(e2.getMessage().contains("Not enough capacity for fixed width data buffer")); + } + } + + @Test + public void testDenseUnionVector() { + try (final DenseUnionVector vector = DenseUnionVector.empty("union", allocator)) { + validate(vector); + + final NullableFloat4Holder float4Holder = new NullableFloat4Holder(); + 
float4Holder.value = 1.01f; + float4Holder.isSet = 1; + + final NullableFloat8Holder float8Holder = new NullableFloat8Holder(); + float8Holder.value = 2.02f; + float8Holder.isSet = 1; + + byte float4TypeId = vector.registerNewTypeId(Field.nullable("", Types.MinorType.FLOAT4.getType())); + byte float8TypeId = vector.registerNewTypeId(Field.nullable("", Types.MinorType.FLOAT8.getType())); + + vector.setTypeId(0, float4TypeId); + vector.setSafe(0, float4Holder); + vector.setTypeId(1, float8TypeId); + vector.setSafe(1, float8Holder); + vector.setValueCount(2); + + validate(vector); + + vector.getChildrenFromFields().get(0).getDataBuffer().capacity(0); + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validate(vector)); + assertTrue(e.getMessage().contains("Not enough capacity for fixed width data buffer")); + } + } + + private void writeStructVector(NullableStructWriter writer, int value1, long value2) { + writer.start(); + writer.integer("f0").writeInt(value1); + writer.bigInt("f1").writeBigInt(value2); + writer.end(); + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorFull.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorFull.java new file mode 100644 index 000000000..4241a0d9c --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorFull.java @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.validate; + +import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; +import static org.apache.arrow.vector.util.ValueVectorUtility.validateFull; +import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.util.Arrays; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.LargeVarCharVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.holders.NullableFloat4Holder; +import org.apache.arrow.vector.holders.NullableFloat8Holder; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestValidateVectorFull { + + private 
BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testBaseVariableWidthVector() { + try (final VarCharVector vector = new VarCharVector("v", allocator)) { + validateFull(vector); + setVector(vector, "aaa", "bbb", "ccc"); + validateFull(vector); + + ArrowBuf offsetBuf = vector.getOffsetBuffer(); + offsetBuf.setInt(0, 100); + offsetBuf.setInt(4, 50); + + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validateFull(vector)); + assertTrue(e.getMessage().contains("The values in positions 0 and 1 of the offset buffer are decreasing")); + } + } + + @Test + public void testBaseLargeVariableWidthVector() { + try (final LargeVarCharVector vector = new LargeVarCharVector("v", allocator)) { + validateFull(vector); + setVector(vector, "aaa", "bbb", null, "ccc"); + validateFull(vector); + + ArrowBuf offsetBuf = vector.getOffsetBuffer(); + offsetBuf.setLong(0, 100); + offsetBuf.setLong(8, 50); + + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validateFull(vector)); + assertTrue(e.getMessage().contains("The values in positions 0 and 1 of the large offset buffer are decreasing")); + } + } + + @Test + public void testListVector() { + try (final ListVector vector = ListVector.empty("v", allocator)) { + validateFull(vector); + setVector(vector, Arrays.asList(1, 2, 3), Arrays.asList(4, 5), Arrays.asList(6, 7, 8, 9)); + validateFull(vector); + + ArrowBuf offsetBuf = vector.getOffsetBuffer(); + offsetBuf.setInt(0, 100); + offsetBuf.setInt(8, 50); + + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validateFull(vector)); + assertTrue(e.getMessage().contains("The values in positions 0 and 1 of the offset buffer are decreasing")); + } + } + + @Test + public void testLargeListVector() { 
+ try (final LargeListVector vector = LargeListVector.empty("v", allocator)) { + validateFull(vector); + setVector(vector, Arrays.asList(1, 2, 3), Arrays.asList(4, 5), Arrays.asList(6, 7, 8, 9)); + validateFull(vector); + + ArrowBuf offsetBuf = vector.getOffsetBuffer(); + offsetBuf.setLong(0, 100); + offsetBuf.setLong(16, 50); + + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validateFull(vector)); + assertTrue(e.getMessage().contains("The values in positions 0 and 1 of the large offset buffer are decreasing")); + } + } + + @Test + public void testStructVectorRangeEquals() { + try (final StructVector vector = StructVector.empty("struct", allocator)) { + IntVector intVector = + vector.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + VarCharVector strVector = + vector.addOrGet("f1", FieldType.nullable(new ArrowType.Utf8()), VarCharVector.class); + + validateFull(vector); + validateFull(intVector); + validateFull(strVector); + + ValueVectorDataPopulator.setVector(intVector, 1, 2, 3, 4, 5); + ValueVectorDataPopulator.setVector(strVector, "a", "b", "c", "d", "e"); + vector.setValueCount(5); + + validateFull(vector); + validateFull(intVector); + validateFull(strVector); + + ArrowBuf offsetBuf = strVector.getOffsetBuffer(); + offsetBuf.setInt(0, 100); + offsetBuf.setInt(8, 50); + + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validateFull(strVector)); + assertTrue(e.getMessage().contains("The values in positions 0 and 1 of the offset buffer are decreasing")); + + e = assertThrows(ValidateUtil.ValidateException.class, + () -> validateFull(vector)); + assertTrue(e.getMessage().contains("The values in positions 0 and 1 of the offset buffer are decreasing")); + } + } + + @Test + public void testUnionVector() { + try (final UnionVector vector = UnionVector.empty("union", allocator)) { + validateFull(vector); + + final NullableFloat4Holder 
float4Holder = new NullableFloat4Holder(); + float4Holder.value = 1.01f; + float4Holder.isSet = 1; + + final NullableFloat8Holder float8Holder = new NullableFloat8Holder(); + float8Holder.value = 2.02f; + float8Holder.isSet = 1; + + vector.setType(0, Types.MinorType.FLOAT4); + vector.setSafe(0, float4Holder); + vector.setType(1, Types.MinorType.FLOAT8); + vector.setSafe(1, float8Holder); + vector.setValueCount(2); + + validateFull(vector); + + // negative type id + vector.getTypeBuffer().setByte(0, -1); + + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validateFull(vector)); + assertTrue(e.getMessage().contains("The type id at position 0 is negative")); + } + } + + @Test + public void testDenseUnionVector() { + try (final DenseUnionVector vector = DenseUnionVector.empty("union", allocator)) { + validateFull(vector); + + final NullableFloat4Holder float4Holder = new NullableFloat4Holder(); + float4Holder.value = 1.01f; + float4Holder.isSet = 1; + + final NullableFloat8Holder float8Holder = new NullableFloat8Holder(); + float8Holder.value = 2.02f; + float8Holder.isSet = 1; + + byte float4TypeId = vector.registerNewTypeId(Field.nullable("", Types.MinorType.FLOAT4.getType())); + byte float8TypeId = vector.registerNewTypeId(Field.nullable("", Types.MinorType.FLOAT8.getType())); + + vector.setTypeId(0, float4TypeId); + vector.setSafe(0, float4Holder); + vector.setTypeId(1, float8TypeId); + vector.setSafe(1, float8Holder); + vector.setValueCount(2); + + validateFull(vector); + + ValueVector subVector = vector.getVectorByType(float4TypeId); + assertTrue(subVector instanceof Float4Vector); + assertEquals(1, subVector.getValueCount()); + + // shrink sub-vector + subVector.setValueCount(0); + + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validateFull(vector)); + assertTrue(e.getMessage().contains("Dense union vector offset exceeds sub-vector boundary")); + } + } +} diff --git 
a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorSchemaRoot.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorSchemaRoot.java new file mode 100644 index 000000000..1885fb21f --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorSchemaRoot.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.validate; + +import static org.apache.arrow.vector.util.ValueVectorUtility.validate; +import static org.apache.arrow.vector.util.ValueVectorUtility.validateFull; +import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestValidateVectorSchemaRoot { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testValidatePositive() { + try (IntVector intVector = new IntVector("int vector", allocator); + VarCharVector strVector = new VarCharVector("var char vector", allocator)) { + + VectorSchemaRoot root = VectorSchemaRoot.of(intVector, strVector); + + validate(root); + validateFull(root); + + ValueVectorDataPopulator.setVector(intVector, 1, 2, 3, 4, 5); + ValueVectorDataPopulator.setVector(strVector, "a", "b", "c", "d", "e"); + root.setRowCount(5); + + validate(root); + validateFull(root); + } + } + + @Test + public void testValidateNegative() { + try (IntVector intVector = new IntVector("int vector", allocator); + VarCharVector strVector = new VarCharVector("var char vector", allocator)) { + + VectorSchemaRoot root = VectorSchemaRoot.of(intVector, strVector); + + ValueVectorDataPopulator.setVector(intVector, 1, 2, 3, 4, 5); + ValueVectorDataPopulator.setVector(strVector, "a", "b", "c", "d", "e"); + + // validate mismatching value counts + root.setRowCount(4); + 
intVector.setValueCount(5); + strVector.setValueCount(5); + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> validate(root)); + assertTrue(e.getMessage().contains("Child vector and vector schema root have different value counts")); + e = assertThrows(ValidateUtil.ValidateException.class, + () -> validateFull(root)); + assertTrue(e.getMessage().contains("Child vector and vector schema root have different value counts")); + + // validate problems with the child vector + root.setRowCount(5); + ArrowBuf offsetBuf = strVector.getOffsetBuffer(); + offsetBuf.setInt(0, 100); + offsetBuf.setInt(8, 50); + validate(root); + e = assertThrows(ValidateUtil.ValidateException.class, + () -> validateFull(root)); + assertTrue(e.getMessage().contains("The values in positions 0 and 1 of the offset buffer are decreasing")); + } + } +} diff --git a/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorTypeVisitor.java b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorTypeVisitor.java new file mode 100644 index 000000000..7a0f12f7a --- /dev/null +++ b/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorTypeVisitor.java @@ -0,0 +1,301 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.validate; + +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.util.function.Supplier; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.DateDayVector; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.DurationVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalYearVector; +import org.apache.arrow.vector.LargeVarBinaryVector; +import org.apache.arrow.vector.LargeVarCharVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TimeMicroVector; +import org.apache.arrow.vector.TimeMilliVector; +import org.apache.arrow.vector.TimeNanoVector; +import org.apache.arrow.vector.TimeSecVector; +import org.apache.arrow.vector.TimeStampMicroTZVector; +import org.apache.arrow.vector.TimeStampMicroVector; +import org.apache.arrow.vector.TimeStampMilliTZVector; +import org.apache.arrow.vector.TimeStampMilliVector; +import org.apache.arrow.vector.TimeStampNanoTZVector; +import org.apache.arrow.vector.TimeStampNanoVector; +import org.apache.arrow.vector.TimeStampSecTZVector; +import org.apache.arrow.vector.TimeStampSecVector; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import 
org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Test cases for {@link ValidateVectorTypeVisitor}. + */ +public class TestValidateVectorTypeVisitor { + + private BufferAllocator allocator; + + private ValidateVectorTypeVisitor visitor = new ValidateVectorTypeVisitor(); + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + private void testPositiveCase(Supplier<ValueVector> vectorGenerator) { + try (ValueVector vector = vectorGenerator.get();) { + vector.accept(visitor, null); + } + } + + private void testNegativeCase(Supplier<ValueVector> vectorGenerator) { + try (ValueVector vector = vectorGenerator.get()) { + assertThrows(ValidateUtil.ValidateException.class, () -> { + vector.accept(visitor, null); + }); + } + } + + @Test + public void testFixedWidthVectorsPositive() { + // integer vectors + testPositiveCase(() -> new TinyIntVector("vector", allocator)); + testPositiveCase(() -> new SmallIntVector("vector", allocator)); + testPositiveCase(() -> new IntVector("vector", allocator)); + testPositiveCase(() -> new BigIntVector("vector", allocator)); + testPositiveCase(() -> new UInt1Vector("vector", allocator)); + testPositiveCase(() -> new UInt2Vector("vector", allocator)); + 
testPositiveCase(() -> new UInt4Vector("vector", allocator)); + testPositiveCase(() -> new UInt8Vector("vector", allocator)); + + testPositiveCase(() -> new BitVector("vector", allocator)); + testPositiveCase(() -> new DecimalVector("vector", allocator, 30, 16)); + + // date vectors + testPositiveCase(() -> new DateDayVector("vector", allocator)); + testPositiveCase(() -> new DateMilliVector("vector", allocator)); + + testPositiveCase(() -> new DurationVector( + "vector", FieldType.nullable(new ArrowType.Duration(TimeUnit.SECOND)), allocator)); + + // float vectors + testPositiveCase(() -> new Float4Vector("vector", allocator)); + testPositiveCase(() -> new Float8Vector("vector", allocator)); + + // interval vectors + testPositiveCase(() -> new IntervalDayVector("vector", allocator)); + testPositiveCase(() -> new IntervalYearVector("vector", allocator)); + + // time vectors + testPositiveCase(() -> new TimeMicroVector("vector", allocator)); + testPositiveCase(() -> new TimeMilliVector("vector", allocator)); + testPositiveCase(() -> new TimeNanoVector("vector", allocator)); + testPositiveCase(() -> new TimeSecVector("vector", allocator)); + + // time stamp vectors + testPositiveCase(() -> new TimeStampMicroTZVector("vector", allocator, "cn")); + testPositiveCase(() -> new TimeStampMicroVector("vector", allocator)); + testPositiveCase(() -> new TimeStampMilliTZVector("vector", allocator, "cn")); + testPositiveCase(() -> new TimeStampMilliVector("vector", allocator)); + testPositiveCase(() -> new TimeStampNanoTZVector("vector", allocator, "cn")); + testPositiveCase(() -> new TimeStampNanoVector("vector", allocator)); + testPositiveCase(() -> new TimeStampSecTZVector("vector", allocator, "cn")); + testPositiveCase(() -> new TimeStampSecVector("vector", allocator)); + + testPositiveCase(() -> new FixedSizeBinaryVector("vector", allocator, 5)); + } + + @Test + public void testFixedWidthVectorsNegative() { + // integer vectors + testNegativeCase( + () -> new 
TinyIntVector("vector", FieldType.nullable(Types.MinorType.INT.getType()), allocator)); + testNegativeCase( + () -> new SmallIntVector("vector", FieldType.nullable(Types.MinorType.INT.getType()), allocator)); + testNegativeCase( + () -> new IntVector("vector", FieldType.nullable(Types.MinorType.SMALLINT.getType()), allocator)); + testNegativeCase( + () -> new BigIntVector("vector", FieldType.nullable(Types.MinorType.SMALLINT.getType()), allocator)); + testNegativeCase( + () -> new UInt1Vector("vector", FieldType.nullable(Types.MinorType.SMALLINT.getType()), allocator)); + testNegativeCase( + () -> new UInt2Vector("vector", FieldType.nullable(Types.MinorType.SMALLINT.getType()), allocator)); + testNegativeCase( + () -> new UInt4Vector("vector", FieldType.nullable(Types.MinorType.SMALLINT.getType()), allocator)); + testNegativeCase( + () -> new UInt8Vector("vector", FieldType.nullable(Types.MinorType.SMALLINT.getType()), allocator)); + + testNegativeCase( + () -> new BitVector("vector", FieldType.nullable(Types.MinorType.BIGINT.getType()), allocator)); + testNegativeCase( + () -> new DecimalVector("vector", allocator, 30, -16)); + + // date vectors + testNegativeCase( + () -> new DateDayVector("vector", FieldType.nullable(Types.MinorType.FLOAT4.getType()), allocator)); + testNegativeCase( + () -> new DateMilliVector("vector", FieldType.nullable(Types.MinorType.BIGINT.getType()), allocator)); + + // float point vectors + testNegativeCase( + () -> new Float4Vector("vector", FieldType.nullable(Types.MinorType.BIGINT.getType()), allocator)); + testNegativeCase( + () -> new Float8Vector("vector", FieldType.nullable(Types.MinorType.BIGINT.getType()), allocator)); + + // interval vectors + testNegativeCase( + () -> new IntervalDayVector("vector", FieldType.nullable(Types.MinorType.INT.getType()), allocator)); + testNegativeCase( + () -> new IntervalYearVector("vector", FieldType.nullable(Types.MinorType.BIGINT.getType()), allocator)); + + // time vectors + 
testNegativeCase( + () -> new TimeMilliVector("vector", FieldType.nullable(Types.MinorType.BIGINT.getType()), allocator)); + testNegativeCase( + () -> new TimeMicroVector("vector", FieldType.nullable(Types.MinorType.BIGINT.getType()), allocator)); + testNegativeCase( + () -> new TimeNanoVector("vector", FieldType.nullable(Types.MinorType.BIGINT.getType()), allocator)); + testNegativeCase( + () -> new TimeSecVector("vector", FieldType.nullable(Types.MinorType.BIGINT.getType()), allocator)); + + // time stamp vectors + testNegativeCase( + () -> new TimeStampMicroTZVector("vector", allocator, null)); + testNegativeCase( + () -> new TimeStampMicroVector("vector", FieldType.nullable(Types.MinorType.BIGINT.getType()), allocator)); + testNegativeCase( + () -> new TimeStampMilliTZVector("vector", allocator, null)); + testNegativeCase( + () -> new TimeStampMilliVector("vector", FieldType.nullable(Types.MinorType.BIGINT.getType()), allocator)); + testNegativeCase( + () -> new TimeStampNanoTZVector("vector", allocator, null)); + testNegativeCase( + () -> new TimeStampNanoVector("vector", FieldType.nullable(Types.MinorType.BIGINT.getType()), allocator)); + testNegativeCase( + () -> new TimeStampSecTZVector("vector", allocator, null)); + testNegativeCase( + () -> new TimeStampSecVector("vector", FieldType.nullable(Types.MinorType.BIGINT.getType()), allocator)); + } + + @Test + public void testVariableWidthVectorsPositive() { + testPositiveCase(() -> new VarCharVector("vector", allocator)); + testPositiveCase(() -> new VarBinaryVector("vector", allocator)); + } + + @Test + public void testVariableWidthVectorsNegative() { + testNegativeCase( + () -> new VarCharVector("vector", FieldType.nullable(Types.MinorType.INT.getType()), allocator)); + testNegativeCase( + () -> new VarBinaryVector("vector", FieldType.nullable(Types.MinorType.INT.getType()), allocator)); + } + + @Test + public void testLargeVariableWidthVectorsPositive() { + testPositiveCase(() -> new 
LargeVarCharVector("vector", allocator)); + testPositiveCase(() -> new LargeVarBinaryVector("vector", allocator)); + } + + @Test + public void testLargeVariableWidthVectorsNegative() { + testNegativeCase( + () -> new LargeVarCharVector("vector", FieldType.nullable(Types.MinorType.INT.getType()), allocator)); + testNegativeCase( + () -> new LargeVarBinaryVector("vector", FieldType.nullable(Types.MinorType.INT.getType()), allocator)); + } + + @Test + public void testListVector() { + testPositiveCase(() -> ListVector.empty("vector", allocator)); + + testNegativeCase( + () -> new ListVector("vector", allocator, FieldType.nullable(Types.MinorType.INT.getType()), null)); + } + + @Test + public void testLargeListVector() { + testPositiveCase(() -> LargeListVector.empty("vector", allocator)); + + testNegativeCase( + () -> new LargeListVector("vector", allocator, FieldType.nullable(Types.MinorType.INT.getType()), null)); + } + + @Test + public void testFixedSizeListVector() { + testPositiveCase(() -> FixedSizeListVector.empty("vector", 10, allocator)); + } + + @Test + public void testStructVector() { + testPositiveCase(() -> StructVector.empty("vector", allocator)); + + testNegativeCase( + () -> new StructVector("vector", allocator, FieldType.nullable(Types.MinorType.INT.getType()), null)); + } + + @Test + public void testUnionVector() { + testPositiveCase(() -> UnionVector.empty("vector", allocator)); + } + + @Test + public void testDenseUnionVector() { + testPositiveCase(() -> DenseUnionVector.empty("vector", allocator)); + } + + @Test + public void testNullVector() { + testPositiveCase(() -> new NullVector("null vec")); + } +} diff --git a/src/arrow/java/vector/src/test/resources/logback.xml b/src/arrow/java/vector/src/test/resources/logback.xml new file mode 100644 index 000000000..f9e449fa6 --- /dev/null +++ b/src/arrow/java/vector/src/test/resources/logback.xml @@ -0,0 +1,28 @@ + + + + + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + + 
+ -- cgit v1.2.3