diff options
Diffstat (limited to 'src/arrow/go/parquet/metadata/app_version.go')
-rw-r--r-- | src/arrow/go/parquet/metadata/app_version.go | 184 |
1 files changed, 184 insertions, 0 deletions
diff --git a/src/arrow/go/parquet/metadata/app_version.go b/src/arrow/go/parquet/metadata/app_version.go new file mode 100644 index 000000000..1433da400 --- /dev/null +++ b/src/arrow/go/parquet/metadata/app_version.go @@ -0,0 +1,184 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metadata + +import ( + "regexp" + "strconv" + "strings" + + "github.com/apache/arrow/go/v6/parquet" + "github.com/apache/arrow/go/v6/parquet/schema" +) + +var ( + // Regular expression for the version format + // major . minor . patch unknown - prerelease.x + build info + // Eg: 1.5.0ab-cdh5.5.0+cd + versionRx = regexp.MustCompile(`^(\d+)\.(\d+)\.(\d+)([^-+]*)?(?:-([^+]*))?(?:\+(.*))?$`) + // Regular expression for the application format + // application_name version VERSION_FORMAT (build build_name) + // Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd) + applicationRx = regexp.MustCompile(`^(.*?)\s*(?:(version\s*(?:([^(]*?)\s*(?:\(\s*build\s*([^)]*?)\s*\))?)?)?)$`) + + // Parquet816FixedVersion is the version used for fixing PARQUET-816 + // that changed the padding calculations for dictionary headers on row groups. + Parquet816FixedVersion = NewAppVersionExplicit("parquet-mr", 1, 2, 9) + parquet251FixedVersion = NewAppVersionExplicit("parquet-mr", 1, 8, 0) + parquetCPPFixedStatsVersion = NewAppVersionExplicit("parquet-cpp", 1, 3, 0) + parquetMRFixedStatsVersion = NewAppVersionExplicit("parquet-mr", 1, 10, 0) + // parquet1655FixedVersion is the version used for fixing PARQUET-1655 + // which fixed min/max stats comparisons for Decimal types + parquet1655FixedVersion = NewAppVersionExplicit("parquet-cpp-arrow", 4, 0, 0) +) + +// AppVersion represents a specific application version either read from +// or written to a parquet file. +type AppVersion struct { + App string + Build string + Version struct { + Major int + Minor int + Patch int + Unknown string + PreRelease string + BuildInfo string + } +} + +// NewAppVersionExplicit is a convenience function to construct a specific +// application version from the given app string and version +func NewAppVersionExplicit(app string, major, minor, patch int) *AppVersion { + v := &AppVersion{App: app} + v.Version.Major = major + v.Version.Minor = minor + v.Version.Patch = patch + return v +} + +// NewAppVersion parses a "created by" string such as "parquet-go 1.0.0". +// +// It also supports handling pre-releases and build info such as +// parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd) +func NewAppVersion(createdby string) *AppVersion { + v := &AppVersion{} + + var ver []string + + m := applicationRx.FindStringSubmatch(strings.ToLower(createdby)) + if len(m) >= 4 { + v.App = m[1] + v.Build = m[4] + ver = versionRx.FindStringSubmatch(m[3]) + } else { + v.App = "unknown" + } + + if len(ver) >= 7 { + v.Version.Major, _ = strconv.Atoi(ver[1]) + v.Version.Minor, _ = strconv.Atoi(ver[2]) + v.Version.Patch, _ = strconv.Atoi(ver[3]) + v.Version.Unknown = ver[4] + v.Version.PreRelease = ver[5] + v.Version.BuildInfo = ver[6] + } + return v +} + +// LessThan compares the app versions and returns true if this version +// is "less than" the passed version. +// +// If the apps don't match, this always returns false. Otherwise it compares +// the major versions first, then the minor versions, and finally the patch +// versions. +// +// Pre-release and build info are not considered. +func (v AppVersion) LessThan(other *AppVersion) bool { + switch { + case v.App != other.App: + return false + case v.Version.Major < other.Version.Major: + return true + case v.Version.Major > other.Version.Major: + return false + case v.Version.Minor < other.Version.Minor: + return true + case v.Version.Minor > other.Version.Minor: + return false + } + + return v.Version.Patch < other.Version.Patch +} + +// Equal only compares the Application and major/minor/patch versions. +// +// Pre-release and build info are not considered. +func (v AppVersion) Equal(other *AppVersion) bool { + return v.App == other.App && + v.Version.Major == other.Version.Major && + v.Version.Minor == other.Version.Minor && + v.Version.Patch == other.Version.Patch +} + +// HasCorrectStatistics checks whether or not the statistics are valid to be used +// based on the primitive type and the version since previous versions had issues with +// properly computing stats. +// +// Reference: parquet-cpp/src/parquet/metadata.cc +// +// PARQUET-686 has more discussion on statistics +func (v AppVersion) HasCorrectStatistics(coltype parquet.Type, logicalType schema.LogicalType, stats EncodedStatistics, sort schema.SortOrder) bool { + // parquet-cpp version 1.3.0 and parquet-mr 1.10.0 onwards stats are computed correctly for all types except decimal + if (v.App == "parquet-cpp" && v.LessThan(parquetCPPFixedStatsVersion)) || + (v.App == "parquet-mr" && v.LessThan(parquetMRFixedStatsVersion)) { + // only SIGNED are valid unless max and min are the same (in which case the sort order doesn't matter) + var maxEqualsMin bool + if stats.HasMin && stats.HasMax { + maxEqualsMin = string(stats.Min) == string(stats.Max) + } + if sort != schema.SortSIGNED && !maxEqualsMin { + return false + } + + if coltype != parquet.Types.FixedLenByteArray && coltype != parquet.Types.ByteArray { + return true + } + } + + // parquet-cpp-arrow version 4.0.0 fixed Decimal comparisons for creating min/max stats + // parquet-cpp also becomes parquet-cpp-arrow as of version 4.0.0 + if v.App == "parquet-cpp" || (v.App == "parquet-cpp-arrow" && v.LessThan(parquet1655FixedVersion)) { + if _, ok := logicalType.(*schema.DecimalLogicalType); ok && coltype == parquet.Types.FixedLenByteArray { + return false + } + } + + // created_by is not populated, which could have been caused by + // parquet-mr during the same time as PARQUET-251, see PARQUET-297 + if v.App == "unknown" { + return true + } + + // unknown sort order has incorrect stats + if sort == schema.SortUNKNOWN { + return false + } + + // PARQUET-251 + return !v.LessThan(parquet251FixedVersion) +} |