From 58daab21cd043e1dc37024a7f99b396788372918 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 9 Mar 2024 14:19:48 +0100 Subject: Merging upstream version 1.44.3. Signed-off-by: Daniel Baumann --- web/server/h2o/libh2o/deps/brotli/enc/cluster.h | 297 ++++++++++++++++++++++++ 1 file changed, 297 insertions(+) create mode 100644 web/server/h2o/libh2o/deps/brotli/enc/cluster.h (limited to 'web/server/h2o/libh2o/deps/brotli/enc/cluster.h') diff --git a/web/server/h2o/libh2o/deps/brotli/enc/cluster.h b/web/server/h2o/libh2o/deps/brotli/enc/cluster.h new file mode 100644 index 000000000..4f6c06ee5 --- /dev/null +++ b/web/server/h2o/libh2o/deps/brotli/enc/cluster.h @@ -0,0 +1,297 @@ +/* Copyright 2013 Google Inc. All Rights Reserved. + + Distributed under MIT license. + See file LICENSE for detail or copy at https://opensource.org/licenses/MIT +*/ + +// Functions for clustering similar histograms together. + +#ifndef BROTLI_ENC_CLUSTER_H_ +#define BROTLI_ENC_CLUSTER_H_ + +#include +#include +#include +#include +#include + +#include "./bit_cost.h" +#include "./entropy_encode.h" +#include "./fast_log.h" +#include "./histogram.h" +#include "./port.h" +#include "./types.h" + +namespace brotli { + +struct HistogramPair { + uint32_t idx1; + uint32_t idx2; + double cost_combo; + double cost_diff; +}; + +inline bool operator<(const HistogramPair& p1, const HistogramPair& p2) { + if (p1.cost_diff != p2.cost_diff) { + return p1.cost_diff > p2.cost_diff; + } + return (p1.idx2 - p1.idx1) > (p2.idx2 - p2.idx1); +} + +// Returns entropy reduction of the context map when we combine two clusters. +inline double ClusterCostDiff(size_t size_a, size_t size_b) { + size_t size_c = size_a + size_b; + return static_cast(size_a) * FastLog2(size_a) + + static_cast(size_b) * FastLog2(size_b) - + static_cast(size_c) * FastLog2(size_c); +} + +// Computes the bit cost reduction by combining out[idx1] and out[idx2] and if +// it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue. +template +void CompareAndPushToQueue(const HistogramType* out, + const uint32_t* cluster_size, + uint32_t idx1, uint32_t idx2, + std::vector* pairs) { + if (idx1 == idx2) { + return; + } + if (idx2 < idx1) { + uint32_t t = idx2; + idx2 = idx1; + idx1 = t; + } + bool store_pair = false; + HistogramPair p; + p.idx1 = idx1; + p.idx2 = idx2; + p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]); + p.cost_diff -= out[idx1].bit_cost_; + p.cost_diff -= out[idx2].bit_cost_; + + if (out[idx1].total_count_ == 0) { + p.cost_combo = out[idx2].bit_cost_; + store_pair = true; + } else if (out[idx2].total_count_ == 0) { + p.cost_combo = out[idx1].bit_cost_; + store_pair = true; + } else { + double threshold = pairs->empty() ? 1e99 : + std::max(0.0, (*pairs)[0].cost_diff); + HistogramType combo = out[idx1]; + combo.AddHistogram(out[idx2]); + double cost_combo = PopulationCost(combo); + if (cost_combo < threshold - p.cost_diff) { + p.cost_combo = cost_combo; + store_pair = true; + } + } + if (store_pair) { + p.cost_diff += p.cost_combo; + if (!pairs->empty() && (pairs->front() < p)) { + // Replace the top of the queue if needed. + pairs->push_back(pairs->front()); + pairs->front() = p; + } else { + pairs->push_back(p); + } + } +} + +template +void HistogramCombine(HistogramType* out, + uint32_t* cluster_size, + uint32_t* symbols, + size_t symbols_size, + size_t max_clusters) { + double cost_diff_threshold = 0.0; + size_t min_cluster_size = 1; + + // Uniquify the list of symbols. + std::vector clusters(symbols, symbols + symbols_size); + std::sort(clusters.begin(), clusters.end()); + std::vector::iterator last = + std::unique(clusters.begin(), clusters.end()); + clusters.resize(static_cast(last - clusters.begin())); + + // We maintain a heap of histogram pairs, ordered by the bit cost reduction. + std::vector pairs; + for (size_t idx1 = 0; idx1 < clusters.size(); ++idx1) { + for (size_t idx2 = idx1 + 1; idx2 < clusters.size(); ++idx2) { + CompareAndPushToQueue(out, cluster_size, clusters[idx1], clusters[idx2], + &pairs); + } + } + + while (clusters.size() > min_cluster_size) { + if (pairs[0].cost_diff >= cost_diff_threshold) { + cost_diff_threshold = 1e99; + min_cluster_size = max_clusters; + continue; + } + // Take the best pair from the top of heap. + uint32_t best_idx1 = pairs[0].idx1; + uint32_t best_idx2 = pairs[0].idx2; + out[best_idx1].AddHistogram(out[best_idx2]); + out[best_idx1].bit_cost_ = pairs[0].cost_combo; + cluster_size[best_idx1] += cluster_size[best_idx2]; + for (size_t i = 0; i < symbols_size; ++i) { + if (symbols[i] == best_idx2) { + symbols[i] = best_idx1; + } + } + for (std::vector::iterator cluster = clusters.begin(); + cluster != clusters.end(); ++cluster) { + if (*cluster >= best_idx2) { + clusters.erase(cluster); + break; + } + } + + // Remove pairs intersecting the just combined best pair. + size_t copy_to_idx = 0; + for (size_t i = 0; i < pairs.size(); ++i) { + HistogramPair& p = pairs[i]; + if (p.idx1 == best_idx1 || p.idx2 == best_idx1 || + p.idx1 == best_idx2 || p.idx2 == best_idx2) { + // Remove invalid pair from the queue. + continue; + } + if (pairs.front() < p) { + // Replace the top of the queue if needed. + HistogramPair front = pairs.front(); + pairs.front() = p; + pairs[copy_to_idx] = front; + } else { + pairs[copy_to_idx] = p; + } + ++copy_to_idx; + } + pairs.resize(copy_to_idx); + + // Push new pairs formed with the combined histogram to the heap. + for (size_t i = 0; i < clusters.size(); ++i) { + CompareAndPushToQueue(out, cluster_size, best_idx1, clusters[i], &pairs); + } + } +} + +// ----------------------------------------------------------------------------- +// Histogram refinement + +// What is the bit cost of moving histogram from cur_symbol to candidate. +template +double HistogramBitCostDistance(const HistogramType& histogram, + const HistogramType& candidate) { + if (histogram.total_count_ == 0) { + return 0.0; + } + HistogramType tmp = histogram; + tmp.AddHistogram(candidate); + return PopulationCost(tmp) - candidate.bit_cost_; +} + +// Find the best 'out' histogram for each of the 'in' histograms. +// Note: we assume that out[]->bit_cost_ is already up-to-date. +template +void HistogramRemap(const HistogramType* in, size_t in_size, + HistogramType* out, uint32_t* symbols) { + // Uniquify the list of symbols. + std::vector all_symbols(symbols, symbols + in_size); + std::sort(all_symbols.begin(), all_symbols.end()); + std::vector::iterator last = + std::unique(all_symbols.begin(), all_symbols.end()); + all_symbols.resize(static_cast(last - all_symbols.begin())); + + for (size_t i = 0; i < in_size; ++i) { + uint32_t best_out = i == 0 ? symbols[0] : symbols[i - 1]; + double best_bits = HistogramBitCostDistance(in[i], out[best_out]); + for (std::vector::const_iterator k = all_symbols.begin(); + k != all_symbols.end(); ++k) { + const double cur_bits = HistogramBitCostDistance(in[i], out[*k]); + if (cur_bits < best_bits) { + best_bits = cur_bits; + best_out = *k; + } + } + symbols[i] = best_out; + } + + + // Recompute each out based on raw and symbols. + for (std::vector::const_iterator k = all_symbols.begin(); + k != all_symbols.end(); ++k) { + out[*k].Clear(); + } + for (size_t i = 0; i < in_size; ++i) { + out[symbols[i]].AddHistogram(in[i]); + } +} + +// Reorder histograms in *out so that the new symbols in *symbols come in +// increasing order. +template +void HistogramReindex(std::vector* out, + std::vector* symbols) { + std::vector tmp(*out); + std::map new_index; + uint32_t next_index = 0; + for (size_t i = 0; i < symbols->size(); ++i) { + if (new_index.find((*symbols)[i]) == new_index.end()) { + new_index[(*symbols)[i]] = next_index; + (*out)[next_index] = tmp[(*symbols)[i]]; + ++next_index; + } + } + out->resize(next_index); + for (size_t i = 0; i < symbols->size(); ++i) { + (*symbols)[i] = new_index[(*symbols)[i]]; + } +} + +// Clusters similar histograms in 'in' together, the selected histograms are +// placed in 'out', and for each index in 'in', *histogram_symbols will +// indicate which of the 'out' histograms is the best approximation. +template +void ClusterHistograms(const std::vector& in, + size_t num_contexts, size_t num_blocks, + size_t max_histograms, + std::vector* out, + std::vector* histogram_symbols) { + const size_t in_size = num_contexts * num_blocks; + assert(in_size == in.size()); + std::vector cluster_size(in_size, 1); + out->resize(in_size); + histogram_symbols->resize(in_size); + for (size_t i = 0; i < in_size; ++i) { + (*out)[i] = in[i]; + (*out)[i].bit_cost_ = PopulationCost(in[i]); + (*histogram_symbols)[i] = static_cast(i); + } + + + const size_t max_input_histograms = 64; + for (size_t i = 0; i < in_size; i += max_input_histograms) { + size_t num_to_combine = std::min(in_size - i, max_input_histograms); + HistogramCombine(&(*out)[0], &cluster_size[0], + &(*histogram_symbols)[i], num_to_combine, + max_histograms); + } + + // Collapse similar histograms. + HistogramCombine(&(*out)[0], &cluster_size[0], + &(*histogram_symbols)[0], in_size, + max_histograms); + + // Find the optimal map from original histograms to the final ones. + HistogramRemap(&in[0], in_size, &(*out)[0], &(*histogram_symbols)[0]); + + // Convert the context map to a canonical form. + HistogramReindex(out, histogram_symbols); + +} + + +} // namespace brotli + +#endif // BROTLI_ENC_CLUSTER_H_ -- cgit v1.2.3