Skip to content

Commit 75c7161

Browse files
committed
For best case size computation add a configurable reduction percentage.
The merger prunes patch pairs from evaluation by looking at a "best case" cost reduction. During this calculation the size of the merged patch is estimated using the sizes of the individual patches being merged. The following formula is used: merged_size = largest_individual_patch_size + best_case_size_reduction_fraction * (sum of remaining patch sizes). Setting best_case_size_reduction_fraction to 0 gives the previous behaviour; larger values cause more aggressive pruning, at the cost of possibly underestimating the true best case for a pair and over-pruning pairs that may have actually been viable. Also adds a flag that, when enabled, has the merger record a histogram of encountered size reductions. This was used on a few example fonts to inform the default value of 0.50. In testing the default value gives a significant speedup, while having negligible impact on the total cost of produced segmentations.
1 parent e8e79d7 commit 75c7161

File tree

11 files changed

+182
-27
lines changed

11 files changed

+182
-27
lines changed

ift/encoder/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ cc_library(
8484
"//ift",
8585
"//ift/freq",
8686
"@abseil-cpp//absl/status",
87+
"@abseil-cpp//absl/flags:flag",
8788
"@abseil-cpp//absl/status:statusor",
8889
"@abseil-cpp//absl/strings",
8990
"@abseil-cpp//absl/container:flat_hash_map",

ift/encoder/candidate_merge.cc

Lines changed: 54 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <utility>
66
#include <vector>
77

8+
#include "absl/flags/flag.h"
89
#include "absl/container/btree_map.h"
910
#include "absl/log/log.h"
1011
#include "absl/status/status.h"
@@ -191,15 +192,10 @@ static void MergeSegments(const Merger& merger, const SegmentSet& segments,
191192
base.SetProbability(bound);
192193
}
193194

194-
static Status AddConditionAndPatchSize(
195-
const Merger& merger, const ActivationCondition& condition,
196-
btree_map<ActivationCondition, uint32_t>& conditions) {
197-
auto existing = conditions.find(condition);
198-
if (existing != conditions.end()) {
199-
// already exists.
200-
return absl::OkStatus();
201-
}
202-
195+
static StatusOr<uint32_t> ConditionToPatchSize(
196+
const Merger& merger,
197+
const ActivationCondition& condition
198+
) {
203199
const auto& conditions_and_glyphs =
204200
merger.Context().glyph_groupings.ConditionsAndGlyphs();
205201
auto it = conditions_and_glyphs.find(condition);
@@ -209,9 +205,21 @@ static Status AddConditionAndPatchSize(
209205
}
210206

211207
const GlyphSet& glyphs = it->second;
212-
uint32_t patch_size =
208+
return
213209
TRY(merger.Context().patch_size_cache->GetPatchSize(glyphs));
214-
conditions.insert(std::pair(condition, patch_size));
210+
}
211+
212+
static Status AddConditionAndPatchSize(
213+
const Merger& merger, const ActivationCondition& condition,
214+
btree_map<ActivationCondition, uint32_t>& conditions) {
215+
216+
auto existing = conditions.lower_bound(condition);
217+
if (existing != conditions.end() && existing->first == condition) {
218+
// already exists.
219+
return absl::OkStatus();
220+
}
221+
222+
conditions.emplace_hint(existing, condition, TRY(ConditionToPatchSize(merger, condition)));
215223
return absl::OkStatus();
216224
}
217225

@@ -413,8 +421,27 @@ StatusOr<std::pair<double, GlyphSet>> CandidateMerge::ComputeInitFontCostDelta(
413421
return std::make_pair(total_delta, glyph_closure_delta);
414422
}
415423

424+
static std::optional<double> ComputeMergedSizeReduction(
425+
uint32_t new_patch_size,
426+
const btree_map<ActivationCondition, uint32_t>& removed_conditions
427+
) {
428+
int32_t total_removed_size = 0;
429+
int32_t largest_size = 0;
430+
for (const auto& [_, removed_size] : removed_conditions) {
431+
total_removed_size += removed_size;
432+
largest_size = std::max((int32_t) removed_size, largest_size);
433+
}
434+
435+
int32_t extra_raw = total_removed_size - largest_size;
436+
int32_t extra_actual = ((int32_t) new_patch_size) - largest_size;
437+
if (extra_raw == 0) {
438+
return std::nullopt;
439+
}
440+
return (double) extra_actual / (double) extra_raw;
441+
}
442+
416443
StatusOr<double> CandidateMerge::ComputeCostDelta(
417-
const Merger& merger, const SegmentSet& merged_segments,
444+
Merger& merger, const SegmentSet& merged_segments,
418445
const Segment& merged_segment, std::optional<uint32_t> maybe_new_patch_size) {
419446

420447
// TODO(garretrieger): the accuracy of this can be improved by factoring
@@ -443,14 +470,24 @@ StatusOr<double> CandidateMerge::ComputeCostDelta(
443470
uint32_t new_patch_size = 0;
444471
if (maybe_new_patch_size.has_value()) {
445472
new_patch_size = *maybe_new_patch_size;
473+
if (merger.ShouldRecordMergedSizeReductions()) {
474+
std::optional<double> reduction = ComputeMergedSizeReduction(new_patch_size, removed_conditions);
475+
if (reduction.has_value()) {
476+
merger.RecordMergedSizeReduction(*reduction);
477+
}
478+
}
446479
} else {
447-
// In the best case the merged patch size is equal to that of the largest removed patch.
448-
// All removed patches will be joined into the new merged segment, and the best case is
449-
// that all of their data is completely redundant as much as possible.
480+
// In the best case the merged patch size is equal to that of the largest removed patch,
481+
// plus the data of all other removed patches reduced by a configured fraction.
482+
uint32_t total_removed_size = 0;
483+
uint32_t largest_size = 0;
450484
for (const auto& [_, removed_size] : removed_conditions) {
451-
new_patch_size = std::max(removed_size, new_patch_size);
485+
total_removed_size += removed_size;
486+
largest_size = std::max(removed_size, largest_size);
452487
}
453-
new_patch_size += Merger::BEST_CASE_MERGE_SIZE_DELTA;
488+
uint32_t extra = total_removed_size - largest_size;
489+
extra = std::max((uint32_t) (extra * merger.Strategy().BestCaseSizeReductionFraction()), Merger::BEST_CASE_MERGE_SIZE_DELTA);
490+
new_patch_size = largest_size + extra;
454491
}
455492

456493
double cost_delta = 0.0;

ift/encoder/candidate_merge.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ struct CandidateMerge {
143143
// If new_patch_size is not provided then this computes a "best case" delta
144144
// where the new patch size is chosen to produce the best achievable delta.
145145
static absl::StatusOr<double> ComputeCostDelta(
146-
const Merger& merger, const common::SegmentSet& merged_segments,
146+
Merger& merger, const common::SegmentSet& merged_segments,
147147
const Segment& merged_segment, std::optional<uint32_t> new_patch_size);
148148

149149
// Computes the predicted change to the total cost if moved_glyphs are

ift/encoder/closure_glyph_segmenter.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -445,10 +445,6 @@ StatusOr<GlyphSegmentation> ClosureGlyphSegmenter::CodepointToGlyphSegments(
445445
TRYV(context.ReassignInitSubset(new_def));
446446
}
447447

448-
// Before we start merging, make sure the state after init font processing is
449-
// correct.
450-
TRYV(ValidateIncrementalGroupings(face, context));
451-
452448
if (merge_groups.empty()) {
453449
// No merging will be needed so we're done.
454450
return context.ToGlyphSegmentation();
@@ -494,6 +490,10 @@ StatusOr<GlyphSegmentation> ClosureGlyphSegmenter::CodepointToGlyphSegments(
494490
// Nothing was merged so we're done.
495491
TRYV(ValidateIncrementalGroupings(face, context));
496492
context.patch_size_cache->LogBrotliCallCount();
493+
for (const auto& merger : mergers) {
494+
merger.LogMergedSizeHistogram();
495+
}
496+
497497
return context.ToGlyphSegmentation();
498498
}
499499

ift/encoder/merge_strategy.cc

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,19 @@ void PrintTo(const MergeStrategy& strategy, std::ostream* os) {
1010
<< " network_overhead = " << strategy.NetworkOverheadCost()
1111
<< std::endl
1212
<< " min_group_size = " << strategy.MinimumGroupSize() << std::endl
13-
<< " optimization_cutoff = " << strategy.OptimizationCutoffFraction()
14-
<< std::endl;
13+
<< " optimization_cutoff = " << strategy.OptimizationCutoffFraction() << std::endl
14+
<< " best_case_size_reduction_fraction = " << strategy.BestCaseSizeReductionFraction() << std::endl;
15+
16+
if (strategy.InitFontMergeThreshold().has_value()) {
17+
*os << " init_font_merge_threshold = " << *strategy.InitFontMergeThreshold() << std::endl;
18+
}
19+
if (strategy.InitFontMergeProbabilityThreshold().has_value()) {
20+
*os << " init_font_merge_probability_threshold = " << *strategy.InitFontMergeProbabilityThreshold() << std::endl;
21+
}
22+
*os << " use_patch_merges = " << strategy.UsePatchMerges() << std::endl
23+
<< " pre_closure_group_size = " << strategy.PreClosureGroupSize() << std::endl
24+
<< " pre_closure_probability_threshold = " << strategy.PreClosureProbabilityThreshold() << std::endl;
25+
*os << std::endl;
1526

1627
if (strategy.init_font_merge_threshold_.has_value()) {
1728
*os << " init_font_merge_threshold = "

ift/encoder/merge_strategy.h

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,10 +142,23 @@ class MergeStrategy {
142142
double OptimizationCutoffFraction() const {
143143
return optimization_cutoff_fraction_;
144144
}
145+
145146
void SetOptimizationCutoffFraction(double value) {
146147
optimization_cutoff_fraction_ = value;
147148
}
148149

150+
// For best case size reduction computations this sets the assumed smallest
151+
// possible reduction in data (post compression) added to a base patch.
152+
//
153+
// See the comment in segmenter_config.proto for more details.
154+
double BestCaseSizeReductionFraction() const {
155+
return best_case_size_reduction_fraction_;
156+
}
157+
158+
void SetBestCaseSizeReductionFraction(double value) {
159+
best_case_size_reduction_fraction_ = std::max(0.0, std::min(1.0, value));
160+
}
161+
149162
// Configures the threshold (cost delta) for when to merge a segment into
150163
// the init font. If not set then no segments will be merged into the init
151164
// font.
@@ -189,7 +202,12 @@ class MergeStrategy {
189202
patch_size_max_bytes_ == other.patch_size_max_bytes_ &&
190203
optimization_cutoff_fraction_ ==
191204
other.optimization_cutoff_fraction_ &&
192-
init_font_merge_threshold_ == other.init_font_merge_threshold_;
205+
best_case_size_reduction_fraction_ == other.best_case_size_reduction_fraction_ &&
206+
init_font_merge_threshold_ == other.init_font_merge_threshold_ &&
207+
init_font_merge_probability_threshold_ == other.init_font_merge_probability_threshold_ &&
208+
use_patch_merges_ == other.use_patch_merges_ &&
209+
pre_closure_group_size_ == other.pre_closure_group_size_ &&
210+
pre_closure_probability_threshold_ == other.pre_closure_probability_threshold_;
193211
}
194212

195213
private:
@@ -210,12 +228,13 @@ class MergeStrategy {
210228
uint32_t patch_size_min_bytes_;
211229
uint32_t patch_size_max_bytes_;
212230
double optimization_cutoff_fraction_ = 0.001;
231+
double best_case_size_reduction_fraction_ = 0.5;
213232
std::optional<double> init_font_merge_threshold_ = std::nullopt;
214233
std::optional<double> init_font_merge_probability_threshold_ = std::nullopt;
215234
bool use_patch_merges_ = false;
216235

217236
uint32_t pre_closure_group_size_ = 1;
218-
double pre_closure_probability_threshold_ = 0.0;
237+
double pre_closure_probability_threshold_ = 1.0;
219238

220239
std::shared_ptr<freq::ProbabilityCalculator> probability_calculator_;
221240
};

ift/encoder/merger.cc

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include <optional>
44

5+
#include "absl/flags/flag.h"
56
#include "absl/status/status.h"
67
#include "absl/status/statusor.h"
78
#include "common/int_set.h"
@@ -16,8 +17,15 @@ using absl::StatusOr;
1617
using common::GlyphSet;
1718
using common::SegmentSet;
1819

20+
ABSL_FLAG(bool, record_merged_size_reductions, false,
21+
"When enabled the merger will record the percent size reductions of each assessed merge.");
22+
1923
namespace ift::encoder {
2024

25+
bool Merger::ShouldRecordMergedSizeReductions() const {
26+
return absl::GetFlag(FLAGS_record_merged_size_reductions);
27+
}
28+
2129
StatusOr<std::optional<std::pair<segment_index_t, GlyphSet>>>
2230
Merger::TryNextMerge() {
2331
if (strategy_.IsNone()) {
@@ -704,4 +712,18 @@ Status Merger::ApplyInitFontMove(const GlyphSet& glyphs_to_move, double delta) {
704712
return absl::OkStatus();
705713
}
706714

715+
void Merger::LogMergedSizeHistogram() const {
716+
if (!ShouldRecordMergedSizeReductions()) {
717+
return;
718+
}
719+
720+
std::stringstream histogram_string;
721+
histogram_string << "reduction_percent, count" << std::endl;
722+
for (const auto [percent, count] : merged_size_reduction_histogram_) {
723+
histogram_string << percent << ", " << count << std::endl;
724+
}
725+
VLOG(0) << "Merged Size Reduction Histogram for " << strategy_.Name().value_or("unamed") << std::endl
726+
<< histogram_string.str();
727+
}
728+
707729
} // namespace ift::encoder

ift/encoder/merger.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define IFT_ENCODER_MERGER_
33

44
#include <cstdint>
5+
#include <sstream>
56

67
#include "common/int_set.h"
78
#include "ift/encoder/candidate_merge.h"
@@ -85,6 +86,15 @@ class Merger {
8586

8687
uint32_t NumInscopeSegments() const { return inscope_segments_.size(); }
8788

89+
void RecordMergedSizeReduction(double size_reduction) {
90+
int32_t reduction_percent = 100.0 * size_reduction;
91+
merged_size_reduction_histogram_[reduction_percent]++;
92+
}
93+
94+
bool ShouldRecordMergedSizeReductions() const;
95+
96+
void LogMergedSizeHistogram() const;
97+
8898
private:
8999
Merger(SegmentationContext& context, MergeStrategy strategy,
90100
common::SegmentSet inscope_segments,
@@ -180,6 +190,9 @@ class Merger {
180190
// selecting merges. Merging is done via simple selection until minimum group
181191
// sizes are met.
182192
segment_index_t optimization_cutoff_segment_;
193+
194+
// Percent reduction of data beyond the single largest input patch.
195+
absl::btree_map<int32_t, uint32_t> merged_size_reduction_histogram_;
183196
};
184197

185198
} // namespace ift::encoder

util/segmenter_config.proto

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,30 @@ message CostConfiguration {
190190
// Value is from [0, 1].
191191
double initial_font_merge_probability_threshold = 8;
192192

193+
// The merger prunes patch pairs from evaluation by looking at a "best case" cost reduction. During
194+
// this calculation the size of the merged patch is estimated using the size of the individual patches
195+
// being merged. The following formula is used:
196+
//
197+
// merged_size =
198+
// largest_individual_patch_size +
199+
// best_case_size_reduction_fraction * sum of remaining patch sizes
200+
//
201+
// Where the individual patch sizes used are post compression. In effect this sets the lower bound for
202+
// how much merged data can shrink beyond its initial compressed size (for the best case computation).
203+
//
204+
// The value is from [0, 1.0]. Lower values make best case pruning less aggressive, which means slower
205+
// run time but may lead to lower final segmentation costs. Higher values make best case pruning more
206+
// aggressive, which means faster run time but if it's set too high the merger may miss good merges.
207+
//
208+
// The current default value of 0.5 was selected by looking at the distribution of size reductions
209+
// in some example fonts. It was found (using brotli 9) that only 0.2% of pairs have a reduction
210+
// less than 0.5. So 0.5 should typically have minimal impact on the final cost, while offering
211+
// a significant run time speedup.
212+
//
213+
// TODO(garretrieger): evaluate this across a larger sampling of fonts and brotli qualities to find a
214+
// more general default value.
215+
double best_case_size_reduction_fraction = 9 [default = 0.5];
216+
193217
// By default merges under cost strategy are made by joining segments together, if this setting is
194218
// enabled then an alternate merge type, patch merge, will be considered by the merger. In a patch
195219
// merge glyphs from two patches are merged together along with the conditions for those patches.
@@ -209,7 +233,7 @@ message CostConfiguration {
209233
// Work is planned to fix this issue (either in the harfbuzz closure or with a workaround in the segmenter),
210234
// until then it's recommended to not use this except for testing (or if appropriate care has been taken
211235
// in producing the input segments to avoid this issue).
212-
bool experimental_use_patch_merges = 9 [default = false];
236+
bool experimental_use_patch_merges = 10 [default = false];
213237
}
214238

215239
// The merger will choose segments to merge based on a heuristic which primarily utilizes

util/segmenter_config_util.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ StatusOr<MergeStrategy> SegmenterConfigUtil::ProtoToStrategy(
122122
strategy.SetUsePatchMerges(merged.experimental_use_patch_merges());
123123

124124
strategy.SetOptimizationCutoffFraction(merged.optimization_cutoff_fraction());
125+
strategy.SetBestCaseSizeReductionFraction(merged.best_case_size_reduction_fraction());
125126

126127
if (merged.has_initial_font_merge_threshold()) {
127128
strategy.SetInitFontMergeThreshold(merged.initial_font_merge_threshold());

0 commit comments

Comments
 (0)