Skip to content

Commit cf8bca3

Browse files
committed
Add 'preprocess merging' configuration to the segmenter config.
When enabled, this merges adjacent segments that fall below a given probability threshold before any other processing (such as closure analysis) occurs. It can be used to speed up the segmenter and merging analysis by pre-merging low-probability segments.
1 parent 2b47beb commit cf8bca3

8 files changed

+253
-45
lines changed

ift/encoder/closure_glyph_segmenter.cc

Lines changed: 70 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "ift/encoder/glyph_segmentation.h"
2525
#include "ift/encoder/merge_strategy.h"
2626
#include "ift/encoder/merger.h"
27+
#include "ift/encoder/segment.h"
2728
#include "ift/encoder/segmentation_context.h"
2829
#include "ift/encoder/subset_definition.h"
2930
#include "ift/encoder/types.h"
@@ -191,6 +192,67 @@ struct SegmentOrdering {
191192
}
192193
};
193194

195+
static std::vector<Segment> PreGroupSegments(
196+
const btree_map<SegmentSet, MergeStrategy>& merge_groups,
197+
const std::vector<SegmentOrdering>& ordering,
198+
const std::vector<SubsetDefinition>& subset_definitions,
199+
std::vector<uint32_t>& segment_index_map
200+
) {
201+
segment_index_map.resize(subset_definitions.size());
202+
std::vector<Segment> segments;
203+
204+
unsigned i = 0;
205+
unsigned last_group_index = 0;
206+
auto merge_group_it = merge_groups.begin();
207+
auto ordering_it = ordering.begin();
208+
209+
while (ordering_it != ordering.end()) {
210+
const auto& o = *ordering_it;
211+
if (o.group_index != last_group_index && merge_group_it != merge_groups.end()) {
212+
merge_group_it++;
213+
}
214+
215+
const MergeStrategy* strategy = nullptr;
216+
if (merge_group_it != merge_groups.end()) {
217+
strategy = &(merge_group_it->second);
218+
}
219+
220+
Segment segment = Segment{subset_definitions[o.original_index], o.probability};
221+
if (strategy == nullptr ||
222+
strategy->PreClosureGroupSize() <= 1 ||
223+
o.probability.Max() > strategy->PreClosureProbabilityThreshold()) {
224+
segment_index_map[o.original_index] = i;
225+
} else {
226+
uint32_t remaining = strategy->PreClosureGroupSize() - 1;
227+
while (remaining > 0) {
228+
ordering_it++;
229+
if (ordering_it == ordering.end() ||
230+
ordering_it->group_index != last_group_index) {
231+
// Not consuming this item.
232+
ordering_it--;
233+
break;
234+
}
235+
236+
segment.Definition().Union(subset_definitions[ordering_it->original_index]);
237+
segment_index_map[ordering_it->original_index] = i;
238+
239+
remaining--;
240+
}
241+
242+
if (strategy->UseCosts()) {
243+
segment.SetProbability(strategy->ProbabilityCalculator()->ComputeProbability(segment.Definition()));
244+
}
245+
}
246+
247+
last_group_index = o.group_index;
248+
segments.push_back(segment);
249+
ordering_it++;
250+
i++;
251+
}
252+
253+
return segments;
254+
}
255+
194256
// Converts the input subset definitions to a sorted list of segments, remaps
195257
// the merge_groups segment set keys to reflect the ordering changes.
196258
static StatusOr<std::vector<Segment>> ToOrderedSegments(
@@ -266,16 +328,12 @@ static StatusOr<std::vector<Segment>> ToOrderedSegments(
266328
std::sort(ordering.begin(), ordering.end());
267329

268330
// maps from index in subset_definitions to the new ordering.
269-
std::vector<uint32_t> segment_index_map(subset_definitions.size());
270-
std::vector<Segment> segments;
271-
unsigned i = 0;
272-
for (const auto& ordering : ordering) {
273-
segments.push_back(Segment{subset_definitions[ordering.original_index],
274-
ordering.probability});
275-
segment_index_map[ordering.original_index] = i++;
276-
}
331+
std::vector<uint32_t> segment_index_map;
332+
std::vector<Segment> segments = PreGroupSegments(merge_groups, ordering, subset_definitions, segment_index_map);
333+
VLOG(0) << segments.size() << " segments after pregrouping.";
277334

278335
btree_map<SegmentSet, MergeStrategy> new_merge_groups;
336+
group_index = 0;
279337
for (auto& [segments, strategy] : merge_groups) {
280338
SegmentSet remapped;
281339
SegmentSet remapped_full;
@@ -287,6 +345,10 @@ static StatusOr<std::vector<Segment>> ToOrderedSegments(
287345
remapped_full.insert(s_prime);
288346
}
289347

348+
349+
VLOG(0) << " Merge group " << group_index << " has " << remapped.size() << " segments.";
350+
group_index++;
351+
290352
if (!new_merge_groups.insert(std::make_pair(remapped, std::move(strategy)))
291353
.second) {
292354
return absl::InvalidArgumentError(

ift/encoder/closure_glyph_segmenter_test.cc

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1427,6 +1427,77 @@ if (s0 AND s2) then p2
14271427
)");
14281428
}
14291429

1430+
1431+
// Exercises pre-closure grouping with two merge groups: a cost based group
// covering all nine input segments (pre-closure threshold 0.55, group size 3)
// and a heuristic group over segments 7 and 8.
TEST_F(ClosureGlyphSegmenterTest, MultipleMergeGroups_PreGrouping) {
  UnicodeFrequencies frequencies{
      {{' ', ' '}, 100},
      {{'d', 'd'}, 100},
      {{'a', 'a'}, 60},
      {{'e', 'e'}, 30},
      {{'b', 'b'}, 29},
      {{'f', 'f'}, 28},
      {{'c', 'c'}, 10},
      {{'g', 'g'}, 9},
      {{'h', 'h'}, 5},
      {{'i', 'i'}, 1},  // 'i' is input segment index 8
  };

  MergeStrategy cost_strategy =
      *MergeStrategy::CostBased(std::move(frequencies), 0, 1);
  cost_strategy.SetPreClosureProbabilityThreshold(0.55);
  cost_strategy.SetPreClosureGroupSize(3);

  btree_map<SegmentSet, MergeStrategy> merge_groups{
      {{0, 1, 2, 3, 4, 5, 6, 7, 8}, cost_strategy},
      {{7, 8}, MergeStrategy::Heuristic(1)},
  };

  // Input segments are listed alphabetically; their indices are what the
  // merge group keys above refer to.
  auto segmentation = segmenter.CodepointToGlyphSegments(
      roboto.get(), {},
      {
          {'a'},
          {'b'},
          {'c'},
          {'d'},
          {'e'},
          {'f'},
          {'g'},
          {'h'},
          {'i'},
      },
      merge_groups, false);
  ASSERT_TRUE(segmentation.ok()) << segmentation.status();

  // The conditional patch for (s2 AND s5) is expected to stay separate.
  // NOTE(review): the original comment attributes this to the f + i merge
  // spanning merge groups - confirm.
  std::vector<SubsetDefinition> expected = {
      // Segments that only belong to the cost based group.
      {'d'},
      {'a'},
      {'e', 'b', 'f'},  // combined by pre-grouping
      {'c', 'g'},       // combined by pre-grouping
      // Segments shared by both merge groups.
      {'h'},
      {'i'},
  };
  ASSERT_EQ(segmentation->Segments(), expected);
  ASSERT_EQ(segmentation->ToString(),
            R"(initial font: { gid0 }
p0: { gid72 }
p1: { gid69 }
p2: { gid70, gid73, gid74 }
p3: { gid71, gid75 }
p4: { gid76 }
p5: { gid77 }
p6: { gid444, gid446 }
if (s0) then p0
if (s1) then p1
if (s2) then p2
if (s3) then p3
if (s4) then p4
if (s5) then p5
if (s2 AND s5) then p6
)");
}
1500+
14301501
// TODO(garretrieger): test that segments are excluded by init font segment. ie.
14311502
// if a segment is present in the init font then it should be cleared out in the
14321503
// segmentation.

ift/encoder/merge_strategy.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,22 @@ class MergeStrategy {
145145
return init_font_merge_probability_threshold_;
146146
}
147147

148+
uint32_t PreClosureGroupSize() const {
149+
return pre_closure_group_size_;
150+
}
151+
152+
double PreClosureProbabilityThreshold() const {
153+
return pre_closure_probability_threshold_;
154+
}
155+
156+
// Sets the pre-closure group size to `value`.
void SetPreClosureGroupSize(uint32_t value) { pre_closure_group_size_ = value; }
159+
160+
// Sets the pre-closure probability threshold to `value`.
void SetPreClosureProbabilityThreshold(double value) {
  pre_closure_probability_threshold_ = value;
}
163+
148164
void SetInitFontMergeThreshold(std::optional<double> value) {
149165
init_font_merge_threshold_ = value;
150166
}
@@ -187,6 +203,9 @@ class MergeStrategy {
187203
std::optional<double> init_font_merge_probability_threshold_ = std::nullopt;
188204
bool use_patch_merges_ = false;
189205

206+
uint32_t pre_closure_group_size_ = 1;
207+
double pre_closure_probability_threshold_ = 0.0;
208+
190209
std::shared_ptr<freq::ProbabilityCalculator> probability_calculator_;
191210
};
192211

util/closure_glyph_keyed_segmenter_util.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ static Status Main(const std::vector<char*> args) {
142142
TRY(config_util.ConfigToMergeGroups(config, font_codepoints, segments));
143143

144144
ClosureGlyphSegmenter segmenter(config.brotli_quality(),
145-
config.brotli_quality_for_init_font_merge());
145+
config.brotli_quality_for_initial_font_merging());
146146
GlyphSegmentation segmentation = TRY(segmenter.CodepointToGlyphSegments(
147147
font.get(), init_segment, segments, merge_groups,
148148
config.move_fallback_glyphs_into_initial_font()));

util/generate_table_keyed_config.cc

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -81,22 +81,6 @@ static StatusOr<SegmentationPlan> LoadSegmentationPlan(const char* path) {
8181
int main(int argc, char** argv) {
8282
auto args = absl::ParseCommandLine(argc, argv);
8383

84-
if (args.size() <= 1) {
85-
std::cerr << "Usage:" << std::endl
86-
<< "generate_table_keyed_config <initial font subset fil> "
87-
"<table keyed subset 1 file> [... <table keyed subset file n>]"
88-
<< std::endl
89-
<< std::endl
90-
<< "Where a subset file lists one codepoint per line in "
91-
"hexadecimal format: 0xXXXX"
92-
<< std::endl
93-
<< std::endl
94-
<< "If you don't want the config to contain an initial codepoint "
95-
"set, pass an empty file as the first argument."
96-
<< std::endl;
97-
return -1;
98-
}
99-
10084
SegmentationPlan config;
10185
CodepointSet init_codepoints;
10286

@@ -115,6 +99,20 @@ int main(int argc, char** argv) {
11599

116100
CodepointSet empty;
117101
sets.push_back(empty);
102+
} else if (args.size() <= 1) {
103+
std::cerr << "Usage:" << std::endl
104+
<< "generate_table_keyed_config <initial font subset file> "
105+
"<table keyed subset 1 file> [... <table keyed subset file n>]"
106+
<< std::endl
107+
<< std::endl
108+
<< "Where a subset file lists one codepoint per line in "
109+
"hexadecimal format: 0xXXXX"
110+
<< std::endl
111+
<< std::endl
112+
<< "If you don't want the config to contain an initial codepoint "
113+
"set, pass an empty file as the first argument."
114+
<< std::endl;
115+
return -1;
118116
}
119117

120118
for (size_t i = 1; i < args.size(); i++) {

0 commit comments

Comments
 (0)