Skip to content

Commit 1db6eec

Browse files
committed
When check for init font merges, recompute the segmentation on each iteration.
This gives much more accurate answers for cost delta computations since it takes into account changes made to the init font segment in prior iterations.
1 parent 5540151 commit 1db6eec

File tree

6 files changed

+176
-62
lines changed

6 files changed

+176
-62
lines changed

ift/encoder/closure_glyph_segmenter.cc

Lines changed: 35 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -509,54 +509,51 @@ static StatusOr<std::vector<Segment>> ToSegments(
509509
return segments;
510510
}
511511

512-
StatusOr<SegmentationContext> ClosureGlyphSegmenter::MoveSegmentsToInitFont(SegmentationContext& context) const {
512+
Status ClosureGlyphSegmenter::MoveSegmentsToInitFont(SegmentationContext& context) const {
513513
if (!context.GetMergeStrategy().InitFontMergeThreshold().has_value()) {
514514
return absl::FailedPreconditionError("Cannot be called when there is no merge threshold configured.");
515515
}
516516

517517
double threshold = *context.GetMergeStrategy().InitFontMergeThreshold();
518518

519519
VLOG(0) << "Checking if there are any segments which should be moved into the initial font.";
520-
SegmentSet segments_to_move;
521-
for (const auto& [c, glyphs] : context.glyph_groupings.ConditionsAndGlyphs()) {
522-
SegmentSet candidate_segments = c.TriggeringSegments();
523-
if (!candidate_segments.intersects(context.ActiveSegments())) {
524-
// Only do this check for things involving active segments, this let's us skip
525-
// checks for conditions are are extremely unlikely to benefit from merging
526-
// into the init font.
527-
continue;
528-
}
529-
530-
double delta = TRY(CandidateMerge::ComputeCostDelta(context, candidate_segments, std::nullopt, 0));
531-
if (delta >= threshold * (double) candidate_segments.size()) {
532-
// Merging doesn't improve cost, skip.
533-
continue;
534-
}
535-
536-
// TODO(garretrieger): to get a more accurate picture we should consider comparing
537-
// to an updated init subset definition on each iteration.
538-
segments_to_move.union_set(candidate_segments);
539-
VLOG(0) << " Moving segments " << candidate_segments.ToString() << " into the initial font (cost delta = " << delta << ")";
540-
}
541520

542521
SubsetDefinition initial_segment = context.SegmentationInfo().InitFontSegment();
543-
std::vector<Segment> segments;
544-
for (unsigned i = 0; i < context.SegmentationInfo().Segments().size(); i++) {
545-
const Segment& segment = context.SegmentationInfo().Segments()[i];
546-
if (segments_to_move.contains(i)) {
547-
initial_segment.Union(segment.Definition());
548-
} else if (!segment.Definition().Empty()) {
549-
segments.push_back(segment);
522+
bool change_made;
523+
do {
524+
change_made = false;
525+
SegmentSet segments_to_move;
526+
for (const auto& [c, glyphs] : context.glyph_groupings.ConditionsAndGlyphs()) {
527+
SegmentSet candidate_segments = c.TriggeringSegments();
528+
if (!candidate_segments.intersects(context.ActiveSegments())) {
529+
// Only do this check for things involving active segments, this let's us skip
530+
// checks for conditions are are extremely unlikely to benefit from merging
531+
// into the init font.
532+
continue;
533+
}
534+
535+
double delta = TRY(CandidateMerge::ComputeCostDelta(context, candidate_segments, std::nullopt, 0));
536+
if (delta >= threshold * (double) candidate_segments.size()) {
537+
// Merging doesn't improve cost, skip.
538+
continue;
539+
}
540+
541+
// TODO(garretrieger): to get a more accurate picture we should consider comparing
542+
// to an updated init subset definition on each iteration.
543+
segments_to_move.union_set(candidate_segments);
544+
VLOG(0) << " Moving segments " << candidate_segments.ToString() << " into the initial font (cost delta = " << delta << ")";
545+
for (segment_index_t s : segments_to_move) {
546+
initial_segment.Union(context.SegmentationInfo().Segments()[s].Definition());
547+
}
548+
549+
TRYV(context.ReassignInitSubset(initial_segment, segments_to_move));
550+
change_made = true;
551+
break;
550552
}
551-
}
553+
} while (change_made);
552554

553-
// Reset the context using the new initial font and segment definitions.
554-
VLOG(0) << segments_to_move.size() << " segments moved into the initial font. "
555-
<< "Initial font now has " << initial_segment.codepoints.size() << " codepoints. "
556-
<< "Initial segmentation plan will be recomputed." << std::endl;
557-
MergeStrategy strategy = context.GetMergeStrategy();
558-
return TRY(InitializeSegmentationContext(
559-
context.original_face.get(), initial_segment, std::move(segments), std::move(strategy)));
555+
VLOG(0) << "Initial font now has " << initial_segment.codepoints.size() << " codepoints.";
556+
return absl::OkStatus();
560557
}
561558

562559
StatusOr<GlyphSegmentation> ClosureGlyphSegmenter::CodepointToGlyphSegments(
@@ -585,7 +582,7 @@ StatusOr<GlyphSegmentation> ClosureGlyphSegmenter::CodepointToGlyphSegments(
585582
// ### First phase of merging is to check for any patches which should be moved to the initial font
586583
// (eg. cases where the probability of a patch is ~1.0).
587584
if (context.GetMergeStrategy().UseCosts() && context.GetMergeStrategy().InitFontMergeThreshold().has_value()) {
588-
context = TRY(MoveSegmentsToInitFont(context));
585+
TRYV(MoveSegmentsToInitFont(context));
589586
}
590587

591588
// ### Iteratively merge segments and incrementally reprocess affected data.

ift/encoder/closure_glyph_segmenter.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ class ClosureGlyphSegmenter {
6565
const freq::ProbabilityCalculator& probability_calculator) const;
6666

6767
private:
68-
absl::StatusOr<SegmentationContext> MoveSegmentsToInitFont(SegmentationContext& context) const;
68+
absl::Status MoveSegmentsToInitFont(SegmentationContext& context) const;
6969
};
7070

7171
} // namespace ift::encoder

ift/encoder/closure_glyph_segmenter_test.cc

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -806,6 +806,75 @@ if (s0) then p0
806806
)");
807807
}
808808

809+
810+
TEST_F(ClosureGlyphSegmenterTest, InitFontMerging) {
811+
// In this test we enable merging of segments into the init font
812+
UnicodeFrequencies frequencies{
813+
{{'a', 'a'}, 100},
814+
{{'b', 'b'}, 50},
815+
{{'c', 'c'}, 50},
816+
{{'d', 'd'}, 100},
817+
818+
// b and c co-occur
819+
{{'b', 'c'}, 50},
820+
};
821+
822+
MergeStrategy strategy = *MergeStrategy::BigramCostBased(std::move(frequencies));
823+
strategy.SetInitFontMergeThreshold(-75);
824+
825+
auto segmentation = segmenter.CodepointToGlyphSegments(
826+
roboto.get(), {},
827+
{{'a'}, {'d'}, {'b'}, {'c'}},
828+
strategy);
829+
ASSERT_TRUE(segmentation.ok()) << segmentation.status();
830+
831+
// 'a' and 'd' will be moved to the init font, leaving only two segments 'b', 'c'
832+
std::vector<SubsetDefinition> expected_segments = {
833+
{},
834+
{},
835+
{'b', 'c'},
836+
{},
837+
};
838+
ASSERT_EQ(segmentation->Segments(), expected_segments);
839+
840+
ASSERT_EQ(segmentation->ToString(),
841+
R"(initial font: { gid0, gid69, gid72 }
842+
p0: { gid70, gid71 }
843+
if (s2) then p0
844+
)");
845+
}
846+
847+
TEST_F(ClosureGlyphSegmenterTest, InitFontMerging_CommonGlyphs) {
848+
UnicodeFrequencies frequencies{
849+
{{'A', 'A'}, 1},
850+
{{'C', 'C'}, 1},
851+
{{0x106, 0x106}, 100}, // Cacute (contains C glyph)
852+
};
853+
854+
MergeStrategy strategy = *MergeStrategy::CostBased(std::move(frequencies));
855+
strategy.SetInitFontMergeThreshold(-75);
856+
857+
auto segmentation = segmenter.CodepointToGlyphSegments(
858+
roboto.get(), {},
859+
{{0x106}, {'A'}, {'C'}},
860+
strategy);
861+
ASSERT_TRUE(segmentation.ok()) << segmentation.status();
862+
863+
std::vector<SubsetDefinition> expected_segments = {
864+
{},
865+
{'A'},
866+
{'C'},
867+
};
868+
ASSERT_EQ(segmentation->Segments(), expected_segments);
869+
870+
// C get's covered by the Cacute merge into int, so only A is left in the patch.
871+
ASSERT_EQ(segmentation->ToString(),
872+
R"(initial font: { gid0, gid39, gid117, gid700 }
873+
p0: { gid37 }
874+
if (s1) then p0
875+
)");
876+
}
877+
809878
// TODO(garretrieger): add test where or_set glyphs are moved back to unmapped
810879
// due to found "additional conditions".
811880

ift/encoder/requested_segmentation_information.cc

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,24 +10,8 @@ RequestedSegmentationInformation::RequestedSegmentationInformation(
1010
std::vector<Segment> segments, SubsetDefinition init_font_segment,
1111
GlyphClosureCache& closure_cache)
1212
: segments_(std::move(segments)),
13-
init_font_segment_(std::move(init_font_segment)) {
14-
SubsetDefinition all;
15-
all.Union(init_font_segment_);
16-
for (const auto& s : segments_) {
17-
all.Union(s.Definition());
18-
}
19-
20-
{
21-
auto closure = closure_cache.GlyphClosure(init_font_segment_);
22-
if (closure.ok()) {
23-
init_font_glyphs_ = std::move(*closure);
24-
}
25-
}
26-
27-
auto closure = closure_cache.GlyphClosure(all);
28-
if (closure.ok()) {
29-
full_closure_ = std::move(*closure);
30-
}
13+
init_font_segment_() {
14+
ReassignInitSubset(closure_cache, std::move(init_font_segment), {});
3115
}
3216

3317
} // namespace ift::encoder

ift/encoder/requested_segmentation_information.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,32 @@ class RequestedSegmentationInformation {
3333
return base_segment.Definition().codepoints.size();
3434
}
3535

36+
void ReassignInitSubset(GlyphClosureCache& closure_cache, SubsetDefinition new_def, const common::SegmentSet& removed_segments) {
37+
for (segment_index_t s : removed_segments) {
38+
segments_[s].Clear();
39+
}
40+
41+
init_font_segment_ = std::move(new_def);
42+
43+
SubsetDefinition all;
44+
all.Union(init_font_segment_);
45+
for (const auto& s : segments_) {
46+
all.Union(s.Definition());
47+
}
48+
49+
{
50+
auto closure = closure_cache.GlyphClosure(init_font_segment_);
51+
if (closure.ok()) {
52+
init_font_glyphs_ = std::move(*closure);
53+
}
54+
}
55+
56+
auto closure = closure_cache.GlyphClosure(all);
57+
if (closure.ok()) {
58+
full_closure_ = std::move(*closure);
59+
}
60+
}
61+
3662
const SubsetDefinition& InitFontSegment() const { return init_font_segment_; }
3763

3864
// Returns the init font segment with all default always included items

ift/encoder/segmentation_context.h

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,7 @@ class SegmentationContext {
5252
glyph_groupings(segments),
5353
merge_strategy_(std::move(strategy)),
5454
optimization_cutoff_segment_(UINT32_MAX) {
55-
for (unsigned i = 0; i < segments.size(); i++) {
56-
if (!segments[i].Definition().Empty()) {
57-
active_segments_.insert(i);
58-
}
59-
}
55+
ComputeActiveSegments();
6056
}
6157

6258
// Convert the information in this context into a finalized GlyphSegmentation
@@ -141,6 +137,39 @@ class SegmentationContext {
141137
glyph_condition_set.InvalidateGlyphInformation(glyphs, segments);
142138
}
143139

140+
/*
141+
* Invalidates all grouping information and fully reprocesses all segments.
142+
*/
143+
absl::Status ReassignInitSubset(SubsetDefinition new_def, const common::SegmentSet& removed_segments) {
144+
unsigned glyph_count = hb_face_get_glyph_count(original_face.get());
145+
146+
segmentation_info_.ReassignInitSubset(glyph_closure_cache, std::move(new_def), removed_segments);
147+
active_segments_.clear();
148+
ComputeActiveSegments();
149+
150+
// All segments depend on the init subset def, so we must reprocess everything.
151+
// First reset grouping information:
152+
glyph_condition_set = GlyphConditionSet(glyph_count);
153+
glyph_groupings = GlyphGroupings(SegmentationInfo().Segments());
154+
inert_segments_.clear();
155+
156+
// Then reprocess segments:
157+
for (segment_index_t segment_index = 0;
158+
segment_index < SegmentationInfo().Segments().size();
159+
segment_index++) {
160+
TRY(ReprocessSegment(segment_index));
161+
}
162+
163+
common::GlyphSet all_glyphs;
164+
all_glyphs.insert_range(0, glyph_count - 1);
165+
TRYV(GroupGlyphs(all_glyphs));
166+
glyph_closure_cache.LogClosureCount("Segmentation reprocess for init def change.");
167+
168+
TRYV(InitOptimizationCutoff());
169+
170+
return absl::OkStatus();
171+
}
172+
144173
// Performs a closure analysis on codepoints and returns the associated
145174
// and, or, and exclusive glyph sets.
146175
absl::Status AnalyzeSegment(const common::SegmentSet& segment_ids,
@@ -166,6 +195,15 @@ class SegmentationContext {
166195
}
167196

168197
private:
198+
199+
void ComputeActiveSegments() {
200+
for (unsigned i = 0; i < segmentation_info_.Segments().size(); i++) {
201+
if (!segmentation_info_.Segments()[i].Definition().Empty()) {
202+
active_segments_.insert(i);
203+
}
204+
}
205+
}
206+
169207
/*
170208
* Ensures that the produce segmentation is:
171209
* - Disjoint (no duplicated glyphs) and doesn't overlap what's in the initial

0 commit comments

Comments
 (0)