Skip to content

Commit 7f72efe

Browse files
committed
In closure_glyph_segmenter_util segmentation analysis use the probability based cost function when computing total cost.
1 parent 039603f commit 7f72efe

File tree

4 files changed

+200
-26
lines changed

4 files changed

+200
-26
lines changed

ift/encoder/closure_glyph_segmenter.cc

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,15 @@
2020
#include "common/hb_set_unique_ptr.h"
2121
#include "common/int_set.h"
2222
#include "common/try.h"
23+
#include "common/woff2.h"
2324
#include "ift/encoder/activation_condition.h"
2425
#include "ift/encoder/candidate_merge.h"
2526
#include "ift/encoder/glyph_segmentation.h"
2627
#include "ift/encoder/merge_strategy.h"
28+
#include "ift/encoder/patch_size_cache.h"
2729
#include "ift/encoder/segmentation_context.h"
2830
#include "ift/encoder/subset_definition.h"
31+
#include "ift/freq/probability_calculator.h"
2932
#include "ift/glyph_keyed_diff.h"
3033

3134
using absl::btree_map;
@@ -47,7 +50,9 @@ using common::IntSet;
4750
using common::make_hb_face;
4851
using common::make_hb_set;
4952
using common::SegmentSet;
53+
using common::Woff2;
5054
using ift::GlyphKeyedDiff;
55+
using ift::freq::ProbabilityCalculator;
5156

5257
namespace ift::encoder {
5358

@@ -588,4 +593,75 @@ ClosureGlyphSegmenter::InitializeSegmentationContext(
588593
return context;
589594
}
590595

596+
// Returns the size, in bytes, of the WOFF2 encoding of the subset of
// original_face described by def.
//
// Returns an internal error if the subset input or the subsetted face cannot
// be created, and propagates any error reported by Woff2::EncodeWoff2().
static StatusOr<double> Woff2SizeOf(hb_face_t* original_face,
                                    const SubsetDefinition& def) {
  hb_subset_input_t* subset_input = hb_subset_input_create_or_fail();
  if (subset_input == nullptr) {
    return absl::InternalError("Failed to create subset input.");
  }
  def.ConfigureInput(subset_input, original_face);

  hb_face_t* subset_face = hb_subset_or_fail(original_face, subset_input);
  // The input is released right after the subsetting attempt, regardless of
  // whether it succeeded.
  hb_subset_input_destroy(subset_input);
  if (subset_face == nullptr) {
    return absl::InternalError("Failed to create initial face subset.");
  }

  // FontData is constructed from the face before our handle on it is dropped.
  FontData subset_data(subset_face);
  hb_face_destroy(subset_face);

  FontData encoded = TRY(Woff2::EncodeWoff2(subset_data.str()));
  return static_cast<double>(encoded.size());
}
616+
617+
// Computes cost figures (in bytes) for the given segmentation using the
// supplied probability calculator.
//
// The returned SegmentationCost holds:
//  - total_cost: WOFF2 size of the initial font plus, for each activation
//    condition, P(condition) * (patch size + per-request overhead).
//  - cost_for_non_segmented: WOFF2 size of a single font covering the initial
//    segment unioned with every segment.
//  - ideal_cost: sum over every codepoint in that union of
//    P(codepoint) * (cost_for_non_segmented / number of codepoints).
//
// Any error from subsetting, WOFF2 encoding, probability computation or patch
// sizing is propagated to the caller.
StatusOr<SegmentationCost> ClosureGlyphSegmenter::TotalCost(
    hb_face_t* original_face, const GlyphSegmentation& segmentation,
    const ProbabilityCalculator& probability_calculator) const {
  // Fixed per-patch cost (bytes) added on top of each patch's size. The same
  // value is the default of the segmenter util's network_overhead_cost flag.
  constexpr double kNetworkOverheadCost = 75.0;
  // Use highest quality so we get the true cost.
  constexpr int kPatchQuality = 11;

  SubsetDefinition non_ift;
  non_ift.Union(segmentation.InitialFontSegment());

  std::vector<Segment> segments;
  for (const auto& def : segmentation.Segments()) {
    non_ift.Union(def);

    auto P = probability_calculator.ComputeProbability(def);
    segments.emplace_back(def, P);
  }

  double init_font_size =
      TRY(Woff2SizeOf(original_face, segmentation.InitialFontSegment()));
  double non_ift_font_size = TRY(Woff2SizeOf(original_face, non_ift));

  // TODO(garretrieger): for the total cost we need to also add in the table
  //                     keyed patch costs may want to use the IFT compiler to
  //                     produce the complete encoding then compute table keyed
  //                     costs from that (in conjunction) with probability
  //                     calculations.
  double total_cost = init_font_size;

  PatchSizeCacheImpl patch_sizer(original_face, kPatchQuality);
  for (const auto& c : segmentation.Conditions()) {
    double Pc = TRY(c.Probability(segments, probability_calculator));
    const GlyphSet& gids = segmentation.GidSegments().at(c.activated());
    double patch_size = static_cast<double>(TRY(patch_sizer.GetPatchSize(gids)));
    total_cost += Pc * (patch_size + kNetworkOverheadCost);
  }

  // With no codepoints there is nothing to distribute the font size over, so
  // skip the per-codepoint division entirely (avoids dividing by zero) and
  // leave the ideal cost at zero.
  double ideal_cost = 0.0;
  const auto num_codepoints = non_ift.codepoints.size();
  if (num_codepoints > 0) {
    const double incremental_size =
        non_ift_font_size / static_cast<double>(num_codepoints);
    for (unsigned cp : non_ift.codepoints) {
      double Pcp = probability_calculator.ComputeProbability({cp}).Min();
      ideal_cost += Pcp * incremental_size;
    }
  }

  return SegmentationCost{
      .total_cost = total_cost,
      .cost_for_non_segmented = non_ift_font_size,
      .ideal_cost = ideal_cost,
  };
}
666+
591667
} // namespace ift::encoder

ift/encoder/closure_glyph_segmenter.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,16 @@
99
#include "ift/encoder/merge_strategy.h"
1010
#include "ift/encoder/segmentation_context.h"
1111
#include "ift/encoder/subset_definition.h"
12+
#include "ift/freq/probability_calculator.h"
1213

1314
namespace ift::encoder {
1415

16+
// Cost figures produced by ClosureGlyphSegmenter::TotalCost(). All members
// default to zero so a default constructed value is well defined.
struct SegmentationCost {
  // Cost of the segmentation: size of the initial font plus the probability
  // weighted cost of each patch.
  double total_cost = 0.0;
  // Size of a single font that covers the initial segment and all segments,
  // i.e. the cost when no segmentation is used.
  double cost_for_non_segmented = 0.0;
  // Probability weighted sum over codepoints where each codepoint is charged
  // an equal share of cost_for_non_segmented.
  double ideal_cost = 0.0;
};
21+
1522
/*
1623
* This generates a glyph segmentation of a font which satisfies the closure
1724
* requirement by utilizing a font subsetter closure function to detect glyph
@@ -48,6 +55,14 @@ class ClosureGlyphSegmenter {
4855
absl::StatusOr<SegmentationContext> InitializeSegmentationContext(
4956
hb_face_t* face, SubsetDefinition initial_segment,
5057
std::vector<Segment> segments, MergeStrategy merge_strategy) const;
58+
59+
/*
60+
* Computes the total cost (expected number of bytes transferred) for a given
61+
* segmentation with respect to the provided frequency data.
62+
*/
63+
absl::StatusOr<SegmentationCost> TotalCost(
64+
hb_face_t* original_face, const GlyphSegmentation& segmentation,
65+
const freq::ProbabilityCalculator& probability_calculator) const;
5166
};
5267

5368
} // namespace ift::encoder

ift/encoder/closure_glyph_segmenter_test.cc

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,20 @@
44
#include "common/font_helper.h"
55
#include "common/int_set.h"
66
#include "gtest/gtest.h"
7+
#include "ift/encoder/glyph_segmentation.h"
78
#include "ift/encoder/merge_strategy.h"
89
#include "ift/encoder/subset_definition.h"
10+
#include "ift/freq/probability_bound.h"
911
#include "ift/freq/unicode_frequencies.h"
12+
#include "ift/freq/unigram_probability_calculator.h"
1013

1114
using common::CodepointSet;
1215
using common::FontData;
1316
using common::hb_face_unique_ptr;
1417
using common::IntSet;
1518
using common::make_hb_face;
1619
using ift::freq::UnicodeFrequencies;
20+
using ift::freq::UnigramProbabilityCalculator;
1721

1822
namespace ift::encoder {
1923

@@ -725,6 +729,47 @@ if (s2) then p1
725729
)");
726730
}
727731

732+
// Checks TotalCost() on a segmentation with no patches and on one with two
// patches. Every StatusOr is verified before being dereferenced so a failure
// in TotalCost() is reported as a test failure instead of a crash.
TEST_F(ClosureGlyphSegmenterTest, TotalCost) {
  UnicodeFrequencies frequencies{
      {{' ', ' '}, 100}, {{'a', 'a'}, 95}, {{'b', 'b'}, 1},
      {{'c', 'c'}, 1},   {{'d', 'd'}, 50}, {{'e', 'e'}, 25},
  };
  UnigramProbabilityCalculator calculator(std::move(frequencies));

  // Basic no segment case.
  GlyphSegmentation segmentation1({'a', 'b', 'c'}, {}, {});
  auto sc = GlyphSegmentation::GroupsToSegmentation({}, {}, {}, segmentation1);
  ASSERT_TRUE(sc.ok()) << sc;

  ClosureGlyphSegmenter segmenter;
  auto base_cost_or =
      segmenter.TotalCost(roboto.get(), segmentation1, calculator);
  ASSERT_TRUE(base_cost_or.ok()) << base_cost_or.status();
  const SegmentationCost& base_cost = *base_cost_or;
  ASSERT_GT(base_cost.total_cost, 1000);
  // With no patches the total cost is just the initial font, which is also
  // the non segmented font.
  ASSERT_EQ(base_cost.total_cost, base_cost.cost_for_non_segmented);
  ASSERT_LT(base_cost.ideal_cost, base_cost.total_cost);

  // Add some patches
  GlyphSegmentation segmentation2({'a', 'b', 'c'}, {}, {});
  sc = GlyphSegmentation::GroupsToSegmentation(
      {
          {{0}, {100, 101, 102}},
          {{1}, {103, 104, 105}},
      },
      {}, {}, segmentation2);
  ASSERT_TRUE(sc.ok()) << sc;

  std::vector<SubsetDefinition> segments{
      {'d'},
      {'e'},
  };
  segmentation2.CopySegments(segments);

  auto with_patches_cost_or =
      segmenter.TotalCost(roboto.get(), segmentation2, calculator);
  ASSERT_TRUE(with_patches_cost_or.ok()) << with_patches_cost_or.status();
  const SegmentationCost& with_patches_cost = *with_patches_cost_or;
  ASSERT_GT(with_patches_cost.total_cost, base_cost.total_cost + 400);
  ASSERT_LT(with_patches_cost.ideal_cost, with_patches_cost.total_cost);
}
772+
728773
// TODO(garretrieger): add test where or_set glyphs are moved back to unmapped
729774
// due to found "additional conditions".
730775

util/closure_glyph_keyed_segmenter_util.cc

Lines changed: 64 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "ift/encoder/glyph_segmentation.h"
2525
#include "ift/encoder/merge_strategy.h"
2626
#include "ift/encoder/subset_definition.h"
27+
#include "ift/freq/bigram_probability_calculator.h"
2728
#include "ift/freq/unicode_frequencies.h"
2829
#include "ift/proto/patch_encoding.h"
2930
#include "ift/proto/patch_map.h"
@@ -139,8 +140,7 @@ ABSL_FLAG(uint32_t, network_overhead_cost, 75,
139140
"for each network request.");
140141

141142
// TODO(garretrieger): add additional setting for cost base merging that
142-
// configures a minimum
143-
// grouping size (in terms of number of codepoints).
143+
// configures a minimum grouping size (in terms of number of codepoints).
144144

145145
ABSL_FLAG(std::vector<std::string>, optional_feature_tags, {},
146146
"A list of feature tags which can be optionally added to the font "
@@ -168,6 +168,7 @@ using ift::encoder::GlyphSegmentation;
168168
using ift::encoder::MergeStrategy;
169169
using ift::encoder::Segment;
170170
using ift::encoder::SubsetDefinition;
171+
using ift::freq::BigramProbabilityCalculator;
171172
using ift::freq::UnicodeFrequencies;
172173
using ift::proto::PatchEncoding;
173174
using ift::proto::PatchMap;
@@ -468,6 +469,63 @@ StatusOr<UnicodeFrequencies> GetFrequencyData(
468469
return util::LoadFrequenciesFromRiegeli(frequency_data_file.c_str());
469470
}
470471

472+
// Analysis of segmentation that does not utilize codepoint frequencies.
473+
static int NonFrequencyAnalysis(hb_face_t* font,
474+
const GlyphSegmentation& segmentation) {
475+
auto cost = SegmentationSize(font, segmentation);
476+
if (!cost.ok()) {
477+
std::cerr << "Failed to compute segmentation cost: " << cost.status()
478+
<< std::endl;
479+
return -1;
480+
}
481+
auto ideal_cost = IdealSegmentationSize(font, segmentation,
482+
NumExclusivePatches(segmentation));
483+
if (!ideal_cost.ok()) {
484+
std::cerr << "Failed to compute ideal segmentation cost: " << cost.status()
485+
<< std::endl;
486+
return -1;
487+
}
488+
489+
std::cerr << std::endl;
490+
std::cerr << "glyphs_in_fallback = " << segmentation.UnmappedGlyphs().size()
491+
<< std::endl;
492+
std::cerr << "ideal_cost_bytes = " << *ideal_cost << std::endl;
493+
std::cerr << "total_cost_bytes = " << *cost << std::endl;
494+
495+
double over_ideal_percent =
496+
(((double)*cost) / ((double)*ideal_cost) * 100.0) - 100.0;
497+
std::cerr << "%_extra_over_ideal = " << over_ideal_percent << std::endl;
498+
return 0;
499+
}
500+
501+
// Analysis of segmentation that uses codepoint frequency data.
//
// Loads the frequency data named by the frequency_data_file flag, computes
// the segmentation's cost via ClosureGlyphSegmenter::TotalCost() with a
// bigram probability calculator, and prints the resulting figures to stderr.
//
// Returns 0 on success, or -1 if loading the frequencies or computing the
// cost fails (the failing status is printed to stderr).
static int AnalysisWithFrequency(hb_face_t* font,
                                 const GlyphSegmentation& segmentation) {
  auto frequencies =
      GetFrequencyData(absl::GetFlag(FLAGS_frequency_data_file), {});
  if (!frequencies.ok()) {
    std::cerr << "Failed to load codepoint frequencies: "
              << frequencies.status() << std::endl;
    return -1;
  }

  BigramProbabilityCalculator probability_calculator(std::move(*frequencies));
  ClosureGlyphSegmenter glyph_segmenter;

  auto result =
      glyph_segmenter.TotalCost(font, segmentation, probability_calculator);
  if (!result.ok()) {
    std::cerr << "Failed to compute cost of segmentation. " << result.status()
              << std::endl;
    return -1;
  }

  // Costs are reported as whole bytes.
  const auto& figures = *result;
  std::cerr << "non_ift_cost_bytes = "
            << static_cast<uint64_t>(figures.cost_for_non_segmented)
            << std::endl;
  std::cerr << "total_cost_bytes = "
            << static_cast<uint64_t>(figures.total_cost) << std::endl;
  std::cerr << "ideal_cost_bytes = "
            << static_cast<uint64_t>(figures.ideal_cost) << std::endl;

  return 0;
}
528+
471529
int main(int argc, char** argv) {
472530
absl::SetStderrThreshold(absl::LogSeverityAtLeast::kInfo);
473531
auto args = absl::ParseCommandLine(argc, argv);
@@ -583,29 +641,9 @@ int main(int argc, char** argv) {
583641
}
584642

585643
std::cerr << ">> Analysis" << std::endl;
586-
auto cost = SegmentationSize(font->get(), *result);
587-
if (!cost.ok()) {
588-
std::cerr << "Failed to compute segmentation cost: " << cost.status()
589-
<< std::endl;
590-
return -1;
591-
}
592-
auto ideal_cost =
593-
IdealSegmentationSize(font->get(), *result, NumExclusivePatches(*result));
594-
if (!ideal_cost.ok()) {
595-
std::cerr << "Failed to compute ideal segmentation cost: " << cost.status()
596-
<< std::endl;
597-
return -1;
644+
if (FrequenciesAreRequired()) {
645+
return AnalysisWithFrequency(font->get(), *result);
646+
} else {
647+
return NonFrequencyAnalysis(font->get(), *result);
598648
}
599-
600-
std::cerr << std::endl;
601-
std::cerr << "glyphs_in_fallback = " << result->UnmappedGlyphs().size()
602-
<< std::endl;
603-
std::cerr << "ideal_cost_bytes = " << *ideal_cost << std::endl;
604-
std::cerr << "total_cost_bytes = " << *cost << std::endl;
605-
606-
double over_ideal_percent =
607-
(((double)*cost) / ((double)*ideal_cost) * 100.0) - 100.0;
608-
std::cerr << "%_extra_over_ideal = " << over_ideal_percent << std::endl;
609-
610-
return 0;
611649
}

0 commit comments

Comments
 (0)