Skip to content

Commit edcbdae

Browse files
committed
Allow merge groups to be assigned a name for logging.
1 parent 7d2f832 commit edcbdae

File tree

4 files changed

+52
-14
lines changed

4 files changed

+52
-14
lines changed

ift/encoder/closure_glyph_segmenter.cc

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <algorithm>
44
#include <cstdint>
55
#include <optional>
6+
#include <string>
67
#include <vector>
78

89
#include "absl/container/btree_map.h"
@@ -348,7 +349,12 @@ static StatusOr<std::vector<Segment>> ToOrderedSegments(
348349
remapped_full.insert(s_prime);
349350
}
350351

351-
VLOG(0) << " Merge group " << group_index << " has " << remapped.size()
352+
std::string name = std::to_string(group_index);
353+
if (strategy.Name().has_value()) {
354+
name = *strategy.Name();
355+
}
356+
357+
VLOG(0) << " Merge group " << name << " has " << remapped.size()
352358
<< " segments.";
353359
group_index++;
354360

@@ -396,6 +402,7 @@ StatusOr<GlyphSegmentation> ClosureGlyphSegmenter::CodepointToGlyphSegments(
396402
const std::vector<SubsetDefinition>& subset_definitions,
397403
btree_map<SegmentSet, MergeStrategy> merge_groups,
398404
bool place_fallback_in_init) const {
405+
399406
for (const auto& [segments, strategy] : merge_groups) {
400407
if (strategy.UseCosts()) {
401408
TRYV(CheckForDisjointCodepoints(subset_definitions, segments));
@@ -452,8 +459,13 @@ StatusOr<GlyphSegmentation> ClosureGlyphSegmenter::CodepointToGlyphSegments(
452459

453460
// ### Iteratively merge segments and incrementally reprocess affected data.
454461
size_t merger_index = 0;
462+
std::string merger_name = std::to_string(merger_index);
463+
if (mergers[merger_index].Strategy().Name().has_value()) {
464+
merger_name = *mergers[merger_index].Strategy().Name();
465+
}
466+
455467
segment_index_t last_merged_segment_index = 0;
456-
VLOG(0) << "Starting merge selection for merge group " << merger_index
468+
VLOG(0) << "Starting merge selection for merge group " << merger_name
457469
<< std::endl
458470
<< " " << mergers[merger_index].NumInscopeSegments()
459471
<< " inscope segments, " << mergers[merger_index].NumCutoffSegments()
@@ -465,8 +477,13 @@ StatusOr<GlyphSegmentation> ClosureGlyphSegmenter::CodepointToGlyphSegments(
465477

466478
if (!merged.has_value()) {
467479
merger_index++;
480+
468481
if (merger_index < mergers.size()) {
469-
VLOG(0) << "Merge group finished, starting next group " << merger_index
482+
std::string merger_name = std::to_string(merger_index);
483+
if (mergers[merger_index].Strategy().Name().has_value()) {
484+
merger_name = *mergers[merger_index].Strategy().Name();
485+
}
486+
VLOG(0) << "Merge group finished, starting next group " << merger_name
470487
<< std::endl
471488
<< " " << mergers[merger_index].NumInscopeSegments()
472489
<< " inscope segments, "

ift/encoder/merge_strategy.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,18 @@ class MergeStrategy {
114114
bool UseCosts() const { return use_costs_; }
115115
bool UsePatchMerges() const { return use_patch_merges_; }
116116

117+
std::optional<absl::string_view> Name() const {
118+
if (name_.has_value()) {
119+
return name_;
120+
} else {
121+
return std::nullopt;
122+
}
123+
}
124+
125+
void SetName(std::string name) {
126+
name_ = name;
127+
}
128+
117129
uint32_t NetworkOverheadCost() const { return network_overhead_cost_; }
118130
uint32_t MinimumGroupSize() const { return min_group_size_; }
119131
uint32_t PatchSizeMinBytes() const { return patch_size_min_bytes_; }
@@ -191,6 +203,7 @@ class MergeStrategy {
191203
patch_size_max_bytes_(patch_size_max_bytes),
192204
probability_calculator_(nullptr) {}
193205

206+
std::optional<std::string> name_ = std::nullopt;
194207
bool use_costs_;
195208
uint32_t network_overhead_cost_;
196209
uint32_t min_group_size_;

util/segmenter_config.proto

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -110,29 +110,32 @@ message SegmenterConfig {
110110
// is processed independently. Merges will only be made between things within the same group.
111111
// Each merge group has it's own configuration and frequency data.
112112
message MergeGroup {
113+
// Used to identify the group in logging.
114+
string name = 1;
115+
113116
// The set of segments that this merge group covers. May overlap with other merge groups.
114117
// If this is not specified and a heuristic config is used, then it will default to all segments.
115118
// If this is not specified and a cost config is used, then this will default to the set of
116119
// segments the provided frequency data covers.
117-
SegmentsProto segment_ids = 1;
120+
SegmentsProto segment_ids = 2;
118121

119122
// Adds in segments provided in the 'feature_segments' mapping.
120-
SegmentsProto feature_segment_ids = 2;
123+
SegmentsProto feature_segment_ids = 3;
121124

122125
// This is the group size (number of segments merged together) used in preprocess merging
123126
// of any segments covered by this merge group. Setting to 1 disables preprocess merging
124127
// of ungrouped segments.
125-
uint32 preprocess_merging_group_size = 3 [default = 1];
128+
uint32 preprocess_merging_group_size = 4 [default = 1];
126129

127130
// If frequency data is available only segments with probability less than this will
128131
// be included in the preprocess merging phase. Setting this to 1.0 will make preprocess
129132
// merging apply to all segments. Has no effect if this merge group is using heuristic
130133
// merging.
131-
double preprocess_merging_probability_threshold = 4 [default = 1.0];
134+
double preprocess_merging_probability_threshold = 5 [default = 1.0];
132135

133136
oneof config {
134-
HeuristicConfiguration heuristic_config = 5;
135-
CostConfiguration cost_config = 6;
137+
HeuristicConfiguration heuristic_config = 6;
138+
CostConfiguration cost_config = 7;
136139
}
137140
}
138141

util/segmenter_config_util.cc

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -157,9 +157,11 @@ SegmenterConfigUtil::ProtoToMergeGroup(
157157
SegmentId{.feature = true, .id_value = feature_group_id}));
158158
}
159159

160+
MergeStrategy strategy = MergeStrategy::None();
161+
160162
if (group.has_cost_config()) {
161163
CodepointSet covered_codepoints;
162-
MergeStrategy strategy = TRY(
164+
strategy = TRY(
163165
ProtoToStrategy(base_cost, group.cost_config(), covered_codepoints));
164166

165167
if (group.has_segment_ids()) {
@@ -175,8 +177,6 @@ SegmenterConfigUtil::ProtoToMergeGroup(
175177
strategy.SetPreClosureGroupSize(group.preprocess_merging_group_size());
176178
strategy.SetPreClosureProbabilityThreshold(
177179
group.preprocess_merging_probability_threshold());
178-
179-
return std::make_pair(segment_indices, strategy);
180180
} else {
181181
if (group.has_segment_ids()) {
182182
segment_indices.union_set(MapToIndices(group.segment_ids(), id_to_index));
@@ -185,14 +185,18 @@ SegmenterConfigUtil::ProtoToMergeGroup(
185185
segment_indices.insert_range(0, id_to_index.size() - 1);
186186
}
187187

188-
MergeStrategy strategy =
188+
strategy =
189189
::util::ProtoToStrategy(base_heuristic, group.heuristic_config());
190190

191191
strategy.SetPreClosureGroupSize(group.preprocess_merging_group_size());
192192
strategy.SetPreClosureProbabilityThreshold(1.0);
193+
}
193194

194-
return std::make_pair(segment_indices, strategy);
195+
if (group.has_name()) {
196+
strategy.SetName(group.name());
195197
}
198+
199+
return std::make_pair(segment_indices, strategy);
196200
}
197201

198202
StatusOr<btree_map<SegmentSet, MergeStrategy>>
@@ -234,6 +238,7 @@ SegmenterConfigUtil::ConfigToMergeGroups(
234238

235239
MergeStrategy strategy = util::ProtoToStrategy(config.base_heuristic_config(),
236240
config.ungrouped_config());
241+
strategy.SetName("Ungrouped");
237242
strategy.SetPreClosureGroupSize(
238243
config.preprocess_merging_group_size_for_ungrouped());
239244
strategy.SetPreClosureProbabilityThreshold(1.0);

0 commit comments

Comments
 (0)