Commit ae9cfbd

Add accuracy checks to submission checker
1 parent cb7db52 commit ae9cfbd

7 files changed: +258 -19 lines
Lines changed: 131 additions & 4 deletions
@@ -1,10 +1,137 @@
 from .base import BaseCheck
+from ..constants import *
+from ..loader import SubmissionLogs
+from ..configuration.configuration import Config
+import re
+import os
 
 class AccuracyCheck(BaseCheck):
-    def __init__(self, log, path, parsed_log):
+    def __init__(self, log, path, config: Config, submission_logs: SubmissionLogs):
         super().__init__(log, path)
-        self.parsed_log = parsed_log
-        self.checks.append(self.sample_check)
+        self.name = "accuracy checks"
+        self.submission_logs = submission_logs
+        self.mlperf_log = self.submission_logs.accuracy_log
+        self.accuracy_result = self.submission_logs.accuracy_result
+        self.accuracy_json = self.submission_logs.accuracy_json
+        self.config = config
+        self.model = self.submission_logs.loader_data.get("benchmark", "")
+        self.model_mapping = self.submission_logs.loader_data.get("model_mapping", {})
+        self.model = self.config.get_mlperf_model(self.model, self.model_mapping)
+        self.scenario_fixed = self.submission_logs.loader_data.get("scenario", "")
+        self.scenario = self.mlperf_log["effective_scenario"]
+        self.division = self.submission_logs.loader_data.get("division", "")
+        self.setup_checks()
 
-    def sample_check(self):
+    def setup_checks(self):
+        self.checks.append(self.accuracy_result_check)
+        self.checks.append(self.accuracy_json_check)
+        self.checks.append(self.loadgen_errors_check)
+        self.checks.append(self.dataset_check)
+
+    def accuracy_result_check(self):
+        if self.division.lower() == "open":
+            return True
+        patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit = self.config.get_accuracy_values(
+            self.model
+        )
+        acc = None
+        hash_val = None
+        result_acc = {}
+        acc_limit_check = True
+        all_accuracy_valid = True
+        acc_seen = [False for _ in acc_targets]
+        for line in self.accuracy_result:
+            for i, (pattern, acc_target, acc_type) in enumerate(
+                zip(patterns, acc_targets, acc_types)
+            ):
+                m = re.match(pattern, line)
+                if m:
+                    acc = m.group(1)
+                m = re.match(r"^hash=([\w\d]+)$", line)
+                if m:
+                    hash_val = m.group(1)
+                if acc is not None and float(acc) >= acc_target:
+                    all_accuracy_valid &= True
+                    acc_seen[i] = True
+                elif acc is not None:
+                    all_accuracy_valid = False
+                    self.log.warning(
+                        "%s accuracy not met: expected=%f, found=%s",
+                        self.path,
+                        acc_target,
+                        acc,
+                    )
+                if acc:
+                    result_acc[acc_type] = acc
+                acc = None
+
+            if acc_upper_limit is not None:
+                for i, (pattern, acc_limit) in enumerate(
+                        zip(up_patterns, acc_limits)):
+                    m = re.match(pattern, line)
+                    if m:
+                        acc = m.group(1)
+                    m = re.match(r"^hash=([\w\d]+)$", line)
+                    if m:
+                        hash_val = m.group(1)
+                    if acc is not None and float(acc) > acc_limit:
+                        acc_limit_check = False
+                        self.log.warning(
+                            "%s accuracy not met: upper limit=%f, found=%s",
+                            self.path,
+                            acc_limit,
+                            acc,
+                        )
+                    acc = None
+            if all(acc_seen) and hash_val:
+                break
+        is_valid = all_accuracy_valid and all(acc_seen)
+        if acc_upper_limit is not None:
+            is_valid &= acc_limit_check
+        if not hash_val:
+            self.log.error("%s has no hash value for accuracy.txt", self.path)
+            is_valid = False
+
+        return is_valid
+
+    def accuracy_json_check(self):
+        if not os.path.exists(self.accuracy_json):
+            self.log.error("%s is missing", self.accuracy_json)
+            return False
+        if os.stat(self.accuracy_json).st_size > MAX_ACCURACY_LOG_SIZE:
+            self.log.error("%s is not truncated", self.accuracy_json)
+            return False
+        return True
+
+    def loadgen_errors_check(self):
+        if self.mlperf_log.has_error():
+            has_other_errors = False
+            if self.config.ignore_uncommited:
+                for error in self.mlperf_log.get_errors():
+                    if "Loadgen built with uncommitted changes!" not in error["value"]:
+                        has_other_errors = True
+            self.log.error("%s contains errors:", self.path)
+            for error in self.mlperf_log.get_errors():
+                self.log.error("%s", error["value"])
+
+            if not self.config.ignore_uncommited or has_other_errors:
+                self.log.error(
+                    "%s has loadgen errors, number of errors: %s",
+                    self.path, self.mlperf_log.num_errors()
+                )
+                return False
+        return True
+
+    def dataset_check(self):
+        qsl_total_count = self.mlperf_log["qsl_reported_total_count"]
+        expected_qsl_total_count = self.config.get_dataset_size(self.model)
+        if qsl_total_count != expected_qsl_total_count:
+            self.log.error(
+                "%s accuracy run does not cover the full dataset, accuracy samples: %s, dataset size: %s",
+                self.path, qsl_total_count, expected_qsl_total_count
+            )
+            return False
         return True
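
The new class plugs into the existing BaseCheck flow: a driver builds it from a Config and the SubmissionLogs for one result directory, then invokes the instance, which runs every check registered in setup_checks. A minimal sketch of that call pattern (the logger setup, the Config/SubmissionLogs construction, and the result path are illustrative assumptions, not part of this commit):

import logging

log = logging.getLogger("submission-checker")
config = Config(...)                    # assumed: configured for the submission's version
submission_logs = SubmissionLogs(...)   # assumed: parsed logs for one results directory

check = AccuracyCheck(
    log, "closed/acme/results/sys1/resnet/Offline", config, submission_logs)
valid = check()  # BaseCheck.__call__ logs "Starting accuracy checks ..." and runs all registered checks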

tools/submission/submission_checker/checks/base.py

Lines changed: 4 additions & 3 deletions
@@ -10,6 +10,7 @@ def __init__(self, log, path):
         self.checks = []
         self.log = log
         self.path = path
+        self.name = "base checks"
         pass
 
     def run_checks(self):

@@ -32,10 +33,10 @@ def execute(self, check):
 
     def __call__(self):
         """Allows the check instance to be called like a function."""
-        self.log("Starting check...")
+        self.log.info("Starting %s for: %s", self.name, self.path)
         valid = self.run_checks()
         if valid:
-            self.log.info("Checks passed")
+            self.log.info("All %s checks passed for: %s", self.name, self.path)
         else:
-            self.log.error("%s Checks failed", self.path)
+            self.log.error("Some %s checks failed for: %s", self.name, self.path)
         return valid
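
The new `name` attribute defaults to "base checks" and is overridden by each subclass so the start/pass/fail messages identify which suite ran. A toy subclass illustrates the contract (a sketch only, assuming `run_checks` iterates `self.checks` and ANDs their boolean results):

class TruthyCheck(BaseCheck):
    def __init__(self, log, path):
        super().__init__(log, path)
        self.name = "truthy checks"            # appears in the __call__ log lines
        self.checks.append(self.always_pass)   # register a bound check method

    def always_pass(self):
        return True

# TruthyCheck(log, "some/result/dir")() logs "Starting truthy checks for: some/result/dir"
# and returns True.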

tools/submission/submission_checker/checks/performance_check.py

Lines changed: 36 additions & 4 deletions
@@ -2,19 +2,20 @@
 from ..constants import *
 from ..loader import SubmissionLogs
 from ..configuration.configuration import Config
-import os
 
 class PerformanceCheck(BaseCheck):
     def __init__(self, log, path, config: Config, submission_logs: SubmissionLogs):
         super().__init__(log, path)
+        self.name = "performance checks"
         self.submission_logs = submission_logs
         self.mlperf_log = self.submission_logs.performance_log
         self.system_json = self.submission_logs.system_json
         self.config = config
         self.model = self.submission_logs.loader_data.get("benchmark", "")
         self.model_mapping = self.submission_logs.loader_data.get("model_mapping", {})
         self.model = self.config.get_mlperf_model(self.model, self.model_mapping)
-        self.scenario = self.submission_logs.loader_data.get("scenario", "")
+        self.scenario_fixed = self.submission_logs.loader_data.get("scenario", "")
+        self.scenario = self.mlperf_log["effective_scenario"]
         self.division = self.submission_logs.loader_data.get("division", "")
         self.setup_checks()

@@ -28,8 +29,8 @@ def setup_checks(self):
         self.checks.append(self.min_query_count_check)
         self.checks.append(self.min_duration_check)
         self.checks.append(self.network_check)
+        self.checks.append(self.llm_check)
 
-
     def missing_check(self):
         if self.mlperf_log is None:
             self.log.error("Performance log missing at %s", self.path)

@@ -54,7 +55,6 @@ def loadgen_errors_check(self):
             return False
         return True
 
-
     def equal_issue_check(self):
         if self.config.requires_equal_issue(self.model, self.division) and not self.mlperf_log["effective_sample_concatenate_permutation"]:
             self.log.error("%s requires equal issue mode (sample_concatenate_permutation), expected=true, found=false", self.path)

@@ -228,3 +228,35 @@ def network_check(self):
             return False
 
         return True
+
+    def llm_check(self):
+        if self.model in self.config.get_llm_models():
+            if self.mlperf_log["requested_use_token_latencies"]:
+                if self.scenario not in ["Server", "Interactive"]:
+                    # For Offline, SingleStream and MultiStream no further
+                    # checks are necessary
+                    return True
+                else:
+                    limits = LLM_LATENCY_LIMITS[self.model][self.scenario]
+                    if (
+                        self.mlperf_log["result_first_token_99.00_percentile_latency_ns"]
+                        < limits["ttft"]
+                        and self.mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
+                        < limits["tpot"]
+                    ):
+                        return True
+            else:
+                self.log.error(
+                    "use_token_latencies flag needs to be enabled for LLM benchmarks")
+                return False
+
+            self.log.error(
+                "Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f",
+                self.mlperf_log["result_first_token_99.00_percentile_latency_ns"],
+                self.mlperf_log["result_time_per_output_token_99.00_percentile_ns"],
+                limits["ttft"],
+                limits["tpot"]
+            )
+            return False
+        return True
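
llm_check gates Server and Interactive results on two 99th-percentile token-latency metrics: time to first token (TTFT) and time per output token (TPOT). From the way it is indexed, LLM_LATENCY_LIMITS is presumably a nested model -> scenario -> limit mapping shaped like the sketch below; the nanosecond values here are invented for illustration and do not come from this commit:

# Assumed shape, inferred from LLM_LATENCY_LIMITS[model][scenario]["ttft"/"tpot"];
# the numbers are placeholders only.
LLM_LATENCY_LIMITS = {
    "llama2-70b-99": {
        "Server": {"ttft": 2_000_000_000, "tpot": 200_000_000},      # ns
        "Interactive": {"ttft": 450_000_000, "tpot": 40_000_000},    # ns
    },
}
# llm_check passes when both observed 99th-percentile values are below these limits.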

tools/submission/submission_checker/configuration/configuration.py

Lines changed: 47 additions & 1 deletion
@@ -1,4 +1,4 @@
-from ..constants import MODEL_CONFIG
+from ..constants import MODEL_CONFIG, ACC_PATTERN
 
 
 class Config:

@@ -23,10 +23,13 @@ def load_config(self, version):
         # TODO: Load values from
         self.models = self.base["models"]
         self.seeds = self.base["seeds"]
+        if self.base.get("test05_seeds"):
+            self.test05_seeds = self.base["test05_seeds"]
         self.accuracy_target = self.base["accuracy-target"]
         self.accuracy_delta_perc = self.base["accuracy-delta-perc"]
         self.accuracy_upper_limit = self.base.get("accuracy-upper-limit", {})
         self.performance_sample_count = self.base["performance-sample-count"]
+        self.dataset_size = self.base["dataset-size"]
         self.latency_constraint = self.base.get("latency-constraint", {})
         self.min_queries = self.base.get("min-queries", {})
         self.required = None

@@ -97,6 +100,30 @@ def get_accuracy_target(self, model):
 
     def get_accuracy_upper_limit(self, model):
         return self.accuracy_upper_limit.get(model, None)
+
+    def get_accuracy_values(self, model):
+        patterns = []
+        acc_targets = []
+        acc_types = []
+        acc_limits = []
+        up_patterns = []
+
+        target = self.get_accuracy_target(model)
+        acc_upper_limit = self.get_accuracy_upper_limit(model)
+        if acc_upper_limit is not None:
+            for i in range(0, len(acc_upper_limit), 2):
+                acc_type, acc_target = acc_upper_limit[i: i + 2]
+                acc_limits.append(acc_target)
+                up_patterns.append(ACC_PATTERN[acc_type])
+
+        for i in range(0, len(target), 2):
+            acc_type, acc_target = target[i: i + 2]
+            patterns.append(ACC_PATTERN[acc_type])
+            acc_targets.append(acc_target)
+            acc_types.append(acc_type)
+
+        return patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit
 
     def get_performance_sample_count(self, model):
         model = self.get_mlperf_model(model)

@@ -120,6 +147,12 @@ def get_min_query_count(self, model, scenario):
         if model not in self.min_queries:
             raise ValueError("model not known: " + model)
         return self.min_queries[model].get(scenario)
+
+    def get_dataset_size(self, model):
+        model = self.get_mlperf_model(model)
+        if model not in self.dataset_size:
+            raise ValueError("model not known: " + model)
+        return self.dataset_size[model]
 
     def get_delta_perc(self, model, metric):
         if model in self.accuracy_delta_perc:

@@ -154,3 +187,16 @@ def requires_equal_issue(self, model, division):
         ]
         and self.version in ["v4.1"]
     )
+
+    def get_llm_models(self):
+        return [
+            "llama2-70b-99",
+            "llama2-70b-99.9",
+            "llama2-70b-interactive-99",
+            "llama2-70b-interactive-99.9",
+            "mixtral-8x7b",
+            "llama3.1-405b",
+            "llama3.1-8b",
+            "llama3.1-8b-edge",
+            "deepseek-r1",
+        ]
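
get_accuracy_values assumes both accuracy-target and accuracy-upper-limit store flat (metric, value, metric, value, ...) sequences per model, which the pairwise slicing unpacks into parallel lists. A worked example with invented values (the ACC_PATTERN entries and targets below are hypothetical; only the unpacking logic mirrors the method):

# Hypothetical config data, illustrating the flat-pair layout.
ACC_PATTERN = {
    "ROUGE1": r"^.*'rouge1':\s*([\d.]+).*$",
    "ROUGE2": r"^.*'rouge2':\s*([\d.]+).*$",
}
target = ["ROUGE1", 44.43, "ROUGE2", 22.04]

patterns, acc_targets, acc_types = [], [], []
for i in range(0, len(target), 2):
    acc_type, acc_target = target[i: i + 2]   # one (metric, value) pair per step
    patterns.append(ACC_PATTERN[acc_type])
    acc_targets.append(acc_target)
    acc_types.append(acc_type)

# patterns    == [rouge1 regex, rouge2 regex]
# acc_targets == [44.43, 22.04]
# acc_types   == ["ROUGE1", "ROUGE2"]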

tools/submission/submission_checker/constants.py

Lines changed: 15 additions & 1 deletion
@@ -693,7 +693,7 @@
     "rgat": 788379,
     "deepseek-r1": 4388,
     "whisper": 1633,
-    "pointpainting": 24576,
+    "pointpainting": 6636,
 }
 
 SCENARIO_MAPPING = {

@@ -1010,6 +1010,20 @@
     "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_detail.txt",
 }
 
+ACCURACY_RESULT_PATH = {
+    "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/accuracy.txt",
+    "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/accuracy.txt",
+    "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/accuracy.txt",
+    "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/accuracy.txt",
+}
+
+ACCURACY_JSON_PATH = {
+    "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
+    "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
+    "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
+    "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
+}
+
 MEASUREMENTS_PATH = {
     "v5.0": "{division}/{submitter}/measurements/{system}/{benchmark}/{scenario}/{system}.json",
     "v5.1": "{division}/{submitter}/measurements/{system}/{benchmark}/{scenario}/{system}.json",