
Commit 91a2e33

Add results to submission checker
1 parent 7d1e0f5 commit 91a2e33

5 files changed: +259 -14 lines changed


tools/submission/submission_checker/checks/accuracy_check.py

Lines changed: 3 additions & 3 deletions
@@ -29,8 +29,6 @@ def setup_checks(self):
         self.checks.append(self.dataset_check)
 
     def accuracy_result_check(self):
-        if self.division.lower() == "open":
-            return True
         patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit = self.config.get_accuracy_values(
             self.model
         )
@@ -95,7 +93,9 @@ def accuracy_result_check(self):
             if not hash_val:
                 self.log.error("%s not hash value for accuracy.txt", self.path)
                 is_valid = False
-
+        self.submission_logs.loader_data["accuracy_metrics"] = result_acc
+        if self.division.lower() == "open":
+            return True
         return is_valid
 
     def accuracy_json_check(self):
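
Note on this change: the open-division early return is moved below the point where the parsed accuracy values are stored in loader_data, so open submissions still carry their metrics into the new CSV exporter even though the accuracy-target comparison is skipped. A minimal, illustrative sketch of the downstream read (only the "accuracy_metrics" key comes from this commit; the helper is hypothetical):

# Illustrative only: how a consumer such as the new ResultExporter can read
# the metrics that accuracy_result_check() now always records.
def read_accuracy(submission_logs):
    # e.g. {"rouge1": 44.43, "rouge2": 22.04} -- hypothetical values
    return submission_logs.loader_data.get("accuracy_metrics", {})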

tools/submission/submission_checker/checks/performance_check.py

Lines changed: 77 additions & 0 deletions
@@ -2,6 +2,7 @@
 from ..constants import *
 from ..loader import SubmissionLogs
 from ..configuration.configuration import Config
+import os
 
 class PerformanceCheck(BaseCheck):
     def __init__(self, log, path, config: Config, submission_logs: SubmissionLogs):
@@ -31,6 +32,7 @@ def setup_checks(self):
         self.checks.append(self.network_check)
         self.checks.append(self.llm_check)
         self.checks.append(self.inferred_check)
+        self.checks.append(self.get_performance_metric_check)
 
     def missing_check(self):
         if self.mlperf_log is None:
@@ -282,3 +284,78 @@ def inferred_check(self):
             self.log.error("Result for scenario %s can not be inferred from %s for: %s", self.scenario_fixed, self.scenario, self.path)
             return False
         return True
+
+    def get_performance_metric_check(self):
+        # Assumes new logging format
+        is_valid = True
+        version = self.config.version
+        if (
+            "result_validity" in self.mlperf_log.get_keys()
+            and self.mlperf_log["result_validity"] == "VALID"
+        ):
+            is_valid = True
+        scenario = self.mlperf_log["effective_scenario"]
+
+        res = float(self.mlperf_log[RESULT_FIELD_NEW[version][scenario]])
+        if (
+            version in RESULT_FIELD_BENCHMARK_OVERWRITE
+            and self.model in RESULT_FIELD_BENCHMARK_OVERWRITE[version]
+            and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][self.model]
+        ):
+            res = float(
+                self.mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[version]
+                                [self.model][scenario]]
+            )
+
+        inferred = False
+        if self.scenario.lower() != self.scenario_fixed.lower() and (self.scenario.lower(), self.scenario_fixed.lower()) != ("server", "interactive"):
+            res, is_valid = self.get_inferred_result(res)
+        self.submission_logs.loader_data["performance_metric"] = res
+        return is_valid
+
+    def get_inferred_result(self, res):
+
+        inferred = False
+        is_valid = True
+        # Check if current scenario (and version) uses early stopping
+        uses_early_stopping = self.config.uses_early_stopping(self.scenario)
+
+        latency_mean = self.mlperf_log["result_mean_latency_ns"]
+        if self.scenario in ["MultiStream"]:
+            latency_99_percentile = self.mlperf_log[
+                "result_99.00_percentile_per_query_latency_ns"
+            ]
+            latency_mean = self.mlperf_log["result_mean_query_latency_ns"]
+        samples_per_query = self.mlperf_log["effective_samples_per_query"]
+        if self.scenario == "SingleStream":
+            # qps_wo_loadgen_overhead is only used for inferring Offline from
+            # SingleStream; only for old submissions
+            qps_wo_loadgen_overhead = self.mlperf_log["result_qps_without_loadgen_overhead"]
+
+        # special case for results inferred from different scenario
+        if self.scenario_fixed in ["Offline"] and self.scenario in ["SingleStream"]:
+            inferred = True
+            res = qps_wo_loadgen_overhead
+
+        if (self.scenario_fixed in ["Offline"]) and self.scenario in ["MultiStream"]:
+            inferred = True
+            res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS)
+
+        if (self.scenario_fixed in ["MultiStream"]) and self.scenario in ["SingleStream"]:
+            inferred = True
+            # samples_per_query does not match with the one reported in the logs
+            # when inferring MultiStream from SingleStream
+            samples_per_query = 8
+            if uses_early_stopping:
+                early_stopping_latency_ms = self.mlperf_log["early_stopping_latency_ms"]
+                if early_stopping_latency_ms == 0:
+                    self.log.error(
+                        "Not enough samples were processed for early stopping to make an estimate"
+                    )
+                    is_valid = False
+                res = (early_stopping_latency_ms * samples_per_query) / MS_TO_NS
+            else:
+                res = (latency_99_percentile * samples_per_query) / MS_TO_NS
+        if (self.scenario_fixed in ["Interactive"]) and self.scenario not in ["Server"]:
+            is_valid = False
+        return res, is_valid
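
For reference, a small worked sketch of the Offline-from-MultiStream inference path added above. The numbers are made up, and the conversion constants are assumed to be S_TO_MS = 1000 and MS_TO_NS = 1000 * 1000 (check constants.py for the actual definitions):

# Hypothetical inputs, assumed constants.
S_TO_MS = 1000
MS_TO_NS = 1000 * 1000

samples_per_query = 8            # effective_samples_per_query from the log
latency_mean = 80_000_000        # result_mean_query_latency_ns: 80 ms per query

# Same formula as get_inferred_result(): queries/s times samples/query gives samples/s.
res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS)
print(res)  # 100.0 samples/s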

tools/submission/submission_checker/constants.py

Lines changed: 63 additions & 2 deletions
@@ -969,7 +969,7 @@
     "sw_notes",
     "host_network_card_count",
     "system_type_detail",
-    "network_speed_mbit",
+    #"network_speed_mbit",
 ]
 
 SYSTEM_DESC_MEANINGFUL_RESPONSE_REQUIRED_FIELDS = [
@@ -1002,7 +1002,7 @@
 ]
 
 SYSTEM_DESC_NUMERIC_RESPONSE_REQUIRED_FIELDS = [
-    "network_speed_mbit"
+    #"network_speed_mbit"
 ]
 
 
@@ -1052,6 +1052,67 @@
     "weight_transformations",
 ]
 
+SPECIAL_UNIT_DICT = {
+    "llama3.1-8b": {
+        "Offline": "Tokens/s",
+        "Server": "Tokens/s",
+    },
+    "llama3.1-8b-edge": {
+        "Offline": "Tokens/s",
+    },
+    "llama2-70b-99": {
+        "Offline": "Tokens/s",
+        "Server": "Tokens/s",
+        "Interactive": "Tokens/s",
+    },
+    "llama2-70b-99.9": {
+        "Offline": "Tokens/s",
+        "Server": "Tokens/s",
+        "Interactive": "Tokens/s",
+    },
+    "mixtral-8x7b": {
+        "Offline": "Tokens/s",
+        "Server": "Tokens/s",
+        "Interactive": "Tokens/s",
+    },
+    "llama3.1-405b": {
+        "Offline": "Tokens/s",
+        "Server": "Tokens/s",
+        "Interactive": "Tokens/s",
+    },
+    "deepseek-r1": {
+        "Offline": "Tokens/s",
+        "Server": "Tokens/s",
+        "Interactive": "Tokens/s",
+    },
+}
+UNIT_DICT = {
+    "SingleStream": "Latency (ms)",
+    "MultiStream": "Latency (ms)",
+    "Offline": "Samples/s",
+    "Server": "Queries/s",
+    "Interactive": "Queries/s",
+
+    "singlestream": "Latency (ms)",
+    "multistream": "Latency (ms)",
+    "offline": "Samples/s",
+    "server": "Queries/s",
+    "interactive": "Queries/s",
+}
+POWER_UNIT_DICT = {
+    "SingleStream": "millijoules",
+    "MultiStream": "millijoules",
+    "Offline": "Watts",
+    "Server": "Watts",
+    "Interactive": "Watts",
+
+    "singlestream": "millijoules",
+    "multistream": "millijoules",
+    "offline": "Watts",
+    "server": "Watts",
+    "interactive": "Watts",
+}
+
 
 PERFORMANCE_LOG_PATH = {
     "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_detail.txt",
tools/submission/submission_checker/main.py

Lines changed: 13 additions & 9 deletions
@@ -11,6 +11,7 @@
 from .checks.measurements_checks import MeasurementsCheck
 from .checks.compliance_check import ComplianceCheck
 from .checks.power_check import PowerCheck
+from .results import ResultExporter
 
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger("main")
@@ -101,6 +102,7 @@ def main():
     scenarios_to_skip = []
 
     loader = Loader(args.input, args.version)
+    exporter = ResultExporter(args.csv, config)
     for logs in loader.load():
         # Initialize check classes
         performance_checks = PerformanceCheck(log, logs.loader_data["perf_path"], config, logs)
@@ -110,15 +112,17 @@ def main():
         measurements_checks = ComplianceCheck(log, logs.loader_data["compliance_path"], config, logs)
         power_checks = PowerCheck(log, logs.loader_data["power_dir_path"], config, logs)
         # Run checks
-        performance_checks()
-        accuracy_checks()
-        system_checks()
-        measurements_checks()
-        power_checks()
-
-    with open(args.csv, "w") as csv:
-        # Output summary
-        pass
+        valid = True
+        valid &= performance_checks()
+        valid &= accuracy_checks()
+        valid &= system_checks()
+        valid &= measurements_checks()
+        valid &= power_checks()
+        # Add results to summary
+        if valid:
+            exporter.add_result(logs)
+    # Export results
+    exporter.export()
 
     # log results
     results = {}
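
The main loop now folds every check group's return value into a single valid flag, so a result row is only added to the summary when all groups pass. This relies on each check object's __call__ returning a boolean; a minimal sketch of that assumed contract (the BaseCheck internals are not part of this commit and are shown only for illustration):

# Assumed shape of BaseCheck.__call__ -- illustrative, not the actual implementation.
class BaseCheckSketch:
    def __init__(self):
        self.checks = []  # populated by setup_checks(), e.g. self.get_performance_metric_check

    def __call__(self) -> bool:
        # AND the individual checks together so main() can do `valid &= check()`
        return all(check() for check in self.checks)
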
tools/submission/submission_checker/results.py

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
+
+from .loader import SubmissionLogs
+from .configuration.configuration import Config
+import os
+import csv
+from .constants import *
+import json
+
+class ResultExporter:
+    def __init__(self, csv_path, config: Config) -> None:
+        self.head = [
+            "Organization",
+            "Availability",
+            "Division",
+            "SystemType",
+            "SystemName",
+            "Platform",
+            "Model",
+            "MlperfModel",
+            "Scenario",
+            "Result",
+            "Accuracy",
+            "number_of_nodes",
+            "host_processor_model_name",
+            "host_processors_per_node",
+            "host_processor_core_count",
+            "accelerator_model_name",
+            "accelerators_per_node",
+            "Location",
+            "framework",
+            "operating_system",
+            "notes",
+            "compliance",
+            "errors",
+            "version",
+            "inferred",
+            "has_power",
+            "Units",
+            "weight_data_types",
+        ]
+        self.rows = []
+        self.csv_path = csv_path
+        self.config = config
+
+    def add_result(self, submission_logs: SubmissionLogs):
+        row = {key: "" for key in self.head}
+        row["Organization"] = submission_logs.loader_data["submitter"]
+        row["Availability"] = submission_logs.system_json["status"]
+        row["Division"] = submission_logs.loader_data["division"]
+        row["SystemType"] = submission_logs.system_json["system_type"]
+        row["SystemName"] = submission_logs.system_json["system_name"]
+        row["Platform"] = submission_logs.loader_data["system"]
+        row["Model"] = submission_logs.loader_data["benchmark"]
+        row["MlperfModel"] = self.config.get_mlperf_model(row["Model"], submission_logs.loader_data.get("model_mapping", {}))
+        row["Scenario"] = submission_logs.loader_data["scenario"]
+        row["Result"] = submission_logs.loader_data["performance_metric"]
+        row["Accuracy"] = json.dumps(submission_logs.loader_data["accuracy_metrics"]).replace(",", " ").replace('"', "").replace("{", "").replace("}", "").strip()
+        row["number_of_nodes"] = submission_logs.system_json["number_of_nodes"]
+        row["host_processor_model_name"] = submission_logs.system_json["host_processor_model_name"]
+        row["host_processors_per_node"] = submission_logs.system_json["host_processors_per_node"]
+        row["host_processor_core_count"] = submission_logs.system_json["host_processor_core_count"]
+        row["accelerator_model_name"] = submission_logs.system_json["accelerator_model_name"]
+        row["accelerators_per_node"] = submission_logs.system_json["accelerators_per_node"]
+        row["Location"] = os.path.dirname(submission_logs.loader_data["perf_path"])
+        row["framework"] = submission_logs.system_json["framework"]
+        row["operating_system"] = submission_logs.system_json["operating_system"]
+        notes = submission_logs.system_json.get("hw_notes", "")
+        if submission_logs.system_json.get("sw_notes"):
+            notes = notes + ". " if notes else ""
+            notes = notes + submission_logs.system_json.get("sw_notes")
+        row["notes"] = notes
+        row["compliance"] = submission_logs.loader_data["division"] # TODO
+        row["errors"] = 0
+        row["version"] = self.config.version
+        row["inferred"] = 1 if row["Scenario"] != submission_logs.performance_log["effective_scenario"] and (submission_logs.performance_log["effective_scenario"], row["Scenario"]) != ("server", "interactive") else 0
+        row["has_power"] = os.path.exists(submission_logs.loader_data["power_dir_path"])
+        unit = SPECIAL_UNIT_DICT.get(
+            row["MlperfModel"], UNIT_DICT).get(
+            row["Scenario"], UNIT_DICT[row["Scenario"]]
+        )
+        row["Units"] = unit
+        row["weight_data_types"] = submission_logs.measurements_json["weight_data_types"]
+        self.rows.append(row.copy())
+        if row["has_power"]:
+            row["Result"] = submission_logs.loader_data["division"] #TODO
+            power_unit = POWER_UNIT_DICT[row["Scenario"]]
+            row["Units"] = power_unit
+            self.rows.append(row.copy())
+
+
+    def export_row(self, row: dict):
+        values = [str(row.get(key, "")) for key in self.head]
+        csv_row = ",".join(values) + "\n"
+        with open(self.csv_path, "+a") as csv:
+            csv.write(csv_row)
+
+
+    def export(self):
+        csv_header = ",".join(self.head) + "\n"
+        with open(self.csv_path, "w") as csv:
+            csv.write(csv_header)
+        for row in self.rows:
+            self.export_row(row)
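
Usage follows main.py above: one ResultExporter per run, add_result() for each submission that passes its checks, and export() once at the end. The Accuracy field is a JSON dump flattened into a comma-free string so it fits in a plain CSV cell; a quick illustration with made-up metric values:

import json

# Made-up metrics; mirrors the string mangling used for row["Accuracy"] in add_result().
metrics = {"rouge1": 44.43, "rouge2": 22.04}
flat = json.dumps(metrics).replace(",", " ").replace('"', "").replace("{", "").replace("}", "").strip()
print(flat)  # rouge1: 44.43  rouge2: 22.04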
