Commit ae9cfbd

Add accuracy checks to submission checker
1 parent cb7db52 commit ae9cfbd

7 files changed: +258 -19 lines
Lines changed: 131 additions & 4 deletions
@@ -1,10 +1,137 @@
 from .base import BaseCheck
+from ..constants import *
+from ..loader import SubmissionLogs
+from ..configuration.configuration import Config
+import re
+import os
 
 class AccuracyCheck(BaseCheck):
-    def __init__(self, log, path, parsed_log):
+    def __init__(self, log, path, config: Config, submission_logs: SubmissionLogs):
         super().__init__(log, path)
-        self.parsed_log = parsed_log
-        self.checks.append(self.sample_check)
+        self.name = "accuracy checks"
+        self.submission_logs = submission_logs
+        self.mlperf_log = self.submission_logs.accuracy_log
+        self.accuracy_result = self.submission_logs.accuracy_result
+        self.accuracy_json = self.submission_logs.accuracy_json
+        self.config = config
+        self.model = self.submission_logs.loader_data.get("benchmark", "")
+        self.model_mapping = self.submission_logs.loader_data.get("model_mapping", {})
+        self.model = self.config.get_mlperf_model(self.model, self.model_mapping)
+        self.scenario_fixed = self.submission_logs.loader_data.get("scenario", "")
+        self.scenario = self.mlperf_log["effective_scenario"]
+        self.division = self.submission_logs.loader_data.get("division", "")
+        self.setup_checks()
 
-    def sample_check(self):
+    def setup_checks(self):
+        self.checks.append(self.accuracy_result_check)
+        self.checks.append(self.accuracy_json_check)
+        self.checks.append(self.loadgen_errors_check)
+        self.checks.append(self.dataset_check)
+
+    def accuracy_result_check(self):
+        if self.division.lower() == "open":
+            return True
+        patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit = self.config.get_accuracy_values(
+            self.model
+        )
+        acc = None
+        hash_val = None
+        result_acc = {}
+        acc_limit_check = True
+        all_accuracy_valid = True
+        acc_seen = [False for _ in acc_targets]
+        for line in self.accuracy_result:
+            for i, (pattern, acc_target, acc_type) in enumerate(
+                zip(patterns, acc_targets, acc_types)
+            ):
+                m = re.match(pattern, line)
+                if m:
+                    acc = m.group(1)
+                m = re.match(r"^hash=([\w\d]+)$", line)
+                if m:
+                    hash_val = m.group(1)
+                if acc is not None and float(acc) >= acc_target:
+                    all_accuracy_valid &= True
+                    acc_seen[i] = True
+                elif acc is not None:
+                    all_accuracy_valid = False
+                    self.log.warning(
+                        "%s accuracy not met: expected=%f, found=%s",
+                        self.path,
+                        acc_target,
+                        acc,
+                    )
+                if acc:
+                    result_acc[acc_type] = acc
+                acc = None
+
+            if acc_upper_limit is not None:
+                for i, (pattern, acc_limit) in enumerate(
+                        zip(up_patterns, acc_limits)):
+                    m = re.match(pattern, line)
+                    if m:
+                        acc = m.group(1)
+                    m = re.match(r"^hash=([\w\d]+)$", line)
+                    if m:
+                        hash_val = m.group(1)
+                    if acc is not None and float(acc) > acc_limit:
+                        acc_limit_check = False
+                        self.log.warning(
+                            "%s accuracy not met: upper limit=%f, found=%s",
+                            self.path,
+                            acc_limit,
+                            acc,
+                        )
+                    acc = None
+            if all(acc_seen) and hash_val:
+                break
+        is_valid = all_accuracy_valid and all(acc_seen)
+        if acc_upper_limit is not None:
+            is_valid &= acc_limit_check
+        if not hash_val:
+            self.log.error("%s has no hash value for accuracy.txt", self.path)
+            is_valid = False
+
+        return is_valid
+
+    def accuracy_json_check(self):
+        if not os.path.exists(self.accuracy_json):
+            self.log.error("%s is missing", self.accuracy_json)
+            return False
+        if os.stat(self.accuracy_json).st_size > MAX_ACCURACY_LOG_SIZE:
+            self.log.error("%s is not truncated", self.accuracy_json)
+            return False
+        return True
+
+    def loadgen_errors_check(self):
+        if self.mlperf_log.has_error():
+            has_other_errors = False
+            if self.config.ignore_uncommited:
+                for error in self.mlperf_log.get_errors():
+                    if "Loadgen built with uncommitted changes!" not in error["value"]:
+                        has_other_errors = True
+            self.log.error("%s contains errors:", self.path)
+            for error in self.mlperf_log.get_errors():
+                self.log.error("%s", error["value"])
+
+            if not self.config.ignore_uncommited or has_other_errors:
+                self.log.error(
+                    "%s has loadgen errors, number of errors: %s",
+                    self.path, self.mlperf_log.num_errors()
+                )
+                return False
+        return True
+
+    def dataset_check(self):
+        qsl_total_count = self.mlperf_log["qsl_reported_total_count"]
+        expected_qsl_total_count = self.config.get_dataset_size(self.model)
+        if qsl_total_count != expected_qsl_total_count:
+            self.log.error(
+                "%s accuracy run does not cover the full dataset, accuracy samples: %s, dataset size: %s",
+                self.path, qsl_total_count, expected_qsl_total_count
+            )
+            return False
         return True
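
The new class plugs into the existing BaseCheck flow: a driver builds it from a Config and the SubmissionLogs for one result directory, then invokes the instance, which runs every check registered in setup_checks. A minimal sketch of that call pattern (the logger setup, the Config/SubmissionLogs construction, and the result path are illustrative assumptions, not part of this commit):

import logging

log = logging.getLogger("submission-checker")
config = Config(...)                    # assumed: configured for the submission's version
submission_logs = SubmissionLogs(...)   # assumed: parsed logs for one results directory

check = AccuracyCheck(
    log, "closed/acme/results/sys1/resnet/Offline", config, submission_logs)
valid = check()  # BaseCheck.__call__ logs "Starting accuracy checks ..." and runs all registered checks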

tools/submission/submission_checker/checks/base.py

Lines changed: 4 additions & 3 deletions
@@ -10,6 +10,7 @@ def __init__(self, log, path):
         self.checks = []
         self.log = log
         self.path = path
+        self.name = "base checks"
         pass
 
     def run_checks(self):

@@ -32,10 +33,10 @@ def execute(self, check):
 
     def __call__(self):
         """Allows the check instance to be called like a function."""
-        self.log("Starting check...")
+        self.log.info("Starting %s for: %s", self.name, self.path)
         valid = self.run_checks()
         if valid:
-            self.log.info("Checks passed")
+            self.log.info("All %s checks passed for: %s", self.name, self.path)
         else:
-            self.log.error("%s Checks failed", self.path)
+            self.log.error("Some %s checks failed for: %s", self.name, self.path)
         return valid
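
The new `name` attribute defaults to "base checks" and is overridden by each subclass so the start/pass/fail messages identify which suite ran. A toy subclass illustrates the contract (a sketch only, assuming `run_checks` iterates `self.checks` and ANDs their boolean results):

class TruthyCheck(BaseCheck):
    def __init__(self, log, path):
        super().__init__(log, path)
        self.name = "truthy checks"            # appears in the __call__ log lines
        self.checks.append(self.always_pass)   # register a bound check method

    def always_pass(self):
        return True

# TruthyCheck(log, "some/result/dir")() logs "Starting truthy checks for: some/result/dir"
# and returns True.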

tools/submission/submission_checker/checks/performance_check.py

Lines changed: 36 additions & 4 deletions
@@ -2,19 +2,20 @@
 from ..constants import *
 from ..loader import SubmissionLogs
 from ..configuration.configuration import Config
-import os
 
 class PerformanceCheck(BaseCheck):
     def __init__(self, log, path, config: Config, submission_logs: SubmissionLogs):
         super().__init__(log, path)
+        self.name = "performance checks"
         self.submission_logs = submission_logs
         self.mlperf_log = self.submission_logs.performance_log
         self.system_json = self.submission_logs.system_json
         self.config = config
         self.model = self.submission_logs.loader_data.get("benchmark", "")
         self.model_mapping = self.submission_logs.loader_data.get("model_mapping", {})
         self.model = self.config.get_mlperf_model(self.model, self.model_mapping)
-        self.scenario = self.submission_logs.loader_data.get("scenario", "")
+        self.scenario_fixed = self.submission_logs.loader_data.get("scenario", "")
+        self.scenario = self.mlperf_log["effective_scenario"]
         self.division = self.submission_logs.loader_data.get("division", "")
         self.setup_checks()

@@ -28,8 +29,8 @@ def setup_checks(self):
         self.checks.append(self.min_query_count_check)
         self.checks.append(self.min_duration_check)
         self.checks.append(self.network_check)
+        self.checks.append(self.llm_check)
 
-
     def missing_check(self):
         if self.mlperf_log is None:
             self.log.error("Performance log missing at %s", self.path)

@@ -54,7 +55,6 @@ def loadgen_errors_check(self):
             return False
         return True
 
-
     def equal_issue_check(self):
         if self.config.requires_equal_issue(self.model, self.division) and not self.mlperf_log["effective_sample_concatenate_permutation"]:
             self.log.error("%s requires equal issue mode (sample_concatenate_permutation), expected=true, found=false", self.path)

@@ -228,3 +228,35 @@ def network_check(self):
             return False
 
         return True
+
+    def llm_check(self):
+        if self.model in self.config.get_llm_models():
+            if self.mlperf_log["requested_use_token_latencies"]:
+                if self.scenario not in ["Server", "Interactive"]:
+                    # For Offline, SingleStream and MultiStream no further
+                    # checks are necessary
+                    return True
+                else:
+                    limits = LLM_LATENCY_LIMITS[self.model][self.scenario]
+                    if (
+                        self.mlperf_log["result_first_token_99.00_percentile_latency_ns"]
+                        < limits["ttft"]
+                        and self.mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
+                        < limits["tpot"]
+                    ):
+                        return True
+            else:
+                self.log.error(
+                    "use_token_latencies flag needs to be enabled for LLM benchmarks")
+                return False
+
+            self.log.error(
+                "Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f",
+                self.mlperf_log["result_first_token_99.00_percentile_latency_ns"],
+                self.mlperf_log["result_time_per_output_token_99.00_percentile_ns"],
+                limits["ttft"],
+                limits["tpot"]
+            )
+            return False
+        return True
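
llm_check gates Server and Interactive results on two 99th-percentile token-latency metrics: time to first token (TTFT) and time per output token (TPOT). From the way it is indexed, LLM_LATENCY_LIMITS is presumably a nested model -> scenario -> limit mapping shaped like the sketch below; the nanosecond values here are invented for illustration and do not come from this commit:

# Assumed shape, inferred from LLM_LATENCY_LIMITS[model][scenario]["ttft"/"tpot"];
# the numbers are placeholders only.
LLM_LATENCY_LIMITS = {
    "llama2-70b-99": {
        "Server": {"ttft": 2_000_000_000, "tpot": 200_000_000},      # ns
        "Interactive": {"ttft": 450_000_000, "tpot": 40_000_000},    # ns
    },
}
# llm_check passes when both observed 99th-percentile values are below these limits.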

tools/submission/submission_checker/configuration/configuration.py

Lines changed: 47 additions & 1 deletion
@@ -1,4 +1,4 @@
-from ..constants import MODEL_CONFIG
+from ..constants import MODEL_CONFIG, ACC_PATTERN
 
 
 class Config:

@@ -23,10 +23,13 @@ def load_config(self, version):
         # TODO: Load values from
         self.models = self.base["models"]
         self.seeds = self.base["seeds"]
+        if self.base.get("test05_seeds"):
+            self.test05_seeds = self.base["test05_seeds"]
         self.accuracy_target = self.base["accuracy-target"]
         self.accuracy_delta_perc = self.base["accuracy-delta-perc"]
         self.accuracy_upper_limit = self.base.get("accuracy-upper-limit", {})
         self.performance_sample_count = self.base["performance-sample-count"]
+        self.dataset_size = self.base["dataset-size"]
         self.latency_constraint = self.base.get("latency-constraint", {})
         self.min_queries = self.base.get("min-queries", {})
         self.required = None

@@ -97,6 +100,30 @@ def get_accuracy_target(self, model):
 
     def get_accuracy_upper_limit(self, model):
         return self.accuracy_upper_limit.get(model, None)
+
+    def get_accuracy_values(self, model):
+        patterns = []
+        acc_targets = []
+        acc_types = []
+        acc_limits = []
+        up_patterns = []
+
+        target = self.get_accuracy_target(model)
+        acc_upper_limit = self.get_accuracy_upper_limit(model)
+        if acc_upper_limit is not None:
+            for i in range(0, len(acc_upper_limit), 2):
+                acc_type, acc_target = acc_upper_limit[i: i + 2]
+                acc_limits.append(acc_target)
+                up_patterns.append(ACC_PATTERN[acc_type])
+
+        for i in range(0, len(target), 2):
+            acc_type, acc_target = target[i: i + 2]
+            patterns.append(ACC_PATTERN[acc_type])
+            acc_targets.append(acc_target)
+            acc_types.append(acc_type)
+
+        return patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit
 
     def get_performance_sample_count(self, model):
         model = self.get_mlperf_model(model)

@@ -120,6 +147,12 @@ def get_min_query_count(self, model, scenario):
         if model not in self.min_queries:
             raise ValueError("model not known: " + model)
         return self.min_queries[model].get(scenario)
+
+    def get_dataset_size(self, model):
+        model = self.get_mlperf_model(model)
+        if model not in self.dataset_size:
+            raise ValueError("model not known: " + model)
+        return self.dataset_size[model]
 
     def get_delta_perc(self, model, metric):
         if model in self.accuracy_delta_perc:

@@ -154,3 +187,16 @@ def requires_equal_issue(self, model, division):
         ]
         and self.version in ["v4.1"]
     )
+
+    def get_llm_models(self):
+        return [
+            "llama2-70b-99",
+            "llama2-70b-99.9",
+            "llama2-70b-interactive-99",
+            "llama2-70b-interactive-99.9",
+            "mixtral-8x7b",
+            "llama3.1-405b",
+            "llama3.1-8b",
+            "llama3.1-8b-edge",
+            "deepseek-r1",
+        ]
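
get_accuracy_values assumes both accuracy-target and accuracy-upper-limit store flat (metric, value, metric, value, ...) sequences per model, which the pairwise slicing unpacks into parallel lists. A worked example with invented values (the ACC_PATTERN entries and targets below are hypothetical; only the unpacking logic mirrors the method):

# Hypothetical config data, illustrating the flat-pair layout.
ACC_PATTERN = {
    "ROUGE1": r"^.*'rouge1':\s*([\d.]+).*$",
    "ROUGE2": r"^.*'rouge2':\s*([\d.]+).*$",
}
target = ["ROUGE1", 44.43, "ROUGE2", 22.04]

patterns, acc_targets, acc_types = [], [], []
for i in range(0, len(target), 2):
    acc_type, acc_target = target[i: i + 2]   # one (metric, value) pair per step
    patterns.append(ACC_PATTERN[acc_type])
    acc_targets.append(acc_target)
    acc_types.append(acc_type)

# patterns    == [rouge1 regex, rouge2 regex]
# acc_targets == [44.43, 22.04]
# acc_types   == ["ROUGE1", "ROUGE2"]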

tools/submission/submission_checker/constants.py

Lines changed: 15 additions & 1 deletion
@@ -693,7 +693,7 @@
     "rgat": 788379,
     "deepseek-r1": 4388,
     "whisper": 1633,
-    "pointpainting": 24576,
+    "pointpainting": 6636,
 }
 
 SCENARIO_MAPPING = {

@@ -1010,6 +1010,20 @@
     "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_detail.txt",
 }
 
+ACCURACY_RESULT_PATH = {
+    "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/accuracy.txt",
+    "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/accuracy.txt",
+    "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/accuracy.txt",
+    "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/accuracy.txt",
+}
+
+ACCURACY_JSON_PATH = {
+    "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
+    "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
+    "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
+    "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
+}
+
 MEASUREMENTS_PATH = {
     "v5.0": "{division}/{submitter}/measurements/{system}/{benchmark}/{scenario}/{system}.json",
     "v5.1": "{division}/{submitter}/measurements/{system}/{benchmark}/{scenario}/{system}.json",