
Commit a22cf24

hhzhang16 and tedzhouhk authored

fix: bug fixes for planner tests (#3821) (#3835)

Signed-off-by: Hannah Zhang <[email protected]>
Signed-off-by: hongkuanz <[email protected]>
Co-authored-by: hongkuanz <[email protected]>

1 parent 44ee8ac commit a22cf24

File tree

5 files changed: +72 −121 lines changed

components/src/dynamo/planner/utils/prometheus.py

Lines changed: 18 additions & 2 deletions

```diff
@@ -63,6 +63,11 @@ def _get_average_metric(
         Average metric value or 0 if no data/error
         """
         try:
+            # Prepend the frontend metric prefix if not already present
+            if not full_metric_name.startswith(prometheus_names.name_prefix.FRONTEND):
+                full_metric_name = (
+                    f"{prometheus_names.name_prefix.FRONTEND}_{full_metric_name}"
+                )
             query = f"increase({full_metric_name}_sum[{interval}])/increase({full_metric_name}_count[{interval}])"
             result = self.prom.custom_query(query=query)
             if not result:
@@ -75,8 +80,10 @@ def _get_average_metric(
 
             values = []
             for container in metrics_containers:
+                # Frontend lowercases model names for Prometheus labels so we need to do case-insensitive comparison
                 if (
-                    container.metric.model == model_name
+                    container.metric.model
+                    and container.metric.model.lower() == model_name.lower()
                     and container.metric.dynamo_namespace == self.dynamo_namespace
                 ):
                     values.append(container.value[1])
@@ -120,14 +127,23 @@ def get_avg_request_count(self, interval: str, model_name: str):
         # This function follows a different query pattern than the other metrics
         try:
             requests_total_metric = prometheus_names.frontend_service.REQUESTS_TOTAL
+            # Prepend the frontend metric prefix if not already present
+            if not requests_total_metric.startswith(
+                prometheus_names.name_prefix.FRONTEND
+            ):
+                requests_total_metric = (
+                    f"{prometheus_names.name_prefix.FRONTEND}_{requests_total_metric}"
+                )
             raw_res = self.prom.custom_query(
                 query=f"increase({requests_total_metric}[{interval}])"
             )
             metrics_containers = parse_frontend_metric_containers(raw_res)
             total_count = 0.0
             for container in metrics_containers:
+                # Frontend lowercases model names for Prometheus labels so we need to do case-insensitive comparison
                 if (
-                    container.metric.model == model_name
+                    container.metric.model
+                    and container.metric.model.lower() == model_name.lower()
                     and container.metric.dynamo_namespace == self.dynamo_namespace
                 ):
                     total_count += container.value[1]
```
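The two fixes in this file, prefix normalization and case-insensitive model matching, can be sketched as standalone helpers (a minimal sketch: `build_avg_query`, `model_matches`, and the literal `dynamo_frontend` prefix are illustrative assumptions, not the module's actual names):

```python
def build_avg_query(metric: str, interval: str, prefix: str = "dynamo_frontend") -> str:
    """Build the average-over-interval PromQL query, prepending the
    frontend prefix only when it is not already present (idempotent)."""
    if not metric.startswith(prefix):
        metric = f"{prefix}_{metric}"
    return f"increase({metric}_sum[{interval}])/increase({metric}_count[{interval}])"


def model_matches(label_value, requested_model: str) -> bool:
    """Case-insensitive model comparison: the frontend lowercases model
    names in Prometheus labels, and the label may be missing entirely."""
    return bool(label_value) and label_value.lower() == requested_model.lower()
```

Making the prefix check idempotent means callers can pass either the short metric name or the fully qualified one without producing a double prefix.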

docs/planner/sla_planner_quickstart.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -38,7 +38,7 @@ flowchart TD
 
 Before deploying the SLA planner, ensure:
 - **Dynamo platform installed** (see [Installation Guide](/docs/kubernetes/installation_guide.md))
-- **[kube-prometheus-stack](/docs/kubernetes/metrics.md) installed and running.** By default, the prometheus server is not deployed in the `monitoring` namespace. If it is deployed to a different namespace, set `dynamo-operator.dynamo.metrics.prometheusEndpoint="http://prometheus-kube-prometheus-prometheus.<namespace>.svc.cluster.local:9090"`.
+- **[kube-prometheus-stack](/docs/kubernetes/metrics.md) installed and running.** By default, the prometheus server is deployed in the `monitoring` namespace. If it is deployed to a different namespace, set `dynamo-operator.dynamo.metrics.prometheusEndpoint="http://prometheus-kube-prometheus-prometheus.<namespace>.svc.cluster.local:9090"`.
 - **Benchmarking resources setup** (see [Kubernetes utilities for Dynamo Benchmarking and Profiling](../../deploy/utils/README.md)) The script will create a `dynamo-pvc` with `ReadWriteMany` access, if your cluster's default storageClassName does not allow `ReadWriteMany`, you need to specify a different storageClassName in `deploy/utils/manifests/pvc.yaml` which does support `ReadWriteMany`.
```
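The endpoint override described in the corrected doc line follows a fixed in-cluster DNS pattern; a minimal sketch of building it (the helper name is hypothetical, and the service name is the kube-prometheus-stack default):

```python
def prometheus_endpoint(namespace: str = "monitoring") -> str:
    """Build the in-cluster Prometheus endpoint URL expected by the
    dynamo-operator.dynamo.metrics.prometheusEndpoint override.
    The service name matches the kube-prometheus-stack default."""
    return (
        "http://prometheus-kube-prometheus-prometheus."
        f"{namespace}.svc.cluster.local:9090"
    )
```

For example, a stack installed in an `observability` namespace (a placeholder name) would need the operator pointed at `prometheus_endpoint("observability")`.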

tests/planner/README.md

Lines changed: 48 additions & 4 deletions

````diff
@@ -160,20 +160,64 @@ PYTHONPATH=../../components/src python -m pytest test_replica_calculation.py -v
 **Note**: The unit tests automatically mock external dependencies (prometheus_client, runtime modules) to ensure they can run in isolation without requiring the full Dynamo environment.
 
 #### Run Full End-to-End Test
-Test complete scaling behavior including Kubernetes deployment and load generation:
+
+Test complete scaling behavior including Kubernetes deployment and load generation.
+
+**Prerequisites:**
+
+- **[kube-prometheus-stack](../../docs/kubernetes/metrics.md) installed and running.** The SLA planner requires Prometheus to observe metrics and make scaling decisions.
+- Ensure the Dynamo operator was installed with the Prometheus endpoint configured (see [SLA Planner Quickstart Guide](../../docs/planner/sla_planner_quickstart.md#prerequisites) for details).
+
+**Prepare the test deployment manifest:**
+
+The test requires modifying `components/backends/vllm/deploy/disagg_planner.yaml` with test-specific planner arguments:
+
+1. Copy the base deployment:
 
 ```bash
-./scaling/run_scaling_test.sh
+cp components/backends/vllm/deploy/disagg_planner.yaml tests/planner/scaling/disagg_planner.yaml
 ```
 
-With custom namespace:
+2. Edit `tests/planner/scaling/disagg_planner.yaml`. Ensure all services use the correct image. Modify the Planner service args:
+
+```yaml
+spec:
+  services:
+    Planner:
+      extraPodSpec:
+        mainContainer:
+          args:
+            - --environment=kubernetes
+            - --backend=vllm
+            - --adjustment-interval=60
+            - --profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
+            - --ttft=100
+            - --itl=10
+            - --load-predictor=constant
+            - --no-correction
+```
+
+3. Update the model in VllmPrefillWorker and VllmDecodeWorker services:
+
+```yaml
+args:
+  - -m
+  - dynamo.vllm
+  - --model
+  - nvidia/Llama-3.1-8B-Instruct-FP8
+  - --migration-limit=3
+  - --max-model-len=8192
+```
+
+**Run the test:**
+
 ```bash
 ./scaling/run_scaling_test.sh --namespace <namespace>
 ```
 
 To save results to `tests/planner/e2e_scaling_results` instead of `/tmp`:
 ```bash
-./scaling/run_scaling_test.sh --save-results
+./scaling/run_scaling_test.sh --namespace <namespace> --save-results
 ```
 
 **E2E Test Deployment Management:**
````

tests/planner/scaling/disagg_planner.yaml

Lines changed: 0 additions & 111 deletions
This file was deleted.

tests/planner/test_scaling_e2e.py

Lines changed: 5 additions & 3 deletions

```diff
@@ -97,13 +97,15 @@ def get_pod_counts(self) -> Optional[PodCounts]:
         for pod in data.get("items", []):
             pod_phase = pod.get("status", {}).get("phase", "")
             pod_labels = pod.get("metadata", {}).get("labels", {})
-            component = pod_labels.get("nvidia.com/dynamo-component", "")
+            sub_component = pod_labels.get(
+                "nvidia.com/dynamo-sub-component-type", ""
+            )
 
             # Only count Running pods
             if pod_phase == "Running":
-                if component == "VllmPrefillWorker":
+                if sub_component == "prefill":
                     prefill_pods += 1
-                elif component == "VllmDecodeWorker":
+                elif sub_component == "decode":
                     decode_pods += 1
             else:
                 continue
```
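The label-based counting this change switches to can be sketched as a standalone helper over the pod-list JSON shape the test reads (a minimal sketch; `count_worker_pods` is a hypothetical name):

```python
def count_worker_pods(pod_list: dict) -> tuple:
    """Count Running prefill/decode pods by the
    nvidia.com/dynamo-sub-component-type label instead of the
    per-service component name, so renamed services still match."""
    prefill_pods = decode_pods = 0
    for pod in pod_list.get("items", []):
        phase = pod.get("status", {}).get("phase", "")
        labels = pod.get("metadata", {}).get("labels", {})
        sub_component = labels.get("nvidia.com/dynamo-sub-component-type", "")
        if phase != "Running":
            continue  # only count Running pods
        if sub_component == "prefill":
            prefill_pods += 1
        elif sub_component == "decode":
            decode_pods += 1
    return prefill_pods, decode_pods
```

Keying on the role-level label rather than the component name (`VllmPrefillWorker`/`VllmDecodeWorker`) is what makes the count robust to service renames in the deployment manifest.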
