
Commit 0749895

Merge pull request #192 from t-mialve/t-mialve/fix-tests
Fix instability in long haul tests
2 parents: aac2d50 + 71b5655 · commit 0749895

15 files changed (+42, -31 lines)


deploy/example/echodate/deployment.yaml

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ spec:
     - ReadWriteMany
   resources:
     requests:
-      storage: 1Gi
+      storage: 4Ti
   storageClassName: sc.azurelustre.csi.azure.com
 ---
 apiVersion: apps/v1
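The example claim now asks for 4Ti instead of 1Gi, which puts it inside the 4Ti–48Ti SupportedSizeRange declared in test/external-e2e/testdriver-azurelustre.yaml further down; the PVC examples under docs/examples receive the same 4Ti request. A quick, hedged way to confirm what a claim like this actually binds to (the PVC name is a placeholder, not taken from the manifest):

# Placeholder name; substitute the claim created by the example manifest.
kubectl get pvc <pvc-name> -o jsonpath='{.status.phase} {.status.capacity.storage}'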

docs/examples/pv.yaml

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ spec:
   capacity:
     # This field should be the true size of the Azure Lustre you want
     # to used. So that, k8s can allocate resources better.
-    storage: 4Ti
+    storage: 48Ti
   csi:
     driver: azurelustre.csi.azure.com
     volumeAttributes:

docs/examples/pv_subdir.yaml

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ spec:
   capacity:
     # This field should be the true size of the Azure Lustre you want
     # to used. So that, k8s can allocate resources better.
-    storage: 4Ti
+    storage: 48Ti
   csi:
     driver: azurelustre.csi.azure.com
     volumeAttributes:
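Both static PV examples (pv.yaml and pv_subdir.yaml) now advertise 48Ti, matching the raised Max in the testdriver manifest below; as the inline comment says, this field should reflect the true size of the Azure Lustre file system so Kubernetes can account for capacity correctly. A hedged spot-check of what a pre-provisioned PV reports (the PV name is a placeholder):

# Placeholder name; this is the capacity the scheduler and quota logic will see.
kubectl get pv <pv-name> -o jsonpath='{.spec.capacity.storage}'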

docs/examples/pvc_storageclass.yaml

Lines changed: 1 addition & 1 deletion
@@ -10,6 +10,6 @@ spec:
   resources:
     requests:
       # The real storage capacity in the claim
-      storage: 1Gi
+      storage: 4Ti
   # This field must be the same as the storage class name in StorageClass
   storageClassName: sc.azurelustre.csi.azure.com

docs/examples/pvc_storageclass_subdir.yaml

Lines changed: 1 addition & 1 deletion
@@ -10,6 +10,6 @@ spec:
   resources:
     requests:
       # The real storage capacity in the claim
-      storage: 1Gi
+      storage: 4Ti
   # This field must be the same as the storage class name in StorageClass
   storageClassName: subdir.azurelustre.csi.azure.com
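As the inline comment notes, storageClassName in each claim has to match the metadata.name of the corresponding StorageClass. A hedged cross-check before applying the claims (paths assume the repo layout shown above):

# List the StorageClasses in the cluster and compare against the example claims.
kubectl get storageclass -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'
grep storageClassName docs/examples/pvc_storageclass*.yaml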

hack/verify-integration-test-aks.sh

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ function catlog {
 trap catlog ERR EXIT
 
 ./kubectl wait --for=condition=Ready pod/aml-integration-test --timeout=60s
-./kubectl wait --for=condition=Ready=false pod/aml-integration-test --timeout=300s
+./kubectl wait --for=condition=Ready=false pod/aml-integration-test --timeout=600s
 
 exit_code=$(./kubectl get pod aml-integration-test -o=jsonpath='{.status.containerStatuses[*].state.*.exitCode}')
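The integration check waits for the test pod to become Ready, then waits, now for up to 600s instead of 300s, for Ready to flip back to false (which is how a finished container surfaces), and finally reads the container exit code via jsonpath. A minimal sketch of the same wait-then-inspect pattern as a standalone helper; the function name and defaults are assumptions, not part of this script:

# Hypothetical helper mirroring the pattern above; the real script uses ./kubectl
# and a fixed pod name (aml-integration-test).
wait_and_get_exit_code () {
    local pod=$1 timeout=${2:-600s}
    kubectl wait --for=condition=Ready "pod/${pod}" --timeout=60s
    # Ready goes false once the test container exits; the longer timeout absorbs slow runs.
    kubectl wait --for=condition=Ready=false "pod/${pod}" --timeout="${timeout}"
    kubectl get pod "${pod}" -o=jsonpath='{.status.containerStatuses[*].state.*.exitCode}'
}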

test/external-e2e/run.sh

Lines changed: 3 additions & 3 deletions
@@ -60,15 +60,15 @@ echo "deploy test pvc"
 kubectl apply -f ${claim_file}
 echo "wait pvc to Bound status"
 # wait for json is supported in kubectl v1.24
-kubectl wait --for=jsonpath='{.status.phase}'=Bound -f ${claim_file} --timeout=300s
+kubectl wait --for=jsonpath='{.status.phase}'=Bound -f ${claim_file} --timeout=600s
 bounded_pv=$(kubectl get -f ${claim_file} -ojsonpath='{.spec.volumeName}')
 echo "bounded pv is ${bounded_pv}"
 echo "delete pvc"
 kubectl delete -f ${claim_file}
 echo "wait for the pvc to be deleted"
-kubectl wait --for=delete -f ${claim_file} --timeout=300s
+kubectl wait --for=delete -f ${claim_file} --timeout=600s
 echo "wait for pv ${bounded_pv} to be deleted"
-kubectl wait --for=delete pv/${bounded_pv} --timeout=300s
+kubectl wait --for=delete pv/${bounded_pv} --timeout=600s
 
 echo "delete test storageclass"
 kubectl delete -f ${sc_file}
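All three teardown waits move from 300s to 600s: waiting for the claim to reach Bound (via the jsonpath wait available since kubectl v1.24), waiting for the claim to be deleted, and waiting for the released PV to go away. A hedged sketch of the same flow factored into one helper with the longer timeout as a default; the function and variable names are illustrative, not from run.sh:

# Illustrative helper; ${claim_file} mirrors the variable used by the script above.
WAIT_TIMEOUT=${WAIT_TIMEOUT:-600s}

wait_bound_then_cleanup () {
    local claim_file=$1
    kubectl wait --for=jsonpath='{.status.phase}'=Bound -f "${claim_file}" --timeout="${WAIT_TIMEOUT}"
    local pv
    pv=$(kubectl get -f "${claim_file}" -o jsonpath='{.spec.volumeName}')
    kubectl delete -f "${claim_file}"
    kubectl wait --for=delete -f "${claim_file}" --timeout="${WAIT_TIMEOUT}"
    kubectl wait --for=delete "pv/${pv}" --timeout="${WAIT_TIMEOUT}"
}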

test/external-e2e/testdriver-azurelustre.yaml

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ StorageClass:
 DriverInfo:
   Name: azurelustre.csi.azure.com
   SupportedSizeRange:
-    Max: 8Ti
+    Max: 48Ti
     Min: 4Ti
   RequiredAccessModes:
     - ReadWriteMany
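SupportedSizeRange tells the upstream external storage e2e suite which volume sizes it may request when exercising the driver, so raising Max to 48Ti keeps the test driver consistent with the 48Ti used in the PV examples. For reference, a hedged example of how a testdriver manifest like this is commonly handed to the suite; the binary path and ginkgo focus here are assumptions, and the repo's actual invocation lives in test/external-e2e/run.sh:

# Typical upstream-style invocation (assumed, not copied from run.sh).
./e2e.test -ginkgo.v \
    -ginkgo.focus='External.Storage' \
    -storage.testdriver=test/external-e2e/testdriver-azurelustre.yaml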

test/long-haul/cleanup/cleanupjob.yaml

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ spec:
     - ReadWriteMany
   resources:
     requests:
-      storage: 1Gi
+      storage: 48Ti
   storageClassName: azurelustre-longhaulcleanup-sc
 
 ---

test/long-haul/fault-test.sh

Lines changed: 8 additions & 8 deletions
@@ -38,7 +38,7 @@ sleep $SleepInSecs
 verify_sample_workload_by_pod_status workloadPodNameNew workloadNodeNameNew
 if [[ "$workloadPodName" == "$workloadPodNameNew" ]] ; then
     print_logs_error "workload pod $workloadPodName should be killed and new workload should be started"
-    print_debug_on_ERR
+    print_debug
     fast_exit
 fi
 
@@ -92,20 +92,20 @@ print_logs_info "running 'kubectl delete po' by background task"
 sleep $SleepInSecs
 
 podState=$(get_pod_state $workloadPodName $workloadNodeName)
-if [[ -z $podState || "$podState" != "Terminating" ]]; then
-    print_logs_error "Workload pod $workloadPodName should be in Terminating state on node $workloadNodeName, but its actual state is $podState"
-    print_debug_on_ERR
+if [[ "$podState" != "Terminating" && "$podState" != "Error" ]]; then
+    print_logs_error "Workload pod $workloadPodName should be in Error/Terminating state on node $workloadNodeName, but its actual state is $podState"
+    print_debug
     fast_exit
 else
-    print_logs_info "Workload pod $workloadPodName is in Terminating state on node $workloadNodeName"
+    print_logs_info "Workload pod $workloadPodName is in Error state on node $workloadNodeName"
 fi
 
 
 print_logs_title "Verify the new workload pod in Running state on other nodes or ContainerCreating state on the same node"
 verify_sample_workload_by_pod_status workloadPodNameNew workloadNodeNameNew "Running\|ContainerCreating"
 if [[ "$workloadPodName" == "$workloadPodNameNew" ]] ; then
     print_logs_error "New workload pod should be started, but still find old running pod $workloadPodName"
-    print_debug_on_ERR
+    print_debug
     fast_exit
 else
     print_logs_info "new workload pod $workloadPodNameNew started on another node $workloadNodeNameNew"
@@ -119,7 +119,7 @@ sleep $SleepInSecs
 podState=$(get_pod_state $NodePodNameKeyword $workloadNodeName)
 if [[ -z "$podState" || "$podState" != "Running" ]]; then
     print_logs_error "Lustre CSI node pod can't be started on $nodeName, state=$podState"
-    print_debug_on_ERR
+    print_debug
     fast_exit
 else
     print_logs_info "Lustre CSI node pod started on $nodeName again"
@@ -132,7 +132,7 @@ sleep $SleepInSecs
 podState=$(get_pod_state $workloadPodName $workloadNodeName)
 if [[ ! -z $podState ]]; then
     print_logs_error "Still can find workload pod $workloadPodName in $podState state on node $workloadNodeName, it should be deleted successfully"
-    print_debug_on_ERR
+    print_debug
     fast_exit
 else
     print_logs_info "workload pod $workloadPodName has been deleted successfully from node $workloadNodeName"
