-
Notifications
You must be signed in to change notification settings - Fork 261
Datapath tests for Long running clusters. #4142
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 31 commits
896ff1f
873c05e
3395415
b34b332
e9f50e6
8364bf5
1d2ed59
efbfb02
04a22a0
56fbeb2
4d29aec
a1baf08
0945c2c
7df1c79
b37b033
2672caa
9d27d43
95ff010
c1bd2e6
7bdf1b0
a27aa52
4f32773
feb46e4
3b9bc5c
de09b98
d3c4686
4adcb1a
e08fa01
066ba2c
6688685
cf7173b
e51230d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,42 +1,47 @@ | ||
| trigger: none | ||
| pr: none | ||
|
|
||
| # Schedule: Run every 3 hours | ||
| schedules: | ||
| - cron: "0 */3 * * *" # Every 3 hours at minute 0 | ||
| displayName: "Run tests every 3 hours" | ||
| branches: | ||
| include: | ||
| - sv2-long-running-pipeline-stage2 | ||
| always: true # Run even if there are no code changes | ||
|
|
||
| parameters: | ||
| - name: subscriptionId | ||
| displayName: "Azure Subscription ID" | ||
| type: string | ||
| default: "37deca37-c375-4a14-b90a-043849bd2bf1" | ||
|
|
||
| - name: serviceConnection | ||
| displayName: "Azure Service Connection" | ||
| type: string | ||
| default: "Azure Container Networking - Standalone Test Service Connection" | ||
|
|
||
| - name: location | ||
| displayName: "Deployment Region" | ||
| type: string | ||
| default: "centraluseuap" | ||
|
|
||
| - name: resourceGroupName | ||
| displayName: "Resource Group Name" | ||
| type: string | ||
| default: "long-run-$(Build.BuildId)" | ||
|
|
||
| - name: vmSkuDefault | ||
| displayName: "VM SKU for Default Node Pool" | ||
| type: string | ||
| default: "Standard_D2s_v3" | ||
|
|
||
| - name: vmSkuHighNIC | ||
| displayName: "VM SKU for High NIC Node Pool" | ||
| type: string | ||
| default: "Standard_D16s_v3" | ||
| - name: runSetupStages | ||
| displayName: "Create New Infrastructure Setup" | ||
| type: boolean | ||
| default: false | ||
|
|
||
| - name: serviceConnection | ||
| displayName: "Azure Service Connection" | ||
| # Setup-only parameters (only used when runSetupStages=true) | ||
| - name: resourceGroupName | ||
| displayName: "Resource Group Name used when Create new Infrastructure Setup is selected" | ||
| type: string | ||
| default: "Azure Container Networking - Standalone Test Service Connection" | ||
| default: "sv2-long-run-$(Build.BuildId)" | ||
|
|
||
| extends: | ||
| template: template/long-running-pipeline-template.yaml | ||
| parameters: | ||
| subscriptionId: ${{ parameters.subscriptionId }} | ||
| location: ${{ parameters.location }} | ||
| resourceGroupName: ${{ parameters.resourceGroupName }} | ||
| vmSkuDefault: ${{ parameters.vmSkuDefault }} | ||
| vmSkuHighNIC: ${{ parameters.vmSkuHighNIC }} | ||
| serviceConnection: ${{ parameters.serviceConnection }} | ||
| runSetupStages: ${{ parameters.runSetupStages }} | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,57 +7,113 @@ RG=$3 | |
| VM_SKU_DEFAULT=$4 | ||
| VM_SKU_HIGHNIC=$5 | ||
|
|
||
| CLUSTER_COUNT=2 | ||
| CLUSTER_PREFIX="aks" | ||
| DEFAULT_NODE_COUNT=1 | ||
| COMMON_TAGS="fastpathenabled=true RGOwner=LongRunningTestPipelines stampcreatorserviceinfo=true" | ||
|
|
||
| wait_for_provisioning() { # Helper for safe retry/wait for provisioning states (basic) | ||
| local rg="$1" clusterName="$2" | ||
| echo "Waiting for AKS '$clusterName' in RG '$rg' to reach Succeeded/Failed (polling)..." | ||
| CLUSTER_COUNT=2 | ||
| CLUSTER_PREFIX="aks" | ||
|
|
||
|
|
||
| stamp_vnet() { | ||
| local vnet_id="$1" | ||
|
|
||
| responseFile="response.txt" | ||
| modified_vnet="${vnet_id//\//%2F}" | ||
| cmd_stamp_curl="'curl -v -X PUT http://localhost:8080/VirtualNetwork/$modified_vnet/stampcreatorservicename'" | ||
| cmd_containerapp_exec="az containerapp exec -n subnetdelegator-westus-u3h4j -g subnetdelegator-westus --subscription 9b8218f9-902a-4d20-a65c-e98acec5362f --command $cmd_stamp_curl" | ||
|
||
|
|
||
| max_retries=10 | ||
| sleep_seconds=15 | ||
| retry_count=0 | ||
|
|
||
| while [[ $retry_count -lt $max_retries ]]; do | ||
sivakami-projects marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| script --quiet -c "$cmd_containerapp_exec" "$responseFile" | ||
| if grep -qF "200 OK" "$responseFile"; then | ||
| echo "Subnet Delegator successfully stamped the vnet" | ||
| return 0 | ||
| else | ||
| echo "Subnet Delegator failed to stamp the vnet, attempt $((retry_count+1))" | ||
| cat "$responseFile" | ||
| retry_count=$((retry_count+1)) | ||
| sleep "$sleep_seconds" | ||
| fi | ||
| done | ||
|
|
||
| echo "Failed to stamp the vnet even after $max_retries attempts" | ||
| exit 1 | ||
| } | ||
|
|
||
| wait_for_provisioning() { | ||
| local rg="$1" clusterName="$2" | ||
| echo "Waiting for AKS '$clusterName' in RG '$rg'..." | ||
| while :; do | ||
| state=$(az aks show --resource-group "$rg" --name "$clusterName" --query provisioningState -o tsv 2>/dev/null || true) | ||
| if [ -z "$state" ]; then | ||
| sleep 3 | ||
| continue | ||
| if [[ "$state" =~ Succeeded ]]; then | ||
| echo "Provisioning state: $state" | ||
| break | ||
| fi | ||
| case "$state" in | ||
| Succeeded|Succeeded*) echo "Provisioning state: $state"; break ;; | ||
| Failed|Canceled|Rejected) echo "Provisioning finished with state: $state"; break ;; | ||
| *) printf "."; sleep 6 ;; | ||
| esac | ||
| if [[ "$state" =~ Failed|Canceled ]]; then | ||
| echo "Provisioning finished with state: $state" | ||
| break | ||
| fi | ||
| sleep 6 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we look at leveraging another option besides sleep? |
||
| done | ||
| } | ||
|
|
||
|
|
||
| ######################################### | ||
| # Main script starts here | ||
| ######################################### | ||
|
|
||
| for i in $(seq 1 "$CLUSTER_COUNT"); do | ||
| echo "==============================" | ||
| echo " Working on cluster set #$i" | ||
| echo "==============================" | ||
|
|
||
| CLUSTER_NAME="${CLUSTER_PREFIX}-${i}" | ||
| echo "Creating AKS cluster '$CLUSTER_NAME' in RG '$RG'" | ||
|
|
||
| make -C ./hack/aks azcfg AZCLI=az REGION=$LOCATION | ||
|
|
||
| make -C ./hack/aks swiftv2-podsubnet-cluster-up \ | ||
| AZCLI=az REGION=$LOCATION \ | ||
| SUB=$SUBSCRIPTION_ID \ | ||
| GROUP=$RG \ | ||
| CLUSTER=$CLUSTER_NAME \ | ||
| NODE_COUNT=$DEFAULT_NODE_COUNT \ | ||
| VM_SIZE=$VM_SKU_DEFAULT \ | ||
|
|
||
| echo " - waiting for AKS provisioning state..." | ||
| wait_for_provisioning "$RG" "$CLUSTER_NAME" | ||
|
|
||
| echo "Adding multi-tenant nodepool ' to '$CLUSTER_NAME'" | ||
| make -C ./hack/aks linux-swiftv2-nodepool-up \ | ||
| AZCLI=az REGION=$LOCATION \ | ||
| GROUP=$RG \ | ||
| VM_SIZE=$VM_SKU_HIGHNIC \ | ||
| CLUSTER=$CLUSTER_NAME \ | ||
| SUB=$SUBSCRIPTION_ID \ | ||
| echo "Creating cluster #$i..." | ||
|
|
||
| CLUSTER_NAME="${CLUSTER_PREFIX}-${i}" | ||
|
|
||
| make -C ./hack/aks azcfg AZCLI=az REGION=$LOCATION | ||
|
|
||
| # Create cluster for persistent infrastructure (the SkipAutoDeleteTill tag is added by the az aks update call that follows) | ||
| make -C ./hack/aks swiftv2-podsubnet-cluster-up \ | ||
| AZCLI=az REGION=$LOCATION \ | ||
| SUB=$SUBSCRIPTION_ID \ | ||
| GROUP=$RG \ | ||
| CLUSTER=$CLUSTER_NAME \ | ||
| VM_SIZE=$VM_SKU_DEFAULT | ||
|
|
||
| # Add SkipAutoDeleteTill tag to cluster (2032-12-31 for long-term persistence) | ||
| az aks update -g "$RG" -n "$CLUSTER_NAME" --tags SkipAutoDeleteTill=2032-12-31 || echo "Warning: Failed to add tag to cluster" | ||
|
|
||
| wait_for_provisioning "$RG" "$CLUSTER_NAME" | ||
|
|
||
| vnet_id=$(az network vnet show -g "$RG" --name "$CLUSTER_NAME" --query id -o tsv) | ||
| echo "Found VNET: $vnet_id" | ||
|
|
||
| # Add SkipAutoDeleteTill tag to AKS VNet | ||
| az network vnet update --ids "$vnet_id" --set tags.SkipAutoDeleteTill=2032-12-31 || echo "Warning: Failed to add tag to vnet" | ||
|
|
||
| stamp_vnet "$vnet_id" | ||
|
|
||
| make -C ./hack/aks linux-swiftv2-nodepool-up \ | ||
| AZCLI=az REGION=$LOCATION \ | ||
| GROUP=$RG \ | ||
| VM_SIZE=$VM_SKU_HIGHNIC \ | ||
| CLUSTER=$CLUSTER_NAME \ | ||
| SUB=$SUBSCRIPTION_ID | ||
|
|
||
| az aks get-credentials -g "$RG" -n "$CLUSTER_NAME" --admin --overwrite-existing \ | ||
| --file "/tmp/${CLUSTER_NAME}.kubeconfig" | ||
|
|
||
| # Label all nodes with workload-type and nic-capacity labels | ||
| echo "==> Labeling all nodes in $CLUSTER_NAME with workload-type=swiftv2-linux" | ||
| kubectl --kubeconfig "/tmp/${CLUSTER_NAME}.kubeconfig" label nodes --all workload-type=swiftv2-linux --overwrite | ||
| echo "[OK] All nodes labeled with workload-type=swiftv2-linux" | ||
|
|
||
| # Label default nodepool (nodepool1) with low-nic capacity | ||
| echo "==> Labeling default nodepool (nodepool1) nodes with nic-capacity=low-nic" | ||
| kubectl --kubeconfig "/tmp/${CLUSTER_NAME}.kubeconfig" label nodes -l agentpool=nodepool1 nic-capacity=low-nic --overwrite | ||
| echo "[OK] Default nodepool nodes labeled with nic-capacity=low-nic" | ||
|
|
||
| # Label nplinux nodepool with high-nic capacity | ||
| echo "==> Labeling nplinux nodepool nodes with nic-capacity=high-nic" | ||
| kubectl --kubeconfig "/tmp/${CLUSTER_NAME}.kubeconfig" label nodes -l agentpool=nplinux nic-capacity=high-nic --overwrite | ||
| echo "[OK] nplinux nodepool nodes labeled with nic-capacity=high-nic" | ||
| done | ||
| echo "All done. Created $CLUSTER_COUNT cluster set(s)." | ||
|
|
||
| echo "All clusters complete." | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the intent to have a separate CI/CD branch for these long running tests?