Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit. Hold shift + click to select a range.
7354655
issue-11979 - stream works
hbelmiro Oct 27, 2025
292acf9
issue-11979 - Stream works v1 and v2 (returns an error as original)
hbelmiro Oct 27, 2025
8843a5f
issue-11979 - cleaned unneeded changes
hbelmiro Oct 27, 2025
bc85781
issue-11979 - removed grpc
hbelmiro Oct 27, 2025
461a402
issue-11979 - generated proto files
hbelmiro Oct 27, 2025
ebae22f
issue-11979 - manifests
hbelmiro Oct 27, 2025
4efe209
issue-11979 - migrated to blob
hbelmiro Oct 27, 2025
d4cf242
issue-11979 - cleaning
hbelmiro Oct 27, 2025
6519912
issue-11979 - generated api files
hbelmiro Oct 28, 2025
94e16e1
issue-11979 - fixed tests
hbelmiro Oct 28, 2025
7b4e5d8
issue-11979 - fixed client_manager.go
hbelmiro Oct 28, 2025
35c328d
issue-11979 - removed ReadArtifact
hbelmiro Oct 28, 2025
ab27a52
issue-11979 - updated client_manager for S3 compatibility; improved t…
hbelmiro Oct 28, 2025
f973c37
issue-11979 - fixed formatting
hbelmiro Oct 28, 2025
d517868
issue-11979 - lint
hbelmiro Oct 28, 2025
71bf2f4
issue-11979 - added Kubernetes secrets support for MinIO credentials …
hbelmiro Oct 29, 2025
2d604f8
issue-11979 - added context propagation to blob storage functions in …
hbelmiro Oct 29, 2025
7f8130d
issue-11979 - disabled EC2 metadata service queries to prevent MinIO …
hbelmiro Oct 29, 2025
59abc69
Added enhanced log collection script and workflow steps for improved …
hbelmiro Oct 29, 2025
b2beb4c
Added enhanced log collection script and workflow steps for improved …
hbelmiro Oct 29, 2025
d8f5ff2
Added immediate pipeline pod capture and improved workflow pod log co…
hbelmiro Oct 29, 2025
dde3d40
Added critical KFP infrastructure log collection for enhanced workflo…
hbelmiro Oct 29, 2025
45ff6ba
Expanded pod log collection to include broader patterns, failed/pendi…
hbelmiro Oct 29, 2025
49e5300
Refactored enhanced log collection to use a dedicated script, simplif…
hbelmiro Oct 29, 2025
def578e
Integrated comprehensive log collection into existing scripts and wor…
hbelmiro Oct 29, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions .github/actions/test-and-report/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,18 +94,28 @@ runs:
go run github.com/onsi/ginkgo/v2/ginkgo -r -v --cover -p --keep-going --github-output=true --nodes=${{ inputs.num_parallel_nodes }} -v --label-filter=${{ inputs.test_label }} -- -namespace=${{ inputs.default_namespace }} -multiUserMode=$MULTI_USER -useProxy=$USE_PROXY -userNamespace=${{ inputs.user_namespace }} -uploadPipelinesWithKubernetes=${{ inputs.upload_pipelines_with_kubernetes_client}} -tlsEnabled=$TLS_ENABLED -caCertPath=$CA_CERT_PATH -pullNumber=$PULL_NUMBER -repoName=$REPO_NAME
continue-on-error: true

- name: Collect Pod logs in case of Test Failures
- name: Collect logs in case of Test Failures
id: collect-logs
shell: bash
if: ${{ steps.run-tests.outcome != 'success' }}
run: |
echo "=== Current disk usage ==="
df -h
NAMESPACE=${{ env.NAMESPACE }}
if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
NAMESPACE=${{ inputs.namespace }}
NAMESPACE=${{ inputs.default_namespace }}

# Run enhanced log collection
./.github/resources/scripts/collect-logs.sh \
--ns "$NAMESPACE" \
--output /tmp/enhanced_failure_logs.txt

# Append test results if available
if [ -f "${{ inputs.test_directory }}/reports/junit.xml" ]; then
echo "=== GINKGO TEST RESULTS ===" >> /tmp/enhanced_failure_logs.txt
cat "${{ inputs.test_directory }}/reports/junit.xml" >> /tmp/enhanced_failure_logs.txt 2>/dev/null || true
fi
./.github/resources/scripts/collect-logs.sh --ns $NAMESPACE --output /tmp/tmp_pod_log.txt

# Also create the original output file for backward compatibility
cp /tmp/enhanced_failure_logs.txt /tmp/tmp_pod_log.txt

- name: Publish Test Summary
id: publish
Expand Down
239 changes: 221 additions & 18 deletions .github/resources/scripts/collect-logs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,52 +14,255 @@ while [[ "$#" -gt 0 ]]; do
shift
done

# Ensure the directory for the report file exists before any writes.
mkdir -p "$(dirname "$OUTPUT_FILE")"

# --ns is mandatory; fail fast with a usage hint on stderr.
if [[ -z "$NS" ]]; then
  echo "Namespace (--ns) parameter is required." >&2
  exit 1
fi

# Verify the namespace exists; abort the whole script otherwise.
# Arguments: $1 - namespace name
check_namespace() {
  if ! kubectl get namespace "$1" &>/dev/null; then
    echo "Error: Namespace '$1' does not exist." >&2
    exit 1
  fi
}

# Main log collection function.
# Writes a structured failure report to $OUTPUT_FILE: pod overview,
# Argo workflows, events, per-pod logs, KFP infrastructure logs,
# workflow resources, and user-namespace analysis.
# Arguments: $1 - namespace to collect from
collect_comprehensive_logs() {
  local NAMESPACE=$1

  # '>' truncates any previous report; every later write appends.
  echo "===== ENHANCED LOG COLLECTION REPORT =====" > "$OUTPUT_FILE"
  echo "Collection Time: $(date)" >> "$OUTPUT_FILE"
  echo "Namespace: ${NAMESPACE}" >> "$OUTPUT_FILE"
  echo "" >> "$OUTPUT_FILE"

  # 1. Pod overview with labels
  echo "===== POD OVERVIEW WITH LABELS =====" >> "$OUTPUT_FILE"
  kubectl get pods -n "${NAMESPACE}" -o wide --show-labels >> "$OUTPUT_FILE" 2>&1 || echo "Failed to get pod overview" >> "$OUTPUT_FILE"
  echo "" >> "$OUTPUT_FILE"

  # 2. Argo Workflows
  echo "===== ARGO WORKFLOWS =====" >> "$OUTPUT_FILE"
  kubectl get workflows -n "${NAMESPACE}" -o wide --show-labels >> "$OUTPUT_FILE" 2>&1 || echo "No workflows found" >> "$OUTPUT_FILE"
  echo "" >> "$OUTPUT_FILE"

  # 3. Recent events
  echo "===== RECENT EVENTS =====" >> "$OUTPUT_FILE"
  kubectl get events -n "${NAMESPACE}" --sort-by='.lastTimestamp' >> "$OUTPUT_FILE" 2>&1 || echo "Failed to get events" >> "$OUTPUT_FILE"
  echo "" >> "$OUTPUT_FILE"

  # 4. Detailed pod logs
  collect_pod_logs "${NAMESPACE}"

  # 5. KFP infrastructure logs
  collect_infrastructure_logs "${NAMESPACE}"

  # 6. Workflow-specific resources
  collect_workflow_resources "${NAMESPACE}"

  # 7. Multi-user namespace analysis (auto-detect)
  collect_user_namespace_logs

  echo "Enhanced log collection completed. Output saved to: $OUTPUT_FILE"
}

# Collect description, current logs, and previous logs (if restarted)
# for every pod in the namespace, appending to $OUTPUT_FILE.
# Arguments: $1 - namespace
collect_pod_logs() {
  local NAMESPACE=$1
  local POD_NAMES

  POD_NAMES=$(kubectl get pods -n "${NAMESPACE}" -o custom-columns=":metadata.name" --no-headers)

  if [[ -z "${POD_NAMES}" ]]; then
    echo "No pods found in namespace '${NAMESPACE}'." >> "$OUTPUT_FILE"
    return
  fi

  # Collect detailed information for each pod.
  # POD_NAMES is intentionally unquoted: pod names contain no whitespace.
  for POD_NAME in ${POD_NAMES}; do
    {
      echo "=========================================="
      echo "POD: ${POD_NAME}"
      echo "=========================================="

      echo "----- POD DESCRIPTION -----"
      kubectl describe pod "${POD_NAME}" -n "${NAMESPACE}" || echo "Failed to describe pod ${POD_NAME}"

      echo ""
      echo "----- POD LOGS -----"
      kubectl logs "${POD_NAME}" -n "${NAMESPACE}" --all-containers=true || echo "No logs found for pod ${POD_NAME}"

      # Previous logs exist only if a container restarted.
      echo ""
      echo "----- PREVIOUS LOGS (if restarted) -----"
      kubectl logs "${POD_NAME}" -n "${NAMESPACE}" --all-containers=true --previous || echo "No previous logs for pod ${POD_NAME}"

      echo "=========================================="
      echo ""
    } >> "$OUTPUT_FILE"
  done
}

# Collect logs from critical KFP infrastructure components
# (workflow controller, persistence agent, scheduled workflow, API server).
# Arguments: $1 - namespace
collect_infrastructure_logs() {
  local NAMESPACE=$1

  echo "===== CRITICAL KFP INFRASTRUCTURE LOGS =====" >> "$OUTPUT_FILE"

  # "display-name:label-selector" pairs for each component
  local components=(
    "workflow-controller:app=workflow-controller"
    "persistence-agent:app=ml-pipeline-persistenceagent"
    "scheduled-workflow:app=ml-pipeline-scheduledworkflow"
    "api-server:app=ml-pipeline"
  )

  local component name selector pod
  for component in "${components[@]}"; do
    name="${component%%:*}"
    selector="${component##*:}"

    echo "--- ${name^^} LOGS (ALL LOGS) ---" >> "$OUTPUT_FILE"
    # Declaration split from assignment so the kubectl exit status is not
    # masked by 'local' (ShellCheck SC2155). Only the first matching pod
    # is inspected.
    pod=$(kubectl get pods -n "${NAMESPACE}" -l "${selector}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")

    if [[ -n "$pod" ]]; then
      echo "${name^} Pod: $pod" >> "$OUTPUT_FILE"
      kubectl logs "$pod" -n "${NAMESPACE}" >> "$OUTPUT_FILE" 2>&1 || echo "No logs for $name" >> "$OUTPUT_FILE"

      # Previous logs if restarted
      echo "--- ${name^^} PREVIOUS LOGS (if restarted) ---" >> "$OUTPUT_FILE"
      kubectl logs "$pod" -n "${NAMESPACE}" --previous >> "$OUTPUT_FILE" 2>&1 || echo "No previous logs" >> "$OUTPUT_FILE"
    else
      echo "No $name pod found" >> "$OUTPUT_FILE"
    fi
    echo "" >> "$OUTPUT_FILE"
  done
}

# Dump workflow-related Kubernetes resources (CRs, templates, runs,
# and Argo-related ConfigMaps) into the report file.
# Arguments: $1 - namespace
collect_workflow_resources() {
  local ns=$1

  # Single grouped redirect; each command routes its own stderr into the
  # report so failures are captured inline.
  {
    echo "--- WORKFLOW CUSTOM RESOURCES ---"
    kubectl get workflows -n "${ns}" -o yaml 2>&1 || echo "No workflows found"
    echo ""

    echo "--- WORKFLOW TEMPLATES ---"
    kubectl get workflowtemplates -n "${ns}" 2>&1 || echo "No workflow templates found"
    echo ""

    echo "--- PIPELINE RUNS ---"
    kubectl get runs -n "${ns}" -o wide --show-labels 2>&1 || echo "No pipeline runs found"
    echo ""

    echo "--- ARGO WORKFLOW CONTROLLER CONFIG ---"
    kubectl get configmap -n "${ns}" | grep -E "(workflow|argo)" 2>&1 || echo "No Argo-related ConfigMaps found"
    echo ""
  } >> "$OUTPUT_FILE"
}

# Collect logs from user namespaces (multi-user mode).
# Probes a fixed list of common user-namespace names; reads the
# file-level globals $NS and $OUTPUT_FILE.
collect_user_namespace_logs() {
  echo "===== USER NAMESPACE ANALYSIS =====" >> "$OUTPUT_FILE"

  # Common user namespace patterns
  local candidates=("kubeflow-user-example-com" "kubeflow-user-test" "default")
  local ns
  for ns in "${candidates[@]}"; do
    # Guard clauses: skip non-existent namespaces and the main test namespace.
    kubectl get namespace "$ns" &>/dev/null || continue
    [[ "$ns" != "$NS" ]] || continue

    echo "Found user namespace: $ns" >> "$OUTPUT_FILE"

    # All pods in user namespace
    echo "=== ALL USER NAMESPACE PODS ===" >> "$OUTPUT_FILE"
    kubectl get pods -n "$ns" -o wide >> "$OUTPUT_FILE" 2>&1 || echo "Failed to get pods in $ns" >> "$OUTPUT_FILE"
    echo "" >> "$OUTPUT_FILE"

    # Workflow/execution pods
    collect_user_workflow_pods "$ns"

    # Failed/pending pods
    collect_failed_pods "$ns"

    # Resource constraints
    collect_resource_info "$ns"

    # Recent events (last 30 only, to bound report size)
    echo "=== USER NAMESPACE EVENTS ===" >> "$OUTPUT_FILE"
    kubectl get events -n "$ns" --sort-by='.lastTimestamp' | tail -30 >> "$OUTPUT_FILE" 2>&1 || echo "No events in $ns" >> "$OUTPUT_FILE"
    echo "" >> "$OUTPUT_FILE"

    # Workflows in user namespace
    echo "--- USER NAMESPACE WORKFLOWS ---" >> "$OUTPUT_FILE"
    kubectl get workflows -n "$ns" >> "$OUTPUT_FILE" 2>&1 || echo "No workflows found in $ns" >> "$OUTPUT_FILE"
    echo "" >> "$OUTPUT_FILE"
  done
}

# Collect describe + logs for workflow/execution pods in a user namespace,
# matched by name pattern (pipeline/workflow/producer/consumer/dag-driver/system).
# Arguments: $1 - user namespace
collect_user_workflow_pods() {
  local ns=$1
  local matched pod

  matched=$(kubectl get pods -n "$ns" -o name 2>/dev/null | grep -E "(pipeline|workflow|producer|consumer|dag-driver|system)" || echo "")

  # Nothing matched: emit no section at all.
  [[ -n "$matched" ]] || return 0

  echo "=== USER NAMESPACE WORKFLOW PODS ===" >> "$OUTPUT_FILE"
  for pod in $matched; do
    # Strip the "pod/" prefix from `kubectl get -o name` output.
    pod=$(echo "$pod" | sed 's|pod/||')
    echo "--- User NS Pod: $pod ---" >> "$OUTPUT_FILE"

    kubectl describe pod "$pod" -n "$ns" >> "$OUTPUT_FILE" 2>&1 || echo "Failed to describe $pod" >> "$OUTPUT_FILE"
    echo "" >> "$OUTPUT_FILE"

    echo "Pod logs for $pod:" >> "$OUTPUT_FILE"
    kubectl logs "$pod" -n "$ns" --all-containers=true >> "$OUTPUT_FILE" 2>&1 || echo "No logs for $pod" >> "$OUTPUT_FILE"

    echo "Previous logs for $pod:" >> "$OUTPUT_FILE"
    kubectl logs "$pod" -n "$ns" --all-containers=true --previous >> "$OUTPUT_FILE" 2>&1 || echo "No previous logs for $pod" >> "$OUTPUT_FILE"
    echo "" >> "$OUTPUT_FILE"
  done
}

# Report pods that are neither Running nor Succeeded, with describe + logs.
# Arguments: $1 - user namespace
collect_failed_pods() {
  local ns=$1
  local bad_pods pod

  echo "=== FAILED/PENDING PODS ===" >> "$OUTPUT_FILE"
  bad_pods=$(kubectl get pods -n "$ns" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null || echo "")

  if [[ -z "$bad_pods" ]]; then
    echo "No failed/pending pods found" >> "$OUTPUT_FILE"
  else
    echo "Found non-running pods:" >> "$OUTPUT_FILE"
    for pod in $bad_pods; do
      # Strip the "pod/" prefix from `kubectl get -o name` output.
      pod=$(echo "$pod" | sed 's|pod/||')
      echo "--- Failed/Pending Pod: $pod ---" >> "$OUTPUT_FILE"

      kubectl describe pod "$pod" -n "$ns" >> "$OUTPUT_FILE" 2>&1 || echo "Failed to describe $pod" >> "$OUTPUT_FILE"
      echo "" >> "$OUTPUT_FILE"

      kubectl logs "$pod" -n "$ns" --all-containers=true >> "$OUTPUT_FILE" 2>&1 || echo "No logs for $pod" >> "$OUTPUT_FILE"
      echo "" >> "$OUTPUT_FILE"
    done
  fi
  echo "" >> "$OUTPUT_FILE"
}

# Collect resource quotas and limit ranges for a user namespace.
# Arguments: $1 - user namespace
collect_resource_info() {
  local user_ns=$1

  echo "=== RESOURCE QUOTAS ===" >> "$OUTPUT_FILE"
  kubectl get resourcequota -n "$user_ns" -o yaml >> "$OUTPUT_FILE" 2>&1 || echo "No resource quotas in $user_ns" >> "$OUTPUT_FILE"
  echo "" >> "$OUTPUT_FILE"

  echo "=== LIMIT RANGES ===" >> "$OUTPUT_FILE"
  kubectl get limitrange -n "$user_ns" -o yaml >> "$OUTPUT_FILE" 2>&1 || echo "No limit ranges in $user_ns" >> "$OUTPUT_FILE"
  echo "" >> "$OUTPUT_FILE"
}

# Main execution: validate the namespace, then build the full report.
# (The old display_pod_info entry point was replaced by
# collect_comprehensive_logs.)
check_namespace "$NS"
collect_comprehensive_logs "$NS"
18 changes: 18 additions & 0 deletions .github/workflows/e2e-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,15 @@ jobs:
tls_enabled: ${{ matrix.pod_to_pod_tls_enabled }}
ca_cert_path: ${{ env.CA_CERT_PATH }}

- name: Upload enhanced failure logs on test failure
uses: actions/upload-artifact@v4
if: ${{ always() && steps.test-run.outcome != 'success' && steps.configure.outcome == 'success' }}
with:
name: enhanced-failure-logs-${{ matrix.test_label}}-K8s-${{ matrix.k8s_version }}-cache-${{ matrix.cache_enabled }}-argo-${{ matrix.argo_version}}-proxy-${{ matrix.proxy }}-storage-${{ matrix.storage }}
path: /tmp/enhanced_failure_logs.txt
retention-days: 30
continue-on-error: true

- name: Notify test reports
shell: bash
if: ${{ steps.test-run.outcome == 'success' }}
Expand Down Expand Up @@ -264,6 +273,15 @@ jobs:
user_namespace: ${{ env.USER_NAMESPACE }}
report_name: "E2EMultiUserTests_K8s=${{ matrix.k8s_version }}_cacheEnabled=${{ matrix.cache_enabled }}_multiUser=${{ matrix.multi_user }}_storage=${{ matrix.storage }}"

- name: Upload enhanced failure logs on test failure
uses: actions/upload-artifact@v4
if: ${{ always() && steps.test-run.outcome != 'success' && steps.configure.outcome == 'success' }}
with:
name: enhanced-failure-logs-multiuser-K8s-${{ matrix.k8s_version }}-cache-${{ matrix.cache_enabled }}-storage-${{ matrix.storage }}
path: /tmp/enhanced_failure_logs.txt
retention-days: 30
continue-on-error: true

- name: Notify test reports
shell: bash
if: ${{ steps.test-run.outcome == 'success' }}
Expand Down
Loading
Loading