Skip to content

Commit f95534f

Browse files
authored
Chore: Attempt to fix flaky tests that fail often in CI (#6004)
Attempts to fix some of these flaky tests that fail often in CI:
1 parent 65a62d6 commit f95534f

File tree

5 files changed

+377
-18
lines changed

5 files changed

+377
-18
lines changed

cli/azd/internal/appdetect/java_test.go

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,26 @@ package appdetect
55

66
import (
77
"context"
8+
"fmt"
89
"log/slog"
910
"os"
1011
osexec "os/exec"
1112
"path/filepath"
13+
"strings"
1214
"testing"
15+
"time"
1316

1417
"github.com/azure/azure-dev/cli/azd/pkg/exec"
1518
"github.com/azure/azure-dev/cli/azd/pkg/tools/maven"
19+
"github.com/sethvargo/go-retry"
1620
)
1721

1822
func TestToMavenProject(t *testing.T) {
23+
// Skip in short mode since this test requires network access to Maven Central
24+
if testing.Short() {
25+
t.Skip("Skipping Maven network-dependent test in short mode")
26+
}
27+
1928
path, err := osexec.LookPath("java")
2029
if err != nil {
2130
t.Skip("Skip readMavenProject because java command doesn't exist.")
@@ -358,7 +367,12 @@ func TestToMavenProject(t *testing.T) {
358367
testPom := tt.testPoms[0]
359368
pomFilePath := filepath.Join(workingDir, testPom.pomFilePath)
360369

361-
mavenProject, err := readMavenProject(context.TODO(), maven.NewCli(exec.NewCommandRunner(nil)),
370+
// Use a timeout context to prevent hanging on network issues
371+
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
372+
defer cancel()
373+
374+
// Use retry logic for Maven operations due to potential network issues
375+
mavenProject, err := readMavenProjectWithRetry(ctx, maven.NewCli(exec.NewCommandRunner(nil)),
362376
pomFilePath)
363377
if err != nil {
364378
t.Fatalf("readMavenProject failed: %v", err)
@@ -377,6 +391,44 @@ func TestToMavenProject(t *testing.T) {
377391
}
378392
}
379393

394+
// readMavenProjectWithRetry wraps readMavenProject with retry logic to handle network issues
395+
func readMavenProjectWithRetry(ctx context.Context, mvnCli *maven.Cli, filePath string) (*mavenProject, error) {
396+
var mavenProject *mavenProject
397+
var lastErr error
398+
399+
err := retry.Do(
400+
ctx,
401+
retry.WithMaxRetries(3, retry.NewExponential(1*time.Second)),
402+
func(ctx context.Context) error {
403+
result, err := readMavenProject(ctx, mvnCli, filePath)
404+
if err != nil {
405+
// Check if error is likely network-related
406+
errStr := strings.ToLower(err.Error())
407+
if strings.Contains(errStr, "connection") ||
408+
strings.Contains(errStr, "timeout") ||
409+
strings.Contains(errStr, "network") ||
410+
strings.Contains(errStr, "unknown host") ||
411+
strings.Contains(errStr, "could not resolve") ||
412+
strings.Contains(errStr, "transfer failed") {
413+
lastErr = err
414+
return retry.RetryableError(err)
415+
}
416+
// For non-network errors (parsing, etc.), fail immediately
417+
return err
418+
}
419+
mavenProject = result
420+
return nil
421+
},
422+
)
423+
424+
if err != nil && lastErr != nil {
425+
// If we retried but still failed, include context about retries
426+
return nil, fmt.Errorf("maven operation failed after retries due to network issues: %w", lastErr)
427+
}
428+
429+
return mavenProject, err
430+
}
431+
380432
type testPom struct {
381433
pomFilePath string
382434
pomContentString string

cli/azd/test/functional/vs_server_test.go

Lines changed: 46 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -51,18 +51,33 @@ func Test_CLI_VsServerExternalAuth(t *testing.T) {
5151
err := cmd.Start()
5252
require.NoError(t, err)
5353

54-
// Wait for the server to start
55-
for i := 0; i < 5; i++ {
56-
time.Sleep(300 * time.Millisecond)
54+
// Wait for the server to start and output complete JSON
55+
var svr contracts.VsServerResult
56+
var outputData []byte
57+
maxAttempts := 20 // Increased from 5 to give more time
58+
for i := 0; i < maxAttempts; i++ {
59+
time.Sleep(150 * time.Millisecond) // Reduced sleep time but more attempts
5760
if stdout.Len() > 0 {
58-
break
61+
outputData = stdout.Bytes()
62+
// Try to parse JSON - if it succeeds, we have complete output
63+
if err := json.Unmarshal(outputData, &svr); err == nil {
64+
break
65+
}
66+
// If we're on the last attempt and still can't parse, fail with helpful error
67+
if i == maxAttempts-1 {
68+
require.NoError(
69+
t,
70+
err,
71+
"failed to parse JSON after %d attempts, output: %s",
72+
maxAttempts,
73+
string(outputData),
74+
)
75+
}
76+
} else if i == maxAttempts-1 {
77+
require.FailNow(t, "server did not produce any output after %d attempts", maxAttempts)
5978
}
6079
}
6180

62-
var svr contracts.VsServerResult
63-
err = json.Unmarshal(stdout.Bytes(), &svr)
64-
require.NoError(t, err, "value: %s", stdout.String())
65-
6681
ssConn, _, err := websocket.DefaultDialer.Dial(fmt.Sprintf("ws://127.0.0.1:%d/ServerService/v1.0", svr.Port), nil)
6782
require.NoError(t, err)
6883

@@ -235,18 +250,33 @@ func Test_CLI_VsServer(t *testing.T) {
235250
err = cmd.Start()
236251
require.NoError(t, err)
237252

238-
// Wait for the server to start
239-
for i := 0; i < 5; i++ {
240-
time.Sleep(300 * time.Millisecond)
253+
// Wait for the server to start and output complete JSON
254+
var svr contracts.VsServerResult
255+
var outputData []byte
256+
maxAttempts := 20 // Increased from 5 to give more time
257+
for i := 0; i < maxAttempts; i++ {
258+
time.Sleep(150 * time.Millisecond) // Reduced sleep time but more attempts
241259
if stdout.Len() > 0 {
242-
break
260+
outputData = stdout.Bytes()
261+
// Try to parse JSON - if it succeeds, we have complete output
262+
if err := json.Unmarshal(outputData, &svr); err == nil {
263+
break
264+
}
265+
// If we're on the last attempt and still can't parse, fail with helpful error
266+
if i == maxAttempts-1 {
267+
require.NoError(
268+
t,
269+
err,
270+
"failed to parse JSON after %d attempts, output: %s",
271+
maxAttempts,
272+
string(outputData),
273+
)
274+
}
275+
} else if i == maxAttempts-1 {
276+
require.FailNow(t, "server did not produce any output after %d attempts", maxAttempts)
243277
}
244278
}
245279

246-
var svr contracts.VsServerResult
247-
err = json.Unmarshal(stdout.Bytes(), &svr)
248-
require.NoError(t, err, "value: %s", stdout.String())
249-
250280
/* #nosec G204 - Subprocess launched with a potential tainted input or cmd arguments false positive */
251281
cmd = exec.CommandContext(context.Background(),
252282
"dotnet", "test",

cli/azd/test/recording/recording.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,10 +195,14 @@ func Start(t *testing.T, opts ...Options) *Session {
195195
}
196196

197197
transport := http.DefaultTransport.(*http.Transport).Clone()
198-
vcr.SetRealTransport(&gzip2HttpRoundTripper{
198+
199+
// Wrap the transport chain with resilient retry logic and gzip handling
200+
resilientTransport := NewResilientHttpTransport(&gzip2HttpRoundTripper{
199201
transport: transport,
200202
})
201203

204+
vcr.SetRealTransport(resilientTransport)
205+
202206
vcr.SetMatcher(func(r *http.Request, i cassette.Request) bool {
203207
// Ignore query parameter 's=...' in containerappOperationResults
204208
if strings.Contains(r.URL.Path, "/providers/Microsoft.App/") &&
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
package recording
5+
6+
import (
7+
"context"
8+
"net/http"
9+
"strings"
10+
"time"
11+
12+
"github.com/sethvargo/go-retry"
13+
)
14+
15+
// networkErrorKeywords lists lower-case substrings that, when found in an error
// message, mark the failure as network-related and therefore worth retrying.
// isNetworkError matches by substring, so the broad entries ("timeout",
// "connection", "network") already cover the more specific phrases listed
// after them; the specific phrases are kept for readability.
var networkErrorKeywords = []string{
	"timeout", "connection", "network", "dns",
	"tls handshake", "context deadline exceeded", "request timeout",
	"no such host", "connection refused", "connection reset",
	"i/o timeout", "network is unreachable",
	"temporary failure", "service unavailable",
}
33+
34+
// resilientHttpTransport is an http.RoundTripper decorator: it forwards requests to the
// wrapped transport and retries them when the failure looks network-related, so that the
// test recorder tolerates transient network issues.
type resilientHttpTransport struct {
	// transport is the underlying round tripper that actually performs each attempt.
	transport http.RoundTripper
}
39+
40+
// NewResilientHttpTransport creates a new resilient HTTP transport that wraps the provided transport
41+
func NewResilientHttpTransport(transport http.RoundTripper) *resilientHttpTransport {
42+
return &resilientHttpTransport{
43+
transport: transport,
44+
}
45+
}
46+
47+
// isNetworkError checks if an error message contains keywords indicating a network-related failure
48+
func isNetworkError(err error) bool {
49+
if err == nil {
50+
return false
51+
}
52+
53+
errStr := strings.ToLower(err.Error())
54+
for _, keyword := range networkErrorKeywords {
55+
if strings.Contains(errStr, keyword) {
56+
return true
57+
}
58+
}
59+
return false
60+
}
61+
62+
// RoundTrip implements http.RoundTripper with retry logic for network failures
63+
func (r *resilientHttpTransport) RoundTrip(req *http.Request) (*http.Response, error) {
64+
var resp *http.Response
65+
var err error
66+
67+
// Retry logic with exponential backoff for network failures
68+
retryErr := retry.Do(
69+
req.Context(),
70+
retry.WithMaxRetries(3, retry.NewExponential(2*time.Second)),
71+
func(ctx context.Context) error {
72+
resp, err = r.transport.RoundTrip(req)
73+
if err != nil {
74+
// Check if error is likely network-related
75+
if isNetworkError(err) {
76+
return retry.RetryableError(err)
77+
}
78+
// For non-network errors, fail immediately
79+
return err
80+
}
81+
return nil
82+
},
83+
)
84+
85+
if retryErr != nil {
86+
return nil, retryErr
87+
}
88+
return resp, nil
89+
}

0 commit comments

Comments
 (0)