Skip to content

Commit 3f26e05

Browse files
test: fix TestRoundTripQueryCacheWithShardingMiddleware (#8472)
This commit refactors the TestRoundTripQueryCacheWithShardingMiddleware test to make it more reliable. The previous implementation had a design flaw because it expected that the PromQL sharding middleware would always issue 2 HTTP requests per attempt which isn't always the case (e.g. the first HTTP request might fail before the second request is issued). I've run the test on my local machine with `-count 1000` with different values of GOMAXPROCS (1, 2, 12) and it never failed. Signed-off-by: Simon Pasquier <[email protected]>
1 parent 2228202 commit 3f26e05

File tree

1 file changed

+74
-68
lines changed

1 file changed

+74
-68
lines changed

pkg/queryfrontend/roundtrip_test.go

Lines changed: 74 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package queryfrontend
66
import (
77
"context"
88
"encoding/json"
9+
"fmt"
910
"net/http"
1011
"net/http/httptest"
1112
"net/url"
@@ -532,6 +533,17 @@ func TestRoundTripQueryRangeCacheMiddleware(t *testing.T) {
532533
}
533534

534535
func TestRoundTripQueryCacheWithShardingMiddleware(t *testing.T) {
536+
// Run the test a couple of times because the behavior of the
537+
// PromQLShardingMiddleware middleware isn't 100% predictable.
538+
// See testRoundTripQueryCacheWithShardingMiddleware for details.
539+
for i := range 10 {
540+
if !t.Run(fmt.Sprintf("run-%d", i), testRoundTripQueryCacheWithShardingMiddleware) {
541+
return
542+
}
543+
}
544+
}
545+
546+
func testRoundTripQueryCacheWithShardingMiddleware(t *testing.T) {
535547
testRequest := &ThanosQueryRangeRequest{
536548
Path: "/api/v1/query_range",
537549
Start: 0,
@@ -569,57 +581,69 @@ func TestRoundTripQueryCacheWithShardingMiddleware(t *testing.T) {
569581
rt, err := newFakeRoundTripper()
570582
testutil.Ok(t, err)
571583
defer rt.Close()
572-
res, handler := promqlResultsWithFailures(3)
584+
count, handler := promqlResultsWithFailures(3)
573585
rt.setHandler(handler)
574586

575-
for _, tc := range []struct {
576-
name string
577-
req queryrange.Request
578-
err bool
579-
expected int64
580-
}{
581-
{
582-
name: "query with vertical sharding",
583-
req: testRequest,
584-
err: true,
585-
expected: 2,
586-
},
587-
{
588-
name: "same query as before, both requests are executed",
589-
req: testRequest,
590-
err: true,
591-
expected: 4,
592-
},
593-
{
594-
name: "same query as before, one request is executed",
595-
req: testRequest,
596-
err: false,
597-
expected: 5,
598-
},
599-
{
600-
name: "same query as before again, no requests are executed",
601-
req: testRequest,
602-
err: false,
603-
expected: 5,
604-
},
605-
} {
606-
if !t.Run(tc.name, func(t *testing.T) {
607-
ctx := user.InjectOrgID(context.Background(), "1")
608-
httpReq, err := NewThanosQueryRangeCodec(true).EncodeRequest(ctx, tc.req)
609-
testutil.Ok(t, err)
610-
611-
_, err = tpw(rt).RoundTrip(httpReq)
612-
if tc.err {
613-
testutil.NotOk(t, err)
614-
} else {
615-
testutil.Ok(t, err)
616-
}
617-
618-
testutil.Equals(t, tc.expected, res.Load())
619-
}) {
587+
var (
588+
rtErr error
589+
res *http.Response
590+
attempts int
591+
)
592+
// Depending on the timing of operations, the PromQLShardingMiddleware
593+
// middleware needs between 3 and 4 calls before returning successfully.
594+
//
595+
// Knowing that the downstream server is configured to fail the first 3
596+
// requests, the timeline can be one of the 2 cases below.
597+
//
598+
// "Best" case scenarios (3 attempts):
599+
// 1. the middleware issues 2 concurrent requests that both fail.
600+
// 2. the middleware issues 2 concurrent requests. One fails, the other
601+
// succeeds and is stored in the cache.
602+
// 3. the middleware issues 1 request for the remaining shard that succeeds
603+
// and is stored in the cache.
604+
// Or
605+
// 1. the middleware issues 1 request that fails before it could
606+
// initiate the second request
607+
// 2. the middleware issues 2 concurrent requests that both fail.
608+
// 3. the middleware issues 2 concurrent requests that both succeed and are
609+
// stored in the cache.
610+
// Note that in the last case, 1. and 2. may happen in the reverse order.
611+
//
612+
// "Worst" case scenario (4 attempts):
613+
// 1. the middleware issues 1 request that fails before it could
614+
// initiate the second request
615+
// 2. the middleware issues 1 request that fails before it could
616+
// initiate the second request
617+
// 3. the middleware issues 1 request that fails before it could
618+
// initiate the second request
619+
// 4. the middleware issues 2 concurrent requests that both succeed.
620+
for range 4 {
621+
attempts++
622+
ctx := user.InjectOrgID(context.Background(), "1")
623+
httpReq, err := NewThanosQueryRangeCodec(true).EncodeRequest(ctx, testRequest)
624+
testutil.Ok(t, err)
625+
626+
res, rtErr = tpw(rt).RoundTrip(httpReq)
627+
if rtErr == nil {
620628
break
621629
}
622630
}
631+
632+
testutil.Ok(t, rtErr)
633+
testutil.Equals(t, http.StatusOK, res.StatusCode)
634+
testutil.Assert(t, attempts == 3 || attempts == 4)
635+
636+
// Check that a subsequent request is served from the cache instead of
637+
// hitting the server.
638+
n := count.Load()
639+
ctx := user.InjectOrgID(context.Background(), "1")
640+
httpReq, err := NewThanosQueryRangeCodec(true).EncodeRequest(ctx, testRequest)
641+
testutil.Ok(t, err)
642+
643+
_, rtErr = tpw(rt).RoundTrip(httpReq)
644+
testutil.Ok(t, rtErr)
645+
testutil.Equals(t, http.StatusOK, res.StatusCode)
646+
testutil.Equals(t, n, count.Load())
623647
}
624648

625649
// TestRoundTripLabelsCacheMiddleware tests the cache middleware for labels requests.
@@ -876,10 +900,9 @@ func promqlResults(fail bool) (*int, http.Handler) {
876900
}
877901

878902
// promqlResultsWithFailures is a mock handler used to test split and cache middleware.
879-
// it will return a failed response numFailures times.
903+
// it will return a failed response for the first numFailures requests.
880904
func promqlResultsWithFailures(numFailures int) (*atomic.Int64, http.Handler) {
881905
count := &atomic.Int64{}
882-
var lock sync.Mutex
883906
q := queryrange.PrometheusResponse{
884907
Status: "success",
885908
Data: queryrange.PrometheusData{
@@ -896,33 +919,16 @@ func promqlResultsWithFailures(numFailures int) (*atomic.Int64, http.Handler) {
896919
},
897920
}
898921

899-
cond := sync.NewCond(&sync.Mutex{})
900-
cond.L.Lock()
901922
return count, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
902-
lock.Lock()
903-
defer lock.Unlock()
904-
905923
// Set fail in the response code to test retry.
906-
if numFailures > 0 {
907-
numFailures--
908-
909-
// Wait for a successful request.
910-
// Release the lock to allow other requests to execute.
911-
if numFailures == 0 {
912-
lock.Unlock()
913-
cond.Wait()
914-
<-time.After(500 * time.Millisecond)
915-
lock.Lock()
916-
}
924+
if count.Inc() <= int64(numFailures) {
917925
w.WriteHeader(500)
926+
return
918927
}
928+
919929
if err := json.NewEncoder(w).Encode(q); err != nil {
920930
panic(err)
921931
}
922-
if numFailures == 0 {
923-
cond.Broadcast()
924-
}
925-
count.Add(1)
926932
})
927933
}
928934

0 commit comments

Comments
 (0)