diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/directconnectivity/GoneAndRetryWithRetryPolicyTest.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/directconnectivity/GoneAndRetryWithRetryPolicyTest.java index 213a71ebff59..e951b2078b43 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/directconnectivity/GoneAndRetryWithRetryPolicyTest.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/directconnectivity/GoneAndRetryWithRetryPolicyTest.java @@ -12,6 +12,7 @@ import com.azure.cosmos.implementation.LeaseNotFoundException; import com.azure.cosmos.implementation.OperationType; import com.azure.cosmos.implementation.PartitionIsMigratingException; +import com.azure.cosmos.implementation.PartitionKeyRangeGoneException; import com.azure.cosmos.implementation.PartitionKeyRangeIsSplittingException; import com.azure.cosmos.implementation.RequestTimeoutException; import com.azure.cosmos.implementation.ResourceType; @@ -325,6 +326,66 @@ public void shouldRetryWithPartitionKeyRangeIsSplittingException() { } + /** + * Retry with address resolution PartitionKeyRangeGoneException + */ + @Test(groups = { "unit" }, timeOut = TIMEOUT) + public void shouldRetryWithAddressResolutionPartitionKeyRangeGoneException() { + RxDocumentServiceRequest request = RxDocumentServiceRequest.create( + mockDiagnosticsClientContext(), + OperationType.Read, + ResourceType.Document); + GoneAndRetryWithRetryPolicy goneAndRetryWithRetryPolicy = new GoneAndRetryWithRetryPolicy(request, 30); + Mono<ShouldRetryResult> singleShouldRetry = goneAndRetryWithRetryPolicy + .shouldRetry(new PartitionKeyRangeGoneException().markRetryWithRoutingMapRefresh()); + ShouldRetryResult shouldRetryResult = singleShouldRetry.block(); + assertThat(shouldRetryResult.shouldRetry).isTrue(); + assertThat(request.forcePartitionKeyRangeRefresh).isTrue(); + 
assertThat(request.requestContext.resolvedPartitionKeyRange).isNull(); + assertThat(request.requestContext.quorumSelectedLSN).isEqualTo(-1); + assertThat(shouldRetryResult.policyArg.getValue0()).isFalse(); + } + + @Test(groups = { "unit" }, timeOut = TIMEOUT) + public void shouldNotRetryWithPartitionKeyRangeGoneException() { + RxDocumentServiceRequest request = RxDocumentServiceRequest.create( + mockDiagnosticsClientContext(), + OperationType.Read, + ResourceType.Document); + GoneAndRetryWithRetryPolicy goneAndRetryWithRetryPolicy = new GoneAndRetryWithRetryPolicy(request, 30); + ShouldRetryResult shouldRetryResult = goneAndRetryWithRetryPolicy + .shouldRetry(new PartitionKeyRangeGoneException()) + .block(); + + assertThat(shouldRetryResult.shouldRetry).isFalse(); + } + + @Test(groups = { "unit" }, timeOut = TIMEOUT) + public void shouldWrapAddressResolutionPartitionKeyRangeGoneExceptionWithServiceUnavailableWhenRetryBudgetExhausted() { + RxDocumentServiceRequest request = RxDocumentServiceRequest.create( + mockDiagnosticsClientContext(), + OperationType.Read, + ResourceType.Document); + GoneAndRetryWithRetryPolicy goneAndRetryWithRetryPolicy = new GoneAndRetryWithRetryPolicy(request, 0); + + ShouldRetryResult shouldRetryResult = goneAndRetryWithRetryPolicy + .shouldRetry(new PartitionKeyRangeGoneException().markRetryWithRoutingMapRefresh()) + .block(); + assertThat(shouldRetryResult.shouldRetry).isTrue(); + + shouldRetryResult = goneAndRetryWithRetryPolicy + .shouldRetry(new PartitionKeyRangeGoneException().markRetryWithRoutingMapRefresh()) + .block(); + + assertThat(shouldRetryResult.shouldRetry).isFalse(); + assertThat(shouldRetryResult.exception).isInstanceOf(CosmosException.class); + + CosmosException cosmosException = (CosmosException) shouldRetryResult.exception; + assertThat(cosmosException.getStatusCode()).isEqualTo(HttpConstants.StatusCodes.SERVICE_UNAVAILABLE); + assertThat(cosmosException.getSubStatusCode()) + 
.isEqualTo(HttpConstants.SubStatusCodes.PARTITION_KEY_RANGE_GONE_EXCEEDED_RETRY_LIMIT); + } + /** * No retry on bad request exception */ diff --git a/sdk/cosmos/azure-cosmos/CHANGELOG.md b/sdk/cosmos/azure-cosmos/CHANGELOG.md index 25b8545acfff..ade6abe0c0ac 100644 --- a/sdk/cosmos/azure-cosmos/CHANGELOG.md +++ b/sdk/cosmos/azure-cosmos/CHANGELOG.md @@ -7,6 +7,7 @@ #### Breaking Changes #### Bugs Fixed +* Fixed direct connectivity retries for `PartitionKeyRangeGoneException` thrown during address resolution so stale partition key ranges trigger routing-map refresh and exhausted retries surface as 503 with partition-key-range-gone retry-limit substatus. - See [Issue 49381](https://github.com/Azure/azure-sdk-for-java/issues/49381). * Unified request-level consistency override behavior across transports: invalid attempts to upgrade the request consistency level above the account default are now silently ignored instead of returning `BadRequest` in some gateway paths. - See PR [49606](https://github.com/Azure/azure-sdk-for-java/pull/49606). #### Other Changes diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/PartitionKeyRangeGoneException.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/PartitionKeyRangeGoneException.java index fc181f14bbf2..9e4f99f04237 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/PartitionKeyRangeGoneException.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/PartitionKeyRangeGoneException.java @@ -15,10 +15,12 @@ * This exception is thrown when DocumentServiceRequest contains x-ms-documentdb-partitionkeyrangeid * header and such range id doesn't exist. *

- * No retries should be made in this case, as either split or merge might have happened and query/readfeed - * must take appropriate actions. + * No retries should generally be made in this case, as either split or merge might have happened and query/readfeed + * must take appropriate actions. Direct-mode address resolution may opt into retrying with a routing map refresh when + * this exception is caused by stale address or routing state. */ public class PartitionKeyRangeGoneException extends CosmosException { + private boolean shouldRetryWithRoutingMapRefresh; /** * Instantiates a new Partition key range gone exception. @@ -85,4 +87,13 @@ private void setSubstatus() { this.getResponseHeaders().put(WFConstants.BackendHeaders.SUB_STATUS, Integer.toString(HttpConstants.SubStatusCodes.PARTITION_KEY_RANGE_GONE)); } + + public boolean shouldRetryWithRoutingMapRefresh() { + return this.shouldRetryWithRoutingMapRefresh; + } + + public PartitionKeyRangeGoneException markRetryWithRoutingMapRefresh() { + this.shouldRetryWithRoutingMapRefresh = true; + return this; + } } diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/AddressResolver.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/AddressResolver.java index 54d88d6ad670..ad1fdf0f4766 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/AddressResolver.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/AddressResolver.java @@ -583,7 +583,9 @@ private ResolutionResult handleRangeAddressResolutionFailure( RMResources.PartitionKeyRangeNotFound, request.getPartitionKeyRangeIdentity().getPartitionKeyRangeId(), request.getPartitionKeyRangeIdentity().getCollectionRid()); - throw BridgeInternal.setResourceAddress(new PartitionKeyRangeGoneException(errorMessage), request.requestContext.resourcePhysicalAddress); + throw 
BridgeInternal.setResourceAddress( + new PartitionKeyRangeGoneException(errorMessage).markRetryWithRoutingMapRefresh(), + request.requestContext.resourcePhysicalAddress); } logger.debug("handleRangeAddressResolutionFailure returns null"); return null; diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/GatewayAddressCache.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/GatewayAddressCache.java index 97773c6aae32..be1f9c4e610e 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/GatewayAddressCache.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/GatewayAddressCache.java @@ -732,7 +732,8 @@ private Mono getAddressesForRangeId( partitionKeyRangeId, collectionRid); - PartitionKeyRangeGoneException e = new PartitionKeyRangeGoneException(errorMessage); + PartitionKeyRangeGoneException e = new PartitionKeyRangeGoneException(errorMessage) + .markRetryWithRoutingMapRefresh(); BridgeInternal.setResourceAddress(e, collectionRid); return Mono.error(e); diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/GoneAndRetryWithRetryPolicy.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/GoneAndRetryWithRetryPolicy.java index 1822baf02b23..051825aedb60 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/GoneAndRetryWithRetryPolicy.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/GoneAndRetryWithRetryPolicy.java @@ -38,6 +38,11 @@ private static ImplementationBridgeHelpers.CosmosExceptionHelper.CosmosException return ImplementationBridgeHelpers.CosmosExceptionHelper.getCosmosExceptionAccessor(); } + private static boolean isPartitionKeyRangeGoneExceptionWithRoutingMapRefresh(Exception exception) { 
+ return exception instanceof PartitionKeyRangeGoneException && + ((PartitionKeyRangeGoneException) exception).shouldRetryWithRoutingMapRefresh(); + } + private final static Logger logger = LoggerFactory.getLogger(GoneAndRetryWithRetryPolicy.class); private final GoneRetryPolicy goneRetryPolicy; private final RetryWithRetryPolicy retryWithRetryPolicy; @@ -127,6 +132,7 @@ private boolean isNonRetryableException(Exception exception) { if (exception instanceof GoneException || exception instanceof PartitionIsMigratingException || exception instanceof PartitionKeyRangeIsSplittingException || + isPartitionKeyRangeGoneExceptionWithRoutingMapRefresh(exception) || exception instanceof LeaseNotFoundException) { return false; @@ -292,6 +298,8 @@ private Pair<Mono<ShouldRetryResult>, Boolean> handleException(Exception excepti return handlePartitionIsMigratingException((PartitionIsMigratingException)exception); } else if (exception instanceof PartitionKeyRangeIsSplittingException) { return handlePartitionKeyIsSplittingException((PartitionKeyRangeIsSplittingException) exception); + } else if (isPartitionKeyRangeGoneExceptionWithRoutingMapRefresh(exception)) { + return handlePartitionKeyRangeGoneException((PartitionKeyRangeGoneException) exception); } throw new IllegalStateException("Invalid exception type", exception); @@ -309,13 +317,27 @@ private Pair<Mono<ShouldRetryResult>, Boolean> handlePartitionIsMigratingExcepti } private Pair<Mono<ShouldRetryResult>, Boolean> handlePartitionKeyIsSplittingException(PartitionKeyRangeIsSplittingException exception) { - this.request.requestContext.resolvedPartitionKeyRange = null; - this.request.requestContext.quorumSelectedLSN = -1; - this.request.requestContext.quorumSelectedStoreResponse = null; + resetRequestContextForPartitionKeyRangeRefresh(); logger.debug("Received partition key range splitting exception, will retry, {}", exception.toString()); this.request.forcePartitionKeyRangeRefresh = true; return Pair.of(null, false); } + + private Pair<Mono<ShouldRetryResult>, Boolean> 
handlePartitionKeyRangeGoneException(PartitionKeyRangeGoneException exception) { + // PartitionKeyRangeGoneException is generally treated as non-retriable, but when it is thrown while resolving + // addresses in direct mode it typically indicates stale routing/partition state; clear the cached target and + // force a routing-map (partition key range) refresh to allow the request to be re-routed. + resetRequestContextForPartitionKeyRangeRefresh(); + logger.debug("Received partition key range gone exception, will retry, {}", exception.toString()); + this.request.forcePartitionKeyRangeRefresh = true; + return Pair.of(null, false); + } + + private void resetRequestContextForPartitionKeyRangeRefresh() { + this.request.requestContext.resolvedPartitionKeyRange = null; + this.request.requestContext.quorumSelectedLSN = -1; + this.request.requestContext.quorumSelectedStoreResponse = null; + } } }