Skip to content

Commit 6d9c3b8

Browse files
authored
fix: transaction retries should not timeout (#1009)
Transactions that are retried because of an aborted transaction use the retry settings of the Rollback RPC. This ensures reasonable backoff values. It however also meant that transactions that are retried multiple times could exceed the total timeout of the retry settings, and that again would cause the Aborted error to propagate. This change sets the total timeout for transaction retries to 24 hours and disables any max attempts in the retry settings to prevent retries to fail because the deadline is exceeded. Transactions can still fail with timeout errors if individual RPC invocations exceed the configured timeout of that RPC. This change only prevents timeouts from occurring because of repeated retries of an entire transaction. Fixes #1008
1 parent 865bf01 commit 6d9c3b8

File tree

2 files changed

+81
-6
lines changed

2 files changed

+81
-6
lines changed

google-cloud-spanner/src/main/java/com/google/cloud/spanner/SpannerRetryHelper.java

+19-6
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
package com.google.cloud.spanner;
1818

19+
import com.google.api.core.ApiClock;
1920
import com.google.api.core.NanoClock;
2021
import com.google.api.gax.retrying.ResultRetryAlgorithm;
2122
import com.google.api.gax.retrying.RetrySettings;
@@ -24,6 +25,7 @@
2425
import com.google.cloud.RetryHelper.RetryHelperException;
2526
import com.google.cloud.spanner.v1.stub.SpannerStub;
2627
import com.google.cloud.spanner.v1.stub.SpannerStubSettings;
28+
import com.google.common.annotations.VisibleForTesting;
2729
import com.google.common.base.Throwables;
2830
import com.google.spanner.v1.RollbackRequest;
2931
import io.grpc.Context;
@@ -45,25 +47,36 @@ class SpannerRetryHelper {
4547
* retrying aborted transactions will also automatically be updated if the default retry settings
4648
* are updated.
4749
*
50+
* <p>A read/write transaction should not timeout while retrying. The total timeout of the retry
51+
* settings is therefore set to 24 hours and there is no max attempts value.
52+
*
4853
* <p>These default {@link RetrySettings} are only used if no retry information is returned by the
4954
* {@link AbortedException}.
5055
*/
51-
private static final RetrySettings txRetrySettings =
52-
SpannerStubSettings.newBuilder().rollbackSettings().getRetrySettings();
56+
@VisibleForTesting
57+
static final RetrySettings txRetrySettings =
58+
SpannerStubSettings.newBuilder()
59+
.rollbackSettings()
60+
.getRetrySettings()
61+
.toBuilder()
62+
.setTotalTimeout(Duration.ofHours(24L))
63+
.setMaxAttempts(0)
64+
.build();
5365

5466
/** Executes the {@link Callable} and retries if it fails with an {@link AbortedException}. */
5567
static <T> T runTxWithRetriesOnAborted(Callable<T> callable) {
56-
return runTxWithRetriesOnAborted(callable, txRetrySettings);
68+
return runTxWithRetriesOnAborted(callable, txRetrySettings, NanoClock.getDefaultClock());
5769
}
5870

5971
/**
6072
* Executes the {@link Callable} and retries if it fails with an {@link AbortedException} using
6173
* the specific {@link RetrySettings}.
6274
*/
63-
static <T> T runTxWithRetriesOnAborted(Callable<T> callable, RetrySettings retrySettings) {
75+
@VisibleForTesting
76+
static <T> T runTxWithRetriesOnAborted(
77+
Callable<T> callable, RetrySettings retrySettings, ApiClock clock) {
6478
try {
65-
return RetryHelper.runWithRetries(
66-
callable, retrySettings, new TxRetryAlgorithm<>(), NanoClock.getDefaultClock());
79+
return RetryHelper.runWithRetries(callable, retrySettings, new TxRetryAlgorithm<>(), clock);
6780
} catch (RetryHelperException e) {
6881
if (e.getCause() != null) {
6982
Throwables.throwIfUnchecked(e.getCause());

google-cloud-spanner/src/test/java/com/google/cloud/spanner/SpannerRetryHelperTest.java

+62
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@
1717
package com.google.cloud.spanner;
1818

1919
import static com.google.common.truth.Truth.assertThat;
20+
import static org.junit.Assert.assertEquals;
2021
import static org.junit.Assert.fail;
2122

23+
import com.google.api.core.ApiClock;
2224
import com.google.common.base.Stopwatch;
2325
import com.google.common.util.concurrent.ThreadFactoryBuilder;
2426
import com.google.protobuf.Duration;
@@ -42,6 +44,66 @@
4244

4345
@RunWith(JUnit4.class)
4446
public class SpannerRetryHelperTest {
47+
private static class FakeClock implements ApiClock {
48+
private long currentTime;
49+
50+
@Override
51+
public long nanoTime() {
52+
return TimeUnit.NANOSECONDS.convert(currentTime, TimeUnit.MILLISECONDS);
53+
}
54+
55+
@Override
56+
public long millisTime() {
57+
return currentTime;
58+
}
59+
}
60+
61+
@Test
62+
public void testRetryDoesNotTimeoutAfterTenMinutes() {
63+
final FakeClock clock = new FakeClock();
64+
final AtomicInteger attempts = new AtomicInteger();
65+
Callable<Integer> callable =
66+
new Callable<Integer>() {
67+
@Override
68+
public Integer call() {
69+
if (attempts.getAndIncrement() == 0) {
70+
clock.currentTime += TimeUnit.MILLISECONDS.convert(10L, TimeUnit.MINUTES);
71+
throw SpannerExceptionFactory.newSpannerException(ErrorCode.ABORTED, "test");
72+
}
73+
return 1 + 1;
74+
}
75+
};
76+
assertEquals(
77+
2,
78+
SpannerRetryHelper.runTxWithRetriesOnAborted(
79+
callable, SpannerRetryHelper.txRetrySettings, clock)
80+
.intValue());
81+
}
82+
83+
@Test
84+
public void testRetryDoesFailAfterMoreThanOneDay() {
85+
final FakeClock clock = new FakeClock();
86+
final AtomicInteger attempts = new AtomicInteger();
87+
Callable<Integer> callable =
88+
new Callable<Integer>() {
89+
@Override
90+
public Integer call() {
91+
if (attempts.getAndIncrement() == 0) {
92+
clock.currentTime += TimeUnit.MILLISECONDS.convert(25L, TimeUnit.HOURS);
93+
throw SpannerExceptionFactory.newSpannerException(ErrorCode.ABORTED, "test");
94+
}
95+
return 1 + 1;
96+
}
97+
};
98+
try {
99+
SpannerRetryHelper.runTxWithRetriesOnAborted(
100+
callable, SpannerRetryHelper.txRetrySettings, clock);
101+
fail("missing expected exception");
102+
} catch (SpannerException e) {
103+
assertEquals(ErrorCode.ABORTED, e.getErrorCode());
104+
assertEquals(1, attempts.get());
105+
}
106+
}
45107

46108
@Test
47109
public void testCancelledContext() {

0 commit comments

Comments
 (0)