Lokasi ngalangkungan proxy:   [ UP ]  
[Ngawartoskeun bug]   [Panyetelan cookie]                
Skip to content
This repository was archived by the owner on Feb 24, 2026. It is now read-only.

Commit a5157fa

Browse files
feat: Add reconnect support to v1 client lib. (#1446)
* fix: update code comment to reflect max size change * feat: add reconnection support to v1 client * 🦉 Updates from OwlBot See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent b8f1edb commit a5157fa

4 files changed

Lines changed: 227 additions & 43 deletions

File tree

google-cloud-bigquerystorage/src/main/java/com/google/cloud/bigquery/storage/v1/StreamWriter.java

Lines changed: 111 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import com.google.api.gax.core.CredentialsProvider;
2121
import com.google.api.gax.rpc.FixedHeaderProvider;
2222
import com.google.api.gax.rpc.TransportChannelProvider;
23+
import com.google.cloud.bigquery.storage.util.Errors;
2324
import com.google.cloud.bigquery.storage.v1.AppendRowsRequest.ProtoData;
2425
import com.google.cloud.bigquery.storage.v1.StreamConnection.DoneCallback;
2526
import com.google.cloud.bigquery.storage.v1.StreamConnection.RequestCallback;
@@ -90,6 +91,26 @@ public class StreamWriter implements AutoCloseable {
9091
@GuardedBy("lock")
9192
private long inflightBytes = 0;
9293

94+
/*
95+
* Tracks how often the stream was closed due to a retriable error. Streaming will stop when the
96+
 * count hits a threshold. Streaming should only be halted if it isn't possible to establish a
97+
* connection. Keep track of the number of reconnections in succession. This will be reset if
98+
* a row is successfully called back.
99+
*/
100+
@GuardedBy("lock")
101+
private long conectionRetryCountWithoutCallback = 0;
102+
103+
/*
104+
* If false, streamConnection needs to be reset.
105+
*/
106+
@GuardedBy("lock")
107+
private boolean streamConnectionIsConnected = false;
108+
109+
/*
110+
* Retry threshold, limits how often the connection is retried before processing halts.
111+
*/
112+
private static final long RETRY_THRESHOLD = 3;
113+
93114
/*
94115
* Indicates whether user has called Close() or not.
95116
*/
@@ -173,6 +194,18 @@ private StreamWriter(Builder builder) throws IOException {
173194
this.ownsBigQueryWriteClient = false;
174195
}
175196

197+
this.appendThread =
198+
new Thread(
199+
new Runnable() {
200+
@Override
201+
public void run() {
202+
appendLoop();
203+
}
204+
});
205+
this.appendThread.start();
206+
}
207+
208+
private void resetConnection() {
176209
this.streamConnection =
177210
new StreamConnection(
178211
this.client,
@@ -188,15 +221,6 @@ public void run(Throwable finalStatus) {
188221
doneCallback(finalStatus);
189222
}
190223
});
191-
this.appendThread =
192-
new Thread(
193-
new Runnable() {
194-
@Override
195-
public void run() {
196-
appendLoop();
197-
}
198-
});
199-
this.appendThread.start();
200224
}
201225

202226
/**
@@ -331,12 +355,27 @@ public void close() {
331355
* It takes requests from waiting queue and sends them to server.
332356
*/
333357
private void appendLoop() {
334-
boolean isFirstRequestInConnection = true;
335358
Deque<AppendRequestAndResponse> localQueue = new LinkedList<AppendRequestAndResponse>();
359+
boolean streamNeedsConnecting = false;
360+
// Set firstRequestInConnection to true immediately after connecting the stream,
361+
// which indicates that the next row sent needs the schema and other metadata.
362+
boolean isFirstRequestInConnection = true;
336363
while (!waitingQueueDrained()) {
337364
this.lock.lock();
338365
try {
339366
hasMessageInWaitingQueue.await(100, TimeUnit.MILLISECONDS);
367+
// Copy the streamConnectionIsConnected guarded by lock to a local variable.
368+
// In addition, only reconnect if there is a retriable error.
369+
streamNeedsConnecting = !streamConnectionIsConnected && connectionFinalStatus == null;
370+
if (streamNeedsConnecting) {
371+
// If the stream connection is broken, any requests on inflightRequestQueue will need
372+
// to be resent, as the new connection has no knowledge of the requests. Copy the requests
373+
// from inflightRequestQueue and prepend them onto the waitingRequestQueue. They need to be
374+
// prepended as they need to be sent before new requests.
375+
while (!inflightRequestQueue.isEmpty()) {
376+
waitingRequestQueue.addFirst(inflightRequestQueue.pollLast());
377+
}
378+
}
340379
while (!this.waitingRequestQueue.isEmpty()) {
341380
AppendRequestAndResponse requestWrapper = this.waitingRequestQueue.pollFirst();
342381
this.inflightRequestQueue.addLast(requestWrapper);
@@ -355,12 +394,34 @@ private void appendLoop() {
355394
if (localQueue.isEmpty()) {
356395
continue;
357396
}
358-
359-
// TODO: Add reconnection here.
397+
if (streamNeedsConnecting) {
398+
// Set streamConnectionIsConnected to true, to indicate the stream has been connected. This
399+
// should happen before the call to resetConnection, as it is unknown when the connection
400+
// could be closed and the doneCallback called, and thus clearing the flag.
401+
lock.lock();
402+
try {
403+
this.streamConnectionIsConnected = true;
404+
} finally {
405+
lock.unlock();
406+
}
407+
resetConnection();
408+
// Set firstRequestInConnection to indicate the next request to be sent should include
409+
// metadata.
410+
isFirstRequestInConnection = true;
411+
}
360412
while (!localQueue.isEmpty()) {
361413
AppendRowsRequest preparedRequest =
362414
prepareRequestBasedOnPosition(
363415
localQueue.pollFirst().message, isFirstRequestInConnection);
416+
// Send should only throw an exception if there is a problem with the request. The catch
417+
// block will handle this case, and return the exception with the result.
418+
// Otherwise send will return:
419+
// SUCCESS: Message was sent, wait for the callback.
420+
// STREAM_CLOSED: Stream was closed, normally or due to an error.
421+
// NOT_ENOUGH_QUOTA: Message wasn't sent due to not enough quota.
422+
// TODO: Handle NOT_ENOUGH_QUOTA.
423+
// In the close case, the request is in the inflight queue, and will either be returned
424+
// to the user with an error, or will be resent.
364425
this.streamConnection.send(preparedRequest);
365426
isFirstRequestInConnection = false;
366427
}
@@ -369,8 +430,10 @@ private void appendLoop() {
369430
log.fine("Cleanup starts. Stream: " + streamName);
370431
// At this point, the waiting queue is drained, so no more requests.
371432
// We can close the stream connection and handle the remaining inflight requests.
372-
this.streamConnection.close();
373-
waitForDoneCallback();
433+
if (streamConnection != null) {
434+
this.streamConnection.close();
435+
waitForDoneCallback();
436+
}
374437

375438
// At this point, there cannot be more callback. It is safe to clean up all inflight requests.
376439
log.fine(
@@ -455,6 +518,12 @@ private void requestCallback(AppendRowsResponse response) {
455518
AppendRequestAndResponse requestWrapper;
456519
this.lock.lock();
457520
try {
521+
// Had a successful connection with at least one result, reset retries.
522+
// conectionRetryCountWithoutCallback is reset so that only multiple retries, without
523+
// successful records sent, will cause the stream to fail.
524+
if (conectionRetryCountWithoutCallback != 0) {
525+
conectionRetryCountWithoutCallback = 0;
526+
}
458527
requestWrapper = pollInflightRequestQueue();
459528
} finally {
460529
this.lock.unlock();
@@ -476,6 +545,14 @@ private void requestCallback(AppendRowsResponse response) {
476545
}
477546
}
478547

548+
private boolean isRetriableError(Throwable t) {
549+
Status status = Status.fromThrowable(t);
550+
if (Errors.isRetryableInternalStatus(status)) {
551+
return true;
552+
}
553+
return status.getCode() == Status.Code.ABORTED || status.getCode() == Status.Code.UNAVAILABLE;
554+
}
555+
479556
private void doneCallback(Throwable finalStatus) {
480557
log.fine(
481558
"Received done callback. Stream: "
@@ -484,7 +561,26 @@ private void doneCallback(Throwable finalStatus) {
484561
+ finalStatus.toString());
485562
this.lock.lock();
486563
try {
487-
this.connectionFinalStatus = finalStatus;
564+
this.streamConnectionIsConnected = false;
565+
if (connectionFinalStatus == null) {
566+
// If the error can be retried, don't set it here, let it try to retry later on.
567+
if (isRetriableError(finalStatus)
568+
&& conectionRetryCountWithoutCallback < RETRY_THRESHOLD
569+
&& !userClosed) {
570+
this.conectionRetryCountWithoutCallback++;
571+
log.fine(
572+
"Retriable error "
573+
+ finalStatus.toString()
574+
+ " received, retry count "
575+
+ conectionRetryCountWithoutCallback
576+
+ " for stream "
577+
+ streamName);
578+
} else {
579+
this.connectionFinalStatus = finalStatus;
580+
log.info(
581+
"Stream finished with error " + finalStatus.toString() + " for stream " + streamName);
582+
}
583+
}
488584
} finally {
489585
this.lock.unlock();
490586
}

google-cloud-bigquerystorage/src/test/java/com/google/cloud/bigquery/storage/v1/FakeBigQueryWrite.java

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,14 +79,22 @@ public void reset() {
7979
serviceImpl.reset();
8080
}
8181

82-
public void setResponseDelay(Duration delay) {
83-
serviceImpl.setResponseDelay(delay);
84-
}
85-
8682
public void setResponseSleep(Duration sleep) {
8783
serviceImpl.setResponseSleep(sleep);
8884
}
8985

86+
public void setCloseEveryNAppends(long closeAfter) {
87+
serviceImpl.setCloseEveryNAppends(closeAfter);
88+
}
89+
90+
public void setTimesToClose(long numberTimesToClose) {
91+
serviceImpl.setTimesToClose(numberTimesToClose);
92+
}
93+
94+
public long getConnectionCount() {
95+
return serviceImpl.getConnectionCount();
96+
}
97+
9098
public void setExecutor(ScheduledExecutorService executor) {
9199
serviceImpl.setExecutor(executor);
92100
}

google-cloud-bigquerystorage/src/test/java/com/google/cloud/bigquery/storage/v1/FakeBigQueryWriteImpl.java

Lines changed: 62 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import com.google.common.base.Optional;
1919
import com.google.common.util.concurrent.Uninterruptibles;
20+
import io.grpc.Status;
2021
import io.grpc.stub.StreamObserver;
2122
import java.util.ArrayList;
2223
import java.util.List;
@@ -45,11 +46,16 @@ class FakeBigQueryWriteImpl extends BigQueryWriteGrpc.BigQueryWriteImplBase {
4546
private final AtomicInteger nextMessageId = new AtomicInteger(1);
4647
private boolean autoPublishResponse;
4748
private ScheduledExecutorService executor = null;
48-
private Duration responseDelay = Duration.ZERO;
4949

5050
private Duration responseSleep = Duration.ZERO;
5151
private Semaphore responseSemaphore = new Semaphore(0, true);
5252

53+
private long numberTimesToClose = 0;
54+
private long closeAfter = 0;
55+
private long recordCount = 0;
56+
private long connectionCount = 0;
57+
private boolean firstRecord = false;
58+
5359
/** Class used to save the state of a possible response. */
5460
private static class Response {
5561
Optional<AppendRowsResponse> appendResponse;
@@ -120,38 +126,51 @@ public void waitForResponseScheduled() throws InterruptedException {
120126
responseSemaphore.acquire();
121127
}
122128

129+
/* Return the number of times the stream was connected. */
130+
public long getConnectionCount() {
131+
return connectionCount;
132+
}
133+
123134
@Override
124135
public StreamObserver<AppendRowsRequest> appendRows(
125136
final StreamObserver<AppendRowsResponse> responseObserver) {
137+
this.connectionCount++;
138+
this.firstRecord = true;
126139
StreamObserver<AppendRowsRequest> requestObserver =
127140
new StreamObserver<AppendRowsRequest>() {
128141
@Override
129142
public void onNext(AppendRowsRequest value) {
130143
LOG.fine("Get request:" + value.toString());
131-
final Response response = responses.remove();
132144
requests.add(value);
145+
recordCount++;
133146
if (responseSleep.compareTo(Duration.ZERO) > 0) {
134-
LOG.info("Sleeping before response for " + responseSleep.toString());
147+
LOG.fine("Sleeping before response for " + responseSleep.toString());
135148
Uninterruptibles.sleepUninterruptibly(
136149
responseSleep.toMillis(), TimeUnit.MILLISECONDS);
137150
}
138-
if (responseDelay == Duration.ZERO) {
139-
sendResponse(response, responseObserver);
151+
if (firstRecord) {
152+
if (!value.getProtoRows().hasWriterSchema() || value.getWriteStream().isEmpty()) {
153+
LOG.info(
154+
String.valueOf(
155+
!value.getProtoRows().hasWriterSchema()
156+
|| value.getWriteStream().isEmpty()));
157+
responseObserver.onError(
158+
Status.INVALID_ARGUMENT
159+
.withDescription("Unexpected first request: " + value.toString())
160+
.asException());
161+
return;
162+
}
163+
}
164+
firstRecord = false;
165+
if (closeAfter > 0
166+
&& recordCount % closeAfter == 0
167+
&& (numberTimesToClose == 0 || connectionCount <= numberTimesToClose)) {
168+
LOG.info("Shutting down connection from test...");
169+
responseObserver.onError(Status.ABORTED.asException());
140170
} else {
141-
final Response responseToSend = response;
142-
// TODO(yirutang): This is very wrong because it messes up response/complete ordering.
143-
LOG.fine("Schedule a response to be sent at delay");
144-
executor.schedule(
145-
new Runnable() {
146-
@Override
147-
public void run() {
148-
sendResponse(responseToSend, responseObserver);
149-
}
150-
},
151-
responseDelay.toMillis(),
152-
TimeUnit.MILLISECONDS);
171+
final Response response = responses.remove();
172+
sendResponse(response, responseObserver);
153173
}
154-
responseSemaphore.release();
155174
}
156175

157176
@Override
@@ -183,12 +202,6 @@ public FakeBigQueryWriteImpl setExecutor(ScheduledExecutorService executor) {
183202
return this;
184203
}
185204

186-
/** Set an amount of time by which to delay publish responses. */
187-
public FakeBigQueryWriteImpl setResponseDelay(Duration responseDelay) {
188-
this.responseDelay = responseDelay;
189-
return this;
190-
}
191-
192205
/** Set an amount of time by which to sleep before publishing responses. */
193206
public FakeBigQueryWriteImpl setResponseSleep(Duration responseSleep) {
194207
this.responseSleep = responseSleep;
@@ -231,4 +244,29 @@ public void reset() {
231244
requests.clear();
232245
responses.clear();
233246
}
247+
248+
/* Abort the stream after N records. The primary use case is to test the retry logic. After N
249+
* records are sent, the stream will be aborted with Code.ABORTED. This is a retriable error.
250+
* The abort will call the onDone callback immediately, and thus potentially losing some messages
251+
* that have already been sent. If the value of closeAfter is too small, the client might not get
252+
 * a chance to process any records before a subsequent abort is sent, which means multiple retries
253+
* in a row on the client side. After 3 retries in a row the write will fail.
254+
* closeAfter should be large enough to give the client some opportunity to receive some of the
255+
* messages.
256+
**/
257+
public void setCloseEveryNAppends(long closeAfter) {
258+
this.closeAfter = closeAfter;
259+
}
260+
/* If setCloseEveryNAppends is greater than 0, then the stream will be aborted every N appends.
261+
* setTimesToClose will limit the number of times to do the abort. If it is set to 0, it will
262+
* abort every N appends.
263+
* The primary use cases is, send a couple of records, then abort. But if there are only a couple
264+
* of records, it is possible these two records are sent, then the abort happens before those two
265+
* records are processed by the client, requiring them to be sent again, and thus a potential
266+
* infinite loop. Therefore set the times to close to 1. This will send the two records, force
267+
 * an abort and retry, and then reprocess the records to completion.
268+
**/
269+
public void setTimesToClose(long numberTimesToClose) {
270+
this.numberTimesToClose = numberTimesToClose;
271+
}
234272
}

0 commit comments

Comments
 (0)