# Kafka Connect: Surface commit failures instead of silently swallowing them (#16237)
Changes to the `Coordinator` commit path:

```diff
@@ -53,6 +53,9 @@
 import org.apache.iceberg.connect.events.Event;
 import org.apache.iceberg.connect.events.StartCommit;
 import org.apache.iceberg.connect.events.TableReference;
+import org.apache.iceberg.exceptions.CleanableFailure;
+import org.apache.iceberg.exceptions.CommitFailedException;
+import org.apache.iceberg.exceptions.CommitStateUnknownException;
 import org.apache.iceberg.exceptions.NoSuchTableException;
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 import org.apache.iceberg.relocated.com.google.common.collect.Streams;
@@ -150,12 +153,28 @@ protected boolean receive(Envelope envelope) {
   private void commit(boolean partialCommit) {
     try {
       doCommit(partialCommit);
-    } catch (Exception e) {
+    } catch (CommitFailedException | CommitStateUnknownException e) {
       LOG.warn(
-          "Coordinator {} failed to commit for commit {}, will try again next cycle",
-          taskId,
+          "Commit {} failed, will retry on next cycle: {}",
           commitState.currentCommitId(),
+          e.getMessage(),
           e);
+    } catch (RuntimeException e) {
+      if (e instanceof CleanableFailure) {
+        LOG.warn(
+            "Commit {} failed, will retry on next cycle: {}",
+            commitState.currentCommitId(),
+            e.getMessage(),
+            e);
+      } else {
+        LOG.error(
+            "Commit {} failed fatally for task {}: {}",
+            commitState.currentCommitId(),
+            taskId,
+            e.getMessage(),
+            e);
+        throw e;
+      }
     } finally {
       commitState.endCurrentCommit();
     }
```

Review comment on the `CleanableFailure` branch:

> `CleanableFailure` only means that uncommitted metadata can be cleaned up; it is not a signal that the operation is retryable. This branch also catches things that are clearly not retryable, so a bad credential could keep retrying forever at WARN, with no clear signal to the operator. If we want retry behavior here, I'd make it explicit: list the retryable exception types instead of using `CleanableFailure`.
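A minimal sketch of the explicit approach the reviewer suggests (hypothetical, not code from this PR; the `RETRYABLE_EXCEPTIONS` constant, its contents, and the availability of `java.util.List` plus the relocated Guava `ImmutableList` in `Coordinator` are all assumptions):

```java
// Sketch: an explicit allow-list of retryable commit failures, rather than
// treating every CleanableFailure as retryable. Which exceptions belong in
// the list is exactly the question the reviewer wants answered explicitly.
private static final List<Class<? extends RuntimeException>> RETRYABLE_EXCEPTIONS =
    ImmutableList.of(CommitFailedException.class);

private void commit(boolean partialCommit) {
  try {
    doCommit(partialCommit);
  } catch (RuntimeException e) {
    if (RETRYABLE_EXCEPTIONS.stream().anyMatch(type -> type.isInstance(e))) {
      LOG.warn("Commit {} failed, will retry on next cycle", commitState.currentCommitId(), e);
    } else {
      // Non-retryable failures (e.g. a bad credential) are surfaced to the operator
      LOG.error("Commit {} failed fatally", commitState.currentCommitId(), e);
      throw e;
    }
  } finally {
    commitState.endCurrentCommit();
  }
}
```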
And in the coordinator tests (same package):

```diff
@@ -19,12 +19,16 @@
 package org.apache.iceberg.connect.channel;

 import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.mockito.Mockito.doThrow;
 import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.spy;
 import static org.mockito.Mockito.when;

 import java.time.OffsetDateTime;
 import java.util.List;
 import java.util.UUID;
+import org.apache.iceberg.AppendFiles;
 import org.apache.iceberg.DataFile;
 import org.apache.iceberg.DataFiles;
 import org.apache.iceberg.DataOperations;
@@ -45,6 +49,8 @@
 import org.apache.iceberg.connect.events.StartCommit;
 import org.apache.iceberg.connect.events.TableReference;
 import org.apache.iceberg.connect.events.TopicPartitionOffset;
+import org.apache.iceberg.exceptions.CommitFailedException;
+import org.apache.iceberg.exceptions.CommitStateUnknownException;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.types.Types.StructType;
```
```diff
@@ -135,14 +141,50 @@ public void testCommitError() {
         .withRecordCount(5)
         .build();

-    coordinatorTest(ImmutableList.of(badDataFile), ImmutableList.of(), null);
+    assertThatThrownBy(
+            () -> coordinatorTest(ImmutableList.of(badDataFile), ImmutableList.of(), null))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageContaining("Cannot find partition spec");

     // no commit messages sent
     assertThat(producer.history()).hasSize(1);

     assertThat(table.snapshots()).isEmpty();
   }

+  @Test
+  public void testCommitFailedExceptionSwallowed() {
+    // Verify issue #15878: a CommitFailedException from the catalog (e.g., Glue concurrent
+    // update) is logged and swallowed so the coordinator retries on the next cycle, rather
+    // than killing the task permanently.
+    Table spiedTable = spy(table);
+    AppendFiles spiedAppend = spy(table.newAppend());
+    doThrow(new CommitFailedException("Glue detected concurrent update"))
+        .when(spiedAppend)
+        .commit();
+    when(spiedTable.newAppend()).thenReturn(spiedAppend);
+    when(catalog.loadTable(TABLE_IDENTIFIER)).thenReturn(spiedTable);
+
+    // Should not throw -- CommitFailedException is retryable
+    coordinatorTest(
+        ImmutableList.of(EventTestUtil.createDataFile()), ImmutableList.of(), EventTestUtil.now());
+  }
+
+  @Test
+  public void testCommitStateUnknownExceptionSwallowed() {
+    Table spiedTable = spy(table);
+    AppendFiles spiedAppend = spy(table.newAppend());
+    doThrow(new CommitStateUnknownException(new RuntimeException("connection reset")))
+        .when(spiedAppend)
+        .commit();
+    when(spiedTable.newAppend()).thenReturn(spiedAppend);
+    when(catalog.loadTable(TABLE_IDENTIFIER)).thenReturn(spiedTable);
+
+    // Should not throw -- CommitStateUnknownException is retryable
+    coordinatorTest(
+        ImmutableList.of(EventTestUtil.createDataFile()), ImmutableList.of(), EventTestUtil.now());
+  }
+
   private void assertCommitTable(int idx, UUID commitId, OffsetDateTime ts) {
     byte[] bytes = producer.history().get(idx).value();
     Event commitTable = AvroUtil.decode(bytes);
```

Review comment on the `assertThatThrownBy` change in `testCommitError`:

> In prod the flow is `commit()` → `doCommit()`, where the new catch blocks decide whether to swallow or rethrow. This test doesn't cover that path; it would still pass even if the classification in `commit()` regressed. I think we need an end-to-end test that goes through `commit()`.

Review comment on `testCommitFailedExceptionSwallowed`:

> The comment says "Verify issue #15878", but the test still expects the exception to be swallowed. That was the bug in #15878: the commit failure was swallowed and data was lost. To verify the fix, the test should assert that the next cycle actually retries the commit and the data lands in the table.
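Taking the two comments above together, one possible shape for such an end-to-end retry test is sketched below. It is hypothetical: it assumes `coordinatorTest` can be driven twice against the same fixture and that the coordinator re-commits the buffered files on the second cycle; mock and helper names follow the existing tests.

```java
@Test
public void testCommitFailedExceptionRetriedNextCycle() {
  Table spiedTable = spy(table);
  AppendFiles failingAppend = spy(table.newAppend());
  doThrow(new CommitFailedException("concurrent update")).when(failingAppend).commit();
  // First newAppend() returns the failing spy; later calls return real appends
  when(spiedTable.newAppend()).thenReturn(failingAppend).thenAnswer(inv -> table.newAppend());
  when(catalog.loadTable(TABLE_IDENTIFIER)).thenReturn(spiedTable);

  // Cycle 1: the commit fails inside commit() and is swallowed; nothing lands
  coordinatorTest(
      ImmutableList.of(EventTestUtil.createDataFile()), ImmutableList.of(), EventTestUtil.now());
  assertThat(table.snapshots()).isEmpty();

  // Cycle 2: the coordinator should retry the buffered files and commit them
  coordinatorTest(
      ImmutableList.of(EventTestUtil.createDataFile()), ImmutableList.of(), EventTestUtil.now());
  assertThat(table.snapshots()).isNotEmpty();
}
```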
```diff
@@ -289,7 +331,8 @@ public void testCoordinatorCommittedOffsetValidation() {
     Snapshot firstSnapshot = table.currentSnapshot();
     assertThat(firstSnapshot.summary()).containsEntry(OFFSETS_SNAPSHOT_PROP, "{\"0\":7}");

-    // Trigger commit to the table
+    // Trigger commit to the table; the coordinator detects stale offsets via the
+    // ValidationException (a CleanableFailure), logs a warning, and retries next cycle.
     coordinatorTest(
         ImmutableList.of(EventTestUtil.createDataFile()), ImmutableList.of(), EventTestUtil.now());
```
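For reference, the classification that makes this case retryable is a plain `instanceof` check. A tiny illustration, assuming (as the comment above states) that `ValidationException` implements `CleanableFailure`:

```java
import org.apache.iceberg.exceptions.CleanableFailure;
import org.apache.iceberg.exceptions.ValidationException;

public class RetryClassificationDemo {
  public static void main(String[] args) {
    RuntimeException e = new ValidationException("offsets already committed for %s", "txn-1");
    // Under the PR's commit() handler this selects the WARN-and-retry branch
    System.out.println(e instanceof CleanableFailure); // true
  }
}
```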
Review comment on the new exception handling:

> There are two different issues here, and both look bad.
>
> 1. `CommitStateUnknownException` should not be retried blindly. Its own Javadoc says we don't know whether the commit succeeded, and retrying can create duplicates. Since the files stay in `commitBuffer`, the next cycle can commit the same files again; if the first commit actually landed, we get duplicate rows. This should be fatal and require a manual table-state check.
> 2. `CommitFailedException` is also not something we should swallow. That is exactly the bug from #15878. A WARN + "retry next cycle" is not enough here: Kafka offsets may already be flushed, and on rebalance the in-memory commit state can be lost. Then the data is gone and the operator never sees a real failure.
>
> Flink lets `CommitFailedException` propagate. I think we should do the same here.
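A sketch of the handler this comment argues for (hypothetical, not code from this PR; it reuses `commit()`/`doCommit()` and `commitState` from the diff above and assumes failing the task is acceptable in both cases):

```java
private void commit(boolean partialCommit) {
  try {
    // Per the review suggestion, CommitFailedException is deliberately NOT caught:
    // like the Flink sink, let it propagate so the failure is surfaced (#15878)
    // instead of being logged at WARN and silently dropped.
    doCommit(partialCommit);
  } catch (CommitStateUnknownException e) {
    // The commit may or may not have landed. Blindly retrying could re-commit the
    // files still in commitBuffer and produce duplicate rows, so fail the task and
    // require a manual check of the table state.
    LOG.error(
        "Commit {} state unknown, manual table-state check required",
        commitState.currentCommitId(),
        e);
    throw e;
  } finally {
    commitState.endCurrentCommit();
  }
}
```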