From 178b1c8e8860297bc511884d5d3f9235dc51aea6 Mon Sep 17 00:00:00 2001 From: Raunaq Morarka Date: Tue, 28 Apr 2026 14:29:25 +0530 Subject: [PATCH 1/3] Tolerate path normalization in EmulatedListFilesStartingFromIterator The iterator's strict `entryPath.startsWith(locationPath)` invariant breaks when the underlying file system canonicalizes runs of slashes: listing a directory location ending in `//` returns entries with a single slash and the check fires with `IllegalStateException`. ADLS Gen2 (hierarchical), Java NIO's `LocalFileSystem`, and `AlluxioFileSystem` canonicalize; Hadoop's `HdfsFileSystem` and `S3FileSystem` preserve `//` as a distinct path component. Try the original prefix first (preserves blob-store keys with literal `//` components), fall back to the slash-collapsed form, and compute `entryTail` from whichever matched. Surfaced by `TestDeltaLakeAdlsStorage.testQuery`. --- .../azure/AbstractTestAzureFileSystem.java | 26 +++++++++++++++++++ ...EmulatedListFilesStartingFromIterator.java | 24 ++++++++++++++--- ...EmulatedListFilesStartingFromIterator.java | 26 +++++++++++++++++++ .../filesystem/local/TestLocalFileSystem.java | 23 ++++++++++++++++ 4 files changed, 96 insertions(+), 3 deletions(-) diff --git a/lib/trino-filesystem-azure/src/test/java/io/trino/filesystem/azure/AbstractTestAzureFileSystem.java b/lib/trino-filesystem-azure/src/test/java/io/trino/filesystem/azure/AbstractTestAzureFileSystem.java index 46483e349b6d..227db637329e 100644 --- a/lib/trino-filesystem-azure/src/test/java/io/trino/filesystem/azure/AbstractTestAzureFileSystem.java +++ b/lib/trino-filesystem-azure/src/test/java/io/trino/filesystem/azure/AbstractTestAzureFileSystem.java @@ -22,8 +22,11 @@ import com.azure.storage.file.datalake.DataLakeServiceClientBuilder; import com.azure.storage.file.datalake.models.PathItem; import com.azure.storage.file.datalake.options.DataLakePathDeleteOptions; +import com.google.common.collect.ImmutableList; +import com.google.common.io.Closer; import io.opentelemetry.api.OpenTelemetry; import io.trino.filesystem.AbstractTestTrinoFileSystem; +import io.trino.filesystem.FileIterator; import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; import io.trino.filesystem.TrinoInput; @@ -291,4 +294,27 @@ public void testDirectoryExists() } super.testDirectoryExists(); } + + @Test + void testListFilesStartingFromConsecutiveSlashesInLocation() + throws IOException + { + // ADLS Gen2 hierarchical canonicalizes runs of slashes; verifies + // EmulatedListFilesStartingFromIterator falls back to the slash-collapsed prefix. 
+ if (!isHierarchical()) { + abort("Only ADLS Gen2 hierarchical canonicalizes `//` and routes through EmulatedListFilesStartingFromIterator"); + } + try (Closer closer = Closer.create()) { + Location file1 = createBlob(closer, "level0/level1-file1"); + Location file2 = createBlob(closer, "level0/level1-file2"); + + Location doubledSlash = createLocation("level0").appendSuffix("//"); + ImmutableList.Builder<Location> builder = ImmutableList.builder(); + FileIterator iterator = getFileSystem().listFilesStartingFrom(doubledSlash, "level1-file1"); + while (iterator.hasNext()) { + builder.add(iterator.next().location()); + } + assertThat(builder.build()).containsExactlyInAnyOrder(file1, file2); + } + } }
diff --git a/lib/trino-filesystem/src/main/java/io/trino/filesystem/EmulatedListFilesStartingFromIterator.java b/lib/trino-filesystem/src/main/java/io/trino/filesystem/EmulatedListFilesStartingFromIterator.java index dedc9f300756..f475a8eff579 100644 --- a/lib/trino-filesystem/src/main/java/io/trino/filesystem/EmulatedListFilesStartingFromIterator.java +++ b/lib/trino-filesystem/src/main/java/io/trino/filesystem/EmulatedListFilesStartingFromIterator.java
@@ -15,15 +15,19 @@ import java.io.IOException; import java.util.NoSuchElementException; +import java.util.regex.Pattern; -import static com.google.common.base.Preconditions.checkState; +import static java.lang.String.format; import static java.util.Objects.requireNonNull; public final class EmulatedListFilesStartingFromIterator implements FileIterator { + private static final Pattern CONSECUTIVE_SLASHES = Pattern.compile("/+"); + private final FileIterator delegate; private final String locationPath; + private final String collapsedLocationPath; private final String startingFrom; private FileEntry nextEntry;
@@ -33,6 +37,7 @@ public EmulatedListFilesStartingFromIterator(FileIterator delegate, Location loc String locationPath = location.path(); this.locationPath = (locationPath.isEmpty() || locationPath.endsWith("/")) ? locationPath : locationPath + "/"; + this.collapsedLocationPath = CONSECUTIVE_SLASHES.matcher(this.locationPath).replaceAll("/"); this.startingFrom = requireNonNull(startingFrom, "startingFrom is null"); }
@@ -69,9 +74,22 @@ private void loadNextEntry() while (delegate.hasNext()) { FileEntry entry = delegate.next(); String entryPath = entry.location().path(); - checkState(entryPath.startsWith(locationPath), "Expected listed file to start with directory path '%s': %s", locationPath, entry.location()); - String entryTail = entryPath.substring(locationPath.length()); + // LocalFileSystem, AlluxioFileSystem, and ADLS Gen2 hierarchical canonicalize runs of + // slashes in returned paths. Try the original prefix first to preserve blob-store keys + // where `//` is meaningful; fall back to the slash-collapsed form.
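+ // For example, with location path "dir//", S3 lists entries under "dir//..." (the original
+ // prefix matches as-is), while ADLS Gen2 hierarchical lists them under "dir/..." (only the
+ // collapsed prefix matches).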
+ String prefix; + if (entryPath.startsWith(locationPath)) { + prefix = locationPath; + } + else if (entryPath.startsWith(collapsedLocationPath)) { + prefix = collapsedLocationPath; + } + else { + throw new IllegalStateException(format("Expected listed file to start with directory path '%s': %s", locationPath, entry.location())); + } + + String entryTail = entryPath.substring(prefix.length()); if (entryTail.compareTo(startingFrom) >= 0) { nextEntry = entry; return; diff --git a/lib/trino-filesystem/src/test/java/io/trino/filesystem/TestEmulatedListFilesStartingFromIterator.java b/lib/trino-filesystem/src/test/java/io/trino/filesystem/TestEmulatedListFilesStartingFromIterator.java index df05d01294cd..311219c9ffe6 100644 --- a/lib/trino-filesystem/src/test/java/io/trino/filesystem/TestEmulatedListFilesStartingFromIterator.java +++ b/lib/trino-filesystem/src/test/java/io/trino/filesystem/TestEmulatedListFilesStartingFromIterator.java @@ -191,6 +191,32 @@ void testListFilesStartingFromDoubleSlashPathComponent() Location.of("file:///double/a")); } + @Test + void testListFilesStartingFromHierarchicalLocationNormalization() + throws IOException + { + // FS canonicalizes `//` to `/`; iterator falls back to slash-collapsed prefix. + assertThat(listFilesStartingFrom( + Location.of("abfs://container@account.dfs.core.windows.net/dir//sub/_delta_log/"), + "00000000000000000000", + List.of( + entry("abfs://container@account.dfs.core.windows.net/dir/sub/_delta_log/00000000000000000000.json"), + entry("abfs://container@account.dfs.core.windows.net/dir/sub/_delta_log/00000000000000000001.checkpoint.parquet")))) + .containsExactly( + Location.of("abfs://container@account.dfs.core.windows.net/dir/sub/_delta_log/00000000000000000000.json"), + Location.of("abfs://container@account.dfs.core.windows.net/dir/sub/_delta_log/00000000000000000001.checkpoint.parquet")); + + // startingFrom filtering still applies to the slash-collapsed remainder. 
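+ // The first entry's remainder "00000000000000000000.json" sorts before startingFrom
+ // "00000000000000000001", so only the checkpoint entry is returned.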
+ assertThat(listFilesStartingFrom( + Location.of("abfs://container@account.dfs.core.windows.net/dir//sub/_delta_log/"), + "00000000000000000001", + List.of( + entry("abfs://container@account.dfs.core.windows.net/dir/sub/_delta_log/00000000000000000000.json"), + entry("abfs://container@account.dfs.core.windows.net/dir/sub/_delta_log/00000000000000000001.checkpoint.parquet")))) + .containsExactly( + Location.of("abfs://container@account.dfs.core.windows.net/dir/sub/_delta_log/00000000000000000001.checkpoint.parquet")); + } + @Test void testListFilesStartingFromIncludesAllNonAsciiFilenames() throws IOException
diff --git a/lib/trino-filesystem/src/test/java/io/trino/filesystem/local/TestLocalFileSystem.java b/lib/trino-filesystem/src/test/java/io/trino/filesystem/local/TestLocalFileSystem.java index 036bbcd8a980..ff39af582862 100644 --- a/lib/trino-filesystem/src/test/java/io/trino/filesystem/local/TestLocalFileSystem.java +++ b/lib/trino-filesystem/src/test/java/io/trino/filesystem/local/TestLocalFileSystem.java
@@ -13,7 +13,10 @@ */ package io.trino.filesystem.local; +import com.google.common.collect.ImmutableList; +import com.google.common.io.Closer; import io.trino.filesystem.AbstractTestTrinoFileSystem; +import io.trino.filesystem.FileIterator; import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; import org.junit.jupiter.api.AfterAll;
@@ -119,6 +122,26 @@ protected void verifyFileSystemIsEmpty() } } + @Test + void testListFilesStartingFromConsecutiveSlashesInLocation() + throws IOException + { + // LocalFileSystem (Java NIO) canonicalizes runs of slashes; verifies + // EmulatedListFilesStartingFromIterator falls back to the slash-collapsed prefix. + try (Closer closer = Closer.create()) { + Location file1 = createBlob(closer, "level0/level1-file1"); + Location file2 = createBlob(closer, "level0/level1-file2"); + + Location doubledSlash = createLocation("level0").appendSuffix("//"); + ImmutableList.Builder<Location> builder = ImmutableList.builder(); + FileIterator iterator = getFileSystem().listFilesStartingFrom(doubledSlash, "level1-file1"); + while (iterator.hasNext()) { + builder.add(iterator.next().location()); + } + assertThat(builder.build()).containsExactlyInAnyOrder(file1, file2); + } + } + @Test void testPathsOutOfBounds() {

From 7c99b4e7a33254528fc82228e801b70419392552 Mon Sep 17 00:00:00 2001 From: Adam Richardson Date: Wed, 18 Feb 2026 21:19:44 -0800 Subject: [PATCH 2/3] Read metadata and protocol information from Delta checksum files

Compliant Delta writers may emit an optional checksum file alongside each commit, containing metadata and protocol information. Instead of loading the latest checkpoint and replaying intervening commits (which can be expensive, especially for large v1 checkpoints), Trino can read the latest commit's checksum file to obtain this information with a single listing and small JSON read.

Ref. https://github.com/delta-io/delta/blob/master/PROTOCOL.md#version-checksum-file

If the checksum file is missing or does not contain both metadata and protocol, we fall back to the existing Delta log scanning approach. Behavior is gated by session property load_metadata_from_checksum_file (defaulting to config delta.load-metadata-from-checksum-file, which defaults to true).

In internal testing, analysis time for large v1-checkpoint tables dropped from ~10s to <500ms.
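For reference, the checksum for commit N is a single JSON object stored at _delta_log/<20-digit zero-padded N>.crc. Only its metadata and protocol fields are consumed here; illustratively, with the other fields defined by the spec elided:

    {"metadata":{"id":"...","schemaString":"...",...},"protocol":{"minReaderVersion":1,"minWriterVersion":2},...}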
Within a transaction, the resolved commit version and _last_checkpoint contents are reused across loadDescriptor and getSnapshot calls so the descriptor and snapshot paths don't each re-read _last_checkpoint. Co-authored-by: Eric Hwang Co-authored-by: Fred Liu --- docs/src/main/sphinx/connector/delta-lake.md | 13 + .../plugin/deltalake/DeltaLakeConfig.java | 14 + .../plugin/deltalake/DeltaLakeMetadata.java | 144 ++++++++-- .../deltalake/DeltaLakeSessionProperties.java | 11 + .../DeltaLakeTableMetadataScheduler.java | 7 +- .../DeltaLakeVersionChecksum.java | 30 +++ .../transactionlog/TableSnapshot.java | 10 +- .../TemporalTimeTravelUtil.java | 11 +- .../transactionlog/TransactionLogAccess.java | 41 ++- .../transactionlog/TransactionLogParser.java | 49 ++++ .../transactionlog/TransactionLogUtil.java | 26 ++ .../checkpoint/TransactionLogTail.java | 1 + ...stDeltaLakeAlluxioCacheFileOperations.java | 73 ++++- ...LakeAlluxioCacheMutableTransactionLog.java | 2 + .../plugin/deltalake/TestDeltaLakeBasic.java | 59 ++-- .../plugin/deltalake/TestDeltaLakeConfig.java | 7 +- .../TestDeltaLakeFileOperations.java | 253 ++++++++++++++++++ .../deltalake/TestDeltaLakeMetadata.java | 159 ++++++++++- .../TestTransactionLogParser.java | 88 ++++++ .../resources/deltalake/checksum/README.md | 18 ++ .../_delta_log/00000000000000000000.crc | 1 + .../_delta_log/00000000000000000000.json | 3 + .../00000000000000000001.checkpoint.parquet | Bin 0 -> 17389 bytes .../_delta_log/00000000000000000001.crc | 1 + .../_delta_log/00000000000000000001.json | 2 + .../checksum/_delta_log/_last_checkpoint | 1 + ...46b2-b014-76f9dff4a485-c000.snappy.parquet | Bin 0 -> 475 bytes .../deltalake/checksum_invalid_json/README.md | 21 ++ .../_delta_log/00000000000000000000.crc | 1 + .../_delta_log/00000000000000000000.json | 3 + .../00000000000000000001.checkpoint.parquet | Bin 0 -> 17389 bytes .../_delta_log/00000000000000000001.crc | 1 + .../_delta_log/00000000000000000001.json | 2 + .../_delta_log/_last_checkpoint | 1 + ...46b2-b014-76f9dff4a485-c000.snappy.parquet | Bin 0 -> 475 bytes .../checksum_invalid_json_mapping/README.md | 22 ++ .../_delta_log/00000000000000000000.crc | 1 + .../_delta_log/00000000000000000000.json | 3 + .../00000000000000000001.checkpoint.parquet | Bin 0 -> 17389 bytes .../_delta_log/00000000000000000001.crc | 1 + .../_delta_log/00000000000000000001.json | 2 + .../_delta_log/_last_checkpoint | 1 + ...46b2-b014-76f9dff4a485-c000.snappy.parquet | Bin 0 -> 475 bytes .../checksum_missing_latest/README.md | 21 ++ .../_delta_log/00000000000000000000.crc | 1 + .../_delta_log/00000000000000000000.json | 3 + .../00000000000000000001.checkpoint.parquet | Bin 0 -> 17389 bytes .../_delta_log/00000000000000000001.json | 2 + .../_delta_log/_last_checkpoint | 1 + ...46b2-b014-76f9dff4a485-c000.snappy.parquet | Bin 0 -> 475 bytes .../checksum_trailing_json_content/README.md | 22 ++ .../_delta_log/00000000000000000000.crc | 1 + .../_delta_log/00000000000000000000.json | 3 + .../00000000000000000001.checkpoint.parquet | Bin 0 -> 17389 bytes .../_delta_log/00000000000000000001.crc | 2 + .../_delta_log/00000000000000000001.json | 2 + .../_delta_log/_last_checkpoint | 1 + ...46b2-b014-76f9dff4a485-c000.snappy.parquet | Bin 0 -> 475 bytes .../checksum_without_metadata/README.md | 21 ++ .../_delta_log/00000000000000000000.crc | 1 + .../_delta_log/00000000000000000000.json | 3 + .../00000000000000000001.checkpoint.parquet | Bin 0 -> 17389 bytes .../_delta_log/00000000000000000001.crc | 1 + .../_delta_log/00000000000000000001.json | 2 
+ .../_delta_log/_last_checkpoint | 1 + ...46b2-b014-76f9dff4a485-c000.snappy.parquet | Bin 0 -> 475 bytes .../checksum_without_protocol/README.md | 21 ++ .../_delta_log/00000000000000000000.crc | 1 + .../_delta_log/00000000000000000000.json | 3 + .../00000000000000000001.checkpoint.parquet | Bin 0 -> 17389 bytes .../_delta_log/00000000000000000001.crc | 1 + .../_delta_log/00000000000000000001.json | 2 + .../_delta_log/_last_checkpoint | 1 + ...46b2-b014-76f9dff4a485-c000.snappy.parquet | Bin 0 -> 475 bytes .../TestDeltaLakeActiveFilesCache.java | 11 +- 75 files changed, 1129 insertions(+), 82 deletions(-) create mode 100644 plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeVersionChecksum.java create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum/README.md create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum/_delta_log/00000000000000000000.crc create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum/_delta_log/00000000000000000000.json create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum/_delta_log/00000000000000000001.checkpoint.parquet create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum/_delta_log/00000000000000000001.crc create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum/_delta_log/00000000000000000001.json create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum/_delta_log/_last_checkpoint create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum/part-00000-b0a15e1d-2cb7-46b2-b014-76f9dff4a485-c000.snappy.parquet create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/README.md create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/_delta_log/00000000000000000000.crc create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/_delta_log/00000000000000000000.json create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/_delta_log/00000000000000000001.checkpoint.parquet create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/_delta_log/00000000000000000001.crc create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/_delta_log/00000000000000000001.json create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/_delta_log/_last_checkpoint create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/part-00000-b0a15e1d-2cb7-46b2-b014-76f9dff4a485-c000.snappy.parquet create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/README.md create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/_delta_log/00000000000000000000.crc create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/_delta_log/00000000000000000000.json create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/_delta_log/00000000000000000001.checkpoint.parquet create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/_delta_log/00000000000000000001.crc create mode 100644 
plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/_delta_log/00000000000000000001.json create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/_delta_log/_last_checkpoint create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/part-00000-b0a15e1d-2cb7-46b2-b014-76f9dff4a485-c000.snappy.parquet create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/README.md create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/_delta_log/00000000000000000000.crc create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/_delta_log/00000000000000000000.json create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/_delta_log/00000000000000000001.checkpoint.parquet create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/_delta_log/00000000000000000001.json create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/_delta_log/_last_checkpoint create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/part-00000-b0a15e1d-2cb7-46b2-b014-76f9dff4a485-c000.snappy.parquet create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/README.md create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/_delta_log/00000000000000000000.crc create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/_delta_log/00000000000000000000.json create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/_delta_log/00000000000000000001.checkpoint.parquet create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/_delta_log/00000000000000000001.crc create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/_delta_log/00000000000000000001.json create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/_delta_log/_last_checkpoint create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/part-00000-b0a15e1d-2cb7-46b2-b014-76f9dff4a485-c000.snappy.parquet create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/README.md create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/_delta_log/00000000000000000000.crc create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/_delta_log/00000000000000000000.json create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/_delta_log/00000000000000000001.checkpoint.parquet create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/_delta_log/00000000000000000001.crc create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/_delta_log/00000000000000000001.json create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/_delta_log/_last_checkpoint create mode 100644 
plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/part-00000-b0a15e1d-2cb7-46b2-b014-76f9dff4a485-c000.snappy.parquet create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/README.md create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/_delta_log/00000000000000000000.crc create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/_delta_log/00000000000000000000.json create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/_delta_log/00000000000000000001.checkpoint.parquet create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/_delta_log/00000000000000000001.crc create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/_delta_log/00000000000000000001.json create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/_delta_log/_last_checkpoint create mode 100644 plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/part-00000-b0a15e1d-2cb7-46b2-b014-76f9dff4a485-c000.snappy.parquet diff --git a/docs/src/main/sphinx/connector/delta-lake.md b/docs/src/main/sphinx/connector/delta-lake.md index ea5aba99aa16..82753ef33f74 100644 --- a/docs/src/main/sphinx/connector/delta-lake.md +++ b/docs/src/main/sphinx/connector/delta-lake.md @@ -201,6 +201,13 @@ values. Typical usage does not require you to configure them. - Number of threads used for retrieving checkpoint files of each table. Currently, only retrievals of V2 Checkpoint's sidecar files are parallelized. - `4` +* - `delta.load-metadata-from-checksum-file` + - Speed up query planning by reading table metadata and protocol + entries from the Delta version checksum file (`.crc`) when + available. Falls back to scanning the transaction log if the checksum + file is missing, incomplete, or malformed. The equivalent catalog + session property is `load_metadata_from_checksum_file`. + - `true` ::: ### Catalog session properties @@ -234,6 +241,12 @@ The following table describes {ref}`catalog session properties - Read only projected fields from row columns while performing `SELECT` queries. - `true` +* - `load_metadata_from_checksum_file` + - Speed up query planning by reading table metadata and protocol + entries from the Delta version checksum file (`.crc`) when + available. Falls back to scanning the transaction log if the checksum + file is missing, incomplete, or malformed. 
+ - `true` ::: (delta-lake-fte-support)= diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeConfig.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeConfig.java index cb0a4f41388b..b5b2a540a919 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeConfig.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeConfig.java @@ -95,6 +95,7 @@ public class DeltaLakeConfig private boolean deltaLogFileSystemCacheDisabled; private int metadataParallelism = 8; private int checkpointProcessingParallelism = 4; + private boolean loadMetadataFromChecksumFile = true; public Duration getMetadataCacheTtl() { @@ -587,4 +588,17 @@ public DeltaLakeConfig setCheckpointProcessingParallelism(int checkpointProcessi this.checkpointProcessingParallelism = checkpointProcessingParallelism; return this; } + + public boolean isLoadMetadataFromChecksumFile() + { + return loadMetadataFromChecksumFile; + } + + @Config("delta.load-metadata-from-checksum-file") + @ConfigDescription("Read table metadata and protocol from the Delta version checksum file when available, falling back to the transaction log") + public DeltaLakeConfig setLoadMetadataFromChecksumFile(boolean loadMetadataFromChecksumFile) + { + this.loadMetadataFromChecksumFile = loadMetadataFromChecksumFile; + return this; + } } diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java index 511005f1bc19..4e9ce140a929 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java @@ -69,6 +69,7 @@ import io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.ColumnMappingMode; import io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.UnsupportedTypeException; import io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry; +import io.trino.plugin.deltalake.transactionlog.DeltaLakeVersionChecksum; import io.trino.plugin.deltalake.transactionlog.MetadataEntry; import io.trino.plugin.deltalake.transactionlog.ProtocolEntry; import io.trino.plugin.deltalake.transactionlog.RemoveFileEntry; @@ -163,6 +164,7 @@ import io.trino.spi.type.VarcharType; import java.io.IOException; +import java.io.UncheckedIOException; import java.net.URI; import java.net.URISyntaxException; import java.time.Duration; @@ -242,6 +244,7 @@ import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.getHiveCatalogName; import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.isCollectExtendedStatisticsColumnStatisticsOnWrite; import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.isExtendedStatisticsEnabled; +import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.isLoadMetadataFromChecksumFile; import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.isProjectionPushdownEnabled; import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.isQueryPartitionFilterRequired; import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.isStoreTableMetadataInMetastoreEnabled; @@ -299,8 +302,10 @@ import static io.trino.plugin.deltalake.transactionlog.MetadataEntry.DELTA_CHANGE_DATA_FEED_ENABLED_PROPERTY; import static io.trino.plugin.deltalake.transactionlog.MetadataEntry.configurationForNewTable; import static 
io.trino.plugin.deltalake.transactionlog.TemporalTimeTravelUtil.findLatestVersionUsingTemporal; +import static io.trino.plugin.deltalake.transactionlog.TransactionLogParser.findLatestCommitVersion; import static io.trino.plugin.deltalake.transactionlog.TransactionLogParser.getMandatoryCurrentVersion; import static io.trino.plugin.deltalake.transactionlog.TransactionLogParser.readLastCheckpoint; +import static io.trino.plugin.deltalake.transactionlog.TransactionLogParser.readVersionChecksumFile; import static io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.getTransactionLogDir; import static io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.getTransactionLogJsonEntryPath; import static io.trino.plugin.deltalake.transactionlog.checkpoint.TransactionLogTail.getEntriesFromJson;
@@ -470,6 +475,10 @@ public class DeltaLakeMetadata private final DeltaLakeTableMetadataScheduler metadataScheduler; private final Map<SchemaTableName, TableUpdateInfo> tableUpdateInfos = new ConcurrentHashMap<>(); private final Map<SchemaTableName, Long> latestTableVersions = new ConcurrentHashMap<>(); + // Per-transaction memoization of _last_checkpoint contents (Optional.empty() records the negative case). + // Populated when loadDescriptorFromChecksum or getSnapshot resolves the checkpoint, and consulted by + // subsequent snapshot loads in the same transaction. + private final Map<SchemaTableName, Optional<LastCheckpoint>> latestCheckpoints = new ConcurrentHashMap<>(); private final Map<QueriedTable, TableSnapshot> queriedSnapshots = new ConcurrentHashMap<>(); private final Executor metadataFetchingExecutor; private final TransactionLogReaderFactory transactionLogReaderFactory;
@@ -482,6 +491,15 @@ private record QueriedTable(SchemaTableName schemaTableName, long version) { } } + private record DeltaLakeTableDescriptor(long version, MetadataEntry metadataEntry, ProtocolEntry protocolEntry) + { + DeltaLakeTableDescriptor + { + requireNonNull(metadataEntry, "metadataEntry is null"); + requireNonNull(protocolEntry, "protocolEntry is null"); + } + } + public DeltaLakeMetadata( DeltaLakeMetastore metastore, TransactionLogAccess transactionLogAccess,
@@ -536,12 +554,19 @@ private TableSnapshot getSnapshot(ConnectorSession session, DeltaLakeTableHandle public TableSnapshot getSnapshot(ConnectorSession session, DeltaMetastoreTable metastoreTable, Optional<Long> atVersion) { - return getSnapshot(metastoreTable.schemaTableName(), atVersion, () -> transactionLogAccess.loadSnapshot(session, metastoreTable, atVersion)); + SchemaTableName tableName = metastoreTable.schemaTableName(); + return getSnapshot(tableName, atVersion, () -> transactionLogAccess.loadSnapshot(session, metastoreTable, atVersion, resolveLastCheckpoint(tableName, fileSystemFactory.create(session, metastoreTable), metastoreTable.location()))); } public TableSnapshot getSnapshot(ConnectorSession session, DeltaLakeTableHandle tableHandle, Optional<Long> atVersion) { - return getSnapshot(tableHandle.getSchemaTableName(), atVersion, () -> transactionLogAccess.loadSnapshot(session, tableHandle, atVersion)); + SchemaTableName tableName = tableHandle.getSchemaTableName(); + return getSnapshot(tableName, atVersion, () -> transactionLogAccess.loadSnapshot(session, tableHandle, atVersion, resolveLastCheckpoint(tableName, fileSystemFactory.create(session, tableHandle), tableHandle.getLocation()))); + } + + private Optional<LastCheckpoint> resolveLastCheckpoint(SchemaTableName tableName, TrinoFileSystem fileSystem, String tableLocation) + { + return latestCheckpoints.computeIfAbsent(tableName, _ -> readLastCheckpoint(fileSystem, tableLocation)); } private interface SnapshotSupplier
@@ -719,27 +744,22 @@
public LocatedTableHandle getTableHandle( String tableLocation = table.location(); TrinoFileSystem fileSystem = fileSystemFactory.create(session, table); - TableSnapshot tableSnapshot = getSnapshot(session, table, endVersion.map(version -> getVersion(session, fileSystem, tableLocation, version, metadataFetchingExecutor))); - MetadataAndProtocolEntries logEntries; + DeltaLakeTableDescriptor descriptor; try { - logEntries = transactionLogAccess.getMetadataAndProtocolEntry(session, fileSystem, tableSnapshot); + descriptor = loadDescriptor(session, table, fileSystem, tableLocation, endVersion); } catch (TrinoException e) { if (e.getErrorCode().equals(DELTA_LAKE_INVALID_SCHEMA.toErrorCode())) { - return new CorruptedDeltaLakeTableHandle(tableName, table.catalogOwned(), managed, tableLocation, e); + return new CorruptedDeltaLakeTableHandle(tableName, table.catalogOwned(), table.managed(), tableLocation, e); } throw e; } - MetadataEntry metadataEntry = logEntries.metadata().orElse(null); - if (metadataEntry == null) { - return new CorruptedDeltaLakeTableHandle(tableName, table.catalogOwned(), managed, tableLocation, new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); - } - ProtocolEntry protocolEntry = logEntries.protocol().orElse(null); - if (protocolEntry == null) { - return new CorruptedDeltaLakeTableHandle(tableName, table.catalogOwned(), managed, tableLocation, new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); - } + MetadataEntry metadataEntry = descriptor.metadataEntry(); + ProtocolEntry protocolEntry = descriptor.protocolEntry(); + long snapshotVersion = descriptor.version(); + if (protocolEntry.minReaderVersion() > MAX_READER_VERSION) { LOG.debug("Skip %s because the reader version is unsupported: %d", tableName, protocolEntry.minReaderVersion()); return null;
@@ -752,8 +772,8 @@ public LocatedTableHandle getTableHandle( verifySupportedColumnMapping(getColumnMappingMode(metadataEntry, protocolEntry)); if (metadataScheduler.canStoreTableMetadata(session, metadataEntry.getSchemaString(), Optional.ofNullable(metadataEntry.getDescription())) && endVersion.isEmpty() && - !isSameTransactionVersion(metastoreTable.get(), tableSnapshot)) { - tableUpdateInfos.put(tableName, new TableUpdateInfo(session, tableSnapshot.getVersion(), metadataEntry.getSchemaString(), Optional.ofNullable(metadataEntry.getDescription()))); + !isSameTransactionVersion(metastoreTable.get(), snapshotVersion)) { + tableUpdateInfos.put(tableName, new TableUpdateInfo(session, snapshotVersion, metadataEntry.getSchemaString(), Optional.ofNullable(metadataEntry.getDescription()))); } return new DeltaLakeTableHandle( tableName.getSchemaName(),
@@ -767,10 +787,100 @@ public LocatedTableHandle getTableHandle( false, Optional.empty(), Optional.empty(), - tableSnapshot.getVersion(), + snapshotVersion, endVersion.isPresent()); } + private DeltaLakeTableDescriptor loadDescriptor( + ConnectorSession session, + DeltaMetastoreTable table, + TrinoFileSystem fileSystem, + String tableLocation, + Optional<ConnectorTableVersion> endVersion) + { + Optional<Long> endTableVersion = endVersion.map(version -> getVersion(session, fileSystem, tableLocation, version, metadataFetchingExecutor)); + + if (isLoadMetadataFromChecksumFile(session)) { + Optional<DeltaLakeTableDescriptor> descriptor = loadDescriptorFromChecksum(table.schemaTableName(), fileSystem, tableLocation, endTableVersion); + if (descriptor.isPresent()) {
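+ // Record the resolved version so resolveLatestCommitVersion can skip the _last_checkpoint
+ // read and transaction log listing on later calls in this transaction.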
+ latestTableVersions.put(table.schemaTableName(), descriptor.get().version()); + return descriptor.get(); + } + } + + return loadDescriptorFromTransactionLog(session, table, fileSystem, endTableVersion); + } + + private Optional<DeltaLakeTableDescriptor> loadDescriptorFromChecksum( + SchemaTableName tableName, + TrinoFileSystem fileSystem, + String tableLocation, + Optional<Long> endTableVersion) + { + long latestCommitVersion = endTableVersion.orElseGet(() -> resolveLatestCommitVersion(tableName, fileSystem, tableLocation)); + + Optional<DeltaLakeVersionChecksum> versionChecksum; + try { + versionChecksum = readVersionChecksumFile(fileSystem, tableLocation, latestCommitVersion); + } + catch (IOException | UncheckedIOException e) { + throw new TrinoException(DELTA_LAKE_FILESYSTEM_ERROR, format("Failed to read checksum file for version %d of table %s", latestCommitVersion, tableName), e); + } + if (versionChecksum.isEmpty()) { + return Optional.empty(); + } + DeltaLakeVersionChecksum checksum = versionChecksum.get(); + if (checksum.metadata().isEmpty() || checksum.protocol().isEmpty()) { + return Optional.empty(); + } + return Optional.of(new DeltaLakeTableDescriptor(latestCommitVersion, checksum.metadata().orElseThrow(), checksum.protocol().orElseThrow())); + } + + // Reuse the version resolved earlier in this transaction to skip _last_checkpoint and transaction log listing + private long resolveLatestCommitVersion(SchemaTableName tableName, TrinoFileSystem fileSystem, String tableLocation) + { + Long knownVersion = latestTableVersions.get(tableName); + if (knownVersion != null) { + return knownVersion; + } + Optional<LastCheckpoint> lastCheckpoint; + OptionalLong commit; + try { + lastCheckpoint = readLastCheckpoint(fileSystem, tableLocation); + commit = findLatestCommitVersion(fileSystem, tableLocation, lastCheckpoint); + } + catch (IOException | UncheckedIOException e) { + throw new TrinoException(DELTA_LAKE_FILESYSTEM_ERROR, format("Failed to determine latest commit version for %s", tableName), e); + } + if (commit.isEmpty()) { + throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, format("Delta table %s has no commits", tableName)); + } + latestCheckpoints.put(tableName, lastCheckpoint); + return commit.getAsLong(); + } + + private DeltaLakeTableDescriptor loadDescriptorFromTransactionLog( + ConnectorSession session, + DeltaMetastoreTable table, + TrinoFileSystem fileSystem, + Optional<Long> endTableVersion) + { + TableSnapshot tableSnapshot = getSnapshot(session, table, endTableVersion); + MetadataAndProtocolEntries logEntries = transactionLogAccess.getMetadataAndProtocolEntry(session, fileSystem, tableSnapshot); + + MetadataEntry metadataEntry = logEntries.metadata().orElse(null); + if (metadataEntry == null) { + throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable()); + } + + ProtocolEntry protocolEntry = logEntries.protocol().orElse(null); + if (protocolEntry == null) { + throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable()); + } + + return new DeltaLakeTableDescriptor(tableSnapshot.getVersion(), metadataEntry, protocolEntry); + } + @Override public ConnectorTableProperties getTableProperties(ConnectorSession session, ConnectorTableHandle tableHandle) {
diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeSessionProperties.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeSessionProperties.java index 222600a94362..4af3fea198d2 100644
--- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeSessionProperties.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeSessionProperties.java
@@ -74,6 +74,7 @@ public final class DeltaLakeSessionProperties public static final String EXTENDED_STATISTICS_COLLECT_ON_WRITE = "extended_statistics_collect_on_write"; private static final String PROJECTION_PUSHDOWN_ENABLED = "projection_pushdown_enabled"; private static final String QUERY_PARTITION_FILTER_REQUIRED = "query_partition_filter_required"; + private static final String LOAD_METADATA_FROM_CHECKSUM_FILE = "load_metadata_from_checksum_file"; private static final String STORE_TABLE_METADATA = "store_table_metadata"; private final List<PropertyMetadata<?>> sessionProperties;
@@ -226,6 +227,11 @@ public DeltaLakeSessionProperties( "Require filter on partition column", deltaLakeConfig.isQueryPartitionFilterRequired(), false), + booleanProperty( + LOAD_METADATA_FROM_CHECKSUM_FILE, + "Read table metadata and protocol from the Delta version checksum file when available, falling back to the transaction log", + deltaLakeConfig.isLoadMetadataFromChecksumFile(), + false), booleanProperty( STORE_TABLE_METADATA, "Store table metadata in metastore",
@@ -344,6 +350,11 @@ public static boolean isQueryPartitionFilterRequired(ConnectorSession session) return session.getProperty(QUERY_PARTITION_FILTER_REQUIRED, Boolean.class); } + public static boolean isLoadMetadataFromChecksumFile(ConnectorSession session) + { + return session.getProperty(LOAD_METADATA_FROM_CHECKSUM_FILE, Boolean.class); + } + public static boolean isStoreTableMetadataInMetastoreEnabled(ConnectorSession session) { return session.getProperty(STORE_TABLE_METADATA, Boolean.class);
diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/metastore/DeltaLakeTableMetadataScheduler.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/metastore/DeltaLakeTableMetadataScheduler.java index 64b22f89c339..bb1f4c922709 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/metastore/DeltaLakeTableMetadataScheduler.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/metastore/DeltaLakeTableMetadataScheduler.java
@@ -213,9 +213,14 @@ public void stop() } public static boolean isSameTransactionVersion(Table table, TableSnapshot snapshot) + { + return isSameTransactionVersion(table, snapshot.getVersion()); + } + + public static boolean isSameTransactionVersion(Table table, long snapshotVersion) { return getLastTransactionVersion(table) - .map(version -> version == snapshot.getVersion()) + .map(version -> version == snapshotVersion) .orElse(false); }
diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeVersionChecksum.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeVersionChecksum.java new file mode 100644 index 000000000000..b059430112cc --- /dev/null +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeVersionChecksum.java
@@ -0,0 +1,30 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.deltalake.transactionlog; + +import java.util.Optional; + +import static java.util.Objects.requireNonNull; + +// Ref. https://github.com/delta-io/delta/blob/master/PROTOCOL.md#version-checksum-file +// Only the fields currently read by Trino are modeled. The spec marks both as required, but +// non-compliant writers may omit them; absent entries trigger fallback to the transaction log. +public record DeltaLakeVersionChecksum(Optional<MetadataEntry> metadata, Optional<ProtocolEntry> protocol) +{ + public DeltaLakeVersionChecksum + { + requireNonNull(metadata, "metadata is null"); + requireNonNull(protocol, "protocol is null"); + } +}
diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java index b26e0e263486..5c2dfe44652c 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java
@@ -54,7 +54,6 @@ import static io.airlift.slice.SizeOf.sizeOf; import static io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_FILESYSTEM_ERROR; import static io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_SCHEMA; -import static io.trino.plugin.deltalake.transactionlog.TransactionLogParser.readLastCheckpoint; import static io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.getTransactionLogDir; import static io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator.EntryType.ADD; import static io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator.EntryType.REMOVE;
@@ -131,22 +130,21 @@ public static TableSnapshot load( transactionLogMaxCachedFileSize); } - public Optional<TableSnapshot> getUpdatedSnapshot(ConnectorSession session, TransactionLogReader transactionLogReader, TrinoFileSystem fileSystem, Optional<Long> toVersion) + public Optional<TableSnapshot> getUpdatedSnapshot(ConnectorSession session, TransactionLogReader transactionLogReader, TrinoFileSystem fileSystem, Optional<Long> toVersion, Optional<LastCheckpoint> currentLastCheckpoint) throws IOException { if (toVersion.isEmpty()) { // Load any newer table snapshot - Optional<LastCheckpoint> lastCheckpoint = readLastCheckpoint(fileSystem, tableLocation); - if (lastCheckpoint.isPresent()) { + if (currentLastCheckpoint.isPresent()) { long ourCheckpointVersion = getLastCheckpointVersion().orElse(0L); - if (ourCheckpointVersion != lastCheckpoint.get().version()) { + if (ourCheckpointVersion != currentLastCheckpoint.get().version()) { // There is a new checkpoint in the table, load anew return Optional.of(TableSnapshot.load( session, transactionLogReader, table, - lastCheckpoint, + currentLastCheckpoint, tableLocation, parquetReaderOptions, checkpointRowStatisticsWritingEnabled,
diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TemporalTimeTravelUtil.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TemporalTimeTravelUtil.java index bf1a6d7e117e..54f5b4586875 100644
--- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TemporalTimeTravelUtil.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TemporalTimeTravelUtil.java
@@ -25,17 +25,17 @@ import java.time.Instant; import java.util.Objects; import java.util.Optional; +import java.util.OptionalLong; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executor; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.stream.Stream; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.collect.MoreCollectors.toOptional; import static io.trino.plugin.base.util.ExecutorUtil.processWithAdditionalThreads; import static io.trino.plugin.deltalake.transactionlog.TransactionLogParser.readLastCheckpoint; +import static io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.extractCommitVersion; import static io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.getTransactionLogDir; import static io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.getTransactionLogJsonEntryPath; import static io.trino.plugin.deltalake.transactionlog.checkpoint.TransactionLogTail.getEntriesFromJson;
@@ -44,7 +44,6 @@ public final class TemporalTimeTravelUtil { - private static final Pattern TRANSACTION_LOG_PATTERN = Pattern.compile("^(\\d{20})\\.json$"); private static final int VERSION_NOT_FOUND = -1; private TemporalTimeTravelUtil() {}
@@ -227,11 +226,11 @@ private static long findLatestVersionFromWholeTransactions(TrinoFileSystem fileS long version = VERSION_NOT_FOUND; while (fileIterator.hasNext()) { Location location = fileIterator.next().location(); - Matcher matcher = TRANSACTION_LOG_PATTERN.matcher(location.fileName()); - if (!matcher.matches()) { + OptionalLong commitVersion = extractCommitVersion(location.fileName()); + if (commitVersion.isEmpty()) { continue; } - long entryNumber = Long.parseLong(matcher.group(1)); + long entryNumber = commitVersion.getAsLong(); Stream<DeltaLakeTransactionLogEntry> logEntryStream = getEntriesFromJson(entryNumber, fileSystem.newInputFile(getTransactionLogJsonEntryPath(transactionLogDir, entryNumber)), DataSize.ofBytes(0)) .map(entry -> entry.getEntries(fileSystem))
diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java index f858f18f87a2..b7b0271e98e9 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java
@@ -164,13 +164,25 @@ public CacheStatsMBean getMetadataCacheStats() public TableSnapshot loadSnapshot(ConnectorSession session, DeltaMetastoreTable table, Optional<Long> endVersion) throws IOException { - return loadSnapshot(session, transactionLogReaderFactory.createReader(table), table.schemaTableName(), table.location(), endVersion, VendedCredentialsHandle.of(table)); + return loadSnapshot(session, table, endVersion, readLastCheckpoint(fileSystemFactory.create(session, table), table.location())); + } + + public TableSnapshot loadSnapshot(ConnectorSession session, DeltaMetastoreTable table, Optional<Long> endVersion, Optional<LastCheckpoint> lastCheckpoint) + throws IOException + { + return loadSnapshot(session, transactionLogReaderFactory.createReader(table), table.schemaTableName(), table.location(), endVersion, VendedCredentialsHandle.of(table), lastCheckpoint); + }
public TableSnapshot loadSnapshot(ConnectorSession session, DeltaLakeTableHandle tableHandle, Optional<Long> endVersion) throws IOException { - return loadSnapshot(session, transactionLogReaderFactory.createReader(tableHandle), tableHandle.getSchemaTableName(), tableHandle.getLocation(), endVersion, tableHandle.toCredentialsHandle()); + return loadSnapshot(session, tableHandle, endVersion, readLastCheckpoint(fileSystemFactory.create(session, tableHandle), tableHandle.getLocation())); + } + + public TableSnapshot loadSnapshot(ConnectorSession session, DeltaLakeTableHandle tableHandle, Optional<Long> endVersion, Optional<LastCheckpoint> lastCheckpoint) + throws IOException + { + return loadSnapshot(session, transactionLogReaderFactory.createReader(tableHandle), tableHandle.getSchemaTableName(), tableHandle.getLocation(), endVersion, tableHandle.toCredentialsHandle(), lastCheckpoint); } /**
@@ -190,10 +202,23 @@ public TableSnapshot loadSnapshot( Optional<Long> endVersion, VendedCredentialsHandle credentialsHandle) throws IOException + { + return loadSnapshot(session, transactionLogReader, table, tableLocation, endVersion, credentialsHandle, readLastCheckpoint(fileSystemFactory.create(session, credentialsHandle), tableLocation)); + } + + public TableSnapshot loadSnapshot( + ConnectorSession session, + TransactionLogReader transactionLogReader, + SchemaTableName table, + String tableLocation, + Optional<Long> endVersion, + VendedCredentialsHandle credentialsHandle, + Optional<LastCheckpoint> lastCheckpoint) + throws IOException { TrinoFileSystem fileSystem = fileSystemFactory.create(session, credentialsHandle); if (endVersion.isPresent()) { - return loadSnapshotForTimeTravel(session, transactionLogReader, fileSystem, table, tableLocation, endVersion.get()); + return loadSnapshotForTimeTravel(session, transactionLogReader, fileSystem, table, tableLocation, endVersion.get(), lastCheckpoint); } TableLocation cacheKey = new TableLocation(table, tableLocation);
@@ -201,7 +226,6 @@ public TableSnapshot loadSnapshot( TableSnapshot snapshot; if (cachedSnapshot == null) { try { - Optional<LastCheckpoint> lastCheckpoint = readLastCheckpoint(fileSystem, tableLocation); snapshot = tableSnapshots.get(cacheKey, () -> TableSnapshot.load( session,
@@ -221,7 +245,7 @@ } } else { - Optional<TableSnapshot> updatedSnapshot = cachedSnapshot.getUpdatedSnapshot(session, transactionLogReader, fileSystem, Optional.empty()); + Optional<TableSnapshot> updatedSnapshot = cachedSnapshot.getUpdatedSnapshot(session, transactionLogReader, fileSystem, Optional.empty(), lastCheckpoint); if (updatedSnapshot.isPresent()) { snapshot = updatedSnapshot.get(); tableSnapshots.asMap().replace(cacheKey, cachedSnapshot, snapshot);
@@ -233,14 +257,14 @@ return snapshot; } - private TableSnapshot loadSnapshotForTimeTravel(ConnectorSession session, TransactionLogReader transactionLogReader, TrinoFileSystem fileSystem, SchemaTableName table, String tableLocation, long endVersion) + private TableSnapshot loadSnapshotForTimeTravel(ConnectorSession session, TransactionLogReader transactionLogReader, TrinoFileSystem fileSystem, SchemaTableName table, String tableLocation, long endVersion, Optional<LastCheckpoint> lastCheckpoint) throws IOException { return TableSnapshot.load( session, transactionLogReader, table, - findCheckpoint(fileSystem, tableLocation, endVersion), + findCheckpoint(fileSystem, tableLocation, endVersion, lastCheckpoint), tableLocation, parquetReaderOptions,
checkpointRowStatisticsWritingEnabled,
@@ -249,9 +273,8 @@ private TableSnapshot loadSnapshotForTimeTravel(ConnectorSession session, Transa Optional.of(endVersion)); } - private static Optional<LastCheckpoint> findCheckpoint(TrinoFileSystem fileSystem, String tableLocation, long endVersion) + private static Optional<LastCheckpoint> findCheckpoint(TrinoFileSystem fileSystem, String tableLocation, long endVersion, Optional<LastCheckpoint> lastCheckpoint) { - Optional<LastCheckpoint> lastCheckpoint = readLastCheckpoint(fileSystem, tableLocation); if (lastCheckpoint.isPresent() && lastCheckpoint.get().version() <= endVersion) { return lastCheckpoint; }
diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogParser.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogParser.java index 3d099de35e50..56d27a1cd9b8 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogParser.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogParser.java
@@ -14,6 +14,7 @@ package io.trino.plugin.deltalake.transactionlog; import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.json.JsonMapper; import com.google.common.annotations.VisibleForTesting;
@@ -22,6 +23,7 @@ import dev.failsafe.RetryPolicy; import io.airlift.json.JsonMapperProvider; import io.airlift.log.Logger; +import io.trino.filesystem.FileIterator; import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; import io.trino.filesystem.TrinoInputFile;
@@ -54,11 +56,14 @@ import java.time.temporal.ChronoField; import java.util.Locale; import java.util.Optional; +import java.util.OptionalLong; import java.util.function.Function; import static com.google.common.base.Verify.verify; import static com.google.common.math.LongMath.divide; import static io.airlift.slice.Slices.utf8Slice; +import static io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.extractCommitVersion; +import static io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.getTransactionLogChecksumEntryPath; import static io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.getTransactionLogDir; import static io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.getTransactionLogJsonEntryPath; import static io.trino.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR;
@@ -321,4 +326,48 @@ public static long getMandatoryCurrentVersion(TrinoFileSystem fileSystem, String version++; } } + + public static OptionalLong findLatestCommitVersion(TrinoFileSystem fileSystem, String tableLocation) + throws IOException + { + return findLatestCommitVersion(fileSystem, tableLocation, readLastCheckpoint(fileSystem, tableLocation)); + } + + public static OptionalLong findLatestCommitVersion(TrinoFileSystem fileSystem, String tableLocation, Optional<LastCheckpoint> lastCheckpoint) + throws IOException + { + // When a checkpoint exists, skip over commits older than the checkpoint: the latest commit + // is either the checkpoint itself or a later commit, never an earlier one.
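+ // e.g. a checkpoint at version 10 yields startingFrom "00000000000000000010", so the listing
+ // skips 00000000000000000000.json through 00000000000000000009.json entirely.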
+ String startingFrom = lastCheckpoint.map(checkpoint -> "%020d".formatted(checkpoint.version())).orElse(""); + long latestCommitVersion = lastCheckpoint.map(LastCheckpoint::version).orElse(-1L); + + FileIterator files = fileSystem.listFilesStartingFrom(Location.of(getTransactionLogDir(tableLocation)), startingFrom); + while (files.hasNext()) { + OptionalLong commitVersion = extractCommitVersion(files.next().location().fileName()); + if (commitVersion.isPresent() && commitVersion.getAsLong() > latestCommitVersion) { + latestCommitVersion = commitVersion.getAsLong(); + } + } + return latestCommitVersion == -1 ? OptionalLong.empty() : OptionalLong.of(latestCommitVersion); + } + + public static Optional<DeltaLakeVersionChecksum> readVersionChecksumFile(TrinoFileSystem fileSystem, String tableLocation, long version) + throws IOException + { + TrinoInputFile inputFile = fileSystem.newInputFile(getTransactionLogChecksumEntryPath(getTransactionLogDir(tableLocation), version)); + try (InputStream checksumInput = inputFile.newStream()) { + return Optional.of(JsonUtils.parseJson(JSON_MAPPER, checksumInput, DeltaLakeVersionChecksum.class)); + } + catch (IllegalArgumentException e) { + // JsonUtils throws IllegalArgumentException for trailing content after the JSON object + return Optional.empty(); + } + catch (IOException | UncheckedIOException e) { + // Checksum files are optional; missing or malformed checksum files fall back to scanning the Delta log + if (isFileNotFoundException(e) || Throwables.getCausalChain(e).stream().anyMatch(JsonProcessingException.class::isInstance)) { + return Optional.empty(); + } + throw e; + } + } }
diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogUtil.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogUtil.java index 30fd14eff95d..504b05c06ac7 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogUtil.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogUtil.java
@@ -17,9 +17,13 @@ import java.util.Map; import java.util.Optional; +import java.util.OptionalLong; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import static com.google.common.collect.ImmutableMap.toImmutableMap; import static io.trino.filesystem.Locations.appendPath; +import static java.lang.Long.parseLong; public final class TransactionLogUtil {
@@ -27,6 +31,8 @@ private TransactionLogUtil() {} public static final String TRANSACTION_LOG_DIRECTORY = "_delta_log"; + private static final Pattern TRANSACTION_LOG_PATTERN = Pattern.compile("^(\\d{20})\\.json$"); + public static String getTransactionLogDir(String tableLocation) { return appendPath(tableLocation, TRANSACTION_LOG_DIRECTORY);
@@ -37,6 +43,26 @@ public static Location getTransactionLogJsonEntryPath(String transactionLogDir, return Location.of(transactionLogDir).appendPath("%020d.json".formatted(entryNumber)); } + public static Location getTransactionLogChecksumEntryPath(String transactionLogDir, long entryNumber) + { + return Location.of(transactionLogDir).appendPath("%020d.crc".formatted(entryNumber)); + } + + public static OptionalLong extractCommitVersion(String fileName) + { + Matcher matcher = TRANSACTION_LOG_PATTERN.matcher(fileName); + if (!matcher.matches()) { + return OptionalLong.empty(); + } + try { + return OptionalLong.of(parseLong(matcher.group(1))); + } + catch (NumberFormatException e) { + // 20-digit strings can overflow long (max is 19 digits); treat as non-commit file
diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogUtil.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogUtil.java
index 30fd14eff95d..504b05c06ac7 100644
--- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogUtil.java
+++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogUtil.java
@@ -17,9 +17,13 @@
 import java.util.Map;
 import java.util.Optional;
+import java.util.OptionalLong;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;

 import static com.google.common.collect.ImmutableMap.toImmutableMap;
 import static io.trino.filesystem.Locations.appendPath;
+import static java.lang.Long.parseLong;

 public final class TransactionLogUtil
 {
@@ -27,6 +31,8 @@ private TransactionLogUtil() {}

     public static final String TRANSACTION_LOG_DIRECTORY = "_delta_log";

+    private static final Pattern TRANSACTION_LOG_PATTERN = Pattern.compile("^(\\d{20})\\.json$");
+
     public static String getTransactionLogDir(String tableLocation)
     {
         return appendPath(tableLocation, TRANSACTION_LOG_DIRECTORY);
@@ -37,6 +43,26 @@ public static Location getTransactionLogJsonEntryPath(String transactionLogDir,
         return Location.of(transactionLogDir).appendPath("%020d.json".formatted(entryNumber));
     }

+    public static Location getTransactionLogChecksumEntryPath(String transactionLogDir, long entryNumber)
+    {
+        return Location.of(transactionLogDir).appendPath("%020d.crc".formatted(entryNumber));
+    }
+
+    public static OptionalLong extractCommitVersion(String fileName)
+    {
+        Matcher matcher = TRANSACTION_LOG_PATTERN.matcher(fileName);
+        if (!matcher.matches()) {
+            return OptionalLong.empty();
+        }
+        try {
+            return OptionalLong.of(parseLong(matcher.group(1)));
+        }
+        catch (NumberFormatException e) {
+            // 20-digit strings can overflow long (max is 19 digits); treat as non-commit file
+            return OptionalLong.empty();
+        }
+    }
+
     public static Map<String, Optional<String>> canonicalizePartitionValues(Map<String, String> partitionValues)
     {
         return partitionValues.entrySet().stream()
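[Editor's note — not part of the patch] The contract of extractCommitVersion on representative file names, as implied by TRANSACTION_LOG_PATTERN and the overflow guard:

    extractCommitVersion("00000000000000000042.json");               // OptionalLong.of(42)
    extractCommitVersion("00000000000000000002.checkpoint.parquet"); // empty: not a commit file
    extractCommitVersion("00000000000000000002.crc");                // empty: checksum, not a commit
    extractCommitVersion("99999999999999999999.json");               // empty: exceeds Long.MAX_VALUE (9223372036854775807)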
CacheOperation("InputFile.length", "00000000000000000005.crc")) .add(new CacheOperation("InputFile.length", "00000000000000000006.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 229)) @@ -166,6 +170,7 @@ public void testCacheFileOperations() .add(new CacheOperation("InputFile.length", "00000000000000000004.json")) .add(new CacheOperation("Alluxio.readCached", "00000000000000000005.json", 0, 658)) .add(new CacheOperation("InputFile.length", "00000000000000000005.json")) + .add(new CacheOperation("InputFile.length", "00000000000000000005.crc")) .add(new CacheOperation("InputFile.length", "00000000000000000006.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .addCopies(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 229), 1) @@ -186,6 +191,7 @@ public void testCacheCheckpointAndExtendedStatsFileOperations() ImmutableMultiset.builder() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000003.checkpoint.parquet", 0, 7077), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000003.checkpoint.parquet"), 2) + .add(new CacheOperation("InputFile.length", "00000000000000000003.crc")) .add(new CacheOperation("InputFile.length", "00000000000000000004.json")) .addAll(Stream.of("int_part=10/string_part=part1/", "int_part=20/string_part=part2/", "int_part=__HIVE_DEFAULT_PARTITION__/string_part=__HIVE_DEFAULT_PARTITION__/") .flatMap(fileId -> @@ -201,6 +207,7 @@ public void testCacheCheckpointAndExtendedStatsFileOperations() ImmutableMultiset.builder() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000003.checkpoint.parquet", 0, 7077), 3) .addCopies(new CacheOperation("InputFile.length", "00000000000000000003.checkpoint.parquet"), 3) + .add(new CacheOperation("InputFile.length", "00000000000000000003.crc")) .add(new CacheOperation("InputFile.length", "00000000000000000004.json")) .addAll(Stream.of("int_part=10/string_part=part1/", "int_part=20/string_part=part2/", "int_part=__HIVE_DEFAULT_PARTITION__/string_part=__HIVE_DEFAULT_PARTITION__/") .flatMap(fileId -> Stream.of(new CacheOperation("Alluxio.readCached", fileId, 0, 199))) @@ -224,6 +231,7 @@ public void testCacheDeletionVectorsFileOperations() .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 1607)) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheOperation("InputFile.length", "00000000000000000002.crc")) .add(new CacheOperation("InputFile.length", "00000000000000000003.json")) .add(new CacheOperation("Alluxio.readCached", "data", 0, 796)) .add(new CacheOperation("Input.readFully", "data", 0, 796)) @@ -243,6 +251,7 @@ public void testCacheDeletionVectorsFileOperations() .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 1607)) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheOperation("InputFile.length", "00000000000000000002.crc")) .add(new CacheOperation("InputFile.length", "00000000000000000003.json")) .add(new CacheOperation("Alluxio.readCached", "data", 0, 796)) .add(new CacheOperation("Alluxio.readCached", "deletion_vector", 1, 42)) @@ -265,6 +274,7 @@ public void testChangeDataFileOperations() .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) .addCopies(new 
CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 1100), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000001.json"), 2) + .add(new CacheOperation("InputFile.length", "00000000000000000001.crc")) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .add(new CacheOperation("Alluxio.readCached", "change_data/key=1/", 0, 389)) @@ -283,6 +293,7 @@ public void testChangeDataFileOperations() .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 1100), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000001.json"), 2) + .add(new CacheOperation("InputFile.length", "00000000000000000001.crc")) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .add(new CacheOperation("Alluxio.readCached", "change_data/key=1/", 0, 389)) @@ -311,6 +322,7 @@ public void testTimeTravelWithLastCheckpoint() .add(new CacheOperation("Alluxio.writeCache", "00000000000000000001.json", 0, 613)) .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 613)) .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) + .add(new CacheOperation("InputFile.length", "00000000000000000001.crc")) .add(new CacheOperation("InputFile.exists", "00000000000000000001.json")) .add(new CacheOperation("InputFile.newStream", "00000000000000000001.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) @@ -323,6 +335,7 @@ public void testTimeTravelWithLastCheckpoint() ImmutableMultiset.builder() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.checkpoint.parquet", 0, 5884), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) + .add(new CacheOperation("InputFile.length", "00000000000000000002.crc")) .add(new CacheOperation("InputFile.exists", "00000000000000000002.json")) .add(new CacheOperation("Input.readFully", "data", 0, 199)) .add(new CacheOperation("Alluxio.writeCache", "data", 0, 199)) @@ -334,6 +347,7 @@ public void testTimeTravelWithLastCheckpoint() ImmutableMultiset.builder() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.checkpoint.parquet", 0, 5884), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) + .add(new CacheOperation("InputFile.length", "00000000000000000002.crc")) .addCopies(new CacheOperation("Alluxio.readCached", "data", 0, 199), 3) .add(new CacheOperation("InputFile.exists", "00000000000000000002.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) @@ -356,6 +370,7 @@ public void testTimeTravelWithLastCheckpointUsingTemporalVersion() .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 613), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000001.json"), 3) + .add(new CacheOperation("InputFile.length", "00000000000000000001.crc")) .addCopies(new CacheOperation("InputFile.newStream", "_last_checkpoint"), 2) .add(new CacheOperation("Alluxio.writeCache", "00000000000000000002.json", 0, 613)) .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 613)) @@ -369,6 +384,7 @@ public void 
testTimeTravelWithLastCheckpointUsingTemporalVersion() ImmutableMultiset.builder() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.checkpoint.parquet", 0, 5884), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) + .add(new CacheOperation("InputFile.length", "00000000000000000002.crc")) .addCopies(new CacheOperation("Alluxio.readCached", "data", 0, 199), 3) .addCopies(new CacheOperation("InputFile.newStream", "_last_checkpoint"), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.json"), 2) @@ -379,6 +395,7 @@ public void testTimeTravelWithLastCheckpointUsingTemporalVersion() ImmutableMultiset.builder() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.checkpoint.parquet", 0, 5884), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) + .add(new CacheOperation("InputFile.length", "00000000000000000002.crc")) .addCopies(new CacheOperation("Alluxio.readCached", "data", 0, 199), 3) .addCopies(new CacheOperation("InputFile.newStream", "_last_checkpoint"), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.json"), 2) @@ -407,6 +424,7 @@ public void testTimeTravelWithoutLastCheckpoint() .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 613)) .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) + .add(new CacheOperation("InputFile.length", "00000000000000000001.crc")) .add(new CacheOperation("InputFile.exists", "00000000000000000001.json")) .addCopies(new CacheOperation("Input.readFully", "data", 0, 199), 2) .addCopies(new CacheOperation("Alluxio.writeCache", "data", 0, 199), 2) @@ -420,6 +438,7 @@ public void testTimeTravelWithoutLastCheckpoint() .add(new CacheOperation("Alluxio.writeCache", "00000000000000000002.checkpoint.parquet", 0, 5884)) .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.checkpoint.parquet", 0, 5884), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) + .add(new CacheOperation("InputFile.length", "00000000000000000002.crc")) .add(new CacheOperation("InputFile.exists", "00000000000000000002.json")) .add(new CacheOperation("Input.readFully", "data", 0, 199)) .add(new CacheOperation("Alluxio.writeCache", "data", 0, 199)) @@ -431,6 +450,7 @@ public void testTimeTravelWithoutLastCheckpoint() ImmutableMultiset.builder() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.checkpoint.parquet", 0, 5884), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) + .add(new CacheOperation("InputFile.length", "00000000000000000002.crc")) .add(new CacheOperation("InputFile.exists", "00000000000000000002.json")) .addCopies(new CacheOperation("Alluxio.readCached", "data", 0, 199), 3) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) @@ -459,6 +479,7 @@ public void testTimeTravelWithoutLastCheckpointUsingTemporal() .addCopies(new CacheOperation("InputFile.length", "00000000000000000000.json"), 3) .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 613), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000001.json"), 3) + .add(new CacheOperation("InputFile.length", "00000000000000000001.crc")) .addCopies(new CacheOperation("Input.readFully", "data", 0, 199), 2) .addCopies(new 
CacheOperation("Alluxio.writeCache", "data", 0, 199), 2) .addCopies(new CacheOperation("Alluxio.readCached", "data", 0, 199), 2) @@ -484,6 +505,7 @@ public void testTimeTravelWithoutLastCheckpointUsingTemporal() .add(new CacheOperation("Alluxio.writeCache", "00000000000000000002.checkpoint.parquet", 0, 5884)) .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.checkpoint.parquet", 0, 5884), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.json"), 2) + .add(new CacheOperation("InputFile.length", "00000000000000000002.crc")) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 613)) .addCopies(new CacheOperation("InputFile.length", "00000000000000000003.json"), 2) @@ -505,6 +527,7 @@ public void testTimeTravelWithoutLastCheckpointUsingTemporal() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.checkpoint.parquet", 0, 5884), 2) .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 613)) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.json"), 2) + .add(new CacheOperation("InputFile.length", "00000000000000000002.crc")) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000003.json"), 2) .add(new CacheOperation("Alluxio.readCached", "00000000000000000003.json", 0, 613)) @@ -540,6 +563,7 @@ public void testReadV2CheckpointJson() .add(new CacheOperation("Alluxio.writeCache", "00000000000000000001.checkpoint.0000000001.0000000001.90cf4e21-dbaa-41d6-8ae5-6709cfbfbfe0.parquet", 0, 9176)) .add(new CacheOperation("InputFile.length", "00000000000000000001.checkpoint.0000000001.0000000001.90cf4e21-dbaa-41d6-8ae5-6709cfbfbfe0.parquet")) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheOperation("InputFile.length", "00000000000000000001.crc")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .add(new CacheOperation("Alluxio.writeCache", "data", 0, 666)) .add(new CacheOperation("Alluxio.readCached", "data", 0, 666)) @@ -553,6 +577,7 @@ public void testReadV2CheckpointJson() .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.checkpoint.0000000001.0000000001.90cf4e21-dbaa-41d6-8ae5-6709cfbfbfe0.parquet", 0, 9176)) .add(new CacheOperation("InputFile.length", "00000000000000000001.checkpoint.0000000001.0000000001.90cf4e21-dbaa-41d6-8ae5-6709cfbfbfe0.parquet")) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheOperation("InputFile.length", "00000000000000000001.crc")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .add(new CacheOperation("Alluxio.readCached", "data", 0, 666)) .build()); @@ -584,6 +609,7 @@ public void testReadV2CheckpointParquet() .add(new CacheOperation("Input.readFully", "00000000000000000001.checkpoint.0000000001.0000000001.03288d7e-af16-44ed-829c-196064a71812.parquet", 0, 9415)) .add(new CacheOperation("Alluxio.writeCache", "00000000000000000001.checkpoint.0000000001.0000000001.03288d7e-af16-44ed-829c-196064a71812.parquet", 0, 9415)) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheOperation("InputFile.length", "00000000000000000001.crc")) .add(new CacheOperation("Alluxio.writeCache", "data", 0, 666)) .add(new CacheOperation("Alluxio.readCached", "data", 0, 666)) 
.add(new CacheOperation("Input.readFully", "data", 0, 666)) @@ -597,6 +623,7 @@ public void testReadV2CheckpointParquet() .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.checkpoint.0000000001.0000000001.03288d7e-af16-44ed-829c-196064a71812.parquet", 0, 9415)) .add(new CacheOperation("InputFile.length", "00000000000000000001.checkpoint.0000000001.0000000001.03288d7e-af16-44ed-829c-196064a71812.parquet")) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheOperation("InputFile.length", "00000000000000000001.crc")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .add(new CacheOperation("Alluxio.readCached", "data", 0, 666)) .build()); @@ -614,6 +641,7 @@ public void testCreateOrReplaceTable() .add(new CacheOperation("InputFile.newStream", "00000000000000000000.json")) .add(new CacheOperation("Alluxio.writeCache", "00000000000000000000.json", 0, 821)) .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) + .addCopies(new CacheOperation("InputFile.length", "00000000000000000000.crc"), 2) .add(new CacheOperation("InputFile.exists", "00000000000000000001.json")) .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) @@ -641,6 +669,7 @@ public void testCreateOrReplaceTableAsSelect() .add(new CacheOperation("Alluxio.writeCache", "00000000000000000000.json", 0, 1063)) .add(new CacheOperation("InputFile.newStream", "00000000000000000000.json")) .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) + .addCopies(new CacheOperation("InputFile.length", "00000000000000000000.crc"), 3) .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) .add(new CacheOperation("InputFile.exists", "00000000000000000001.json")) .add(new CacheOperation("InputFile.exists", "extendeded_stats.json")) @@ -651,11 +680,53 @@ public void testCreateOrReplaceTableAsSelect() assertUpdate("DROP TABLE test_create_or_replace_as_select"); } + @Test + public void testCacheFileOperationsWithChecksumFilesDisabled() + { + String catalog = getSession().getCatalog().orElseThrow(); + Session session = Session.builder(getSession()) + .setCatalogSessionProperty(catalog, "load_metadata_from_checksum_file", "false") + .build(); + + assertUpdate("DROP TABLE IF EXISTS test_cache_file_operations_without_checksum"); + assertUpdate("CREATE TABLE test_cache_file_operations_without_checksum(key varchar, data varchar) with (partitioned_by=ARRAY['key'])"); + assertUpdate("INSERT INTO test_cache_file_operations_without_checksum VALUES ('p1', '1-abc')", 1); + assertUpdate("INSERT INTO test_cache_file_operations_without_checksum VALUES ('p2', '2-xyz')", 1); + + assertFileSystemAccesses( + session, + "SELECT * FROM test_cache_file_operations_without_checksum", + ImmutableMultiset.builder() + .add(new CacheOperation("Alluxio.readCached", "00000000000000000000.json", 0, 816)) + .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 658)) + .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 658)) + .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheOperation("Alluxio.readExternalStream", "00000000000000000002.json", 0, 658)) + .add(new CacheOperation("InputFile.newStream", "00000000000000000002.json")) + 
.add(new CacheOperation("Alluxio.writeCache", "00000000000000000002.json", 0, 658)) + .add(new CacheOperation("InputFile.length", "00000000000000000003.json")) + .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) + .add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 229)) + .add(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 229)) + .add(new CacheOperation("Input.readFully", "key=p1/", 0, 229)) + .add(new CacheOperation("Input.readFully", "key=p2/", 0, 229)) + .add(new CacheOperation("Alluxio.writeCache", "key=p1/", 0, 229)) + .add(new CacheOperation("Alluxio.writeCache", "key=p2/", 0, 229)) + .build()); + } + private void assertFileSystemAccesses(@Language("SQL") String query, Multiset expectedCacheAccesses) + { + assertFileSystemAccesses(getSession(), query, expectedCacheAccesses); + } + + private void assertFileSystemAccesses(Session session, @Language("SQL") String query, Multiset expectedCacheAccesses) { assertUpdate("CALL system.flush_metadata_cache()"); DistributedQueryRunner queryRunner = getDistributedQueryRunner(); - queryRunner.executeWithPlan(queryRunner.getDefaultSession(), query); + queryRunner.executeWithPlan(session, query); assertMultisetsEqual(getCacheOperations(queryRunner), expectedCacheAccesses); } } diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheMutableTransactionLog.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheMutableTransactionLog.java index a69d2df8a19d..ea96fa0e2cbb 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheMutableTransactionLog.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheMutableTransactionLog.java @@ -77,6 +77,7 @@ public void testTableDataCachedWhileTransactionLogNotCached() ImmutableMultiset.builder() .addCopies(new CacheFileSystemTraceUtils.CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) .addCopies(new CacheFileSystemTraceUtils.CacheOperation("Input.readTail", "00000000000000000002.checkpoint.parquet"), 2) + .add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.newStream", "00000000000000000002.crc")) .add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.length", "00000000000000000003.json")) .add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.newStream", "_last_checkpoint")) .add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p1/", 0, 229)) @@ -91,6 +92,7 @@ public void testTableDataCachedWhileTransactionLogNotCached() ImmutableMultiset.builder() .addCopies(new CacheFileSystemTraceUtils.CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) .addCopies(new CacheFileSystemTraceUtils.CacheOperation("Input.readTail", "00000000000000000002.checkpoint.parquet"), 2) + .add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.newStream", "00000000000000000002.crc")) .add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.length", "00000000000000000003.json")) .add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.newStream", "_last_checkpoint")) .add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p1/", 0, 229)) diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeBasic.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeBasic.java index 47eb42d76fb0..2b862b81f0c5 100644 --- 
diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeBasic.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeBasic.java
index 47eb42d76fb0..2b862b81f0c5 100644
--- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeBasic.java
+++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeBasic.java
@@ -2037,36 +2037,37 @@ private void testCorruptedTableLocation(String tableName, Path tableLocation, bo
         assertUpdate("CALL system.flush_metadata_cache(schema_name => CURRENT_SCHEMA, table_name => '" + tableName + "')");

         // Assert queries fail cleanly
-        assertQueryFails("TABLE " + tableName, "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("SELECT * FROM \"" + tableName + "$history\"", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("SELECT * FROM \"" + tableName + "$properties\"", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("SELECT * FROM \"" + tableName + "$partitions\"", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("SELECT * FROM " + tableName + " WHERE false", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("SELECT 1 FROM " + tableName + " WHERE false", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("SHOW CREATE TABLE " + tableName, "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("CREATE TABLE a_new_table (LIKE " + tableName + " EXCLUDING PROPERTIES)", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("DESCRIBE " + tableName, "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("SHOW COLUMNS FROM " + tableName, "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("SHOW STATS FOR " + tableName, "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("ANALYZE " + tableName, "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("ALTER TABLE " + tableName + " EXECUTE optimize", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("ALTER TABLE " + tableName + " EXECUTE vacuum", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("ALTER TABLE " + tableName + " RENAME TO bad_person_some_new_name", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("ALTER TABLE " + tableName + " ADD COLUMN foo int", "Metadata not found in transaction log for tpch." + tableName);
+        String corruptedTableMessageRegex = "(Metadata not found in transaction log for tpch\\." + tableName + "|Delta table tpch\\." + tableName + " has no commits)";
+        assertQueryFails("TABLE " + tableName, corruptedTableMessageRegex);
+        assertQueryFails("SELECT * FROM \"" + tableName + "$history\"", corruptedTableMessageRegex);
+        assertQueryFails("SELECT * FROM \"" + tableName + "$properties\"", corruptedTableMessageRegex);
+        assertQueryFails("SELECT * FROM \"" + tableName + "$partitions\"", corruptedTableMessageRegex);
+        assertQueryFails("SELECT * FROM " + tableName + " WHERE false", corruptedTableMessageRegex);
+        assertQueryFails("SELECT 1 FROM " + tableName + " WHERE false", corruptedTableMessageRegex);
+        assertQueryFails("SHOW CREATE TABLE " + tableName, corruptedTableMessageRegex);
+        assertQueryFails("CREATE TABLE a_new_table (LIKE " + tableName + " EXCLUDING PROPERTIES)", corruptedTableMessageRegex);
+        assertQueryFails("DESCRIBE " + tableName, corruptedTableMessageRegex);
+        assertQueryFails("SHOW COLUMNS FROM " + tableName, corruptedTableMessageRegex);
+        assertQueryFails("SHOW STATS FOR " + tableName, corruptedTableMessageRegex);
+        assertQueryFails("ANALYZE " + tableName, corruptedTableMessageRegex);
+        assertQueryFails("ALTER TABLE " + tableName + " EXECUTE optimize", corruptedTableMessageRegex);
+        assertQueryFails("ALTER TABLE " + tableName + " EXECUTE vacuum", corruptedTableMessageRegex);
+        assertQueryFails("ALTER TABLE " + tableName + " RENAME TO bad_person_some_new_name", corruptedTableMessageRegex);
+        assertQueryFails("ALTER TABLE " + tableName + " ADD COLUMN foo int", corruptedTableMessageRegex);
         // TODO (https://github.com/trinodb/trino/issues/16248) ADD field
-        assertQueryFails("ALTER TABLE " + tableName + " DROP COLUMN foo", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("ALTER TABLE " + tableName + " DROP COLUMN foo.bar", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("ALTER TABLE " + tableName + " SET PROPERTIES change_data_feed_enabled = true", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("INSERT INTO " + tableName + " VALUES (NULL)", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("UPDATE " + tableName + " SET foo = 'bar'", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("DELETE FROM " + tableName, "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("MERGE INTO " + tableName + " USING (SELECT 1 a) input ON true WHEN MATCHED THEN DELETE", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("TRUNCATE TABLE " + tableName, "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("COMMENT ON TABLE " + tableName + " IS NULL", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("COMMENT ON COLUMN " + tableName + ".foo IS NULL", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("CALL system.vacuum(CURRENT_SCHEMA, '" + tableName + "', '7d')", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("SELECT * FROM TABLE(system.table_changes(CURRENT_SCHEMA, '" + tableName + "'))", "Metadata not found in transaction log for tpch." + tableName);
-        assertQueryFails("CREATE OR REPLACE TABLE " + tableName + " (id INTEGER)", "Metadata not found in transaction log for tpch." + tableName);
+        assertQueryFails("ALTER TABLE " + tableName + " DROP COLUMN foo", corruptedTableMessageRegex);
+        assertQueryFails("ALTER TABLE " + tableName + " DROP COLUMN foo.bar", corruptedTableMessageRegex);
+        assertQueryFails("ALTER TABLE " + tableName + " SET PROPERTIES change_data_feed_enabled = true", corruptedTableMessageRegex);
+        assertQueryFails("INSERT INTO " + tableName + " VALUES (NULL)", corruptedTableMessageRegex);
+        assertQueryFails("UPDATE " + tableName + " SET foo = 'bar'", corruptedTableMessageRegex);
+        assertQueryFails("DELETE FROM " + tableName, corruptedTableMessageRegex);
+        assertQueryFails("MERGE INTO " + tableName + " USING (SELECT 1 a) input ON true WHEN MATCHED THEN DELETE", corruptedTableMessageRegex);
+        assertQueryFails("TRUNCATE TABLE " + tableName, corruptedTableMessageRegex);
+        assertQueryFails("COMMENT ON TABLE " + tableName + " IS NULL", corruptedTableMessageRegex);
+        assertQueryFails("COMMENT ON COLUMN " + tableName + ".foo IS NULL", corruptedTableMessageRegex);
+        assertQueryFails("CALL system.vacuum(CURRENT_SCHEMA, '" + tableName + "', '7d')", corruptedTableMessageRegex);
+        assertQueryFails("SELECT * FROM TABLE(system.table_changes(CURRENT_SCHEMA, '" + tableName + "'))", corruptedTableMessageRegex);
+        assertQueryFails("CREATE OR REPLACE TABLE " + tableName + " (id INTEGER)", corruptedTableMessageRegex);
         assertQuerySucceeds("CALL system.drop_extended_stats(CURRENT_SCHEMA, '" + tableName + "')");

         // Avoid failing metadata queries
diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeConfig.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeConfig.java
index 4a5c55667164..03d20cec5a8b 100644
--- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeConfig.java
+++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeConfig.java
@@ -75,7 +75,8 @@
                 .setDeletionVectorsEnabled(false)
                 .setDeltaLogFileSystemCacheDisabled(false)
                 .setMetadataParallelism(8)
-                .setCheckpointProcessingParallelism(4));
+                .setCheckpointProcessingParallelism(4)
+                .setLoadMetadataFromChecksumFile(true));
     }
@@ -118,6 +119,7 @@ public void testExplicitPropertyMappings()
                 .put("delta.fs.cache.disable-transaction-log-caching", "true")
                 .put("delta.metadata.parallelism", "10")
                 .put("delta.checkpoint-processing.parallelism", "8")
+                .put("delta.load-metadata-from-checksum-file", "false")
                 .buildOrThrow();

         DeltaLakeConfig expected = new DeltaLakeConfig()
@@ -156,7 +158,8 @@ public void testExplicitPropertyMappings()
                 .setDeletionVectorsEnabled(true)
                 .setDeltaLogFileSystemCacheDisabled(true)
                 .setMetadataParallelism(10)
-                .setCheckpointProcessingParallelism(8);
+                .setCheckpointProcessingParallelism(8)
+                .setLoadMetadataFromChecksumFile(false);

         assertFullMapping(properties, expected);
     }
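[Editor's note — not part of the patch] The mapping test implies a matching airlift-style property on DeltaLakeConfig; the getter/setter pair presumably looks like the following sketch (the actual DeltaLakeConfig change is not shown in this excerpt):

    // Inferred sketch, not the patch's code; standard airlift @Config binding assumed.
    private boolean loadMetadataFromChecksumFile = true;

    public boolean isLoadMetadataFromChecksumFile()
    {
        return loadMetadataFromChecksumFile;
    }

    @Config("delta.load-metadata-from-checksum-file")
    public DeltaLakeConfig setLoadMetadataFromChecksumFile(boolean loadMetadataFromChecksumFile)
    {
        this.loadMetadataFromChecksumFile = loadMetadataFromChecksumFile;
        return this;
    }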
io.trino.plugin.deltalake.TestDeltaLakeFileOperations.FileType.CHECKPOINT; +import static io.trino.plugin.deltalake.TestDeltaLakeFileOperations.FileType.CHECKSUM; import static io.trino.plugin.deltalake.TestDeltaLakeFileOperations.FileType.DATA; import static io.trino.plugin.deltalake.TestDeltaLakeFileOperations.FileType.DELETION_VECTOR; import static io.trino.plugin.deltalake.TestDeltaLakeFileOperations.FileType.LAST_CHECKPOINT; @@ -144,6 +145,7 @@ public void testCreateOrReplaceTable() assertFileSystemAccesses("CREATE OR REPLACE TABLE test_create_or_replace (id VARCHAR, age INT)", ImmutableMultiset.builder() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) + .addCopies(new FileOperation(CHECKSUM, "00000000000000000000.crc", "InputFile.newStream"), 2) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) @@ -175,6 +177,7 @@ public void testCreateOrReplaceTableAsSelect() .add(new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "InputFile.newStream")) .add(new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) + .addCopies(new FileOperation(CHECKSUM, "00000000000000000000.crc", "InputFile.newStream"), 3) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) @@ -205,6 +208,7 @@ public void testReadUnpartitionedTable() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -221,6 +225,7 @@ public void testReadUnpartitionedTable() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -245,6 +250,7 @@ public void testReadTableCheckpointInterval() "TABLE test_read_checkpoint", ImmutableMultiset.builder() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) + .add(new 
FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000002.checkpoint.parquet", "InputFile.length"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .addCopies(new FileOperation(CHECKPOINT, "00000000000000000002.checkpoint.parquet", "InputFile.newInput"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "InputFile.length")) @@ -268,6 +274,7 @@ public void testReadPartitionTableWithCheckpointFiltering() "TABLE test_checkpoint_filtering", ImmutableMultiset.builder() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000002.checkpoint.parquet", "InputFile.length"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .addCopies(new FileOperation(CHECKPOINT, "00000000000000000002.checkpoint.parquet", "InputFile.newInput"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "InputFile.length")) @@ -297,6 +304,7 @@ public void testReadWholePartition() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -314,6 +322,7 @@ public void testReadWholePartition() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -331,6 +340,7 @@ public void testReadWholePartition() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -346,6 
+356,7 @@ public void testReadWholePartition() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -361,6 +372,7 @@ public void testReadWholePartition() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -376,6 +388,7 @@ public void testReadWholePartition() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -412,6 +425,7 @@ public void testReadWholePartitionSplittableFile() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000001.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -426,6 +440,7 @@ public void testReadWholePartitionSplittableFile() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000001.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, 
"00000000000000000002.json", "InputFile.length")) @@ -449,6 +464,7 @@ public void testSelfJoin() ImmutableMultiset.builder() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) + .addCopies(new FileOperation(CHECKSUM, "00000000000000000000.crc", "InputFile.newStream"), 2) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "InputFile.newStream")) @@ -471,6 +487,7 @@ public void testSelectFromVersionedTable() ImmutableMultiset.builder() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000000.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.exists")) .build()); @@ -479,6 +496,7 @@ public void testSelectFromVersionedTable() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000001.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.exists")) @@ -490,6 +508,7 @@ public void testSelectFromVersionedTable() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -508,6 +527,7 @@ public void testSelectFromVersionedTable() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000006.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000007.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000008.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000008.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -528,6 +548,7 @@ public void testSelectFromVersionedTable() .add(new FileOperation(TRANSACTION_LOG_JSON, 
"00000000000000000011.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000012.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000013.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000013.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000011.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000012.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000013.json", "InputFile.length")) @@ -537,6 +558,7 @@ public void testSelectFromVersionedTable() assertFileSystemAccesses("SELECT * FROM " + tableName + " FOR VERSION AS OF 20", ImmutableMultiset.builder() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000020.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000020.json", "InputFile.exists")) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000020.checkpoint.parquet", "InputFile.length"), 2) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000020.checkpoint.parquet", "InputFile.newInput"), 2) @@ -550,6 +572,7 @@ public void testSelectFromVersionedTable() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000021.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000022.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000023.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000023.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000021.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000022.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000023.json", "InputFile.length")) @@ -590,6 +613,7 @@ public void testSelectFromTemporalVersionedTable() .addAll(allInsertOperations.subList(0, 20 * 2 + 1)) .addCopies(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream"), 2) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000000.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000020.json", "InputFile.length")) .build()); @@ -599,6 +623,7 @@ public void testSelectFromTemporalVersionedTable() .addAll(allInsertOperations.subList(0, 20 * 2 + 1)) .addCopies(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream"), 2) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000020.json", "InputFile.length")) .addCopies(new FileOperation(DATA, "no partition", "InputFile.newInput"), 2) @@ -609,6 +634,7 @@ public void testSelectFromTemporalVersionedTable() .addAll(allInsertOperations.subList(0, 20 * 2 + 1)) .addCopies(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream"), 2) .add(new FileOperation(TRANSACTION_LOG_JSON, 
"00000000000000000003.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000003.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000020.json", "InputFile.length")) .addCopies(new FileOperation(DATA, "no partition", "InputFile.newInput"), 3) @@ -619,6 +645,7 @@ public void testSelectFromTemporalVersionedTable() .addAll(allInsertOperations.subList(0, 20 * 2 + 1)) .addCopies(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream"), 2) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000009.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000009.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000009.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000020.json", "InputFile.length")) .addCopies(new FileOperation(DATA, "no partition", "InputFile.newInput"), 9) @@ -629,6 +656,7 @@ public void testSelectFromTemporalVersionedTable() .addAll(allInsertOperations.subList(11 * 2, 20 * 2 + 1)) .addCopies(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream"), 2) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000014.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000014.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000014.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000020.json", "InputFile.length")) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000010.checkpoint.parquet", "InputFile.length"), 2) @@ -641,6 +669,7 @@ public void testSelectFromTemporalVersionedTable() .addAll(allInsertOperations.subList(20 * 2, 22 * 2 + 1)) .addCopies(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream"), 2) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000021.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000021.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000021.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000022.json", "InputFile.length")) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000020.checkpoint.parquet", "InputFile.length"), 2) @@ -659,6 +688,7 @@ public void testSelectFromTemporalVersionedTable() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000023.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000023.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000024.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000024.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000024.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000025.json", "InputFile.length")) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000020.checkpoint.parquet", "InputFile.length"), 2) @@ -688,6 +718,7 @@ public void testDeleteWholePartition() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, 
"00000000000000000002.json", "InputFile.newStream")) + .addCopies(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream"), 2) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "InputFile.exists")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) @@ -717,6 +748,7 @@ public void testDeleteWholeTable() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .addCopies(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream"), 2) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "InputFile.exists")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) @@ -747,6 +779,7 @@ public void testDeleteWithNonPartitionFilter() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "InputFile.newStream")) + .addCopies(new FileOperation(CHECKSUM, "00000000000000000003.crc", "InputFile.newStream"), 2) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000004.json", "InputFile.exists")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000004.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) @@ -895,6 +928,7 @@ public void testTableChangesFileSystemAccess() .addCopies(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000004.json", "InputFile.newStream"), 2) .addCopies(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000005.json", "InputFile.newStream"), 2) .addCopies(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000006.json", "InputFile.newStream"), 2) + .add(new FileOperation(CHECKSUM, "00000000000000000006.crc", "InputFile.newStream")) .addCopies(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length"), 2) .addCopies(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length"), 2) .addCopies(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length"), 2) @@ -991,6 +1025,7 @@ private void testInformationSchemaColumns(boolean removeCachedProperties) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -1013,6 +1048,7 @@ private void 
testInformationSchemaColumns(boolean removeCachedProperties) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .addCopies(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream"), 2) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -1113,6 +1149,7 @@ private void testSystemMetadataTableComments(boolean removeCachedProperties) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -1158,12 +1195,129 @@ public void testReadMultipartCheckpoint() .addCopies(new FileOperation(CHECKPOINT, "00000000000000000006.checkpoint.0000000002.0000000002.parquet", "InputFile.length"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .addCopies(new FileOperation(CHECKPOINT, "00000000000000000006.checkpoint.0000000002.0000000002.parquet", "InputFile.newInput"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000007.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000007.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000007.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000008.json", "InputFile.length")) .addCopies(new FileOperation(DATA, "no partition", "InputFile.newInput"), 7) .build()); } + /** + * @see deltalake.checksum + */ + @Test + public void testLoadMetadataFromChecksumFileForDescribe() + throws Exception + { + String catalog = getSession().getCatalog().orElseThrow(); + Session session = Session.builder(getSession()) + .setCatalogSessionProperty(catalog, "load_metadata_from_checksum_file", "true") + .build(); + + String tableName = "test_load_metadata_from_checksum_file_for_describe_" + randomNameSuffix(); + Path tableLocation = Files.createTempFile(tableName, null); + copyDirectoryContents(new File(Resources.getResource("deltalake/checksum").toURI()).toPath(), tableLocation); + + assertUpdate(session, "CALL system.register_table(CURRENT_SCHEMA, '%s', '%s')".formatted(tableName, tableLocation.toUri())); + + assertFileSystemAccesses(session, "DESCRIBE " + tableName, + ImmutableMultiset.builder() + .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) + .addCopies(new FileOperation(CHECKSUM, "00000000000000000001.crc", "InputFile.newStream"), 2) + .build()); + + assertUpdate(session, "DROP TABLE " + tableName); + } 
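The DESCRIBE expectations above pin down a parsing contract for version checksum files: a `.crc` file is a single JSON object, and anything else (invalid JSON, an unexpected shape, trailing content) must behave exactly like a missing file so the reader can fall back to the transaction log. A minimal, self-contained sketch of that contract, assuming Jackson 2.12+ record support; `VersionChecksumSketch`, its `Checksum` record, and `parse` are hypothetical illustrations, not the connector's API:
```
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import java.util.Optional;

public class VersionChecksumSketch
{
    // Reject "{...}garbage" (the checksum_trailing_json_content fixture);
    // unknown fields such as tableSizeBytes or allFiles are simply ignored
    private static final ObjectMapper MAPPER = new ObjectMapper()
            .enable(DeserializationFeature.FAIL_ON_TRAILING_TOKENS);

    @JsonIgnoreProperties(ignoreUnknown = true)
    record Checksum(JsonNode metadata, JsonNode protocol) {}

    // Any parse failure maps to Optional.empty() so callers fall back to log replay
    static Optional<Checksum> parse(String json)
    {
        try {
            return Optional.of(MAPPER.readValue(json, Checksum.class));
        }
        catch (Exception e) {
            return Optional.empty();
        }
    }

    public static void main(String[] args)
    {
        System.out.println(parse("{\"metadata\":{},\"protocol\":{\"minReaderVersion\":1}}").isPresent()); // true
        System.out.println(parse("{\"protocol\":{}} trailing").isPresent()); // false
        System.out.println(parse("not json").isPresent());                   // false
    }
}
```
Note that a present-but-partial object still parses: in the `checksum_without_metadata` fixture below, the file is readable, and the DESCRIBE expectations show the reader fetching only the missing pieces from the checkpoint.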
+ + /** + * @see deltalake.checksum + */ + @Test + public void testLoadMetadataWithChecksumFileDisabledForDescribe() + throws Exception + { + String catalog = getSession().getCatalog().orElseThrow(); + Session session = Session.builder(getSession()) + .setCatalogSessionProperty(catalog, "load_metadata_from_checksum_file", "false") + .build(); + + String tableName = "test_load_metadata_with_checksum_file_disabled_for_describe_" + randomNameSuffix(); + Path tableLocation = Files.createTempFile(tableName, null); + copyDirectoryContents(new File(Resources.getResource("deltalake/checksum").toURI()).toPath(), tableLocation); + + assertUpdate(session, "CALL system.register_table(CURRENT_SCHEMA, '%s', '%s')".formatted(tableName, tableLocation.toUri())); + + assertFileSystemAccesses(session, "DESCRIBE " + tableName, + ImmutableMultiset.builder() + .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) + .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.parquet", "InputFile.length")) + .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.parquet", "InputFile.newInput")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) + .build()); + + assertUpdate(session, "DROP TABLE " + tableName); + } + + /** + * @see deltalake.checksum_missing_latest + */ + @Test + public void testLoadMetadataFromMissingLatestChecksumFileForDescribe() + throws Exception + { + String catalog = getSession().getCatalog().orElseThrow(); + Session session = Session.builder(getSession()) + .setCatalogSessionProperty(catalog, "load_metadata_from_checksum_file", "true") + .build(); + + String tableName = "test_load_metadata_from_missing_latest_checksum_file_for_describe_" + randomNameSuffix(); + Path tableLocation = Files.createTempFile(tableName, null); + copyDirectoryContents(new File(Resources.getResource("deltalake/checksum_missing_latest").toURI()).toPath(), tableLocation); + + assertUpdate(session, "CALL system.register_table(CURRENT_SCHEMA, '%s', '%s')".formatted(tableName, tableLocation.toUri())); + + assertFileSystemAccesses(session, "DESCRIBE " + tableName, + ImmutableMultiset.builder() + .addCopies(new FileOperation(CHECKSUM, "00000000000000000001.crc", "InputFile.newStream"), 2) + .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) + .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.parquet", "InputFile.length")) + .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.parquet", "InputFile.newInput")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) + .build()); + + assertUpdate(session, "DROP TABLE " + tableName); + } + + /** + * @see deltalake.checksum_without_metadata + */ + @Test + public void testLoadMetadataFromChecksumFileWithoutMetadataForDescribe() + throws Exception + { + String catalog = getSession().getCatalog().orElseThrow(); + Session session = Session.builder(getSession()) + .setCatalogSessionProperty(catalog, "load_metadata_from_checksum_file", "true") + .build(); + + String tableName = "test_load_metadata_from_checksum_file_without_metadata_for_describe_" + randomNameSuffix(); + Path tableLocation = Files.createTempFile(tableName, null); + copyDirectoryContents(new File(Resources.getResource("deltalake/checksum_without_metadata").toURI()).toPath(), tableLocation); + + assertUpdate(session, "CALL system.register_table(CURRENT_SCHEMA, '%s', '%s')".formatted(tableName,
tableLocation.toUri())); + + assertFileSystemAccesses(session, "DESCRIBE " + tableName, + ImmutableMultiset.builder() + .addCopies(new FileOperation(CHECKSUM, "00000000000000000001.crc", "InputFile.newStream"), 2) + .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) + .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.parquet", "InputFile.length")) + .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.parquet", "InputFile.newInput")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) + .build()); + + assertUpdate(session, "DROP TABLE " + tableName); + } + @Test public void testReadMultipartV2Checkpoint() throws Exception @@ -1176,6 +1330,7 @@ public void testReadMultipartV2Checkpoint() assertFileSystemAccesses("SELECT * FROM " + tableName, ImmutableMultiset.builder() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000004.crc", "InputFile.newStream")) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000004.checkpoint.42f48375-5c72-4d2f-8dcc-7ce4d45e2d8c.json", "InputFile.length"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .addCopies(new FileOperation(CHECKPOINT, "00000000000000000004.checkpoint.42f48375-5c72-4d2f-8dcc-7ce4d45e2d8c.json", "InputFile.newStream"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000005.json", "InputFile.length")) @@ -1210,6 +1365,7 @@ public void testV2CheckpointJson() assertFileSystemAccesses("SELECT * FROM " + tableName, ImmutableMultiset.builder() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000001.crc", "InputFile.newStream")) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.73a4ddb8-2bfc-40d8-b09f-1b6a0abdfb04.json", "InputFile.length"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .addCopies(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.73a4ddb8-2bfc-40d8-b09f-1b6a0abdfb04.json", "InputFile.newStream"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.0000000001.0000000001.90cf4e21-dbaa-41d6-8ae5-6709cfbfbfe0.parquet", "InputFile.length")) @@ -1240,6 +1396,7 @@ public void testV2CheckpointParquet() assertFileSystemAccesses("SELECT * FROM " + tableName, ImmutableMultiset.builder() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000001.crc", "InputFile.newStream")) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.156b3304-76b2-49c3-a9a1-626f07df27c9.parquet", "InputFile.length"), 4) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .addCopies(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.156b3304-76b2-49c3-a9a1-626f07df27c9.parquet", "InputFile.newInput"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.0000000001.0000000001.03288d7e-af16-44ed-829c-196064a71812.parquet", "InputFile.length")) @@ -1263,6 +1420,7 @@ public void testDeletionVectors() 
.add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -1277,6 +1435,7 @@ public void testDeletionVectors() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -1300,6 +1459,7 @@ public void testReadMetadataAndProtocolEntry() ImmutableMultiset.builder() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000000.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .build()); @@ -1316,6 +1476,7 @@ public void testReadMetadataAndProtocolEntry() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "InputFile.length")) .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .build()); @@ -1328,6 +1489,7 @@ public void testReadMetadataAndProtocolEntry() ImmutableMultiset.builder() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000004.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000004.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000004.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000005.json", "InputFile.length")) .add(new FileOperation(CHECKPOINT, "00000000000000000003.checkpoint.parquet", "InputFile.newInput")) .add(new FileOperation(CHECKPOINT, "00000000000000000003.checkpoint.parquet", "InputFile.length")) @@ -1345,12 +1507,99 @@ public void testReadMetadataAndProtocolEntry() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000004.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000005.json", "InputFile.length")) .add(new 
FileOperation(TRANSACTION_LOG_JSON, "00000000000000000005.json", "InputFile.newStream")) + .add(new FileOperation(CHECKSUM, "00000000000000000005.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000006.json", "InputFile.length")) .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .build()); } } + @Test + public void testSelectFromVersionedTableWithChecksumFilesDisabled() + { + String catalog = getSession().getCatalog().orElseThrow(); + Session session = Session.builder(getSession()) + .setCatalogSessionProperty(catalog, "load_metadata_from_checksum_file", "false") + .build(); + + String tableName = "test_select_from_versioned_table_without_checksum_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + "(id int)"); + for (int i = 0; i < 25; i++) { + assertUpdate("INSERT INTO " + tableName + " VALUES " + i, 1); + } + + assertFileSystemAccesses( + session, + "SELECT * FROM " + tableName + " FOR VERSION AS OF 2", + ImmutableMultiset.builder() + .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.exists")) + .addCopies(new FileOperation(DATA, "no partition", "InputFile.newInput"), 2) + .build()); + + assertFileSystemAccesses( + session, + "SELECT * FROM " + tableName + " FOR VERSION AS OF 23", + ImmutableMultiset.builder() + .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) + .addCopies(new FileOperation(CHECKPOINT, "00000000000000000020.checkpoint.parquet", "InputFile.length"), 2) + .addCopies(new FileOperation(CHECKPOINT, "00000000000000000020.checkpoint.parquet", "InputFile.newInput"), 2) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000021.json", "InputFile.newStream")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000022.json", "InputFile.newStream")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000023.json", "InputFile.newStream")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000021.json", "InputFile.length")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000022.json", "InputFile.length")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000023.json", "InputFile.length")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000023.json", "InputFile.exists")) + .addCopies(new FileOperation(DATA, "no partition", "InputFile.newInput"), 23) + .build()); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testReadMetadataAndProtocolEntryWithChecksumFilesDisabled() + { + String catalog = getSession().getCatalog().orElseThrow(); + Session session = Session.builder(getSession()) + .setCatalogSessionProperty(catalog, "load_metadata_from_checksum_file", "false") + .build(); + + try (TestTable table = 
newTrinoTable("test_read_metadata_protocol_without_checksum", "(data int, part varchar) WITH (checkpoint_interval = 3)")) { + assertFileSystemAccesses( + session, + "SHOW CREATE TABLE " + table.getName(), + ImmutableMultiset.builder() + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) + .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) + .build()); + + assertUpdate("INSERT INTO " + table.getName() + " VALUES (1, 'aa')", 1); + assertUpdate("INSERT INTO " + table.getName() + " VALUES (2, 'bb')", 1); + assertFileSystemAccesses( + session, + "SHOW CREATE TABLE " + table.getName(), + ImmutableMultiset.builder() + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "InputFile.length")) + .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) + .build()); + } + } + private int countCdfFilesForKey(String partitionValue) { String path = (String) computeScalar("SELECT \"$path\" FROM table_changes_file_system_access WHERE key = '" + partitionValue + "'"); @@ -1400,6 +1649,9 @@ public static FileOperation create(String path, String operationType) if (path.matches(".*/_delta_log/\\d+\\.json")) { return new FileOperation(TRANSACTION_LOG_JSON, fileName, operationType); } + if (path.matches(".*/_delta_log/\\d+\\.crc")) { + return new FileOperation(CHECKSUM, fileName, operationType); + } if (path.matches(".*/_delta_log/_trino_meta/extended_stats.json")) { return new FileOperation(TRINO_EXTENDED_STATS_JSON, fileName, operationType); } @@ -1443,6 +1695,7 @@ enum FileType DATA, CDF_DATA, DELETION_VECTOR, + CHECKSUM, } private void registerTable(String name, String resourcePath) diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeMetadata.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeMetadata.java index 1ec70c88ac1c..92c69f9bf01c 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeMetadata.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeMetadata.java @@ -15,6 +15,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultimap; import com.google.common.collect.ImmutableSet; import com.google.inject.AbstractModule; import com.google.inject.Injector; @@ -28,19 +29,25 @@ import io.trino.filesystem.hdfs.HdfsFileSystemFactory; import io.trino.hdfs.HdfsEnvironment; import io.trino.hdfs.TrinoHdfsFileSystemStats; +import io.trino.metastore.Column; import io.trino.metastore.Database; import io.trino.metastore.HiveMetastoreFactory; +import io.trino.metastore.PrincipalPrivileges;
import io.trino.metastore.RawHiveMetastoreFactory; +import io.trino.metastore.Table; import io.trino.plugin.base.ConnectorContextModule; import io.trino.plugin.deltalake.metastore.DeltaLakeMetastore; import io.trino.plugin.deltalake.metastore.DeltaLakeMetastoreModule; import io.trino.plugin.deltalake.metastore.HiveMetastoreBackedDeltaLakeMetastore; import io.trino.plugin.deltalake.transactionlog.MetadataEntry; import io.trino.plugin.deltalake.transactionlog.ProtocolEntry; +import io.trino.plugin.hive.parquet.ParquetReaderConfig; +import io.trino.plugin.hive.parquet.ParquetWriterConfig; import io.trino.spi.TrinoException; import io.trino.spi.connector.Assignment; import io.trino.spi.connector.ColumnHandle; import io.trino.spi.connector.ColumnMetadata; +import io.trino.spi.connector.ConnectorSession; import io.trino.spi.connector.ConnectorTableHandle; import io.trino.spi.connector.ConnectorTableLayout; import io.trino.spi.connector.ConnectorTableMetadata; @@ -59,6 +66,7 @@ import io.trino.spi.type.Type; import io.trino.spi.type.VarcharType; import io.trino.testing.TestingConnectorContext; +import io.trino.testing.TestingConnectorSession; import io.trino.tests.BogusType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; @@ -68,6 +76,8 @@ import java.io.File; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.List; import java.util.Map; import java.util.Optional; @@ -82,10 +92,18 @@ import static io.airlift.testing.Closeables.closeAll; import static io.trino.hdfs.HdfsTestUtils.HDFS_ENVIRONMENT; import static io.trino.hdfs.HdfsTestUtils.HDFS_FILE_SYSTEM_STATS; +import static io.trino.metastore.HiveType.HIVE_STRING; import static io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR; +import static io.trino.plugin.deltalake.DeltaLakeMetadata.DELTA_STORAGE_FORMAT; +import static io.trino.plugin.deltalake.DeltaLakeMetadata.PATH_PROPERTY; import static io.trino.plugin.deltalake.DeltaLakeTableProperties.COLUMN_MAPPING_MODE_PROPERTY; +import static io.trino.plugin.deltalake.DeltaLakeTableProperties.LOCATION_PROPERTY; import static io.trino.plugin.deltalake.DeltaLakeTableProperties.PARTITIONED_BY_PROPERTY; import static io.trino.plugin.deltalake.DeltaTestingConnectorSession.SESSION; +import static io.trino.plugin.deltalake.TestingDeltaLakeUtils.copyDirectoryContents; +import static io.trino.plugin.deltalake.metastore.HiveMetastoreBackedDeltaLakeMetastore.TABLE_PROVIDER_PROPERTY; +import static io.trino.plugin.deltalake.metastore.HiveMetastoreBackedDeltaLakeMetastore.TABLE_PROVIDER_VALUE; +import static io.trino.plugin.hive.TableType.EXTERNAL_TABLE; import static io.trino.spi.connector.SaveMode.FAIL; import static io.trino.spi.security.PrincipalType.USER; import static io.trino.spi.type.BigintType.BIGINT; @@ -171,6 +189,7 @@ public class TestDeltaLakeMetadata private File temporaryCatalogDirectory; private DeltaLakeMetadataFactory deltaLakeMetadataFactory; + private DeltaLakeMetastore deltaLakeMetastore; @BeforeAll public void setUp() @@ -191,6 +210,7 @@ public void setUp() new DeltaLakeSecurityModule(), new DeltaLakeMetastoreModule(), new DeltaLakeModule(), + new TestingDeltaLakeExtensionsModule(), // test setup binder -> { binder.bind(HdfsEnvironment.class).toInstance(HDFS_ENVIRONMENT); @@ -214,13 +234,13 @@ public DeltaLakeMetastore getDeltaLakeMetastore(@RawHiveMetastoreFactory HiveMet deltaLakeMetadataFactory = injector.getInstance(DeltaLakeMetadataFactory.class); - injector.getInstance(DeltaLakeMetastore.class) - 
.createDatabase(Database.builder() - .setDatabaseName(DATABASE_NAME) - .setOwnerName(Optional.of("test")) - .setOwnerType(Optional.of(USER)) - .setLocation(Optional.empty()) - .build()); + deltaLakeMetastore = injector.getInstance(DeltaLakeMetastore.class); + deltaLakeMetastore.createDatabase(Database.builder() + .setDatabaseName(DATABASE_NAME) + .setOwnerName(Optional.of("test")) + .setOwnerType(Optional.of(USER)) + .setLocation(Optional.empty()) + .build()); } @AfterAll @@ -484,6 +504,102 @@ public void testGetInputInfoForUnPartitionedTable() deltaLakeMetadata.cleanupQuery(SESSION); } + /** + * @see deltalake.checksum + * @see deltalake.checksum_missing_latest + * @see deltalake.checksum_without_metadata + * @see deltalake.checksum_without_protocol + */ + @Test + public void testGetTableHandleChecksum() + throws Exception + { + ConnectorSession loadMetadataFromChecksumFileEnabledSession = loadMetadataFromChecksumFileSession(true); + DeltaLakeMetadata checksumFileDeltaLakeMetadata = deltaLakeMetadataFactory.create(loadMetadataFromChecksumFileEnabledSession.getIdentity()); + + ConnectorSession loadMetadataFromChecksumFileDisabledSession = loadMetadataFromChecksumFileSession(false); + DeltaLakeMetadata transactionLogDeltaLakeMetadata = deltaLakeMetadataFactory.create(loadMetadataFromChecksumFileDisabledSession.getIdentity()); + + // For a structurally-valid Delta table, we should see identical metadata and protocol information whether loaded from + // the latest checksum file or from the Delta log. Additionally, if the checksum file for the latest commit is missing + // or does not contain the requisite information, we should smoothly fall back to loading from the transaction log + + for (String fixture : ImmutableList.of( + "deltalake/checksum", + "deltalake/checksum_missing_latest", + "deltalake/checksum_without_metadata", + "deltalake/checksum_without_protocol")) { + SchemaTableName tableName = registerFixtureTable(fixture); + + DeltaLakeTableHandle checksumTableHandle = (DeltaLakeTableHandle) checksumFileDeltaLakeMetadata.getTableHandle( + loadMetadataFromChecksumFileEnabledSession, + tableName, + Optional.empty(), + Optional.empty()); + DeltaLakeTableHandle transactionLogTableHandle = (DeltaLakeTableHandle) transactionLogDeltaLakeMetadata.getTableHandle( + loadMetadataFromChecksumFileDisabledSession, + tableName, + Optional.empty(), + Optional.empty()); + + assertThat(checksumTableHandle.getMetadataEntry()).isEqualTo(transactionLogTableHandle.getMetadataEntry()); + assertThat(checksumTableHandle.getProtocolEntry()).isEqualTo(transactionLogTableHandle.getProtocolEntry()); + + checksumFileDeltaLakeMetadata.cleanupQuery(loadMetadataFromChecksumFileEnabledSession); + transactionLogDeltaLakeMetadata.cleanupQuery(loadMetadataFromChecksumFileDisabledSession); + } + } + + /** + * @see deltalake.checksum_invalid_json + * @see deltalake.checksum_trailing_json_content + * @see deltalake.checksum_invalid_json_mapping + */ + @Test + public void testGetTableHandleFallsBackForMalformedChecksumWhenEnabled() + throws Exception + { + ConnectorSession loadMetadataFromChecksumFileEnabledSession = loadMetadataFromChecksumFileSession(true); + DeltaLakeMetadata checksumFileDeltaLakeMetadata = deltaLakeMetadataFactory.create(loadMetadataFromChecksumFileEnabledSession.getIdentity()); + + ConnectorSession loadMetadataFromChecksumFileDisabledSession = loadMetadataFromChecksumFileSession(false); + DeltaLakeMetadata transactionLogDeltaLakeMetadata = 
deltaLakeMetadataFactory.create(loadMetadataFromChecksumFileDisabledSession.getIdentity()); + + for (String fixture : ImmutableList.of( + "deltalake/checksum_invalid_json", + "deltalake/checksum_trailing_json_content", + "deltalake/checksum_invalid_json_mapping")) { + SchemaTableName tableName = registerFixtureTable(fixture); + + // Malformed checksum files should be treated like missing/insufficient checksum files and should trigger a + // fallback to loading metadata from the Delta log + DeltaLakeTableHandle checksumTableHandle = (DeltaLakeTableHandle) checksumFileDeltaLakeMetadata.getTableHandle( + loadMetadataFromChecksumFileEnabledSession, + tableName, + Optional.empty(), + Optional.empty()); + checksumFileDeltaLakeMetadata.cleanupQuery(loadMetadataFromChecksumFileEnabledSession); + + DeltaLakeTableHandle transactionLogTableHandle = (DeltaLakeTableHandle) transactionLogDeltaLakeMetadata.getTableHandle( + loadMetadataFromChecksumFileDisabledSession, + tableName, + Optional.empty(), + Optional.empty()); + transactionLogDeltaLakeMetadata.cleanupQuery(loadMetadataFromChecksumFileDisabledSession); + + assertThat(checksumTableHandle.getMetadataEntry()).isEqualTo(transactionLogTableHandle.getMetadataEntry()); + assertThat(checksumTableHandle.getProtocolEntry()).isEqualTo(transactionLogTableHandle.getProtocolEntry()); + } + } + + private static ConnectorSession loadMetadataFromChecksumFileSession(boolean enabled) + { + return TestingConnectorSession.builder() + .setPropertyMetadata(new DeltaLakeSessionProperties(new DeltaLakeConfig(), new ParquetReaderConfig(), new ParquetWriterConfig()).getSessionProperties()) + .setPropertyValues(ImmutableMap.of("load_metadata_from_checksum_file", enabled)) + .build(); + } + private static DeltaLakeTableHandle createDeltaLakeTableHandle(Set projectedColumns, Set constrainedColumns) { return new DeltaLakeTableHandle( @@ -554,4 +670,33 @@ private static SchemaTableName newMockSchemaTableName() String randomSuffix = UUID.randomUUID().toString().toLowerCase(ENGLISH).replace("-", ""); return new SchemaTableName(DATABASE_NAME, "table_" + randomSuffix); } + + private SchemaTableName registerFixtureTable(String fixtureResource) + throws Exception + { + SchemaTableName tableName = newMockSchemaTableName(); + Path tableLocation = Files.createTempDirectory(temporaryCatalogDirectory.toPath(), "delta-fixture-"); + Path source = Path.of(getClass().getClassLoader().getResource(fixtureResource).toURI()); + copyDirectoryContents(source, tableLocation); + + String location = tableLocation.toUri().toString(); + Table.Builder table = Table.builder() + .setDatabaseName(tableName.getSchemaName()) + .setTableName(tableName.getTableName()) + .setOwner(Optional.of("test")) + .setTableType(EXTERNAL_TABLE.name()) + .setDataColumns(ImmutableList.of(new Column("col", HIVE_STRING, Optional.empty(), Map.of()))) + .setParameters(ImmutableMap.of( + LOCATION_PROPERTY, location, + TABLE_PROVIDER_PROPERTY, TABLE_PROVIDER_VALUE, + "EXTERNAL", "TRUE")); + + table.getStorageBuilder() + .setStorageFormat(DELTA_STORAGE_FORMAT) + .setSerdeParameters(ImmutableMap.of(PATH_PROPERTY, location)) + .setLocation(location); + + deltaLakeMetastore.createTable(table.build(), new PrincipalPrivileges(ImmutableMultimap.of(), ImmutableMultimap.of())); + return tableName; + } } diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/transactionlog/TestTransactionLogParser.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/transactionlog/TestTransactionLogParser.java index 
fbfd4afd7c6f..6d9d7b411d31 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/transactionlog/TestTransactionLogParser.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/transactionlog/TestTransactionLogParser.java @@ -17,10 +17,14 @@ import io.trino.filesystem.TrinoFileSystem; import org.junit.jupiter.api.Test; +import java.util.Optional; + import static io.trino.hdfs.HdfsTestUtils.HDFS_FILE_SYSTEM_FACTORY; import static io.trino.plugin.deltalake.DeltaTestingConnectorSession.SESSION; +import static io.trino.plugin.deltalake.transactionlog.TransactionLogParser.findLatestCommitVersion; import static io.trino.plugin.deltalake.transactionlog.TransactionLogParser.getMandatoryCurrentVersion; import static io.trino.plugin.deltalake.transactionlog.TransactionLogParser.readPartitionTimestampWithZone; +import static io.trino.plugin.deltalake.transactionlog.TransactionLogParser.readVersionChecksumFile; import static org.assertj.core.api.Assertions.assertThat; public class TestTransactionLogParser @@ -60,4 +64,88 @@ void testReadPartitionTimestampWithZoneIso8601() assertThat(readPartitionTimestampWithZone("1970-01-01T00:00:00.000000Z")).isEqualTo(0L); assertThat(readPartitionTimestampWithZone("1970-01-01T01:00:00.000000+01:00")).isEqualTo(0L); } + + /** + * @see deltalake.checksum + */ + @Test + public void testFindLatestCommitVersion() + throws Exception + { + TrinoFileSystem fileSystem = HDFS_FILE_SYSTEM_FACTORY.create(SESSION); + String tableLocation = getClass().getClassLoader().getResource("deltalake/checksum").toURI().toString(); + + assertThat(findLatestCommitVersion(fileSystem, tableLocation)).hasValue(1L); + } + + /** + * @see deltalake.checksum + */ + @Test + public void testReadVersionChecksum() + throws Exception + { + TrinoFileSystem fileSystem = HDFS_FILE_SYSTEM_FACTORY.create(SESSION); + String tableLocation = getClass().getClassLoader().getResource("deltalake/checksum").toURI().toString(); + + DeltaLakeVersionChecksum checksum = readVersionChecksumFile(fileSystem, tableLocation, 1).orElseThrow(); + assertThat(checksum.metadata().orElseThrow().getId()).isEqualTo("a953d1d0-a84e-4ca6-bb2a-ed181213a3f0"); + assertThat(checksum.metadata().orElseThrow().getLowercasePartitionColumns()).isEmpty(); + assertThat(checksum.metadata().orElseThrow().getConfiguration()) + .containsEntry("delta.checkpointInterval", "1") + .hasSize(1); + assertThat(checksum.protocol()).hasValue(new ProtocolEntry(1, 2, Optional.empty(), Optional.empty())); + } + + /** + * @see deltalake.checksum_missing_latest + */ + @Test + public void testReadVersionChecksumMissingFile() + throws Exception + { + TrinoFileSystem fileSystem = HDFS_FILE_SYSTEM_FACTORY.create(SESSION); + String tableLocation = getClass().getClassLoader().getResource("deltalake/checksum_missing_latest").toURI().toString(); + + assertThat(readVersionChecksumFile(fileSystem, tableLocation, 1)).isEmpty(); + } + + /** + * @see deltalake.checksum_invalid_json + */ + @Test + public void testReadVersionChecksumInvalidJson() + throws Exception + { + TrinoFileSystem fileSystem = HDFS_FILE_SYSTEM_FACTORY.create(SESSION); + String tableLocation = getClass().getClassLoader().getResource("deltalake/checksum_invalid_json").toURI().toString(); + + assertThat(readVersionChecksumFile(fileSystem, tableLocation, 1)).isEmpty(); + } + + /** + * @see deltalake.checksum_invalid_json_mapping + */ + @Test + public void testReadVersionChecksumInvalidJsonMapping() + throws Exception + { + TrinoFileSystem fileSystem = 
HDFS_FILE_SYSTEM_FACTORY.create(SESSION); + String tableLocation = getClass().getClassLoader().getResource("deltalake/checksum_invalid_json_mapping").toURI().toString(); + + assertThat(readVersionChecksumFile(fileSystem, tableLocation, 1)).isEmpty(); + } + + /** + * @see deltalake.checksum_trailing_json_content + */ + @Test + public void testReadVersionChecksumJsonWithTrailingContent() + throws Exception + { + TrinoFileSystem fileSystem = HDFS_FILE_SYSTEM_FACTORY.create(SESSION); + String tableLocation = getClass().getClassLoader().getResource("deltalake/checksum_trailing_json_content").toURI().toString(); + + assertThat(readVersionChecksumFile(fileSystem, tableLocation, 1)).isEmpty(); + } } diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum/README.md b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum/README.md new file mode 100644 index 000000000000..aa9ef0854dde --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum/README.md @@ -0,0 +1,18 @@ +Data generated using Apache Spark 4.0.0 & Delta Lake OSS 4.0.0 + +This test resource is used to verify whether reading Delta metadata and protocol information from +checksum files (rather than from the Delta log) works as expected. + +From https://github.com/delta-io/delta/blob/master/PROTOCOL.md#version-checksum-file: + +> The Delta transaction log must remain an append-only log. To enable the detection of +> non-compliant modifications to Delta files, writers can optionally emit an auxiliary file with +> every commit, which contains important information about the state of the table as of that +> version. + +Spark SQL: +``` +CREATE SCHEMA checksum LOCATION 'file:///tmp/checksum'; +CREATE TABLE checksum.checksum (data INTEGER) USING DELTA TBLPROPERTIES (delta.checkpointInterval = 1); +INSERT INTO checksum.checksum values 1; +``` diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum/_delta_log/00000000000000000000.crc b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum/_delta_log/00000000000000000000.crc new file mode 100644 index 000000000000..49754b816a47 --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +{"txnId":"97fef1b5-2b45-4c53-ad38-03be8e6e6739","tableSizeBytes":0,"numFiles":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"a953d1d0-a84e-4ca6-bb2a-ed181213a3f0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"data\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"1"},"createdTime":1771884417572},"protocol":{"minReaderVersion":1,"minWriterVersion":2},"allFiles":[]} diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum/_delta_log/00000000000000000000.json b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum/_delta_log/00000000000000000000.json new file mode 100644 index 000000000000..13db1d4e6d6d --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1771884417678,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"true","properties":"{\"delta.checkpointInterval\":\"1\"}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/4.0.0 
Delta-Lake/4.0.0","txnId":"97fef1b5-2b45-4c53-ad38-03be8e6e6739"}} +{"metaData":{"id":"a953d1d0-a84e-4ca6-bb2a-ed181213a3f0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"data\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"1"},"createdTime":1771884417572}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum/_delta_log/00000000000000000001.checkpoint.parquet b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum/_delta_log/00000000000000000001.checkpoint.parquet new file mode 100644 index 0000000000000000000000000000000000000000..db6bee1c51638183be47582d6d2dfcf624e8d0be GIT binary patch literal 17389 zcmeHP4Qv$G5#Bu?XM@4~g!ecHhZqa4Cc);;*v1eS2&RsSC`FXGk)kwR-|dl8?#H=X z6DUGT2u10YRwzwSY9WHs){3l(S}h_!jR-PMpg(5dfBSb0c%-h|! zZ};}jcXr56(_mot{d_ZTX5P%~ycag!yD3B@QblHw>mz^odsz+f!K;r{NK(1KOp+w{ z{VDVxkjX)rT>R}TI|Kq7Rx<(~sRQWrDrv1DD9aZcdV)%*MGZw7n!-KH8k$>snxHAv z+_0=Qx;zq%HY?3-Ee+vdFxZ$$D!ski8)53>eX0g*-cII`x1YcIZ?8$kCrQ(S9<~gF z7L2MPwPfh*(<2^XR8_M+%8O&+5r)kpb>yjk^sh2Ufk>ve87P02VI2Ht47>K>=y6eW zd|cblvwIJ~SQs~h%qGw5fAek58{k#c7pZq=r-JNTzN@XmMR4 zmh4vIeJZ@|sEa6?0qUK02wOFTb4#b zkzj+;)~q%(hn3cbo}MP9L5+mkLQSEi%F<}iFq+}#-}}9YV>sd&xcyPih58qedXl}7 zeXUa9W7b;6M^GvUJ}hU6n*Zr*15Zsihl0PcY5q7jkRpbik;zG!?0)z4zi>S8InkF5 z5j+H>chqUyd)2xXbr~()7uM<)*F|G$9Cz76u<4Y9Ja;zRUW;cgIuuK4>J~K(V{m^d zJ#m-^vsBo3s;20>aL3M_593B-3;D@M&xnrGwvZ>;LN<}jESCM2%}oWB5@u{8(=axL5~_q_-+@pc5c>k$>$w`c#zwa9fuUHzXN;NfHKB3?2hN z`pqC8Yef&Rhi7?Ipy2i|A3Z$4d53Y8^-O3L3=FVv%ddh-W- zj8mC_Tl}zFx|^$K%GG`eeu$@<`tYX}n3YOx_Eh^<>q!Ho0>SZR4HJh*<08Cl5njpT zTtvVpRd-^$TSqtRfF4*$g_%$Yk4|J9X66&ZqsuTEqzhqwY|hYl9CAmNyS>=q>;Xk} zvJS_L&#}D~`N?Ica5EZCDM8*P_9rI}kt(R3@otJ@b`~>A+;L9HG1apUXHR~1I^BTK zB0i_~!mb2Vzo3{vT;#AZRanqGfthk68e2~s65NKnKLNLq$Du^^9#Ecc;)SCv24X$j znqk=GngZAOyM}hL7kqj~=8vaj za#m(z_CScJq5I3odTYw1GP1#X8}hO@9Cp)7HbMtJo}EP27?1mW;^YyZIC-#~thAu6 z*xm*zSRbN&U_HJxnYUiF9uH38ts~ZBf2D{%TuFxYMGdk=4NVi}Fsi>HwV^65h*9ft zXu7zJ8`H&QoSwmFWM_(~r)_W9S$x=O>oI$q)oMLvXIrh-W0sJ8`Wm{B(|#%vyI_0k zpF_HtXsdAK=p2#XV}9{%Z#DTu2R+IJvum!{du}e-rS}FA=zI-%kikL@Q$q*u;2OVv z2kA02eqtVf%-$(t4cp!ZYDMlZ+ujD|>&u5-b7N!I)C~B)#~#T*ww+MORH?<|FVnUq ziBBR^7kd1Ypt`dzvQS!I1I9oV(_pIpDNj7avwktaws`%MA5*tK0vUdvO1wB(@|Qu2NtQ9fdd(JO#G8qI zU)8yokw`^i(O6hP^v$sZG)pjI3i{TLN0sCj6?yY&%9hLor~*V4I7pQ(K@DrEG{}od zsWJmE$`*Aqd|R0;BGXkUfp4YPCF8NAx^}DC<$*3=DjLnG8f1v@Qi)4hmt3=5Q!|EK z!%8|5ODgf0w%urfV#lV`6YG)@b(`KYxkt&U*c8#<=S0+~(ihjDsUww0#I$ZxQ^?D# zaJ(-A-%+vTmW}Du)>uSMoBS~OW;8|17%i|;Us_G1wyI=St}dAw%rqr|;)c12-dve8 zCCpa0X(})s>5RqIZyFztsE%7>4_{g(0pk_;UA-1;mOz~WmEwursUDyR@_MjUq*SB&+g0!Zl`bKL&sv%RWTPh@9bv^oe-NzFT_0e{^I#AIL zNltYQR4PD3Qagoo#wZU+bl)m+JH0%gG)f5K6+q0ez+t-VFpDgZ!Bu27eXh<0M2sdJ z=ovXL(y>*9(i0{UUlYV27JC@P8JV7y1E=J6e+Qan16C@=S*m~oj!y-Idh5S1yoj3K zBl9APMBiCOYUyYlIqYG&o?5*S?4_#=Wb1U?R`AB4&4k`X*9Z2O+4Tm7WPBxH>P|mi zMQZ55g`}x?_(^((fQEWM-4M9sPQdt}0b^Ya6wWOq&30o&>%e%+7)6{RZ@X3ev7SIn zyd-3`3i4tm%?e2LSF6DgcL&IZ98x){2_w6nZVcQ+vSu;KdP1_ft?pz`AldUKS;^dv zutFjAVFXvwHG#8}+My%`KR|;0?G8GOpohMes~DC);PB6Ag(lDM-wUdHVUfXe0Z=jFiq! 
zPnpPu<9B1XQb=4m@=KHLk`{T+t^wq0#UKj_Fh{;v0QnFizkHV=4IZq(I;_l@109@_ z%Oa~`O7tk=4BzF9SAdf*Ylp|R55nBWa=ESztYa85uI6mgI-z#Hg&`e zauUn#8Old89hL*bmF@lqCR%}YJ$1cBu;8=4RQw9Ql@5w=_2wFgtC#M!;;NpA8Q)u0 z-h!#v&@@+Vv$`%l*a6z!*XS@y11IuMe>*N;Mr!~Ce15w(8_-!&{LF&Hc|FBzI{=h@6l^r}0(fnq(i#2)^}q~Hi# zH{5A->og?#!df75eTgGAF{oJC35xji3_H7mh>xrVVH^%}Vdx>sKp86nB6fDB9b5M% zorWY%BaUAjUz)`z$-}ET$No@A?J^=8fWTb3!3w zy8!qL2!7QBH$$O2(>c+QaeiBV&JQE_fhK#*YRd!e42X>L0^lbR{EP#<-T-&RM8@ds zTt-;|T%s2c>PC~h^C=h>8I$>eM-g~%sokdQd1VADBFs(fmFYe?kR|QK9VQhw3SvxA zM^nY2i4mYn_^{g3N#@bvrR10_W|r0xS(S(T!hUMkfi2+wq7(Md{+4{g63asz@IoQL z;|O@t0ocZ+@6#(wyrRTFH_`4u|D1Mykj5WCoaW&gZxe-5g+kHK*d#iFls{@A>kXlx zV}H;|*sNQ*e1zbK+~}MfS?Omy?6gjz*O11sR{ptABn^jeKjwF(VxIg-^tt6T=)^joZCuO=z4xF3Yp8tq8e-EfF z@4;^VS)#y#W>E(Z#Oc-hz_(>rkY`QbW|vsL#RxlWGkq{{z6J^vA@~9I*xw>S4j|j2vZKQa2D%_jy>pLI z@G-u$>v?%Or>x9quccjq>p+_wZR>vmP2tU*)*DFcrFOECEzy;htyaxxotOu-umGd~ ziL|UYcUtF>)a#X=uBFelnm77@NLzs4TP7FzKzCFu76T< zW3UnKUM|WRk$F6B-tBKpr=DmuZ+`lJx1g9%1*Z#eeN+!=#f)dC&3p!4QiX%1u`Zx4 zW;a*yE{(As3m~2k9Bv>L|tjxah~yy7st93U?KFR zQgKy5TP;$Jc$I)p|GcDW9>I6B%LuvbzQ9z;Pb{|UE=MA@#E-MA*}iKOK^EF zo?DwrxIQrf6rpl$p_FK&)*X$~AWHBc{MJWl6f9}wxh67RxZHas7E-AWoRz6donC_M z^pkK-fN$IaJ7r=H*xaK0mJFv|qWAk$yq4Ql9E+ino#6jcbTJk6e_v`{MYBw!WRLuR zUux8Giisa3I56{u7*|a(E-YrHoF&J2ymH2gl{{OxpEiCFz39m27qepX${U|lGUvt( zIS#<7ekK`n2^JwWjP*slELQvOUN zZly=Ul|jx+QNg+drJ2mnTmF%9K*Bd7QWxo{3f%t-Mq21VTRelDf zOVF^r)|AUz0X^$e`IM6Pvio)_8OFFZYVlEIemJI(9^NVW?+1s2|6oKzB3)}#ZE8b} zp_ay`+WMZpSUghO*0Nkx!lBldo?u&adDF6}5^8Q)9t(Tg66c#z^%L^kQB2Gg%3RiyML2>uxV zg?~$&et7aSyYuGFdklk1|2ZuA#7itWOJ21KW5>Ro+wHXJSV(3Qn> The Delta transaction log must remain an append-only log. To enable the detection of +> non-compliant modifications to Delta files, writers can optionally emit an auxiliary file with +> every commit, which contains important information about the state of the table as of that +> version. + +Spark SQL: +``` +CREATE SCHEMA checksum_invalid_json LOCATION 'file:///tmp/checksum_invalid_json'; +CREATE TABLE checksum_invalid_json.checksum_invalid_json (data INTEGER) USING DELTA TBLPROPERTIES (delta.checkpointInterval = 1); +INSERT INTO checksum_invalid_json.checksum_invalid_json values 1; +``` + +Subsequently, the latest checksum file (`_delta_log/00000000000000000001.crc`) was overwritten with +invalid JSON by hand. This fixture is otherwise identical to `deltalake/checksum`. 
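All three corrupted fixtures (`checksum_invalid_json`, `checksum_trailing_json_content`, `checksum_invalid_json_mapping`) exercise the same property: an unreadable checksum degrades to the transaction-log path instead of failing the query. A tiny, self-contained sketch of that first-present fallback shape; the loader stand-ins and the `firstPresent` helper are illustrative only, not connector code:
```
import java.util.Optional;
import java.util.function.Supplier;

public class ChecksumFallbackSketch
{
    // Return the first loader result that is present; an unreadable checksum
    // file behaves exactly like a missing one
    @SafeVarargs
    static <T> Optional<T> firstPresent(Supplier<Optional<T>>... loaders)
    {
        for (Supplier<Optional<T>> loader : loaders) {
            Optional<T> value = loader.get();
            if (value.isPresent()) {
                return value;
            }
        }
        return Optional.empty();
    }

    public static void main(String[] args)
    {
        // Stand-ins: the .crc parse fails (empty), the log replay succeeds
        Optional<String> metadata = firstPresent(
                Optional::empty,
                () -> Optional.of("metadata-from-delta-log"));
        System.out.println(metadata.orElseThrow()); // metadata-from-delta-log
    }
}
```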
diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/_delta_log/00000000000000000000.crc b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/_delta_log/00000000000000000000.crc new file mode 100644 index 000000000000..49754b816a47 --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +{"txnId":"97fef1b5-2b45-4c53-ad38-03be8e6e6739","tableSizeBytes":0,"numFiles":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"a953d1d0-a84e-4ca6-bb2a-ed181213a3f0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"data\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"1"},"createdTime":1771884417572},"protocol":{"minReaderVersion":1,"minWriterVersion":2},"allFiles":[]} diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/_delta_log/00000000000000000000.json b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/_delta_log/00000000000000000000.json new file mode 100644 index 000000000000..13db1d4e6d6d --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1771884417678,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"true","properties":"{\"delta.checkpointInterval\":\"1\"}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/4.0.0 Delta-Lake/4.0.0","txnId":"97fef1b5-2b45-4c53-ad38-03be8e6e6739"}} +{"metaData":{"id":"a953d1d0-a84e-4ca6-bb2a-ed181213a3f0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"data\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"1"},"createdTime":1771884417572}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/_delta_log/00000000000000000001.checkpoint.parquet b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json/_delta_log/00000000000000000001.checkpoint.parquet new file mode 100644 index 0000000000000000000000000000000000000000..db6bee1c51638183be47582d6d2dfcf624e8d0be GIT binary patch literal 17389 zcmeHP4Qv$G5#Bu?XM@4~g!ecHhZqa4Cc);;*v1eS2&RsSC`FXGk)kwR-|dl8?#H=X z6DUGT2u10YRwzwSY9WHs){3l(S}h_!jR-PMpg(5dfBSb0c%-h|! 
zZ};}jcXr56(_mot{d_ZTX5P%~ycag!yD3B@QblHw>mz^odsz+f!K;r{NK(1KOp+w{ z{VDVxkjX)rT>R}TI|Kq7Rx<(~sRQWrDrv1DD9aZcdV)%*MGZw7n!-KH8k$>snxHAv z+_0=Qx;zq%HY?3-Ee+vdFxZ$$D!ski8)53>eX0g*-cII`x1YcIZ?8$kCrQ(S9<~gF z7L2MPwPfh*(<2^XR8_M+%8O&+5r)kpb>yjk^sh2Ufk>ve87P02VI2Ht47>K>=y6eW zd|cblvwIJ~SQs~h%qGw5fAek58{k#c7pZq=r-JNTzN@XmMR4 zmh4vIeJZ@|sEa6?0qUK02wOFTb4#b zkzj+;)~q%(hn3cbo}MP9L5+mkLQSEi%F<}iFq+}#-}}9YV>sd&xcyPih58qedXl}7 zeXUa9W7b;6M^GvUJ}hU6n*Zr*15Zsihl0PcY5q7jkRpbik;zG!?0)z4zi>S8InkF5 z5j+H>chqUyd)2xXbr~()7uM<)*F|G$9Cz76u<4Y9Ja;zRUW;cgIuuK4>J~K(V{m^d zJ#m-^vsBo3s;20>aL3M_593B-3;D@M&xnrGwvZ>;LN<}jESCM2%}oWB5@u{8(=axL5~_q_-+@pc5c>k$>$w`c#zwa9fuUHzXN;NfHKB3?2hN z`pqC8Yef&Rhi7?Ipy2i|A3Z$4d53Y8^-O3L3=FVv%ddh-W- zj8mC_Tl}zFx|^$K%GG`eeu$@<`tYX}n3YOx_Eh^<>q!Ho0>SZR4HJh*<08Cl5njpT zTtvVpRd-^$TSqtRfF4*$g_%$Yk4|J9X66&ZqsuTEqzhqwY|hYl9CAmNyS>=q>;Xk} zvJS_L&#}D~`N?Ica5EZCDM8*P_9rI}kt(R3@otJ@b`~>A+;L9HG1apUXHR~1I^BTK zB0i_~!mb2Vzo3{vT;#AZRanqGfthk68e2~s65NKnKLNLq$Du^^9#Ecc;)SCv24X$j znqk=GngZAOyM}hL7kqj~=8vaj za#m(z_CScJq5I3odTYw1GP1#X8}hO@9Cp)7HbMtJo}EP27?1mW;^YyZIC-#~thAu6 z*xm*zSRbN&U_HJxnYUiF9uH38ts~ZBf2D{%TuFxYMGdk=4NVi}Fsi>HwV^65h*9ft zXu7zJ8`H&QoSwmFWM_(~r)_W9S$x=O>oI$q)oMLvXIrh-W0sJ8`Wm{B(|#%vyI_0k zpF_HtXsdAK=p2#XV}9{%Z#DTu2R+IJvum!{du}e-rS}FA=zI-%kikL@Q$q*u;2OVv z2kA02eqtVf%-$(t4cp!ZYDMlZ+ujD|>&u5-b7N!I)C~B)#~#T*ww+MORH?<|FVnUq ziBBR^7kd1Ypt`dzvQS!I1I9oV(_pIpDNj7avwktaws`%MA5*tK0vUdvO1wB(@|Qu2NtQ9fdd(JO#G8qI zU)8yokw`^i(O6hP^v$sZG)pjI3i{TLN0sCj6?yY&%9hLor~*V4I7pQ(K@DrEG{}od zsWJmE$`*Aqd|R0;BGXkUfp4YPCF8NAx^}DC<$*3=DjLnG8f1v@Qi)4hmt3=5Q!|EK z!%8|5ODgf0w%urfV#lV`6YG)@b(`KYxkt&U*c8#<=S0+~(ihjDsUww0#I$ZxQ^?D# zaJ(-A-%+vTmW}Du)>uSMoBS~OW;8|17%i|;Us_G1wyI=St}dAw%rqr|;)c12-dve8 zCCpa0X(})s>5RqIZyFztsE%7>4_{g(0pk_;UA-1;mOz~WmEwursUDyR@_MjUq*SB&+g0!Zl`bKL&sv%RWTPh@9bv^oe-NzFT_0e{^I#AIL zNltYQR4PD3Qagoo#wZU+bl)m+JH0%gG)f5K6+q0ez+t-VFpDgZ!Bu27eXh<0M2sdJ z=ovXL(y>*9(i0{UUlYV27JC@P8JV7y1E=J6e+Qan16C@=S*m~oj!y-Idh5S1yoj3K zBl9APMBiCOYUyYlIqYG&o?5*S?4_#=Wb1U?R`AB4&4k`X*9Z2O+4Tm7WPBxH>P|mi zMQZ55g`}x?_(^((fQEWM-4M9sPQdt}0b^Ya6wWOq&30o&>%e%+7)6{RZ@X3ev7SIn zyd-3`3i4tm%?e2LSF6DgcL&IZ98x){2_w6nZVcQ+vSu;KdP1_ft?pz`AldUKS;^dv zutFjAVFXvwHG#8}+My%`KR|;0?G8GOpohMes~DC);PB6Ag(lDM-wUdHVUfXe0Z=jFiq! 
zPnpPu<9B1XQb=4m@=KHLk`{T+t^wq0#UKj_Fh{;v0QnFizkHV=4IZq(I;_l@109@_ z%Oa~`O7tk=4BzF9SAdf*Ylp|R55nBWa=ESztYa85uI6mgI-z#Hg&`e zauUn#8Old89hL*bmF@lqCR%}YJ$1cBu;8=4RQw9Ql@5w=_2wFgtC#M!;;NpA8Q)u0 z-h!#v&@@+Vv$`%l*a6z!*XS@y11IuMe>*N;Mr!~Ce15w(8_-!&{LF&Hc|FBzI{=h@6l^r}0(fnq(i#2)^}q~Hi# zH{5A->og?#!df75eTgGAF{oJC35xji3_H7mh>xrVVH^%}Vdx>sKp86nB6fDB9b5M% zorWY%BaUAjUz)`z$-}ET$No@A?J^=8fWTb3!3w zy8!qL2!7QBH$$O2(>c+QaeiBV&JQE_fhK#*YRd!e42X>L0^lbR{EP#<-T-&RM8@ds zTt-;|T%s2c>PC~h^C=h>8I$>eM-g~%sokdQd1VADBFs(fmFYe?kR|QK9VQhw3SvxA zM^nY2i4mYn_^{g3N#@bvrR10_W|r0xS(S(T!hUMkfi2+wq7(Md{+4{g63asz@IoQL z;|O@t0ocZ+@6#(wyrRTFH_`4u|D1Mykj5WCoaW&gZxe-5g+kHK*d#iFls{@A>kXlx zV}H;|*sNQ*e1zbK+~}MfS?Omy?6gjz*O11sR{ptABn^jeKjwF(VxIg-^tt6T=)^joZCuO=z4xF3Yp8tq8e-EfF z@4;^VS)#y#W>E(Z#Oc-hz_(>rkY`QbW|vsL#RxlWGkq{{z6J^vA@~9I*xw>S4j|j2vZKQa2D%_jy>pLI z@G-u$>v?%Or>x9quccjq>p+_wZR>vmP2tU*)*DFcrFOECEzy;htyaxxotOu-umGd~ ziL|UYcUtF>)a#X=uBFelnm77@NLzs4TP7FzKzCFu76T< zW3UnKUM|WRk$F6B-tBKpr=DmuZ+`lJx1g9%1*Z#eeN+!=#f)dC&3p!4QiX%1u`Zx4 zW;a*yE{(As3m~2k9Bv>L|tjxah~yy7st93U?KFR zQgKy5TP;$Jc$I)p|GcDW9>I6B%LuvbzQ9z;Pb{|UE=MA@#E-MA*}iKOK^EF zo?DwrxIQrf6rpl$p_FK&)*X$~AWHBc{MJWl6f9}wxh67RxZHas7E-AWoRz6donC_M z^pkK-fN$IaJ7r=H*xaK0mJFv|qWAk$yq4Ql9E+ino#6jcbTJk6e_v`{MYBw!WRLuR zUux8Giisa3I56{u7*|a(E-YrHoF&J2ymH2gl{{OxpEiCFz39m27qepX${U|lGUvt( zIS#<7ekK`n2^JwWjP*slELQvOUN zZly=Ul|jx+QNg+drJ2mnTmF%9K*Bd7QWxo{3f%t-Mq21VTRelDf zOVF^r)|AUz0X^$e`IM6Pvio)_8OFFZYVlEIemJI(9^NVW?+1s2|6oKzB3)}#ZE8b} zp_ay`+WMZpSUghO*0Nkx!lBldo?u&adDF6}5^8Q)9t(Tg66c#z^%L^kQB2Gg%3RiyML2>uxV zg?~$&et7aSyYuGFdklk1|2ZuA#7itWOJ21KW5>Ro+wHXJSV(3Qn> The Delta transaction log must remain an append-only log. To enable the detection of +> non-compliant modifications to Delta files, writers can optionally emit an auxiliary file with +> every commit, which contains important information about the state of the table as of that +> version. + +Spark SQL: +``` +CREATE SCHEMA checksum LOCATION 'file:///tmp/checksum'; +CREATE TABLE checksum.checksum (data INTEGER) USING DELTA TBLPROPERTIES (delta.checkpointInterval = 1); +INSERT INTO checksum.checksum values 1; +``` + +Subsequently, the latest checksum file (`_delta_log/00000000000000000001.crc`) was overwritten by +hand with a JSON object with an invalid schema. This fixture is otherwise identical to +`deltalake/checksum`. 
\ No newline at end of file diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/_delta_log/00000000000000000000.crc b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/_delta_log/00000000000000000000.crc new file mode 100644 index 000000000000..49754b816a47 --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +{"txnId":"97fef1b5-2b45-4c53-ad38-03be8e6e6739","tableSizeBytes":0,"numFiles":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"a953d1d0-a84e-4ca6-bb2a-ed181213a3f0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"data\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"1"},"createdTime":1771884417572},"protocol":{"minReaderVersion":1,"minWriterVersion":2},"allFiles":[]} diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/_delta_log/00000000000000000000.json b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/_delta_log/00000000000000000000.json new file mode 100644 index 000000000000..13db1d4e6d6d --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1771884417678,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"true","properties":"{\"delta.checkpointInterval\":\"1\"}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/4.0.0 Delta-Lake/4.0.0","txnId":"97fef1b5-2b45-4c53-ad38-03be8e6e6739"}} +{"metaData":{"id":"a953d1d0-a84e-4ca6-bb2a-ed181213a3f0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"data\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"1"},"createdTime":1771884417572}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/_delta_log/00000000000000000001.checkpoint.parquet b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_invalid_json_mapping/_delta_log/00000000000000000001.checkpoint.parquet new file mode 100644 index 0000000000000000000000000000000000000000..db6bee1c51638183be47582d6d2dfcf624e8d0be GIT binary patch literal 17389 zcmeHP4Qv$G5#Bu?XM@4~g!ecHhZqa4Cc);;*v1eS2&RsSC`FXGk)kwR-|dl8?#H=X z6DUGT2u10YRwzwSY9WHs){3l(S}h_!jR-PMpg(5dfBSb0c%-h|! 
zZ};}jcXr56(_mot{d_ZTX5P%~ycag!yD3B@QblHw>mz^odsz+f!K;r{NK(1KOp+w{ z{VDVxkjX)rT>R}TI|Kq7Rx<(~sRQWrDrv1DD9aZcdV)%*MGZw7n!-KH8k$>snxHAv z+_0=Qx;zq%HY?3-Ee+vdFxZ$$D!ski8)53>eX0g*-cII`x1YcIZ?8$kCrQ(S9<~gF z7L2MPwPfh*(<2^XR8_M+%8O&+5r)kpb>yjk^sh2Ufk>ve87P02VI2Ht47>K>=y6eW zd|cblvwIJ~SQs~h%qGw5fAek58{k#c7pZq=r-JNTzN@XmMR4 zmh4vIeJZ@|sEa6?0qUK02wOFTb4#b zkzj+;)~q%(hn3cbo}MP9L5+mkLQSEi%F<}iFq+}#-}}9YV>sd&xcyPih58qedXl}7 zeXUa9W7b;6M^GvUJ}hU6n*Zr*15Zsihl0PcY5q7jkRpbik;zG!?0)z4zi>S8InkF5 z5j+H>chqUyd)2xXbr~()7uM<)*F|G$9Cz76u<4Y9Ja;zRUW;cgIuuK4>J~K(V{m^d zJ#m-^vsBo3s;20>aL3M_593B-3;D@M&xnrGwvZ>;LN<}jESCM2%}oWB5@u{8(=axL5~_q_-+@pc5c>k$>$w`c#zwa9fuUHzXN;NfHKB3?2hN z`pqC8Yef&Rhi7?Ipy2i|A3Z$4d53Y8^-O3L3=FVv%ddh-W- zj8mC_Tl}zFx|^$K%GG`eeu$@<`tYX}n3YOx_Eh^<>q!Ho0>SZR4HJh*<08Cl5njpT zTtvVpRd-^$TSqtRfF4*$g_%$Yk4|J9X66&ZqsuTEqzhqwY|hYl9CAmNyS>=q>;Xk} zvJS_L&#}D~`N?Ica5EZCDM8*P_9rI}kt(R3@otJ@b`~>A+;L9HG1apUXHR~1I^BTK zB0i_~!mb2Vzo3{vT;#AZRanqGfthk68e2~s65NKnKLNLq$Du^^9#Ecc;)SCv24X$j znqk=GngZAOyM}hL7kqj~=8vaj za#m(z_CScJq5I3odTYw1GP1#X8}hO@9Cp)7HbMtJo}EP27?1mW;^YyZIC-#~thAu6 z*xm*zSRbN&U_HJxnYUiF9uH38ts~ZBf2D{%TuFxYMGdk=4NVi}Fsi>HwV^65h*9ft zXu7zJ8`H&QoSwmFWM_(~r)_W9S$x=O>oI$q)oMLvXIrh-W0sJ8`Wm{B(|#%vyI_0k zpF_HtXsdAK=p2#XV}9{%Z#DTu2R+IJvum!{du}e-rS}FA=zI-%kikL@Q$q*u;2OVv z2kA02eqtVf%-$(t4cp!ZYDMlZ+ujD|>&u5-b7N!I)C~B)#~#T*ww+MORH?<|FVnUq ziBBR^7kd1Ypt`dzvQS!I1I9oV(_pIpDNj7avwktaws`%MA5*tK0vUdvO1wB(@|Qu2NtQ9fdd(JO#G8qI zU)8yokw`^i(O6hP^v$sZG)pjI3i{TLN0sCj6?yY&%9hLor~*V4I7pQ(K@DrEG{}od zsWJmE$`*Aqd|R0;BGXkUfp4YPCF8NAx^}DC<$*3=DjLnG8f1v@Qi)4hmt3=5Q!|EK z!%8|5ODgf0w%urfV#lV`6YG)@b(`KYxkt&U*c8#<=S0+~(ihjDsUww0#I$ZxQ^?D# zaJ(-A-%+vTmW}Du)>uSMoBS~OW;8|17%i|;Us_G1wyI=St}dAw%rqr|;)c12-dve8 zCCpa0X(})s>5RqIZyFztsE%7>4_{g(0pk_;UA-1;mOz~WmEwursUDyR@_MjUq*SB&+g0!Zl`bKL&sv%RWTPh@9bv^oe-NzFT_0e{^I#AIL zNltYQR4PD3Qagoo#wZU+bl)m+JH0%gG)f5K6+q0ez+t-VFpDgZ!Bu27eXh<0M2sdJ z=ovXL(y>*9(i0{UUlYV27JC@P8JV7y1E=J6e+Qan16C@=S*m~oj!y-Idh5S1yoj3K zBl9APMBiCOYUyYlIqYG&o?5*S?4_#=Wb1U?R`AB4&4k`X*9Z2O+4Tm7WPBxH>P|mi zMQZ55g`}x?_(^((fQEWM-4M9sPQdt}0b^Ya6wWOq&30o&>%e%+7)6{RZ@X3ev7SIn zyd-3`3i4tm%?e2LSF6DgcL&IZ98x){2_w6nZVcQ+vSu;KdP1_ft?pz`AldUKS;^dv zutFjAVFXvwHG#8}+My%`KR|;0?G8GOpohMes~DC);PB6Ag(lDM-wUdHVUfXe0Z=jFiq! 
zPnpPu<9B1XQb=4m@=KHLk`{T+t^wq0#UKj_Fh{;v0QnFizkHV=4IZq(I;_l@109@_ z%Oa~`O7tk=4BzF9SAdf*Ylp|R55nBWa=ESztYa85uI6mgI-z#Hg&`e zauUn#8Old89hL*bmF@lqCR%}YJ$1cBu;8=4RQw9Ql@5w=_2wFgtC#M!;;NpA8Q)u0 z-h!#v&@@+Vv$`%l*a6z!*XS@y11IuMe>*N;Mr!~Ce15w(8_-!&{LF&Hc|FBzI{=h@6l^r}0(fnq(i#2)^}q~Hi# zH{5A->og?#!df75eTgGAF{oJC35xji3_H7mh>xrVVH^%}Vdx>sKp86nB6fDB9b5M% zorWY%BaUAjUz)`z$-}ET$No@A?J^=8fWTb3!3w zy8!qL2!7QBH$$O2(>c+QaeiBV&JQE_fhK#*YRd!e42X>L0^lbR{EP#<-T-&RM8@ds zTt-;|T%s2c>PC~h^C=h>8I$>eM-g~%sokdQd1VADBFs(fmFYe?kR|QK9VQhw3SvxA zM^nY2i4mYn_^{g3N#@bvrR10_W|r0xS(S(T!hUMkfi2+wq7(Md{+4{g63asz@IoQL z;|O@t0ocZ+@6#(wyrRTFH_`4u|D1Mykj5WCoaW&gZxe-5g+kHK*d#iFls{@A>kXlx zV}H;|*sNQ*e1zbK+~}MfS?Omy?6gjz*O11sR{ptABn^jeKjwF(VxIg-^tt6T=)^joZCuO=z4xF3Yp8tq8e-EfF z@4;^VS)#y#W>E(Z#Oc-hz_(>rkY`QbW|vsL#RxlWGkq{{z6J^vA@~9I*xw>S4j|j2vZKQa2D%_jy>pLI z@G-u$>v?%Or>x9quccjq>p+_wZR>vmP2tU*)*DFcrFOECEzy;htyaxxotOu-umGd~ ziL|UYcUtF>)a#X=uBFelnm77@NLzs4TP7FzKzCFu76T< zW3UnKUM|WRk$F6B-tBKpr=DmuZ+`lJx1g9%1*Z#eeN+!=#f)dC&3p!4QiX%1u`Zx4 zW;a*yE{(As3m~2k9Bv>L|tjxah~yy7st93U?KFR zQgKy5TP;$Jc$I)p|GcDW9>I6B%LuvbzQ9z;Pb{|UE=MA@#E-MA*}iKOK^EF zo?DwrxIQrf6rpl$p_FK&)*X$~AWHBc{MJWl6f9}wxh67RxZHas7E-AWoRz6donC_M z^pkK-fN$IaJ7r=H*xaK0mJFv|qWAk$yq4Ql9E+ino#6jcbTJk6e_v`{MYBw!WRLuR zUux8Giisa3I56{u7*|a(E-YrHoF&J2ymH2gl{{OxpEiCFz39m27qepX${U|lGUvt( zIS#<7ekK`n2^JwWjP*slELQvOUN zZly=Ul|jx+QNg+drJ2mnTmF%9K*Bd7QWxo{3f%t-Mq21VTRelDf zOVF^r)|AUz0X^$e`IM6Pvio)_8OFFZYVlEIemJI(9^NVW?+1s2|6oKzB3)}#ZE8b} zp_ay`+WMZpSUghO*0Nkx!lBldo?u&adDF6}5^8Q)9t(Tg66c#z^%L^kQB2Gg%3RiyML2>uxV zg?~$&et7aSyYuGFdklk1|2ZuA#7itWOJ21KW5>Ro+wHXJSV(3Qn> The Delta transaction log must remain an append-only log. To enable the detection of +> non-compliant modifications to Delta files, writers can optionally emit an auxiliary file with +> every commit, which contains important information about the state of the table as of that +> version. + +Spark SQL: +``` +CREATE SCHEMA checksum LOCATION 'file:///tmp/checksum'; +CREATE TABLE checksum.checksum (data INTEGER) USING DELTA TBLPROPERTIES (delta.checkpointInterval = 1); +INSERT INTO checksum.checksum values 1; +``` + +Subsequently, the latest checksum file (`_delta_log/00000000000000000001.crc`) was deleted by hand. +This fixture is otherwise identical to `deltalake/checksum`. 
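The fixture layout follows the Delta log naming convention visible throughout this patch: every artifact is keyed by its commit version zero-padded to 20 digits, and the optional checksum sits beside the commit JSON in `_delta_log/`. A small illustrative helper (hypothetical names, assuming Java 15+ for `String.formatted`):
```
public class DeltaLogPathsSketch
{
    static String commitJson(long version)
    {
        return "_delta_log/%020d.json".formatted(version);
    }

    static String versionChecksum(long version)
    {
        return "_delta_log/%020d.crc".formatted(version);
    }

    public static void main(String[] args)
    {
        System.out.println(commitJson(1));       // _delta_log/00000000000000000001.json
        System.out.println(versionChecksum(1));  // _delta_log/00000000000000000001.crc
    }
}
```
Deleting `00000000000000000001.crc`, as this fixture does, therefore leaves both commits resolvable from the log while the checksum probe for the latest version comes back empty.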
diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/_delta_log/00000000000000000000.crc b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/_delta_log/00000000000000000000.crc new file mode 100644 index 000000000000..49754b816a47 --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +{"txnId":"97fef1b5-2b45-4c53-ad38-03be8e6e6739","tableSizeBytes":0,"numFiles":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"a953d1d0-a84e-4ca6-bb2a-ed181213a3f0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"data\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"1"},"createdTime":1771884417572},"protocol":{"minReaderVersion":1,"minWriterVersion":2},"allFiles":[]} diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/_delta_log/00000000000000000000.json b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/_delta_log/00000000000000000000.json new file mode 100644 index 000000000000..13db1d4e6d6d --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1771884417678,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"true","properties":"{\"delta.checkpointInterval\":\"1\"}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/4.0.0 Delta-Lake/4.0.0","txnId":"97fef1b5-2b45-4c53-ad38-03be8e6e6739"}} +{"metaData":{"id":"a953d1d0-a84e-4ca6-bb2a-ed181213a3f0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"data\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"1"},"createdTime":1771884417572}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/_delta_log/00000000000000000001.checkpoint.parquet b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_missing_latest/_delta_log/00000000000000000001.checkpoint.parquet new file mode 100644 index 0000000000000000000000000000000000000000..db6bee1c51638183be47582d6d2dfcf624e8d0be GIT binary patch literal 17389 zcmeHP4Qv$G5#Bu?XM@4~g!ecHhZqa4Cc);;*v1eS2&RsSC`FXGk)kwR-|dl8?#H=X z6DUGT2u10YRwzwSY9WHs){3l(S}h_!jR-PMpg(5dfBSb0c%-h|! 
zZ};}jcXr56(_mot{d_ZTX5P%~ycag!yD3B@QblHw>mz^odsz+f!K;r{NK(1KOp+w{ z{VDVxkjX)rT>R}TI|Kq7Rx<(~sRQWrDrv1DD9aZcdV)%*MGZw7n!-KH8k$>snxHAv z+_0=Qx;zq%HY?3-Ee+vdFxZ$$D!ski8)53>eX0g*-cII`x1YcIZ?8$kCrQ(S9<~gF z7L2MPwPfh*(<2^XR8_M+%8O&+5r)kpb>yjk^sh2Ufk>ve87P02VI2Ht47>K>=y6eW zd|cblvwIJ~SQs~h%qGw5fAek58{k#c7pZq=r-JNTzN@XmMR4 zmh4vIeJZ@|sEa6?0qUK02wOFTb4#b zkzj+;)~q%(hn3cbo}MP9L5+mkLQSEi%F<}iFq+}#-}}9YV>sd&xcyPih58qedXl}7 zeXUa9W7b;6M^GvUJ}hU6n*Zr*15Zsihl0PcY5q7jkRpbik;zG!?0)z4zi>S8InkF5 z5j+H>chqUyd)2xXbr~()7uM<)*F|G$9Cz76u<4Y9Ja;zRUW;cgIuuK4>J~K(V{m^d zJ#m-^vsBo3s;20>aL3M_593B-3;D@M&xnrGwvZ>;LN<}jESCM2%}oWB5@u{8(=axL5~_q_-+@pc5c>k$>$w`c#zwa9fuUHzXN;NfHKB3?2hN z`pqC8Yef&Rhi7?Ipy2i|A3Z$4d53Y8^-O3L3=FVv%ddh-W- zj8mC_Tl}zFx|^$K%GG`eeu$@<`tYX}n3YOx_Eh^<>q!Ho0>SZR4HJh*<08Cl5njpT zTtvVpRd-^$TSqtRfF4*$g_%$Yk4|J9X66&ZqsuTEqzhqwY|hYl9CAmNyS>=q>;Xk} zvJS_L&#}D~`N?Ica5EZCDM8*P_9rI}kt(R3@otJ@b`~>A+;L9HG1apUXHR~1I^BTK zB0i_~!mb2Vzo3{vT;#AZRanqGfthk68e2~s65NKnKLNLq$Du^^9#Ecc;)SCv24X$j znqk=GngZAOyM}hL7kqj~=8vaj za#m(z_CScJq5I3odTYw1GP1#X8}hO@9Cp)7HbMtJo}EP27?1mW;^YyZIC-#~thAu6 z*xm*zSRbN&U_HJxnYUiF9uH38ts~ZBf2D{%TuFxYMGdk=4NVi}Fsi>HwV^65h*9ft zXu7zJ8`H&QoSwmFWM_(~r)_W9S$x=O>oI$q)oMLvXIrh-W0sJ8`Wm{B(|#%vyI_0k zpF_HtXsdAK=p2#XV}9{%Z#DTu2R+IJvum!{du}e-rS}FA=zI-%kikL@Q$q*u;2OVv z2kA02eqtVf%-$(t4cp!ZYDMlZ+ujD|>&u5-b7N!I)C~B)#~#T*ww+MORH?<|FVnUq ziBBR^7kd1Ypt`dzvQS!I1I9oV(_pIpDNj7avwktaws`%MA5*tK0vUdvO1wB(@|Qu2NtQ9fdd(JO#G8qI zU)8yokw`^i(O6hP^v$sZG)pjI3i{TLN0sCj6?yY&%9hLor~*V4I7pQ(K@DrEG{}od zsWJmE$`*Aqd|R0;BGXkUfp4YPCF8NAx^}DC<$*3=DjLnG8f1v@Qi)4hmt3=5Q!|EK z!%8|5ODgf0w%urfV#lV`6YG)@b(`KYxkt&U*c8#<=S0+~(ihjDsUww0#I$ZxQ^?D# zaJ(-A-%+vTmW}Du)>uSMoBS~OW;8|17%i|;Us_G1wyI=St}dAw%rqr|;)c12-dve8 zCCpa0X(})s>5RqIZyFztsE%7>4_{g(0pk_;UA-1;mOz~WmEwursUDyR@_MjUq*SB&+g0!Zl`bKL&sv%RWTPh@9bv^oe-NzFT_0e{^I#AIL zNltYQR4PD3Qagoo#wZU+bl)m+JH0%gG)f5K6+q0ez+t-VFpDgZ!Bu27eXh<0M2sdJ z=ovXL(y>*9(i0{UUlYV27JC@P8JV7y1E=J6e+Qan16C@=S*m~oj!y-Idh5S1yoj3K zBl9APMBiCOYUyYlIqYG&o?5*S?4_#=Wb1U?R`AB4&4k`X*9Z2O+4Tm7WPBxH>P|mi zMQZ55g`}x?_(^((fQEWM-4M9sPQdt}0b^Ya6wWOq&30o&>%e%+7)6{RZ@X3ev7SIn zyd-3`3i4tm%?e2LSF6DgcL&IZ98x){2_w6nZVcQ+vSu;KdP1_ft?pz`AldUKS;^dv zutFjAVFXvwHG#8}+My%`KR|;0?G8GOpohMes~DC);PB6Ag(lDM-wUdHVUfXe0Z=jFiq! 
zPnpPu<9B1XQb=4m@=KHLk`{T+t^wq0#UKj_Fh{;v0QnFizkHV=4IZq(I;_l@109@_ z%Oa~`O7tk=4BzF9SAdf*Ylp|R55nBWa=ESztYa85uI6mgI-z#Hg&`e zauUn#8Old89hL*bmF@lqCR%}YJ$1cBu;8=4RQw9Ql@5w=_2wFgtC#M!;;NpA8Q)u0 z-h!#v&@@+Vv$`%l*a6z!*XS@y11IuMe>*N;Mr!~Ce15w(8_-!&{LF&Hc|FBzI{=h@6l^r}0(fnq(i#2)^}q~Hi# zH{5A->og?#!df75eTgGAF{oJC35xji3_H7mh>xrVVH^%}Vdx>sKp86nB6fDB9b5M% zorWY%BaUAjUz)`z$-}ET$No@A?J^=8fWTb3!3w zy8!qL2!7QBH$$O2(>c+QaeiBV&JQE_fhK#*YRd!e42X>L0^lbR{EP#<-T-&RM8@ds zTt-;|T%s2c>PC~h^C=h>8I$>eM-g~%sokdQd1VADBFs(fmFYe?kR|QK9VQhw3SvxA zM^nY2i4mYn_^{g3N#@bvrR10_W|r0xS(S(T!hUMkfi2+wq7(Md{+4{g63asz@IoQL z;|O@t0ocZ+@6#(wyrRTFH_`4u|D1Mykj5WCoaW&gZxe-5g+kHK*d#iFls{@A>kXlx zV}H;|*sNQ*e1zbK+~}MfS?Omy?6gjz*O11sR{ptABn^jeKjwF(VxIg-^tt6T=)^joZCuO=z4xF3Yp8tq8e-EfF z@4;^VS)#y#W>E(Z#Oc-hz_(>rkY`QbW|vsL#RxlWGkq{{z6J^vA@~9I*xw>S4j|j2vZKQa2D%_jy>pLI z@G-u$>v?%Or>x9quccjq>p+_wZR>vmP2tU*)*DFcrFOECEzy;htyaxxotOu-umGd~ ziL|UYcUtF>)a#X=uBFelnm77@NLzs4TP7FzKzCFu76T< zW3UnKUM|WRk$F6B-tBKpr=DmuZ+`lJx1g9%1*Z#eeN+!=#f)dC&3p!4QiX%1u`Zx4 zW;a*yE{(As3m~2k9Bv>L|tjxah~yy7st93U?KFR zQgKy5TP;$Jc$I)p|GcDW9>I6B%LuvbzQ9z;Pb{|UE=MA@#E-MA*}iKOK^EF zo?DwrxIQrf6rpl$p_FK&)*X$~AWHBc{MJWl6f9}wxh67RxZHas7E-AWoRz6donC_M z^pkK-fN$IaJ7r=H*xaK0mJFv|qWAk$yq4Ql9E+ino#6jcbTJk6e_v`{MYBw!WRLuR zUux8Giisa3I56{u7*|a(E-YrHoF&J2ymH2gl{{OxpEiCFz39m27qepX${U|lGUvt( zIS#<7ekK`n2^JwWjP*slELQvOUN zZly=Ul|jx+QNg+drJ2mnTmF%9K*Bd7QWxo{3f%t-Mq21VTRelDf zOVF^r)|AUz0X^$e`IM6Pvio)_8OFFZYVlEIemJI(9^NVW?+1s2|6oKzB3)}#ZE8b} zp_ay`+WMZpSUghO*0Nkx!lBldo?u&adDF6}5^8Q)9t(Tg66c#z^%L^kQB2Gg%3RiyML2>uxV zg?~$&et7aSyYuGFdklk1|2ZuA#7itWOJ21KW5>Ro+wHXJSV(3Qn> The Delta transaction log must remain an append-only log. To enable the detection of +> non-compliant modifications to Delta files, writers can optionally emit an auxiliary file with +> every commit, which contains important information about the state of the table as of that +> version. + +Spark SQL: +``` +CREATE SCHEMA checksum LOCATION 'file:///tmp/checksum'; +CREATE TABLE checksum.checksum (data INTEGER) USING DELTA TBLPROPERTIES (delta.checkpointInterval = 1); +INSERT INTO checksum.checksum values 1; +``` + +Subsequently, the latest checksum file (`_delta_log/00000000000000000001.crc`) was overwritten by +hand with additional trailing content past the end of the JSON object. This fixture is otherwise +identical to `deltalake/checksum`. 
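A minimal Java sketch of the corruption step (not part of the patch), again assuming the `file:///tmp/checksum` staging location from the Spark SQL above:

```
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

import static java.nio.charset.StandardCharsets.UTF_8;

public class AppendTrailingChecksumContent
{
    public static void main(String[] args)
            throws IOException
    {
        // Hypothetical staging path taken from the Spark SQL above; adjust as needed
        Path crc = Path.of("/tmp/checksum/_delta_log/00000000000000000001.crc");
        // Append bytes after the closing brace, so the file still starts with a
        // syntactically complete JSON object but carries trailing garbage
        Files.write(crc, "{\"trailing\":true}".getBytes(UTF_8), StandardOpenOption.APPEND);
    }
}
```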
diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/_delta_log/00000000000000000000.crc b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/_delta_log/00000000000000000000.crc new file mode 100644 index 000000000000..49754b816a47 --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +{"txnId":"97fef1b5-2b45-4c53-ad38-03be8e6e6739","tableSizeBytes":0,"numFiles":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"a953d1d0-a84e-4ca6-bb2a-ed181213a3f0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"data\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"1"},"createdTime":1771884417572},"protocol":{"minReaderVersion":1,"minWriterVersion":2},"allFiles":[]} diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/_delta_log/00000000000000000000.json b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/_delta_log/00000000000000000000.json new file mode 100644 index 000000000000..13db1d4e6d6d --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1771884417678,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"true","properties":"{\"delta.checkpointInterval\":\"1\"}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/4.0.0 Delta-Lake/4.0.0","txnId":"97fef1b5-2b45-4c53-ad38-03be8e6e6739"}} +{"metaData":{"id":"a953d1d0-a84e-4ca6-bb2a-ed181213a3f0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"data\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"1"},"createdTime":1771884417572}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/_delta_log/00000000000000000001.checkpoint.parquet b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_trailing_json_content/_delta_log/00000000000000000001.checkpoint.parquet new file mode 100644 index 0000000000000000000000000000000000000000..db6bee1c51638183be47582d6d2dfcf624e8d0be GIT binary patch literal 17389 zcmeHP4Qv$G5#Bu?XM@4~g!ecHhZqa4Cc);;*v1eS2&RsSC`FXGk)kwR-|dl8?#H=X z6DUGT2u10YRwzwSY9WHs){3l(S}h_!jR-PMpg(5dfBSb0c%-h|! 
zZ};}jcXr56(_mot{d_ZTX5P%~ycag!yD3B@QblHw>mz^odsz+f!K;r{NK(1KOp+w{ z{VDVxkjX)rT>R}TI|Kq7Rx<(~sRQWrDrv1DD9aZcdV)%*MGZw7n!-KH8k$>snxHAv z+_0=Qx;zq%HY?3-Ee+vdFxZ$$D!ski8)53>eX0g*-cII`x1YcIZ?8$kCrQ(S9<~gF z7L2MPwPfh*(<2^XR8_M+%8O&+5r)kpb>yjk^sh2Ufk>ve87P02VI2Ht47>K>=y6eW zd|cblvwIJ~SQs~h%qGw5fAek58{k#c7pZq=r-JNTzN@XmMR4 zmh4vIeJZ@|sEa6?0qUK02wOFTb4#b zkzj+;)~q%(hn3cbo}MP9L5+mkLQSEi%F<}iFq+}#-}}9YV>sd&xcyPih58qedXl}7 zeXUa9W7b;6M^GvUJ}hU6n*Zr*15Zsihl0PcY5q7jkRpbik;zG!?0)z4zi>S8InkF5 z5j+H>chqUyd)2xXbr~()7uM<)*F|G$9Cz76u<4Y9Ja;zRUW;cgIuuK4>J~K(V{m^d zJ#m-^vsBo3s;20>aL3M_593B-3;D@M&xnrGwvZ>;LN<}jESCM2%}oWB5@u{8(=axL5~_q_-+@pc5c>k$>$w`c#zwa9fuUHzXN;NfHKB3?2hN z`pqC8Yef&Rhi7?Ipy2i|A3Z$4d53Y8^-O3L3=FVv%ddh-W- zj8mC_Tl}zFx|^$K%GG`eeu$@<`tYX}n3YOx_Eh^<>q!Ho0>SZR4HJh*<08Cl5njpT zTtvVpRd-^$TSqtRfF4*$g_%$Yk4|J9X66&ZqsuTEqzhqwY|hYl9CAmNyS>=q>;Xk} zvJS_L&#}D~`N?Ica5EZCDM8*P_9rI}kt(R3@otJ@b`~>A+;L9HG1apUXHR~1I^BTK zB0i_~!mb2Vzo3{vT;#AZRanqGfthk68e2~s65NKnKLNLq$Du^^9#Ecc;)SCv24X$j znqk=GngZAOyM}hL7kqj~=8vaj za#m(z_CScJq5I3odTYw1GP1#X8}hO@9Cp)7HbMtJo}EP27?1mW;^YyZIC-#~thAu6 z*xm*zSRbN&U_HJxnYUiF9uH38ts~ZBf2D{%TuFxYMGdk=4NVi}Fsi>HwV^65h*9ft zXu7zJ8`H&QoSwmFWM_(~r)_W9S$x=O>oI$q)oMLvXIrh-W0sJ8`Wm{B(|#%vyI_0k zpF_HtXsdAK=p2#XV}9{%Z#DTu2R+IJvum!{du}e-rS}FA=zI-%kikL@Q$q*u;2OVv z2kA02eqtVf%-$(t4cp!ZYDMlZ+ujD|>&u5-b7N!I)C~B)#~#T*ww+MORH?<|FVnUq ziBBR^7kd1Ypt`dzvQS!I1I9oV(_pIpDNj7avwktaws`%MA5*tK0vUdvO1wB(@|Qu2NtQ9fdd(JO#G8qI zU)8yokw`^i(O6hP^v$sZG)pjI3i{TLN0sCj6?yY&%9hLor~*V4I7pQ(K@DrEG{}od zsWJmE$`*Aqd|R0;BGXkUfp4YPCF8NAx^}DC<$*3=DjLnG8f1v@Qi)4hmt3=5Q!|EK z!%8|5ODgf0w%urfV#lV`6YG)@b(`KYxkt&U*c8#<=S0+~(ihjDsUww0#I$ZxQ^?D# zaJ(-A-%+vTmW}Du)>uSMoBS~OW;8|17%i|;Us_G1wyI=St}dAw%rqr|;)c12-dve8 zCCpa0X(})s>5RqIZyFztsE%7>4_{g(0pk_;UA-1;mOz~WmEwursUDyR@_MjUq*SB&+g0!Zl`bKL&sv%RWTPh@9bv^oe-NzFT_0e{^I#AIL zNltYQR4PD3Qagoo#wZU+bl)m+JH0%gG)f5K6+q0ez+t-VFpDgZ!Bu27eXh<0M2sdJ z=ovXL(y>*9(i0{UUlYV27JC@P8JV7y1E=J6e+Qan16C@=S*m~oj!y-Idh5S1yoj3K zBl9APMBiCOYUyYlIqYG&o?5*S?4_#=Wb1U?R`AB4&4k`X*9Z2O+4Tm7WPBxH>P|mi zMQZ55g`}x?_(^((fQEWM-4M9sPQdt}0b^Ya6wWOq&30o&>%e%+7)6{RZ@X3ev7SIn zyd-3`3i4tm%?e2LSF6DgcL&IZ98x){2_w6nZVcQ+vSu;KdP1_ft?pz`AldUKS;^dv zutFjAVFXvwHG#8}+My%`KR|;0?G8GOpohMes~DC);PB6Ag(lDM-wUdHVUfXe0Z=jFiq! 
zPnpPu<9B1XQb=4m@=KHLk`{T+t^wq0#UKj_Fh{;v0QnFizkHV=4IZq(I;_l@109@_ z%Oa~`O7tk=4BzF9SAdf*Ylp|R55nBWa=ESztYa85uI6mgI-z#Hg&`e zauUn#8Old89hL*bmF@lqCR%}YJ$1cBu;8=4RQw9Ql@5w=_2wFgtC#M!;;NpA8Q)u0 z-h!#v&@@+Vv$`%l*a6z!*XS@y11IuMe>*N;Mr!~Ce15w(8_-!&{LF&Hc|FBzI{=h@6l^r}0(fnq(i#2)^}q~Hi# zH{5A->og?#!df75eTgGAF{oJC35xji3_H7mh>xrVVH^%}Vdx>sKp86nB6fDB9b5M% zorWY%BaUAjUz)`z$-}ET$No@A?J^=8fWTb3!3w zy8!qL2!7QBH$$O2(>c+QaeiBV&JQE_fhK#*YRd!e42X>L0^lbR{EP#<-T-&RM8@ds zTt-;|T%s2c>PC~h^C=h>8I$>eM-g~%sokdQd1VADBFs(fmFYe?kR|QK9VQhw3SvxA zM^nY2i4mYn_^{g3N#@bvrR10_W|r0xS(S(T!hUMkfi2+wq7(Md{+4{g63asz@IoQL z;|O@t0ocZ+@6#(wyrRTFH_`4u|D1Mykj5WCoaW&gZxe-5g+kHK*d#iFls{@A>kXlx zV}H;|*sNQ*e1zbK+~}MfS?Omy?6gjz*O11sR{ptABn^jeKjwF(VxIg-^tt6T=)^joZCuO=z4xF3Yp8tq8e-EfF z@4;^VS)#y#W>E(Z#Oc-hz_(>rkY`QbW|vsL#RxlWGkq{{z6J^vA@~9I*xw>S4j|j2vZKQa2D%_jy>pLI z@G-u$>v?%Or>x9quccjq>p+_wZR>vmP2tU*)*DFcrFOECEzy;htyaxxotOu-umGd~ ziL|UYcUtF>)a#X=uBFelnm77@NLzs4TP7FzKzCFu76T< zW3UnKUM|WRk$F6B-tBKpr=DmuZ+`lJx1g9%1*Z#eeN+!=#f)dC&3p!4QiX%1u`Zx4 zW;a*yE{(As3m~2k9Bv>L|tjxah~yy7st93U?KFR zQgKy5TP;$Jc$I)p|GcDW9>I6B%LuvbzQ9z;Pb{|UE=MA@#E-MA*}iKOK^EF zo?DwrxIQrf6rpl$p_FK&)*X$~AWHBc{MJWl6f9}wxh67RxZHas7E-AWoRz6donC_M z^pkK-fN$IaJ7r=H*xaK0mJFv|qWAk$yq4Ql9E+ino#6jcbTJk6e_v`{MYBw!WRLuR zUux8Giisa3I56{u7*|a(E-YrHoF&J2ymH2gl{{OxpEiCFz39m27qepX${U|lGUvt( zIS#<7ekK`n2^JwWjP*slELQvOUN zZly=Ul|jx+QNg+drJ2mnTmF%9K*Bd7QWxo{3f%t-Mq21VTRelDf zOVF^r)|AUz0X^$e`IM6Pvio)_8OFFZYVlEIemJI(9^NVW?+1s2|6oKzB3)}#ZE8b} zp_ay`+WMZpSUghO*0Nkx!lBldo?u&adDF6}5^8Q)9t(Tg66c#z^%L^kQB2Gg%3RiyML2>uxV zg?~$&et7aSyYuGFdklk1|2ZuA#7itWOJ21KW5>Ro+wHXJSV(3Qn> The Delta transaction log must remain an append-only log. To enable the detection of +> non-compliant modifications to Delta files, writers can optionally emit an auxiliary file with +> every commit, which contains important information about the state of the table as of that +> version. + +Spark SQL: +``` +CREATE SCHEMA checksum_without_metadata LOCATION 'file:///tmp/checksum_without_metadata'; +CREATE TABLE checksum_without_metadata.checksum_without_metadata (data INTEGER) USING DELTA TBLPROPERTIES (delta.checkpointInterval = 1); +INSERT INTO checksum_without_metadata.checksum_without_metadata values 1; +``` + +Subsequently, the `metadata` element was manually removed from all checksum files in the +`_delta_log`. This fixture is otherwise identical to `deltalake/checksum`. 
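A minimal Jackson-based sketch of the field removal (not part of the patch), assuming the `file:///tmp/checksum_without_metadata` staging location from the Spark SQL above:

```
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;

import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;

public class StripMetadataFromChecksums
{
    public static void main(String[] args)
            throws IOException
    {
        ObjectMapper mapper = new ObjectMapper();
        // Hypothetical staging path taken from the Spark SQL above; adjust as needed
        Path deltaLog = Path.of("/tmp/checksum_without_metadata/_delta_log");
        try (DirectoryStream<Path> crcFiles = Files.newDirectoryStream(deltaLog, "*.crc")) {
            for (Path crc : crcFiles) {
                ObjectNode checksum = (ObjectNode) mapper.readTree(crc.toFile());
                checksum.remove("metadata"); // drop the element under test
                Files.writeString(crc, mapper.writeValueAsString(checksum));
            }
        }
    }
}
```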
diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/_delta_log/00000000000000000000.crc b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/_delta_log/00000000000000000000.crc new file mode 100644 index 000000000000..2768876f1e28 --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +{"txnId":"97fef1b5-2b45-4c53-ad38-03be8e6e6739","tableSizeBytes":0,"numFiles":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"protocol":{"minReaderVersion":1,"minWriterVersion":2},"allFiles":[]} diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/_delta_log/00000000000000000000.json b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/_delta_log/00000000000000000000.json new file mode 100644 index 000000000000..13db1d4e6d6d --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1771884417678,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"true","properties":"{\"delta.checkpointInterval\":\"1\"}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/4.0.0 Delta-Lake/4.0.0","txnId":"97fef1b5-2b45-4c53-ad38-03be8e6e6739"}} +{"metaData":{"id":"a953d1d0-a84e-4ca6-bb2a-ed181213a3f0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"data\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"1"},"createdTime":1771884417572}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/_delta_log/00000000000000000001.checkpoint.parquet b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_metadata/_delta_log/00000000000000000001.checkpoint.parquet new file mode 100644 index 0000000000000000000000000000000000000000..db6bee1c51638183be47582d6d2dfcf624e8d0be GIT binary patch literal 17389 zcmeHP4Qv$G5#Bu?XM@4~g!ecHhZqa4Cc);;*v1eS2&RsSC`FXGk)kwR-|dl8?#H=X z6DUGT2u10YRwzwSY9WHs){3l(S}h_!jR-PMpg(5dfBSb0c%-h|! 
zZ};}jcXr56(_mot{d_ZTX5P%~ycag!yD3B@QblHw>mz^odsz+f!K;r{NK(1KOp+w{ z{VDVxkjX)rT>R}TI|Kq7Rx<(~sRQWrDrv1DD9aZcdV)%*MGZw7n!-KH8k$>snxHAv z+_0=Qx;zq%HY?3-Ee+vdFxZ$$D!ski8)53>eX0g*-cII`x1YcIZ?8$kCrQ(S9<~gF z7L2MPwPfh*(<2^XR8_M+%8O&+5r)kpb>yjk^sh2Ufk>ve87P02VI2Ht47>K>=y6eW zd|cblvwIJ~SQs~h%qGw5fAek58{k#c7pZq=r-JNTzN@XmMR4 zmh4vIeJZ@|sEa6?0qUK02wOFTb4#b zkzj+;)~q%(hn3cbo}MP9L5+mkLQSEi%F<}iFq+}#-}}9YV>sd&xcyPih58qedXl}7 zeXUa9W7b;6M^GvUJ}hU6n*Zr*15Zsihl0PcY5q7jkRpbik;zG!?0)z4zi>S8InkF5 z5j+H>chqUyd)2xXbr~()7uM<)*F|G$9Cz76u<4Y9Ja;zRUW;cgIuuK4>J~K(V{m^d zJ#m-^vsBo3s;20>aL3M_593B-3;D@M&xnrGwvZ>;LN<}jESCM2%}oWB5@u{8(=axL5~_q_-+@pc5c>k$>$w`c#zwa9fuUHzXN;NfHKB3?2hN z`pqC8Yef&Rhi7?Ipy2i|A3Z$4d53Y8^-O3L3=FVv%ddh-W- zj8mC_Tl}zFx|^$K%GG`eeu$@<`tYX}n3YOx_Eh^<>q!Ho0>SZR4HJh*<08Cl5njpT zTtvVpRd-^$TSqtRfF4*$g_%$Yk4|J9X66&ZqsuTEqzhqwY|hYl9CAmNyS>=q>;Xk} zvJS_L&#}D~`N?Ica5EZCDM8*P_9rI}kt(R3@otJ@b`~>A+;L9HG1apUXHR~1I^BTK zB0i_~!mb2Vzo3{vT;#AZRanqGfthk68e2~s65NKnKLNLq$Du^^9#Ecc;)SCv24X$j znqk=GngZAOyM}hL7kqj~=8vaj za#m(z_CScJq5I3odTYw1GP1#X8}hO@9Cp)7HbMtJo}EP27?1mW;^YyZIC-#~thAu6 z*xm*zSRbN&U_HJxnYUiF9uH38ts~ZBf2D{%TuFxYMGdk=4NVi}Fsi>HwV^65h*9ft zXu7zJ8`H&QoSwmFWM_(~r)_W9S$x=O>oI$q)oMLvXIrh-W0sJ8`Wm{B(|#%vyI_0k zpF_HtXsdAK=p2#XV}9{%Z#DTu2R+IJvum!{du}e-rS}FA=zI-%kikL@Q$q*u;2OVv z2kA02eqtVf%-$(t4cp!ZYDMlZ+ujD|>&u5-b7N!I)C~B)#~#T*ww+MORH?<|FVnUq ziBBR^7kd1Ypt`dzvQS!I1I9oV(_pIpDNj7avwktaws`%MA5*tK0vUdvO1wB(@|Qu2NtQ9fdd(JO#G8qI zU)8yokw`^i(O6hP^v$sZG)pjI3i{TLN0sCj6?yY&%9hLor~*V4I7pQ(K@DrEG{}od zsWJmE$`*Aqd|R0;BGXkUfp4YPCF8NAx^}DC<$*3=DjLnG8f1v@Qi)4hmt3=5Q!|EK z!%8|5ODgf0w%urfV#lV`6YG)@b(`KYxkt&U*c8#<=S0+~(ihjDsUww0#I$ZxQ^?D# zaJ(-A-%+vTmW}Du)>uSMoBS~OW;8|17%i|;Us_G1wyI=St}dAw%rqr|;)c12-dve8 zCCpa0X(})s>5RqIZyFztsE%7>4_{g(0pk_;UA-1;mOz~WmEwursUDyR@_MjUq*SB&+g0!Zl`bKL&sv%RWTPh@9bv^oe-NzFT_0e{^I#AIL zNltYQR4PD3Qagoo#wZU+bl)m+JH0%gG)f5K6+q0ez+t-VFpDgZ!Bu27eXh<0M2sdJ z=ovXL(y>*9(i0{UUlYV27JC@P8JV7y1E=J6e+Qan16C@=S*m~oj!y-Idh5S1yoj3K zBl9APMBiCOYUyYlIqYG&o?5*S?4_#=Wb1U?R`AB4&4k`X*9Z2O+4Tm7WPBxH>P|mi zMQZ55g`}x?_(^((fQEWM-4M9sPQdt}0b^Ya6wWOq&30o&>%e%+7)6{RZ@X3ev7SIn zyd-3`3i4tm%?e2LSF6DgcL&IZ98x){2_w6nZVcQ+vSu;KdP1_ft?pz`AldUKS;^dv zutFjAVFXvwHG#8}+My%`KR|;0?G8GOpohMes~DC);PB6Ag(lDM-wUdHVUfXe0Z=jFiq! 
zPnpPu<9B1XQb=4m@=KHLk`{T+t^wq0#UKj_Fh{;v0QnFizkHV=4IZq(I;_l@109@_ z%Oa~`O7tk=4BzF9SAdf*Ylp|R55nBWa=ESztYa85uI6mgI-z#Hg&`e zauUn#8Old89hL*bmF@lqCR%}YJ$1cBu;8=4RQw9Ql@5w=_2wFgtC#M!;;NpA8Q)u0 z-h!#v&@@+Vv$`%l*a6z!*XS@y11IuMe>*N;Mr!~Ce15w(8_-!&{LF&Hc|FBzI{=h@6l^r}0(fnq(i#2)^}q~Hi# zH{5A->og?#!df75eTgGAF{oJC35xji3_H7mh>xrVVH^%}Vdx>sKp86nB6fDB9b5M% zorWY%BaUAjUz)`z$-}ET$No@A?J^=8fWTb3!3w zy8!qL2!7QBH$$O2(>c+QaeiBV&JQE_fhK#*YRd!e42X>L0^lbR{EP#<-T-&RM8@ds zTt-;|T%s2c>PC~h^C=h>8I$>eM-g~%sokdQd1VADBFs(fmFYe?kR|QK9VQhw3SvxA zM^nY2i4mYn_^{g3N#@bvrR10_W|r0xS(S(T!hUMkfi2+wq7(Md{+4{g63asz@IoQL z;|O@t0ocZ+@6#(wyrRTFH_`4u|D1Mykj5WCoaW&gZxe-5g+kHK*d#iFls{@A>kXlx zV}H;|*sNQ*e1zbK+~}MfS?Omy?6gjz*O11sR{ptABn^jeKjwF(VxIg-^tt6T=)^joZCuO=z4xF3Yp8tq8e-EfF z@4;^VS)#y#W>E(Z#Oc-hz_(>rkY`QbW|vsL#RxlWGkq{{z6J^vA@~9I*xw>S4j|j2vZKQa2D%_jy>pLI z@G-u$>v?%Or>x9quccjq>p+_wZR>vmP2tU*)*DFcrFOECEzy;htyaxxotOu-umGd~ ziL|UYcUtF>)a#X=uBFelnm77@NLzs4TP7FzKzCFu76T< zW3UnKUM|WRk$F6B-tBKpr=DmuZ+`lJx1g9%1*Z#eeN+!=#f)dC&3p!4QiX%1u`Zx4 zW;a*yE{(As3m~2k9Bv>L|tjxah~yy7st93U?KFR zQgKy5TP;$Jc$I)p|GcDW9>I6B%LuvbzQ9z;Pb{|UE=MA@#E-MA*}iKOK^EF zo?DwrxIQrf6rpl$p_FK&)*X$~AWHBc{MJWl6f9}wxh67RxZHas7E-AWoRz6donC_M z^pkK-fN$IaJ7r=H*xaK0mJFv|qWAk$yq4Ql9E+ino#6jcbTJk6e_v`{MYBw!WRLuR zUux8Giisa3I56{u7*|a(E-YrHoF&J2ymH2gl{{OxpEiCFz39m27qepX${U|lGUvt( zIS#<7ekK`n2^JwWjP*slELQvOUN zZly=Ul|jx+QNg+drJ2mnTmF%9K*Bd7QWxo{3f%t-Mq21VTRelDf zOVF^r)|AUz0X^$e`IM6Pvio)_8OFFZYVlEIemJI(9^NVW?+1s2|6oKzB3)}#ZE8b} zp_ay`+WMZpSUghO*0Nkx!lBldo?u&adDF6}5^8Q)9t(Tg66c#z^%L^kQB2Gg%3RiyML2>uxV zg?~$&et7aSyYuGFdklk1|2ZuA#7itWOJ21KW5>Ro+wHXJSV(3Qn> The Delta transaction log must remain an append-only log. To enable the detection of +> non-compliant modifications to Delta files, writers can optionally emit an auxiliary file with +> every commit, which contains important information about the state of the table as of that +> version. + +Spark SQL: +``` +CREATE SCHEMA checksum_without_protocol LOCATION 'file:///tmp/checksum_without_protocol'; +CREATE TABLE checksum_without_protocol.checksum_without_protocol (data INTEGER) USING DELTA TBLPROPERTIES (delta.checkpointInterval = 1); +INSERT INTO checksum_without_protocol.checksum_without_protocol values 1; +``` + +Subsequently, the `protocol` element was manually removed from all checksum files in the +`_delta_log`. This fixture is otherwise identical to `deltalake/checksum`. 
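The same field-stripping approach covers this fixture; repeated in full so the sketch stands alone (not part of the patch, staging location again assumed):

```
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;

import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;

public class StripProtocolFromChecksums
{
    public static void main(String[] args)
            throws IOException
    {
        ObjectMapper mapper = new ObjectMapper();
        // Hypothetical staging path taken from the Spark SQL above; adjust as needed
        Path deltaLog = Path.of("/tmp/checksum_without_protocol/_delta_log");
        try (DirectoryStream<Path> crcFiles = Files.newDirectoryStream(deltaLog, "*.crc")) {
            for (Path crc : crcFiles) {
                ObjectNode checksum = (ObjectNode) mapper.readTree(crc.toFile());
                checksum.remove("protocol"); // the only difference from the metadata variant
                Files.writeString(crc, mapper.writeValueAsString(checksum));
            }
        }
    }
}
```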
diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/_delta_log/00000000000000000000.crc b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/_delta_log/00000000000000000000.crc new file mode 100644 index 000000000000..64879fa6858c --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +{"txnId":"97fef1b5-2b45-4c53-ad38-03be8e6e6739","tableSizeBytes":0,"numFiles":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"a953d1d0-a84e-4ca6-bb2a-ed181213a3f0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"data\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"1"},"createdTime":1771884417572},"allFiles":[]} diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/_delta_log/00000000000000000000.json b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/_delta_log/00000000000000000000.json new file mode 100644 index 000000000000..13db1d4e6d6d --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1771884417678,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"true","properties":"{\"delta.checkpointInterval\":\"1\"}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/4.0.0 Delta-Lake/4.0.0","txnId":"97fef1b5-2b45-4c53-ad38-03be8e6e6739"}} +{"metaData":{"id":"a953d1d0-a84e-4ca6-bb2a-ed181213a3f0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"data\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"1"},"createdTime":1771884417572}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} diff --git a/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/_delta_log/00000000000000000001.checkpoint.parquet b/plugin/trino-delta-lake/src/test/resources/deltalake/checksum_without_protocol/_delta_log/00000000000000000001.checkpoint.parquet new file mode 100644 index 0000000000000000000000000000000000000000..db6bee1c51638183be47582d6d2dfcf624e8d0be GIT binary patch literal 17389 zcmeHP4Qv$G5#Bu?XM@4~g!ecHhZqa4Cc);;*v1eS2&RsSC`FXGk)kwR-|dl8?#H=X z6DUGT2u10YRwzwSY9WHs){3l(S}h_!jR-PMpg(5dfBSb0c%-h|! 
zZ};}jcXr56(_mot{d_ZTX5P%~ycag!yD3B@QblHw>mz^odsz+f!K;r{NK(1KOp+w{ z{VDVxkjX)rT>R}TI|Kq7Rx<(~sRQWrDrv1DD9aZcdV)%*MGZw7n!-KH8k$>snxHAv z+_0=Qx;zq%HY?3-Ee+vdFxZ$$D!ski8)53>eX0g*-cII`x1YcIZ?8$kCrQ(S9<~gF z7L2MPwPfh*(<2^XR8_M+%8O&+5r)kpb>yjk^sh2Ufk>ve87P02VI2Ht47>K>=y6eW zd|cblvwIJ~SQs~h%qGw5fAek58{k#c7pZq=r-JNTzN@XmMR4 zmh4vIeJZ@|sEa6?0qUK02wOFTb4#b zkzj+;)~q%(hn3cbo}MP9L5+mkLQSEi%F<}iFq+}#-}}9YV>sd&xcyPih58qedXl}7 zeXUa9W7b;6M^GvUJ}hU6n*Zr*15Zsihl0PcY5q7jkRpbik;zG!?0)z4zi>S8InkF5 z5j+H>chqUyd)2xXbr~()7uM<)*F|G$9Cz76u<4Y9Ja;zRUW;cgIuuK4>J~K(V{m^d zJ#m-^vsBo3s;20>aL3M_593B-3;D@M&xnrGwvZ>;LN<}jESCM2%}oWB5@u{8(=axL5~_q_-+@pc5c>k$>$w`c#zwa9fuUHzXN;NfHKB3?2hN z`pqC8Yef&Rhi7?Ipy2i|A3Z$4d53Y8^-O3L3=FVv%ddh-W- zj8mC_Tl}zFx|^$K%GG`eeu$@<`tYX}n3YOx_Eh^<>q!Ho0>SZR4HJh*<08Cl5njpT zTtvVpRd-^$TSqtRfF4*$g_%$Yk4|J9X66&ZqsuTEqzhqwY|hYl9CAmNyS>=q>;Xk} zvJS_L&#}D~`N?Ica5EZCDM8*P_9rI}kt(R3@otJ@b`~>A+;L9HG1apUXHR~1I^BTK zB0i_~!mb2Vzo3{vT;#AZRanqGfthk68e2~s65NKnKLNLq$Du^^9#Ecc;)SCv24X$j znqk=GngZAOyM}hL7kqj~=8vaj za#m(z_CScJq5I3odTYw1GP1#X8}hO@9Cp)7HbMtJo}EP27?1mW;^YyZIC-#~thAu6 z*xm*zSRbN&U_HJxnYUiF9uH38ts~ZBf2D{%TuFxYMGdk=4NVi}Fsi>HwV^65h*9ft zXu7zJ8`H&QoSwmFWM_(~r)_W9S$x=O>oI$q)oMLvXIrh-W0sJ8`Wm{B(|#%vyI_0k zpF_HtXsdAK=p2#XV}9{%Z#DTu2R+IJvum!{du}e-rS}FA=zI-%kikL@Q$q*u;2OVv z2kA02eqtVf%-$(t4cp!ZYDMlZ+ujD|>&u5-b7N!I)C~B)#~#T*ww+MORH?<|FVnUq ziBBR^7kd1Ypt`dzvQS!I1I9oV(_pIpDNj7avwktaws`%MA5*tK0vUdvO1wB(@|Qu2NtQ9fdd(JO#G8qI zU)8yokw`^i(O6hP^v$sZG)pjI3i{TLN0sCj6?yY&%9hLor~*V4I7pQ(K@DrEG{}od zsWJmE$`*Aqd|R0;BGXkUfp4YPCF8NAx^}DC<$*3=DjLnG8f1v@Qi)4hmt3=5Q!|EK z!%8|5ODgf0w%urfV#lV`6YG)@b(`KYxkt&U*c8#<=S0+~(ihjDsUww0#I$ZxQ^?D# zaJ(-A-%+vTmW}Du)>uSMoBS~OW;8|17%i|;Us_G1wyI=St}dAw%rqr|;)c12-dve8 zCCpa0X(})s>5RqIZyFztsE%7>4_{g(0pk_;UA-1;mOz~WmEwursUDyR@_MjUq*SB&+g0!Zl`bKL&sv%RWTPh@9bv^oe-NzFT_0e{^I#AIL zNltYQR4PD3Qagoo#wZU+bl)m+JH0%gG)f5K6+q0ez+t-VFpDgZ!Bu27eXh<0M2sdJ z=ovXL(y>*9(i0{UUlYV27JC@P8JV7y1E=J6e+Qan16C@=S*m~oj!y-Idh5S1yoj3K zBl9APMBiCOYUyYlIqYG&o?5*S?4_#=Wb1U?R`AB4&4k`X*9Z2O+4Tm7WPBxH>P|mi zMQZ55g`}x?_(^((fQEWM-4M9sPQdt}0b^Ya6wWOq&30o&>%e%+7)6{RZ@X3ev7SIn zyd-3`3i4tm%?e2LSF6DgcL&IZ98x){2_w6nZVcQ+vSu;KdP1_ft?pz`AldUKS;^dv zutFjAVFXvwHG#8}+My%`KR|;0?G8GOpohMes~DC);PB6Ag(lDM-wUdHVUfXe0Z=jFiq! 
zPnpPu<9B1XQb=4m@=KHLk`{T+t^wq0#UKj_Fh{;v0QnFizkHV=4IZq(I;_l@109@_ z%Oa~`O7tk=4BzF9SAdf*Ylp|R55nBWa=ESztYa85uI6mgI-z#Hg&`e zauUn#8Old89hL*bmF@lqCR%}YJ$1cBu;8=4RQw9Ql@5w=_2wFgtC#M!;;NpA8Q)u0 z-h!#v&@@+Vv$`%l*a6z!*XS@y11IuMe>*N;Mr!~Ce15w(8_-!&{LF&Hc|FBzI{=h@6l^r}0(fnq(i#2)^}q~Hi# zH{5A->og?#!df75eTgGAF{oJC35xji3_H7mh>xrVVH^%}Vdx>sKp86nB6fDB9b5M% zorWY%BaUAjUz)`z$-}ET$No@A?J^=8fWTb3!3w zy8!qL2!7QBH$$O2(>c+QaeiBV&JQE_fhK#*YRd!e42X>L0^lbR{EP#<-T-&RM8@ds zTt-;|T%s2c>PC~h^C=h>8I$>eM-g~%sokdQd1VADBFs(fmFYe?kR|QK9VQhw3SvxA zM^nY2i4mYn_^{g3N#@bvrR10_W|r0xS(S(T!hUMkfi2+wq7(Md{+4{g63asz@IoQL z;|O@t0ocZ+@6#(wyrRTFH_`4u|D1Mykj5WCoaW&gZxe-5g+kHK*d#iFls{@A>kXlx zV}H;|*sNQ*e1zbK+~}MfS?Omy?6gjz*O11sR{ptABn^jeKjwF(VxIg-^tt6T=)^joZCuO=z4xF3Yp8tq8e-EfF z@4;^VS)#y#W>E(Z#Oc-hz_(>rkY`QbW|vsL#RxlWGkq{{z6J^vA@~9I*xw>S4j|j2vZKQa2D%_jy>pLI z@G-u$>v?%Or>x9quccjq>p+_wZR>vmP2tU*)*DFcrFOECEzy;htyaxxotOu-umGd~ ziL|UYcUtF>)a#X=uBFelnm77@NLzs4TP7FzKzCFu76T< zW3UnKUM|WRk$F6B-tBKpr=DmuZ+`lJx1g9%1*Z#eeN+!=#f)dC&3p!4QiX%1u`Zx4 zW;a*yE{(As3m~2k9Bv>L|tjxah~yy7st93U?KFR zQgKy5TP;$Jc$I)p|GcDW9>I6B%LuvbzQ9z;Pb{|UE=MA@#E-MA*}iKOK^EF zo?DwrxIQrf6rpl$p_FK&)*X$~AWHBc{MJWl6f9}wxh67RxZHas7E-AWoRz6donC_M z^pkK-fN$IaJ7r=H*xaK0mJFv|qWAk$yq4Ql9E+ino#6jcbTJk6e_v`{MYBw!WRLuR zUux8Giisa3I56{u7*|a(E-YrHoF&J2ymH2gl{{OxpEiCFz39m27qepX${U|lGUvt( zIS#<7ekK`n2^JwWjP*slELQvOUN zZly=Ul|jx+QNg+drJ2mnTmF%9K*Bd7QWxo{3f%t-Mq21VTRelDf zOVF^r)|AUz0X^$e`IM6Pvio)_8OFFZYVlEIemJI(9^NVW?+1s2|6oKzB3)}#ZE8b} zp_ay`+WMZpSUghO*0Nkx!lBldo?u&adDF6}5^8Q)9t(Tg66c#z^%L^kQB2Gg%3RiyML2>uxV zg?~$&et7aSyYuGFdklk1|2ZuA#7itWOJ21KW5>Ro+wHXJSV(3Qn> onTrino().executeQuery("SELECT * FROM delta.default." + tableName)) + assertQueryFailure(() -> onTrino().executeQuery(selectWithChecksumFilesDisabled)) .hasMessageContaining("Error opening Hive split"); // Verify flushing cache resolve the query failure onTrino().executeQuery("CALL delta.system.flush_metadata_cache(schema_name => 'default', table_name => '" + tableName + "')"); - assertThat(onTrino().executeQuery("SELECT * FROM delta.default." + tableName)).containsOnly(row(2)); + assertThat(onTrino().executeQuery(selectWithChecksumFilesDisabled)).containsOnly(row(2)); onTrino().executeQuery("DROP TABLE delta.default." + tableName); } From a4b984916e5cdfb48927cce99a23291bbeeab752 Mon Sep 17 00:00:00 2001 From: Raunaq Morarka Date: Fri, 24 Apr 2026 17:40:21 +0530 Subject: [PATCH 3/3] Cache Delta table descriptor parsed from checksum file The checksum fast path in getTableHandle bypasses the TableSnapshot cache and therefore re-parses the .crc file on every query for an unchanged table. Add a cross-query cache on TransactionLogAccess keyed by (schema.table, location, version), populated by the checksum loader, so repeated queries reuse the parsed metadata and protocol. Cache Optional so a missing or malformed checksum is remembered too; subsequent calls fall through to the transaction-log path without re-reading the .crc. The cache is bounded to 1000 entries (descriptors are small) and invalidated alongside tableSnapshots in flushCache and invalidateCache. 
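A self-contained sketch of the caching pattern described above (not part of the patch): Guava's `CacheBuilder` stands in for Trino's `EvictableCacheBuilder`, a `String` stands in for the parsed descriptor, and all names here are hypothetical. The essential points are that `Optional.empty()` is a cached value in its own right, and that invalidation matches entries by table name or by location:

```
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

import java.util.Optional;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;

public class DescriptorCacheSketch
{
    // Key shape mirrors the patch: (schema.table, location, version)
    record CacheKey(String schemaTable, String location, long version) {}

    private final Cache<CacheKey, Optional<String>> descriptors = CacheBuilder.newBuilder()
            .maximumSize(1000)                      // descriptors are small; bound by count
            .expireAfterWrite(5, TimeUnit.MINUTES)  // stand-in for the metadata cache TTL
            .build();

    Optional<String> loadDescriptor(CacheKey key, Supplier<Optional<String>> loader)
    {
        try {
            // Optional.empty() is cached as well: a missing or malformed checksum is
            // remembered, and later lookups skip straight to the transaction-log path
            return descriptors.get(key, loader::get);
        }
        catch (ExecutionException e) {
            throw new RuntimeException(e.getCause());
        }
    }

    void invalidate(String schemaTable, Optional<String> location)
    {
        // Drop entries matching the table name or its location, mirroring invalidateCache
        descriptors.asMap().keySet().removeIf(key ->
                key.schemaTable().equals(schemaTable)
                        || location.map(key.location()::equals).orElse(false));
    }
}
```

Caching the empty Optional trades a little memory for skipping repeated negative lookups; the TTL bounds how long a stale miss can mask a newly written checksum.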
--- .../plugin/deltalake/DeltaLakeMetadata.java | 42 +++++++-------- .../DeltaLakeTableDescriptor.java | 25 +++++++++ .../transactionlog/TransactionLogAccess.java | 50 +++++++++++++++-- ...stDeltaLakeAlluxioCacheFileOperations.java | 4 +- .../TestDeltaLakeFileOperations.java | 20 +++---- .../deltalake/TestTransactionLogAccess.java | 54 +++++++++++++++++++ 6 files changed, 154 insertions(+), 41 deletions(-) create mode 100644 plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeTableDescriptor.java diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java index 4e9ce140a929..cb018eb635f0 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java @@ -68,6 +68,7 @@ import io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport; import io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.ColumnMappingMode; import io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.UnsupportedTypeException; +import io.trino.plugin.deltalake.transactionlog.DeltaLakeTableDescriptor; import io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry; import io.trino.plugin.deltalake.transactionlog.DeltaLakeVersionChecksum; import io.trino.plugin.deltalake.transactionlog.MetadataEntry; @@ -491,15 +492,6 @@ private record QueriedTable(SchemaTableName schemaTableName, long version) } } - private record DeltaLakeTableDescriptor(long version, MetadataEntry metadataEntry, ProtocolEntry protocolEntry) - { - DeltaLakeTableDescriptor - { - requireNonNull(metadataEntry, "metadataEntry is null"); - requireNonNull(protocolEntry, "protocolEntry is null"); - } - } - public DeltaLakeMetadata( DeltaLakeMetastore metastore, TransactionLogAccess transactionLogAccess, @@ -819,21 +811,23 @@ private Optional<DeltaLakeTableDescriptor> loadDescriptorFromChecksum( { long latestCommitVersion = endTableVersion.orElseGet(() -> resolveLatestCommitVersion(tableName, fileSystem, tableLocation)); - Optional<DeltaLakeVersionChecksum> versionChecksum; - try { - versionChecksum = readVersionChecksumFile(fileSystem, tableLocation, latestCommitVersion); - } - catch (IOException | UncheckedIOException e) { - throw new TrinoException(DELTA_LAKE_FILESYSTEM_ERROR, format("Failed to read checksum file for version %d of table %s", latestCommitVersion, tableName), e); - } - if (versionChecksum.isEmpty()) { - return Optional.empty(); - } - DeltaLakeVersionChecksum checksum = versionChecksum.get(); - if (checksum.metadata().isEmpty() || checksum.protocol().isEmpty()) { - return Optional.empty(); - } - return Optional.of(new DeltaLakeTableDescriptor(latestCommitVersion, checksum.metadata().orElseThrow(), checksum.protocol().orElseThrow())); + return transactionLogAccess.loadDescriptor(tableName, tableLocation, latestCommitVersion, () -> { + Optional<DeltaLakeVersionChecksum> versionChecksum; + try { + versionChecksum = readVersionChecksumFile(fileSystem, tableLocation, latestCommitVersion); + } + catch (IOException | UncheckedIOException e) { + throw new TrinoException(DELTA_LAKE_FILESYSTEM_ERROR, format("Failed to read checksum file for version %d of table %s", latestCommitVersion, tableName), e); + } + if (versionChecksum.isEmpty()) { + return Optional.empty(); + } + DeltaLakeVersionChecksum checksum = versionChecksum.get(); + if (checksum.metadata().isEmpty() || checksum.protocol().isEmpty()) {
return Optional.empty(); + } + return Optional.of(new DeltaLakeTableDescriptor(latestCommitVersion, checksum.metadata().orElseThrow(), checksum.protocol().orElseThrow())); + }); } // Reuse the version resolved earlier in this transaction to skip _last_checkpoint and transaction log listing diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeTableDescriptor.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeTableDescriptor.java new file mode 100644 index 000000000000..5acbbeab63f5 --- /dev/null +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeTableDescriptor.java @@ -0,0 +1,25 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.deltalake.transactionlog; + +import static java.util.Objects.requireNonNull; + +public record DeltaLakeTableDescriptor(long version, MetadataEntry metadataEntry, ProtocolEntry protocolEntry) +{ + public DeltaLakeTableDescriptor + { + requireNonNull(metadataEntry, "metadataEntry is null"); + requireNonNull(protocolEntry, "protocolEntry is null"); + } +} diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java index b7b0271e98e9..bc9c74547861 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java @@ -78,6 +78,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; import java.util.function.Predicate; +import java.util.function.Supplier; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Stream; @@ -90,6 +91,7 @@ import static io.airlift.slice.SizeOf.estimatedSizeOf; import static io.airlift.slice.SizeOf.instanceSize; import static io.trino.cache.CacheUtils.invalidateAllIf; +import static io.trino.cache.CacheUtils.uncheckedCacheGet; import static io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_SCHEMA; import static io.trino.plugin.deltalake.transactionlog.TransactionLogParser.readLastCheckpoint; import static io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.getTransactionLogDir; @@ -108,6 +110,8 @@ public class TransactionLogAccess private static final Pattern MULTI_PART_CHECKPOINT = Pattern.compile("(\\d*)\\.checkpoint\\.(\\d*)\\.(\\d*)\\.parquet"); private static final Pattern V2_CHECKPOINT = Pattern.compile("(\\d*)\\.checkpoint\\.[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\\.(json|parquet)"); + private static final long DESCRIPTOR_CACHE_MAX_SIZE = 1000; + private final TypeManager typeManager; private final CheckpointSchemaManager checkpointSchemaManager; private final FileFormatDataSourceStats fileFormatDataSourceStats; @@ -121,6 
+125,7 @@ public class TransactionLogAccess private final TransactionLogReaderFactory transactionLogReaderFactory; private final Cache tableSnapshots; + private final Cache<TableDescriptorCacheKey, Optional<DeltaLakeTableDescriptor>> tableDescriptors; @Inject public TransactionLogAccess( @@ -152,6 +157,12 @@ public TransactionLogAccess( .shareNothingWhenDisabled() .recordStats() .build(); + tableDescriptors = EvictableCacheBuilder.newBuilder() + .maximumSize(DESCRIPTOR_CACHE_MAX_SIZE) + .expireAfterWrite(deltaLakeConfig.getMetadataCacheTtl().toMillis(), TimeUnit.MILLISECONDS) + .shareNothingWhenDisabled() + .recordStats() + .build(); } @Managed @@ -161,6 +172,22 @@ public CacheStatsMBean getMetadataCacheStats() return new CacheStatsMBean(tableSnapshots); } + @Managed + @Nested + public CacheStatsMBean getDescriptorCacheStats() + { + return new CacheStatsMBean(tableDescriptors); + } + + public Optional<DeltaLakeTableDescriptor> loadDescriptor( + SchemaTableName tableName, + String tableLocation, + long version, + Supplier<Optional<DeltaLakeTableDescriptor>> loader) + { + return uncheckedCacheGet(tableDescriptors, new TableDescriptorCacheKey(tableName, tableLocation, version), loader); + } + public TableSnapshot loadSnapshot(ConnectorSession session, DeltaMetastoreTable table, Optional<Long> endVersion) throws IOException { @@ -335,16 +362,20 @@ private static Optional<Long> extractCheckpointVersion(FileEntry file) public void flushCache() { tableSnapshots.invalidateAll(); + tableDescriptors.invalidateAll(); } public void invalidateCache(SchemaTableName schemaTableName, Optional<String> tableLocation) { requireNonNull(schemaTableName, "schemaTableName is null"); - // Invalidate by location in case one table (location) unregistered and re-register under different name - tableLocation.ifPresent(location -> { - invalidateAllIf(tableSnapshots, cacheKey -> cacheKey.location().equals(location)); - }); - invalidateAllIf(tableSnapshots, cacheKey -> cacheKey.tableName().equals(schemaTableName)); + // Invalidate by location in case one table (location) unregistered and re-registered under a different name + invalidateAllIf(tableSnapshots, key -> matchesNameOrLocation(key.tableName(), key.location(), schemaTableName, tableLocation)); + invalidateAllIf(tableDescriptors, key -> matchesNameOrLocation(key.tableName(), key.location(), schemaTableName, tableLocation)); + } + + private static boolean matchesNameOrLocation(SchemaTableName name, String location, SchemaTableName matchName, Optional<String> matchLocation) + { + return name.equals(matchName) || matchLocation.map(location::equals).orElse(false); } public MetadataEntry getMetadataEntry(ConnectorSession session, TrinoFileSystem fileSystem, TableSnapshot tableSnapshot) @@ -708,4 +739,13 @@ long getRetainedSizeInBytes() estimatedSizeOf(location); } } + + private record TableDescriptorCacheKey(SchemaTableName tableName, String location, long version) + { + TableDescriptorCacheKey + { + requireNonNull(tableName, "tableName is null"); + requireNonNull(location, "location is null"); + } + } } diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheFileOperations.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheFileOperations.java index e6c9835301a9..19ad75f33c57 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheFileOperations.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheFileOperations.java @@ -641,7 +641,7 @@ public void testCreateOrReplaceTable() .add(new CacheOperation("InputFile.newStream",
"00000000000000000000.json")) .add(new CacheOperation("Alluxio.writeCache", "00000000000000000000.json", 0, 821)) .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) - .addCopies(new CacheOperation("InputFile.length", "00000000000000000000.crc"), 2) + .add(new CacheOperation("InputFile.length", "00000000000000000000.crc")) .add(new CacheOperation("InputFile.exists", "00000000000000000001.json")) .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) @@ -669,7 +669,7 @@ public void testCreateOrReplaceTableAsSelect() .add(new CacheOperation("Alluxio.writeCache", "00000000000000000000.json", 0, 1063)) .add(new CacheOperation("InputFile.newStream", "00000000000000000000.json")) .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) - .addCopies(new CacheOperation("InputFile.length", "00000000000000000000.crc"), 3) + .add(new CacheOperation("InputFile.length", "00000000000000000000.crc")) .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) .add(new CacheOperation("InputFile.exists", "00000000000000000001.json")) .add(new CacheOperation("InputFile.exists", "extendeded_stats.json")) diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeFileOperations.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeFileOperations.java index 99c41f6f62c4..9b54a369eec4 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeFileOperations.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeFileOperations.java @@ -145,7 +145,7 @@ public void testCreateOrReplaceTable() assertFileSystemAccesses("CREATE OR REPLACE TABLE test_create_or_replace (id VARCHAR, age INT)", ImmutableMultiset.builder() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) - .addCopies(new FileOperation(CHECKSUM, "00000000000000000000.crc", "InputFile.newStream"), 2) + .add(new FileOperation(CHECKSUM, "00000000000000000000.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) @@ -177,7 +177,7 @@ public void testCreateOrReplaceTableAsSelect() .add(new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "InputFile.newStream")) .add(new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) - .addCopies(new FileOperation(CHECKSUM, "00000000000000000000.crc", "InputFile.newStream"), 3) + .add(new FileOperation(CHECKSUM, "00000000000000000000.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) @@ -464,7 +464,7 @@ public void testSelfJoin() ImmutableMultiset.builder() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", 
"InputFile.newStream")) - .addCopies(new FileOperation(CHECKSUM, "00000000000000000000.crc", "InputFile.newStream"), 2) + .add(new FileOperation(CHECKSUM, "00000000000000000000.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "InputFile.newStream")) @@ -718,7 +718,7 @@ public void testDeleteWholePartition() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) - .addCopies(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream"), 2) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "InputFile.exists")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) @@ -748,7 +748,7 @@ public void testDeleteWholeTable() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) - .addCopies(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream"), 2) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "InputFile.exists")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) @@ -779,7 +779,7 @@ public void testDeleteWithNonPartitionFilter() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "InputFile.newStream")) - .addCopies(new FileOperation(CHECKSUM, "00000000000000000003.crc", "InputFile.newStream"), 2) + .add(new FileOperation(CHECKSUM, "00000000000000000003.crc", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000004.json", "InputFile.exists")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000004.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) @@ -1048,7 +1048,7 @@ private void testInformationSchemaColumns(boolean removeCachedProperties) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) - .addCopies(new FileOperation(CHECKSUM, "00000000000000000002.crc", "InputFile.newStream"), 2) + .add(new FileOperation(CHECKSUM, "00000000000000000002.crc", 
"InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) @@ -1223,7 +1223,7 @@ public void testLoadMetadataFromChecksumFileForDescribe() assertFileSystemAccesses(session, "DESCRIBE " + tableName, ImmutableMultiset.builder() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) - .addCopies(new FileOperation(CHECKSUM, "00000000000000000001.crc", "InputFile.newStream"), 2) + .add(new FileOperation(CHECKSUM, "00000000000000000001.crc", "InputFile.newStream")) .build()); assertUpdate(session, "DROP TABLE " + tableName); @@ -1278,7 +1278,7 @@ public void testLoadMetadataFromMissingLatestChecksumFileForDescribe() assertFileSystemAccesses(session, "DESCRIBE " + tableName, ImmutableMultiset.builder() - .addCopies(new FileOperation(CHECKSUM, "00000000000000000001.crc", "InputFile.newStream"), 2) + .add(new FileOperation(CHECKSUM, "00000000000000000001.crc", "InputFile.newStream")) .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.parquet", "InputFile.length")) .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.parquet", "InputFile.newInput")) @@ -1308,7 +1308,7 @@ public void testLoadMetadataFromChecksumFileWithoutMetadataForDescribe() assertFileSystemAccesses(session, "DESCRIBE " + tableName, ImmutableMultiset.builder() - .addCopies(new FileOperation(CHECKSUM, "00000000000000000001.crc", "InputFile.newStream"), 2) + .add(new FileOperation(CHECKSUM, "00000000000000000001.crc", "InputFile.newStream")) .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.parquet", "InputFile.length")) .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.parquet", "InputFile.newInput")) diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java index 0505ea7b7c96..40122835ea23 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java @@ -26,6 +26,7 @@ import io.trino.plugin.base.metrics.FileFormatDataSourceStats; import io.trino.plugin.deltalake.metastore.NoOpVendedCredentialsProvider; import io.trino.plugin.deltalake.transactionlog.AddFileEntry; +import io.trino.plugin.deltalake.transactionlog.DeltaLakeTableDescriptor; import io.trino.plugin.deltalake.transactionlog.MetadataEntry; import io.trino.plugin.deltalake.transactionlog.ProtocolEntry; import io.trino.plugin.deltalake.transactionlog.TableSnapshot; @@ -63,6 +64,8 @@ import java.util.OptionalInt; import java.util.Set; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Supplier; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -432,6 +435,57 @@ public void testMetadataCacheUpdates() assertThat(updatedSnapshot.getVersion()).isEqualTo(12); } + @Test + public void testDescriptorCache() + throws Exception + { + setupTransactionLogAccessFromResources("person", "databricks73/person"); + + 
SchemaTableName tableName = new SchemaTableName("schema", "person"); + AtomicInteger loaderInvocations = new AtomicInteger(); + DeltaLakeTableDescriptor descriptor = new DeltaLakeTableDescriptor( + 0L, + new MetadataEntry("id", "test", "description", null, "", ImmutableList.of(), ImmutableMap.of(), 0), + new ProtocolEntry(1, 2, Optional.empty(), Optional.empty())); + Supplier<Optional<DeltaLakeTableDescriptor>> loader = () -> { + loaderInvocations.incrementAndGet(); + return Optional.of(descriptor); + }; + + // Same key: second call hits the cache. + transactionLogAccess.loadDescriptor(tableName, tableLocation, 0L, loader); + transactionLogAccess.loadDescriptor(tableName, tableLocation, 0L, loader); + assertThat(loaderInvocations).hasValue(1); + + // Different version: cache miss. + transactionLogAccess.loadDescriptor(tableName, tableLocation, 1L, loader); + assertThat(loaderInvocations).hasValue(2); + + // Negative result is also cached. + Supplier<Optional<DeltaLakeTableDescriptor>> emptyLoader = () -> { + loaderInvocations.incrementAndGet(); + return Optional.empty(); + }; + transactionLogAccess.loadDescriptor(tableName, tableLocation, 2L, emptyLoader); + transactionLogAccess.loadDescriptor(tableName, tableLocation, 2L, emptyLoader); + assertThat(loaderInvocations).hasValue(3); + + // Invalidating by name + location clears all entries for that table. + transactionLogAccess.invalidateCache(tableName, Optional.of(tableLocation)); + transactionLogAccess.loadDescriptor(tableName, tableLocation, 0L, loader); + assertThat(loaderInvocations).hasValue(4); + + // Invalidating by name with no location (used by flush_metadata_cache) also clears the entry. + transactionLogAccess.invalidateCache(tableName, Optional.empty()); + transactionLogAccess.loadDescriptor(tableName, tableLocation, 0L, loader); + assertThat(loaderInvocations).hasValue(5); + + // flushCache clears all cached descriptors. + transactionLogAccess.flushCache(); + transactionLogAccess.loadDescriptor(tableName, tableLocation, 0L, loader); + assertThat(loaderInvocations).hasValue(6); + } + @Test public void testUpdatingTailEntriesNoCheckpoint() throws Exception