Skip to content

Commit ac319d9

Browse files
committed
Add tests for $files table delete file deduplication
Add testFilesTableDeleteFileDeduplication to BaseIcebergSystemTables that verifies the $files table shows each delete file exactly once, with no duplicate entries (v2 position + equality deletes). Add testFilesTableDeletionVectors that verifies v3 deletion vector behavior: multiple DV entries share the same Puffin file_path in the $files table. Currently there are no content_offset/content_size_in_bytes columns to distinguish individual DVs within the shared Puffin file. Follow-up to trinodb#28911 as requested by findinpath.
1 parent a10f96a commit ac319d9

1 file changed

Lines changed: 79 additions & 0 deletions

File tree

plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergSystemTables.java

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,85 @@ public void testFilesTable()
431431
}
432432
}
433433

434+
@Test
435+
public void testFilesTableDeleteFileDeduplication()
436+
throws Exception
437+
{
438+
try (TestTable testTable = newTrinoTable("test_files_delete_dedup_",
439+
"WITH (partitioning = ARRAY['regionkey']) AS SELECT * FROM tpch.tiny.nation")) {
440+
String tableName = testTable.getName();
441+
Table icebergTable = loadTable(tableName);
442+
443+
// Verify initial state: only data files, no delete files
444+
assertThat(query("SELECT count(*) FROM \"" + tableName + "$files\" WHERE content = 0"))
445+
.matches("VALUES BIGINT '5'"); // one data file per regionkey partition
446+
assertThat(query("SELECT count(*) FROM \"" + tableName + "$files\" WHERE content != 0"))
447+
.matches("VALUES BIGINT '0'");
448+
449+
// Write a position delete via MOR path
450+
assertUpdate("DELETE FROM " + tableName + " WHERE nationkey = 7", 1);
451+
452+
// Write an equality delete file for regionkey=2
453+
writeEqualityDeleteForTable(icebergTable, fileSystemFactory,
454+
Optional.of(icebergTable.spec()),
455+
Optional.of(new PartitionData(new Long[] {2L})),
456+
ImmutableMap.of("regionkey", 2L),
457+
Optional.empty());
458+
459+
// Verify: each file path should appear exactly once (no duplicates)
460+
assertThat(query("SELECT count(*) FROM \"" + tableName + "$files\" WHERE content = 1"))
461+
.matches("VALUES BIGINT '1'"); // exactly 1 position delete file
462+
assertThat(query("SELECT count(*) FROM \"" + tableName + "$files\" WHERE content = 2"))
463+
.matches("VALUES BIGINT '1'"); // exactly 1 equality delete file
464+
465+
// Verify no duplicate file paths exist
466+
assertThat(query("SELECT count(file_path) - count(DISTINCT file_path) FROM \"" + tableName + "$files\""))
467+
.matches("VALUES BIGINT '0'");
468+
}
469+
}
470+
471+
@Test
472+
public void testFilesTableDeletionVectors()
473+
{
474+
try (TestTable testTable = newTrinoTable("test_files_dv_",
475+
"(id INTEGER) WITH (format_version = 3, format = 'PARQUET')")) {
476+
String tableName = testTable.getName();
477+
478+
// Insert data across multiple data files
479+
for (int i = 0; i < 3; i++) {
480+
assertUpdate("INSERT INTO " + tableName + " SELECT x FROM UNNEST(sequence(%s, %s)) t(x)".formatted(i * 100 + 1, (i + 1) * 100), 100);
481+
}
482+
483+
// Verify initial state: 3 data files, no delete files
484+
assertThat(query("SELECT count(*) FROM \"" + tableName + "$files\" WHERE content = 0"))
485+
.matches("VALUES BIGINT '3'");
486+
assertThat(query("SELECT count(*) FROM \"" + tableName + "$files\" WHERE content != 0"))
487+
.matches("VALUES BIGINT '0'");
488+
489+
// Delete rows to create deletion vectors (stored in shared Puffin files)
490+
assertUpdate("DELETE FROM " + tableName + " WHERE id % 2 = 0", 150);
491+
492+
// In v3, deletion vectors for multiple data files are stored in a single Puffin file.
493+
// The $files table shows one entry per DV (one per data file), all sharing the same file_path.
494+
// Currently there are no content_offset/content_size_in_bytes columns to distinguish
495+
// individual DVs within the shared Puffin file.
496+
assertThat(query("SELECT count(*) FROM \"" + tableName + "$files\" WHERE content = 1"))
497+
.matches("VALUES BIGINT '3'"); // one DV entry per data file
498+
assertThat(query("SELECT count_if(file_format = 'PUFFIN') FROM \"" + tableName + "$files\" WHERE content = 1"))
499+
.matches("VALUES BIGINT '3'");
500+
// All DV entries share the same Puffin file path
501+
assertThat(query("SELECT count(DISTINCT file_path) FROM \"" + tableName + "$files\" WHERE content = 1"))
502+
.matches("VALUES BIGINT '1'");
503+
504+
// The $files table does not yet expose content_offset/content_size_in_bytes columns,
505+
// which are needed to identify individual DVs within the shared Puffin file.
506+
assertThat(computeActual("SHOW COLUMNS FROM \"" + tableName + "$files\"")
507+
.getMaterializedRows().stream()
508+
.map(row -> (String) row.getField(0)))
509+
.doesNotContain("content_offset", "content_size_in_bytes");
510+
}
511+
}
512+
434513
@Test
435514
public void testFilesPartitionTable()
436515
{

0 commit comments

Comments
 (0)