@@ -431,6 +431,85 @@ public void testFilesTable()
431431 }
432432 }
433433
434+ @ Test
435+ public void testFilesTableDeleteFileDeduplication ()
436+ throws Exception
437+ {
438+ try (TestTable testTable = newTrinoTable ("test_files_delete_dedup_" ,
439+ "WITH (partitioning = ARRAY['regionkey']) AS SELECT * FROM tpch.tiny.nation" )) {
440+ String tableName = testTable .getName ();
441+ Table icebergTable = loadTable (tableName );
442+
443+ // Verify initial state: only data files, no delete files
444+ assertThat (query ("SELECT count(*) FROM \" " + tableName + "$files\" WHERE content = 0" ))
445+ .matches ("VALUES BIGINT '5'" ); // one data file per regionkey partition
446+ assertThat (query ("SELECT count(*) FROM \" " + tableName + "$files\" WHERE content != 0" ))
447+ .matches ("VALUES BIGINT '0'" );
448+
449+ // Write a position delete via MOR path
450+ assertUpdate ("DELETE FROM " + tableName + " WHERE nationkey = 7" , 1 );
451+
452+ // Write an equality delete file for regionkey=2
453+ writeEqualityDeleteForTable (icebergTable , fileSystemFactory ,
454+ Optional .of (icebergTable .spec ()),
455+ Optional .of (new PartitionData (new Long [] {2L })),
456+ ImmutableMap .of ("regionkey" , 2L ),
457+ Optional .empty ());
458+
459+ // Verify: each file path should appear exactly once (no duplicates)
460+ assertThat (query ("SELECT count(*) FROM \" " + tableName + "$files\" WHERE content = 1" ))
461+ .matches ("VALUES BIGINT '1'" ); // exactly 1 position delete file
462+ assertThat (query ("SELECT count(*) FROM \" " + tableName + "$files\" WHERE content = 2" ))
463+ .matches ("VALUES BIGINT '1'" ); // exactly 1 equality delete file
464+
465+ // Verify no duplicate file paths exist
466+ assertThat (query ("SELECT count(file_path) - count(DISTINCT file_path) FROM \" " + tableName + "$files\" " ))
467+ .matches ("VALUES BIGINT '0'" );
468+ }
469+ }
470+
471+ @ Test
472+ public void testFilesTableDeletionVectors ()
473+ {
474+ try (TestTable testTable = newTrinoTable ("test_files_dv_" ,
475+ "(id INTEGER) WITH (format_version = 3, format = 'PARQUET')" )) {
476+ String tableName = testTable .getName ();
477+
478+ // Insert data across multiple data files
479+ for (int i = 0 ; i < 3 ; i ++) {
480+ assertUpdate ("INSERT INTO " + tableName + " SELECT x FROM UNNEST(sequence(%s, %s)) t(x)" .formatted (i * 100 + 1 , (i + 1 ) * 100 ), 100 );
481+ }
482+
483+ // Verify initial state: 3 data files, no delete files
484+ assertThat (query ("SELECT count(*) FROM \" " + tableName + "$files\" WHERE content = 0" ))
485+ .matches ("VALUES BIGINT '3'" );
486+ assertThat (query ("SELECT count(*) FROM \" " + tableName + "$files\" WHERE content != 0" ))
487+ .matches ("VALUES BIGINT '0'" );
488+
489+ // Delete rows to create deletion vectors (stored in shared Puffin files)
490+ assertUpdate ("DELETE FROM " + tableName + " WHERE id % 2 = 0" , 150 );
491+
492+ // In v3, deletion vectors for multiple data files are stored in a single Puffin file.
493+ // The $files table shows one entry per DV (one per data file), all sharing the same file_path.
494+ // Currently there are no content_offset/content_size_in_bytes columns to distinguish
495+ // individual DVs within the shared Puffin file.
496+ assertThat (query ("SELECT count(*) FROM \" " + tableName + "$files\" WHERE content = 1" ))
497+ .matches ("VALUES BIGINT '3'" ); // one DV entry per data file
498+ assertThat (query ("SELECT count_if(file_format = 'PUFFIN') FROM \" " + tableName + "$files\" WHERE content = 1" ))
499+ .matches ("VALUES BIGINT '3'" );
500+ // All DV entries share the same Puffin file path
501+ assertThat (query ("SELECT count(DISTINCT file_path) FROM \" " + tableName + "$files\" WHERE content = 1" ))
502+ .matches ("VALUES BIGINT '1'" );
503+
504+ // The $files table does not yet expose content_offset/content_size_in_bytes columns,
505+ // which are needed to identify individual DVs within the shared Puffin file.
506+ assertThat (computeActual ("SHOW COLUMNS FROM \" " + tableName + "$files\" " )
507+ .getMaterializedRows ().stream ()
508+ .map (row -> (String ) row .getField (0 )))
509+ .doesNotContain ("content_offset" , "content_size_in_bytes" );
510+ }
511+ }
512+
434513 @ Test
435514 public void testFilesPartitionTable ()
436515 {
0 commit comments