5 changes: 5 additions & 0 deletions core/src/main/java/org/apache/iceberg/TableProperties.java
@@ -143,6 +143,11 @@ private TableProperties() {}
public static final String DELETE_PARQUET_PAGE_ROW_LIMIT = "write.delete.parquet.page-row-limit";
public static final int PARQUET_PAGE_ROW_LIMIT_DEFAULT = 20_000;

public static final String PARQUET_ROW_GROUP_ROW_LIMIT = "write.parquet.row-group-row-limit";
public static final String DELETE_PARQUET_ROW_GROUP_ROW_LIMIT =
"write.delete.parquet.row-group-row-limit";
public static final int PARQUET_ROW_GROUP_ROW_LIMIT_DEFAULT = Integer.MAX_VALUE;

public static final String PARQUET_DICT_SIZE_BYTES = "write.parquet.dict-size-bytes";
public static final String DELETE_PARQUET_DICT_SIZE_BYTES =
"write.delete.parquet.dict-size-bytes";
25 changes: 25 additions & 0 deletions parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java
@@ -26,6 +26,7 @@
import static org.apache.iceberg.TableProperties.DELETE_PARQUET_PAGE_VERSION;
import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT;
import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT;
import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_ROW_LIMIT;
import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX;
import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_FPP_PREFIX;
@@ -49,6 +50,8 @@
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_ROW_LIMIT;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_ROW_LIMIT_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT;

@@ -365,6 +368,7 @@ public <D> FileAppender<D> build() throws IOException {
int rowGroupSize = context.rowGroupSize();
int pageSize = context.pageSize();
int pageRowLimit = context.pageRowLimit();
int rowGroupRowLimit = context.rowGroupRowLimit();
int dictionaryPageSize = context.dictionaryPageSize();
String compressionLevel = context.compressionLevel();
CompressionCodecName codec = context.codec();
@@ -433,6 +437,7 @@ public <D> FileAppender<D> build() throws IOException {
.withWriterVersion(context.writerVersion())
.withPageSize(pageSize)
.withPageRowCountLimit(pageRowLimit)
.withRowGroupRowCountLimit(rowGroupRowLimit)
.withDictionaryEncoding(dictionaryEnabled)
.withDictionaryPageSize(dictionaryPageSize)
.withMinRowCountForPageSizeCheck(rowGroupCheckMinRecordCount)
@@ -476,6 +481,7 @@ public <D> FileAppender<D> build() throws IOException {
.withRowGroupSize((long) rowGroupSize)
.withPageSize(pageSize)
.withPageRowCountLimit(pageRowLimit)
.withRowGroupRowCountLimit(rowGroupRowLimit)
.withDictionaryEncoding(dictionaryEnabled)
.withDictionaryPageSize(dictionaryPageSize)
.withEncryption(fileEncryptionProperties);
@@ -498,6 +504,7 @@ static class Context {
private final int rowGroupSize;
private final int pageSize;
private final int pageRowLimit;
private final int rowGroupRowLimit;
private final int dictionaryPageSize;
private final WriterVersion writerVersion;
private final CompressionCodecName codec;
@@ -515,6 +522,7 @@ private Context(
int rowGroupSize,
int pageSize,
int pageRowLimit,
int rowGroupRowLimit,
int dictionaryPageSize,
WriterVersion writerVersion,
CompressionCodecName codec,
@@ -530,6 +538,7 @@ private Context(
this.rowGroupSize = rowGroupSize;
this.pageSize = pageSize;
this.pageRowLimit = pageRowLimit;
this.rowGroupRowLimit = rowGroupRowLimit;
this.dictionaryPageSize = dictionaryPageSize;
this.writerVersion = writerVersion;
this.codec = codec;
@@ -560,6 +569,11 @@ static Context dataContext(Map<String, String> config) {
config, PARQUET_PAGE_ROW_LIMIT, PARQUET_PAGE_ROW_LIMIT_DEFAULT);
Preconditions.checkArgument(pageRowLimit > 0, "Page row count limit must be > 0");

int rowGroupRowLimit =
PropertyUtil.propertyAsInt(
config, PARQUET_ROW_GROUP_ROW_LIMIT, PARQUET_ROW_GROUP_ROW_LIMIT_DEFAULT);
Preconditions.checkArgument(rowGroupRowLimit > 0, "Row group row count limit must be > 0");

int dictionaryPageSize =
PropertyUtil.propertyAsInt(
config, PARQUET_DICT_SIZE_BYTES, PARQUET_DICT_SIZE_BYTES_DEFAULT);
@@ -619,6 +633,7 @@ static Context dataContext(Map<String, String> config) {
rowGroupSize,
pageSize,
pageRowLimit,
rowGroupRowLimit,
dictionaryPageSize,
writerVersion,
codec,
@@ -652,6 +667,11 @@ static Context deleteContext(Map<String, String> config) {
config, DELETE_PARQUET_PAGE_ROW_LIMIT, dataContext.pageRowLimit());
Preconditions.checkArgument(pageRowLimit > 0, "Page row count limit must be > 0");

int rowGroupRowLimit =
PropertyUtil.propertyAsInt(
config, DELETE_PARQUET_ROW_GROUP_ROW_LIMIT, dataContext.rowGroupRowLimit());
Preconditions.checkArgument(rowGroupRowLimit > 0, "Row group row count limit must be > 0");

int dictionaryPageSize =
PropertyUtil.propertyAsInt(
config, DELETE_PARQUET_DICT_SIZE_BYTES, dataContext.dictionaryPageSize());
@@ -696,6 +716,7 @@ static Context deleteContext(Map<String, String> config) {
rowGroupSize,
pageSize,
pageRowLimit,
rowGroupRowLimit,
dictionaryPageSize,
writerVersion,
codec,
@@ -739,6 +760,10 @@ int pageRowLimit() {
return pageRowLimit;
}

int rowGroupRowLimit() {
return rowGroupRowLimit;
}

int dictionaryPageSize() {
return dictionaryPageSize;
}
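
Outside of table writes, the same key can be passed directly to the appender builder, since dataContext() reads it from the write config; deleteContext() falls back to the data-file value whenever the delete-specific key is unset. A minimal sketch, assuming an OutputFile and a Schema are already in scope:

// Assumes: OutputFile outputFile and Schema schema already created;
// "1000" is an arbitrary example value. Close the appender to write the footer.
FileAppender<GenericData.Record> appender =
    Parquet.write(outputFile)
        .schema(schema)
        .createWriterFunc(ParquetAvroWriter::buildWriter)
        .set(TableProperties.PARQUET_ROW_GROUP_ROW_LIMIT, "1000")
        .build();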
parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriter.java
@@ -190,6 +190,12 @@ public List<Long> splitOffsets() {
}

private void checkSize() {
// This comparison is cheap, so we don't need the "spacing out checks" logic below.
if (recordCount >= props.getRowGroupRowCountLimit()) {
flushRowGroup(false);
return;
}

if (recordCount >= nextCheckRecordCount) {
long bufferedSize = writeStore.getBufferedSize();
double avgRecordSize = ((double) bufferedSize) / recordCount;
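
The row-count guard runs first because comparing two ints is cheap, whereas the size path below calls writeStore.getBufferedSize() and is therefore only run at scheduled intervals. A simplified, illustrative sketch of the combined flow — the variable names and the spacing heuristic are assumptions, not the exact implementation:

// Cheap guard: flush as soon as the configured row cap is reached.
if (recordCount >= rowGroupRowLimit) {
  flushRowGroup(false);
  return;
}

// Expensive path: estimate buffered bytes only at scheduled checkpoints.
if (recordCount >= nextCheckRecordCount) {
  long bufferedSize = writeStore.getBufferedSize();
  double avgRecordSize = ((double) bufferedSize) / recordCount;
  if (bufferedSize >= targetRowGroupSize) {
    flushRowGroup(false);
  } else {
    // Schedule the next size check partway to the projected boundary.
    long remainingRecords = (long) ((targetRowGroupSize - bufferedSize) / avgRecordSize);
    nextCheckRecordCount = recordCount + Math.max(remainingRecords / 2, minRowsBetweenChecks);
  }
}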
33 changes: 33 additions & 0 deletions parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java
@@ -23,6 +23,7 @@
import static org.apache.iceberg.TableProperties.PARQUET_COLUMN_STATS_ENABLED_PREFIX;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_ROW_LIMIT;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES;
import static org.apache.iceberg.parquet.ParquetWritingTestUtils.createTempFile;
import static org.apache.iceberg.parquet.ParquetWritingTestUtils.write;
@@ -105,6 +106,38 @@ public void testRowGroupSizeConfigurableWithWriter() throws IOException {
}
}

@Test
public void testRowGroupRowLimitConfigurable() throws IOException {
Schema schema = new Schema(optional(1, "intCol", IntegerType.get()));

int recordCount = 25;
int rowGroupRowLimit = 10;

List<GenericData.Record> records = Lists.newArrayListWithCapacity(recordCount);
org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(schema.asStruct());
for (int i = 1; i <= recordCount; i++) {
GenericData.Record record = new GenericData.Record(avroSchema);
record.put("intCol", i);
records.add(record);
}

File file = createTempFile(temp);
write(
file,
schema,
ImmutableMap.of(PARQUET_ROW_GROUP_ROW_LIMIT, Integer.toString(rowGroupRowLimit)),
ParquetAvroWriter::buildWriter,
records.toArray(new GenericData.Record[] {}));

try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(localInput(file)))) {
List<BlockMetaData> blocks = reader.getFooter().getBlocks();
// 25 records with a 10-row limit should flush as 10 + 10 + 5
assertThat(blocks).hasSize(3);
for (BlockMetaData block : blocks) {
assertThat(block.getRowCount()).isLessThanOrEqualTo(rowGroupRowLimit);
}
}
}

@Test
public void testMetricsMissingColumnStatisticsInRowGroups() throws IOException {
Schema schema = new Schema(optional(1, "stringCol", Types.StringType.get()));
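
For delete files the corresponding key is write.delete.parquet.row-group-row-limit; per the deleteContext() change above, it inherits the data-file setting when unset. Setting it independently is analogous (the value 5000 is illustrative):

table
    .updateProperties()
    .set(TableProperties.DELETE_PARQUET_ROW_GROUP_ROW_LIMIT, "5000")
    .commit();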