Skip to content
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 224 additions & 0 deletions src/main/flatbuf/parquet3.fbs
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
namespace parquet.format3;
Comment thread
alkis marked this conversation as resolved.
Outdated

// Optimization notes
Comment thread
alkis marked this conversation as resolved.
Outdated
// 1. Statistics are stored in integral types if their size is fixed, otherwise prefix + suffix
// 2. ColumnMetaData.encoding_stats are removed, they are replaced with
// ColumnMetaData.is_fully_dict_encoded.
// 3. RowGroups are limited to 2GB in size, so we can use int for sizes.
Comment thread
alkis marked this conversation as resolved.
Outdated
// 4. ColumnChunk/ColumnMetaData offsets are now relative to the start of the row group, so we can
// use int for offsets.
// 5. Remove ordinal.
// 6. Restrict RowGroups to 2^31-1 rows.
// 7. Remove offset/column indexes: the indexes are small, and the offsets that point to them are
// of similar size.

///////////////////////////////////////////////////////////////////////////////////////////////////
// Physical types.
///////////////////////////////////////////////////////////////////////////////////////////////////

// Physical type of a column's values. Stored as a `byte`; every value is
// assigned an explicit number (0 through 7). Referenced by SchemaElement.type.
enum Type : byte {
BOOLEAN = 0,
INT32 = 1,
INT64 = 2,
INT96 = 3,
FLOAT = 4,
DOUBLE = 5,
BYTE_ARRAY = 6,
FIXED_LEN_BYTE_ARRAY = 7,
}

// Repetition of a schema field. Stored as a `byte`.
// Referenced by SchemaElement.repetition_type; because REQUIRED is 0 it is
// also the value read back when that field is absent from a buffer.
enum FieldRepetitionType : byte {
REQUIRED = 0,
OPTIONAL = 1,
REPEATED = 2,
}

///////////////////////////////////////////////////////////////////////////////////////////////////
// Encodings.
///////////////////////////////////////////////////////////////////////////////////////////////////

// Page/value encodings. Stored as a `byte`.
// Note: Match the thrift enum values so that we can cast between them.
// GROUP_VAR_INT (1) and BIT_PACKED (4) are intentionally left commented out,
// so the numbering has gaps at 1 and 4; do not renumber the remaining values.
// NOTE(review): no table visible in this schema references Encoding — confirm
// whether it is still needed.
enum Encoding : byte {
PLAIN = 0,
// GROUP_VAR_INT = 1,
PLAIN_DICTIONARY = 2,
RLE = 3,
// BIT_PACKED = 4,
DELTA_BINARY_PACKED = 5,
DELTA_LENGTH_BYTE_ARRAY = 6,
DELTA_BYTE_ARRAY = 7,
RLE_DICTIONARY = 8,
BYTE_STREAM_SPLIT = 9,
}

// Compression codec of a column chunk (see ColumnMetadata.codec). Stored as a
// `byte`.
// Note: Match the thrift enum values so that we can cast between them.
// LZ4 (5) is intentionally left commented out, so the numbering has a gap at
// 5; do not renumber the remaining values.
enum CompressionCodec : byte {
UNCOMPRESSED = 0,
SNAPPY = 1,
GZIP = 2,
LZO = 3,
BROTLI = 4,
// LZ4 = 5,
Comment thread
alkis marked this conversation as resolved.
Outdated
ZSTD = 6,
LZ4_RAW = 7,
}

///////////////////////////////////////////////////////////////////////////////////////////////////
// Logical types.
///////////////////////////////////////////////////////////////////////////////////////////////////

// Table with no fields. Used as the payload of union members that carry no
// parameters (see the LogicalType and ColumnOrder unions).
table Empty {}
// Parameters of the DecimalType member of the LogicalType union.
// Both fields are plain `int` scalars with no explicit default, so an absent
// field reads back as 0.
table DecimalOpts {
Comment thread
alkis marked this conversation as resolved.
Outdated
precision: int;
scale: int;
}
// Unit selector for TimeOpts.unit. Stored as a `byte`.
// NOTE(review): MS/US/NS presumably mean milli-/micro-/nanoseconds — confirm.
enum TimeUnit : byte {
MS = 0,
Comment thread
alkis marked this conversation as resolved.
Outdated
US = 1,
NS = 2,
}
// Parameters shared by the TimeType and TimestampType members of the
// LogicalType union.
// Neither field has an explicit default: an absent is_adjusted_to_utc reads
// back as false and an absent unit reads back as TimeUnit value 0 (MS).
table TimeOpts {
is_adjusted_to_utc: bool;
unit: TimeUnit;
}
// Parameters of the IntType member of the LogicalType union.
table IntOpts {
// Explicit default of 8: an absent field reads back as 8.
bit_width: byte = 8;
// No explicit default, so an absent field reads back as false.
is_signed: bool;
}
// Parameters of the GeometryType member of the LogicalType union.
table GeometryType {
// NOTE(review): presumably the coordinate reference system identifier —
// confirm the expected format against the Parquet geospatial spec.
crs: string;
}
// Algorithm selector for GeographyType.algorithm. Stored as a `byte`.
// SPHERICAL is 0 and is therefore the value read back when the field is
// absent from a buffer.
enum EdgeInterpolationAlgorithm : byte {
SPHERICAL = 0,
VINCENTY = 1,
THOMAS = 2,
ANDOYER = 3,
KARNEY = 4,
}
// Parameters of the GeographyType member of the LogicalType union.
table GeographyType {
// NOTE(review): presumably the coordinate reference system identifier —
// confirm the expected format against the Parquet geospatial spec.
crs: string;
// No explicit default, so an absent field reads back as value 0 (SPHERICAL).
algorithm: EdgeInterpolationAlgorithm;
}
// Logical type annotation of a schema element (see SchemaElement.logical_type).
// Each member is written as `Alias:Table`. Members that take no parameters
// alias the field-less Empty table; DecimalType, TimeType/TimestampType,
// IntType, GeometryType and GeographyType carry a parameter table.
// The union discriminant is derived from member order, so new members must be
// appended at the end and existing members must not be reordered or removed.
union LogicalType {
StringType:Empty,
MapType:Empty,
ListType:Empty,
EnumType:Empty,
DecimalType:DecimalOpts,
DateType:Empty,
TimeType:TimeOpts,
TimestampType:TimeOpts,
IntType:IntOpts,
NullType:Empty,
JsonType:Empty,
BsonType:Empty,
UUIDType:Empty,
Float16Type:Empty,
VariantType:Empty,
GeometryType:GeometryType,
GeographyType:GeographyType,
}

// Per-column statistics (see ColumnMetadata.statistics): an optional null
// count plus min/max bounds packed into fixed-width integer fields
// (lo4/lo8/hi8), a length marker per bound, and a prefix string.
table Statistics {
// Optional scalar: absent means "not recorded", which is distinct from 0.
null_count: int = null;
Comment thread
alkis marked this conversation as resolved.
Outdated
// Store min/max values as fixed-size entities depending on the physical type. If len is present
// then the min/max value is present.
//
// - BOOLEAN: none
// - INT32/FLOAT: lo4 (little-endian)
// - INT64/DOUBLE: lo8 (little-endian)
// - INT96: lo4+lo8 (little-endian)
Comment thread
alkis marked this conversation as resolved.
Outdated
// - FIXED_LEN_BYTE_ARRAY:
Comment thread
alkis marked this conversation as resolved.
Outdated
// - BYTE_ARRAY:
// prefix: the longest common prefix of min/max
// lo8+hi8 zero padded 16 bytes (big-endian) of the suffix
// len: the length for the suffix of the value after removing the prefix. If > 16 then the
Comment thread
alkis marked this conversation as resolved.
Outdated
// value is inexact
// NOTE(review): min_len/max_len are declared as `byte`. A reply in the review
// thread below says they hold the truncated suffix length (at most 16) and
// that negative values mark an inexact bound, which disagrees with the
// "> 16" rule above — TODO confirm and reconcile this comment.
Comment thread
alkis marked this conversation as resolved.
Outdated
min_lo4: uint;
min_lo8: ulong;
min_hi8: ulong;
// Optional scalar: absent means no min value is stored.
min_len: byte = null;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
min_len: byte = null;
min_len: int = null;

Original suffix lenght could exceed int8 range of byte type.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @rok , the previous comment is outdated. max_len and min_len store the truncated suffix length, which means the maximum value is 16. We use negative numbers to represent inexact values. I have updated the comment and the example, please take a look.

max_lo4: uint;
max_lo8: ulong;
max_hi8: ulong;
// Optional scalar: absent means no max value is stored.
max_len: byte = null;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As above:

Suggested change
max_len: byte = null;
max_len: int = null;

// Longest common prefix of min and max (used for BYTE_ARRAY, see above).
prefix: string;
Comment thread
alkis marked this conversation as resolved.
Outdated
}

// Ordering used for a column (see SchemaElement.column_order). The only
// member, TypeDefinedOrder, aliases the field-less Empty table.
union ColumnOrder {
TypeDefinedOrder:Empty,
}

// One entry of FileMetaData.schema.
// type, type_length and field_id are optional scalars (`= null`), so "absent"
// is distinguishable from 0; num_children defaults to 0.
table SchemaElement {
name: string;
type: Type = null;
// NOTE(review): this field has no `= null` default, so an absent value reads
// back as REQUIRED (0). The review thread below points out that Thrift
// declares repetition_type as optional so that the root can omit it —
// confirm whether this field should be optional as well.
repetition_type: FieldRepetitionType;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To allow for root to not have repetition type. In thrift we have optional:

3: optional FieldRepetitionType repetition_type;

Suggested change
repetition_type: FieldRepetitionType;
repetition_type: FieldRepetitionType = null;

logical_type: LogicalType;
type_length: int = null;
num_children: int = 0;
field_id: int = null;
column_order: ColumnOrder; // only present for leaf nodes
}

// Kind of a page. Stored as a `byte`; every value is numbered explicitly.
// NOTE(review): no table visible in this schema references PageType — confirm
// whether it is still needed.
enum PageType : byte {
DATA_PAGE = 0,
INDEX_PAGE = 1,
DICTIONARY_PAGE = 2,
DATA_PAGE_V2 = 3,
}

// String key/value pair. Used as the element type of
// ColumnMetadata.key_value_metadata and FileMetaData.kv.
table KV {
Comment thread
alkis marked this conversation as resolved.
Outdated
key: string;
val: string;
}

// Metadata of one column chunk (see ColumnChunk.meta_data).
// Fields declared with `= null` are optional scalars; the remaining scalars
// read back as 0/false when absent.
// NOTE(review): the optimization notes at the top of this file say sizes and
// offsets can be `int` (row groups limited to 2GB, offsets relative to the row
// group start), but the sizes and offsets below are declared as `long` —
// confirm which one is intended.
table ColumnMetadata {
codec: CompressionCodec;
num_values: long = null; // only present if not equal to rg.num_rows
total_uncompressed_size: long;
total_compressed_size: long;
key_value_metadata: [KV];
data_page_offset: long;
index_page_offset: long = null;
dictionary_page_offset: long = null;
statistics: Statistics;
// Per the optimization notes at the top of this file, this flag replaces the
// removed ColumnMetaData.encoding_stats.
is_fully_dict_encoded: bool;
// bloom_filter_offset and bloom_filter_length are both optional.
bloom_filter_offset: long = null;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we this be made a struct/value to make the bloom filter info more self contained?

bloom_filter_length: int = null;
}

// One column chunk of a row group (element type of RowGroup.columns).
// The two commented-out fields at the end are placeholders marked TODO and
// are not part of the schema yet.
table ColumnChunk {
file_path: string;
// NOTE(review): spelled `meta_data` here while the table is `ColumnMetadata`
// and the root is `FileMetaData`; the review thread below asks whether
// `metadata` was intended — renaming changes generated accessor names.
meta_data: ColumnMetadata;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
meta_data: ColumnMetadata;
metadata: ColumnMetadata;

typo? or is metadata not allowed for some reason?

// crypto_metadata: ColumnCryptoMetadata; // TODO
// encrypted_column_metadata: [byte]; // TODO
}

// One sort key of a row group (element type of RowGroup.sorting_columns).
// None of the fields has an explicit default, so absent values read back as
// 0 / false.
table SortingColumn {
// NOTE(review): presumably an index into the row group's columns — confirm.
column_idx: int;
descending: bool;
nulls_first: bool;
}

// One row group of the file (element type of FileMetaData.row_groups).
// NOTE(review): the optimization notes at the top of this file say "Remove
// ordinal", restrict row groups to 2^31-1 rows, and allow `int` sizes, yet
// this table still declares `ordinal` and uses `long` for num_rows and the
// size fields — confirm which one is intended.
table RowGroup {
columns: [ColumnChunk];
total_byte_size: long;
num_rows: long;
sorting_columns: [SortingColumn];
file_offset: long;
total_compressed_size: long;
// Optional scalar: absent is distinguishable from 0.
ordinal: short = null;
}

// Root table of the schema (declared via `root_type FileMetaData`).
// Holds the schema elements, the row groups and file-level key/value
// metadata. The commented-out lines are either fields that moved elsewhere
// (column_orders, now SchemaElement.column_order) or placeholders marked TODO.
table FileMetaData {
version: int;
schema: [SchemaElement];
num_rows: long;
row_groups: [RowGroup];
kv: [KV];
created_by: string;
// column_orders: [ColumnOrder]; // moved to SchemaElement
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove this row for now?

// encryption_algorithm: [EncryptionAlgorithm]; // TODO
// footer_signing_key_metadata: binary; // TODO
}

// A serialized buffer of this schema has FileMetaData as its root table.
root_type FileMetaData;
Loading