-
Notifications
You must be signed in to change notification settings - Fork 486
GH-531: Add parquet flatbuf schema #544
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
f951a6d
a77d277
bf0825c
5a0baf2
c938fc7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,224 @@ | ||||||||
| namespace parquet.format3; | ||||||||
|
|
||||||||
| // Optimization notes | ||||||||
|
alkis marked this conversation as resolved.
Outdated
|
||||||||
| // 1. Statistics are stored in integral types if their size is fixed, otherwise prefix + suffix | ||||||||
| // 2. ColumnMetaData.encoding_stats are removed, they are replaced with | ||||||||
| // ColumnMetaData.is_fully_dict_encoded. | ||||||||
| // 3. RowGroups are limited to 2GB in size, so we can use int for sizes. | ||||||||
|
alkis marked this conversation as resolved.
Outdated
|
||||||||
| // 4. ColumnChunk/ColumnMetaData offsets are now relative to the start of the row group, so we can | ||||||||
| // use int for offsets. | ||||||||
| // 5. Remove ordinal. | ||||||||
| // 6. Restrict RowGroups to 2^31-1 rows. | ||||||||
| // 7. Remove offset/column indexes: they are small, and the offsets that locate them are of similar size. | ||||||||
|
|
||||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||||
| // Physical types. | ||||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||||
|
|
||||||||
| enum Type : byte { | ||||||||
| BOOLEAN = 0, | ||||||||
| INT32 = 1, | ||||||||
| INT64 = 2, | ||||||||
| INT96 = 3, | ||||||||
| FLOAT = 4, | ||||||||
| DOUBLE = 5, | ||||||||
| BYTE_ARRAY = 6, | ||||||||
| FIXED_LEN_BYTE_ARRAY = 7, | ||||||||
| } | ||||||||
|
|
||||||||
| enum FieldRepetitionType : byte { | ||||||||
| REQUIRED = 0, | ||||||||
| OPTIONAL = 1, | ||||||||
| REPEATED = 2, | ||||||||
| } | ||||||||
|
|
||||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||||
| // Encodings. | ||||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||||
|
|
||||||||
| // Note: Match the thrift enum values so that we can cast between them. | ||||||||
| enum Encoding : byte { | ||||||||
| PLAIN = 0, | ||||||||
| // GROUP_VAR_INT = 1, | ||||||||
| PLAIN_DICTIONARY = 2, | ||||||||
| RLE = 3, | ||||||||
| // BIT_PACKED = 4, | ||||||||
| DELTA_BINARY_PACKED = 5, | ||||||||
| DELTA_LENGTH_BYTE_ARRAY = 6, | ||||||||
| DELTA_BYTE_ARRAY = 7, | ||||||||
| RLE_DICTIONARY = 8, | ||||||||
| BYTE_STREAM_SPLIT = 9, | ||||||||
| } | ||||||||
|
|
||||||||
| // Note: Match the thrift enum values so that we can cast between them. | ||||||||
| enum CompressionCodec : byte { | ||||||||
| UNCOMPRESSED = 0, | ||||||||
| SNAPPY = 1, | ||||||||
| GZIP = 2, | ||||||||
| LZO = 3, | ||||||||
| BROTLI = 4, | ||||||||
| // LZ4 = 5, | ||||||||
|
alkis marked this conversation as resolved.
Outdated
|
||||||||
| ZSTD = 6, | ||||||||
| LZ4_RAW = 7, | ||||||||
| } | ||||||||
|
|
||||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||||
| // Logical types. | ||||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||||
|
|
||||||||
| table Empty {} | ||||||||
| table DecimalOpts { | ||||||||
|
alkis marked this conversation as resolved.
Outdated
|
||||||||
| precision: int; | ||||||||
| scale: int; | ||||||||
| } | ||||||||
| enum TimeUnit : byte { | ||||||||
| MS = 0, | ||||||||
|
alkis marked this conversation as resolved.
Outdated
|
||||||||
| US = 1, | ||||||||
| NS = 2, | ||||||||
| } | ||||||||
| table TimeOpts { | ||||||||
| is_adjusted_to_utc: bool; | ||||||||
| unit: TimeUnit; | ||||||||
| } | ||||||||
| table IntOpts { | ||||||||
| bit_width: byte = 8; | ||||||||
| is_signed: bool; | ||||||||
| } | ||||||||
| table GeometryType { | ||||||||
| crs: string; | ||||||||
| } | ||||||||
| enum EdgeInterpolationAlgorithm : byte { | ||||||||
| SPHERICAL = 0, | ||||||||
| VINCENTY = 1, | ||||||||
| THOMAS = 2, | ||||||||
| ANDOYER = 3, | ||||||||
| KARNEY = 4, | ||||||||
| } | ||||||||
| table GeographyType { | ||||||||
| crs: string; | ||||||||
| algorithm: EdgeInterpolationAlgorithm; | ||||||||
| } | ||||||||
| union LogicalType { | ||||||||
| StringType:Empty, | ||||||||
| MapType:Empty, | ||||||||
| ListType:Empty, | ||||||||
| EnumType:Empty, | ||||||||
| DecimalType:DecimalOpts, | ||||||||
| DateType:Empty, | ||||||||
| TimeType:TimeOpts, | ||||||||
| TimestampType:TimeOpts, | ||||||||
| IntType:IntOpts, | ||||||||
| NullType:Empty, | ||||||||
| JsonType:Empty, | ||||||||
| BsonType:Empty, | ||||||||
| UUIDType:Empty, | ||||||||
| Float16Type:Empty, | ||||||||
| VariantType:Empty, | ||||||||
| GeometryType:GeometryType, | ||||||||
| GeographyType:GeographyType, | ||||||||
| } | ||||||||
|
|
||||||||
| table Statistics { | ||||||||
| null_count: int = null; | ||||||||
|
alkis marked this conversation as resolved.
Outdated
|
||||||||
| // Store min/max values as fixed-size entities depending on the physical type. If len is present | ||||||||
| // then the min/max value is present. | ||||||||
| // | ||||||||
| // - BOOLEAN: none | ||||||||
| // - INT32/FLOAT: lo4 (little-endian) | ||||||||
| // - INT64/DOUBLE: lo8 (little-endian) | ||||||||
| // - INT96: lo4+lo8 (little-endian) | ||||||||
|
alkis marked this conversation as resolved.
Outdated
|
||||||||
| // - FIXED_LEN_BYTE_ARRAY: | ||||||||
|
alkis marked this conversation as resolved.
Outdated
|
||||||||
| // - BYTE_ARRAY: | ||||||||
| // prefix: the longest common prefix of min/max | ||||||||
| // lo8+hi8 zero padded 16 bytes (big-endian) of the suffix | ||||||||
| // len: the length for the suffix of the value after removing the prefix. If > 16 then the | ||||||||
|
alkis marked this conversation as resolved.
Outdated
|
||||||||
| // value is inexact | ||||||||
|
alkis marked this conversation as resolved.
Outdated
|
||||||||
| min_lo4: uint; | ||||||||
| min_lo8: ulong; | ||||||||
| min_hi8: ulong; | ||||||||
| min_len: byte = null; | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Original suffix length could exceed the int8 range of the byte type. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hi @rok, the previous comment is outdated. |
||||||||
| max_lo4: uint; | ||||||||
| max_lo8: ulong; | ||||||||
| max_hi8: ulong; | ||||||||
| max_len: byte = null; | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As above:
Suggested change
|
||||||||
| prefix: string; | ||||||||
|
alkis marked this conversation as resolved.
Outdated
|
||||||||
| } | ||||||||
|
|
||||||||
| union ColumnOrder { | ||||||||
| TypeDefinedOrder:Empty, | ||||||||
| } | ||||||||
|
|
||||||||
| table SchemaElement { | ||||||||
| name: string; | ||||||||
| type: Type = null; | ||||||||
| repetition_type: FieldRepetitionType; | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To allow for root to not have repetition type. In thrift we have optional: parquet-format/src/main/thrift/parquet.thrift Line 518 in 38818fa
Suggested change
|
||||||||
| logical_type: LogicalType; | ||||||||
| type_length: int = null; | ||||||||
| num_children: int = 0; | ||||||||
| field_id: int = null; | ||||||||
| column_order: ColumnOrder; // only present for leaf nodes | ||||||||
| } | ||||||||
|
|
||||||||
| enum PageType : byte { | ||||||||
| DATA_PAGE = 0, | ||||||||
| INDEX_PAGE = 1, | ||||||||
| DICTIONARY_PAGE = 2, | ||||||||
| DATA_PAGE_V2 = 3, | ||||||||
| } | ||||||||
|
|
||||||||
| table KV { | ||||||||
|
alkis marked this conversation as resolved.
Outdated
|
||||||||
| key: string; | ||||||||
| val: string; | ||||||||
| } | ||||||||
|
|
||||||||
| table ColumnMetadata { | ||||||||
| codec: CompressionCodec; | ||||||||
| num_values: long = null; // only present if not equal to rg.num_rows | ||||||||
| total_uncompressed_size: long; | ||||||||
| total_compressed_size: long; | ||||||||
| key_value_metadata: [KV]; | ||||||||
| data_page_offset: long; | ||||||||
| index_page_offset: long = null; | ||||||||
| dictionary_page_offset: long = null; | ||||||||
| statistics: Statistics; | ||||||||
| is_fully_dict_encoded: bool; | ||||||||
| bloom_filter_offset: long = null; | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should this be made a struct/value to make the bloom filter info more self-contained? |
||||||||
| bloom_filter_length: int = null; | ||||||||
| } | ||||||||
|
|
||||||||
| table ColumnChunk { | ||||||||
| file_path: string; | ||||||||
| meta_data: ColumnMetadata; | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
typo? or is metadata not allowed for some reason? |
||||||||
| // crypto_metadata: ColumnCryptoMetadata; // TODO | ||||||||
| // encrypted_column_metadata: [byte]; // TODO | ||||||||
| } | ||||||||
|
|
||||||||
| table SortingColumn { | ||||||||
| column_idx: int; | ||||||||
| descending: bool; | ||||||||
| nulls_first: bool; | ||||||||
| } | ||||||||
|
|
||||||||
| table RowGroup { | ||||||||
| columns: [ColumnChunk]; | ||||||||
| total_byte_size: long; | ||||||||
| num_rows: long; | ||||||||
| sorting_columns: [SortingColumn]; | ||||||||
| file_offset: long; | ||||||||
| total_compressed_size: long; | ||||||||
| ordinal: short = null; | ||||||||
| } | ||||||||
|
|
||||||||
| table FileMetaData { | ||||||||
| version: int; | ||||||||
| schema: [SchemaElement]; | ||||||||
| num_rows: long; | ||||||||
| row_groups: [RowGroup]; | ||||||||
| kv: [KV]; | ||||||||
| created_by: string; | ||||||||
| // column_orders: [ColumnOrder]; // moved to SchemaElement | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove this row for now? |
||||||||
| // encryption_algorithm: [EncryptionAlgorithm]; // TODO | ||||||||
| // footer_signing_key_metadata: binary; // TODO | ||||||||
| } | ||||||||
|
|
||||||||
| root_type FileMetaData; | ||||||||
Uh oh!
There was an error while loading. Please reload this page.