From f951a6d657af90c82a180c92292d8f8a4c57ad1a Mon Sep 17 00:00:00 2001
From: Alkis Evlogimenos <alkis@evlogimenos.com>
Date: Fri, 12 Dec 2025 08:56:36 +0100
Subject: [PATCH 1/5] Add parquet flatbuf schema

---
 src/main/flatbuf/parquet3.fbs | 224 ++++++++++++++++++++++++++++++++++
 1 file changed, 224 insertions(+)
 create mode 100644 src/main/flatbuf/parquet3.fbs

diff --git a/src/main/flatbuf/parquet3.fbs b/src/main/flatbuf/parquet3.fbs
new file mode 100644
index 000000000..68d858f50
--- /dev/null
+++ b/src/main/flatbuf/parquet3.fbs
@@ -0,0 +1,224 @@
+namespace parquet.format3;
+
+// Optimization notes
+// 1. Statistics are stored in integral types if their size is fixed, otherwise prefix + suffix
+// 2. ColumnMetaData.encoding_stats are removed, they are replaced with
+//    ColumnMetaData.is_fully_dict_encoded.
+// 3. RowGroups are limited to 2GB in size, so we can use int for sizes.
+// 4. ColumnChunk/ColumnMetaData offsets are now relative to the start of the row group, so we can
+//    use int for offsets.
+// 5. Remove ordinal.
+// 6. Restrict RowGroups to 2^31-1 rows.
+// 7. Remove offset/column indexes, they are small and just their offsets are of similar size.
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Physical types.
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+enum Type : byte {
+  BOOLEAN = 0,
+  INT32 = 1,
+  INT64 = 2,
+  INT96 = 3,
+  FLOAT = 4,
+  DOUBLE = 5,
+  BYTE_ARRAY = 6,
+  FIXED_LEN_BYTE_ARRAY = 7,
+}
+
+enum FieldRepetitionType : byte {
+  REQUIRED = 0,
+  OPTIONAL = 1,
+  REPEATED = 2,
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Encodings.
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Note: Match the thrift enum values so that we can cast between them.
+enum Encoding : byte {
+  PLAIN = 0,
+  // GROUP_VAR_INT = 1,
+  PLAIN_DICTIONARY = 2,
+  RLE = 3,
+  // BIT_PACKED = 4,
+  DELTA_BINARY_PACKED = 5,
+  DELTA_LENGTH_BYTE_ARRAY = 6,
+  DELTA_BYTE_ARRAY = 7,
+  RLE_DICTIONARY = 8,
+  BYTE_STREAM_SPLIT = 9,
+}
+
+// Note: Match the thrift enum values so that we can cast between them.
+enum CompressionCodec : byte {
+  UNCOMPRESSED = 0,
+  SNAPPY = 1,
+  GZIP = 2,
+  LZO = 3,
+  BROTLI = 4,
+  // LZ4 = 5,
+  ZSTD = 6,
+  LZ4_RAW = 7,
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Logical types.
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+table Empty {}
+table DecimalOpts {
+  precision: int;
+  scale: int;
+}
+enum TimeUnit : byte {
+  MS = 0,
+  US = 1,
+  NS = 2,
+}
+table TimeOpts {
+  is_adjusted_to_utc: bool;
+  unit: TimeUnit;
+}
+table IntOpts {
+  bit_width: byte = 8;
+  is_signed: bool;
+}
+table GeometryType {
+  crs: string;
+}
+enum EdgeInterpolationAlgorithm : byte {
+  SPHERICAL = 0,
+  VINCENTY = 1,
+  THOMAS = 2,
+  ANDOYER = 3,
+  KARNEY = 4,
+}
+table GeographyType {
+  crs: string;
+  algorithm: EdgeInterpolationAlgorithm;
+}
+union LogicalType {
+  StringType:Empty,
+  MapType:Empty,
+  ListType:Empty,
+  EnumType:Empty,
+  DecimalType:DecimalOpts,
+  DateType:Empty,
+  TimeType:TimeOpts,
+  TimestampType:TimeOpts,
+  IntType:IntOpts,
+  NullType:Empty,
+  JsonType:Empty,
+  BsonType:Empty,
+  UUIDType:Empty,
+  Float16Type:Empty,
+  VariantType:Empty,
+  GeometryType:GeometryType,
+  GeographyType:GeographyType,
+}
+
+table Statistics {
+  null_count: int = null;
+  // Store min/max values fixed sized entities depending on the physical type. If len is present
+  // then the min/max value is present.
+  //
+  // - BOOLEAN: none
+  // - INT32/FLOAT: lo4 (little-endian)
+  // - INT64/DOUBLE: lo8 (little-endian)
+  // - INT96: lo4+lo8 (little-endian)
+  // - FIXED_LEN_BYTE_ARRAY:
+  // - BYTE_ARRAY:
+  //   prefix: the longest common prefix of min/max
+  //   lo8+hi8 zero padded 16 bytes (big-endian) of the suffix
+  //   len: the length for the suffix of the value after removing the prefix. If > 16 then the
+  //        value is inexact
+  min_lo4: uint;
+  min_lo8: ulong;
+  min_hi8: ulong;
+  min_len: byte = null;
+  max_lo4: uint;
+  max_lo8: ulong;
+  max_hi8: ulong;
+  max_len: byte = null;
+  prefix: string;
+}
+
+union ColumnOrder {
+  TypeDefinedOrder:Empty,
+}
+
+table SchemaElement {
+  name: string;
+  type: Type = null;
+  repetition_type: FieldRepetitionType;
+  logical_type: LogicalType;
+  type_length: int = null;
+  num_children: int = 0;
+  field_id: int = null;
+  column_order: ColumnOrder;  // only present for leaf nodes
+}
+
+enum PageType : byte {
+  DATA_PAGE = 0,
+  INDEX_PAGE = 1,
+  DICTIONARY_PAGE = 2,
+  DATA_PAGE_V2 = 3,
+}
+
+table KV {
+  key: string;
+  val: string;
+}
+
+table ColumnMetadata {
+  codec: CompressionCodec;
+  num_values: long = null;  // only present if not equal to rg.num_rows
+  total_uncompressed_size: long;
+  total_compressed_size: long;
+  key_value_metadata: [KV];
+  data_page_offset: long;
+  index_page_offset: long = null;
+  dictionary_page_offset: long = null;
+  statistics: Statistics;
+  is_fully_dict_encoded: bool;
+  bloom_filter_offset: long = null;
+  bloom_filter_length: int = null;
+}
+
+table ColumnChunk {
+  file_path: string;
+  meta_data: ColumnMetadata;
+  // crypto_metadata: ColumnCryptoMetadata;  // TODO
+  // encrypted_column_metadata: [byte];  // TODO
+}
+
+table SortingColumn {
+  column_idx: int;
+  descending: bool;
+  nulls_first: bool;
+}
+
+table RowGroup {
+  columns: [ColumnChunk];
+  total_byte_size: long;
+  num_rows: long;
+  sorting_columns: [SortingColumn];
+  file_offset: long;
+  total_compressed_size: long;
+  ordinal: short = null;
+}
+
+table FileMetaData {
+  version: int;
+  schema: [SchemaElement];
+  num_rows: long;
+  row_groups: [RowGroup];
+  kv: [KV];
+  created_by: string;
+  // column_orders: [ColumnOrder];  // moved to SchemaElement
+  // encryption_algorithm: [EncryptionAlgorithm];  // TODO
+  // footer_signing_key_metadata: binary;  // TODO
+}
+
+root_type FileMetaData;

From a77d2774c99d68953df0de9c1ea14af0fb4e4001 Mon Sep 17 00:00:00 2001
From: Jiayi Wang <jiayi.wang@your.hostname.com>
Date: Tue, 3 Mar 2026 11:53:18 +0000
Subject: [PATCH 2/5] address comments

---
 src/main/flatbuf/parquet3.fbs | 500 ++++++++++++++++++++++++++++++----
 1 file changed, 440 insertions(+), 60 deletions(-)

diff --git a/src/main/flatbuf/parquet3.fbs b/src/main/flatbuf/parquet3.fbs
index 68d858f50..8406d5c0a 100644
--- a/src/main/flatbuf/parquet3.fbs
+++ b/src/main/flatbuf/parquet3.fbs
@@ -1,65 +1,165 @@
-namespace parquet.format3;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 
-// Optimization notes
-// 1. Statistics are stored in integral types if their size is fixed, otherwise prefix + suffix
-// 2. ColumnMetaData.encoding_stats are removed, they are replaced with
-//    ColumnMetaData.is_fully_dict_encoded.
-// 3. RowGroups are limited to 2GB in size, so we can use int for sizes.
-// 4. ColumnChunk/ColumnMetaData offsets are now relative to the start of the row group, so we can
-//    use int for offsets.
-// 5. Remove ordinal.
-// 6. Restrict RowGroups to 2^31-1 rows.
-// 7. Remove offset/column indexes, they are small and just their offsets are of similar size.
+namespace parquet.format;
 
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Physical types.
-///////////////////////////////////////////////////////////////////////////////////////////////////
+// The FlatBuffers footer preserves the same information as the Thrift Parquet footer,
+// while removing duplicated fields, unused details, and inefficient encodings that
+// waste space and memory.
+// It can currently be attached as a footer extension, and may fully replace the
+// Thrift footer in the future.
+//
+// Optimization notes:
+// 1. Statistics use fixed-width integral types when possible; otherwise they are
+//    encoded as prefix + suffix.
+// 2. ColumnChunk file_path and file_offset are removed since they are unused.
+// 3. ColumnMetaData.encoding_stats are removed and replaced by
+//    ColumnMetaData.is_fully_dict_encoded.
+// 4. ColumnMetaData.path_in_schema is removed since it can be derived from the schema.
+// 5. ConvertedType is fully dropped as it is superseded by LogicalType.
+// 6. Offset and column indexes are removed since they are small and their offsets
+//    alone take comparable space.
 
+/**
+ * Types supported by Parquet. These types are intended to be used in combination
+ * with the encodings to control the on disk storage format.
+ * For example INT16 is not included as a type since a good encoding of INT32
+ * would handle this.
+ */
 enum Type : byte {
   BOOLEAN = 0,
   INT32 = 1,
   INT64 = 2,
-  INT96 = 3,
+  INT96 = 3,  // deprecated, new Parquet writers should not write data in INT96
   FLOAT = 4,
   DOUBLE = 5,
   BYTE_ARRAY = 6,
   FIXED_LEN_BYTE_ARRAY = 7,
 }
 
+/**
+ * Representation of Schemas
+ */
 enum FieldRepetitionType : byte {
+  /** This field is required (can not be null) and each row has exactly 1 value. */
   REQUIRED = 0,
+
+  /** The field is optional (can be null) and each row has 0 or 1 values. */
   OPTIONAL = 1,
+
+  /** The field is repeated and can contain 0 or more values */
   REPEATED = 2,
 }
 
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Encodings.
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Note: Match the thrift enum values so that we can cast between them.
+/**
+ * Encodings supported by Parquet. Not all encodings are valid for all types. These
+ * enums are also used to specify the encoding of definition and repetition levels.
+ * See the accompanying doc for the details of the more complicated encodings.
+ * Note: Match the thrift enum values so that we can cast between them.
+ */
 enum Encoding : byte {
+  /** Default encoding.
+   * BOOLEAN - 1 bit per value. 0 is false; 1 is true.
+   * INT32 - 4 bytes per value. Stored as little-endian.
+   * INT64 - 8 bytes per value. Stored as little-endian.
+   * FLOAT - 4 bytes per value. IEEE. Stored as little-endian.
+   * DOUBLE - 8 bytes per value. IEEE. Stored as little-endian.
+   * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.
+   * FIXED_LEN_BYTE_ARRAY - Just the bytes.
+   */
   PLAIN = 0,
+
+  /** Group VarInt encoding for INT32/INT64.
+   * This encoding is deprecated. It was never used
+   */
   // GROUP_VAR_INT = 1,
+
+  /**
+   * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the
+   * plain type.
+   * In a data page, use RLE_DICTIONARY instead.
+   * In a dictionary page, use PLAIN instead.
+   */
   PLAIN_DICTIONARY = 2,
+
+  /** Group packed run length encoding. Usable for definition/repetition levels
+   * encoding and Booleans (on one bit: 0 is false; 1 is true.)
+   */
   RLE = 3,
+
+  /** Bit packed encoding. This can only be used if the data has a known max
+   * width. Usable for definition/repetition levels encoding.
+   * This encoding is deprecated and is replaced by the RLE/bit-packing hybrid encoding. 
+   */
   // BIT_PACKED = 4,
+
+  /** Delta encoding for integers. This can be used for int columns and works best
+   * on sorted data
+   */
   DELTA_BINARY_PACKED = 5,
+
+  /** Encoding for byte arrays to separate the length values and the data. The lengths
+   * are encoded using DELTA_BINARY_PACKED
+   */
   DELTA_LENGTH_BYTE_ARRAY = 6,
+
+  /** Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED.
+   * Suffixes are stored as delta length byte arrays.
+   */
   DELTA_BYTE_ARRAY = 7,
+
+  /** Dictionary encoding: the ids are encoded using the RLE encoding
+   */
   RLE_DICTIONARY = 8,
+
+  /** Encoding for fixed-width data (FLOAT, DOUBLE, INT32, INT64, FIXED_LEN_BYTE_ARRAY).
+      K byte-streams are created where K is the size in bytes of the data type.
+      The individual bytes of a value are scattered to the corresponding stream and
+      the streams are concatenated.
+      This itself does not reduce the size of the data but can lead to better compression
+      afterwards.
+
+      Added in 2.8 for FLOAT and DOUBLE.
+      Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11.
+   */
   BYTE_STREAM_SPLIT = 9,
 }
 
-// Note: Match the thrift enum values so that we can cast between them.
+/**
+ * Supported compression algorithms.
+ *
+ * Codecs added in format version X.Y can be read by readers based on X.Y and later.
+ * Codec support may vary between readers based on the format version and
+ * libraries available at runtime.
+ *
+ * See Compression.md for a detailed specification of these algorithms.
+ * Note: Match the thrift enum values so that we can cast between them.
+ */
 enum CompressionCodec : byte {
   UNCOMPRESSED = 0,
   SNAPPY = 1,
   GZIP = 2,
   LZO = 3,
-  BROTLI = 4,
-  // LZ4 = 5,
-  ZSTD = 6,
-  LZ4_RAW = 7,
+  BROTLI = 4,  // Added in 2.4
+  LZ4 = 5,     // DEPRECATED (Added in 2.4)
+  ZSTD = 6,    // Added in 2.4
+  LZ4_RAW = 7, // Added in 2.9
 }
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -67,26 +167,62 @@ enum CompressionCodec : byte {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
 table Empty {}
-table DecimalOpts {
+
+/**
+ * Decimal logical type annotation
+ *
+ * Scale must be zero or a positive integer less than or equal to the precision.
+ * Precision must be a non-zero positive integer.
+ *
+ * Note: unlike the Thrift footer, SchemaElement here carries no separate scale
+ * or precision fields; this table is the only place those values are stored.
+ *
+ * Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY.
+ */
+table DecimalOptions {
   precision: int;
   scale: int;
 }
+
+/** Time units for logical types */
 enum TimeUnit : byte {
-  MS = 0,
-  US = 1,
-  NS = 2,
+  MILLIS = 0,
+  MICROS = 1,
+  NANOS = 2,
 }
-table TimeOpts {
+
+/**
+ * Options shared by the Time and Timestamp logical type annotations
+ *
+ * Allowed for physical types: INT32 (TimeType with MILLIS only), INT64
+ */
+table TimeOptions {
   is_adjusted_to_utc: bool;
   unit: TimeUnit;
 }
-table IntOpts {
+
+/**
+ * Integer logical type annotation
+ *
+ * bit_width must be 8, 16, 32, or 64.
+ *
+ * Allowed for physical types: INT32, INT64
+ */
+table IntOptions {
   bit_width: byte = 8;
   is_signed: bool;
 }
-table GeometryType {
-  crs: string;
+
+/**
+ * Embedded Variant logical type annotation
+ */
+table VariantType {
+  // The version of the variant specification that the variant was
+  // written with.
+  specification_version: byte = null;
 }
+
+/** Edge interpolation algorithm for Geography logical type */
 enum EdgeInterpolationAlgorithm : byte {
   SPHERICAL = 0,
   VINCENTY = 1,
@@ -94,45 +230,97 @@ enum EdgeInterpolationAlgorithm : byte {
   ANDOYER = 3,
   KARNEY = 4,
 }
+
+/**
+ * Embedded Geometry logical type annotation
+ *
+ * Geospatial features in the Well-Known Binary (WKB) format. Edge interpolation
+ * is always linear/planar.
+ *
+ * A custom CRS can be set by the crs field. If unset, it defaults to "OGC:CRS84",
+ * which means that the geometries must be stored in longitude, latitude based on
+ * the WGS84 datum.
+ *
+ * Allowed for physical type: BYTE_ARRAY.
+ *
+ * See Geospatial.md for details.
+ */
+table GeometryType {
+  crs: string;
+}
+
+/**
+ * Embedded Geography logical type annotation
+ *
+ * Geospatial features in the WKB format with an explicit (non-linear/non-planar)
+ * edges interpolation algorithm.
+ *
+ * A custom geographic CRS can be set by the crs field, where longitudes are
+ * bound by [-180, 180] and latitudes are bound by [-90, 90]. If unset, the CRS
+ * defaults to "OGC:CRS84".
+ *
+ * An optional algorithm can be set to correctly interpret edge interpolation
+ * of the geometries. If unset, the algorithm defaults to SPHERICAL.
+ *
+ * Allowed for physical type: BYTE_ARRAY.
+ *
+ * See Geospatial.md for details.
+ */
 table GeographyType {
   crs: string;
   algorithm: EdgeInterpolationAlgorithm;
 }
+
+/**
+ * LogicalType annotations to replace ConvertedType.
+ */
 union LogicalType {
-  StringType:Empty,
+  StringType:Empty, 
   MapType:Empty,
   ListType:Empty,
   EnumType:Empty,
-  DecimalType:DecimalOpts,
+  DecimalType:DecimalOptions,
   DateType:Empty,
-  TimeType:TimeOpts,
-  TimestampType:TimeOpts,
-  IntType:IntOpts,
+  TimeType:TimeOptions,
+  TimestampType:TimeOptions,
+  IntType:IntOptions,
   NullType:Empty,
   JsonType:Empty,
   BsonType:Empty,
   UUIDType:Empty,
   Float16Type:Empty,
-  VariantType:Empty,
+  VariantType:VariantType,
   GeometryType:GeometryType,
   GeographyType:GeographyType,
 }
 
 table Statistics {
   null_count: int = null;
-  // Store min/max values fixed sized entities depending on the physical type. If len is present
-  // then the min/max value is present.
+  // Store min/max values as fixed-width entities depending on the physical type.
+  // If min_len/max_len is present then the corresponding min/max value is present.
   //
   // - BOOLEAN: none
-  // - INT32/FLOAT: lo4 (little-endian)
-  // - INT64/DOUBLE: lo8 (little-endian)
-  // - INT96: lo4+lo8 (little-endian)
+  // - INT32/FLOAT: min_lo4/max_lo4 (little-endian, 4 bytes)
+  // - INT64/DOUBLE: min_lo8/max_lo8 (little-endian, 8 bytes)
+  // - INT96: lo4 contains the low 4 bytes, lo8 contains the high 8 bytes (little-endian, 12 bytes total)
   // - FIXED_LEN_BYTE_ARRAY:
   // - BYTE_ARRAY:
-  //   prefix: the longest common prefix of min/max
-  //   lo8+hi8 zero padded 16 bytes (big-endian) of the suffix
-  //   len: the length for the suffix of the value after removing the prefix. If > 16 then the
-  //        value is inexact
+  //   prefix: the longest common prefix of min and max values
+  //   lo8+hi8: zero-padded 16 bytes (big-endian) of the suffix after removing the prefix
+  //   min_len/max_len: the length of the suffix of the original value after removing the prefix.
+  //        If > 16 then the value stored in lo8+hi8 is a truncated approximation (inexact).
+  //        If <= 16 then the value is exact.
+  //
+  // Example for BYTE_ARRAY with min="apple" and max="application":
+  //   prefix = "appl"  (longest common prefix)
+  //   min suffix = "e" (1 byte), max suffix = "ication" (7 bytes)
+  //   min_lo8 = big-endian encoding of "e" zero-padded to 16 bytes
+  //   min_len = 1 (exact, since 1 <= 16)
+  //   max_lo8 = big-endian encoding of "ication" zero-padded to 16 bytes
+  //   max_len = 7 (exact, since 7 <= 16)
+  //
+  // Example for INT32 with min=42:
+  //   min_lo4 = 0x2A000000 (42 in little-endian)
   min_lo4: uint;
   min_lo8: ulong;
   min_hi8: ulong;
@@ -144,17 +332,90 @@ table Statistics {
   prefix: string;
 }
 
+/**
+ * Bloom filter metadata for a column chunk.
+ */
+table BloomFilterInfo {
+  /** Byte offset from beginning of file to Bloom filter data. **/
+  offset: long;
+
+  /** Size of Bloom filter data including the serialized header, in bytes.
+   * Writers should write this field so readers can read the bloom filter
+   * in a single I/O.
+   */
+  length: int;
+}
+
+table AesGcmV1 {
+  /** AAD prefix **/
+  aad_prefix: [byte];
+
+  /** Unique file identifier part of AAD suffix **/
+  aad_file_unique: [byte];
+
+  /** In files encrypted with AAD prefix without storing it,
+   * readers must supply the prefix **/
+  supply_aad_prefix: bool;
+}
+
+table AesGcmCtrV1 {
+  /** AAD prefix **/
+  aad_prefix: [byte];
+
+  /** Unique file identifier part of AAD suffix **/
+  aad_file_unique: [byte];
+
+  /** In files encrypted with AAD prefix without storing it,
+   * readers must supply the prefix **/
+  supply_aad_prefix: bool;
+}
+
+union EncryptionAlgorithm {
+  AesGcmV1:AesGcmV1,
+  AesGcmCtrV1:AesGcmCtrV1,
+}
+
 union ColumnOrder {
   TypeDefinedOrder:Empty,
 }
 
+/**
+ * Represents an element inside a schema definition.
+ *  - if it is a group (inner node) then type is not set and num_children is set
+ *  - if it is a primitive type (leaf) then type is set and num_children is left at 0
+ * The nodes are listed in depth-first traversal order.
+ */
 table SchemaElement {
+  /** Name of the field in the schema */
   name: string;
+
+  /** Data type for this field. Not set if the current element is a non-leaf node */
   type: Type = null;
+
+  /** repetition of the field. The root of the schema does not have a repetition_type.
+   * All other nodes must have one */
   repetition_type: FieldRepetitionType;
+
+  /** The logical type of this SchemaElement */
   logical_type: LogicalType;
+
+  /** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
+   * Otherwise, if specified, this is the maximum bit length to store any of the values.
+   * (e.g. a low cardinality INT col could have this set to 3).  Note that this is
+   * in the schema, and therefore fixed for the entire file.
+   */
   type_length: int = null;
+
+  /** Nested fields.  Since thrift does not support nested fields,
+   * the nesting is flattened to a single list by a depth-first traversal.
+   * The children count is used to construct the nested relationship.
+   * This field is not set when the element is a primitive type
+   */
   num_children: int = 0;
+
+  /** When the original schema supports field ids, this will save the
+   * original field id in the parquet schema
+   */
   field_id: int = null;
   column_order: ColumnOrder;  // only present for leaf nodes
 }
@@ -166,59 +427,178 @@ enum PageType : byte {
   DATA_PAGE_V2 = 3,
 }
 
-table KV {
+table KeyValue {
   key: string;
   val: string;
 }
 
+/**
+ * Description for column metadata
+ */
 table ColumnMetadata {
+  /** Compression codec **/
   codec: CompressionCodec;
-  num_values: long = null;  // only present if not equal to rg.num_rows
+
+  /** Number of values in this column, only present if not equal to rg.num_rows **/
+  num_values: long = null;
+
+  /** total byte size of all uncompressed pages in this column chunk (including the headers) **/
   total_uncompressed_size: long;
+
+  /** total byte size of all compressed, and potentially encrypted, pages 
+   * in this column chunk (including the headers) **/
   total_compressed_size: long;
-  key_value_metadata: [KV];
+
+  /** Optional key/value metadata **/
+  key_value_metadata: [KeyValue];
+
+  /** Byte offset from beginning of file to first data page **/
   data_page_offset: long;
+
+  /** Byte offset from beginning of file to root index page **/
   index_page_offset: long = null;
+
+  /** Byte offset from the beginning of file to first (only) dictionary page **/
   dictionary_page_offset: long = null;
+
+  /** optional statistics for this column chunk */
   statistics: Statistics;
+
+  /** Indicates whether the column chunk pages are fully dictionary encoded. */
   is_fully_dict_encoded: bool;
-  bloom_filter_offset: long = null;
-  bloom_filter_length: int = null;
+
+  /** Optional Bloom filter information for this column chunk */
+  bloom_filter: BloomFilterInfo;
+}
+
+union ColumnCryptoMetadata {
+  EncryptionWithFooterKey:Empty,
+  EncryptionWithColumnKey:Empty,
 }
 
 table ColumnChunk {
-  file_path: string;
+  /** Column metadata for this chunk.
+   * Note: while marked as optional, this field is in fact required by most major
+   * Parquet implementations. As such, writers MUST populate this field.
+   **/
   meta_data: ColumnMetadata;
-  // crypto_metadata: ColumnCryptoMetadata;  // TODO
-  // encrypted_column_metadata: [byte];  // TODO
+
+  /** Crypto metadata of encrypted columns **/
+  crypto_metadata: ColumnCryptoMetadata;
+
+  /** Encrypted column metadata for this chunk **/
+  encrypted_column_metadata: [byte];
 }
 
+/**
+ * Sort order within a RowGroup of a leaf column
+ */
 table SortingColumn {
+  /** The ordinal position of the column (in this row group) **/
   column_idx: int;
+
+  /** If true, indicates this column is sorted in descending order. **/
   descending: bool;
+
+  /** If true, nulls will come before non-null values, otherwise,
+   * nulls go at the end. */
   nulls_first: bool;
 }
 
 table RowGroup {
+  /** Metadata for each column chunk in this row group.
+   * This list must have the same order as the SchemaElement list in FileMetaData.
+   **/
   columns: [ColumnChunk];
+
+  /** Total byte size of all the uncompressed column data in this row group **/
   total_byte_size: long;
+
+  /** Number of rows in this row group **/
   num_rows: long;
+
+  /** If set, specifies a sort ordering of the rows in this RowGroup.
+   * The sorting columns can be a subset of all the columns.
+   */
   sorting_columns: [SortingColumn];
+
+  /** Byte offset from beginning of file to first page (data or dictionary)
+   * in this row group **/
   file_offset: long;
+
+  /** Total byte size of all compressed (and potentially encrypted) column data 
+   * in this row group **/
   total_compressed_size: long;
+
+  /** Row group ordinal in the file **/
   ordinal: short = null;
 }
 
+/**
+ * Crypto metadata for files with encrypted footer.
+ */
+table FileCryptoMetaData {
+  /** 
+   * Encryption algorithm. This field is only used for files
+   * with encrypted footer. Files with plaintext footer store algorithm id
+   * inside footer (FileMetaData structure).
+   */
+  encryption_algorithm: EncryptionAlgorithm;
+    
+  /** Retrieval metadata of key used for encryption of footer, 
+   * and (possibly) columns **/
+  key_metadata: [byte];
+}
+
+/**
+ * Description for file metadata
+ */
 table FileMetaData {
+  /** Version of this file 
+    * 
+    * As of December 2025, there is no agreed upon consensus of what constitutes 
+    * version 2 of the file. For maximum compatibility with readers, writers should 
+    * always populate "1" for version. For maximum compatibility with writers, 
+    * readers should accept "1" and "2" interchangeably. All other versions are 
+    * reserved for potential future use-cases.
+    */
   version: int;
+
+  /** Parquet schema for this file. This schema contains metadata for all the columns.
+   * The schema is represented as a tree with a single root. The nodes of the tree
+   * are flattened to a list by doing a depth-first traversal.
+   * Column chunks do not store their path in the schema; each RowGroup.columns list
+   * follows the order of this schema list, which is used to map columns to nodes.
+   * The first element is the root **/
   schema: [SchemaElement];
+
+  /** Number of rows in this file **/
   num_rows: long;
+
+  /** Row groups in this file **/
   row_groups: [RowGroup];
-  kv: [KV];
+
+  /** Optional key/value metadata **/
+  kv: [KeyValue];
+
+  /** String for application that wrote this file. This should be in the format
+   * <Application> version <App Version> (build <App Build Hash>).
+   * e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
+   **/
   created_by: string;
-  // column_orders: [ColumnOrder];  // moved to SchemaElement
-  // encryption_algorithm: [EncryptionAlgorithm];  // TODO
-  // footer_signing_key_metadata: binary;  // TODO
+
+  /** 
+   * Encryption algorithm. This field is set only in encrypted files
+   * with plaintext footer. Files with encrypted footer store algorithm id
+   * in FileCryptoMetaData structure.
+   */
+  encryption_algorithm: EncryptionAlgorithm;
+
+  /** 
+   * Retrieval metadata of key used for signing the footer. 
+   * Used only in encrypted files with plaintext footer. 
+   */ 
+  footer_signing_key_metadata: [byte];
 }
 
 root_type FileMetaData;

From bf0825c1f60f2464c57d04d8a0b0fffc0ef7ddf7 Mon Sep 17 00:00:00 2001
From: Jiayi Wang <jiayi.wang@your.hostname.com>
Date: Wed, 11 Mar 2026 17:07:31 +0000
Subject: [PATCH 3/5] address comments - Add GeospatialStatistics, type change
 in Statistics

---
 src/main/flatbuf/parquet3.fbs | 56 ++++++++++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 10 deletions(-)

diff --git a/src/main/flatbuf/parquet3.fbs b/src/main/flatbuf/parquet3.fbs
index 8406d5c0a..eccc25718 100644
--- a/src/main/flatbuf/parquet3.fbs
+++ b/src/main/flatbuf/parquet3.fbs
@@ -23,11 +23,13 @@ namespace parquet.format;
 // while removing duplicated fields, unused details, and inefficient encodings that
 // waste space and memory.
 // It can currently be attached as a footer extension, and may fully replace the
-// Thrift footer in the future.
+// Thrift footer in the future. As of now, the Thrift footer is still required;
+// this FlatBuffers footer is supplementary.
 //
 // Optimization notes:
 // 1. Statistics use fixed-width integral types when possible; otherwise they are
-//    encoded as prefix + suffix.
+//    encoded as prefix + suffix. SizeStatistics and Statistics.distinct_count
+//    are removed.
 // 2. ColumnChunk file_path and file_offset are removed since they are unused.
 // 3. ColumnMetaData.encoding_stats are removed and replaced by
 //    ColumnMetaData.is_fully_dict_encoded.
@@ -180,8 +182,8 @@ table Empty {}
  * Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY.
  */
 table DecimalOptions {
-  precision: int;
   scale: int;
+  precision: int;
 }
 
 /** Time units for logical types */
@@ -295,7 +297,7 @@ union LogicalType {
 }
 
 table Statistics {
-  null_count: int = null;
+  null_count: long = null;
   // Store min/max values as fixed-width entities depending on the physical type.
   // If min_len/max_len is present then the corresponding min/max value is present.
   //
@@ -303,7 +305,7 @@ table Statistics {
   // - INT32/FLOAT: min_lo4/max_lo4 (little-endian, 4 bytes)
   // - INT64/DOUBLE: min_lo8/max_lo8 (little-endian, 8 bytes)
   // - INT96: lo4 contains the low 4 bytes, lo8 contains the high 8 bytes (little-endian, 12 bytes total)
-  // - FIXED_LEN_BYTE_ARRAY:
+  // - FIXED_LEN_BYTE_ARRAY: Encoded the same way as BYTE_ARRAY below
   // - BYTE_ARRAY:
   //   prefix: the longest common prefix of min and max values
   //   lo8+hi8: zero-padded 16 bytes (big-endian) of the suffix after removing the prefix
@@ -324,12 +326,35 @@ table Statistics {
   min_lo4: uint;
   min_lo8: ulong;
   min_hi8: ulong;
-  min_len: byte = null;
+  min_len: int = null;
   max_lo4: uint;
   max_lo8: ulong;
   max_hi8: ulong;
-  max_len: byte = null;
-  prefix: string;
+  max_len: int = null;
+  prefix: [byte];
+}
+
+/**
+ * Bounding box for GEOMETRY or GEOGRAPHY type in the representation of min/max
+ * value pair of coordinates from each axis.
+ */
+table BoundingBox {
+  xmin: double;
+  xmax: double;
+  ymin: double;
+  ymax: double;
+  zmin: double = null;
+  zmax: double = null;
+  mmin: double = null;
+  mmax: double = null;
+}
+
+/** Statistics specific to Geometry and Geography logical types */
+table GeospatialStatistics {
+  /** A bounding box of geospatial instances */
+  bbox: BoundingBox;
+  /** Geospatial type codes of all instances, or an empty list if not known */
+  geospatial_types: [int];
 }
 
 /**
@@ -394,7 +419,7 @@ table SchemaElement {
 
   /** repetition of the field. The root of the schema does not have a repetition_type.
    * All other nodes must have one */
-  repetition_type: FieldRepetitionType;
+  repetition_type: FieldRepetitionType = null;
 
   /** The logical type of this SchemaElement */
   logical_type: LogicalType;
@@ -469,11 +494,22 @@ table ColumnMetadata {
 
   /** Optional Bloom filter information for this column chunk */
   bloom_filter: BloomFilterInfo;
+
+  /** Optional statistics specific for Geometry and Geography logical types */
+  geospatial_statistics: GeospatialStatistics;
+}
+
+table EncryptionWithColumnKey {
+  /** Column path in schema **/
+  path_in_schema: [string];
+
+  /** Retrieval metadata of column encryption key **/
+  key_metadata: [byte];
 }
 
 union ColumnCryptoMetadata {
   EncryptionWithFooterKey:Empty,
-  EncryptionWithColumnKey:Empty,
+  EncryptionWithColumnKey:EncryptionWithColumnKey,
 }
 
 table ColumnChunk {

From 5a0baf26259f4ba5f28567c43c7c60e4a71dc042 Mon Sep 17 00:00:00 2001
From: Jiayi Wang <jiayi.wang@your.hostname.com>
Date: Wed, 11 Mar 2026 18:58:46 +0000
Subject: [PATCH 4/5] fix Statistics min_len and max_len

---
 src/main/flatbuf/parquet3.fbs | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/main/flatbuf/parquet3.fbs b/src/main/flatbuf/parquet3.fbs
index eccc25718..2d58e11e1 100644
--- a/src/main/flatbuf/parquet3.fbs
+++ b/src/main/flatbuf/parquet3.fbs
@@ -28,7 +28,7 @@ namespace parquet.format;
 //
 // Optimization notes:
 // 1. Statistics use fixed-width integral types when possible; otherwise they are
-//    encoded as prefix + suffix. SizeStatistics and Statistics.distinct_count
+//    encoded as prefix + truncated suffix. SizeStatistics and Statistics.distinct_count
 //    are removed.
 // 2. ColumnChunk file_path and file_offset are removed since they are unused.
 // 3. ColumnMetaData.encoding_stats are removed and replaced by
@@ -309,28 +309,28 @@ table Statistics {
   // - BYTE_ARRAY:
   //   prefix: the longest common prefix of min and max values
   //   lo8+hi8: zero-padded 16 bytes (big-endian) of the suffix after removing the prefix
-  //   min_len/max_len: the length of the suffix of the original value after removing the prefix.
-  //        If > 16 then the value stored in lo8+hi8 is a truncated approximation (inexact).
-  //        If <= 16 then the value is exact.
+  //   min_len/max_len: the absolute value is the number of suffix bytes (after removing the
+  //       prefix) stored in lo8+hi8. If >= 0, the value is exact. If < 0, the value is inexact.
   //
-  // Example for BYTE_ARRAY with min="apple" and max="application":
-  //   prefix = "appl"  (longest common prefix)
-  //   min suffix = "e" (1 byte), max suffix = "ication" (7 bytes)
-  //   min_lo8 = big-endian encoding of "e" zero-padded to 16 bytes
-  //   min_len = 1 (exact, since 1 <= 16)
-  //   max_lo8 = big-endian encoding of "ication" zero-padded to 16 bytes
-  //   max_len = 7 (exact, since 7 <= 16)
+  // Example for BYTE_ARRAY with min="apple" and max="application_is_a_very_long_suffix":
+  //   prefix = "appl"  (longest common prefix, 4 bytes)
+  //   min suffix = "e" (1 byte), max suffix = "ication_is_a_very_long_suffix" (29 bytes)
+  //   min_lo8+min_hi8 = big-endian encoding of "e" (1 byte) zero-padded to 16 bytes
+  //   min_len = 1 (>= 0, exact)
+  //   max_lo8+max_hi8 = big-endian encoding of "ication_is_a_ves" (suffix truncated to 16 bytes,
+  //       "ication_is_a_ver", with the last byte incremented by 1 so it is still an upper bound)
+  //   max_len = -16 (< 0, inexact)
   //
   // Example for INT32 with min=42:
   //   min_lo4 = 0x2A000000 (42 in little-endian)
   min_lo4: uint;
   min_lo8: ulong;
   min_hi8: ulong;
-  min_len: int = null;
+  min_len: byte = null;
   max_lo4: uint;
   max_lo8: ulong;
   max_hi8: ulong;
-  max_len: int = null;
+  max_len: byte = null;
   prefix: [byte];
 }
 

From c938fc70f83c4953071d2ed63c2c34c8e60e4acd Mon Sep 17 00:00:00 2001
From: Jiayi Wang <jiayi.wang@your.hostname.com>
Date: Wed, 8 Apr 2026 09:28:52 +0000
Subject: [PATCH 5/5] address review comments from emkornfield and adamreeve

- Rename parquet3.fbs to parquet.fbs
- Comment out deprecated PLAIN_DICTIONARY encoding (like BIT_PACKED)
- Add distinct_count back to Statistics
- Remove ConvertedType forward-compat constraint from DecimalOptions
- Add backward-compat note for LogicalType Empty union types
- Reorder SchemaElement fields to match Thrift ordering
- Expand is_fully_dict_encoded documentation
- Rename meta_data to metadata in ColumnChunk
- Clarify total_byte_size in RowGroup
- Remove FileCryptoMetaData (encrypted footer layout not yet specified)
---
 .../flatbuf/{parquet3.fbs => parquet.fbs}     | 71 ++++++++-----------
 1 file changed, 30 insertions(+), 41 deletions(-)
 rename src/main/flatbuf/{parquet3.fbs => parquet.fbs} (93%)

diff --git a/src/main/flatbuf/parquet3.fbs b/src/main/flatbuf/parquet.fbs
similarity index 93%
rename from src/main/flatbuf/parquet3.fbs
rename to src/main/flatbuf/parquet.fbs
index 2d58e11e1..86a403c9c 100644
--- a/src/main/flatbuf/parquet3.fbs
+++ b/src/main/flatbuf/parquet.fbs
@@ -35,8 +35,6 @@ namespace parquet.format;
 //    ColumnMetaData.is_fully_dict_encoded.
 // 4. ColumnMetaData.path_in_schema is removed since it can be derived from the schema.
 // 5. ConvertedType is fully dropped as it is superseded by LogicalType.
-// 6. Offset and column indexes are removed since they are small and their offsets
-//    alone take comparable space.
 
 /**
  * Types supported by Parquet. These types are intended to be used in combination
@@ -95,19 +93,19 @@ enum Encoding : byte {
   /**
    * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the
    * plain type.
-   * in a data page use RLE_DICTIONARY instead.
-   * in a Dictionary page use PLAIN instead
+   * In a data page use RLE_DICTIONARY instead.
+   * In a Dictionary page use PLAIN instead.
    */
-  PLAIN_DICTIONARY = 2,
+  // PLAIN_DICTIONARY = 2,
 
   /** Group packed run length encoding. Usable for definition/repetition levels
    * encoding and Booleans (on one bit: 0 is false; 1 is true.)
    */
   RLE = 3,
 
-  /** Bit packed encoding. This can only be used if the data has a known max
+  /** Deprecated: Bit packed encoding. This can only be used if the data has a known max
    * width. Usable for definition/repetition levels encoding.
-   * This encoding is deprecated and is replaced by the RLE/bit-packing hybrid encoding. 
+   * This encoding is replaced by the RLE/bit-packing hybrid encoding.
    */
   // BIT_PACKED = 4,
 
@@ -176,9 +174,6 @@ table Empty {}
  * Scale must be zero or a positive integer less than or equal to the precision.
  * Precision must be a non-zero positive integer.
  *
- * To maintain forward-compatibility in v1, implementations using this logical
- * type must also set scale and precision on the annotated SchemaElement.
- *
  * Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY.
  */
 table DecimalOptions {
@@ -275,9 +270,13 @@ table GeographyType {
 
 /**
  * LogicalType annotations to replace ConvertedType.
+ *
+ * Types with no parameters use `Empty`. To add parameters later, append a new
+ * union member (e.g., StringTypeV2:StringOptions); new readers remain
+ * backward-compatible with old files.
  */
 union LogicalType {
-  StringType:Empty, 
+  StringType:Empty,
   MapType:Empty,
   ListType:Empty,
   EnumType:Empty,
@@ -298,6 +297,8 @@ union LogicalType {
 
 table Statistics {
   null_count: long = null;
+  /** count of distinct values occurring */
+  distinct_count: long = null;
   // Store min/max values as fixed-width entities depending on the physical type.
   // If min_len/max_len is present then the corresponding min/max value is present.
   //
@@ -411,19 +412,9 @@ union ColumnOrder {
  * the nodes are listed in depth first traversal order.
  */
 table SchemaElement {
-  /** Name of the field in the schema */
-  name: string;
-
   /** Data type for this field. Not set if the current element is a non-leaf node */
   type: Type = null;
 
-  /** repetition of the field. The root of the schema does not have a repetition_type.
-   * All other nodes must have one */
-  repetition_type: FieldRepetitionType = null;
-
-  /** The logical type of this SchemaElement */
-  logical_type: LogicalType;
-
   /** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
    * Otherwise, if specified, this is the maximum bit length to store any of the values.
    * (e.g. a low cardinality INT col could have this set to 3).  Note that this is
@@ -431,6 +422,13 @@ table SchemaElement {
    */
   type_length: int = null;
 
+  /** repetition of the field. The root of the schema does not have a repetition_type.
+   * All other nodes must have one */
+  repetition_type: FieldRepetitionType = null;
+
+  /** Name of the field in the schema */
+  name: string;
+
   /** Nested fields.  Since thrift does not support nested fields,
    * the nesting is flattened to a single list by a depth-first traversal.
    * The children count is used to construct the nested relationship.
@@ -442,7 +440,12 @@ table SchemaElement {
    * original field id in the parquet schema
    */
   field_id: int = null;
-  column_order: ColumnOrder;  // only present for leaf nodes
+
+  /** The logical type of this SchemaElement */
+  logical_type: LogicalType;
+
+  /** Column ordering for leaf nodes, used to interpret min/max statistics */
+  column_order: ColumnOrder;
 }
 
 enum PageType : byte {
@@ -489,7 +492,9 @@ table ColumnMetadata {
   /** optional statistics for this column chunk */
   statistics: Statistics;
 
-  /** Indicates whether the column chunk pages are fully dictionary encoded. */
+  /** True if every data page in this column chunk is dictionary-encoded
+   * (no fallback). Replaces Thrift encoding_stats.
+   */
   is_fully_dict_encoded: bool;
 
   /** Optional Bloom filter information for this column chunk */
@@ -517,7 +522,7 @@ table ColumnChunk {
    * Note: while marked as optional, this field is in fact required by most major
    * Parquet implementations. As such, writers MUST populate this field.
    **/
-  meta_data: ColumnMetadata;
+  metadata: ColumnMetadata;
 
   /** Crypto metadata of encrypted columns **/
   crypto_metadata: ColumnCryptoMetadata;
@@ -547,7 +552,7 @@ table RowGroup {
    **/
   columns: [ColumnChunk];
 
-  /** Total byte size of all the uncompressed column data in this row group **/
+  /** Sum of total_uncompressed_size across all columns (uncompressed, encoded) **/
   total_byte_size: long;
 
   /** Number of rows in this row group **/
@@ -570,22 +575,6 @@ table RowGroup {
   ordinal: short = null;
 }
 
-/**
- * Crypto metadata for files with encrypted footer.
- */
-table FileCryptoMetaData {
-  /** 
-   * Encryption algorithm. This field is only used for files
-   * with encrypted footer. Files with plaintext footer store algorithm id
-   * inside footer (FileMetaData structure).
-   */
-  encryption_algorithm: EncryptionAlgorithm;
-    
-  /** Retrieval metadata of key used for encryption of footer, 
-   * and (possibly) columns **/
-  key_metadata: [byte];
-}
-
 /**
  * Description for file metadata
  */