-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Minor: Improve parquet PageIndex documentation #6042
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
a143581
84596aa
4476adf
e1524c0
b5162a9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -50,17 +50,27 @@ use crate::schema::types::{ | |
| Type as SchemaType, | ||
| }; | ||
|
|
||
| /// [`Index`] for each row group of each column. | ||
| /// Page level statistics for each column chunk of each row group. | ||
| /// | ||
| /// This structure is an in-memory representation of multiple [`ColumnIndex`] | ||
| /// structures in a parquet file footer, as described in the Parquet [PageIndex | ||
| /// documentation]. Each [`Index`] holds statistics about all the pages in a | ||
| /// particular column chunk. | ||
| /// | ||
| /// `column_index[row_group_number][column_number]` holds the | ||
| /// [`Index`] corresponding to column `column_number` of row group | ||
| /// `row_group_number`. | ||
| /// | ||
| /// For example `column_index[2][3]` holds the [`Index`] for the fourth | ||
| /// column in the third row group of the parquet file. | ||
| /// | ||
| /// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md | ||
| pub type ParquetColumnIndex = Vec<Vec<Index>>; | ||
|
|
||
| /// [`PageLocation`] for each data page of each row group of each column. | ||
| /// [`PageLocation`] for each data page of each row group of each column | ||
| /// | ||
| /// This structure is the parsed representation of the [`OffsetIndex`] from the | ||
| /// Parquet file footer, as described in the Parquet [PageIndex documentation]. | ||
| /// | ||
| /// `offset_index[row_group_number][column_number][page_number]` holds | ||
| /// the [`PageLocation`] corresponding to page `page_number` of column | ||
|
|
@@ -69,6 +79,8 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>; | |
| /// For example `offset_index[2][3][4]` holds the [`PageLocation`] for | ||
| /// the fifth page of the fourth column in the third row group of the | ||
| /// parquet file. | ||
| /// | ||
| /// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md | ||
| pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>; | ||
|
|
||
| /// Parsed metadata for a single Parquet file | ||
|
|
@@ -942,14 +954,21 @@ impl ColumnChunkMetaDataBuilder { | |
| } | ||
| } | ||
|
|
||
| /// Builder for column index | ||
| /// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex] | ||
| /// | ||
| /// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md | ||
| pub struct ColumnIndexBuilder { | ||
| null_pages: Vec<bool>, | ||
| min_values: Vec<Vec<u8>>, | ||
| max_values: Vec<Vec<u8>>, | ||
| null_counts: Vec<i64>, | ||
| boundary_order: BoundaryOrder, | ||
| // If one page can't get build index, need to ignore all index in this column | ||
| /// Is the information in the builder valid? | ||
| /// | ||
| /// Set to `false` if any entry in the page doesn't have statistics for | ||
| /// some reason. This might happen if the page is entirely null, or | ||
| /// is a floating point column without any non-NaN values | ||
| /// e.g. <https://github.com/apache/parquet-format/pull/196> | ||
|
alamb marked this conversation as resolved.
|
||
| valid: bool, | ||
| } | ||
|
|
||
|
|
@@ -971,6 +990,7 @@ impl ColumnIndexBuilder { | |
| } | ||
| } | ||
|
|
||
| /// Append statistics for the next page | ||
| pub fn append( | ||
| &mut self, | ||
| null_page: bool, | ||
|
|
@@ -988,15 +1008,19 @@ impl ColumnIndexBuilder { | |
| self.boundary_order = boundary_order; | ||
| } | ||
|
|
||
| /// Mark this column index as invalid | ||
| pub fn to_invalid(&mut self) { | ||
| self.valid = false; | ||
| } | ||
|
|
||
| /// Is the information in the builder valid? | ||
| pub fn valid(&self) -> bool { | ||
| self.valid | ||
| } | ||
|
|
||
| /// Build and get the thrift metadata of column index | ||
| /// | ||
| /// Note: callers should check [`Self::valid`] before calling this method | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Makes me wonder if
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I agree that would be a good improvement -- filed #6064 |
||
| pub fn build_to_thrift(self) -> ColumnIndex { | ||
| ColumnIndex::new( | ||
| self.null_pages, | ||
|
|
@@ -1008,7 +1032,9 @@ impl ColumnIndexBuilder { | |
| } | ||
| } | ||
|
|
||
| /// Builder for offset index | ||
| /// Builder for offset index, part of the Parquet [PageIndex]. | ||
| /// | ||
| /// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md | ||
| pub struct OffsetIndexBuilder { | ||
| offset_array: Vec<i64>, | ||
| compressed_page_size_array: Vec<i32>, | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.