Skip to content

Commit

Permalink
Use LevelHistogram in PageIndex (#6135)
Browse files: browse the repository at this point in the history
* use LevelHistogram in PageIndex and ColumnIndexBuilder

* revert changes to OffsetIndexBuilder
  • Loading branch information
etseidl authored Jul 29, 2024
1 parent 5f5a82c commit 80ed712
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 16 deletions.
2 changes: 1 addition & 1 deletion parquet/src/file/metadata/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,7 @@ pub struct ColumnChunkMetaData {
/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the
/// number of rows with level 1, and so on.
///
#[derive(Debug, Clone, PartialEq)]
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
inner: Vec<i64>,
}
Expand Down
28 changes: 16 additions & 12 deletions parquet/src/file/page_index/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use crate::basic::Type;
use crate::data_type::private::ParquetValueType;
use crate::data_type::{AsBytes, ByteArray, FixedLenByteArray, Int96};
use crate::errors::ParquetError;
use crate::file::metadata::LevelHistogram;
use crate::format::{BoundaryOrder, ColumnIndex};
use crate::util::bit_util::from_le_slice;
use std::fmt::Debug;
Expand All @@ -40,13 +41,13 @@ pub struct PageIndex<T> {
///
/// `repetition_level_histogram[i]` is a count of how many values are at repetition level `i`.
/// For example, `repetition_level_histogram[0]` indicates how many rows the page contains.
pub repetition_level_histogram: Option<Vec<i64>>,
pub repetition_level_histogram: Option<LevelHistogram>,
/// Definition level histogram for the page
///
/// `definition_level_histogram[i]` is a count of how many values are at definition level `i`.
/// For example, `definition_level_histogram[max_definition_level]` indicates how many
/// non-null values are present in the page.
pub definition_level_histogram: Option<Vec<i64>>,
pub definition_level_histogram: Option<LevelHistogram>,
}

impl<T> PageIndex<T> {
Expand All @@ -59,10 +60,10 @@ impl<T> PageIndex<T> {
pub fn null_count(&self) -> Option<i64> {
self.null_count
}
pub fn repetition_level_histogram(&self) -> Option<&Vec<i64>> {
pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
self.repetition_level_histogram.as_ref()
}
pub fn definition_level_histogram(&self) -> Option<&Vec<i64>> {
pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
self.definition_level_histogram.as_ref()
}
}
Expand Down Expand Up @@ -175,17 +176,17 @@ impl<T: ParquetValueType> NativeIndex<T> {
for i in 0..len {
let page_idx = i * num_levels;
let page_hist = hist[page_idx..page_idx + num_levels].to_vec();
res.push(Some(page_hist));
res.push(Some(LevelHistogram::from(page_hist)));
}
res
} else {
vec![None; len]
}
};

let rep_hists: Vec<Option<Vec<i64>>> =
let rep_hists: Vec<Option<LevelHistogram>> =
to_page_histograms(index.repetition_level_histograms);
let def_hists: Vec<Option<Vec<i64>>> =
let def_hists: Vec<Option<LevelHistogram>> =
to_page_histograms(index.definition_level_histograms);

let indexes = index
Expand Down Expand Up @@ -236,19 +237,22 @@ mod tests {
min: Some(-123),
max: Some(234),
null_count: Some(0),
repetition_level_histogram: Some(vec![1, 2]),
definition_level_histogram: Some(vec![1, 2, 3]),
repetition_level_histogram: Some(LevelHistogram::from(vec![1, 2])),
definition_level_histogram: Some(LevelHistogram::from(vec![1, 2, 3])),
};

assert_eq!(page_index.min().unwrap(), &-123);
assert_eq!(page_index.max().unwrap(), &234);
assert_eq!(page_index.min_bytes().unwrap(), (-123).as_bytes());
assert_eq!(page_index.max_bytes().unwrap(), 234.as_bytes());
assert_eq!(page_index.null_count().unwrap(), 0);
assert_eq!(page_index.repetition_level_histogram(), Some(&vec![1, 2]));
assert_eq!(
page_index.definition_level_histogram(),
Some(&vec![1, 2, 3])
page_index.repetition_level_histogram().unwrap().values(),
&vec![1, 2]
);
assert_eq!(
page_index.definition_level_histogram().unwrap().values(),
&vec![1, 2, 3]
);
}

Expand Down
6 changes: 3 additions & 3 deletions parquet/src/file/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1951,7 +1951,7 @@ mod tests {

assert!(col_idx.repetition_level_histogram().is_none());
assert!(col_idx.definition_level_histogram().is_some());
check_def_hist(col_idx.definition_level_histogram().unwrap());
check_def_hist(col_idx.definition_level_histogram().unwrap().values());

assert!(reader.metadata().offset_index().is_some());
let offset_index = reader.metadata().offset_index().unwrap();
Expand Down Expand Up @@ -2066,8 +2066,8 @@ mod tests {
unreachable!()
};

check_def_hist(col_idx.definition_level_histogram().unwrap());
check_rep_hist(col_idx.repetition_level_histogram().unwrap());
check_def_hist(col_idx.definition_level_histogram().unwrap().values());
check_rep_hist(col_idx.repetition_level_histogram().unwrap().values());

assert!(reader.metadata().offset_index().is_some());
let offset_index = reader.metadata().offset_index().unwrap();
Expand Down

0 comments on commit 80ed712

Please sign in to comment.